def predict_for_user(self, user):
    """Predict interest categories for a Twitter user.

    Looks up the user, fetches the accounts they follow, keeps only the
    "representative" ones (enough followers), classifies each followed
    account's recent tweets, and tallies the predicted categories.

    :param user: screen name, or numeric user id given as an all-digit string
    :return: list of category names predicted for at least three followed
             accounts (may be empty)
    """
    interests = collections.defaultdict(int)
    # An identifier with no ASCII letters is treated as a numeric user id.
    use_ids = not re.search("[a-zA-Z]", user)
    users_info = self.tweets.collect_users_info([user], use_ids=use_ids)
    info = json.loads(users_info[0])
    following = self.tweets.collect_user_follows(info["screen_name"], "following")
    # NOTE: the original fetched the seed user's own tweets here and then
    # discarded them — a wasted, rate-limited API call; removed.
    user_infos = self.tweets.collect_users_info(following, use_ids=True)
    if not user_infos:
        return []
    # Keep only followed accounts popular enough to be representative.
    r_users = []
    for raw_info in user_infos:
        followed = json.loads(raw_info)
        if followed["followers_count"] > util.minimum_number_of_followers:
            r_users.append(followed)
    self.logger.info("representative users: %s" % len(r_users))
    # Classify at most number_of_users_used_to_predict accounts.
    for followed in r_users[:util.number_of_users_used_to_predict]:
        tweets = self.tweets.collect_user_tweets(followed["id"], 1, 200)
        self.logger.info("%s tweets extracted ------------------------------" % len(tweets))
        # Join once instead of quadratic += in a loop.
        user_doc = "".join(util.clear_tweet(t["text"]) for t in tweets)
        test = self.vectorizer.transform([user_doc])
        cat = self.clf.predict(test)
        interests[str(cat[0])] += 1
        self.logger.info("%s predicted category: %s" % (followed["screen_name"], cat))
    # Return categories which appear at least three times.
    # (A real list — the original returned a lazy iterator on Python 3.)
    return [c for c in interests if interests[c] > 2]
def predict_for_user(self, user):
    """Predict interest categories for a Twitter user.

    NOTE(review): this method is defined twice in this file with identical
    logic; at class-creation time the later definition silently replaces the
    earlier one — one of the two should be deleted.

    Fetches the accounts the user follows, keeps the ones with enough
    followers, classifies each followed account's recent tweets, and returns
    the categories predicted at least three times.

    :param user: screen name, or numeric user id given as an all-digit string
    :return: list of category names (may be empty)
    """
    interests = collections.defaultdict(int)
    # No ASCII letters => treat the identifier as a numeric user id.
    use_ids = not re.search("[a-zA-Z]", user)
    users_info = self.tweets.collect_users_info([user], use_ids=use_ids)
    info = json.loads(users_info[0])
    following = self.tweets.collect_user_follows(info["screen_name"], "following")
    # The original fetched the seed user's tweets here and discarded the
    # result — a wasted, rate-limited API call; removed.
    user_infos = self.tweets.collect_users_info(following, use_ids=True)
    if not user_infos:
        return []
    # Representative users: followed accounts with enough followers.
    r_users = []
    for raw_info in user_infos:
        followed = json.loads(raw_info)
        if followed["followers_count"] > util.minimum_number_of_followers:
            r_users.append(followed)
    self.logger.info("representative users: %s" % len(r_users))
    for followed in r_users[:util.number_of_users_used_to_predict]:
        tweets = self.tweets.collect_user_tweets(followed["id"], 1, 200)
        self.logger.info("%s tweets extracted ------------------------------" % len(tweets))
        # Single join avoids quadratic string concatenation.
        user_doc = "".join(util.clear_tweet(t["text"]) for t in tweets)
        test = self.vectorizer.transform([user_doc])
        cat = self.clf.predict(test)
        interests[str(cat[0])] += 1
        self.logger.info("%s predicted category: %s" % (followed["screen_name"], cat))
    # Return categories which appear at least three times, as a real list
    # (the original returned a lazy `filter` iterator on Python 3).
    return [c for c in interests if interests[c] > 2]
def collect_category_data(self, category_name, seed_user=None):
    """Collect tweet documents for a category and store one file per user.

    For each representative user (enough followers) of the category, fetches
    recent tweets, cleans them into "<tweet_id> <text>" lines, and writes them
    to data_dir/<category>/<screen_name>. If a file already exists, only
    tweets newer than the newest stored tweet are prepended and the old
    content is preserved. Files that end up empty or not detectably English
    are removed.

    :param category_name: name of the category (also the output subdirectory)
    :param seed_user: optional seed passed to get_users_for_category
    :return: the number of representative users considered
    """
    users = self.get_users_for_category(category_name, seed_user)
    self.logger.info("users for category %s: %s" % (category_name, len(users)))
    users = users[:util.users_per_category]
    user_infos = self.tweets.collect_users_info(users, use_ids=True)
    # Representative users: accounts with enough followers.
    r_users = []
    for raw_info in user_infos:
        info = json.loads(raw_info)
        if info["followers_count"] > util.minimum_number_of_followers:
            r_users.append(info)
    self.logger.info("representative users: %s" % len(r_users))
    # Directory creation is loop-invariant — hoisted out of the per-user loop.
    path = os.path.join(util.data_dir, category_name)
    if not os.path.exists(path):
        os.makedirs(path)
    for user in r_users:
        user_name = user["screen_name"]
        tweets = self.tweets.collect_user_tweets(user["id"], 1, 200)
        self.logger.info("%s tweets extracted" % len(tweets))
        # Build "<tweet_id> <cleaned text>\n" lines, skipping empty tweets.
        docs = []
        ids = []
        for t in tweets:
            t_id = str(t["id"])
            text = util.clear_tweet(t["text"].replace("\n", " "))
            if text.strip() == "":
                continue
            docs.append(t_id + " " + text + "\n")
            ids.append(t_id)
        file_path = os.path.join(path, user_name)
        if os.path.exists(file_path):
            # Incremental update: the first stored line starts with the
            # newest tweet id already on disk.
            f = open(file_path)
            first_line = f.readline()
            old_rest = f.read()
            f.close()
            last_tweet_id = first_line.split(" ")[0]
            user_doc = ""
            for t_id, doc in zip(ids, docs):
                if t_id == last_tweet_id:
                    break
                user_doc += doc
            # BUG FIX: the original did ``user_doc += "\n" + user_doc``,
            # duplicating the new tweets and dropping everything previously
            # stored. Append the old file content instead so history is kept.
            user_doc += "\n" + first_line + old_rest
            f = open(file_path, "wb")
        else:
            f = open(file_path, "wb")
            user_doc = "".join(docs)
        if user_doc.strip() == "":
            f.close()
            os.remove(file_path)
            continue
        # Crude language detection: require at least 10 known stop words.
        is_english = False
        count = 0
        for stop_word in self.stop_words:
            if stop_word in user_doc:
                count += 1
                if count == 10:
                    is_english = True
                    break
        if is_english:
            # Binary mode: the original wrote UTF-8 bytes through a
            # text-mode handle, which raises TypeError on Python 3.
            f.write(user_doc.encode("utf-8"))
            f.close()
        else:
            self.logger.info("Seems to be a non english text.")
            f.close()
            os.remove(file_path)
    return len(r_users)