def predict_for_user(self, user):
    # Treat the identifier as a numeric user id when it contains no
    # letters, otherwise as a screen name.
    use_ids = re.search("[a-zA-Z]", user) is None
    interests = collections.defaultdict(int)
    users_info = self.tweets.collect_users_info([user], use_ids=use_ids)
    info = json.loads(users_info[0])
    following = self.tweets.collect_user_follows(info["screen_name"],
                                                 "following")
    user_infos = self.tweets.collect_users_info(following, use_ids=True)
    if len(user_infos) == 0:
        return []
    # Keep only "representative" accounts with enough followers.
    r_users = []
    for i in user_infos:
        info = json.loads(i)
        if info["followers_count"] > util.minimum_number_of_followers:
            r_users.append(info)
    self.logger.info("representative users: %s" % len(r_users))
    for ind, info in enumerate(r_users):
        if ind == util.number_of_users_used_to_predict:
            break
        # Up to 200 recent tweets (page 1) for this followed account.
        tweets = self.tweets.collect_user_tweets(info["id"], 1, 200)
        self.logger.info("%s tweets extracted" % len(tweets))
        # Concatenate the cleaned tweets into one document per account.
        user_doc = ""
        for t in tweets:
            user_doc += util.clear_tweet(t["text"]) + " "
        test = self.vectorizer.transform([user_doc])
        cat = self.clf.predict(test)
        interests[str(cat[0])] += 1
        self.logger.info("%s predicted category: %s" %
                         (info["screen_name"], cat))
    # Return categories predicted for at least three followed accounts.
    return [c for c in interests if interests[c] > 2]
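A minimal usage sketch, assuming the surrounding class (called InterestPredictor here; that name and its constructor are not shown in the snippet) has already been set up with an authenticated tweets collector and a fitted vectorizer/classifier pair:

# Hypothetical usage; the class name and constructor are assumptions.
predictor = InterestPredictor()
print(predictor.predict_for_user("some_screen_name"))  # letters -> screen name
print(predictor.predict_for_user("12345678"))          # digits only -> user id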
Example #3
 def collect_category_data(self, category_name, seed_user=None):
     users = self.get_users_for_category(category_name, seed_user)
     self.logger.info("users for category %s: %s" %
                      (category_name, len(users)))
     users = users[:util.users_per_category]
     user_infos = self.tweets.collect_users_info(users, use_ids=True)
     # Keep only "representative" accounts with enough followers.
     r_users = []
     for i in user_infos:
         info = json.loads(i)
         if info["followers_count"] > util.minimum_number_of_followers:
             r_users.append(info)
     self.logger.info("representative users: %s" % len(r_users))
     for user in r_users:
         user_name = user["screen_name"]
         path = os.path.join(util.data_dir, category_name)
         if not os.path.exists(path):
             os.makedirs(path)
         tweets = self.tweets.collect_user_tweets(user["id"], 1, 200)
         self.logger.info("%s tweets extracted" % len(tweets))
         # One "<tweet_id> <cleaned text>" line per tweet, newest first.
         docs = []
         ids = []
         for t in tweets:
             t_id = str(t["id"])
             text = t["text"].replace("\n", " ")
             text = util.clear_tweet(text)
             if text.strip() == "":
                 continue
             docs.append(t_id + " " + text + "\n")
             ids.append(t_id)
         file_path = os.path.join(path, user_name)
         user_doc = ""
         if os.path.exists(file_path):
             # Incremental update: the first line of an existing file holds
             # the newest tweet id saved so far, so prepend only tweets newer
             # than that and keep the previously saved content.
             f = open(file_path)
             first_line = f.readline()
             last_tweet_id = first_line.split(" ")[0]
             for t_id, doc in zip(ids, docs):
                 if t_id == last_tweet_id:
                     break
                 user_doc += doc
             user_doc += first_line + f.read()
             f.close()
             f = open(file_path, "w")
         else:
             f = open(file_path, "w")
             for doc in docs:
                 user_doc += doc
         if user_doc.strip() == "":
             f.close()
             os.remove(file_path)
             continue
         # Crude language filter: a document containing at least ten
         # distinct stop words is assumed to be English.
         is_english = False
         count = 0
         for stop_word in self.stop_words:
             if stop_word in user_doc:
                 count += 1
                 if count == 10:
                     is_english = True
                     break
         if is_english:
             f.write(user_doc.encode("utf-8"))
             f.close()
         else:
             self.logger.info("Seems to be non-English text.")
             f.close()
             os.remove(file_path)
     return len(r_users)
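The incremental update above relies on a simple on-disk layout: each file under util.data_dir/<category>/ is named after a screen name and stores one tweet per line as "<tweet_id> <cleaned text>", newest first. A minimal reader sketch for that layout (load_user_doc is a hypothetical helper, not part of the snippet):

import os

def load_user_doc(data_dir, category, screen_name):
    # Hypothetical helper: parse a file written by collect_category_data
    # back into parallel lists of tweet ids and cleaned texts.
    path = os.path.join(data_dir, category, screen_name)
    ids, texts = [], []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            t_id, _, text = line.partition(" ")
            ids.append(t_id)
            texts.append(text)
    return ids, texts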