Example #1
def _classify_replies_with_emojis(prefix=''):
    emotions = ['anger', 'fear', 'joy', 'sadness']
    emojis = [
        ['😠', '😡', '😤', '🤬'],  # anger
        ['😰', '😱', '😨', '😟'],  # fear
        ['😂', '😁', '😄', '😊'],  # joy (candidates left out: '🤣', '😀', '😃', '😆', '😍', '😋')
        ['💔', '😢', '😭', '😔']   # sadness
    ]

    tdb = TweetDB()
    tweets_data = []
    for i, emoji_list in enumerate(emojis):
        tweets_with_an_emoji = []
        # Emojis that belong to any of the other three emotion groups
        other_emojis = [e for el in emojis for e in el if e not in emoji_list]
        for emoji in emoji_list:
            # Keep only replies whose text contains no emoji from another group
            tweets_list = filter(
                lambda t: not any(e in t.text for e in other_emojis),
                tdb.all_replies_like(emoji))
            tweets_with_an_emoji += tweets_list
        tweets_with_emoji = list(set(tweets_with_an_emoji))
        tweets_data += [['', t.text, f'{prefix}{emotions[i]}']
                        for t in tweets_with_emoji]

    return emotions, tweets_data
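
A minimal usage sketch for the function above (hedged: the output path and the prefix value are assumptions; csv is the standard-library module). Each row produced is ['', reply text, '<prefix><emotion>']:

import csv

emotions, tweets_data = _classify_replies_with_emojis(prefix='emoji-')
with open('output/emoji_labeled_replies.csv', 'w', newline='') as f:
    csv.writer(f).writerows(tweets_data)  # columns: id placeholder, text, label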
Example #2
def get_parent_tweets_metrics(tweets_with_emotion, emotion):
    tdb = TweetDB()
    parent_tweet = lambda tid: tdb.get_by(str(tid))
    parents_tweets = list(
        set(parent_tweet(tweet.parent_tweet_id)
            for tweet in tweets_with_emotion))
    parents_retweets_count = np.array(
        [int(t.retweet_count) for t in parents_tweets])
    print('%d on %s category' % (len(tweets_with_emotion), emotion))
    if not len(parents_retweets_count):
        # Guard: np.percentile raises on an empty array
        return
    parents_retweets_count_median = np.median(parents_retweets_count)
    parents_retweets_75_top = np.percentile(parents_retweets_count, 75)
    parents_retweets_90_top = np.percentile(parents_retweets_count, 90)
    parents_retweets_95_top = np.percentile(parents_retweets_count, 95)
    # Retweet counts of the top 20% most-retweeted parent ("spreader") tweets
    parents_retweets_count_sorted = np.sort(parents_retweets_count)[::-1]
    top_20_len = int(len(parents_retweets_count) * 0.2)
    parents_retweets_top_20 = parents_retweets_count_sorted[:top_20_len]
    print(
        '%d spreader tweets with retweets median: %.2f, 75%% percentile: %.2f, '
        '90%% percentile: %.2f, 95%% percentile: %.2f, and responses average: %.2f'
        % (len(parents_tweets), parents_retweets_count_median,
           parents_retweets_75_top, parents_retweets_90_top,
           parents_retweets_95_top,
           len(tweets_with_emotion) / len(parents_tweets)))
    print('top 20%%: avg: %.2f, median: %.2f' %
          (np.mean(parents_retweets_top_20),
           np.median(parents_retweets_top_20)))
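
The metrics above are plain NumPy order statistics; a small self-contained sketch with hypothetical retweet counts shows what each number means:

import numpy as np

counts = [120, 40, 33, 10, 5, 2, 1, 0, 0, 0]  # hypothetical parent retweet counts
print(np.median(counts))          # 3.5
print(np.percentile(counts, 90))  # 48.0 (linear interpolation)
top_20 = np.sort(counts)[::-1][:int(len(counts) * 0.2)]  # the two largest: 120, 40
print(np.mean(top_20), np.median(top_20))  # 80.0 80.0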
Example #3
def classify_replies_tweets(self):
    tdb = TweetDB()
    tweets = list(tdb.all_replies())
    tweets_with_emotion_indexes, total = self.classify_sentences(
        [t.text for t in tweets])
    # Retweet counts of the parents of every reply classified with this emotion
    parents_retweet_counts = [
        int(tweets[i].parent_tweet.retweet_count)
        for i in tweets_with_emotion_indexes
    ]
    retweets_mean = np.array(parents_retweet_counts).mean() if parents_retweet_counts else 0
    self.logger.info('%d of %d on %s category (retweets average: %.2f)' % (
        len(parents_retweet_counts), total, self.emotion_label, retweets_mean))
Example #4
def predict_replies(filepath, **kwargs):
    identified_replies = []
    files_tec = [os.path.join('output', '%s-tec.pickle' % e) for e in emotions]
    tdb = TweetDB()
    tweets = list(tdb.all_replies())  # fetched once, classified per emotion below
    for i, f in enumerate(files_tec):
        # nb = SingleEmotionSemEvalNaiveBayes(emotion_index=i, filename=f)
        nb = SingleEmotionTECNaiveBayes(emotion_name=emotions[i], filename=f)
        tweets_with_emotion_indexes, total = nb.classify_sentences(
            [t.text for t in tweets])
        identified_replies.append(tweets_with_emotion_indexes)
        print('%d of %d on %s category' %
              (len(tweets_with_emotion_indexes), total, emotions[i]))

    # Count how many additional emotions each tweet has, and print the average
    # and median for each emotion to the console
    for index, tweets_indexes in enumerate(identified_replies):
        # Indexes classified under any *other* emotion (repetitions included)
        other_tweets_indexes = [
            ti for tis in identified_replies for ti in tis
            if tis != tweets_indexes
        ]
        # For each tweet also found elsewhere: how many other emotions it received
        repeated_tweets_indexes = [
            other_tweets_indexes.count(ti) for ti in tweets_indexes
            if ti in other_tweets_indexes
        ]
        # Tweets classified with this emotion only
        one_emotion_tweets = [
            tweets[ti] for ti in tweets_indexes
            if ti not in other_tweets_indexes
        ]
        print(
            '%s: %d out of %d have been identified with another emotion as well (avg: %.2f, mdn: %.2f)'
            % (emotions[index], len(repeated_tweets_indexes),
               len(tweets_indexes), np.mean(repeated_tweets_indexes),
               np.median(repeated_tweets_indexes)))

        print('%d tweets have been identified only with %s' %
              ((len(tweets_indexes) - len(repeated_tweets_indexes)),
               emotions[index]))

        get_parent_tweets_metrics(one_emotion_tweets, emotions[index])

    count_repeated_emotions(tweets, identified_replies)
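
The overlap counting in the loop above is easiest to see on toy data; this standalone sketch (hypothetical indexes, no project imports) mirrors the two comprehensions:

identified = [[0, 1, 2], [1, 2], [2]]  # reply indexes classified per emotion
tweets_indexes = identified[0]
other = [ti for tis in identified for ti in tis if tis != tweets_indexes]
print(other)  # [1, 2, 2]: indexes found under the other emotions, with repeats
repeated = [other.count(ti) for ti in tweets_indexes if ti in other]
print(repeated)  # [1, 2]: reply 1 appears in one other emotion, reply 2 in two
print([ti for ti in tweets_indexes if ti not in other])  # [0]: single-emotion reply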
Example #5
def get_tweets_ids_from_csv(self):
    # Yields two lists of tweet IDs sorted by retweet count (descending):
    # first the politifact tweets, then the gossipcop tweets
    yield [str(t.id) for t in TweetDB().all_sorted_by(
        sort=TweetModel.retweet_count.desc(), source="politifact")]
    yield [str(t.id) for t in TweetDB().all_sorted_by(
        sort=TweetModel.retweet_count.desc(), source="gossipcop")]
Example #6
def predict_top_retweeted_fake_news_tweets(models: list, consolidate=2):
    number_of_tweets_to_evaluate = 500
    tdb = TweetDB()
    # most_popular_tweets = list(tdb.all_sorted_by(sort=TweetModel.retweet_count.desc()))
    most_popular_tweets = list(
        tdb.all_sorted_by_eager_loading(sort=TweetModel.retweet_count.desc()))
    classified_init = False

    get_model_name = lambda model: model.__name__.replace("SingleEmotion", "")

    with open('output/popular_tweets.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f)

        csv_top_row = [
            'TweetID',
            'Tweet retweet count',
            'Tweet replies count',
            'Tweet source',
        ]
        for model, _ in models:
            for emotion in emotions:
                csv_top_row.append(f'{get_model_name(model)} {emotion}')
            for emotion in emotions:
                csv_top_row.append(f'{get_model_name(model)} {emotion}%')
        if consolidate:
            for emotion in emotions:
                csv_top_row.append(f'Consolidation {emotion}')
            for emotion in emotions:
                csv_top_row.append(f'Consolidation {emotion}%')
        csv_writer.writerow(csv_top_row)

        model_instances = {}
        for model, filename_template in models:
            model_instances[get_model_name(model)] = []
            for emotion in emotions:
                model_instances[get_model_name(model)].append(
                    model(emotion_name=emotion,
                          filename=filename_template.format(emotion)))

        tweet_count = 0
        for tweet in most_popular_tweets:
            if tweet_count >= number_of_tweets_to_evaluate:
                break
            replies = tweet.replies
            if not replies:
                continue

            tweet_count += 1
            print(
                f'Found {len(replies)} replies for tweet {tweet.id} with {tweet.retweet_count} retweets'
            )

            datasets_lists = {}
            csv_row = [
                tweet.id, tweet.retweet_count,
                len(replies), tweet.source
            ]
            for model_name in model_instances.keys():
                # One list of classified-reply indexes per emotion, in `emotions` order
                num_replies_with_emotions = []

                for model_instance in model_instances[model_name]:
                    tweets_with_emotion_indexes, _ = model_instance.classify_sentences(
                        [t.text for t in replies])
                    num_replies_with_emotions.append(
                        tweets_with_emotion_indexes)

                csv_row += [len(x) for x in num_replies_with_emotions]
                csv_row += [
                    len(x) / len(replies) for x in num_replies_with_emotions
                ]
                datasets_lists[model_name] = num_replies_with_emotions

            if consolidate:
                consolidate_results = []
                for index_emotion, _ in enumerate(emotions):
                    consolidate_results.append(
                        consolidate_classifiers(consolidate, replies, [
                            x[index_emotion] for x in datasets_lists.values()
                        ]))
                csv_row += [len(x) for x in consolidate_results]
                csv_row += [len(x) / len(replies) for x in consolidate_results]
                datasets_lists['Consolidate'] = consolidate_results

            csv_writer.writerow(csv_row)

            save_classified_replies(replies, emotions, datasets_lists,
                                    classified_init)
            classified_init = True
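
An invocation sketch for the function above. The tuple shape (model class, per-emotion filename template) follows the unpacking in the models loop, and the template must support .format(emotion); SingleEmotionTECNaiveBayes appears in Example #4, but the exact pickle naming here is an assumption:

import os

models = [
    # (class, filename template consumed via filename_template.format(emotion))
    (SingleEmotionTECNaiveBayes, os.path.join('output', '{}-tec.pickle')),
]
predict_top_retweeted_fake_news_tweets(models, consolidate=2)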
Example #7
def __init__(self):
    self.twitter = TwitterData()
    self.tweet_db = TweetDB()
    self.reply_id = False
    super().__init__()
Example #8
class TweetCollector(Loggable):
    """
    Helper class to handle API calls and database connections
    """
    def __init__(self):
        self.twitter = TwitterData()
        self.tweet_db = TweetDB()
        self.reply_id = False
        super().__init__()

    def get_tweet(self, tweet_id: str):
        tweets = self._get_tweet(tweet_id) or []
        self.logger.info("Got %d tweets for %s" % (len(tweets), tweet_id))
        for tweet in tweets:
            self._enqueue_tweet(tweet, tweet.get('id'))
        self.tweet_db.commit()

    def get_last_tweet_id(self):
        return False

    def wrap_up(self):
        self.logger.info("Finished collecting tweets")

    def close(self):
        try:
            self.tweet_db.commit()
        except Exception:
            self.tweet_db.rollback()
        self.tweet_db.close()
        self.logger.debug("Queue completed")

    @abstractmethod
    def _request_tweet(self, tweet_id):
        pass

    def _db_has_tweet_id(self, db_conn, tweet_id) -> TweetModel:
        return db_conn.session.query(TweetModel).get(tweet_id)

    def _get_tweet(self, tweet_id):
        while True:
            try:
                return self._request_tweet(tweet_id)
            except TwythonRateLimitError as e:
                self._wait_retry_after(e.retry_after)
            except Exception as e:
                self.logger.exception(str(e))
                return None

    def _wait_retry_after(self, retry_after: str):
        self.logger.debug("Retry-after: %s" % retry_after)
        try:
            sleep_time = datetime.fromtimestamp(
                int(retry_after)) - datetime.now()
            sleep_time = sleep_time.total_seconds()
        except Exception as e:
            self.logger.debug(str(e))
            sleep_time = 60 * 15
        self.logger.info("Sleeping for %d seconds" % sleep_time)
        time.sleep(sleep_time)

    def _enqueue_tweet(self, tweet: dict, tid: str):
        if not tweet:
            self.logger.warning("Could not get tweet %s" % tid)
            return
        if self.reply_id:
            if str(tweet['in_reply_to_status_id']) != str(self.reply_id):
                self.logger.warning("Tweet %s not a reply to %s, skipping..." %
                                    (tid, self.reply_id))
                return
        else:
            # Do not treat these tweets as replies, even if they are
            tweet['in_reply_to_status_id'] = None
        if self._db_has_tweet_id(self.tweet_db, tid):
            self.logger.info("Tweet %s already saved, skipping..." % tid)
        else:
            self.tweet_db.save_tweet(tweet)
            self.logger.info("Tweet %s saved" % tid)