Example #1
    def get_df_most_retweeted_users(self):
        # Load cleaned data
        df_clean = pd.read_csv(self.clean_data_path)

        # Return if no tweet was a retweet
        if 'retweeted_user_screen_name' not in df_clean.columns:
            return

        # Count how many times each user was retweeted and attach each
        # retweeted user's follower count
        df = df_clean[df_clean.retweeted_user_screen_name.notnull()]
        df = df.groupby('retweeted_user_screen_name', as_index=False) \
            .id.count() \
            .merge(df_clean[['retweeted_user_screen_name',
                             'retweeted_user_followers_count']],
                   on='retweeted_user_screen_name', how='inner') \
            .rename({'retweeted_user_screen_name': 'user',
                     'id': 'count_retweets',
                     'retweeted_user_followers_count': 'count_followers'},
                    axis=1) \
            .drop_duplicates()
        df['link'] = 'https://twitter.com/' + df['user']
        df = df.groupby('user').max().reset_index() \
            .sort_values('count_retweets', ascending=False)

        save_data = Save(df, self.save_path, self.keyword,
                         'most_retweeted_users', 'most_retweeted_users', True)
        save_data.save_data()

        return df
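The groupby-count followed by merge in this example is easier to see on toy data. A minimal sketch of that pattern, with invented rows (column names match the method above):

import pandas as pd

# Toy frame: one row per tweet that was a retweet
df_clean = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'retweeted_user_screen_name': ['ana', 'ana', 'bob', 'ana'],
    'retweeted_user_followers_count': [100, 100, 50, 100],
})

# Count retweets per user, then attach each user's follower count
df = df_clean.groupby('retweeted_user_screen_name', as_index=False) \
    .id.count() \
    .merge(df_clean[['retweeted_user_screen_name',
                     'retweeted_user_followers_count']],
           on='retweeted_user_screen_name', how='inner') \
    .drop_duplicates()
print(df)  # ana: 3 retweets, 100 followers; bob: 1 retweet, 50 followers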
Example #2
    def get_df_most_mentioned_users(self):
        # Load cleaned data
        df = pd.read_csv(self.clean_data_path)

        # Flatten nested list of users mentioned per tweet
        df['user_mentions'] = df['user_mentions'].apply(
            lambda x: literal_eval(x) if pd.notnull(x) else '')
        users = [
            user.replace('@', '') for sublist in df['user_mentions'].values
            for user in sublist
        ]
        # Return if no users were mentioned
        if not users:
            return

        # Count number of times each user was mentioned
        df = pd.DataFrame({'user': users}) \
            .user.value_counts() \
            .reset_index() \
            .rename({'index': 'user', 'user': 'mentions_count'}, axis=1) \
            .sort_values('mentions_count', ascending=False)
        df['link'] = 'https://twitter.com/' + df['user']

        save_data = Save(df, self.save_path, self.keyword,
                         'most_mentioned_users', 'most_mentioned_users', True)
        save_data.save_data()

        return df
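A list-valued column that has round-tripped through CSV comes back as the string repr of the list, which is why the method parses it with literal_eval before flattening. A minimal reproduction of that step on made-up values:

from ast import literal_eval

import pandas as pd

df = pd.DataFrame({'user_mentions': ["['@ana', '@bob']", "['@ana']", None]})

# Parse the stringified lists; nulls become empty strings, and iterating
# an empty string yields nothing, so they drop out of the flatten
df['user_mentions'] = df['user_mentions'].apply(
    lambda x: literal_eval(x) if pd.notnull(x) else '')
users = [
    user.replace('@', '') for sublist in df['user_mentions'].values
    for user in sublist
]
print(users)  # ['ana', 'bob', 'ana']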
Example #3
    def get_metadata(self):
        """
        Gets metadata for all of the Twitter ids extracted by extract_all_ids.

        Args:
            - None, but it uses attributes from initializing the class.
        Returns:
            - None, but it saves the metadata to disk.
        """
        # Load the list of tweet ids
        with open(f'{self.path_raw_data}/ids.pickle', 'rb') as handle:
            list_of_lists = pickle.load(handle)
            # Return early if there are no ids at all
            if not list_of_lists:
                return "There are no tweet ids to extract metadata from"
            else:
                # Skip empty sublists while flattening, then deduplicate
                ids = list(
                    set([
                        item for sublist in list_of_lists if sublist
                        for item in sublist
                    ]))

        print('Total ids to be processed: {}'.format(len(ids)))

        # Set credentials
        auth = tweepy.OAuthHandler(self.consumer_key, self.consumer_secret)
        auth.set_access_token(self.access_token, self.access_token_secret)
        api = tweepy.API(auth)

        # Build batches so the API is called only once per 100 ids, the
        # maximum statuses_lookup accepts per request
        floor_batch_lst = list(range(0, len(ids), 100))
        total_ids = len(ids)
        batch_list = [(i, i + 100) if i + 100 < total_ids else (i, total_ids)
                      for i in floor_batch_lst]

        all_data = []
        for floor, ceil in batch_list:
            print(f'Currently getting {floor} - {ceil} ids out of {total_ids}')
            ids_batch = ids[floor:ceil]
            response = api.statuses_lookup(ids_batch, tweet_mode='extended')
            # Getting a tweet's JSON relies on the private _json attribute
            # and may break in the future
            tweets = [dict(tweet._json) for tweet in response]
            all_data += tweets
        print('Metadata collection complete!')

        # Metadata comes as JSON, so we convert it to a DataFrame and drop
        # observations without entities
        df = pd.DataFrame(all_data).dropna(subset=['entities'])
        save_data = Save(df, self.save_path, self.keyword, '.raw_data',
                         'df_raw', False)
        save_data.save_data()
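The windowing arithmetic deserves a standalone check, since statuses_lookup accepts at most 100 ids per call. The same batch construction on a fake list of 250 ids:

# Reproduce the batching for 250 ids: three windows, the last one short
ids = list(range(250))
total_ids = len(ids)
floor_batch_lst = list(range(0, total_ids, 100))
batch_list = [(i, i + 100) if i + 100 < total_ids else (i, total_ids)
              for i in floor_batch_lst]
print(batch_list)  # [(0, 100), (100, 200), (200, 250)]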
Example #4
    def get_df_most_active_users(self):
        df = pd.read_csv(self.clean_data_path) \
            .user_screen_name.value_counts() \
            .reset_index() \
            .rename({'index': 'user', 'user_screen_name': 'tweets_published'},
                    axis=1) \
            .sort_values('tweets_published', ascending=False)
        df['link'] = 'https://twitter.com/' + df['user']

        save_data = Save(df, self.save_path, self.keyword, 'most_active_users',
                         'most_active_users', True)
        save_data.save_data()

        return df
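The rename of 'index' to 'user' depends on older pandas behavior: there, value_counts().reset_index() yields a column named index for the values and one named after the original series for the counts (pandas 2.0 changed these to the column name and count). A toy run under the older behavior:

import pandas as pd

s = pd.Series(['ana', 'bob', 'ana'], name='user_screen_name')

# Under pandas < 2.0 this yields columns 'index' and 'user_screen_name',
# which the rename below maps to 'user' and 'tweets_published'
df = s.value_counts() \
    .reset_index() \
    .rename({'index': 'user', 'user_screen_name': 'tweets_published'},
            axis=1)
print(df)  # ana: 2 tweets published, bob: 1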
Example #5
    def get_df_tweets_sorted_by_retweets(self):
        df = pd.read_csv(self.clean_data_path) \
            .sort_values('retweet_count', ascending=False) \
            .reset_index(drop=True)
        df['link'] = 'https://twitter.com/' + df['user_screen_name']
        df = df[[
            'user_screen_name', 'link', 'date', 'year', 'month_name', 'day',
            'full_text', 'retweet_count', 'favorite_count',
            'user_followers_count', 'user_friends_count', 'user_statuses_count'
        ]]

        save_data = Save(df, self.save_path, self.keyword,
                         'tweets_sorted_by_retweets',
                         'tweets_sorted_by_retweets', True)
        save_data.save_data()

        return df
Example #6
    def get_df_cohashtags_matrix(self):
        # Load hashtags data
        df = self.get_hashtags_df()
        if df.empty:
            return

        # Get one hot encoding of hashtags per tweet
        one_hot_encoding = pd.get_dummies(
            df.hashtags.apply(pd.Series).stack()).sum(level=0).astype(int)
        # Get co occurrence matrix
        df = one_hot_encoding.T.dot(one_hot_encoding)

        save_data = Save(df, self.save_path, self.keyword,
                         'co_hashtags_matrix', 'co_hashtags_matrix', True)
        save_data.save_data()

        return df
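The transpose-dot product is the core trick here: if H is the tweets × hashtags one-hot matrix, H.T.dot(H) gives a square hashtag × hashtag matrix whose off-diagonal entries count co-mentions and whose diagonal counts total uses. A toy reproduction:

import pandas as pd

df = pd.DataFrame({'hashtags': [['#a', '#b'], ['#a'], ['#b', '#c']]})

# One row per tweet, one column per hashtag; sum(level=0) collapses the
# stacked index back to tweets (newer pandas: .groupby(level=0).sum())
one_hot = pd.get_dummies(
    df.hashtags.apply(pd.Series).stack()).sum(level=0).astype(int)

# H.T @ H counts, for each pair of hashtags, the tweets using both
co_occurrence = one_hot.T.dot(one_hot)
print(co_occurrence)  # e.g. ('#a', '#b') == 1: one tweet used both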
Example #7
    def get_df_key_topics(self, num_tfidf_feat=40):
        # Load cleaned data
        df = pd.read_csv(self.clean_data_path)

        # Create list of stop_words for the count vectorizer
        stop_words = nltk.corpus.stopwords.words('spanish') \
            + nltk.corpus.stopwords.words('english')

        # Instantiate count vectorizer
        count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
            min_df=1, ngram_range=(1, 2), stop_words=stop_words)
        tfidf = sklearn.feature_extraction.text.TfidfTransformer()

        # Cleaning tweets' text can generate nulls, so we make them empty
        # strings for the vectorizer
        df.text_clean.fillna(' ', inplace=True)

        # Fit count vectorizer to get counts and generate tfidf features
        count_vectorizer.fit(df.text_clean)
        counts = count_vectorizer.transform(df.text_clean)
        transformed_weights = tfidf.fit_transform(counts)
        weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
        weights = pd.DataFrame({
            'topic': count_vectorizer.get_feature_names(),
            'weight': weights
        })
        df_tfidf = weights.sort_values('weight', ascending=False)

        # Get weights normalized to a 0-100 scale
        df_tfidf['weight_normalized'] = MinMaxScaler().fit_transform(
            np.array(df_tfidf.weight).reshape(-1, 1)) * 100

        # Only return the top x tfidf features
        df_tfidf = df_tfidf.head(num_tfidf_feat)

        # Save data
        save_data = Save(df_tfidf, self.save_path, self.keyword, 'key_topics',
                         'key_topics', True)
        save_data.save_data()

        return df_tfidf
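The ranking criterion is the mean TF-IDF weight of each term across every tweet, so frequent-but-ubiquitous words score low and distinctive phrases score high. A compact sketch on a toy corpus (note that get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out()):

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ['the cat sat', 'the cat ran', 'dogs ran fast']

count_vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(corpus)
transformed_weights = TfidfTransformer().fit_transform(counts)

# Mean TF-IDF weight per term across the corpus, highest first
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
df_tfidf = pd.DataFrame({
    'topic': count_vectorizer.get_feature_names(),
    'weight': weights
}).sort_values('weight', ascending=False)
print(df_tfidf.head())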
Example #8
    def get_df_users_by_followers(self):
        # Load cleaned data
        df = pd.read_csv(self.clean_data_path,
                         usecols=['user_screen_name', 'user_followers_count',
                                  'user_friends_count',
                                  'user_statuses_count']) \
            .rename({'user_screen_name': 'user',
                     'user_followers_count': 'count_followers',
                     'user_friends_count': 'count_following',
                     'user_statuses_count': 'count_tweets_published_all_time'},
                    axis=1) \
            .sort_values('count_followers', ascending=False) \
            .drop_duplicates()
        df['link'] = 'https://twitter.com/' + df['user']
        df = df.groupby('user').max().reset_index()\
            .sort_values('count_followers', ascending=False)

        save_data = Save(df, self.save_path, self.keyword,
                         'users_by_followers', 'users_by_followers', True)
        save_data.save_data()

        return df
Example #9
    def get_df_most_mentioned_hashtags(self):
        # Load hashtags data
        df = self.get_hashtags_df()
        if df.empty:
            return

        # Count number of times each hashtag was mentioned
        df = df.hashtags.astype(str).value_counts() \
               .reset_index() \
               .rename({'index': 'hashtags', 'hashtags': 'hashtags_count'},
                       axis=1) \
               .sort_values('hashtags_count', ascending=False)

        # Remove nulls and empty lists of hashtags from the counts
        df = df[(df.astype(str)['hashtags'] != '[]') & (df.hashtags.notnull())
                & (df.hashtags != '')]

        save_data = Save(df, self.save_path, self.keyword,
                         'most_mentioned_hashtags', 'most_mentioned_hashtags',
                         True)
        save_data.save_data()

        return df
Example #10
    def get_df_grouped_date(self):
        # Load cleaned data
        clean_data = pd.read_csv(self.clean_data_path)

        # Group by date and get sums and counts
        df = clean_data.groupby('date') \
            .agg({'retweet_count': 'sum',
                  'favorite_count': 'sum',
                  'date': 'count'}) \
            .rename({'date': 'tweets_published'}, axis=1) \
            .reset_index()

        # Process date fields
        df['date'] = pd.to_datetime(df['date'])
        df['month'] = df['date'].dt.month.map(self.mapping_months)
        df['year'] = df['date'].dt.year

        # Save cleaned data
        save_data = Save(df, self.save_path, self.keyword, 'grouped_date',
                         'grouped_date', True)
        save_data.save_data()

        return df
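Counting the grouping column inside the agg dict ('date': 'count') is what produces the tweets-per-day figure, and the rename frees the date name so reset_index can restore it as a column. A toy run of that aggregation (this dict style is from older pandas; newer versions prefer named aggregation):

import pandas as pd

clean_data = pd.DataFrame({
    'date': ['2020-01-01', '2020-01-01', '2020-01-02'],
    'retweet_count': [3, 1, 5],
    'favorite_count': [2, 2, 7],
})

# Sum engagement per day and count rows per day in one pass
df = clean_data.groupby('date') \
    .agg({'retweet_count': 'sum',
          'favorite_count': 'sum',
          'date': 'count'}) \
    .rename({'date': 'tweets_published'}, axis=1) \
    .reset_index()
print(df)  # 2020-01-01: 4 retweets, 4 favorites, 2 tweets published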
Example #11
    def get_df_clean_data(self):
        df = pd.read_csv(self.path_raw_data)

        # Cast the timestamp as a timezone-aware datetime in US/Eastern. The
        # column ts is the timestamp at which the tweet was published
        df['ts'] = pd.to_datetime(df['created_at'], utc=True,
                                  errors='coerce').dt.tz_convert('US/Eastern')

        # The raw data contains JSON strings, but when they are loaded into
        # pandas they come back as plain strings
        cols_to_json = [
            'coordinates', 'entities', 'quoted_status', 'user', 'place'
        ]
        for col in cols_to_json:
            if col in df.columns:
                df[col] = df[col].apply(self._func_to_json)

        # Scraping for tweet ids sometimes returns noise records, so we make
        # sure we don't keep any records with null ids. We also cast the ids
        # back to strings to remove the trailing .0 that the float cast adds
        df['id'] = pd.to_numeric(df.id, errors='coerce')
        df = df[df.id.notnull()]
        df['id'] = df.id.astype(int).astype(str)

        # Cast boolean attributes
        df['is_quote'] = df['is_quote_status'].astype(bool)
        df['is_truncated'] = df['truncated'].astype(bool)

        # Process timestamp
        df['date'] = df['ts'].apply(lambda x: x.date())
        df['year'] = df['ts'].apply(lambda x: x.year)
        df['month_number'] = df['ts'].apply(lambda x: x.month)
        df['month_name'] = df['month_number'].map(self.mapping_months)
        df['day'] = df['ts'].apply(lambda x: x.day)
        df['weekday_num'] = df['ts'].apply(lambda x: x.dayofweek)
        df['date_weekday'] = df['weekday_num'].map(self.mapping_weekdays)
        df['time'] = df['ts'].apply(lambda x: x.time())
        df['hour'] = df['time'].apply(lambda x: x.hour)

        # Process location if available
        df['lon'] = df['coordinates'].apply(lambda x: x['coordinates'][0]
                                            if x is not None else None)
        df['lat'] = df['coordinates'].apply(lambda x: x['coordinates'][1]
                                            if x is not None else None)

        # Get the URL of each tweet
        df['url'] = df['id'].apply(
            lambda x: "https://twitter.com/i/web/status/" + str(x))

        # Process entities
        df['hashtags'] = df['entities'].apply(lambda x: [
            '#' + hashtag['text'].lower() for hashtag in x['hashtags']
        ] if pd.notnull(x) else '')

        df['user_mentions'] = df['entities'].apply(lambda x: [
            '@' + user['screen_name'].lower() for user in x['user_mentions']
        ] if pd.notnull(x) else '')

        # Process in_reply_to_screen_name
        df['reply_to_user'] = df['in_reply_to_screen_name'].apply(
            lambda x: '@' + x if pd.notnull(x) else '')

        # Process language ('es' is the ISO 639-1 code Twitter uses for
        # Spanish)
        df['lang'] = df['lang'].apply(lambda x: 'english' if x == 'en' else
                                      ('spanish' if x == 'es' else 'other'))

        # Process place
        df['country'] = df['place'].apply(lambda x: x['country_code']
                                          if pd.notnull(x) else np.NaN)
        df['city_state'] = df['place'].apply(lambda x: x['full_name']
                                             if pd.notnull(x) else np.NaN)
        df['city'] = df['place'].apply(lambda x: x['name']
                                       if pd.notnull(x) else np.NaN)

        # Process information from the users who published each tweet
        df['user_screen_name'] = df['user'].apply(
            lambda x: x['screen_name'].lower() if pd.notnull(x) else np.NaN)
        df['user_followers_count'] = df['user'].apply(
            lambda x: x['followers_count'] if pd.notnull(x) else 0).astype(int)
        df['user_friends_count'] = df['user'].apply(
            lambda x: x['friends_count'] if pd.notnull(x) else 0).astype(int)
        df['user_statuses_count'] = df['user'].apply(
            lambda x: x['statuses_count'] if pd.notnull(x) else 0).astype(int)
        df['user_location'] = df['user'].apply(lambda x: x['location']
                                               if pd.notnull(x) else np.NaN)
        df['user_ts'] = pd.to_datetime(df['user'].apply(
            lambda x: x['created_at'] if pd.notnull(x) else np.NaN),
                                       utc=True).dt.tz_convert('US/Eastern')

        # Process quoted_status; this pipeline treats a quoted tweet as the
        # retweeted tweet
        if 'quoted_status' in df.columns:
            # True when the tweet quotes another tweet
            df['retweeted_dummy'] = df['quoted_status'].notnull()
            df['retweeted_user_screen_name'] = df['quoted_status'].apply(
                lambda x: x['user']['screen_name']
                if pd.notnull(x) else np.NaN)
            df['retweeted_retweet_count'] = df['quoted_status'].apply(
                lambda x: x['retweet_count'] if pd.notnull(x) else np.NaN)
            df['retweeted_user_followers_count'] = df['quoted_status'].apply(
                lambda x: x['user']['followers_count']
                if pd.notnull(x) else np.NaN)
            df['retweeted_user_friends_count'] = df['quoted_status'].apply(
                lambda x: x['user']['friends_count']
                if pd.notnull(x) else np.NaN)
            df['retweeted_user_statuses_count'] = df['quoted_status'].apply(
                lambda x: x['user']['statuses_count']
                if pd.notnull(x) else np.NaN)
            df['retweeted_user_location'] = df['quoted_status'].apply(
                lambda x: x['user']['location'] if pd.notnull(x) else np.NaN)

        # Process text
        df['text_clean'] = df['full_text'].apply(
            lambda x: self._text_cleaner(x))

        # Drop unnecessary columns ('lang' is kept: it was reprocessed above)
        # noinspection SpellCheckingInspection
        unnecessary_cols = [
            'contributors', 'favorited', 'geo', 'date_time', 'tokenize',
            'date_weekday_num', 'in_reply_to_screen_name',
            'in_reply_to_status_id', 'in_reply_to_status_id_str',
            'in_reply_to_user_id', 'in_reply_to_user_id_str',
            'is_quote_status', 'place', 'possibly_sensitive',
            'quoted_status_id', 'quoted_status_id_str', 'retweeted', 'source',
            'truncated', 'user', 'entities', 'extended_entities',
            'coordinates', 'id_str'
        ]
        df.drop(unnecessary_cols, axis=1, inplace=True, errors='ignore')

        save_data = Save(df, self.save_path, self.keyword, 'clean_data',
                         'df_clean', True)
        save_data.save_data()

        return df
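The loop over cols_to_json calls self._func_to_json, a helper that is not shown in this listing. Its real implementation is unknown; a plausible minimal version, assuming the CSV stores Python-repr-style dicts, might look like this (name and behavior hypothetical):

from ast import literal_eval

import pandas as pd

def _func_to_json(value):
    # Hypothetical sketch: parse a stringified dict loaded from CSV back
    # into a Python dict, mapping nulls and unparsable values to None
    if pd.isnull(value):
        return None
    try:
        return literal_eval(value)
    except (ValueError, SyntaxError):
        return None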