def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id):
    """Build summary histograms about the users of a campaign.

    Args:
        n_tweets: Total number of tweets in the campaign.
        users: Sequence of per-user dicts. Each one is read as
            ``entry['user']['created_at']``,
            ``entry['user']['default_profile_image']``,
            ``entry['user']['statuses_count']`` and ``entry['n_user_tweets']``.
        n_bins: Number of bins handed to ``numpy.histogram``.
        campaign_id: Identifier stored verbatim in the returned document.

    Returns:
        ``None`` when ``n_tweets`` exceeds 200000 (the computation is
        skipped). Otherwise a dict with the keys ``campaign_id``,
        ``histogram`` and ``created_at``; every numpy array inside
        ``histogram`` is converted to a plain list of native Python numbers.
    """
    import numpy

    # Developer switch: flip to True to also dump the histograms as PDFs.
    plot_graphs = False

    def percentage(count, total):
        # An empty user list would otherwise raise ZeroDivisionError.
        return 100 * (float(count) / total) if total else 0.0

    def save_bar_chart(edges, counts, title, filename, tick_labels=None):
        # Imported lazily so matplotlib is only required when plotting.
        import matplotlib.pyplot as plot

        left_edges = edges[:-1]
        width = (edges[1] - edges[0]) / 2
        plot.bar(left_edges, counts, width=width, align='center')
        if tick_labels is None:
            tick_labels = left_edges
        plot.xticks(left_edges, tick_labels)
        plot.title(title)
        plot.savefig(filename, dpi=600)
        # Close the figure so the next chart does not draw over this one.
        plot.close()

    hist = {
        'user_creation': {
            'data': None,
            'bins': None,
        },
        'user_n_tweets': {
            'data': None,
            'bins': None,
        },
        'user_n_tweets_overall': {
            'data': None,
            'bins': None,
        },
        'n_tweets': None,
        'n_unique_users': None,
        'n_default_profile_image': None,
        'n_lower_than_threshold': None,
    }

    self.logger.debug("How many tweets? %d" % n_tweets)
    hist['n_tweets'] = n_tweets

    # Very large campaigns are skipped entirely; callers receive None.
    if n_tweets > 200000:
        return None

    #
    # How many unique users?
    #
    n_unique_users = len(users)
    self.logger.debug("How many unique users? %d" % n_unique_users)
    hist['n_unique_users'] = n_unique_users

    ######
    sec_title = "Histogram of user creation dates?"
    tmp_dates = []
    for entry in users:
        created_at = entry['user']['created_at']
        # Floats are used as-is; anything else goes through the converter
        # (presumably a Python UTC time -> drnj time; confirm with helper).
        if not isinstance(created_at, float):
            created_at = py_utc_time2drnj_time(created_at)
        tmp_dates.append(created_at)

    (hist['user_creation']['data'],
     hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=n_bins)

    if plot_graphs:
        date_labels = [
            time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x)))
            for x in hist['user_creation']['bins'][:-1]
        ]
        save_bar_chart(hist['user_creation']['bins'],
                       hist['user_creation']['data'],
                       sec_title, '1.pdf', tick_labels=date_labels)

    #####
    sec_title = "Histogram of number of tweets of each user in this campaign"
    tmp_counts = [int(entry['n_user_tweets']) for entry in users]

    (hist['user_n_tweets']['data'],
     hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=n_bins)

    if plot_graphs:
        save_bar_chart(hist['user_n_tweets']['bins'],
                       hist['user_n_tweets']['data'],
                       sec_title, '2.pdf')

    #####
    sec_title = "What percentage of them used the default profile image?"
    n_default_profile_image = sum(
        1 for entry in users if entry['user']['default_profile_image'])
    hist['n_default_profile_image'] = n_default_profile_image
    self.logger.debug("%s: %0.2f%%" % (
        sec_title, percentage(n_default_profile_image, n_unique_users)))

    #####
    sec_title = "Histogram of tweet counts of unique users"
    tmp_counts = [int(entry['user']['statuses_count']) for entry in users]

    (hist['user_n_tweets_overall']['data'],
     hist['user_n_tweets_overall']['bins']) = numpy.histogram(tmp_counts,
                                                              bins=n_bins)

    if plot_graphs:
        save_bar_chart(hist['user_n_tweets_overall']['bins'],
                       hist['user_n_tweets_overall']['data'],
                       sec_title, '3.pdf')

    #
    sec_title = "What percentage of them have lower than 5 tweets?"
    n_lower_than_threshold = sum(
        1 for entry in users if entry['user']['statuses_count'] < 5)
    hist['n_lower_than_threshold'] = n_lower_than_threshold
    self.logger.debug("%s: %0.2f%%" % (
        sec_title, percentage(n_lower_than_threshold, n_unique_users)))

    self.logger.debug(hist)

    # Convert numpy arrays to plain lists. tolist() also turns the elements
    # into native int/float, which list() would leave as numpy scalars.
    for section in hist.values():
        if isinstance(section, dict):
            for name, value in section.items():
                if isinstance(value, numpy.ndarray):
                    section[name] = value.tolist()

    return {
        'campaign_id': campaign_id,
        'histogram': hist,
        'created_at': now_in_drnj_time(),
    }
def insert_tweet(self, tweet_obj_array):
    """Store a batch of tweets and update the per-campaign frequency counters.

    This is a generator: it yields the result of the bulk insert first and
    then the result of every counter ``update`` call, in order. (Presumably
    these are futures driven by a coroutine runner; confirm with the caller.)

    Args:
        tweet_obj_array: Sequence of dicts, each carrying ``'campaign_id'``
            and ``'tweet'`` keys.
    """
    # actual tweet insertion
    yield self.motor_column.tweets.insert(tweet_obj_array)

    def tally(entries, required_attr, make_key):
        # Count occurrences of make_key(entry) over entries that carry
        # required_attr; entries lacking it are ignored.
        counts = {}
        for entry in entries:
            if required_attr not in entry:
                # log this missing attribute.
                continue
            label = make_key(entry)
            counts[label] = counts.get(label, 0) + 1
        return counts

    # (counter collection, key under tweet['entities'], required attribute,
    #  function producing the counter key)
    entity_sources = (
        ('hashtags', 'hashtags', 'text',
         lambda tag: tag['text']),
        ('mentions', 'user_mentions', 'id_str',
         lambda who: "|".join([who['id_str'], who['screen_name']])),
        ('urls', 'urls', 'expanded_url',
         lambda link: link['expanded_url']),
    )

    for wrapped in tweet_obj_array:
        campaign_id = wrapped['campaign_id']
        # strip unnecessary fields
        tweet = wrapped['tweet']

        # build the analytics
        freq = {}
        freq['campaigns'] = {campaign_id: 1}
        for section, source, required_attr, make_key in entity_sources:
            if 'entities' in tweet and source in tweet['entities']:
                found = tweet['entities'][source]
            else:
                found = ()
            freq[section] = tally(found, required_attr, make_key)

        if 'created_at' in tweet:
            # turns out that we've already transformed into drnj_time
            stamp = drnj_time2py_time(tweet['created_at'])
        else:
            stamp = time.time()

        gm_t = time.gmtime(stamp)
        today_str = time.strftime('%Y-%m-%d', gm_t)
        hour = time.strftime('%H', gm_t)
        # Minute of the day, zero-padded to four digits.
        minute = "%04d" % (int(hour) * 60 + int(time.strftime('%M', gm_t)))

        for section in freq:
            for item in freq[section].keys():
                count = freq[section][item]
                selector = {
                    'campaign_id': campaign_id,
                    'date': today_str,
                    'key': item,
                }
                change = {
                    '$inc': {
                        ('hour.%s' % hour): count,
                        ('minute.%s' % minute): count,
                        ('day_total'): count,
                    },
                    '$set': {'last_updated_minute': minute},
                }
                yield self.colls[section].update(selector, change,
                                                 upsert=True)
sec_title = "Histogram of user creation dates?" # tmp_dates = [py_utc_time2drnj_time(x['user']['created_at']) for x in users] (hist['user_creation']['data'], hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=100) if plotGraphs: bins = hist['user_creation']['bins'][:-1] width = (hist['user_creation']['bins'][1] - hist['user_creation']['bins'][0]) / 2 plot.bar(bins, hist['user_creation']['data'], width=width, align='center') xticklabels = [ time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x))) for x in bins ] plot.xticks(bins, xticklabels) plot.title(sec_title) #plot.show() plot.savefig('1.pdf', dpi=600) ##### sec_title = "Histogram of number of tweets of each user in this campaign" tmp_counts = [int(x['n_user_tweets']) for x in users] # (hist['user_n_tweets']['data'], hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=100)
def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id):
    """Build summary histograms about the users of a campaign.

    Args:
        n_tweets: Total number of tweets in the campaign.
        users: Sequence of per-user dicts. Each one is read as
            ``entry['user']['created_at']``,
            ``entry['user']['default_profile_image']``,
            ``entry['user']['statuses_count']`` and ``entry['n_user_tweets']``.
        n_bins: Number of bins handed to ``numpy.histogram``.
        campaign_id: Identifier stored verbatim in the returned document.

    Returns:
        ``None`` when ``n_tweets`` exceeds 200000 (the computation is
        skipped). Otherwise a dict with the keys ``campaign_id``,
        ``histogram`` and ``created_at``; every numpy array inside
        ``histogram`` is converted to a plain list of native Python numbers.
    """
    import numpy

    # Developer switch: flip to True to also dump the histograms as PDFs.
    plot_graphs = False

    def percentage(count, total):
        # An empty user list would otherwise raise ZeroDivisionError.
        return 100 * (float(count) / total) if total else 0.0

    def save_bar_chart(edges, counts, title, filename, tick_labels=None):
        # Imported lazily so matplotlib is only required when plotting.
        import matplotlib.pyplot as plot

        left_edges = edges[:-1]
        width = (edges[1] - edges[0]) / 2
        plot.bar(left_edges, counts, width=width, align='center')
        if tick_labels is None:
            tick_labels = left_edges
        plot.xticks(left_edges, tick_labels)
        plot.title(title)
        plot.savefig(filename, dpi=600)
        # Close the figure so the next chart does not draw over this one.
        plot.close()

    hist = {
        'user_creation': {
            'data': None,
            'bins': None,
        },
        'user_n_tweets': {
            'data': None,
            'bins': None,
        },
        'user_n_tweets_overall': {
            'data': None,
            'bins': None,
        },
        'n_tweets': None,
        'n_unique_users': None,
        'n_default_profile_image': None,
        'n_lower_than_threshold': None,
    }

    self.logger.debug("How many tweets? %d" % n_tweets)
    hist['n_tweets'] = n_tweets

    # Very large campaigns are skipped entirely; callers receive None.
    if n_tweets > 200000:
        return None

    #
    # How many unique users?
    #
    n_unique_users = len(users)
    self.logger.debug("How many unique users? %d" % n_unique_users)
    hist['n_unique_users'] = n_unique_users

    ######
    sec_title = "Histogram of user creation dates?"
    tmp_dates = []
    for entry in users:
        created_at = entry['user']['created_at']
        # Floats are used as-is; anything else goes through the converter
        # (presumably a Python UTC time -> drnj time; confirm with helper).
        if not isinstance(created_at, float):
            created_at = py_utc_time2drnj_time(created_at)
        tmp_dates.append(created_at)

    (hist['user_creation']['data'],
     hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=n_bins)

    if plot_graphs:
        date_labels = [
            time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x)))
            for x in hist['user_creation']['bins'][:-1]
        ]
        save_bar_chart(hist['user_creation']['bins'],
                       hist['user_creation']['data'],
                       sec_title, '1.pdf', tick_labels=date_labels)

    #####
    sec_title = "Histogram of number of tweets of each user in this campaign"
    tmp_counts = [int(entry['n_user_tweets']) for entry in users]

    (hist['user_n_tweets']['data'],
     hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=n_bins)

    if plot_graphs:
        save_bar_chart(hist['user_n_tweets']['bins'],
                       hist['user_n_tweets']['data'],
                       sec_title, '2.pdf')

    #####
    sec_title = "What percentage of them used the default profile image?"
    n_default_profile_image = sum(
        1 for entry in users if entry['user']['default_profile_image'])
    hist['n_default_profile_image'] = n_default_profile_image
    self.logger.debug("%s: %0.2f%%" % (
        sec_title, percentage(n_default_profile_image, n_unique_users)))

    #####
    sec_title = "Histogram of tweet counts of unique users"
    tmp_counts = [int(entry['user']['statuses_count']) for entry in users]

    (hist['user_n_tweets_overall']['data'],
     hist['user_n_tweets_overall']['bins']) = numpy.histogram(tmp_counts,
                                                              bins=n_bins)

    if plot_graphs:
        save_bar_chart(hist['user_n_tweets_overall']['bins'],
                       hist['user_n_tweets_overall']['data'],
                       sec_title, '3.pdf')

    #
    sec_title = "What percentage of them have lower than 5 tweets?"
    n_lower_than_threshold = sum(
        1 for entry in users if entry['user']['statuses_count'] < 5)
    hist['n_lower_than_threshold'] = n_lower_than_threshold
    self.logger.debug("%s: %0.2f%%" % (
        sec_title, percentage(n_lower_than_threshold, n_unique_users)))

    self.logger.debug(hist)

    # Convert numpy arrays to plain lists. tolist() also turns the elements
    # into native int/float, which list() would leave as numpy scalars.
    for section in hist.values():
        if isinstance(section, dict):
            for name, value in section.items():
                if isinstance(value, numpy.ndarray):
                    section[name] = value.tolist()

    return {
        'campaign_id': campaign_id,
        'histogram': hist,
        'created_at': now_in_drnj_time(),
    }
def insert_tweet(self, tweet_obj_array):
    """Store a batch of tweets and update the per-campaign frequency counters.

    This is a generator: it yields the result of the bulk insert first and
    then the result of every counter ``update`` call, in order. (Presumably
    these are futures driven by a coroutine runner; confirm with the caller.)

    Args:
        tweet_obj_array: Sequence of dicts, each carrying ``'campaign_id'``
            and ``'tweet'`` keys.
    """
    # actual tweet insertion
    yield self.motor_column.tweets.insert(tweet_obj_array)

    def tally(entries, required_attr, make_key):
        # Count occurrences of make_key(entry) over entries that carry
        # required_attr; entries lacking it are ignored.
        counts = {}
        for entry in entries:
            if required_attr not in entry:
                # log this missing attribute.
                continue
            label = make_key(entry)
            counts[label] = counts.get(label, 0) + 1
        return counts

    # (counter collection, key under tweet['entities'], required attribute,
    #  function producing the counter key)
    entity_sources = (
        ('hashtags', 'hashtags', 'text',
         lambda tag: tag['text']),
        ('mentions', 'user_mentions', 'id_str',
         lambda who: "|".join([who['id_str'], who['screen_name']])),
        ('urls', 'urls', 'expanded_url',
         lambda link: link['expanded_url']),
    )

    for wrapped in tweet_obj_array:
        campaign_id = wrapped['campaign_id']
        # strip unnecessary fields
        tweet = wrapped['tweet']

        # build the analytics
        freq = {}
        freq['campaigns'] = {campaign_id: 1}
        for section, source, required_attr, make_key in entity_sources:
            if 'entities' in tweet and source in tweet['entities']:
                found = tweet['entities'][source]
            else:
                found = ()
            freq[section] = tally(found, required_attr, make_key)

        if 'created_at' in tweet:
            # turns out that we've already transformed into drnj_time
            stamp = drnj_time2py_time(tweet['created_at'])
        else:
            stamp = time.time()

        gm_t = time.gmtime(stamp)
        today_str = time.strftime('%Y-%m-%d', gm_t)
        hour = time.strftime('%H', gm_t)
        # Minute of the day, zero-padded to four digits.
        minute = "%04d" % (int(hour) * 60 + int(time.strftime('%M', gm_t)))

        for section in freq:
            for item in freq[section].keys():
                count = freq[section][item]
                selector = {
                    'campaign_id': campaign_id,
                    'date': today_str,
                    'key': item,
                }
                change = {
                    '$inc': {
                        ('hour.%s' % hour): count,
                        ('minute.%s' % minute): count,
                        ('day_total'): count,
                    },
                    '$set': {'last_updated_minute': minute},
                }
                yield self.colls[section].update(selector, change,
                                                 upsert=True)
hist['n_unique_users'] = n_unique_users ###### sec_title = "Histogram of user creation dates?" # tmp_dates = [py_utc_time2drnj_time(x['user']['created_at']) for x in users] (hist['user_creation']['data'], hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=100) if plotGraphs: bins = hist['user_creation']['bins'][:-1] width = (hist['user_creation']['bins'][1] - hist['user_creation']['bins'][0])/2 plot.bar(bins, hist['user_creation']['data'], width=width, align='center') xticklabels = [time.strftime('%d %b %Y', time.gmtime(drnj_time2py_time(x))) for x in bins] plot.xticks(bins, xticklabels) plot.title(sec_title) #plot.show() plot.savefig('1.pdf', dpi=600) ##### sec_title = "Histogram of number of tweets of each user in this campaign" tmp_counts = [int(x['n_user_tweets']) for x in users] # (hist['user_n_tweets']['data'], hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=100) if plotGraphs: bins = hist['user_n_tweets']['bins'][:-1] data = hist['user_n_tweets']['data']