def update_watchlist(self, user, since_tweet_id, page_not_found):
    pre_watchlist_column = self.motor_column.pre_watchlist
    watchlist_column = self.motor_column.watchlist
    self.logger.debug(user)
    doc = yield pre_watchlist_column.find_one({'user.id_str': user['id_str'],
                                               'state': 1})
    if not doc:
        doc = yield pre_watchlist_column.find_one(
            {'user.screen_name': user['screen_name'], 'state': 1})
    if doc:
        yield pre_watchlist_column.remove({'_id': doc['_id']})
        del doc['_id']
    else:
        doc = yield watchlist_column.find_one({'user.id_str': user['id_str'],
                                               'state': 1})
        if not doc:
            return
    if 'campaign_ids' in doc['user']:
        user.update({'campaign_ids': doc['user']['campaign_ids']})
    doc['user'] = user
    doc['since_tweet_id'] = since_tweet_id
    doc['page_not_found'] = page_not_found
    doc['updated_at'] = now_in_drnj_time()
    doc['state'] = 0
    yield watchlist_column.update({'user.id_str': doc['user']['id_str']},
                                  doc, upsert=True)
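# A minimal call sketch for update_watchlist, assuming `db` is the database
# facade that owns motor_column; the user payload and ids are hypothetical.
# It shows the state transition the coroutine implements: a pending entry
# (state 1) in pre_watchlist is consumed and rewritten to watchlist with
# state 0 and a fresh crawl cursor.
from tornado import gen

@gen.coroutine
def refresh_watchlist_entry(db):
    user = {'id_str': '123456', 'screen_name': 'example_user'}  # assumed fields
    yield db.update_watchlist(user,
                              since_tweet_id='987654321',  # new crawl cursor
                              page_not_found=False)        # True after a 404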
def create_campaign(self, params):
    campaigns_column = self.motor_column.campaigns
    params.update({'created_at': now_in_drnj_time()})
    user_id_strs_to_follow = str(params["user_id_strs_to_follow"])
    user_screen_names_to_follow = str(params["user_screen_names_to_follow"])
    # Pop the follow lists: they go to the watchlist, not the campaign document.
    params.pop("user_id_strs_to_follow", None)
    params.pop("user_screen_names_to_follow", None)
    add_to_watchlist(params['campaign_id'], user_id_strs_to_follow,
                     user_screen_names_to_follow)
    yield campaigns_column.insert(params)
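# A sketch of the params document create_campaign expects, inferred from the
# keys it reads; the concrete values (and the 'description' field) are
# illustrative assumptions, not taken from the source.
params = {
    'campaign_id': 'syria',                     # also keys the watchlist entries
    'description': 'example campaign',          # assumed free-form metadata
    'user_id_strs_to_follow': '123456,654321',  # handed to add_to_watchlist
    'user_screen_names_to_follow': 'alice,bob', # handed to add_to_watchlist
}
# yield db.create_campaign(params)  # stamps 'created_at' before inserting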
def prepare_hist_and_plot(self, n_tweets, users, n_bins, campaign_id):
    import numpy
    import matplotlib.pyplot as plot
    plot_graphs = False
    hist = {
        'user_creation': {'data': None, 'bins': None},
        'user_n_tweets': {'data': None, 'bins': None},
        'user_n_tweets_overall': {'data': None, 'bins': None},
        'n_tweets': None,
        'n_unique_users': None,
        'n_default_profile_image': None,
        'n_lower_than_threshold': None,
    }
    self.logger.debug("How many tweets? %d" % n_tweets)
    hist['n_tweets'] = n_tweets
    # Abort if there are more than 200000 tweets.
    if n_tweets > 200000:
        return
    #
    # How many unique users?
    #
    n_unique_users = len(users)
    self.logger.debug("How many unique users? %d" % n_unique_users)
    hist['n_unique_users'] = n_unique_users

    ######
    sec_title = "Histogram of user creation dates"
    tmp_dates = []
    for x in users:
        tmp_date = x['user']['created_at']
        if not isinstance(tmp_date, float):
            tmp_date = py_utc_time2drnj_time(tmp_date)
        tmp_dates.append(tmp_date)
    # tmp_dates = [py_utc_time2drnj_time(x['user']['created_at']) for x in users]
    (hist['user_creation']['data'],
     hist['user_creation']['bins']) = numpy.histogram(tmp_dates, bins=n_bins)
    if plot_graphs:
        bins = hist['user_creation']['bins'][:-1]
        width = (hist['user_creation']['bins'][1]
                 - hist['user_creation']['bins'][0]) / 2
        plot.bar(bins, hist['user_creation']['data'], width=width,
                 align='center')
        xticklabels = [time.strftime('%d %b %Y',
                                     time.gmtime(drnj_time2py_time(x)))
                       for x in bins]
        plot.xticks(bins, xticklabels)
        plot.title(sec_title)
        #plot.show()
        plot.savefig('1.pdf', dpi=600)

    #####
    sec_title = "Histogram of number of tweets of each user in this campaign"
    tmp_counts = [int(x['n_user_tweets']) for x in users]
    (hist['user_n_tweets']['data'],
     hist['user_n_tweets']['bins']) = numpy.histogram(tmp_counts, bins=n_bins)
    if plot_graphs:
        bins = hist['user_n_tweets']['bins'][:-1]
        data = hist['user_n_tweets']['data']
        width = (hist['user_n_tweets']['bins'][1]
                 - hist['user_n_tweets']['bins'][0]) / 2
        plot.bar(bins, data, width=width, align='center')
        xticklabels = bins
        plot.xticks(bins, xticklabels)
        plot.title(sec_title)
        #plot.show()
        plot.savefig('2.pdf', dpi=600)

    #####
    sec_title = "What percentage of them used the default profile image?"
    n_default_profile_image = 0
    for u in users:
        if u['user']['default_profile_image']:
            n_default_profile_image += 1
    hist['n_default_profile_image'] = n_default_profile_image
    self.logger.debug("%s: %0.2f%%" % (
        sec_title,
        100 * (float(n_default_profile_image) / n_unique_users)))

    #####
    sec_title = "Histogram of tweet counts of unique users"
    tmp_counts = [int(x['user']['statuses_count']) for x in users]
    (hist['user_n_tweets_overall']['data'],
     hist['user_n_tweets_overall']['bins']) = numpy.histogram(tmp_counts,
                                                              bins=n_bins)
    if plot_graphs:
        bins = hist['user_n_tweets_overall']['bins'][:-1]
        data = hist['user_n_tweets_overall']['data']
        width = (hist['user_n_tweets_overall']['bins'][1]
                 - hist['user_n_tweets_overall']['bins'][0]) / 2
        plot.bar(bins, data, width=width, align='center')
        xticklabels = bins
        plot.xticks(bins, xticklabels)
        plot.title(sec_title)
        #plot.show()
        plot.savefig('3.pdf', dpi=600)

    #
    sec_title = "What percentage of them have fewer than 5 tweets?"
    n_lower_than_threshold = 0
    for u in users:
        if u['user']['statuses_count'] < 5:
            n_lower_than_threshold += 1
    hist['n_lower_than_threshold'] = n_lower_than_threshold
    self.logger.debug("%s: %0.2f%%" % (
        sec_title,
        100 * (float(n_lower_than_threshold) / n_unique_users)))

    self.logger.debug(hist)

    # Convert numpy arrays to plain Python lists so the document is
    # BSON-serializable.
    for k in hist.keys():
        if isinstance(hist[k], dict):
            for k2 in hist[k].keys():
                if isinstance(hist[k][k2], numpy.ndarray):
                    hist[k][k2] = list(hist[k][k2])
    hist = {
        'campaign_id': campaign_id,
        'histogram': hist,
        'created_at': now_in_drnj_time(),
    }
    return hist
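# A sketch of the `users` records prepare_hist_and_plot iterates over,
# inferred from the fields it reads; the values are made up.
users = [
    {
        'n_user_tweets': 12,            # tweets by this user in the campaign
        'user': {
            'created_at': 1388534400.0, # drnj time, or a date py_utc_time2drnj_time accepts
            'default_profile_image': False,
            'statuses_count': 3500,     # lifetime tweet count
        },
    },
]
# hist_doc = stats.prepare_hist_and_plot(n_tweets=1200, users=users,
#                                        n_bins=20, campaign_id='syria')
# hist_doc['histogram']['user_creation']['data'] is then a plain list.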
def store_friends_or_followers(self, user_id, IDS, drnjID, fof):
    """Stores/updates the list of drenaj user data using raw twitter data.

    IDS -- list of user ids the client crawler reports
    """
    # db = mongo_client[DRENAJ_DB[DRENAJ_APP_ENVIRONMENT]]
    queue_collection = self.application.db.motor_column.queue
    graph_collection = self.application.db.motor_column.graph
    num_new_discovered_users = 0
    num_edges_inserted = 0
    dt = now_in_drnj_time()
    queue_query = {"id": user_id}
    print queue_query
    # ATC: This mechanism requires finding the id twice.
    # With indexing, this may not be a big problem. An alternative is trying
    # to update and catching the pymongo.errors.OperationFailure exception.
    n_ids = yield queue_collection.find(queue_query).count()
    id_exists = n_ids > 0
    if id_exists:
        print 'ID EXISTS'
        queue_document = {
            "$set": {
                fof + "_retrieved_at": dt,
                "retrieved_by": drnjID
            }
        }
        # creates entry if query does not exist
        # queue_collection.update(queue_query, queue_document, upsert=True)
        yield queue_collection.update(queue_query, queue_document)
    else:
        print 'ID DOES NOT EXIST'
        if fof == 'friends':
            queue_document = validate_document(
                new_queue_document(), {
                    "id": user_id,
                    "id_str": str(user_id),
                    "profile_retrieved_at": 0,
                    "friends_retrieved_at": dt,
                    "followers_retrieved_at": 0,
                    "retrieved_by": drnjID
                })
        elif fof == 'followers':
            queue_document = validate_document(
                new_queue_document(), {
                    "id": user_id,
                    "id_str": str(user_id),
                    "profile_retrieved_at": 0,
                    "friends_retrieved_at": 0,
                    "followers_retrieved_at": dt,
                    "retrieved_by": drnjID
                })
        print "QUEUE DOCUMENT: " + str(queue_document)
        yield queue_collection.insert(queue_document)
        num_new_discovered_users += 1
    # Process each user id in IDS.
    for id in reversed(IDS):
        # Insert the newly discovered id into the queue;
        # the insert will be rejected if _id already exists.
        queue_document = validate_document(
            new_queue_document(), {
                "id": id,
                "id_str": str(id),
                "profile_retrieved_at": 0,
                "friends_retrieved_at": 0,
                "followers_retrieved_at": 0,
                "retrieved_by": drnjID
            })
        yield queue_collection.insert(queue_document)
        num_new_discovered_users += 1
        dt = now_in_drnj_time()
        # Edge direction: a 'friends' listing means user_id follows id;
        # a 'followers' listing means id follows user_id.
        if fof == 'friends':
            source = user_id
            sink = id
        elif fof == 'followers':
            source = id
            sink = user_id
        else:
            return
        edge = yield graph_collection.find_one({
            "id": source,
            "friend_id": sink
        })
        if edge is None:
            doc = validate_document(
                new_graph_document(), {
                    'id': source,
                    'friend_id': sink,
                    'id_str': str(source),
                    'friend_id_str': str(sink),
                    'record_retrieved_at': dt,
                    "retrieved_by": drnjID
                })
            yield graph_collection.insert(doc)
            num_edges_inserted += 1
    # TODO: Handle unfollows: find edges that no longer exist, move the old
    # record to graph_history, and add an unfollow record.
    raise Return({
        'num_new_users': num_new_discovered_users,
        'num_new_edges': num_edges_inserted
    })
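# A hedged call sketch for store_friends_or_followers; the handler object and
# ids are hypothetical. With fof='friends', user 42's friend list [7, 9]
# yields edges 42->7 and 42->9 (source follows sink); fof='followers' flips
# the direction.
from tornado import gen
from tornado.gen import Return

@gen.coroutine
def store_friend_list(handler):
    stats = yield handler.store_friends_or_followers(
        user_id=42, IDS=[7, 9], drnjID='crawler-01', fof='friends')
    raise Return(stats)  # {'num_new_users': ..., 'num_new_edges': ...}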
def store_multiple_profiles(self, ids, S, drnjID, campaign_id):
    """Stores a batch of raw twitter user profiles S, wrapping each profile
    (and its latest status, if any) as a tweet record, and returns the ids
    not covered by S.
    """
    # print "Received recent profile of ", v['name'], ' a.k.a. ', v['screen_name']
    db = self.application.db
    queue_coll = db.motor_column.queue
    print S
    for i in range(len(S)):
        if 'status' in S[i]:
            status = S[i]['status']
            del S[i]['status']
        else:
            status = {}
            status['text'] = None
        status['user'] = S[i]
        status['user']['history'] = False
        DB_TEST_VERSION = 0.2
        tweet_dat = validate_document(
            new_tweet_template(), {
                "tweet": status,
                # TODO: Replace this DB_TEST_VERSION with source code
                # version later
                "drenaj_service_version": DB_TEST_VERSION,
                "campaign_id": campaign_id,
                "record_retrieved_at": drnj_time.now_in_drnj_time(),
                "retrieved_by": drnjID,
            },
            fail=False)
        print tweet_dat
        user_id = S[i]['id_str']
        # print profile_dat
        # Check the queue. Plain fields go under $set, while the friends/
        # followers timestamps are only initialized on first insert: MongoDB
        # rejects update documents that mix operators with bare fields, so
        # the operators cannot sit inside the queue document itself.
        now = drnj_time.now_in_drnj_time()
        queue_query = {"id": int(user_id)}  # queue ids are stored as ints
        queue_document = {
            "$set": {
                "id": int(user_id),
                "id_str": user_id,
                "profile_retrieved_at": now,
                "retrieved_by": drnjID
            },
            "$setOnInsert": {
                "friends_retrieved_at": 0,
                "followers_retrieved_at": 0,
            },
        }
        # creates the entry if the query matches nothing
        yield queue_coll.update(queue_query, queue_document, upsert=True)
        # Insert to profiles
        ## profiles_query = {"profile.id": user_id}
        ## prof = profiles_collection.find_and_modify(profiles_query, remove=True)
        ## if prof is not None:
        ##     profiles_history_collection.insert(prof)
        ##
        ## profiles_collection.insert(profile_dat)
        # This call marks the current entries as history;
        # maybe we won't need this for certain queries.
        db.move_to_history(user_id)
        db.insert_tweet(tweet_dat)
        # tweets_collection.insert(tweet_dat)
        ids.remove(int(user_id))
    raise Return(ids)
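# A sketch of the S payload store_multiple_profiles expects: raw twitter user
# objects, optionally carrying their latest status. Only the fields the loop
# touches are shown, and every value is illustrative.
ids = [123456, 654321]
S = [
    {
        'id_str': '123456',
        'screen_name': 'example_user',
        # When present, 'status' is unwrapped and stored as the tweet,
        # with the profile re-attached under tweet['user'].
        'status': {'id_str': '111', 'text': 'hello'},
    },
    {
        'id_str': '654321',
        'screen_name': 'quiet_user',  # no 'status': a stub tweet is stored
    },
]
# remaining = yield handler.store_multiple_profiles(ids, S, 'crawler-01', 'syria')
# remaining == [] since every id in `ids` appeared in S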
def post(self, *args, **keywords):
    """
    `view`
      - params:
        - user_id: user_id of the twitter user whose tweets we want to retrieve.
    `store`
      - params:
        - tweet_data: array of tweets in json format.
    `retweets`
      - params:
        - tweet_id: `id` of the tweet whose retweets we want to retrieve.
    `filter`
      - not implemented yet.
    """
    # The router passes the action name as the first capture group.
    action = args[0]
    print action
    verbose_response = self.get_argument('verbose', '')
    if action == 'view':
        try:
            # TODO: we need to get here again.
            user_id = self.get_argument('user_id')
            if user_id == '':
                # no user_id supplied
                tmp = []
            else:
                tweets_coll = self.application.db.motor_column.tweets
                # running the query
                cursor = tweets_coll.find({
                    'tweet.user.id_str': str(user_id),
                })
                tmp = [x for x in (yield cursor.to_list(length=100))]
            self.write(bson.json_util.dumps({'results': tmp}))
            self.add_header('Content-Type', 'application/json')
        except MissingArgumentError as e:
            # TODO: implement logging.
            raise HTTPError(500, "You didn't supply %s as an argument"
                            % e.arg_name)
    elif action == 'store':
        try:
            tweet_data = self.get_argument('tweet_data')
            campaign_id = self.get_argument('campaign_id', 'default')
            watchlist_related = self.get_argument('watchlist_related', '')
            if tweet_data:
                #tweet_array = bson.json_util.loads(tweet_data, object_hook=self.datetime_hook)
                tweet_array = bson.json_util.loads(tweet_data)
                tmp_tweets = []
                # TODO: Sanity check the data!
                # For example, treat 'entities', 'user' specially.
                DB_TEST_VERSION = 0.2
                for tweet_obj in tweet_array:
                    tweet_obj['user']['history'] = False
                    tmp_tweets.append(validate_document(
                        new_tweet_template(), {
                            "tweet": tweet_obj,
                            # TODO: Replace this DB_TEST_VERSION with source
                            # code version later
                            "drenaj_service_version": DB_TEST_VERSION,
                            # TODO: "retrieved_by": keywords['drnjID'],
                            "retrieved_by": "drenaj",
                            "campaign_id": campaign_id,
                            "record_retrieved_at": drnj_time.now_in_drnj_time(),
                        }, fail=False))
                    ### TODO: add these users later.
                    ### tmp_users.append(validate_document(new_user_template(), tweet_obj['user']))
                if tmp_tweets:
                    self.application.db.insert_tweet(tmp_tweets)
                    # WARN: This functionality is not available for now.
                    # campaign_id, user_objects = extract_arguments(tmp_tweets)
                    # res = app_object.send_task('init_user_to_graph_offline',
                    #     [[campaign_id, bson.json_util.dumps(user_objects)]],
                    #     queue="offline_jobs")
                    # logger.info(str(res))
                    if watchlist_related:
                        print watchlist_related
                        watchlist_related = bson.json_util.loads(watchlist_related)
                        print watchlist_related
                        #self.application.db.update_watchlist(**watchlist_related)
                        update_task_state_in_watchlist(**watchlist_related)
                else:
                    raise HTTPError(500, 'You tried to insert no tweets?!')
                # tweets_coll.insert(tmp_tweets)
            else:
                tmp = []
            if verbose_response:
                self.write(bson.json_util.dumps({'results': tmp}))
            else:
                self.write(bson.json_util.dumps({'results': 'ok'}))
            self.add_header('Content-Type', 'application/json')
            self.finish()
        except MissingArgumentError as e:
            # TODO: implement logging.
            raise HTTPError(500, "You didn't supply %s as an argument"
                            % e.arg_name)
    elif action == 'retweets':
        try:
            tweet_id = self.get_argument('tweet_id')
            self.write('not implemented yet')
        except MissingArgumentError as e:
            # TODO: implement logging.
            raise HTTPError(500, "You didn't supply %s as an argument"
                            % e.arg_name)
    elif action == 'filter':
        try:
            campaign_id = self.get_argument('campaign_id')
            skip = self.get_argument('skip', 0)
            limit = self.get_argument('limit', 100)
            res_format = self.get_argument('format', 'json')
            since_datetime = self.get_argument('since_datetime', -1)
            until_datetime = self.get_argument('until_datetime', -1)
            sort_by_datetime = self.get_argument('sort_by_datetime', 0)
            produce_dbpedia_spotlight_result = self.get_argument(
                'dbpedia_spotlight_result', 0)
            tweets_coll = self.application.db.motor_column.tweets
            query_string = {'campaign_id': '%s' % campaign_id}
            if since_datetime != -1:
                query_string.setdefault('tweet.created_at', {})
                query_string['tweet.created_at']['$gte'] = float(since_datetime)
            if until_datetime != -1:
                query_string.setdefault('tweet.created_at', {})
                query_string['tweet.created_at']['$lt'] = float(until_datetime)
            sort_string = []
            if sort_by_datetime != 0:
                sort_string = [('tweet.created_at', pymongo.ASCENDING)]  # ascending
            print "STARTED " + str(query_string)
            cursor = tweets_coll.find(query_string)
            if sort_string:
                cursor = cursor.sort(sort_string)
            cursor = cursor.skip(int(skip)).limit(int(limit))
            # TODO: removed because of this complaint:
            # TypeError: if no direction is specified, key_or_list must be
            # an instance of list
            # .sort({"$natural": 1})\
            tmp = []
            for x in (yield cursor.to_list(length=100)):
                if int(produce_dbpedia_spotlight_result) == 1:
                    x['tweet']['dbpedia_spotlight_result'] = \
                        self.produce_dbpedia_spotlight_result(x['tweet']['text'])
                tmp.append(x)
            # tmp = [x for x in (yield cursor.to_list(length=100))]
            print "ENDED " + str(query_string)
            DB_TEST_VERSION = 0.2
            if res_format == 'json':
                self.write(bson.json_util.dumps({
                    'results': tmp,
                    # TODO: Replace this DB_TEST_VERSION with source code
                    # version later
                    "drenaj_service_version": DB_TEST_VERSION,
                    # TODO: "requested_by": keywords['drnjID'],
                    "requested_by": "drenaj",
                    "campaign_id": campaign_id,
                    "served_at": drnj_time.now_in_drnj_time(),
                    'skip': int(skip),
                    'limit': int(limit)
                }))
                self.add_header('Content-Type', 'application/json')
            elif res_format == 'html':
                env = Environment(loader=FileSystemLoader('templates'))
                template = env.get_template('statuses/filter.html')
                result = template.render(statuses=[x['tweet'] for x in tmp])
                self.write(result)
            self.finish()
        except MissingArgumentError as e:
            # TODO: implement logging.
            raise HTTPError(500, "You didn't supply %s as an argument"
                            % e.arg_name)
        # db.tweets.find({'campaign_id': 'syria'}).sort({$natural: 1}).skip(10).limit(14)
    else:
        # Tornado ensures this point is never reached as long as its
        # regexp-based router works correctly.
        raise HTTPError(404, 'No such method')
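# A hedged client-side sketch against this handler. The /statuses mount point,
# host, and port are assumptions (inferred from the statuses/filter.html
# template path), and `requests` is just a convenient stand-in HTTP client.
import json
import requests

base = 'http://localhost:8888/statuses'  # assumed URL prefix

# Store one tweet under a campaign...
tweets = [{'id_str': '111', 'text': 'hello',
           'user': {'id_str': '123456', 'screen_name': 'example_user'}}]
requests.post(base + '/store', data={
    'tweet_data': json.dumps(tweets),
    'campaign_id': 'syria',
})

# ...then page through the campaign with `filter`, oldest first.
resp = requests.post(base + '/filter', data={
    'campaign_id': 'syria',
    'skip': 0,
    'limit': 10,
    'sort_by_datetime': 1,
})
print resp.json()['results']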