def __db_trustworthy_users(db_users, db_tweets, config):
    """
    Build (or reuse) the collection of trustworthy users.

    A user is trusted when her account is verified or when she has more
    than ``config['min_num_followers']`` followers. The collection is only
    populated when it is empty; otherwise it is returned as it is.

    :param db_users: database of users, every document yielded by its
    ``find_all()`` is expected to carry a ``screen_name``
    :param db_tweets: database of tweets, handed to ``get_user`` together
    with the screen name to obtain the profile data of the user
    :param config: dictionary with the configuration parameters of the
    heuristic (``min_num_followers`` is the one read here)
    :return: database of trustworthy users
    """
    trusted_db = DBManager('trustworthy_users')
    if trusted_db.num_records_collection() != 0:
        # the collection was already generated, nothing to do
        return trusted_db

    logging.info('The trustworthy_users collection is being created...')
    for user_doc in db_users.find_all():
        profile = get_user(db_tweets, user_doc['screen_name'])
        is_trusted = profile['verified'] or \
            int(profile['followers_count']) > config['min_num_followers']
        if not is_trusted:
            continue
        # avoid saving the same user twice
        already_saved = trusted_db.find_record(
            {'screen_name': profile['screen_name']})
        if already_saved:
            continue
        record = {
            'screen_name': user_doc['screen_name'],
            'name': profile['name'],
            'created_at': profile['created_at'],
            'followers_count': profile['followers_count'],
            'verified': profile['verified']
        }
        trusted_db.save_record(record)
    return trusted_db
class NetworkAnalyzer:
    """
    Build the network of interactions among the users found in the tweets.

    The class reads from the ``tweets``, ``users`` and ``networks``
    collections (through ``DBManager``), keeps the generated nodes and
    edges in memory and can export them to a GEXF file or load them into
    a directed graph.
    """
    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    # The containers below are created per instance in __init__; keeping
    # mutable sets at class level would make all the instances share (and
    # pollute) the same nodes and unknown users
    __nodes = None
    __unknown_users = None
    __node_sizes = None

    def __init__(self):
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []
        self.__nodes = set()
        self.__unknown_users = set()

    def __computer_ff_ratio(self, friends, followers):
        """
        Compute the friends/followers ratio.

        :param friends: number of friends of the user
        :param followers: number of followers of the user
        :return: friends divided by followers or 0 when any of the two
        is not greater than zero
        """
        if followers > 0 and friends > 0:
            return friends / followers
        else:
            return 0

    @staticmethod
    def __summarize_interactions(pairs):
        """
        Aggregate interactions by type.

        :param pairs: iterable of (screen_name, interactions) tuples where
        interactions is a dictionary with the key 'total' and, optionally,
        the keys 'retweets', 'replies', 'mentions' and 'quotes'
        :return: dictionary that holds, for 'total' and for each type of
        interaction, the accumulated count and the per-user details
        """
        optional_kinds = ('retweets', 'replies', 'mentions', 'quotes')
        counts = {'total': 0}
        details = {'total': {}}
        for kind in optional_kinds:
            counts[kind] = 0
            details[kind] = {}
        for screen_name, interactions in pairs:
            details['total'][screen_name] = interactions['total']
            counts['total'] += interactions['total']
            for kind in optional_kinds:
                if kind in interactions:
                    counts[kind] += interactions[kind]
                    details[kind][screen_name] = interactions[kind]
        summary = {}
        for kind in ('total', 'replies', 'retweets', 'mentions', 'quotes'):
            summary[kind] = {'count': counts[kind], 'details': details[kind]}
        return summary

    # Get interactions in of a given users
    def get_in_interactions(self, user_screen_name):
        """
        Collect the interactions received by a user, meaning, interactions
        in which the user was mentioned, retweeted, quoted or replied.

        :param user_screen_name: screen name of the user of interest
        :return: dictionary with the key 'in_interactions' that contains
        the counts and details of the interactions grouped by type
        """
        in_inter_query = {'interactions.' + user_screen_name: {'$exists': 1},
                          'screen_name': {'$ne': user_screen_name}}
        n_users = self.__dbm_users.search(in_inter_query)
        pairs = (
            (n_user['screen_name'], n_user['interactions'][user_screen_name])
            for n_user in n_users
            if user_screen_name in n_user['interactions']
        )
        return {'in_interactions': self.__summarize_interactions(pairs)}

    # Get interactions out of a given users
    def get_out_interactions(self, user_screen_name):
        """
        Collect the interactions originated by a user.

        :param user_screen_name: screen name of the user of interest
        :return: dictionary with the key 'out_interactions' that contains
        the counts and details of the interactions grouped by type
        :raises IndexError: presumably when the user is not in the users
        collection (the first result of the search is taken directly)
        """
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        pairs = user['interactions'].items()
        return {'out_interactions': self.__summarize_interactions(pairs)}

    def create_users_db(self, clear_collection=False):
        """
        Create (or update) the users collection from the unique users
        found in the tweets collection.

        :param clear_collection: when True the users collection is emptied
        before being populated
        """
        logging.info('::. Network Analyzer: Creating database of users, it can take several minutes, please wait_')
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info('::. Network Analyzer: Extracted {0} unique users from the database...'.format(users_count))
        for progress, user in enumerate(users, start=1):
            screen_name = user['screen_name']
            db_user = {
                'screen_name': screen_name,
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'], user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']
            }
            # Assign the party and movement the user interacted the most
            # with; the first element returned by the database is taken
            user_parties = self.__dbm_tweets.get_party_user(screen_name)
            user_parties_count = len(user_parties)
            logging.debug('::. Network Analyzer: User {0} has {1} associated parties...'
                          .format(screen_name, user_parties_count))
            if user_parties_count > 0:
                db_user['most_interacted_party'] = user_parties[0]['partido']
                user_movements = self.__dbm_tweets.get_movement_user(screen_name)
                user_movements_count = len(user_movements)
                logging.debug('::. Network Analyzer: User {0} has {1} associated movements...'
                              .format(screen_name, user_movements_count))
                if user_movements_count > 0:
                    db_user['most_interacted_movement'] = user_movements[0]['movimiento']
                else:
                    db_user['most_interacted_movement'] = ''
            else:
                # the key 'movement' used to be set here, but it is always
                # overwritten below, so 'most_interacted_movement' was
                # left undefined for users without associated parties
                db_user.update({'most_interacted_party': '',
                                'most_interacted_movement': ''})
            # Assign the party and movement according to the political
            # preference computed for the user
            upp = UserPoliticalPreference()
            user_party = upp.get_user_political_party(screen_name)
            user_movement = upp.get_user_political_movement(screen_name)
            db_user.update({'party': user_party, 'movement': user_movement})
            filter_query = {'screen_name': screen_name}
            logging.debug('::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                          .format(screen_name, progress, users_count))
            self.__dbm_users.update_record(filter_query, db_user, create_if_doesnt_exist=True)

    def __node_from_user(self, user):
        """
        Build the attributes of a node from a document of the users
        collection.

        :param user: dictionary with the data of the user
        :return: dictionary with the keys screen_name, party, movement,
        ff_ratio and pbb (empty string if the user has no bot analysis)
        """
        if 'ff_ratio' in user:
            ff_ratio = user['ff_ratio']
        else:
            ff_ratio = self.__computer_ff_ratio(user['friends'], user['followers'])
        pbb_score = user['bot_analysis']['pbb'] if 'bot_analysis' in user else ''
        return {'screen_name': user['screen_name'],
                'party': user['party'],
                'movement': user['movement'],
                'ff_ratio': ff_ratio,
                'pbb': pbb_score}

    def generate_network(self, subnet_query=None, depth=1, file_name='network', override_net=False):
        """
        Generate the network of interactions and save it in a gexf file.

        :param subnet_query: query used to select the users that compose
        the network, all of the users are selected if it is not given
        :param depth: when greater than 1, users that are not in the users
        collection are still included if their friends/followers ratio can
        be obtained from the tweets collection
        :param file_name: name (without extension) of the gexf file
        :param override_net: generate the network even if a network with
        the same query and depth was already registered
        """
        subnet_query = {} if subnet_query is None else subnet_query
        net_query = subnet_query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        if ret_net.count() != 0 and not override_net:
            f_net = ret_net[0]
            logging.info('The network was already generated, please find it at {0}'.format(f_net['file_name']))
            return
        # the net doesn't exist yet, let's create it
        logging.info('Generating the network, it can take several minutes, please wait_')
        users = self.__dbm_users.search(subnet_query)
        # for each user generate his/her edges
        for user in users:
            source = self.__node_from_user(user)
            self.__nodes.add(tuple(source.items()))
            for interacted_user, interactions in user['interactions'].items():
                iuser = self.__dbm_users.find_record({'screen_name': interacted_user})
                if iuser:
                    # the attributes (pbb included) are taken from the
                    # interacted user, not from the source user
                    target = self.__node_from_user(iuser)
                else:
                    iuser_ffratio = self.__get_ffratio(interacted_user) if depth > 1 else None
                    if not iuser_ffratio:
                        self.__unknown_users.add(interacted_user)
                        continue
                    # the user is known only through the tweets, so there
                    # is no party, movement nor bot analysis to report
                    target = {'screen_name': interacted_user,
                              'party': '',
                              'movement': '',
                              'ff_ratio': iuser_ffratio,
                              'pbb': ''}
                self.__nodes.add(tuple(target.items()))
                edge = {
                    'nodeA': dict(source),
                    # the edge references the name found in the interactions
                    'nodeB': dict(target, screen_name=interacted_user),
                    'weight': interactions['total']
                }
                self.__network.append(edge)
        logging.info('Created a network of {0} nodes and {1} edges'.format(len(self.__nodes),
                                                                             len(self.__network)))
        logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
        # save the net in a gefx file for posterior usage
        f_name = self.save_network_in_gexf_format(file_name)
        logging.info('Saved the network in the file {0}'.format(f_name))
        db_net = {'file_name': str(f_name)}
        db_net.update(net_query)
        self.__dbm_networks.save_record(db_net)

    def create_graph(self):
        """
        Create a directed graph from the edges generated so far; the
        weight of each edge is the number of interactions.
        """
        logging.info('Creating the graph, please wait_')
        self.__graph = net.DiGraph()
        for edge in self.__network:
            self.__graph.add_edge(edge['nodeA']['screen_name'],
                                  edge['nodeB']['screen_name'],
                                  weight=int(edge['weight']))
        return

    def get_graph_nodes(self):
        """Return the number of nodes generated so far."""
        return len(self.__nodes)

    def get_graph_edges(self):
        """Return the number of edges generated so far."""
        return len(self.__network)

    def get_graph(self):
        """Return the graph built by create_graph (None if not built)."""
        return self.__graph

    def get_node_sizes(self):
        """Return the sizes of the nodes (never assigned in this class)."""
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        """
        Obtain the friends/followers ratio of a user from a tweet in which
        she appears as author, retweeted author or quoted author.

        :param screen_name: screen name of the user
        :return: friends/followers ratio, None if no tweet was found
        """
        query = {
            '$or': [
                {'tweet_obj.user.screen_name': screen_name},
                {'tweet_obj.retweeted_status.user.screen_name': screen_name},
                {'tweet_obj.quoted_status.user.screen_name': screen_name}
            ]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if not tweet_obj:
            return None
        tweet = tweet_obj['tweet_obj']
        profiles = []
        for status_key in ('retweeted_status', 'quoted_status'):
            profile = (tweet.get(status_key) or {}).get('user')
            if profile:
                profiles.append(profile)
        if tweet.get('user'):
            profiles.append(tweet['user'])
        if not profiles:
            return None
        # the tweet can match because of any of its three users, so pick
        # the profile that belongs to the requested user; if none matches
        # exactly use the first available (retweeted, quoted, author)
        chosen = profiles[0]
        for profile in profiles:
            if profile.get('screen_name') == screen_name:
                chosen = profile
                break
        return self.__computer_ff_ratio(chosen['friends_count'], chosen['followers_count'])

    def save_network_in_gexf_format(self, file_name):
        """
        Save the nodes and edges generated so far in a gexf file.

        :param file_name: name (without extension) of the file, which is
        saved in the directory sna/gefx located two levels above the
        directory of this script
        :return: path of the generated file
        """
        from xml.sax.saxutils import escape

        def attr(value):
            # values go inside double-quoted attributes, so quotes must
            # be escaped in addition to &, < and >
            return escape(str(value), {'"': '&quot;'})

        today = datetime.now().strftime('%m/%d/%y')
        f_name = pathlib.Path(__file__).parents[2].joinpath('sna', 'gefx', file_name + '.gexf')
        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            '<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" '
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd" '
            'version="1.2">\n',
            '<meta lastmodifieddate="{0}">\n'.format(today),
            '<creator>PoliticBots</creator>\n',
            '<description>{0}</description>\n'.format(escape(str(file_name))),
            '</meta>\n',
            '<graph mode="static" defaultedgetype="directed">\n',
            # add data attributes
            '<attributes class="node">\n',
            '<attribute id="0" title="party" type="string"/>\n',
            '<attribute id="1" title="movement" type="string"/>\n',
            '<attribute id="2" title="ff_ratio" type="float"/>\n',
            '<attribute id="3" title="pbb" type="float"/>\n',
            '</attributes>\n',
        ]
        # add nodes
        lines.append('<nodes>\n')
        node_ids = {}
        for node_id, node_tup in enumerate(self.__nodes):
            node = dict(node_tup)
            lines.append('<node id="{0}" label="{1}">\n'.format(node_id, attr(node['screen_name'])))
            lines.append('<attvalues>\n')
            lines.append('<attvalue for="0" value="{0}"/>\n'.format(attr(node['party'])))
            lines.append('<attvalue for="1" value="{0}"/>\n'.format(attr(node['movement'])))
            lines.append('<attvalue for="2" value="{0}"/>\n'.format(attr(node['ff_ratio'])))
            lines.append('<attvalue for="3" value="{0}"/>\n'.format(attr(node['pbb'])))
            lines.append('</attvalues>\n')
            lines.append('</node>\n')
            # a screen name may appear in more than one node (different
            # attributes); edges point to the first one
            node_ids.setdefault(node['screen_name'], node_id)
        lines.append('</nodes>\n')
        # add edges
        lines.append('<edges>\n')
        for edge_id, edge in enumerate(self.__network):
            id_vertexA = node_ids[edge['nodeA']['screen_name']]
            id_vertexB = node_ids[edge['nodeB']['screen_name']]
            lines.append('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'
                         .format(edge_id, id_vertexA, id_vertexB, attr(edge['weight'])))
        lines.append('</edges>\n')
        lines.append('</graph>\n')
        lines.append('</gexf>\n')
        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.writelines(lines)
        return f_name
class TweetEvaluator:
    """
    Evaluate the tweets of the 'tweets' collection: identify the tweets
    that are relevant (mention candidates or campaign hashtags) and fix
    some metadata fields of the stored tweets.
    """
    special_chars = r'[=\+/&<>;:\'\"\?%$!¡\,\. \t\r\n]+'
    hashtags, user_handlers = [], []
    __dbm = None
    BATCH_SIZE = 1000

    def __init__(self):
        self.user_handlers, self.hashtags = get_user_handlers_and_hashtags()
        self.__dbm = DBManager('tweets')

    def __is_relevant(self, users_counter, hashtags_counter):
        """
        A tweet is considered relevant if it fulfills one of two
        conditions: candidates are mentioned or, if candidates are not
        mentioned, there is more than one campaign hashtag.

        :param users_counter: number of candidates mentioned
        :param hashtags_counter: number of campaign hashtags found
        :return: True if the tweet is relevant, False otherwise
        """
        return users_counter > 0 or hashtags_counter > 1

    def __assess_tweet_by_text(self, tweet_text):
        """
        Assess the relevance of a tweet by looking for user handlers and
        hashtags in the tokens of its text.

        :param tweet_text: text of the tweet
        :return: True if the tweet is relevant, False otherwise
        """
        tweet_text = tweet_text.replace(u'\u2026', '')  # remove ellipsis unicode char
        users_counter, hashtags_counter = 0, 0
        for raw_token in tweet_text.split():
            token = re.sub(self.special_chars, '', raw_token).lower()  # remove special chars
            if token in self.user_handlers:
                users_counter += 1
            if token in self.hashtags:
                hashtags_counter += 1
        return self.__is_relevant(users_counter, hashtags_counter)

    def __assess_tweet_by_entities(self, tweet_hashtags, tweet_mentions):
        """
        Assess the relevance of a tweet by looking at its entities.

        :param tweet_hashtags: list of dictionaries with the key 'text'
        :param tweet_mentions: list of dictionaries with the key
        'screen_name'
        :return: True if the tweet is relevant, False otherwise
        """
        hashtags_counter = sum(
            1 for tweet_hashtag in tweet_hashtags
            if '#' + tweet_hashtag['text'].lower() in self.hashtags)
        users_counter = sum(
            1 for tweet_mention in tweet_mentions
            if '@' + tweet_mention['screen_name'].lower() in self.user_handlers)
        return self.__is_relevant(users_counter, hashtags_counter)

    def is_tweet_relevant(self, tweet):
        """
        Decide whether a tweet is relevant.

        Tweets authored by one of the user handlers are always relevant.
        Otherwise the original tweet (the retweeted one in case of
        retweets) is assessed by its entities or, if it doesn't have
        entities, by its text.

        :param tweet: dictionary with the tweet object
        :return: True if the tweet is relevant, False otherwise
        """
        tweet_handler = '@{0}'.format(tweet['user']['screen_name'].lower())
        if tweet_handler in self.user_handlers:
            return True
        original_tweet = tweet['retweeted_status'] if 'retweeted_status' in tweet else tweet
        if 'entities' in original_tweet:
            return self.__assess_tweet_by_entities(
                original_tweet['entities']['hashtags'],
                original_tweet['entities']['user_mentions'])
        # the text is read from the same object that was checked; reading
        # it from the retweet fails when only the original has full_text
        if 'full_text' in original_tweet:
            return self.__assess_tweet_by_text(original_tweet['full_text'])
        return self.__assess_tweet_by_text(original_tweet['text'])

    def __mark_relevance_rt(self, tweet_reg):
        """
        Copy the relevance flag of a tweet to its retweets.

        :param tweet_reg: record of the original tweet, it must contain
        the keys 'tweet_obj' and 'relevante'
        """
        logging.info('Marking RTS...')
        query = {
            'tweet_obj.retweeted_status': {'$exists': 1},
            'tweet_obj.retweeted_status.id_str': {'$eq': tweet_reg['tweet_obj']['id_str']},
            'relevante': {'$ne': tweet_reg['relevante']}
        }
        update = {'$set': {'relevante': tweet_reg['relevante']}}
        update_res = self.__dbm.update_record_many(query, update)
        logging.info('Marked {0} RTS...'.format(update_res.matched_count))

    def identify_relevant_tweets(self):
        """
        Mark every original tweet that doesn't have the flag 'relevante'
        as relevant (1) or irrelevant (0) and copy the flag to its
        retweets.

        Tweets are processed in batches of BATCH_SIZE records. A tweet
        that fails is logged and skipped so the remaining tweets are still
        processed and the loop always finishes; since failed tweets stay
        unmarked they are selected again by later batches.

        :return: True
        """
        # select only original tweets that are not marked as relevant
        query = {
            'relevante': {'$exists': 0},
            'tweet_obj.retweeted_status': {'$exists': 0}
        }
        logging.info('Relevant Tweets: Running query to count...')
        # processing by batch as workaround cursor not found error
        total_tweets = self.__dbm.search(query, only_relevant_tws=False).count()
        total_batches = ceil(total_tweets / self.BATCH_SIZE)
        for batch in range(1, total_batches + 1):
            logging.info(
                'Querying records in batches of {0} records...'.format(
                    self.BATCH_SIZE))
            search_res = self.__dbm.search(
                query, only_relevant_tws=False).limit(self.BATCH_SIZE)
            logging.info('Loading batch {0}/{1} into memory...'.format(
                batch, total_batches))
            tweets = list(search_res)
            total_tweets_batch = self.BATCH_SIZE
            if batch == total_batches:
                total_tweets_batch = len(tweets)
            logging.info(
                'Identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                .format(batch, total_batches, total_tweets_batch))
            for tweet_counter, tweet_reg in enumerate(tweets, start=1):
                try:
                    tweet = tweet_reg['tweet_obj']
                    if self.is_tweet_relevant(tweet):
                        tweet_reg['relevante'] = 1
                        logging.info(
                            'Identifying {0}/{1} tweets (relevant)'.format(
                                tweet_counter, total_tweets))
                    else:
                        tweet_reg['relevante'] = 0
                        logging.info(
                            'Identifying {0}/{1} tweets (irrelevant)'.format(
                                tweet_counter, total_tweets))
                    self.__dbm.update_record(
                        {'tweet_obj.id_str': tweet['id_str']}, tweet_reg)
                    # copy the relevance flag to rts
                    self.__mark_relevance_rt(tweet_reg)
                except Exception:
                    # best effort: report the problem (with traceback) and
                    # go on with the next tweet instead of retrying the
                    # same batch forever
                    logging.exception('Exception occurred...')
            logging.info(
                'Finished identifying relevant tweets in batch {0}/{1} out of {2} tweets...'
                .format(batch, total_batches, total_tweets_batch))
        logging.info('Finished identifying relevant tweets...')
        return True

    # set to 'user' the type of tweets which keyword contains @
    def fix_tweet_type(self):
        """
        Set to 'user' the type of the tweets of type 'hashtag' whose
        keyword contains @.

        :return: number of records matched by the query
        """
        query = {'type': 'hashtag', 'keyword': {'$regex': '@'}}
        objs = self.__dbm.search(query)
        num_fixed_tweets = objs.count()
        for obj in objs:
            obj['type'] = 'user'
            self.__dbm.save_record(obj)
        return num_fixed_tweets

    def __get_hashtags(self, hashtags_list):
        """Return the text of each hashtag entity of the given list."""
        return [ht['text'] for ht in hashtags_list]

    def __get_screen_names(self, screen_names_list):
        """Return the screen name, prefixed by @, of each mention entity."""
        return ['@' + sc['screen_name'] for sc in screen_names_list]

    # fix value of candidatura if hashtags related to a candidacy
    # are present in the text of the tweet
    def fix_value_of_candidatura(self):
        """
        Fill the field 'candidatura' of the tweets that don't have it when
        the tweet mentions a candidate or contains a hashtag related to a
        candidacy of the party (and movement) of the tweet.

        The keywords and their metadata are read from the file config.json
        located in the parent directory of the directory of this script.

        :return: number of tweets fixed
        """
        script_parent_dir = pathlib.Path(__file__).parents[1]
        config_fn = script_parent_dir.joinpath('config.json')
        configuration = get_config(config_fn)
        keyword, k_metadata = parse_metadata(configuration['metadata'])
        interested_data = []
        # keep metadata that refer to candidacies
        for kword, kmetada in zip(keyword, k_metadata):
            if kmetada['candidatura'] != '':
                kmetada.update({'keyword': kword})
                interested_data.append(kmetada)
        query = {'candidatura': ''}
        # select tweets without candidacy
        s_objs = self.__dbm.search(query)
        num_fixed_tweets = 0
        # iterate over tweets without candidacy and fix those
        # whose text mention a candidate or have hashtags
        # related to a candidacy
        for s_obj in s_objs:
            party = s_obj['partido_politico']
            movement = s_obj['movimiento']
            tweet = s_obj['tweet_obj']
            # keep metadata related to the political party
            # (and movement, if the tweet has one) of the tweet
            relevant_data = [
                ida for ida in interested_data
                if ida['partido_politico'] == party
                and (movement == '' or ida['movimiento'] == movement)
            ]
            if not relevant_data:
                continue
            # extract relevant information of the tweet: hashtags and
            # mentions if the tweet obj has these entities, otherwise the
            # text of the tweet
            original_tweet = tweet['retweeted_status'] if 'retweeted_status' in tweet else tweet
            candidacy = ''
            if 'entities' in original_tweet:
                t_user_mentions = self.__get_screen_names(
                    original_tweet['entities']['user_mentions'])
                t_hashtags = self.__get_hashtags(
                    original_tweet['entities']['hashtags'])
                # see if the interested keywords are part of the tweet
                # hashtags or mentions
                for rd in relevant_data:
                    if rd['keyword'] in t_user_mentions or rd['keyword'] in t_hashtags:
                        candidacy = rd['candidatura']
                        break
            else:
                # read the text from the object that was checked; the
                # retweet may not have (or may truncate) the text
                if 'full_text' in original_tweet:
                    t_text = original_tweet['full_text']
                else:
                    t_text = original_tweet['text']
                # see if the interested keywords are present in the text
                for rd in relevant_data:
                    if rd['keyword'] in t_text:
                        candidacy = rd['candidatura']
                        break
            # fix candidacy key
            if candidacy:
                s_obj['candidatura'] = candidacy
                num_fixed_tweets += 1
                self.__dbm.save_record(s_obj)
        return num_fixed_tweets