示例#1
0
def __db_trustworthy_users(db_users, db_tweets, config):
    """
    Build (or reuse) the collection of trustworthy users. A user is trusted
    when her account is verified or when her number of followers is greater
    than config['min_num_followers']. The collection is only populated when
    it is empty; otherwise it is returned as it is.

    :param db_users: database of users; every document returned by its
    find_all() is evaluated
    :param db_tweets: database of tweets, handed to get_user() to obtain
    the details of each user
    :param config: dictionary with the configuration parameters of the
    heuristic; 'min_num_followers' is read from it

    :return: database of trustworthy users
    """
    trusted_db = DBManager('trustworthy_users')
    # Nothing to do when the collection was already generated
    if trusted_db.num_records_collection() != 0:
        return trusted_db

    logging.info('The trustworthy_users collection is being created...')
    for user_doc in db_users.find_all():
        profile = get_user(db_tweets, user_doc['screen_name'])
        is_trusted = profile['verified'] or \
            int(profile['followers_count']) > config['min_num_followers']
        if not is_trusted:
            continue
        # Skip users that were already stored
        if trusted_db.find_record({'screen_name': profile['screen_name']}):
            continue
        trusted_user = {
            'screen_name': user_doc['screen_name'],
            'name': profile['name'],
            'created_at': profile['created_at'],
            'followers_count': profile['followers_count'],
            'verified': profile['verified']
        }
        trusted_db.save_record(trusted_user)
    return trusted_db
示例#2
0
class NetworkAnalyzer:
    """
    Build the network of interactions between the users stored in the
    'users' collection and export it as a GEXF file.

    Nodes are users (screen name, party, movement, friends/followers ratio
    and bot score). Every directed edge goes from a user to a user she
    interacted with and is weighted with the total number of interactions.
    """
    # Kinds of interactions counted next to 'total' in the 'interactions'
    # entries of a user
    __INTERACTION_KINDS = ('replies', 'retweets', 'mentions', 'quotes')

    __dbm_tweets = None
    __dbm_users = None
    __dbm_networks = None
    __network = None
    __graph = None
    # The sets of nodes and unknown users are created per instance in
    # __init__; class-level set() objects would be shared by all analyzers
    __nodes = None
    __unknown_users = None
    __node_sizes = None

    def __init__(self):
        self.__dbm_tweets = DBManager('tweets')
        self.__dbm_users = DBManager('users')
        self.__dbm_networks = DBManager('networks')
        self.__network = []
        self.__nodes = set()
        self.__unknown_users = set()

    def __computer_ff_ratio(self, friends, followers):
        """
        Compute the friends/followers ratio.

        :param friends: number of friends of the user
        :param followers: number of followers of the user

        :return: friends / followers, or 0 when any of them is not positive
        """
        if followers > 0 and friends > 0:
            return friends / followers
        return 0

    def __summarize_interactions(self, interactions_by_user):
        """
        Aggregate interaction counters.

        :param interactions_by_user: iterable of (screen_name, interactions)
        pairs where interactions is a dictionary with a 'total' counter and,
        optionally, one counter per kind of interaction

        :return: dictionary that maps 'total' and every kind of interaction
        to {'count': sum of the counters, 'details': counter per screen name}
        """
        summary = {'total': {'count': 0, 'details': {}}}
        for kind in self.__INTERACTION_KINDS:
            summary[kind] = {'count': 0, 'details': {}}
        for screen_name, interactions in interactions_by_user:
            for kind, bucket in summary.items():
                # 'total' is mandatory, the rest of counters are optional
                if kind == 'total' or kind in interactions:
                    bucket['count'] += interactions[kind]
                    bucket['details'][screen_name] = interactions[kind]
        return summary

    def get_in_interactions(self, user_screen_name):
        """
        Get the interactions received by a user, meaning, interactions in
        which the user was mentioned, retweeted, quoted or replied.

        :param user_screen_name: screen name of the user

        :return: dictionary whose key 'in_interactions' holds the count and
        the per-user details of total, replies, retweets, mentions and quotes
        """
        in_inter_query = {'interactions.' + user_screen_name: {'$exists': 1},
                          'screen_name': {'$ne': user_screen_name}}
        senders = self.__dbm_users.search(in_inter_query)
        received = ((sender['screen_name'], sender['interactions'][user_screen_name])
                    for sender in senders
                    if user_screen_name in sender['interactions'])
        return {'in_interactions': self.__summarize_interactions(received)}

    def get_out_interactions(self, user_screen_name):
        """
        Get the interactions originated by a user.

        :param user_screen_name: screen name of the user; an IndexError
        propagates when the search on the users collection returns nothing

        :return: dictionary whose key 'out_interactions' holds the count and
        the per-user details of total, replies, retweets, mentions and quotes
        """
        user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        originated = user['interactions'].items()
        return {'out_interactions': self.__summarize_interactions(originated)}

    def create_users_db(self, clear_collection=False):
        """
        Create or update one record in the users collection per unique user
        found in the tweets collection.

        :param clear_collection: when true the users collection is emptied
        before being populated
        """
        logging.info('::. Network Analyzer: Creating database of users, it can take several minutes, please wait_')
        if clear_collection:
            self.__dbm_users.clear_collection()
        users = self.__dbm_tweets.get_unique_users()
        users_count = len(users)
        logging.info('::. Network Analyzer: Extracted {0} unique users from the database...'.format(users_count))
        for progress, user in enumerate(users, start=1):
            screen_name = user['screen_name']
            db_user = {
                'screen_name': screen_name,
                'friends': user['friends'],
                'followers': user['followers'],
                'ff_ratio': self.__computer_ff_ratio(user['friends'], user['followers']),
                'interactions': user['interactions'],
                'tweets': user['tweets_count'],
                'original_tweets': user['original_count'],
                'rts': user['retweets_count'],
                'qts': user['quotes_count'],
                'rps': user['replies_count'],
                'verified': user['verified']
            }
            # Most interacted party and movement: the first element returned
            # by the tweets database, or an empty string when there is none.
            # The movement is only looked up for users that have a party
            most_interacted_party, most_interacted_movement = '', ''
            user_parties = self.__dbm_tweets.get_party_user(screen_name)
            user_parties_count = len(user_parties)
            logging.debug('::. Network Analyzer: User {0} has {1} associated parties...'
                          .format(screen_name, user_parties_count))
            if user_parties_count > 0:
                most_interacted_party = user_parties[0]['partido']
                user_movements = self.__dbm_tweets.get_movement_user(screen_name)
                user_movements_count = len(user_movements)
                logging.debug('::. Network Analyzer: User {0} has {1} associated movements...'
                              .format(screen_name, user_movements_count))
                if user_movements_count > 0:
                    most_interacted_movement = user_movements[0]['movimiento']
            db_user.update({'most_interacted_party': most_interacted_party,
                            'most_interacted_movement': most_interacted_movement})

            # Party and movement as computed by UserPoliticalPreference
            upp = UserPoliticalPreference()
            user_party = upp.get_user_political_party(screen_name)
            user_movement = upp.get_user_political_movement(screen_name)
            db_user.update({'party': user_party, 'movement': user_movement})

            filter_query = {'screen_name': screen_name}
            logging.debug('::. Network Analyzer: Updating/creating user {0} ({1}/{2})...'
                          .format(screen_name, progress, users_count))
            self.__dbm_users.update_record(filter_query, db_user, create_if_doesnt_exist=True)

    def __node_from_user(self, user):
        """
        Build the attributes of the node that represents a user record.

        :param user: record of the users collection

        :return: dictionary with screen_name, party, movement, ff_ratio and
        pbb ('' when the user has no bot analysis)
        """
        if 'ff_ratio' in user:
            ff_ratio = user['ff_ratio']
        else:
            ff_ratio = self.__computer_ff_ratio(user['friends'], user['followers'])
        pbb_score = user['bot_analysis']['pbb'] if 'bot_analysis' in user else ''
        return {'screen_name': user['screen_name'], 'party': user['party'],
                'movement': user['movement'], 'ff_ratio': ff_ratio,
                'pbb': pbb_score}

    def generate_network(self, subnet_query=None, depth=1, file_name='network', override_net=False):
        """
        Generate the network of interactions and save it in a GEXF file. The
        generated file is registered in the networks collection together with
        the query and depth used, so the same network is not generated twice.

        :param subnet_query: dictionary used to select the users whose
        interactions are included, every user is selected when omitted
        :param depth: with a depth greater than 1, users that are not in the
        users collection are still included if their friends/followers ratio
        can be obtained from the tweets collection; their party, movement
        and pbb are left empty. Otherwise they are counted as unknown users
        :param file_name: name, without extension, of the file to be created
        :param override_net: when true the network is generated even if it
        was already registered in the networks collection
        """
        query = {} if subnet_query is None else subnet_query
        net_query = query.copy()
        net_query.update({'depth': depth})
        ret_net = self.__dbm_networks.search(net_query)
        if ret_net.count() != 0 and not override_net:
            f_net = ret_net[0]
            logging.info('The network was already generated, please find it at {0}'.format(f_net['file_name']))
            return
        # the net doesn't exist yet (or must be overridden), let's create it
        logging.info('Generating the network, it can take several minutes, please wait_')
        users = self.__dbm_users.search(query)
        # for each user generate his/her edges
        for user in users:
            user_node = self.__node_from_user(user)
            self.__nodes.add(tuple(user_node.items()))
            for interacted_user, interactions in user['interactions'].items():
                iuser = self.__dbm_users.find_record({'screen_name': interacted_user})
                if iuser:
                    iuser_node = self.__node_from_user(iuser)
                else:
                    iuser_ffratio = self.__get_ffratio(interacted_user) if depth > 1 else None
                    if iuser_ffratio is None:
                        self.__unknown_users.add(interacted_user)
                        continue
                    # The user only appears in the tweets collection, so
                    # nothing but her friends/followers ratio is known
                    iuser_node = {'screen_name': interacted_user, 'party': '',
                                  'movement': '', 'ff_ratio': iuser_ffratio,
                                  'pbb': ''}
                self.__nodes.add(tuple(iuser_node.items()))
                self.__network.append({
                    'nodeA': dict(user_node),
                    'nodeB': dict(iuser_node),
                    'weight': interactions['total']
                })
        logging.info('Created a network of {0} nodes and {1} edges'.format(len(self.__nodes), len(self.__network)))
        logging.info('Unknown users {0}'.format(len(self.__unknown_users)))
        # save the net in a gexf file for posterior usage
        f_name = self.save_network_in_gexf_format(file_name)
        logging.info('Saved the network in the file {0}'.format(f_name))
        db_net = {'file_name': str(f_name)}
        db_net.update(net_query)
        self.__dbm_networks.save_record(db_net)

    def create_graph(self):
        """
        Create a directed graph from the edges of the generated network. The
        graph can be retrieved afterwards with get_graph().
        """
        logging.info('Creating the graph, please wait_')
        self.__graph = net.DiGraph()
        for edge in self.__network:
            self.__graph.add_edge(edge['nodeA']['screen_name'],
                                  edge['nodeB']['screen_name'],
                                  weight=int(edge['weight']))

    def get_graph_nodes(self):
        """
        :return: number of nodes of the generated network
        """
        return len(self.__nodes)

    def get_graph_edges(self):
        """
        :return: number of edges of the generated network
        """
        return len(self.__network)

    def get_graph(self):
        """
        :return: graph built by create_graph(), None if it was not created
        """
        return self.__graph

    def get_node_sizes(self):
        """
        :return: sizes of the nodes; nothing in this class assigns them, so
        the value is None
        """
        return self.__node_sizes

    def __get_ffratio(self, screen_name):
        """
        Compute the friends/followers ratio of a user from a tweet in which
        she appears as author, retweeted author or quoted author.

        :param screen_name: screen name of the user

        :return: friends/followers ratio of the user, None when no tweet or
        no user information is found
        """
        query = {
            '$or': [
                {'tweet_obj.user.screen_name': screen_name},
                {'tweet_obj.retweeted_status.user.screen_name': screen_name},
                {'tweet_obj.quoted_status.user.screen_name': screen_name}
            ]
        }
        tweet_obj = self.__dbm_tweets.find_record(query)
        if not tweet_obj:
            return None
        tweet = tweet_obj['tweet_obj']
        # Users embedded in the tweet, in the order of precedence used when
        # none of them can be matched by screen name
        embedded_users = []
        for status_key in ('retweeted_status', 'quoted_status'):
            status = tweet.get(status_key)
            if status and status.get('user'):
                embedded_users.append(status['user'])
        if tweet.get('user'):
            embedded_users.append(tweet['user'])
        if not embedded_users:
            return None
        # The tweet can embed several users (e.g., the one who retweets and
        # the retweeted one), take the data of the requested user
        profile = next((embedded for embedded in embedded_users
                        if embedded.get('screen_name') == screen_name),
                       embedded_users[0])
        return self.__computer_ff_ratio(profile['friends_count'], profile['followers_count'])

    def save_network_in_gexf_format(self, file_name):
        """
        Write the nodes and edges of the generated network in a GEXF file
        located in the directory sna/gefx, two levels above this file.

        :param file_name: name of the file without extension, it is also
        used as description of the network

        :return: path of the written file
        """
        # Local import, the method needs it to escape the values written
        # in the XML document
        from xml.sax.saxutils import escape, quoteattr

        today = datetime.strftime(datetime.now(), '%m/%d/%y')
        f_name = pathlib.Path(__file__).parents[2].joinpath('sna', 'gefx', file_name+'.gexf')
        node_attributes = ('party', 'movement', 'ff_ratio', 'pbb')
        with open(str(f_name), 'w', encoding='utf-8') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<gexf xmlns="http://www.gexf.net/1.2draft" xmlns:viz="http://www.gexf.net/1.1draft/viz" '
                    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                    'xsi:schemaLocation="http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd" '
                    'version="1.2">\n')
            f.write('<meta lastmodifieddate="{0}">\n'.format(today))
            f.write('<creator>PoliticBots</creator>\n')
            f.write('<description>{0}</description>\n'.format(escape(file_name)))
            f.write('</meta>\n')
            f.write('<graph mode="static" defaultedgetype="directed">\n')
            # add data attributes
            f.write('<attributes class="node">\n')
            f.write('<attribute id="0" title="party" type="string"/>\n')
            f.write('<attribute id="1" title="movement" type="string"/>\n')
            f.write('<attribute id="2" title="ff_ratio" type="float"/>\n')
            f.write('<attribute id="3" title="pbb" type="float"/>\n')
            f.write('</attributes>\n')
            # add nodes
            f.write('<nodes>\n')
            node_ids = {}
            for node_id, node_tup in enumerate(self.__nodes):
                node = dict(node_tup)
                f.write('<node id="{0}" label={1}>\n'.format(node_id, quoteattr(str(node['screen_name']))))
                f.write('<attvalues>\n')
                for attribute_id, attribute_name in enumerate(node_attributes):
                    f.write('<attvalue for="{0}" value={1}/>\n'.format(
                        attribute_id, quoteattr(str(node[attribute_name]))))
                f.write('</attvalues>\n')
                f.write('</node>\n')
                # edges reference the first node written for a screen name
                node_ids.setdefault(node['screen_name'], node_id)
            f.write('</nodes>\n')
            # add edges
            f.write('<edges>\n')
            for edge_id, edge in enumerate(self.__network):
                id_vertex_a = node_ids[edge['nodeA']['screen_name']]
                id_vertex_b = node_ids[edge['nodeB']['screen_name']]
                f.write('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'.format(
                    edge_id, id_vertex_a, id_vertex_b, edge['weight']))
            f.write('</edges>\n')
            f.write('</graph>\n')
            f.write('</gexf>\n')
        return f_name
示例#3
0
class SentimentAnalysis:
    """
    Compute the sentiment of the relevant tweets of a collection and save
    it in the field 'sentimiento' of the tweet records.
    """
    config_file_name = pathlib.Path(__file__).parents[1].joinpath(
        'config.json')
    config = None
    language = ''
    method = ''
    __db = None
    __dbm = None
    # Marker placed between the text of a tweet and its id in the documents
    # sent to the analyzer, so the id can be recovered from the results
    __ID_SEPARATOR = '-$%#$&-'

    def __init__(self, collection='tweets', language='spanish'):
        self.config = get_config(self.config_file_name)
        self.language = language
        self.__dbm = DBManager(collection)

    def __get_analyzed_tweet(self, analyzed_tweets, id_tweet_to_search):
        """
        :return: the first analyzed tweet whose 'id' is equal to
        id_tweet_to_search, None if there is not any
        """
        for analyzed_tweet in analyzed_tweets:
            if id_tweet_to_search == analyzed_tweet['id']:
                return analyzed_tweet
        return None

    def __get_tweet_text(self, tweet):
        """
        :return: the 'full_text' of the tweet when present, else its 'text'
        """
        if 'full_text' in tweet:
            return tweet['full_text']
        return tweet['text']

    def __tag_texts(self, tweets):
        """
        :return: list with the text of every tweet followed by the id
        separator and the id of the tweet
        """
        return [tweet['text'] + ' {0} {1}'.format(self.__ID_SEPARATOR, tweet['id'])
                for tweet in tweets]

    def __save_sentiment_results(self, sentiment_results, analyzed_tweets=None):
        """
        Save in the database the sentiment of the analyzed tweets.

        :param sentiment_results: list of dictionaries with the keys 'id',
        'text' and 'sentimiento'
        :param analyzed_tweets: optional list to which a summary of every
        saved result is appended, right after the result is saved
        """
        for sentiment_result in sentiment_results:
            sentiment_info = sentiment_result['sentimiento']
            tweet_id = sentiment_result['id']
            tweet_text = sentiment_result['text']
            self.__dbm.update_record({'tweet_obj.id_str': tweet_id},
                                     {'sentimiento': sentiment_info})
            if analyzed_tweets is not None:
                analyzed_tweets.append({
                    'id': tweet_id,
                    'texto': tweet_text,
                    'sentimiento': sentiment_info
                })
            logging.debug('Tweet text: {0}, Sentimiento: {1} ({2})'.format(
                tweet_text.encode('utf-8'), sentiment_info['tono'],
                sentiment_info['score']))

    def update_sentiment_of_non_original_tweets(self,
                                                query=None,
                                                update_sentiment=False):
        """
        Assign a sentiment to the relevant retweets and replies. Retweets
        get the sentiment of the original tweet when it is in the database;
        replies and retweets whose original is not in the database are
        analyzed.

        :param query: dictionary of <key, value> terms to be used in
        querying the db; it is not modified
        :param update_sentiment: when false only the tweets without
        sentiment are processed

        :raises Exception: when the original tweet of a retweet is in the
        database but does not have sentiment
        """
        # Work on a copy, neither the dictionary of the caller nor a shared
        # default may be changed
        query = dict(query) if query else {}
        query['relevante'] = 1
        if not update_sentiment:
            query['sentimiento'] = {'$exists': 0}
        tweet_regs = self.__dbm.search(query)
        rts_wo_tw = []
        for tweet_reg in tweet_regs:
            tweet = tweet_reg['tweet_obj']
            if 'retweeted_status' in tweet:
                id_original_tweet = tweet['retweeted_status']['id_str']
                original_tweet_reg = self.__dbm.find_record(
                    {'tweet_obj.id_str': id_original_tweet})
                if not original_tweet_reg:
                    rts_wo_tw.append(tweet)
                    continue
                sentiment_ot = original_tweet_reg.get('sentimiento')
                if not sentiment_ot:
                    raise Exception(
                        'Error, found an original tweet without sentiment')
                self.__dbm.update_record(
                    {'tweet_obj.id_str': tweet['id_str']},
                    {'sentimiento': sentiment_ot})
            elif tweet.get('in_reply_to_status_id_str'):
                rts_wo_tw.append(tweet)
                logging.info('Tweet not RT {0}'.format(tweet['id_str']))
        self.__analyze_sentiment_of_rt_wo_tws(rts_wo_tw)

    def __update_sentimient_rts(self, analyzed_tweets):
        """
        Copy the sentiment of every analyzed tweet to its retweets.
        """
        for analyzed_tweet in analyzed_tweets:
            # search rts of the analyzed tweet
            rts = self.__dbm.search(
                {'tweet_obj.retweeted_status.id_str': analyzed_tweet['id']})
            for rt in rts:
                self.__dbm.update_record(
                    {'tweet_obj.id_str': rt['tweet_obj']['id_str']},
                    {'sentimiento': analyzed_tweet['sentimiento']})

    def __analyze_sentiment_of_rt_wo_tws(self, tweets):
        """
        Analyze, in batches of 5, the sentiment of tweet objects and save it
        under the id of each tweet. For retweets the text of the retweeted
        status is the one analyzed.
        """
        batch_size = 5
        for start in range(0, len(tweets), batch_size):
            tweets_to_analyze = []
            for tweet in tweets[start:start + batch_size]:
                if 'retweeted_status' in tweet:
                    text_source = tweet['retweeted_status']
                else:
                    text_source = tweet
                tweets_to_analyze.append({
                    'id': tweet['id_str'],
                    'text': self.__get_tweet_text(text_source)
                })
            sentiment_results = self.do_sentiment_analysis(tweets_to_analyze)
            self.__save_sentiment_results(sentiment_results)

    def __analyze_batch(self, tweets_to_analyze, batch, total_batches,
                        tot_reg, analyzed_tweets):
        """
        Analyze one batch of tweets and save the results in the database,
        appending them to analyzed_tweets.
        """
        logging.info(
            'Analyzing the sentiment of {0} tweets in batch {1}/{2} '
            'out of {3} tweets...'.format(len(tweets_to_analyze),
                                          batch, total_batches,
                                          tot_reg))
        sentiment_results = self.do_sentiment_analysis(tweets_to_analyze)
        logging.info(
            'Finished analyzing the sentiment of {0} tweets in batch {1}/{2} '
            'out of {3} tweets...'.format(len(tweets_to_analyze),
                                          batch, total_batches,
                                          tot_reg))
        logging.info('Updating sentiment scores in database...')
        self.__save_sentiment_results(sentiment_results, analyzed_tweets)

    def analyze_sentiments(self, query=None, update_sentiment=False):
        """
        Analyze, in batches of 100, the sentiment of the relevant tweets
        that are not retweets and propagate the results to their retweets.
        Errors raised during the analysis are logged and the tweets analyzed
        so far are returned.

        :param query: dictionary of <key, value> terms to be used in
        querying the db; it is not modified
        :param update_sentiment: when false only the tweets without
        sentiment are analyzed

        :return: list of dictionaries with the 'id', 'texto' and
        'sentimiento' of the analyzed tweets
        """
        # Work on a copy, neither the dictionary of the caller nor a shared
        # default may be changed
        query = dict(query) if query else {}
        query.update({
            'relevante': 1,
            'tweet_obj.retweeted_status': {
                '$exists': 0
            }
        })
        if not update_sentiment:
            query['sentimiento'] = {'$exists': 0}
        tweet_regs = self.__dbm.search(query)
        analyzed_tweets = []
        tot_reg = tweet_regs.count()
        logging.info(
            'Going to analyze the sentiment of {0} tweets, '
            'it can take a lot of time, be patient...'.format(tot_reg))
        batch_size = 100
        total_batches = ceil(tot_reg / batch_size)
        batch = 0
        tweets_to_analyze = []
        try:
            # The result is iterated instead of indexed by position.
            # NOTE(review): presumably search() returns a database cursor;
            # indexing it would run the query again on every access while
            # the analyzed records stop matching it - confirm with DBManager
            for tweet_reg in tweet_regs:
                tweet = tweet_reg['tweet_obj']
                tweets_to_analyze.append({
                    'id': tweet['id_str'],
                    'text': self.__get_tweet_text(tweet)
                })
                if len(tweets_to_analyze) < batch_size:
                    continue
                batch += 1
                self.__analyze_batch(tweets_to_analyze, batch, total_batches,
                                     tot_reg, analyzed_tweets)
                tweets_to_analyze = []
            # the last batch is usually smaller than batch_size
            if tweets_to_analyze:
                batch += 1
                self.__analyze_batch(tweets_to_analyze, batch, total_batches,
                                     tot_reg, analyzed_tweets)
        except Exception as e:
            logging.error(e)
        finally:
            self.__update_sentimient_rts(analyzed_tweets)

        return analyzed_tweets

    def do_sentiment_analysis(self, tweets):
        """
        Compute the sentiment of tweets with SentimentAnalyzer.

        :param tweets: list of dictionaries with the keys 'id' and 'text'

        :return: list of dictionaries with the keys 'id', 'text' and
        'sentimiento' (dictionary with 'tono' and 'score')
        """
        sa = SentimentAnalyzer(language='spanish')
        tweet_texts = self.__tag_texts(tweets)
        sa.analyze_docs(tweet_texts)
        results = sa.tagged_docs
        logging.info(
            'Finished the sentiment analysis, now {0} results are going to '
            'be processed...'.format(len(results)))
        ret = self.__process_results(results)
        logging.info('Computed correctly the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        return ret

    def __auth_headers(self):
        """
        :return: authorization headers built with the api key currently
        stored in the configuration
        """
        return {'Authorization': 'JWT ' + self.config['inhouse']['api_key']}

    def remote_sentiment_analysis(self, tweets):
        """
        Compute the sentiment of tweets with the remote sentiment analysis
        api. The api token is renewed, and saved in the configuration file,
        when the api rejects the request.

        :param tweets: list of dictionaries with the keys 'id' and 'text'

        :return: list of dictionaries with the keys 'id', 'text' and
        'sentimiento'; empty list when the api does not accept the request

        :raises Exception: when the token cannot be renewed or the api
        answers with an unexpected code while the results are requested
        """
        accepted_codes = [200, 201, 202]
        error_codes = [400, 401]
        url_base = 'http://159.203.77.35:8080/api'
        url_sentiment = url_base + '/analysis/sentiment-analysis/'
        url_auth = url_base + '/auth/'
        tweet_texts = self.__tag_texts(tweets)
        parameters = {
            'neu_inf_lim': -0.3,
            'neu_sup_lim': 0.3,
            'language': 'spanish'
        }
        data = {
            'name': (None, 'politic-bots'),
            'parameters': (None, json.dumps(parameters), 'application/json'),
            'data_object': (None, json.dumps(tweet_texts), 'application/json')
        }
        logging.info('Computing the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        resp = requests.post(url_sentiment, headers=self.__auth_headers(),
                             files=data)
        if resp.status_code in error_codes:
            # have to renew the api token
            body_auth = {
                'username': self.config['inhouse']['username'],
                'password': self.config['inhouse']['password']
            }
            resp = requests.post(url_auth, data=body_auth)
            if resp.status_code not in accepted_codes:
                raise Exception(
                    'Error {0} when trying to renew the token of the api'.
                    format(resp.status_code))
            self.config['inhouse']['api_key'] = resp.json()['token']
            update_config(self.config_file_name, self.config)
            # the headers are built again so the renewed token is the one
            # sent in the second attempt
            resp = requests.post(url_sentiment,
                                 headers=self.__auth_headers(),
                                 files=data)
        if resp.status_code not in accepted_codes:
            logging.error(
                'Error {0} when trying to compute the sentiment of the tweets'.
                format(resp.status_code))
            return []
        get_url = url_sentiment + str(resp.json()['id']) + '/'
        results = []
        # wait some time before trying to get
        # the results
        time.sleep(60)
        while len(results) == 0:
            # wait some time before trying to
            # get the results
            time.sleep(30)
            resp = requests.get(get_url, headers=self.__auth_headers())
            if resp.status_code not in accepted_codes:
                raise Exception(
                    'Got an unexpected response, code: {0}'.format(
                        resp.status_code))
            results = json.loads(resp.json()['result'])
        logging.info(
            'Obtained the results of sentiment analysis, now the results are going to be processed...'
        )
        ret = self.__process_results(results)
        logging.info('Computed correctly the sentiment of {0} tweets'.format(
            len(tweet_texts)))
        return ret

    def __process_results(self, results):
        """
        Convert the results of the analyzer into sentiment dictionaries.

        :param results: iterable of (text, tone, score) where text ends
        with the id separator followed by the id of the tweet and tone is
        'neg', 'pos' or anything else for neutral

        :return: list of dictionaries with the keys 'id', 'text' and
        'sentimiento' (dictionary with 'tono' and 'score')
        """
        tones = {'neg': 'negative', 'pos': 'positive'}
        ret = []
        for text, tone, score in results:
            # the id is what follows the last separator, the text of the
            # tweet itself may contain the separator
            tw_text_id = text.rsplit(self.__ID_SEPARATOR, 1)
            ret.append({
                'id': tw_text_id[1].strip(),
                'text': tw_text_id[0].strip(),
                'sentimiento': {
                    'tono': tones.get(tone, 'neutral'),
                    'score': score
                }
            })
        return ret
示例#4
0
class BotDetector:
    """
    Computes bot-likelihood heuristics for the users stored in the 'users'
    collection, saves the resulting scores in each user's record and can
    export them to a csv file
    """
    # Handle on the 'tweets' collection; assigned in __init__
    __dbm_tweets = None
    # Handle on the 'users' collection; assigned in __init__
    __dbm_users = None
    # Twitter API client; assigned in __init__
    __api = None

    def __init__(self):
        """
        Open the 'tweets' and 'users' collections and build the Twitter API
        client with the application credentials read from config.json
        """
        self.__dbm_users = DBManager('users')
        self.__dbm_tweets = DBManager('tweets')
        # config.json lives one directory above the one that holds this module
        config_path = pathlib.Path(__file__).parents[1] / 'config.json'
        twitter_conf = get_config(config_path)['twitter']
        app_auth = tweepy.AppAuthHandler(twitter_conf['consumer_key'],
                                         twitter_conf['consumer_secret'])
        self.__api = tweepy.API(app_auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    def __save_user_pbb(self, user_screen_name, pbb, bot_score, user_bot_features, num_heuristics,
                        sum_weights, exist_user):
        new_fields = {
            'exists': int(exist_user),
            'bot_analysis': {'features': user_bot_features,
                             'pbb': pbb,
                             'raw_score': bot_score,
                             'num_evaluated_heuristics': num_heuristics,
                             'sum_weights': sum_weights}
        }
        self.__dbm_users.update_record({'screen_name': user_screen_name}, new_fields)

    def __check_if_user_exists(self, user_screen_name):
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'exists' in user_obj.keys():
            return int(user_obj['exists'])
        else:
            try:
                self.__api.get_user(user_screen_name)
                return True
            except tweepy.TweepError:
                return False

    def __compute_bot_formula(self, user_bot_features, exists_user):
        """
        Combine the heuristic values of a user into a weighted score

        The weights are read from the file heuristic_weights.json located in the
        same directory as this module. On top of the given features, the 'exists'
        heuristic always takes part with the value 1 - int(exists_user).

        :param user_bot_features: dictionary that maps the name of each heuristic
        to a dictionary with the key 'value'
        :param exists_user: flag that tells whether the account exists

        :return: tuple with the weighted sum of the heuristic values, the sum of
        the weights, and the former divided by the latter
        """
        weights_path = pathlib.Path(__file__).parents[0].joinpath('heuristic_weights.json')
        weights = get_config(weights_path)
        weighted_total = 0
        weights_total = 0
        for heuristic_name, heuristic in user_bot_features.items():
            weight = weights[heuristic_name]
            weighted_total += weight * heuristic['value']
            weights_total += weight
        exists_weight = weights['exists']
        weighted_total += exists_weight * (1-int(exists_user))
        weights_total += exists_weight
        return weighted_total, weights_total, weighted_total/weights_total

    def __get_timeline(self, user_screen_name, user_tweets):
        """
        Get the last 100 tweets in the timeline of a given user
        :param user: user from whom her timeline should be obtained from
        :return: user's timeline
        """
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'timeline' in user_obj.keys():
            return user_obj['timeline']
        logging.info('Get the last 100 tweets from Twitter')
        timeline = []
        try:
            for status in tweepy.Cursor(self.__api.user_timeline, screen_name=user_screen_name).items(100):
                timeline.append(status._json)
            # save the not electoral tweets of the user's timeline
            id_electoral_tweets = [tweet['id_str'] for tweet in user_tweets]
            timeline_tweets_to_save = [tweet for tweet in timeline
                                       if tweet['id_str'] not in id_electoral_tweets]
            logging.info('To save {0} not electoral tweets of {1}'.format(len(timeline_tweets_to_save),
                                                                          user_screen_name))
            new_field = {
                'timeline': timeline_tweets_to_save
            }
            self.__dbm_users.update_record({'screen_name': user_screen_name}, new_field)
        except tweepy.TweepError:
            pass
        return timeline

    def __get_tweets_user(self, user_screen_name):
        user_tweets_obj = self.__dbm_tweets.search({'tweet_obj.user.screen_name': user_screen_name})
        user_tweets = [user_tweet_obj['tweet_obj'] for user_tweet_obj in user_tweets_obj]
        return user_tweets

    def __get_user_info_from_twitter(self, user_screen_name):
        user_twitter_obj = None
        try:
            user_twitter_obj = self.__api.get_user(user_screen_name)
        except tweepy.TweepError:
            pass
        return user_twitter_obj._json

    def __get_computed_heuristics(self, user_screen_name):
        user_obj = self.__dbm_users.search({'screen_name': user_screen_name})[0]
        if 'bot_analysis' in user_obj.keys():
            return user_obj['bot_analysis']['features']
        else:
            return {}

    def __compute_heuristics(self, user_screen_name, recompute_heuristics=False):
        """
        Compute the bot heuristics of a user and save the resulting score

        Accounts flagged as verified are saved with a score of 0 and no
        heuristics. For the rest, every heuristic that is not stored yet (or all
        of them when recompute_heuristics is True) is evaluated; the heuristics
        based on tweets are skipped when there are no tweets to look at. The
        results are saved in the record of the user.

        :param user_screen_name: screen name of the user to analyze
        :param recompute_heuristics: if True, the heuristics already stored for
        the user are evaluated again

        :raises Exception: if no information about the user can be obtained
        """
        logging.info('\n\nComputing the probability of being bot of the user: {0}\n\n'.format(user_screen_name))

        # Get tweets of the user
        user_tweets = self.__get_tweets_user(user_screen_name)

        # Check if the user still exists on Twitter and, if so, get her timeline
        exist_user = self.__check_if_user_exists(user_screen_name)
        user_timeline = None
        if exist_user:
            user_timeline = self.__get_timeline(user_screen_name, user_tweets)

        # Get the information about the user; fall back to Twitter if the
        # tweets collection does not have it
        user_obj = get_user(self.__dbm_tweets, user_screen_name)
        if not user_obj:
            user_obj = self.__get_user_info_from_twitter(user_screen_name)
            if not user_obj:
                raise Exception('Error!, Cannot fetch information about the user {0}'.format(user_screen_name))

        if user_obj['verified']:
            # It is a verified account, it cannot be bot
            logging.info('The user {0} is an account verified by Twitter, it cannot be a bot'.format(user_screen_name))
            self.__save_user_pbb(user_screen_name, 0, 0, None, 0, 0, exist_user)
            return

        # Get the computed heuristics. The stored value can be None (that is
        # what gets saved for verified accounts), so fall back to an empty
        # dictionary to be able to add the new heuristics
        user_bot_features = self.__get_computed_heuristics(user_screen_name) or {}

        def is_pending(heuristic_name):
            # A heuristic is evaluated if it is not stored yet or if a full
            # recomputation was requested
            return recompute_heuristics or heuristic_name not in user_bot_features

        # Percentage of retweets and replies in the electoral tweets and in the
        # timeline of the user; skipped when there are no tweets to inspect
        tweet_heuristics = (
            ('retweet_electoral', is_retweet_bot, user_tweets),
            ('reply_electoral', reply_percentage, user_tweets),
            ('retweet_timeline', is_retweet_bot, user_timeline),
            ('reply_timeline', reply_percentage, user_timeline),
        )
        for heuristic_name, heuristic, tweets in tweet_heuristics:
            if tweets and is_pending(heuristic_name):
                user_bot_features[heuristic_name] = {
                    'value': heuristic(tweets)
                }

        if is_pending('creation_date'):
            # Check the user's creation year against the year of the extraction
            # date stored in the tweets collection
            extraction_date = self.__dbm_tweets.find_record({})['extraction_date']
            electoral_year = int('20' + extraction_date.split('/')[2])
            user_bot_features['creation_date'] = {
                'value': creation_date(parse_date(user_obj['created_at']), electoral_year)
            }

        # Heuristics that only need the information of the user's profile
        profile_heuristics = (
            ('default_profile', default_profile),
            ('default_profile_picture', default_profile_picture),
            ('default_background', default_background),
            ('empty_description', default_description),
            ('location', location),
            ('ff_ratio', followers_ratio),
            ('random_letters', random_account_letter),
            ('random_numbers', random_account_number),
        )
        for heuristic_name, heuristic in profile_heuristics:
            if is_pending(heuristic_name):
                user_bot_features[heuristic_name] = {
                    'value': heuristic(user_obj)
                }

        if is_pending('similar_account'):
            user_bot_features['similar_account'] = {
                'value': similar_account_name(user_obj, self.__dbm_users, self.__dbm_tweets)
            }

        # Compute the user's probability of being bot
        num_computed_heuristics = len(user_bot_features)
        bot_score, sum_weights, pbb = self.__compute_bot_formula(user_bot_features, exist_user)

        self.__save_user_pbb(user_screen_name, pbb, bot_score, user_bot_features,
                             num_computed_heuristics, sum_weights, exist_user)
        logging.info('\n\nThe bot score of {0} is {1}\n\n'.format(user_screen_name, bot_score))
        return

    def compute_fake_promoter_heuristic(self, users):
        """
        Add the 'fake_promoter' heuristic to the bot analysis of the users

        For every user the stored raw score, number of evaluated heuristics and
        sum of weights are increased with the contribution of the heuristic, the
        probability of being bot is recomputed and the record is saved.

        :param users: users to process, each one with the key 'screen_name'; if
        empty, the users without the 'fake_promoter' heuristic whose 'verified'
        field is not True are taken from the users collection
        """
        weights_path = pathlib.Path(__file__).parents[0].joinpath('heuristic_weights.json')
        weights = get_config(weights_path)

        if not users:
            users = self.__dbm_users.search({'bot_analysis.features.fake_promoter': {'$exists': 0},
                                             'verified': {'$ne': True}})

        tot_user = users.count()
        for idx_user, user in enumerate(users, start=1):
            logging.info('Remaining users: {0}'.format(tot_user - idx_user))
            user_screen_name = user['screen_name']
            stored_user = self.__dbm_users.search({'screen_name': user_screen_name})[0]
            analysis = stored_user['bot_analysis']
            features = analysis['features']
            # Check if the user interacts with bot accounts
            fp = fake_promoter(user_screen_name, self.__dbm_users)
            logging.info('User: {0}, fake promoter score: {1}'.format(user_screen_name, fp))

            if not features:
                features = {}
            features['fake_promoter'] = {
                'value': fp
            }

            # Update the stored totals with the contribution of the heuristic
            raw_score = analysis['raw_score']
            fp_weight = weights['fake_promoter']
            bot_score = raw_score + fp * fp_weight
            heuristics = analysis['num_evaluated_heuristics'] + 1
            sum_weights = analysis['sum_weights'] + fp_weight
            pbb = bot_score/sum_weights
            self.__save_user_pbb(user_screen_name, pbb, bot_score, features, heuristics, sum_weights,
                                 stored_user['exists'])

    def compute_bot_probability(self, users, source_users_collection = "", source_users_db = ""):
        reusers_db = None
        if source_users_db and source_users_collection:
            reusers_db = DBManager(source_users_collection, source_users_db)

        if not users:
            # Get all users who don't have the analysis of bot in current user
            users = self.__dbm_users.search({'bot_analysis': {'$exists': 0}})

        tot_user = len(users) if type(users) == list else users.count()
        idx_user = 1
        for user in users:
            logging.info('Remaining users: {0}'.format(tot_user-idx_user))
            if reusers_db:
                reuser_cursor = reusers_db.search({'screen_name': user['screen_name']})

                if reuser_cursor.count() > 0:
                    logging.info('Reusing bot analysis from another DB for {0}'.format(user['screen_name']))
                    reuser = reuser_cursor[0]
                    bot_analysis = reuser['bot_analysis']
                    self.__save_user_pbb(reuser['screen_name'], bot_analysis['pbb'], bot_analysis['raw_score'],
                                         bot_analysis['features'], bot_analysis['num_evaluated_heuristics'],
                                         bot_analysis['sum_weights'], reuser['exists'])
                    continue

            if type(users) == list:
                user_screen_name = user
            else:
                user_screen_name = user['screen_name']
            self.__compute_heuristics(user_screen_name)
            idx_user += 1

    def to_csv(self, output_file_name, include_verified_accounts=True):
        if not include_verified_accounts:
            query = {'bot_analysis': {'$exists': 1}, 'verified': {'$ne': True}}
        else:
            query = {'bot_analysis': {'$exists': 1}}
        users = self.__dbm_users.search(query)
        f_name = str(pathlib.Path(__file__).parents[2].joinpath('reports',output_file_name))
        logging.info('Saving bot analysis into the csv file {0}'.format(f_name))
        with open(f_name, 'w', encoding='utf-8') as f:
            user_info_fields = ['screen_name', 'profile_url', 'party', 'movement', 'exists', 'followers',
                                'friends', 'tweets', 'rts', 'rps', 'verified']
            bot_analysis_fields = ['location', 'default_profile_picture', 'retweet_electoral',
                                   'default_background', 'similar_account', 'random_numbers', 'ff_ratio',
                                   'random_letters', 'default_profile', 'creation_date', 'empty_description',
                                   'retweet_timeline', 'reply_electoral', 'reply_timeline', 'fake_promoter',
                                   'raw_score', 'sum_weights', 'pbb']
            writer = csv.DictWriter(f, fieldnames=user_info_fields+bot_analysis_fields)
            writer.writeheader()
            tot_users = users.count()
            logging.info('Going to save the information of the bot analysis of {0} users'.format(tot_users))
            idx_user = 1
            for user in users:
                logging.info('Remaining users: {0}'.format(tot_users - idx_user))
                row_dict = {}
                for field_name in bot_analysis_fields:
                    if field_name in user['bot_analysis']['features'].keys():
                        row_dict[field_name] = user['bot_analysis']['features'][field_name]['value']
                    elif field_name in user['bot_analysis'].keys():
                        row_dict[field_name] = user['bot_analysis'][field_name]
                for field_name in user_info_fields:
                    if field_name == 'profile_url':
                        continue
                    row_dict[field_name] = user[field_name]
                if user['exists']:
                    row_dict['profile_url'] = 'https://twitter.com/' + user['screen_name']
                else:
                    row_dict['profile_url'] = ' '
                writer.writerow(row_dict)
                idx_user += 1
        logging.info('The saving process has finished, please check the file {0}'.format(f_name))