Пример #1
0
def fake_promoter(user_screen_name, db_users, method=0):
    """
    Compute the fake-promoter heuristic metric for a user.

    :param user_screen_name: screen name of the user under evaluation
    :param db_users: database of users
    :param method: determines the metric returned by the heuristic
        0: proportion of interactions of the user with his/her most
           frequent contacts that involve likely bots
        1: average probability of being a bot among the user's most
           frequent contacts

    :return: the metric selected by ``method`` (a proportion or an
             average probability), to be compared by callers against
             the threshold defined in the heuristic configuration
    """

    # Bug fix: only methods 0 and 1 exist; the check previously also
    # accepted 2 and 3, which silently behaved like method 1.
    if method not in [0, 1]:
        raise Exception('Error. Unknown heuristic method {}'.format(method))

    # Get heuristic parameters
    file_path = pathlib.Path(__file__).parents[0].joinpath(
        'heuristic_config.json')
    config = get_config(file_path)['fake_promoter']

    prop_interaction_with_bots, avg_pbb_most_freq_contacts = is_fake_promoter(
        user_screen_name, db_users, config)

    if method == 0:
        return prop_interaction_with_bots
    else:
        return avg_pbb_most_freq_contacts
Пример #2
0
    def __init__(self, collection, db_name=""):
        """Connect to MongoDB using config.json and select a database and collection."""
        config_path = pathlib.Path(__file__).parents[1].joinpath('config.json')
        settings = get_config(config_path)
        self.__host = settings['mongo']['host']
        self.__port = settings['mongo']['port']
        client = MongoClient(self.__host + ':' + self.__port)

        # Fall back to the database named in the config when none is given
        selected_db = db_name if db_name else settings['mongo']['db_name']
        self.__db = client[selected_db]
        self.__collection = collection
Пример #3
0
 def __get_hashtags_and_metadata(self):
     """Return the configured hashtags (lower-cased) together with their metadata."""
     config_fn = pathlib.Path(__file__).parents[1].joinpath('config.json')
     configuration = get_config(config_fn)
     keywords, metadata = parse_metadata(configuration['metadata'])
     # Keep only hashtags: skip mentions ('@') and the hashtags excluded
     # because they are proper names of movements and people
     excluded = ['HonorColorado', 'ColoradoAñetete', 'tuma']
     hashtags = [kw.lower() for kw in keywords
                 if '@' not in kw and kw not in excluded]
     return hashtags, metadata
Пример #4
0
def similar_account_name(data, db_users, db_tweets):
    """
    Check various conditions about the user's name and screen name:
    1. Condition 1: the user's name and screen name are both inside the
    database of trustworthy users
    2. Condition 2: the user's screen name is a variation of a
    trustworthy user's screen name (like john and johnjr)
    3. Condition 3: the user uses a name or screen name that contains the
    name or screen name of a trustworthy user
    4. Condition 4: otherwise, the similarity between the user's screen
    name and the trustworthy screen names is measured

    :param data: dictionary with information about a Twitter user
    :param db_users: database of the Twitter users
    :param db_tweets: database of the tweets, used to build the
        trustworthy-users database

    :return: 0 if condition 1 is met, 1 if condition 2 or 3 is met,
             otherwise the highest string-similarity score (a float)
             between the user's screen name and any trustworthy
             screen name
    """
    max_similarity = 0.0

    # Get heuristic parameters
    file_path = pathlib.Path(__file__).parents[0].joinpath(
        'heuristic_config.json')
    config = get_config(file_path)['fake_handler']

    # create a database of "trustworthy" accounts
    dbm_trustworthy_users = __db_trustworthy_users(db_users, db_tweets, config)
    if dbm_trustworthy_users.find_record({'screen_name': data['screen_name']}) and \
       dbm_trustworthy_users.find_record({'name': data['name']}):
        # Condition 1: the account itself is trustworthy
        return 0
    elif 'jr' in data['screen_name'] and \
        dbm_trustworthy_users.find_record({'screen_name': data['screen_name'].replace('jr', '')}):
        # Condition 2: variation of a trustworthy screen name ('jr' added)
        return 1
    elif 'junior' in data['screen_name'] and \
        dbm_trustworthy_users.find_record({'screen_name': data['screen_name'].replace('junior', '')}):
        # Condition 2: variation of a trustworthy screen name ('junior' added)
        return 1
    else:
        # check against the database of trustworthy users
        for doc in dbm_trustworthy_users.find_all():
            dist_sn = __string_similarity(doc['screen_name'],
                                          data['screen_name'])
            if doc['name'] in data['screen_name'] or doc[
                    'screen_name'] in data['screen_name']:
                # Condition 3: trustworthy name/screen name contained in
                # the user's screen name
                return 1
            # Track the best similarity seen so far (the original local
            # was misleadingly named mini_sn although it holds a maximum)
            if max_similarity < dist_sn:
                max_similarity = dist_sn
        return max_similarity
Пример #5
0
def fix_tweets_with_empty_flags():
    """Recompute the keyword flags of relevant tweets whose flag list is empty."""
    db_manager = DBManager('tweets')
    parent_dir = pathlib.Path(__file__).parents[1]
    settings = get_config(parent_dir.joinpath('config.json'))
    _, keyword_metadata = parse_metadata(settings['metadata'])
    # Relevant tweets whose keyword flag array is empty
    query = {'flag.keyword': {'$size': 0}, 'relevante': 1}
    for tweet_doc in db_manager.search(query):
        tweet_id = tweet_doc['tweet_obj']['id_str']
        logging.info('Updating flags of tweet {0}'.format(tweet_id))
        new_flag, headers = create_flag(keyword_metadata)
        tweet_entities = get_entities_tweet(tweet_doc['tweet_obj'])
        new_flag = add_values_to_flags(new_flag, tweet_entities, keyword_metadata)
        db_manager.update_record({'tweet_obj.id_str': tweet_id}, new_flag)


#if __name__ == '__main__':
#    fix_tweets_with_empty_flags()
Пример #6
0
def do_tweet_collection():
    """Collect tweets for every configured keyword and evaluate their relevance."""
    conf_file = pathlib.Path(__file__).parents[0].joinpath('config.json')
    settings = get_config(conf_file)
    credentials = {'key': settings['twitter']['consumer_key'],
                   'secret': settings['twitter']['consumer_secret']}
    keywords, k_metadata = parse_metadata(settings['metadata'])
    dbm = DBManager('tweets')
    api_manager = TwitterAPIManager(credentials, dbm)
    for current_keyword, _ in zip(keywords, k_metadata):
        logging.info('Searching tweets for %s' % current_keyword)
        # Mentions are searched as users, everything else as hashtags
        search_kind = 'user' if '@' in current_keyword else 'hashtag'
        api_manager.search_tweets(settings['tweets_qry'], current_keyword,
                                  search_kind, k_metadata)
    logging.info('Evaluating the relevance of the new tweets...')
    TweetEvaluator().identify_relevant_tweets()
Пример #7
0
 def fix_value_of_candidatura(self):
     """
     Fill in the 'candidatura' field of tweets that lack it by matching
     the tweet's mentions, hashtags, or text against the keywords of
     candidacies belonging to the tweet's party (and movement).

     :return: number of tweets whose candidacy was fixed
     """
     script_parent_dir = pathlib.Path(__file__).parents[1]
     config_fn = script_parent_dir.joinpath('config.json')
     configuration = get_config(config_fn)
     keyword, k_metadata = parse_metadata(configuration['metadata'])
     interested_data = []
     # keep metadata that refer to candidacies
     for kword, kmetada in zip(keyword, k_metadata):
         if kmetada['candidatura'] != '':
             kmetada.update({'keyword': kword})
             interested_data.append(kmetada)
     query = {'candidatura': ''}
     # select tweets without candidacy
     s_objs = self.__dbm.search(query)
     num_fixed_tweets = 0
     # iterate over tweets without candidacy and fix those
     # whose text mention a candidate or have hashtags
     # related to a candidacy
     for s_obj in s_objs:
         party = s_obj['partido_politico']
         movement = s_obj['movimiento']
         tweet = s_obj['tweet_obj']
         relevant_data = []
         candidacy = ''
         # keep metadata related to the political party
         # (and movement) of the tweet (s_obj)
         for ida in interested_data:
             if ida['partido_politico'] == party:
                 if movement != '':
                     if ida['movimiento'] == movement:
                         relevant_data.append(ida)
                 else:
                     relevant_data.append(ida)
         if len(relevant_data) > 0:
             # extract relevant information of the tweet. hashtags and mentions if
             # the tweet obj has these entities otherwise the text of the tweet
             if 'retweeted_status' in tweet.keys():
                 original_tweet = tweet['retweeted_status']
             else:
                 original_tweet = tweet
             if 'entities' in original_tweet.keys():
                 t_user_mentions = self.__get_screen_names(
                     original_tweet['entities']['user_mentions'])
                 t_hashtags = self.__get_hashtags(
                     original_tweet['entities']['hashtags'])
                 # see if the interested keywords are part of the tweet hashtags or mentions
                 for rd in relevant_data:
                     if rd['keyword'] in t_user_mentions:
                         candidacy = rd['candidatura']
                         break
                     else:
                         if rd['keyword'] in t_hashtags:
                             candidacy = rd['candidatura']
                             break
             else:
                 # Bug fix: read the text from original_tweet (the object
                 # whose keys were just checked) instead of the outer
                 # tweet, which for retweets may lack 'full_text'/'text'.
                 if 'full_text' in original_tweet.keys():
                     t_text = original_tweet['full_text']
                 else:
                     t_text = original_tweet['text']
                 # see if the interested keywords are present in the text
                 for rd in relevant_data:
                     if rd['keyword'] in t_text:
                         candidacy = rd['candidatura']
                         break
             # fix candidacy key
             if candidacy:
                 s_obj['candidatura'] = candidacy
                 num_fixed_tweets += 1
                 self.__dbm.save_record(s_obj)
     return num_fixed_tweets
Пример #8
0
def random_account_number(data):
    """
    Verify if the user's name or screen name contains strings of
    random numbers.

    :param data: dictionary with the user's data; the keys
        'screen_name', 'name', and 'created_at' are read
        (created_at assumed in the Twitter date format, with the
        year at index 5 when split on whitespace — TODO confirm)

    :return: 1 if the name or screen name has random numbers, 0 otherwise
    """

    # verify if the screen_name or the name is composed only of numbers;
    # checked first so this trivial case skips loading the config file
    if data['screen_name'].isdigit() or data['name'].isdigit():
        return 1

    # Get heuristic parameters
    file_path = pathlib.Path(__file__).parents[0].joinpath(
        'heuristic_config.json')
    config = get_config(file_path)['fake_handler']

    bot_prob = 0
    # separate the runs of digits embedded in the screen name
    number = ''
    for k in data['screen_name']:
        if k in string.digits:
            number += k
        else:
            number += ' '
    numbers = number.split()
    # increases the probability that the user is a bot if
    # there are letters between numbers in the name
    if len(numbers) > 1:
        bot_prob = 1
    # iterate over the list of numbers
    for n in numbers:
        # Bug fix: reset the partial result for every number; previously
        # a value of 1 set by an earlier number leaked into later
        # iterations that did not set it themselves.
        partial_result = 0
        num = int(n)
        if num > config['max_date']:
            partial_result = 1
        else:
            # check if the number correspond to a date between
            # 1011000 and 31129999. we assume that years are
            # expressed in four digits and days and months
            # expressed using two digits
            len_num = len(str(abs(num)))
            found_date = False
            if 6 <= len_num <= 8:
                # parse number assuming it is expressed in the form yyyymmdd
                year, month, day = __parse_number_date_yyyymmdd(num, len_num)
                # years are represented by four digits
                if year >= MIN_YEAR and month <= MAX_MONTH and day <= MAX_DAY:
                    found_date = True
                # parse number assuming it is expressed in the form ddmmyyyy
                day, month, year = __parse_number_date_ddmmyyyy(num, len_num)
                # years are represented by four digits
                if year >= MIN_YEAR and month <= MAX_MONTH and day <= MAX_DAY:
                    found_date = True
                if not found_date:
                    partial_result = 1
            # check if the number corresponds to a four-digit year of
            # birth, a favorite number less than 100, the year of the
            # account creation date, or the high-school class number.
            # Bug fix: the range previously started at 999, which is a
            # three-digit number, not a four-digit year.
            if num in range(1000, 10000, 1) or \
               num < 100 or \
               data['created_at'].split()[5] in n or \
               str(int(data['created_at'].split()[5]) - 2000) in n:
                partial_result = 0
        bot_prob += partial_result
    if bot_prob >= 1:
        return 1
    else:
        return 0
Пример #9
0
 def __init__(self, collection='tweets', language='spanish'):
     """Load the analyzer configuration and open the given tweet collection."""
     self.language = language
     # self.config_file_name is expected to be defined on the class —
     # TODO confirm against the class definition
     self.config = get_config(self.config_file_name)
     self.__dbm = DBManager(collection)
Пример #10
0
def main():
    """Train a segmentation model with parameters taken from a JSON config."""
    # GET CONFIGS
    # TODO: for each model specify training path
    cfg = get_config('configs/rope/inference_pipeline_1/enet_hand_parts_segm.json')

    torch.manual_seed(cfg['seed'])

    # Training hyper-parameters
    lr = cfg['lr']  # NOTE(review): read but never passed to the optimizer
    batch_size = cfg['batch_size']
    num_epoch = cfg['num_epoch']

    # Loss configuration
    loss_names, loss_weights = cfg['loss_names'], cfg['loss_weights']

    # Logging configuration
    log_dir, log_interval = cfg['log_dir'], cfg['log_interval']

    # Data geometry
    scale = cfg['scale']
    num_classes = cfg['num_classes']
    image_height, image_width = cfg['image_height'], cfg['image_width']
    roi = cfg['roi']
    if roi is not None:
        roi = tuple(roi)
    print('train.py: roi = ', roi)

    # Model configuration
    model_name = cfg['model_name']
    model_type = cfg['model_type']
    checkpoint_path = cfg['checkpoint_path']
    pretrained = cfg['pretrained']
    backbone = cfg['backbone']
    num_input_channels = cfg['num_input_channels']

    image_shape = (image_height, image_width, num_input_channels)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('Device is ', device)

    # PREPARE DATA
    train_loader, test_loader = get_data_loaders(model_type, num_classes,
                                                 batch_size, image_height,
                                                 image_width, roi)

    # PREPARE MODEL
    model = SegmentationModel(num_input_channels=num_input_channels,
                              num_classes=num_classes,
                              name=model_name,
                              backbone=backbone,
                              pretrained=pretrained)
    print('model_name = ', model_name)
    # TODO: freeze layers
    model.freeze(num_freeze_layers=13)
    model = model.to(device)

    # PREPARE EVALUATION AND OPTIMIZATION PARTS FOR TRAINING
    trainable_params = filter(lambda p: p.requires_grad,
                              model.basemodel.parameters())
    optimizer = Adam(params=trainable_params)

    multi_loss = MultiLoss(loss_names, loss_weights, device, num_classes)

    # INITIALIZE TRAINING AND TRAIN
    trainer = SegmentationTrainer(model, train_loader, test_loader, optimizer,
                                  multi_loss, device, log_interval, num_epoch,
                                  log_dir, checkpoint_path, pretrained,
                                  image_shape, scale)
    trainer.train()