def fake_promoter(user_screen_name, db_users, method=0):
    """
    :param user_screen_name: screen name of the user under evaluation
    :param db_users: database of users
    :param method: determines the method to be used in the heuristic
                   0: check if the proportion of interactions with the user's
                   most frequent contacts is equal to or larger than a defined
                   threshold
                   1: check if the average probability of being a bot of the
                   user's most frequent contacts is equal to or larger than a
                   defined threshold
    :return: 1 if the user meets the condition defined in the method of the
             heuristic, 0 otherwise
    """
    # Only methods 0 and 1 are defined and handled below
    if method not in (0, 1):
        raise ValueError('Error. Unknown heuristic method {}'.format(method))
    # Get heuristic parameters
    file_path = pathlib.Path(__file__).parents[0].joinpath('heuristic_config.json')
    config = get_config(file_path)['fake_promoter']
    prop_interaction_with_bots, avg_pbb_most_freq_contacts = is_fake_promoter(
        user_screen_name, db_users, config)
    if method == 0:
        return prop_interaction_with_bots
    else:
        return avg_pbb_most_freq_contacts
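# Usage sketch (assumptions: a reachable MongoDB behind DBManager, a 'users'
# collection populated by the collector, and a hypothetical screen name):
def _example_fake_promoter():
    db_users = DBManager('users')
    # method=0 -> proportion-of-interactions criterion
    return fake_promoter('some_screen_name', db_users, method=0)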
def __init__(self, collection, db_name=""): script_parent_dir = pathlib.Path(__file__).parents[1] config_fn = script_parent_dir.joinpath('config.json') config = get_config(config_fn) self.__host = config['mongo']['host'] self.__port = config['mongo']['port'] client = MongoClient(self.__host + ':' + self.__port) if not db_name: self.__db = client[config['mongo']['db_name']] else: self.__db = client[db_name] self.__collection = collection
def __get_hashtags_and_metadata(self):
    script_parent_dir = pathlib.Path(__file__).parents[1]
    config_fn = script_parent_dir.joinpath('config.json')
    configuration = get_config(config_fn)
    keywords, metadata = parse_metadata(configuration['metadata'])
    hashtags = []
    for keyword in keywords:
        if '@' not in keyword:
            # The following hashtags are excluded because they are proper
            # names of movements and people
            if keyword not in ['HonorColorado', 'ColoradoAñetete', 'tuma']:
                hashtags.append(keyword.lower())
    return hashtags, metadata
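# Output sketch (hypothetical keywords; assumes parse_metadata returns a list
# of keywords and a parallel list of per-keyword metadata):
#   keywords == ['@SomeCandidate', 'EleccionesPy2018', 'HonorColorado']
#   -> hashtags == ['eleccionespy2018']  # mentions and excluded names dropped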
def similar_account_name(data, db_users, db_tweets):
    """
    Check various conditions about the user's name and screen name:
    1. Condition 1: the user's name and screen name are inside the database
       of trustworthy users
    2. Condition 2: the user's name and screen name are a variation of a
       trustworthy user (like john and john jr)
    3. Condition 3: the user uses a name or screen name that is part of the
       name or screen name of a trustworthy user
    4. Condition 4: the user's name or screen_name has at least 75% similarity
       with the name or screen_name of a user in the trustworthy database
    :param data: dictionary with information about a Twitter user
    :param db_users: database of the Twitter users
    :param db_tweets: database of the tweets
    :return: 0 if condition 1 is met, 1 if condition 2 or 3 is met; otherwise
             the maximum string similarity against the trustworthy database,
             so the caller can apply the 75% threshold of condition 4
    """
    max_sim_sn = 0.0
    # Get heuristic parameters
    file_path = pathlib.Path(__file__).parents[0].joinpath('heuristic_config.json')
    config = get_config(file_path)['fake_handler']
    # Create a database of "trustworthy" accounts
    dbm_trustworthy_users = __db_trustworthy_users(db_users, db_tweets, config)
    if dbm_trustworthy_users.find_record({'screen_name': data['screen_name']}) and \
       dbm_trustworthy_users.find_record({'name': data['name']}):
        return 0
    elif 'jr' in data['screen_name'] and \
         dbm_trustworthy_users.find_record({'screen_name': data['screen_name'].replace('jr', '')}):
        return 1
    elif 'junior' in data['screen_name'] and \
         dbm_trustworthy_users.find_record({'screen_name': data['screen_name'].replace('junior', '')}):
        return 1
    else:
        # Check against the database of trustworthy users
        for doc in dbm_trustworthy_users.find_all():
            dist_sn = __string_similarity(doc['screen_name'], data['screen_name'])
            if doc['name'] in data['screen_name'] or \
               doc['screen_name'] in data['screen_name']:
                return 1
            # Track the highest similarity seen so far
            if max_sim_sn < dist_sn:
                max_sim_sn = dist_sn
        return max_sim_sn
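# Usage sketch (hypothetical user record; assumes 'users' and 'tweets'
# collections exist so __db_trustworthy_users can build its database):
def _example_similar_account_name():
    data = {'screen_name': 'john_doe_jr', 'name': 'John Doe'}
    db_users, db_tweets = DBManager('users'), DBManager('tweets')
    return similar_account_name(data, db_users, db_tweets)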
def fix_tweets_with_empty_flags():
    dbm = DBManager('tweets')
    script_parent_dir = pathlib.Path(__file__).parents[1]
    conf_file = script_parent_dir.joinpath('config.json')
    configuration = get_config(conf_file)
    keyword, k_metadata = parse_metadata(configuration['metadata'])
    # Select relevant tweets whose keyword flag array is empty
    tweets_with_empty_flags = dbm.search({'flag.keyword': {'$size': 0},
                                          'relevante': 1})
    for tweet in tweets_with_empty_flags:
        logging.info('Updating flags of tweet {0}'.format(tweet['tweet_obj']['id_str']))
        flag, headers = create_flag(k_metadata)
        entities = get_entities_tweet(tweet['tweet_obj'])
        flag = add_values_to_flags(flag, entities, k_metadata)
        dbm.update_record({'tweet_obj.id_str': tweet['tweet_obj']['id_str']}, flag)


# if __name__ == '__main__':
#     fix_tweets_with_empty_flags()
def do_tweet_collection():
    script_parent_dir = pathlib.Path(__file__).parents[0]
    conf_file = script_parent_dir.joinpath('config.json')
    configuration = get_config(conf_file)
    credentials = {'key': configuration['twitter']['consumer_key'],
                   'secret': configuration['twitter']['consumer_secret']}
    keyword, k_metadata = parse_metadata(configuration['metadata'])
    dbm = DBManager('tweets')
    tm = TwitterAPIManager(credentials, dbm)
    for current_keyword in keyword:
        logging.info('Searching tweets for %s' % current_keyword)
        # Keywords containing '@' are user mentions; the rest are hashtags
        if '@' in current_keyword:
            tm.search_tweets(configuration['tweets_qry'], current_keyword,
                             'user', k_metadata)
        else:
            tm.search_tweets(configuration['tweets_qry'], current_keyword,
                             'hashtag', k_metadata)
    logging.info('Evaluating the relevance of the new tweets...')
    te = TweetEvaluator()
    te.identify_relevant_tweets()
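# Entry-point sketch (assumption: the collection script is launched directly;
# the project may instead schedule it, e.g. via cron):
#   if __name__ == '__main__':
#       do_tweet_collection()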
def fix_value_of_candidatura(self):
    script_parent_dir = pathlib.Path(__file__).parents[1]
    config_fn = script_parent_dir.joinpath('config.json')
    configuration = get_config(config_fn)
    keyword, k_metadata = parse_metadata(configuration['metadata'])
    interested_data = []
    # Keep metadata that refer to candidacies
    for kword, kmetada in zip(keyword, k_metadata):
        if kmetada['candidatura'] != '':
            kmetada.update({'keyword': kword})
            interested_data.append(kmetada)
    query = {'candidatura': ''}
    # Select tweets without candidacy
    s_objs = self.__dbm.search(query)
    num_fixed_tweets = 0
    # Iterate over tweets without candidacy and fix those whose text mentions
    # a candidate or which have hashtags related to a candidacy
    for s_obj in s_objs:
        party = s_obj['partido_politico']
        movement = s_obj['movimiento']
        tweet = s_obj['tweet_obj']
        relevant_data = []
        candidacy = ''
        # Keep metadata related to the political party (and movement)
        # of the tweet (s_obj)
        for ida in interested_data:
            if ida['partido_politico'] == party:
                if movement != '':
                    if ida['movimiento'] == movement:
                        relevant_data.append(ida)
                else:
                    relevant_data.append(ida)
        if len(relevant_data) > 0:
            # Extract relevant information from the tweet: hashtags and
            # mentions if the tweet obj has these entities, otherwise the
            # text of the tweet
            if 'retweeted_status' in tweet.keys():
                original_tweet = tweet['retweeted_status']
            else:
                original_tweet = tweet
            if 'entities' in original_tweet.keys():
                t_user_mentions = self.__get_screen_names(
                    original_tweet['entities']['user_mentions'])
                t_hashtags = self.__get_hashtags(
                    original_tweet['entities']['hashtags'])
                # See if the keywords of interest are part of the tweet's
                # hashtags or mentions
                for rd in relevant_data:
                    if rd['keyword'] in t_user_mentions or \
                       rd['keyword'] in t_hashtags:
                        candidacy = rd['candidatura']
                        break
            else:
                if 'full_text' in original_tweet.keys():
                    t_text = original_tweet['full_text']
                else:
                    t_text = original_tweet['text']
                # See if the keywords of interest are present in the text
                for rd in relevant_data:
                    if rd['keyword'] in t_text:
                        candidacy = rd['candidatura']
                        break
        # Fix candidacy key
        if candidacy:
            s_obj['candidatura'] = candidacy
            num_fixed_tweets += 1
            self.__dbm.save_record(s_obj)
    return num_fixed_tweets
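# Shape of the metadata records this method relies on (keys taken from the
# code above; the values are hypothetical):
#   {'keyword': 'EleccionesPy2018', 'candidatura': 'presidencial',
#    'partido_politico': 'anr', 'movimiento': 'honor colorado'}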
def random_account_number(data):
    """
    Verify if the user's name and screen name contain strings of random numbers
    :param data: user's data
    :return: 1 if the name or screen name has random numbers, 0 otherwise
    """
    # Get heuristic parameters
    file_path = pathlib.Path(__file__).parents[0].joinpath('heuristic_config.json')
    config = get_config(file_path)['fake_handler']
    bot_prob = 0
    # Random numbers
    # Verify if the screen_name or name is composed only of numbers
    if data['screen_name'].isdigit() or data['name'].isdigit():
        return 1
    number = ''
    for k in data['screen_name']:
        # Separate the numbers of the name to analyze them
        if k in string.digits:
            number += k
        else:
            number += ' '
    numbers = number.split()
    # Increase the probability that the user is a bot if
    # there are letters between numbers in the name
    if len(numbers) > 1:
        bot_prob = 1
    partial_result = 0
    # Iterate over the list of numbers
    for n in numbers:
        num = int(n)
        if num > config['max_date']:
            partial_result = 1
        else:
            # Check if the number corresponds to a date between
            # 1011000 and 31129999. We assume that years are
            # expressed in four digits and days and months are
            # expressed using two digits
            len_num = len(str(abs(num)))
            found_date = False
            if 6 <= len_num <= 8:
                # Parse the number assuming it is expressed in the form yyyymmdd
                year, month, day = __parse_number_date_yyyymmdd(num, len_num)
                # Years are represented by four digits
                if year >= MIN_YEAR and month <= MAX_MONTH and day <= MAX_DAY:
                    found_date = True
                # Parse the number assuming it is expressed in the form ddmmyyyy
                day, month, year = __parse_number_date_ddmmyyyy(num, len_num)
                # Years are represented by four digits
                if year >= MIN_YEAR and month <= MAX_MONTH and day <= MAX_DAY:
                    found_date = True
            if not found_date:
                partial_result = 1
        # Check if the number corresponds to a birth year (four digits),
        # a favorite number less than 100, the year of the account creation
        # date, or the high-school class number
        if 1000 <= num <= 9999 or \
           num < 100 or \
           data['created_at'].split()[5] in n or \
           str(int(data['created_at'].split()[5]) - 2000) in n:
            partial_result = 0
        bot_prob += partial_result
    if bot_prob >= 1:
        return 1
    else:
        return 0
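# __parse_number_date_yyyymmdd and __parse_number_date_ddmmyyyy are referenced
# above but not shown. A minimal sketch of the yyyymmdd variant under the
# stated assumption (four-digit year, two-digit month and day); not
# necessarily the project's actual implementation:
def _sketch_parse_number_date_yyyymmdd(num, len_num):
    # Numbers shorter than eight digits are ambiguous; left-padding to the
    # full yyyymmdd width is one possible choice
    digits = str(num).zfill(8)
    year = int(digits[:4])
    month = int(digits[4:6])
    day = int(digits[6:8])
    return year, month, day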
def __init__(self, collection='tweets', language='spanish'):
    # config_file_name is expected to be defined as a class attribute
    self.config = get_config(self.config_file_name)
    self.language = language
    self.__dbm = DBManager(collection)
def main():
    # GET CONFIGS
    # TODO: for each model specify training path
    # config_dir = 'configs/rope/inference_pipeline/pspnet_first_stage.json'
    # config_dir = 'configs/rope/inference_pipeline_1/enet_hand_fixer_segm.json'
    # config_dir = 'configs/rope/inference_pipeline/pspnet_hand_fixer_segm_.json'
    # config_dir = 'configs/rope/inference_pipeline_1/enet_blade_segm.json'
    config_dir = 'configs/rope/inference_pipeline_1/enet_hand_parts_segm.json'
    config = get_config(config_dir)

    torch.manual_seed(config['seed'])
    lr = config['lr']
    batch_size = config['batch_size']
    num_epoch = config['num_epoch']
    loss_names = config['loss_names']
    loss_weights = config['loss_weights']
    log_dir = config['log_dir']
    log_interval = config['log_interval']
    scale = config['scale']
    num_classes = config['num_classes']
    image_height = config['image_height']
    image_width = config['image_width']
    roi = config['roi']
    if roi is not None:
        roi = tuple(roi)
    print('train.py: roi = ', roi)
    model_name = config['model_name']
    model_type = config['model_type']
    checkpoint_path = config['checkpoint_path']
    pretrained = config['pretrained']
    backbone = config['backbone']
    num_input_channels = config['num_input_channels']
    image_shape = (image_height, image_width, num_input_channels)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('Device is ', device)

    # PREPARE DATA
    train_loader, test_loader = get_data_loaders(model_type, num_classes,
                                                 batch_size, image_height,
                                                 image_width, roi)

    # PREPARE MODEL
    model = SegmentationModel(num_input_channels=num_input_channels,
                              num_classes=num_classes,
                              name=model_name,
                              backbone=backbone,
                              pretrained=pretrained)
    print('model_name = ', model_name)
    # TODO: freeze layers
    model.freeze(num_freeze_layers=13)
    model = model.to(device)

    # PREPARE EVALUATION AND OPTIMIZATION PARTS FOR TRAINING
    optimizer = Adam(params=filter(lambda param: param.requires_grad,
                                   model.basemodel.parameters()))
    # scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8)
    multi_loss = MultiLoss(loss_names, loss_weights, device, num_classes)

    # INITIALIZE TRAINING
    trainer = SegmentationTrainer(model, train_loader, test_loader, optimizer,
                                  multi_loss, device, log_interval, num_epoch,
                                  log_dir, checkpoint_path, pretrained,
                                  image_shape, scale)

    # TRAIN
    trainer.train()
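# Entry-point guard (assumption: this training script is launched directly,
# e.g. `python train.py`):
if __name__ == '__main__':
    main()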