def run():
  """Tally tweets on top target news per hour, bucketed by user group.

  For each time-window ``delta`` in ``_DELTAS``, partitions users into
  newsaholics / active / common groups, selects expert groups (precision,
  fscore, ci, and their union), then scans the time_deltas.tsv log and
  counts, per hour since story break, how many tweets on the target news
  came from each group.  Writes one TSV per delta with *cumulative*
  percentages of total tweets per hour.

  Side effects: creates ``_DATA_DIR`` if needed, reads
  ``../data/FolkWisdom/time_deltas.tsv``, writes
  ``_DATA_DIR/hour_thresholds_<delta>.tsv``, and logs progress.
  """
  Util.ensure_dir_exist(_DATA_DIR)
  category = None
  seeds = Util.load_seeds()

  # Read twitter data: ground-truth rankings define the target news set.
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category)
  log('Num ground_truth_rankings: %s' % len(gt_rankings))
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
  log('Size target_news: %s' % len(target_news))

  for delta in _DELTAS:
    (num_users, newsaholics, active_users,
     common_users) = basic_groups.group_users(delta, category)
    population = newsaholics.union(active_users).union(common_users)
    log('Num newsaholics: %s' % len(newsaholics))
    log('Num active: %s' % len(active_users))
    log('Num common: %s' % len(common_users))
    log('Num users (population): %s' % len(population))

    # -- Get experts --
    ExpertGroup.precision = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
        category)
    ExpertGroup.fscore = experts.select_experts_fscore(
        len(target_news), num_users, delta, _SIZE_EXPERTS, category)
    ExpertGroup.ci = experts.select_experts_ci(
        num_users, delta, _SIZE_EXPERTS, category)
    ExpertGroup.union = experts.select_all_experts(
        ExpertGroup.precision, ExpertGroup.fscore, ExpertGroup.ci)
    log('Num experts (precision): %s' % len(ExpertGroup.precision))
    log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
    log('Num experts (ci): %s' % len(ExpertGroup.ci))
    log('Num all experts: %s' % len(ExpertGroup.union))
    non_experts = population.difference(ExpertGroup.union)
    log('Num non_experts: %s' % len(non_experts))

    # -- counting --
    total_num_tweets = 0
    hour_to_num_tweets = {}
    with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
      for line in in_file:
        tokens = line.split('\t')
        time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
        url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
        user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]
        if time_delta_in_sec > 0 and url in target_news:
          # Floor division: keeps hour buckets integral on Python 3 too
          # (plain / would create a distinct float bucket per tweet).
          current_hour = time_delta_in_sec // _NUM_SEC_PER_HOUR
          total_num_tweets += 1
          if current_hour not in hour_to_num_tweets:
            hour_to_num_tweets[current_hour] = GroupCount()
          gcount = hour_to_num_tweets[current_hour]
          gcount.population += 1
          # Expert sub-groups may overlap, so each is tested independently;
          # non_experts is the complement of the expert union.
          if user_id in ExpertGroup.union:
            gcount.union += 1
            if user_id in ExpertGroup.precision:
              gcount.precision += 1
            if user_id in ExpertGroup.fscore:
              gcount.fscore += 1
            if user_id in ExpertGroup.ci:
              gcount.ci += 1
          else:
            gcount.non_experts += 1
          if user_id in common_users:
            gcount.common += 1

    # Accumulate counts hour by hour so each output row is cumulative.
    # Hours MUST be visited in ascending order for the running totals to
    # mean "tweets up to and including this hour".
    gcount = GroupCount()
    with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta, 'w') as out_file:
      for hour in sorted(hour_to_num_tweets.keys()):
        gc = hour_to_num_tweets[hour]
        gcount.add(gc)
        percentage = (gcount.population / float(total_num_tweets)) * 100.0
        percentage_common = (gcount.common / float(total_num_tweets)) * 100.0
        percentage_experts = (gcount.union / float(total_num_tweets)) * 100.0
        percentage_non_experts = (
            gcount.non_experts / float(total_num_tweets)) * 100.0
        out_file.write(
            '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
            % (hour, percentage, percentage_non_experts, percentage_experts,
               percentage_common,
               (gcount.precision / float(total_num_tweets)) * 100.0,
               (gcount.fscore / float(total_num_tweets)) * 100.0,
               (gcount.ci / float(total_num_tweets)) * 100.0))
    log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci')
def get_all_user_groups(delta=4, category=None):
  """Build every user group used by the experiments for one time window.

  Loads seeds and ground-truth rankings (training set, or testing set when
  ``_SWITCHED``), partitions users into basic and even groups, selects the
  expert groups (precision, fscore, ci and derived splits, social-bias,
  super experts), and draws several random samples of the non-experts.

  Returns:
    (groups, d_num_followers): a populated ``UserGroups`` instance and the
    follower-count dict returned by ``select_experts_social_bias``.
  """
  seeds = Util.load_seeds()

  # Pick the data set and month range for this run.
  data_set = DataSet.TESTING if _SWITCHED else DataSet.TRAINING
  months = _TESTING_SET_MONTHS if _SWITCHED else _TRAINING_SET_MONTHS

  retweets = ground_truths.find_retweets(months) if _EXCLUDE_RETWEETS else set()

  gt_rankings = ground_truths.get_gt_rankings(
      seeds, data_set, category,
      exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
      retweets=retweets)
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)

  groups = UserGroups()

  (num_users, groups.newsaholics, groups.active_users,
   groups.common_users) = basic_groups.group_users(delta, category)
  groups.population = groups.newsaholics.union(
      groups.active_users).union(groups.common_users)

  num_users_eg, groups.even_groups = even_groups.group_users(
      delta, _NUM_GROUPS, _SIZE_OF_GROUP_IN_PERCENT, category)

  # Expert selection strategies.
  groups.precision = experts.select_experts_precision(
      groups.newsaholics.union(groups.active_users), num_users, delta,
      _SIZE_EXPERTS, category)
  groups.fscore = experts.select_experts_fscore(
      len(target_news), num_users, delta, _SIZE_EXPERTS, category)
  groups.ci = experts.select_experts_ci(
      num_users, delta, _SIZE_EXPERTS, category)
  groups.super_experts = experts.select_super_experts(
      groups.precision, groups.fscore, groups.ci)

  groups.ci_hi, groups.ci_li = experts.split_ci_experts_by_followers(groups.ci)

  # Deal the ci experts round-robin into three equal-ish sub-groups.
  groups.ci_1 = set()
  groups.ci_2 = set()
  groups.ci_3 = set()
  ci_buckets = (groups.ci_1, groups.ci_2, groups.ci_3)
  for position, expert_id in enumerate(groups.ci):
    ci_buckets[position % 3].add(expert_id)

  groups.social_bias, d_num_followers = experts.select_experts_social_bias(
      num_users, _SIZE_EXPERTS)
  groups.all_experts = experts.select_all_experts(
      groups.precision, groups.fscore, groups.ci)
  groups.non_experts = groups.population.difference(groups.all_experts)

  # Sample fractions of the non-expert population.
  # NOTE(review): the fractions below do not match the variable names
  # (non_experts_25 uses 0.05, non_experts_1 uses 0.02) — confirm whether
  # the names or the constants are the intended values.
  sample_size = int(len(groups.non_experts) * _NON_EXPERTS_SAMPLE_SIZE)
  sample_size_25 = int(len(groups.non_experts) * 0.05)
  sample_size_10 = int(len(groups.non_experts) * 0.10)
  sample_size_1 = int(len(groups.non_experts) * 0.02)
  groups.non_experts_sampled = set(
      random.sample(groups.non_experts, sample_size))
  groups.non_experts_25 = set(random.sample(groups.non_experts, sample_size_25))
  groups.non_experts_10 = set(random.sample(groups.non_experts, sample_size_10))
  groups.non_experts_1 = set(random.sample(groups.non_experts, sample_size_1))

  return groups, d_num_followers
def run():
  """Tally tweets on top target news per hour, bucketed by user group.

  For each ``delta`` in ``_DELTAS``: groups users, selects expert groups,
  counts per-hour tweets on the target news by group from
  time_deltas.tsv, and writes cumulative per-hour percentages to
  ``_DATA_DIR/hour_thresholds_<delta>.tsv``.

  NOTE(review): this is a second, identical definition of ``run`` — it
  shadows the earlier one in this file; confirm which copy should remain.
  """
  Util.ensure_dir_exist(_DATA_DIR)
  category = None
  seeds = Util.load_seeds()
  #read twitter data
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category)
  log('Num ground_truth_rankings: %s' % len(gt_rankings))
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
  log('Size target_news: %s' % len(target_news))

  for delta in _DELTAS:
    (num_users, newsaholics, active_users,
     common_users) = basic_groups.group_users(delta, category)
    population = newsaholics.union(active_users).union(common_users)
    log('Num newsaholics: %s' % len(newsaholics))
    log('Num active: %s' % len(active_users))
    log('Num common: %s' % len(common_users))
    log('Num users (population): %s' % len(population))

    # -- Get experts --
    ExpertGroup.precision = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
        category)
    ExpertGroup.fscore = experts.select_experts_fscore(
        len(target_news), num_users, delta, _SIZE_EXPERTS, category)
    ExpertGroup.ci = experts.select_experts_ci(
        num_users, delta, _SIZE_EXPERTS, category)
    ExpertGroup.union = experts.select_all_experts(
        ExpertGroup.precision, ExpertGroup.fscore, ExpertGroup.ci)
    log('Num experts (precision): %s' % len(ExpertGroup.precision))
    log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
    log('Num experts (ci): %s' % len(ExpertGroup.ci))
    log('Num all experts: %s' % len(ExpertGroup.union))
    non_experts = population.difference(ExpertGroup.union)
    log('Num non_experts: %s' % len(non_experts))
    # other_users = population.difference(all_experts).difference(common_users)

    # -- counting --
    total_num_tweets = 0
    hour_to_num_tweets = {}
    with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
      for line in in_file:
        tokens = line.split('\t')
        time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
        url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
        user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]
        if time_delta_in_sec > 0 and url in target_news:
          # NOTE(review): on Python 3 this / is true division, producing a
          # float bucket per tweet — likely intends // (integer hours).
          current_hour = time_delta_in_sec / _NUM_SEC_PER_HOUR
          total_num_tweets += 1
          if current_hour not in hour_to_num_tweets:
            hour_to_num_tweets[current_hour] = GroupCount()
          gcount = hour_to_num_tweets[current_hour]
          gcount.population += 1
          # Expert sub-groups may overlap; each is tested independently.
          if user_id in ExpertGroup.union:
            gcount.union += 1
            if user_id in ExpertGroup.precision:
              gcount.precision += 1
            if user_id in ExpertGroup.fscore:
              gcount.fscore += 1
            if user_id in ExpertGroup.ci:
              gcount.ci += 1
          else:
            gcount.non_experts += 1
          if user_id in common_users:
            gcount.common += 1
          # print >> sys.stderr, 'Error, a user in expert union but not belongs to any expert group'
          # elif user_id in common_users:
          #   gcount.common += 1
          # else :
          #   gcount.other += 1
          # if user_id in non_experts:
          #   gcount.non_experts += 1

    # Running totals: each output row is cumulative over hours.
    # NOTE(review): .keys() is iterated UNSORTED here, so the cumulative
    # sums depend on dict iteration order — probably wants sorted(hours).
    gcount = GroupCount()
    with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta, 'w') as out_file:
      for hour in hour_to_num_tweets.keys():
        gc = hour_to_num_tweets[hour]
        gcount.add(gc)
        percentage = (gcount.population / float(total_num_tweets)) * 100.0
        percentage_common = (gcount.common / float(total_num_tweets)) * 100.0
        # NOTE(review): gcount.other is never incremented above (the code
        # that did is commented out), and percentage_other is never written.
        percentage_other = (gcount.other / float(total_num_tweets)) * 100.0
        percentage_experts = (gcount.union / float(total_num_tweets)) * 100.0
        percentage_non_experts = (
            gcount.non_experts / float(total_num_tweets)) * 100.0
        out_file.write(
            '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
            % (hour, percentage, percentage_non_experts, percentage_experts,
               percentage_common,
               (gcount.precision / float(total_num_tweets)) * 100.0,
               (gcount.fscore / float(total_num_tweets)) * 100.0,
               (gcount.ci / float(total_num_tweets)) * 100.0))
    log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci')