def run(): """Contains the main logic for this analysis.""" global _SIZE_TOP_NEWS FileLog.set_log_dir() seeds = Util.load_seeds() for category in _CATEGORIES: log('Preforming analysis for category: %s' % category) if category: _SIZE_TOP_NEWS = .10 else: _SIZE_TOP_NEWS = .02 gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category) log('Num ground_truth_rankings: %s' % len(gt_rankings)) target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS) log('Size target_news: %s' % len(target_news)) # for delta in _DELTAS: for delta in [4]: run_params_str = 'd%s_t%s_e%s_%s' % (delta, int(_SIZE_TOP_NEWS * 100), int(_SIZE_EXPERTS * 100), category) output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str Util.ensure_dir_exist(output_dir) info_output_dir = '../graph/CrowdWisdomDef/%s/info/' % run_params_str Util.ensure_dir_exist(info_output_dir) output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str Util.ensure_dir_exist(output_dir) (num_users, newsaholics, active_users, common_users) = basic_groups.group_users(delta, category) log('Num newsaholics: %s' % len(newsaholics)) log('Num active: %s' % len(active_users)) log('Num common: %s' % len(common_users)) common_user_buckets = common_user_groups.group_users(common_users, _NUM_GROUPS) for i, common_user_bucket in enumerate(common_user_buckets): print 'Number users in common user bucket %s: %s' % (i, len(common_user_bucket)) experts_precision = experts.select_experts_precision( newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS, category) experts_fscore = experts.select_experts_fscore(len(target_news), num_users, delta, _SIZE_EXPERTS, category) experts_ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS, category) super_experts = experts.select_super_experts(experts_precision, experts_fscore, experts_ci) log('Num experts (precision): %s' % len(experts_precision)) log('Num experts (fscore): %s' % len(experts_fscore)) log('Num experts (ci): %s' % len(experts_ci)) 
log('Finding rankings with an %s hour delta.' % delta) (market_rankings, newsaholic_rankings, active_rankings, common_rankings) = basic_groups.get_rankings(delta, seeds, newsaholics, active_users, category) (expert_precision_rankings, expert_fscore_rankings, expert_ci_rankings, expert_s_rankings) = experts.get_rankings(delta, seeds, experts_precision, experts_fscore, experts_ci, super_experts, category) common_groups_rankings = common_user_groups.get_rankings(delta, seeds, common_user_buckets, category) num_votes_common = 0 for url, count in common_rankings: num_votes_common += count log('Num common_rankings: %s' % len(common_rankings)) log('Num common votes: %s' % num_votes_common) num_votes_expert_precision = 0 for url, count in expert_precision_rankings: num_votes_expert_precision += count log('Num expert_precision rankings: %s' % len(expert_precision_rankings)) log('Num expert_precision votes: %s' % num_votes_expert_precision) num_votes_expert_fscore = 0 for url, count in expert_fscore_rankings: num_votes_expert_fscore += count log('Num expert_fscore rankings: %s' % len(expert_fscore_rankings)) log('Num expert_fscore votes: %s' % num_votes_expert_fscore) num_votes_expert_ci = 0 for url, count in expert_ci_rankings: num_votes_expert_ci += count log('Num expert_ci rankings: %s' % len(expert_ci_rankings)) log('Num expert_ci votes: %s' % num_votes_expert_ci) num_votes_buckets = [] for i, common_group_rankings in enumerate(common_groups_rankings): num_votes = 0 for url, count in common_group_rankings: num_votes += count num_votes_buckets.append(num_votes) log('Num common rankings (%s buckets): %s' % (i, len(common_group_rankings))) log('Num expert_ci votes (%s buckets): %s' % (i, num_votes)) with open('%suser_demographics_%s.txt' % (info_output_dir, run_params_str), 'w') as output_file: output_file.write('Number of Common Users: %s\n' % len(common_users)) output_file.write('\n'); output_file.write('Number of Precision Experts: %s\n' % len(experts_precision)) 
output_file.write('Number of F-Score Experts: %s\n' % len(experts_fscore)) output_file.write('Number of CI Experts: %s\n' % len(experts_ci)) output_file.write('Number users per common user bucket: %s\n' %len(common_user_buckets[0])) output_file.write('Number of Precision and F-Score Experts: %s\n' % len(experts_precision.intersection(experts_fscore))) output_file.write('Number of Precision and CI Experts: %s\n' % len(experts_precision.intersection(experts_ci))) output_file.write('Number of F-Score and CI Experts: %s\n' % len(experts_fscore.intersection(experts_ci))) output_file.write('\n'); output_file.write('Number of Users (Total): %s\n' % (len(newsaholics) + len(active_users) + len(common_users))) output_file.write('\n') output_file.write('Number of votes by Common Users: %s\n' % num_votes_common) output_file.write('\n'); output_file.write('Number of votes by Expert (Precision) Users: %s\n' % num_votes_expert_precision) output_file.write('Number of votes by Expert (fscore) Users: %s\n' % num_votes_expert_fscore) output_file.write('Number of votes by Expert (ci) Users: %s\n' % num_votes_expert_ci) output_file.write('Number of votes per bucket: %s\n' % num_votes_buckets) output_file.write('\n') output_file.write('Total Number of Good News: %s\n' % len(target_news)) log('Ground Truth Top 5') for i in range(min(len(gt_rankings), 5)): url, count = gt_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Common Top 5') for i in range(min(len(common_rankings), 5)): url, count = common_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (Precision) Top 5') for i in range(min(len(expert_precision_rankings), 5)): url, count = expert_precision_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (fscore) Top 5') for i in range(min(len(expert_fscore_rankings), 5)): url, count = expert_fscore_rankings[i] 
log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') log('Expert (ci) Top 5') for i in range(min(len(expert_ci_rankings), 5)): url, count = expert_ci_rankings[i] log('[%s] %s\t%s' %(i, url.strip(), count)) log('-----------------------------------') common_precisions, common_recalls = calc_precision_recall(gt_rankings, common_rankings) (expert_p_precisions, expert_p_recalls) = calc_precision_recall(gt_rankings, expert_precision_rankings) (expert_f_precisions, expert_f_recalls) = calc_precision_recall(gt_rankings, expert_fscore_rankings) (expert_c_precisions, expert_c_recalls) = calc_precision_recall(gt_rankings, expert_ci_rankings) common_group_ps = [] common_group_rs = [] for common_group_ranking in common_groups_rankings: common_group_p, common_group_r = calc_precision_recall(gt_rankings, common_group_ranking) common_group_ps.append(common_group_p) common_group_rs.append(common_group_r) log('Drawing common group model precision-recall graph...') common_user_groups.draw_precision_recall(common_group_ps, common_group_rs, expert_p_precisions, expert_p_recalls, expert_f_precisions, expert_f_recalls, expert_c_precisions, expert_c_recalls, run_params_str) log('Drawing common group model precision graph...') common_user_groups.draw_precision(common_group_ps, expert_p_precisions, expert_f_precisions, expert_c_precisions, run_params_str)
def get_all_user_groups(delta=4, category=None):
  """Build every user grouping used by the analyses.

  Loads seeds and ground-truth rankings for the active data set, then fills a
  UserGroups instance with the basic groups, even groups, expert sets
  (precision / f-score / ci / super / social-bias), CI splits, and several
  random samples of the non-expert population.

  Returns a (groups, d_num_followers) tuple, where d_num_followers is the
  follower-count mapping produced by select_experts_social_bias.
  """
  seeds = Util.load_seeds()

  # Pick the data set and month range; _SWITCHED swaps training and testing.
  if _SWITCHED:
    data_set = DataSet.TESTING
    months = _TESTING_SET_MONTHS
  else:
    data_set = DataSet.TRAINING
    months = _TRAINING_SET_MONTHS

  retweets = ground_truths.find_retweets(months) if _EXCLUDE_RETWEETS else set()

  gt_rankings = ground_truths.get_gt_rankings(
      seeds, data_set, category,
      exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
      retweets=retweets)
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)

  groups = UserGroups()

  (num_users, groups.newsaholics, groups.active_users,
   groups.common_users) = basic_groups.group_users(delta, category)
  groups.population = groups.newsaholics.union(
      groups.active_users).union(groups.common_users)

  num_users_eg, groups.even_groups = even_groups.group_users(
      delta, _NUM_GROUPS, _SIZE_OF_GROUP_IN_PERCENT, category)

  groups.precision = experts.select_experts_precision(
      groups.newsaholics.union(groups.active_users), num_users, delta,
      _SIZE_EXPERTS, category)
  groups.fscore = experts.select_experts_fscore(
      len(target_news), num_users, delta, _SIZE_EXPERTS, category)
  groups.ci = experts.select_experts_ci(
      num_users, delta, _SIZE_EXPERTS, category)
  groups.super_experts = experts.select_super_experts(
      groups.precision, groups.fscore, groups.ci)
  groups.ci_hi, groups.ci_li = experts.split_ci_experts_by_followers(groups.ci)

  # Deal the CI experts round-robin into three equal-sized subsets.
  groups.ci_1 = set()
  groups.ci_2 = set()
  groups.ci_3 = set()
  ci_thirds = (groups.ci_1, groups.ci_2, groups.ci_3)
  for position, ci_expert in enumerate(groups.ci):
    ci_thirds[position % 3].add(ci_expert)

  groups.social_bias, d_num_followers = experts.select_experts_social_bias(
      num_users, _SIZE_EXPERTS)
  groups.all_experts = experts.select_all_experts(
      groups.precision, groups.fscore, groups.ci)
  groups.non_experts = groups.population.difference(groups.all_experts)

  # NOTE(review): the *_25 / *_10 / *_1 names do not match the 0.05 / 0.10 /
  # 0.02 fractions used below — confirm which is intended before relying on
  # the attribute names. (Attribute names are kept as-is for compatibility.)
  num_non_experts = len(groups.non_experts)
  size_default = int(num_non_experts * _NON_EXPERTS_SAMPLE_SIZE)
  size_25 = int(num_non_experts * 0.05)
  size_10 = int(num_non_experts * 0.10)
  size_1 = int(num_non_experts * 0.02)
  groups.non_experts_sampled = set(random.sample(groups.non_experts,
                                                 size_default))
  groups.non_experts_25 = set(random.sample(groups.non_experts, size_25))
  groups.non_experts_10 = set(random.sample(groups.non_experts, size_10))
  groups.non_experts_1 = set(random.sample(groups.non_experts, size_1))

  return groups, d_num_followers