Example #1
0
def get_user_groups(delta, category=None):
    """Collect the expert and basic user groups for a delta/category pair.

    Returns a tuple (experts_all, newsaholics, active_users, common_users),
    where experts_all is the union of the precision, fscore and ci expert
    selections (each chosen at the fixed 2% size).
    """
    seeds = Util.load_seeds()

    log('Finding basic user groups for delta %s and category %s...' %
        (delta, category))
    (num_users, newsaholics, active_users,
     common_users) = basic_groups.group_users(delta, category)

    log('Finding precision experts for delta %s and category %s...' %
        (delta, category))
    precision_experts = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, .02, category)

    log('Finding ground truths...')
    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Finding target news...')
    size_target_news = len(ground_truths.find_target_news(gt_rankings, .02))

    log('Finding fscore experts for delta %s and category %s...' %
        (delta, category))
    fscore_experts = experts.select_experts_fscore(size_target_news, num_users,
                                                   delta, .02, category)

    log('Finding ci experts for delta %s and category %s...' %
        (delta, category))
    ci_experts = experts.select_experts_ci(num_users, delta, .02, category)

    # An "expert" overall is anyone qualifying under any of the three
    # definitions.
    experts_all = precision_experts.union(fscore_experts, ci_experts)

    return experts_all, newsaholics, active_users, common_users
Example #2
0
def get_user_groups(delta, category=None):
  """Build the combined expert set plus the basic user groups.

  Returns (experts_all, newsaholics, active_users, common_users); the
  expert set is the union of the precision, fscore and ci selections,
  each taken at the fixed 2% size.
  """
  seeds = Util.load_seeds()

  log('Finding basic user groups for delta %s and category %s...' % (delta, category))
  num_users, newsaholics, active_users, common_users = basic_groups.group_users(
      delta, category)

  log('Finding precision experts for delta %s and category %s...' % (delta, category))
  high_activity_users = newsaholics.union(active_users)
  experts_p = experts.select_experts_precision(high_activity_users, num_users,
                                               delta, .02, category)

  log('Finding ground truths...')
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category)
  log('Finding target news...')
  target_news = ground_truths.find_target_news(gt_rankings, .02)

  log('Finding fscore experts for delta %s and category %s...' % (delta, category))
  experts_f = experts.select_experts_fscore(len(target_news), num_users, delta,
                                            .02, category)

  log('Finding ci experts for delta %s and category %s...' % (delta, category))
  experts_ci = experts.select_experts_ci(num_users, delta, .02, category)

  # Union of all three expert definitions.
  experts_all = experts_p | experts_f | experts_ci

  return experts_all, newsaholics, active_users, common_users
def run():
  """Contains the main logic for this analysis."""
  global _SIZE_TOP_NEWS
  FileLog.set_log_dir()

  seeds = Util.load_seeds()
  for category in _CATEGORIES:
    log('Preforming analysis for category: %s' % category)
    if category:
      _SIZE_TOP_NEWS = .10
    else:
      _SIZE_TOP_NEWS = .02

    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Num ground_truth_rankings: %s' % len(gt_rankings))


    target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
    log('Size target_news: %s' % len(target_news))

    # for delta in _DELTAS:
    for delta in [4]:
      run_params_str = 'd%s_t%s_e%s_%s' % (delta, int(_SIZE_TOP_NEWS * 100),
                                           int(_SIZE_EXPERTS * 100), category)
      output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str
      Util.ensure_dir_exist(output_dir)

      info_output_dir = '../graph/CrowdWisdomDef/%s/info/' % run_params_str
      Util.ensure_dir_exist(info_output_dir)

      output_dir = '../graph/CrowdWisdomDef/%s/' % run_params_str
      Util.ensure_dir_exist(output_dir)

      (num_users, newsaholics,
       active_users, common_users) = basic_groups.group_users(delta, category)
      log('Num newsaholics: %s' % len(newsaholics))
      log('Num active: %s' % len(active_users))
      log('Num common: %s' % len(common_users))

      common_user_buckets = common_user_groups.group_users(common_users, _NUM_GROUPS)
      for i, common_user_bucket in enumerate(common_user_buckets):
        print 'Number users in common user bucket %s: %s' % (i, len(common_user_bucket))

      experts_precision = experts.select_experts_precision(
          newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
          category)
      experts_fscore = experts.select_experts_fscore(len(target_news),
                                                     num_users,
                                                     delta, _SIZE_EXPERTS,
                                                     category)
      experts_ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                             category)
      super_experts = experts.select_super_experts(experts_precision,
                                                   experts_fscore,
                                                   experts_ci)

      log('Num experts (precision): %s' % len(experts_precision))
      log('Num experts (fscore): %s' % len(experts_fscore))
      log('Num experts (ci): %s' % len(experts_ci))

      log('Finding rankings with an %s hour delta.' % delta)
      (market_rankings, newsaholic_rankings,
       active_rankings,
       common_rankings) = basic_groups.get_rankings(delta, seeds, newsaholics,
                                                    active_users, category)
      (expert_precision_rankings, expert_fscore_rankings,
       expert_ci_rankings,
       expert_s_rankings) = experts.get_rankings(delta, seeds,
                                                 experts_precision,
                                                 experts_fscore,
                                                 experts_ci,
                                                 super_experts,
                                                 category)

      common_groups_rankings = common_user_groups.get_rankings(delta, seeds,
                                                               common_user_buckets,
                                                               category)

      num_votes_common = 0
      for url, count in common_rankings:
        num_votes_common += count
      log('Num common_rankings: %s' % len(common_rankings))
      log('Num common votes: %s' % num_votes_common)
      num_votes_expert_precision = 0
      for url, count in expert_precision_rankings:
        num_votes_expert_precision += count
      log('Num expert_precision rankings: %s' % len(expert_precision_rankings))
      log('Num expert_precision votes: %s' % num_votes_expert_precision)
      num_votes_expert_fscore = 0
      for url, count in expert_fscore_rankings:
        num_votes_expert_fscore += count
      log('Num expert_fscore rankings: %s' % len(expert_fscore_rankings))
      log('Num expert_fscore votes: %s' % num_votes_expert_fscore)
      num_votes_expert_ci = 0
      for url, count in expert_ci_rankings:
        num_votes_expert_ci += count
      log('Num expert_ci rankings: %s' % len(expert_ci_rankings))
      log('Num expert_ci votes: %s' % num_votes_expert_ci)
      num_votes_buckets = []
      for i, common_group_rankings in enumerate(common_groups_rankings):
        num_votes = 0
        for url, count in common_group_rankings:
          num_votes += count
        num_votes_buckets.append(num_votes)
        log('Num common rankings (%s buckets): %s' % (i, len(common_group_rankings)))
        log('Num expert_ci votes (%s buckets): %s' % (i, num_votes))

      with open('%suser_demographics_%s.txt'
                % (info_output_dir, run_params_str), 'w') as output_file:
        output_file.write('Number of Common Users: %s\n' % len(common_users))
        output_file.write('\n');
        output_file.write('Number of Precision Experts: %s\n' % len(experts_precision))
        output_file.write('Number of F-Score Experts: %s\n' % len(experts_fscore))
        output_file.write('Number of CI Experts: %s\n' % len(experts_ci))
        output_file.write('Number users per common user bucket: %s\n' %len(common_user_buckets[0]))
        output_file.write('Number of Precision and F-Score Experts: %s\n'
                          % len(experts_precision.intersection(experts_fscore)))
        output_file.write('Number of Precision and CI Experts: %s\n'
                          % len(experts_precision.intersection(experts_ci)))
        output_file.write('Number of F-Score and CI Experts: %s\n'
                          % len(experts_fscore.intersection(experts_ci)))
        output_file.write('\n');
        output_file.write('Number of Users (Total): %s\n'
                          % (len(newsaholics) + len(active_users)
                             + len(common_users)))
        output_file.write('\n')
        output_file.write('Number of votes by Common Users: %s\n'
                          % num_votes_common)
        output_file.write('\n');
        output_file.write('Number of votes by Expert (Precision) Users: %s\n'
                % num_votes_expert_precision) 
        output_file.write('Number of votes by Expert (fscore) Users: %s\n'
                % num_votes_expert_fscore) 
        output_file.write('Number of votes by Expert (ci) Users: %s\n'
                % num_votes_expert_ci) 
        output_file.write('Number of votes per bucket: %s\n' % num_votes_buckets)
        output_file.write('\n')
        output_file.write('Total Number of Good News: %s\n' % len(target_news))

      log('Ground Truth Top 5')
      for i in range(min(len(gt_rankings), 5)):
        url, count = gt_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Common Top 5')
      for i in range(min(len(common_rankings), 5)):
        url, count = common_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (Precision) Top 5')
      for i in range(min(len(expert_precision_rankings), 5)):
        url, count = expert_precision_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (fscore) Top 5')
      for i in range(min(len(expert_fscore_rankings), 5)):
        url, count = expert_fscore_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
      log('Expert (ci) Top 5')
      for i in range(min(len(expert_ci_rankings), 5)):
        url, count = expert_ci_rankings[i]
        log('[%s] %s\t%s' %(i, url.strip(), count))
      log('-----------------------------------')
        

      common_precisions, common_recalls = calc_precision_recall(gt_rankings,
                                                                common_rankings)
      (expert_p_precisions,
       expert_p_recalls) = calc_precision_recall(gt_rankings,
                                                 expert_precision_rankings)
      (expert_f_precisions,
       expert_f_recalls) = calc_precision_recall(gt_rankings,
                                                 expert_fscore_rankings)
      (expert_c_precisions,
       expert_c_recalls) = calc_precision_recall(gt_rankings,
                                                 expert_ci_rankings)

      common_group_ps = []
      common_group_rs = []
      for common_group_ranking in common_groups_rankings:
        common_group_p, common_group_r = calc_precision_recall(gt_rankings,
                                                               common_group_ranking)
        common_group_ps.append(common_group_p)
        common_group_rs.append(common_group_r)
                                                

      log('Drawing common group model precision-recall graph...')
      common_user_groups.draw_precision_recall(common_group_ps, common_group_rs,
                                               expert_p_precisions, expert_p_recalls,
                                               expert_f_precisions, expert_f_recalls,
                                               expert_c_precisions, expert_c_recalls,
                                               run_params_str)

      log('Drawing common group model precision graph...')
      common_user_groups.draw_precision(common_group_ps, expert_p_precisions,
                                        expert_f_precisions, expert_c_precisions,
                                        run_params_str)
Example #4
0
def _split_into_thirds(users):
  """Partition users round-robin into three disjoint, roughly equal sets."""
  buckets = (set(), set(), set())
  for i, user in enumerate(users):
    buckets[i % 3].add(user)
  return buckets


def find_counts(seeds, category=None):
  """Count testing-set tweets in the 0-1h, 1-4h, 4-8h and 8h+ delta windows.

  For each window the count is broken down by the user group active at the
  matching delta: common users, the three common-user thirds, and each
  expert flavor (precision, fscore, ci, and their union).

  Args:
    seeds: dict mapping url -> (seed_tweet_id, seed_user_id, seed_time).
    category: optional category filter; None means all categories.

  Returns:
    A 29-tuple of counts; see the return statement for the ordering.
  """
  num_0_1 = 0
  num_1_4 = 0
  num_4_8 = 0
  num_after_8 = 0
  num_total = 0

  log('Finding common users delta 1...')
  (num_users_1, newsaholics_1, active_users_1,
   common_users_1) = basic_groups.group_users(1, category)
  log('Finding common users delta 4...')
  (num_users_4, newsaholics_4, active_users_4,
   common_users_4) = basic_groups.group_users(4, category)
  log('Finding common users delta 8...')
  (num_users_8, newsaholics_8, active_users_8,
   common_users_8) = basic_groups.group_users(8, category)

  # Split each common-user population into three disjoint thirds.
  (common_users_1_1, common_users_1_2,
   common_users_1_3) = _split_into_thirds(common_users_1)
  (common_users_4_1, common_users_4_2,
   common_users_4_3) = _split_into_thirds(common_users_4)
  (common_users_8_1, common_users_8_2,
   common_users_8_3) = _split_into_thirds(common_users_8)

  log('Size Common Users 1 (delta 1): %s' % len(common_users_1_1))
  log('Size Common Users 2 (delta 1): %s' % len(common_users_1_2))
  log('Size Common Users 3 (delta 1): %s' % len(common_users_1_3))
  log('Size Common Users 1 (delta 4): %s' % len(common_users_4_1))
  log('Size Common Users 2 (delta 4): %s' % len(common_users_4_2))
  log('Size Common Users 3 (delta 4): %s' % len(common_users_4_3))
  log('Size Common Users 1 (delta 8): %s' % len(common_users_8_1))
  log('Size Common Users 2 (delta 8): %s' % len(common_users_8_2))
  log('Size Common Users 3 (delta 8): %s' % len(common_users_8_3))

  log('Finding precision experts delta 1...')
  experts_p_1 = experts.select_experts_precision(newsaholics_1.union(active_users_1),
                                                 num_users_1, 1, _SIZE_EXPERTS, category)
  # Fix: the next two log lines previously said 'delta 1' (copy-paste).
  log('Finding precision experts delta 4...')
  experts_p_4 = experts.select_experts_precision(newsaholics_4.union(active_users_4),
                                                 num_users_4, 4, _SIZE_EXPERTS, category)
  log('Finding precision experts delta 8...')
  experts_p_8 = experts.select_experts_precision(newsaholics_8.union(active_users_8),
                                                 num_users_8, 8, _SIZE_EXPERTS, category)

  log('Finding ground truths...')
  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING, category)
  log('Finding target news...')
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_EXPERTS)
  size_target_news = len(target_news)

  log('Finding fscore experts delta 1...')
  experts_f_1 = experts.select_experts_fscore(size_target_news, num_users_1,
                                              1, _SIZE_EXPERTS, category)
  log('Finding fscore experts delta 4...')
  experts_f_4 = experts.select_experts_fscore(size_target_news, num_users_4,
                                              4, _SIZE_EXPERTS, category)
  log('Finding fscore experts delta 8...')
  experts_f_8 = experts.select_experts_fscore(size_target_news, num_users_8,
                                              8, _SIZE_EXPERTS, category)

  log('Finding ci experts delta 1...')
  experts_ci_1 = experts.select_experts_ci(num_users_1, 1, _SIZE_EXPERTS, category)
  log('Finding ci experts delta 4...')
  experts_ci_4 = experts.select_experts_ci(num_users_4, 4, _SIZE_EXPERTS, category)
  log('Finding ci experts delta 8...')
  experts_ci_8 = experts.select_experts_ci(num_users_8, 8, _SIZE_EXPERTS, category)

  experts_all_1 = experts_p_1.union(experts_f_1).union(experts_ci_1)
  experts_all_4 = experts_p_4.union(experts_f_4).union(experts_ci_4)
  experts_all_8 = experts_p_8.union(experts_f_8).union(experts_ci_8)

  num_0_1_common = 0
  num_1_4_common = 0
  num_4_8_common = 0

  num_cu_1_1 = 0
  num_cu_1_2 = 0
  num_cu_1_3 = 0

  num_cu_4_1 = 0
  num_cu_4_2 = 0
  num_cu_4_3 = 0

  num_cu_8_1 = 0
  num_cu_8_2 = 0
  num_cu_8_3 = 0

  num_0_1_experts_p = 0
  num_1_4_experts_p = 0
  num_4_8_experts_p = 0

  num_0_1_experts_f = 0
  num_1_4_experts_f = 0
  num_4_8_experts_f = 0

  num_0_1_experts_ci = 0
  num_1_4_experts_ci = 0
  num_4_8_experts_ci = 0

  num_0_1_experts_all = 0
  num_1_4_experts_all = 0
  num_4_8_experts_all = 0

  log('Finding counts...')
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:

      # parse line
      tokens = line.split('\t')
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      time_delta = timedelta(seconds=int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]))
      tweet_category = tokens[_TIMEDELTAS_FILE_CATEGORY_INDEX].strip()
      user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

      if url in seeds:
        (seed_tweet_id, seed_user_id, seed_time) = seeds[url]

        if Util.is_in_testing_set(seed_time) and category_matches(category,
                                                                  tweet_category):
          num_total += 1
          # Bucket by delta window; group membership is checked against the
          # groups computed at the window's matching delta.
          if time_delta < timedelta(hours=1):
            num_0_1 += 1
            if user_id in common_users_1:
              num_0_1_common += 1
            if user_id in experts_p_1:
              num_0_1_experts_p += 1
            if user_id in experts_f_1:
              num_0_1_experts_f += 1
            if user_id in experts_ci_1:
              num_0_1_experts_ci += 1
            if user_id in experts_all_1:
              num_0_1_experts_all += 1
            if user_id in common_users_1_1:
              num_cu_1_1 += 1
            if user_id in common_users_1_2:
              num_cu_1_2 += 1
            if user_id in common_users_1_3:
              num_cu_1_3 += 1
          elif time_delta >= timedelta(hours=1) and time_delta < timedelta(hours=4):
            num_1_4 += 1
            if user_id in common_users_4:
              num_1_4_common += 1
            if user_id in experts_p_4:
              num_1_4_experts_p += 1
            if user_id in experts_f_4:
              num_1_4_experts_f += 1
            if user_id in experts_ci_4:
              num_1_4_experts_ci += 1
            if user_id in experts_all_4:
              num_1_4_experts_all += 1
            if user_id in common_users_4_1:
              num_cu_4_1 += 1
            if user_id in common_users_4_2:
              num_cu_4_2 += 1
            if user_id in common_users_4_3:
              num_cu_4_3 += 1
          elif time_delta >= timedelta(hours=4) and time_delta < timedelta(hours=8):
            num_4_8 += 1
            if user_id in common_users_8:
              num_4_8_common += 1
            if user_id in experts_p_8:
              num_4_8_experts_p += 1
            if user_id in experts_f_8:
              num_4_8_experts_f += 1
            if user_id in experts_ci_8:
              num_4_8_experts_ci += 1
            if user_id in experts_all_8:
              num_4_8_experts_all += 1
            if user_id in common_users_8_1:
              num_cu_8_1 += 1
            if user_id in common_users_8_2:
              num_cu_8_2 += 1
            if user_id in common_users_8_3:
              num_cu_8_3 += 1
          else:
            num_after_8 += 1

  return (num_0_1, num_0_1_common, num_0_1_experts_p, num_0_1_experts_f, num_0_1_experts_ci, num_0_1_experts_all,
          num_1_4, num_1_4_common, num_1_4_experts_p, num_1_4_experts_f, num_1_4_experts_ci, num_1_4_experts_all,
          num_4_8, num_4_8_common, num_4_8_experts_p, num_4_8_experts_f, num_4_8_experts_ci, num_4_8_experts_all,
          num_cu_1_1, num_cu_1_2, num_cu_1_3,
          num_cu_4_1, num_cu_4_2, num_cu_4_3,
          num_cu_8_1, num_cu_8_2, num_cu_8_3,
          num_after_8, num_total)
Example #5
0
def get_all_user_groups(delta=4, category=None):
  """Assemble every user group used by the downstream analyses.

  Returns (groups, d_num_followers): a populated UserGroups instance plus
  the follower-count dict produced by the social-bias expert selection.
  """
  seeds = Util.load_seeds()

  # Pick the data set and month range, honoring the switched configuration.
  if _SWITCHED:
    data_set = DataSet.TESTING
    months = _TESTING_SET_MONTHS
  else:
    data_set = DataSet.TRAINING
    months = _TRAINING_SET_MONTHS
  retweets = ground_truths.find_retweets(months) if _EXCLUDE_RETWEETS else set()

  gt_rankings = ground_truths.get_gt_rankings(seeds, data_set, category,
                                              exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
                                              retweets=retweets)
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)

  groups = UserGroups()

  (num_users, groups.newsaholics,
   groups.active_users,
   groups.common_users) = basic_groups.group_users(delta, category)
  groups.population = groups.newsaholics.union(groups.active_users,
                                               groups.common_users)

  num_users_eg, groups.even_groups = even_groups.group_users(
      delta, _NUM_GROUPS, _SIZE_OF_GROUP_IN_PERCENT, category)

  groups.precision = experts.select_experts_precision(
      groups.newsaholics.union(groups.active_users), num_users, delta,
      _SIZE_EXPERTS, category)
  groups.fscore = experts.select_experts_fscore(len(target_news), num_users,
                                                delta, _SIZE_EXPERTS, category)
  groups.ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                        category)
  groups.super_experts = experts.select_super_experts(groups.precision,
                                                      groups.fscore,
                                                      groups.ci)

  groups.ci_hi, groups.ci_li = experts.split_ci_experts_by_followers(groups.ci)

  # Distribute the ci experts round-robin into three disjoint subsets.
  groups.ci_1 = set()
  groups.ci_2 = set()
  groups.ci_3 = set()
  ci_thirds = (groups.ci_1, groups.ci_2, groups.ci_3)
  for position, ci_expert in enumerate(groups.ci):
    ci_thirds[position % 3].add(ci_expert)

  groups.social_bias, d_num_followers = experts.select_experts_social_bias(
      num_users, _SIZE_EXPERTS)
  groups.all_experts = experts.select_all_experts(groups.precision,
                                                  groups.fscore,
                                                  groups.ci)
  groups.non_experts = groups.population.difference(groups.all_experts)

  # NOTE(review): the *_25 / *_10 / *_1 suffixes do not match the fractions
  # actually used (0.05, 0.10, 0.02); preserved as-is — confirm with callers.
  num_non_experts = len(groups.non_experts)
  sample_size = int(num_non_experts * _NON_EXPERTS_SAMPLE_SIZE)
  sample_size_25 = int(num_non_experts * 0.05)
  sample_size_10 = int(num_non_experts * 0.10)
  sample_size_1 = int(num_non_experts * 0.02)
  groups.non_experts_sampled = set(random.sample(groups.non_experts, sample_size))
  groups.non_experts_25 = set(random.sample(groups.non_experts, sample_size_25))
  groups.non_experts_10 = set(random.sample(groups.non_experts, sample_size_10))
  groups.non_experts_1 = set(random.sample(groups.non_experts, sample_size_1))

  return groups, d_num_followers
Example #6
0
def run():
    """Count target-news tweets per hour bucket by user group and write
    cumulative percentage thresholds to a per-delta TSV.

    Each output row holds the cumulative share (percent of all counted
    tweets) contributed up to that hour by the population, non-experts,
    all experts, common users, and each expert flavor.
    """
    Util.ensure_dir_exist(_DATA_DIR)
    category = None
    seeds = Util.load_seeds()  # read twitter data

    gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                                category)
    log('Num ground_truth_rankings: %s' % len(gt_rankings))
    target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
    log('Size target_news: %s' % len(target_news))

    for delta in _DELTAS:
        (num_users, newsaholics, active_users,
         common_users) = basic_groups.group_users(delta, category)
        population = newsaholics.union(active_users).union(common_users)
        log('Num newsaholics: %s' % len(newsaholics))
        log('Num active: %s' % len(active_users))
        log('Num common: %s' % len(common_users))
        log('Num users (population): %s' % len(population))

        # -- Get experts --
        ExpertGroup.precision = experts.select_experts_precision(
            newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
            category)
        ExpertGroup.fscore = experts.select_experts_fscore(
            len(target_news), num_users, delta, _SIZE_EXPERTS, category)
        ExpertGroup.ci = experts.select_experts_ci(num_users, delta,
                                                   _SIZE_EXPERTS, category)
        ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                                       ExpertGroup.fscore,
                                                       ExpertGroup.ci)

        log('Num experts (precision): %s' % len(ExpertGroup.precision))
        log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
        log('Num experts (ci): %s' % len(ExpertGroup.ci))
        log('Num all experts: %s' % len(ExpertGroup.union))

        non_experts = population.difference(ExpertGroup.union)
        log('Num non_experts: %s' % len(non_experts))

        # -- counting --
        total_num_tweets = 0
        hour_to_num_tweets = {}
        with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
            for line in in_file:
                tokens = line.split('\t')
                time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
                url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
                user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

                if time_delta_in_sec > 0 and url in target_news:
                    current_hour = time_delta_in_sec / _NUM_SEC_PER_HOUR
                    total_num_tweets += 1

                    if current_hour not in hour_to_num_tweets:
                        hour_to_num_tweets[current_hour] = GroupCount()
                    gcount = hour_to_num_tweets[current_hour]

                    gcount.population += 1
                    if user_id in ExpertGroup.union:
                        gcount.union += 1
                        if user_id in ExpertGroup.precision:
                            gcount.precision += 1
                        if user_id in ExpertGroup.fscore:
                            gcount.fscore += 1
                        if user_id in ExpertGroup.ci:
                            gcount.ci += 1
                    else:
                        gcount.non_experts += 1
                        if user_id in common_users:
                            gcount.common += 1

        # Accumulate per-hour counts into a running total. Fix: hours must
        # be visited in ascending order for cumulative percentages to be
        # meaningful — dict key order is arbitrary, so sort the keys.
        gcount = GroupCount()
        with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta,
                  'w') as out_file:
            for hour in sorted(hour_to_num_tweets.keys()):
                gc = hour_to_num_tweets[hour]
                gcount.add(gc)
                percentage = (gcount.population /
                              float(total_num_tweets)) * 100.0
                percentage_common = (gcount.common /
                                     float(total_num_tweets)) * 100.0
                percentage_experts = (gcount.union /
                                      float(total_num_tweets)) * 100.0
                percentage_non_experts = (gcount.non_experts /
                                          float(total_num_tweets)) * 100.0

                out_file.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (hour, percentage, percentage_non_experts,
                     percentage_experts, percentage_common,
                     (gcount.precision / float(total_num_tweets)) * 100.0,
                     (gcount.fscore / float(total_num_tweets)) * 100.0,
                     (gcount.ci / float(total_num_tweets)) * 100.0))
        log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci'
            )
Example #7
0
def run():
  """Count target-news tweets per hour bucket by user group and write
  cumulative percentage thresholds to a per-delta TSV.

  Each output row holds the cumulative share (percent of all counted
  tweets) contributed up to that hour by the population, non-experts,
  all experts, common users, and each expert flavor.
  """
  Util.ensure_dir_exist(_DATA_DIR)
  category = None
  seeds = Util.load_seeds() #read twitter data

  gt_rankings = ground_truths.get_gt_rankings(seeds, DataSet.TESTING,
                                              category)
  log('Num ground_truth_rankings: %s' % len(gt_rankings))
  target_news = ground_truths.find_target_news(gt_rankings, _SIZE_TOP_NEWS)
  log('Size target_news: %s' % len(target_news))

  for delta in _DELTAS:
    (num_users, newsaholics,
     active_users, common_users) = basic_groups.group_users(delta, category)
    population = newsaholics.union(active_users).union(common_users)
    log('Num newsaholics: %s' % len(newsaholics))
    log('Num active: %s' % len(active_users))
    log('Num common: %s' % len(common_users))
    log('Num users (population): %s' % len(population))

    # -- Get experts --
    ExpertGroup.precision = experts.select_experts_precision(
        newsaholics.union(active_users), num_users, delta, _SIZE_EXPERTS,
        category)
    ExpertGroup.fscore = experts.select_experts_fscore(len(target_news),
                                                   num_users,
                                                   delta, _SIZE_EXPERTS,
                                                   category)
    ExpertGroup.ci = experts.select_experts_ci(num_users, delta, _SIZE_EXPERTS,
                                           category)
    ExpertGroup.union = experts.select_all_experts(ExpertGroup.precision,
                                             ExpertGroup.fscore,
                                             ExpertGroup.ci)

    log('Num experts (precision): %s' % len(ExpertGroup.precision))
    log('Num experts (fscore): %s' % len(ExpertGroup.fscore))
    log('Num experts (ci): %s' % len(ExpertGroup.ci))
    log('Num all experts: %s' % len(ExpertGroup.union))

    non_experts = population.difference(ExpertGroup.union)
    log('Num non_experts: %s' % len(non_experts))

    # -- counting --
    total_num_tweets = 0
    hour_to_num_tweets = {}
    with open('../data/FolkWisdom/time_deltas.tsv') as in_file:
      for line in in_file:
        tokens = line.split('\t')
        time_delta_in_sec = int(tokens[_TIMEDELTAS_FILE_DELTA_INDEX])
        url = tokens[_TIMEDELTAS_FILE_URL_INDEX].strip()
        user_id = tokens[_TIMEDELTAS_FILE_USER_ID_INDEX]

        if time_delta_in_sec > 0 and url in target_news:
          current_hour = time_delta_in_sec / _NUM_SEC_PER_HOUR
          total_num_tweets += 1

          if current_hour not in hour_to_num_tweets:
            hour_to_num_tweets[current_hour] = GroupCount()
          gcount = hour_to_num_tweets[current_hour]

          gcount.population += 1
          if user_id in ExpertGroup.union:
            gcount.union += 1
            if user_id in ExpertGroup.precision:
              gcount.precision += 1
            if user_id in ExpertGroup.fscore:
              gcount.fscore += 1
            if user_id in ExpertGroup.ci:
              gcount.ci += 1
          else:
            gcount.non_experts += 1
            if user_id in common_users:
              gcount.common += 1

    # Accumulate per-hour counts into a running total. Fix: hours must be
    # visited in ascending order for cumulative percentages to be
    # meaningful — dict key order is arbitrary, so sort the keys.
    gcount = GroupCount()
    with open(_DATA_DIR + 'hour_thresholds_%s.tsv' % delta, 'w') as out_file:
      for hour in sorted(hour_to_num_tweets.keys()):
        gc = hour_to_num_tweets[hour]
        gcount.add(gc)
        percentage = (gcount.population / float(total_num_tweets)) * 100.0
        percentage_common = (gcount.common / float(total_num_tweets)) * 100.0
        percentage_experts = (gcount.union / float(total_num_tweets)) * 100.0
        percentage_non_experts = (gcount.non_experts / float(total_num_tweets)) * 100.0

        out_file.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (hour, percentage,
                                                             percentage_non_experts,
                                                             percentage_experts,
                                                             percentage_common,
                                                             (gcount.precision / float(total_num_tweets)) * 100.0,
                                                             (gcount.fscore / float(total_num_tweets)) * 100.0,
                                                             (gcount.ci / float(total_num_tweets)) * 100.0))
    log('hour\tpopulation\tnon_experts\texperts\tcommon\tprecision\tfscore\tci')