def get_gt_rankings(seeds, dataset, category=None, delta=4,
                    exclude_tweets_within_delta=False, retweets=None):
  """Generate the ground truth rankings.

  Counts, per url, the tweets recorded in time_deltas.tsv that pass the
  seed-window, delta, retweet, and category filters, then ranks urls by
  that count (descending).

  Keyword Arguments:
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  dataset -- DataSet.TRAINING or DataSet.TESTING to restrict to that
             window; any other value means no window restriction.
  category -- The category to get gt's for, None for all news.
  delta -- Threshold in hours, used with exclude_tweets_within_delta.
  exclude_tweets_within_delta -- If True, drop tweets whose delta exceeds
                                 the threshold.
  retweets -- Set of tweet ids to ignore (default: empty set).

  Returns:
  gt_rankings -- A list of (url, count) pairs in ranked order.
  """
  # Avoid a mutable default argument; None stands in for the empty set.
  if retweets is None:
    retweets = set()
  gt_tweet_counts = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      # The stored delta is in seconds; convert to hours for comparison.
      tweet_delta = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
      # NOTE(review): despite the flag name, this excludes tweets whose
      # delta is GREATER than the threshold -- confirm intent.
      include_tweet = not (exclude_tweets_within_delta
                           and tweet_delta > delta)
      if url in seeds and include_tweet and tweet_id not in retweets:
        _, _, seed_time = seeds[url]
        if dataset == DataSet.TRAINING:
          is_in_window = Util.is_in_training_set(seed_time)
        elif dataset == DataSet.TESTING:
          is_in_window = Util.is_in_testing_set(seed_time)
        else:
          is_in_window = True
        if is_in_window:
          # A falsy category (None or '') means "all news".
          category_matches = (not category
                              or URLUtil.extract_category(url) == category)
          if category_matches:
            gt_tweet_counts[url] = gt_tweet_counts.get(url, 0) + 1
  gt_rankings = sorted(gt_tweet_counts.items(), key=lambda x: x[1],
                       reverse=True)
  return gt_rankings
def find_hits_and_mises(months, target_news, seeds, cache, delta,
                        category=None):
  """Finds the hit and miss count for each user.

  A "hit" is a tweet, posted within delta hours of the url's seed time,
  of a url in target_news; a "miss" is such a tweet of any other url.
  Results are written to _OUT_DIR as a tsv of (user_id, hits, misses).

  Keyword Arguments:
  months -- The months over which to calculate hit and misses.
  target_news -- A set of urls that is the set of known target news.
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  cache -- A dictionary of short url to long url.
  delta -- Window size in hours within which a tweet is counted.
  category -- The category to find hits and misses for, None for all news.
  """
  hits_and_misses = {}
  # Hoist the loop-invariant window out of the per-line loop.
  max_delta = timedelta(hours=delta)
  for month in months:
    log('Finding hits and misses for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              # Assumes every parsed url has a seed entry -- a missing
              # url raises KeyError here; TODO confirm seeds coverage.
              _, _, seed_time = seeds[url]
              created = datetime.strptime(
                  tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
              if created - seed_time < max_delta:
                # A falsy category (None or '') means "all news".
                if category and URLUtil.extract_category(url) != category:
                  continue
                user_hits, user_misses = hits_and_misses.get(user_id,
                                                             (0, 0))
                if url in target_news:
                  hits_and_misses[user_id] = (user_hits + 1, user_misses)
                else:
                  hits_and_misses[user_id] = (user_hits, user_misses + 1)
  output_file = (_OUT_DIR + 'user_hits_and_misses_%s_%s.tsv'
                 % (delta, category))
  with open(output_file, 'w') as out_file:
    for user_id, (hits, misses) in hits_and_misses.items():
      out_file.write('%s\t%s\t%s\n' % (user_id, hits, misses))
  log('Wrote hits and misses to disk.')
def find_delta_times(months, seeds, cache):
  """Finds the delta times for every url.

  Looks at every url, and calculates the time delta from previously
  calculated seed times.  Writes one row per tweet, sorted by ascending
  delta, to ../data/FolkWisdom/time_deltas.tsv.

  Keyword Arguments:
  months -- The months over which to look at urls.
  seeds -- Dictionary of url to (seed_tweet_id, _, seed_time).
  cache -- Dictionary mapping short-url to long-url.
  """
  time_deltas = {}
  for month in months:
    log('Finding delta times from %s' % month)
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            source = tokens[_TWEETFILE_SOURCE_INDEX]
            for url in urls:
              seed_tweet_id, _, seed_time = seeds[url]
              category = URLUtil.extract_category(url)
              if tweet_id == seed_tweet_id:
                # The seed tweet itself has, by definition, delta 0.
                time_deltas[tweet_id] = (user_id, 0, url, category, source)
              else:
                created = datetime.strptime(
                    tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
                time_delta = created - seed_time
                # Convert time delta to seconds to make it easy to read
                # from file later (microseconds are dropped).
                time_delta_in_seconds = (time_delta.days * 86400
                                         + time_delta.seconds)
                time_deltas[tweet_id] = (user_id, time_delta_in_seconds,
                                         url, category, source)
  # Ascending by delta (index 1 of the value tuple).
  sorted_deltas = sorted(time_deltas.items(), key=lambda x: x[1][1])
  for (tweet_id, tp) in sorted_deltas:
    # Sanity check: every entry should be a 5-tuple.  print() is valid
    # single-argument syntax in both Python 2 and 3 (was `print tp`).
    if len(tp) < 5:
      print(tp)
  with open('../data/FolkWisdom/time_deltas.tsv', 'w') as output_file:
    for (tweet_id, (user_id, time_delta, url, category,
                    source)) in sorted_deltas:
      output_file.write('%s\t%s\t%s\t%s\t%s\t%s\n'
                        % (tweet_id, user_id, time_delta, url, category,
                           source))
  log('Wrote time deltas to disk')
def sort_users_by_tweet_count(months, seeds, cache, delta, category=None):
  """Sorts users by their tweet activity.

  Counts, per user, tweets posted within delta hours of each url's seed
  time (optionally restricted to one category), and writes the users in
  descending count order to _OUT_DIR.

  Keyword Arguments:
  months -- The months for which to sort the users on.
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  cache -- Dictionary of short url to long url.
  delta -- Window size in hours within which a tweet is counted.
  category -- The category to go by, None for all news.
  """
  user_id_to_tweet_count = {}
  # Hoist the loop-invariant window out of the per-line loop.
  max_delta = timedelta(hours=delta)
  for month in months:
    log('Gathering count information for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              _, _, seed_time = seeds[url]
              created = datetime.strptime(
                  tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
              if created - seed_time < max_delta:
                # Single increment path; a falsy category means "all
                # news".  (Replaces deprecated dict.has_key and the
                # duplicated if/else increment blocks.)
                if (not category
                    or URLUtil.extract_category(url) == category):
                  user_id_to_tweet_count[user_id] = (
                      user_id_to_tweet_count.get(user_id, 0) + 1)
  user_ids_sorted_by_tweet_count = sorted(user_id_to_tweet_count.items(),
                                          key=lambda x: x[1], reverse=True)
  # Bug fix: the original passed (count, category) to this format string,
  # printing them in swapped positions.
  log("Size of users for category %s (total): %s"
      % (category, len(user_id_to_tweet_count)))
  output_file = _OUT_DIR + 'user_activity_%s_%s.tsv' % (delta, category)
  with open(output_file, 'w') as out_file:
    for user_id, count in user_ids_sorted_by_tweet_count:
      out_file.write('%s\t%s\n' % (user_id, count))
  log('Wrote users (sorted by activity) to disk')
# NOTE(review): this is a byte-identical duplicate of the get_gt_rankings
# defined earlier in this module; this later definition silently shadows
# the earlier one.  One of the two copies should be deleted.
def get_gt_rankings(seeds, dataset, category=None, delta=4, exclude_tweets_within_delta=False, retweets=set()):
  """Generate the ground truth rankings.

  Keyword Arguments:
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  dataset -- DataSet.TRAINING or DataSet.TESTING to restrict to that
             window; any other value means no window restriction.
  category -- The category to get gt's for, None for all news.
  delta -- Threshold in hours, used with exclude_tweets_within_delta.
  exclude_tweets_within_delta -- If True, drop tweets beyond the threshold.
  retweets -- Set of tweet ids to ignore.  NOTE(review): mutable default
              argument; harmless here because it is only read, but fragile.

  Returns:
  gt_rankings -- A list of (url, count) pairs in ranked order.
  """
  gt_tweet_counts = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      source = tokens[_TIMEDELTAS_FILE_SOURCE_INDEX].strip()  # unused
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      # The stored delta is in seconds; convert to hours for comparison.
      tweet_delta = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
      include_tweet = True
      if exclude_tweets_within_delta:
        # NOTE(review): despite the flag name, this excludes tweets whose
        # delta is GREATER than the threshold -- confirm intent.
        if tweet_delta > delta:
          include_tweet = False
      if url in seeds and include_tweet and not tweet_id in retweets:
        _, _, seed_time = seeds[url]
        is_in_window = False
        if dataset == DataSet.TRAINING:
          is_in_window = Util.is_in_training_set(seed_time)
        elif dataset == DataSet.TESTING:
          is_in_window = Util.is_in_testing_set(seed_time)
        else:
          # Any other dataset value means no window restriction.
          is_in_window = True
        if is_in_window:
          category_matches = True
          if category:
            category_matches = False
            url_category = URLUtil.extract_category(url)
            if url_category == category:
              category_matches = True
          if category_matches:
            if url in gt_tweet_counts:
              gt_tweet_counts[url] += 1
            else:
              gt_tweet_counts[url] = 1
  # Rank urls by tweet count, highest first.
  gt_rankings = sorted(gt_tweet_counts.items(), key=lambda x: x[1], reverse=True)
  return gt_rankings