def get_gt_rankings(seeds, dataset, category=None, delta=4,
                    exclude_tweets_within_delta=False, retweets=None):
  """Generate the ground truth rankings.

  Counts, per url, the tweets recorded in time_deltas.tsv that pass the
  seed-window, delta, retweet, and category filters, then ranks urls by
  that count (descending).

  Keyword Arguments:
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  dataset -- DataSet.TRAINING or DataSet.TESTING to restrict to that
             window; any other value means no window restriction.
  category -- The category to get gt's for, None for all news.
  delta -- Threshold in hours, used with exclude_tweets_within_delta.
  exclude_tweets_within_delta -- If True, drop tweets whose delta exceeds
                                 the threshold.
  retweets -- Set of tweet ids to ignore (default: empty set).

  Returns:
  gt_rankings -- A list of (url, count) pairs in ranked order.
  """
  # Avoid a mutable default argument; None stands in for the empty set.
  if retweets is None:
    retweets = set()
  gt_tweet_counts = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      # The stored delta is in seconds; convert to hours for comparison.
      tweet_delta = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
      # NOTE(review): despite the flag name, this excludes tweets whose
      # delta is GREATER than the threshold -- confirm intent.
      include_tweet = not (exclude_tweets_within_delta
                           and tweet_delta > delta)
      if url in seeds and include_tweet and tweet_id not in retweets:
        _, _, seed_time = seeds[url]
        if dataset == DataSet.TRAINING:
          is_in_window = Util.is_in_training_set(seed_time)
        elif dataset == DataSet.TESTING:
          is_in_window = Util.is_in_testing_set(seed_time)
        else:
          is_in_window = True
        if is_in_window:
          # A falsy category (None or '') means "all news".
          category_matches = (not category
                              or URLUtil.extract_category(url) == category)
          if category_matches:
            gt_tweet_counts[url] = gt_tweet_counts.get(url, 0) + 1
  gt_rankings = sorted(gt_tweet_counts.items(), key=lambda x: x[1],
                       reverse=True)
  return gt_rankings
def find_hits_and_mises(months, target_news, seeds, cache, delta,
                        category=None):
  """Finds the hit and miss count for each user.

  A "hit" is a tweet, posted within delta hours of the url's seed time,
  of a url in target_news; a "miss" is such a tweet of any other url.
  Results are written to _OUT_DIR as a tsv of (user_id, hits, misses).

  Keyword Arguments:
  months -- The months over which to calculate hit and misses.
  target_news -- A set of urls that is the set of known target news.
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  cache -- A dictionary of short url to long url.
  delta -- Window size in hours within which a tweet is counted.
  category -- The category to find hits and misses for, None for all news.
  """
  hits_and_misses = {}
  # Hoist the loop-invariant window out of the per-line loop.
  max_delta = timedelta(hours=delta)
  for month in months:
    log('Finding hits and misses for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              # Assumes every parsed url has a seed entry -- a missing
              # url raises KeyError here; TODO confirm seeds coverage.
              _, _, seed_time = seeds[url]
              created = datetime.strptime(
                  tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
              if created - seed_time < max_delta:
                # A falsy category (None or '') means "all news".
                if category and URLUtil.extract_category(url) != category:
                  continue
                user_hits, user_misses = hits_and_misses.get(user_id,
                                                             (0, 0))
                if url in target_news:
                  hits_and_misses[user_id] = (user_hits + 1, user_misses)
                else:
                  hits_and_misses[user_id] = (user_hits, user_misses + 1)
  output_file = (_OUT_DIR + 'user_hits_and_misses_%s_%s.tsv'
                 % (delta, category))
  with open(output_file, 'w') as out_file:
    for user_id, (hits, misses) in hits_and_misses.items():
      out_file.write('%s\t%s\t%s\n' % (user_id, hits, misses))
  log('Wrote hits and misses to disk.')
def find_delta_times(months, seeds, cache):
  """Finds the delta times for every url.

  Looks at every url, and calculates the time delta from previously
  calculated seed times.  Writes one row per tweet, sorted by ascending
  delta, to ../data/FolkWisdom/time_deltas.tsv.

  Keyword Arguments:
  months -- The months over which to look at urls.
  seeds -- Dictionary of url to (seed_tweet_id, _, seed_time).
  cache -- Dictionary mapping short-url to long-url.
  """
  time_deltas = {}
  for month in months:
    log('Finding delta times from %s' % month)
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_id = tokens[_TWEETFILE_TWEET_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            source = tokens[_TWEETFILE_SOURCE_INDEX]
            for url in urls:
              seed_tweet_id, _, seed_time = seeds[url]
              category = URLUtil.extract_category(url)
              if tweet_id == seed_tweet_id:
                # The seed tweet itself has, by definition, delta 0.
                time_deltas[tweet_id] = (user_id, 0, url, category, source)
              else:
                created = datetime.strptime(
                    tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
                time_delta = created - seed_time
                # Convert time delta to seconds to make it easy to read
                # from file later (microseconds are dropped).
                time_delta_in_seconds = (time_delta.days * 86400
                                         + time_delta.seconds)
                time_deltas[tweet_id] = (user_id, time_delta_in_seconds,
                                         url, category, source)
  # Ascending by delta (index 1 of the value tuple).
  sorted_deltas = sorted(time_deltas.items(), key=lambda x: x[1][1])
  for (tweet_id, tp) in sorted_deltas:
    # Sanity check: every entry should be a 5-tuple.  print() is valid
    # single-argument syntax in both Python 2 and 3 (was `print tp`).
    if len(tp) < 5:
      print(tp)
  with open('../data/FolkWisdom/time_deltas.tsv', 'w') as output_file:
    for (tweet_id, (user_id, time_delta, url, category,
                    source)) in sorted_deltas:
      output_file.write('%s\t%s\t%s\t%s\t%s\t%s\n'
                        % (tweet_id, user_id, time_delta, url, category,
                           source))
  log('Wrote time deltas to disk')
def sort_users_by_tweet_count(months, seeds, cache, delta, category=None):
  """Sorts users by their tweet activity.

  Counts, per user, tweets posted within delta hours of each url's seed
  time (optionally restricted to one category), and writes the users in
  descending count order to _OUT_DIR.

  Keyword Arguments:
  months -- The months for which to sort the users on.
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  cache -- Dictionary of short url to long url.
  delta -- Window size in hours within which a tweet is counted.
  category -- The category to go by, None for all news.
  """
  user_id_to_tweet_count = {}
  # Hoist the loop-invariant window out of the per-line loop.
  max_delta = timedelta(hours=delta)
  for month in months:
    log('Gathering count information for users from %s for delta %s '
        'and category %s' % (month, delta, category))
    dir_name = Util.get_data_dir_name_for(month)
    for filename in os.listdir(dir_name):
      if '.tweet' in filename and 'http_nyti_ms' in filename:
        data_file = '%s/%s' % (dir_name, filename)
        with open(data_file) as input_file:
          for line in input_file:
            tokens = line.split('\t')
            user_id = tokens[_TWEETFILE_USER_ID_INDEX]
            tweet_text = tokens[_TWEETFILE_TWEET_TEXT_INDEX]
            urls = URLUtil.parse_urls(tweet_text, cache)
            for url in urls:
              _, _, seed_time = seeds[url]
              created = datetime.strptime(
                  tokens[_TWEETFILE_CREATED_AT_INDEX], _DATETIME_FORMAT)
              if created - seed_time < max_delta:
                # Single increment path; a falsy category means "all
                # news".  (Replaces deprecated dict.has_key and the
                # duplicated if/else increment blocks.)
                if (not category
                    or URLUtil.extract_category(url) == category):
                  user_id_to_tweet_count[user_id] = (
                      user_id_to_tweet_count.get(user_id, 0) + 1)
  user_ids_sorted_by_tweet_count = sorted(user_id_to_tweet_count.items(),
                                          key=lambda x: x[1], reverse=True)
  # Bug fix: the original passed (count, category) to this format string,
  # printing them in swapped positions.
  log("Size of users for category %s (total): %s"
      % (category, len(user_id_to_tweet_count)))
  output_file = _OUT_DIR + 'user_activity_%s_%s.tsv' % (delta, category)
  with open(output_file, 'w') as out_file:
    for user_id, count in user_ids_sorted_by_tweet_count:
      out_file.write('%s\t%s\n' % (user_id, count))
  log('Wrote users (sorted by activity) to disk')
# NOTE(review): this is a byte-identical duplicate of the get_gt_rankings
# defined earlier in this module; this later definition silently shadows
# the earlier one.  One of the two copies should be deleted.
def get_gt_rankings(seeds, dataset, category=None, delta=4, exclude_tweets_within_delta=False, retweets=set()):
  """Generate the ground truth rankings.

  Keyword Arguments:
  seeds -- Dictionary of url to a 3-tuple whose last element is the seed
           time (first two elements are unused here).
  dataset -- DataSet.TRAINING or DataSet.TESTING to restrict to that
             window; any other value means no window restriction.
  category -- The category to get gt's for, None for all news.
  delta -- Threshold in hours, used with exclude_tweets_within_delta.
  exclude_tweets_within_delta -- If True, drop tweets beyond the threshold.
  retweets -- Set of tweet ids to ignore.  NOTE(review): mutable default
              argument; harmless here because it is only read, but fragile.

  Returns:
  gt_rankings -- A list of (url, count) pairs in ranked order.
  """
  gt_tweet_counts = {}
  with open('../data/FolkWisdom/time_deltas.tsv') as input_file:
    for line in input_file:
      tokens = line.split('\t')
      source = tokens[_TIMEDELTAS_FILE_SOURCE_INDEX].strip()  # unused
      url = tokens[_TIMEDELTAS_FILE_URL_INDEX]
      tweet_id = tokens[_TIMEDELTAS_FILE_TWEET_ID_INDEX]
      # The stored delta is in seconds; convert to hours for comparison.
      tweet_delta = float(tokens[_TIMEDELTAS_FILE_DELTA_INDEX]) / 3600
      include_tweet = True
      if exclude_tweets_within_delta:
        # NOTE(review): despite the flag name, this excludes tweets whose
        # delta is GREATER than the threshold -- confirm intent.
        if tweet_delta > delta:
          include_tweet = False
      if url in seeds and include_tweet and not tweet_id in retweets:
        _, _, seed_time = seeds[url]
        is_in_window = False
        if dataset == DataSet.TRAINING:
          is_in_window = Util.is_in_training_set(seed_time)
        elif dataset == DataSet.TESTING:
          is_in_window = Util.is_in_testing_set(seed_time)
        else:
          # Any other dataset value means no window restriction.
          is_in_window = True
        if is_in_window:
          category_matches = True
          if category:
            category_matches = False
            url_category = URLUtil.extract_category(url)
            if url_category == category:
              category_matches = True
          if category_matches:
            if url in gt_tweet_counts:
              gt_tweet_counts[url] += 1
            else:
              gt_tweet_counts[url] = 1
  # Rank urls by tweet count, highest first.
  gt_rankings = sorted(gt_tweet_counts.items(), key=lambda x: x[1], reverse=True)
  return gt_rankings