def _write_stories(stories_dict):
    """Write all stories to PROCESSED_STORIES_FILE_PATH in sorted key order.

    Write stories in newline-delimited raw text format. Within each story,
    delimit fields by DELIMITER and append the story's timestamp as the last
    field.

    If FETCH_FULL_STORIES is set, each value of stories_dict is unpacked as
    (timestamp, contents), and the contents are appended to the story title
    (separated by a single space) before writing. Otherwise each value is
    used as the timestamp and the fields of the key are written unchanged.

    As a side effect, replace every value in stories_dict with the zero-based
    row number at which the corresponding story was written.

    The output stream is closed even if writing fails partway through; in
    that case stories_dict may be only partially updated.

    stories_dict, a dict, maps story keys (sequences of str fields) to
    either a timestamp or a (timestamp, contents) pair, as described above.
    """
    start_time = time.time()
    sorted_stories = sorted(stories_dict)
    output_stream = open_safely(PROCESSED_STORIES_FILE_PATH, "w")
    try:
        for row_num, story_key in enumerate(sorted_stories):
            if FETCH_FULL_STORIES:
                story_timestamp, story_contents = stories_dict[story_key]
                story_title_with_contents = (
                    story_key[NEW_STORIES_TITLE_INDEX] + " " + story_contents
                )
                story_sans_timestamp_as_str = DELIMITER.join((
                    story_key[NEW_STORIES_FEED_URL_INDEX],
                    story_key[NEW_STORIES_FEED_TITLE_INDEX],
                    story_key[NEW_STORIES_URL_INDEX],
                    story_title_with_contents,
                ))
            else:
                story_timestamp = stories_dict[story_key]
                story_sans_timestamp_as_str = DELIMITER.join(story_key)
            story_as_str = (
                story_sans_timestamp_as_str + DELIMITER + str(story_timestamp)
            )
            output_stream.write(story_as_str + "\n")
            # Overwrite the stored value with the story's row number in the
            # output file.
            stories_dict[story_key] = row_num
    finally:
        # Release the file handle even when a malformed story raises.
        output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (len(sorted_stories), STORIES_DESCRIPTOR,
           PROCESSED_STORIES_FILE_PATH))
    report_time_elapsed(start_time)
def _write_events(events_list, output_file_path, story_id_dict,
                  user_id_offset):
    """Write the given events to the given output file using new story IDs.

    Maintain the ordering of events_list in the output file.
    Write events in newline-delimited raw text format. Within each event,
    delimit fields by DELIMITER. Write events with fields
    (new_user_id, new_story_id, time_occurred), where new user IDs start
    from 0. Assume the first element in events_list belongs to the user with
    the smallest ID of those in the list.

    The output stream is closed even if an event cannot be written (for
    example, when its story ID is missing from story_id_dict, which raises
    KeyError).

    events_list, a list, contains all the events of a given type (reads or
    clickthroughs) for a range of users. Each element of events_list is in
    the form (old_user_id, old_story_id, time_occurred).
    output_file_path, a str, is the file path to which to output events.
    story_id_dict, a dict, maps from old story IDs to new story IDs.
    user_id_offset, an int, is the value that must be subtracted from an old
    user ID to produce the corresponding new user ID.
    """
    output_stream = open_safely(output_file_path, "w")
    try:
        for old_event in events_list:
            new_user_id = old_event[EVENTS_USER_ID_INDEX] - user_id_offset
            new_story_id = story_id_dict[old_event[EVENTS_STORY_ID_INDEX]]
            # NOTE(review): the timestamp is read with a NEW_EVENTS_* index
            # while the other fields use EVENTS_* indices -- confirm that
            # both index families describe the same tuple layout.
            time_occurred = old_event[NEW_EVENTS_TIMESTAMP_INDEX]
            new_event = (new_user_id, new_story_id, time_occurred)
            output_stream.write(DELIMITER.join(map(str, new_event)) + "\n")
    finally:
        # Release the file handle even when an event lookup fails.
        output_stream.close()
def _write_events(events_list, output_file_path, event_descriptor):
    """Write the given events to the given output file and report the result.

    Maintain the ordering of events_list in the output file.
    Write events in newline-delimited raw text format. Within each event,
    convert every field with str and delimit fields by DELIMITER.
    After writing, print how many events were written and pass the start
    time to report_time_elapsed.

    The output stream is closed even if writing fails partway through.

    events_list, a list, contains the events to write. Each element is an
    iterable of fields.
    output_file_path, a str, is the file path to which to output events.
    event_descriptor, a str, names the kind of event in the printed summary.
    """
    start_time = time.time()
    output_stream = open_safely(output_file_path, "w")
    try:
        for event in events_list:
            output_stream.write(DELIMITER.join(map(str, event)) + "\n")
    finally:
        # Release the file handle even when a write raises.
        output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (len(events_list), event_descriptor, output_file_path))
    report_time_elapsed(start_time)