def _read_stories(input_file_path):
    """Return a list of stories with full contents and a dict with new IDs.

    Generate list elements in tuple form, where each element corresponds
    to a single line of the given processed stories log file.  Do not trim
    the newline off the end of the last element of the tuple.  Append a
    space followed by the full story contents to the title of each story.
    Omit stories for which the full story contents could not be fetched.

    Generate dict entries mapping from story IDs in the input file to
    story IDs in the output file.  Maintain the ordering of the input file
    in the list.  This ordering is equivalent to ascending order of both
    old story IDs and new story IDs.

    Be warned that fetching full story contents is quite slow and consumes
    a great deal of bandwidth, so you or other users on your network may
    experience connectivity problems while executing this function.

    input_file_path, a str, is the file path to the processed Pulse
    stories log file that contains story URLs and titles but not the full
    contents of the stories themselves.
    """
    start_time = time.time()
    old_story_id = 0
    new_story_id = 0
    stories_list = []
    story_id_dict = {}
    # Cache of url -> contents (or None on failure) so duplicate URLs are
    # fetched only once.
    story_contents_dict = {}
    socket.setdefaulttimeout(TIMEOUT_LENGTH)
    input_stream = open_safely(input_file_path)
    try:
        for story_as_str in input_stream:
            story_as_list = story_as_str.split(DELIMITER)
            story_url = story_as_list[NEW_STORIES_URL_INDEX]
            if story_url in story_contents_dict:
                story_contents = story_contents_dict[story_url]
            else:
                story_contents = html2text.extractFromURL(story_url)
                # Treat too-short extractions as fetch failures.  Use <
                # (not <=) so the acceptance threshold agrees with
                # _insert_full_story, which keeps stories whose contents
                # satisfy len(contents) >= MIN_STORY_LENGTH.
                if (story_contents is not None) and \
                        (len(story_contents) < MIN_STORY_LENGTH):
                    story_contents = None
                story_contents_dict[story_url] = story_contents
            if story_contents is not None:
                story_as_list[NEW_STORIES_TITLE_INDEX] += " " + story_contents
                stories_list.append(tuple(story_as_list))
                story_id_dict[old_story_id] = new_story_id
                new_story_id += 1
            old_story_id += 1
    finally:
        # Close the log file even if a fetch raises mid-loop.
        input_stream.close()
    num_stories_discarded = old_story_id - new_story_id
    # Guard against an empty input file (ZeroDivisionError otherwise).
    if old_story_id > 0:
        discard_rate = float(100 * num_stories_discarded) / float(old_story_id)
    else:
        discard_rate = 0.0
    print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
           "because their full contents could not be fetched.") % \
          (old_story_id, STORIES_DESCRIPTOR, num_stories_discarded,
           discard_rate))
    report_time_elapsed(start_time)
    return (stories_list, story_id_dict)
def _insert_full_story(story, time_first_read, stories_dict,
                       story_contents_dict):
    """Record *story* in stories_dict, fetching its full contents if needed.

    stories_dict maps a (feed URL, feed title, story URL, story title) key
    to a two-element list [earliest time first read, full contents].  If
    the key is already present, only its timestamp is lowered to the
    earlier of the two readings.  Otherwise the story contents are taken
    from the story_contents_dict cache when available, or fetched from the
    story URL; contents that cannot be fetched or that are shorter than
    MIN_STORY_LENGTH are cached as None and the story is not inserted.
    The key of each freshly-fetched story is echoed to stderr as progress
    output.
    """
    url = story[OLD_STORIES_URL_INDEX]
    key = (story[OLD_STORIES_FEED_URL_INDEX],
           story[OLD_STORIES_FEED_TITLE_INDEX],
           url,
           story[OLD_STORIES_TITLE_INDEX])

    entry = stories_dict.get(key)
    if entry is not None:
        # Seen this exact story before: keep the earliest reading time.
        entry[0] = min(entry[0], time_first_read)
        return

    if url in story_contents_dict:
        # Cache hit; a cached None means a previous fetch failed.
        cached = story_contents_dict[url]
        if cached is not None:
            stories_dict[key] = [time_first_read, cached]
        return

    # Cache miss: announce progress on stderr, then fetch.
    pprint.pprint(key, sys.stderr)
    contents = html2text.extractFromURL(url)
    if (contents is not None) and (len(contents) >= MIN_STORY_LENGTH):
        stories_dict[key] = [time_first_read, contents]
        story_contents_dict[url] = contents
    else:
        # Remember the failure so the URL is not fetched again.
        story_contents_dict[url] = None