def _write_stories(stories_dict):
    """Write the cleaned and sorted stories to PROCESSED_STORIES_FILE_PATH,
    one DELIMITER-separated story per line with its timestamp as the final
    field, and replace each story's value in stories_dict with its row
    number in the output file.
    """
    start_time = time.time()
    sorted_stories = sorted(stories_dict.keys())
    row_num = 0
    output_stream = open_safely(PROCESSED_STORIES_FILE_PATH, "w")
    for story_key in sorted_stories:
        if FETCH_FULL_STORIES:
            story_timestamp, story_contents = stories_dict[story_key]
            story_title_with_contents = story_key[NEW_STORIES_TITLE_INDEX] + \
                " " + story_contents
            story_sans_timestamp_as_tuple = \
                (story_key[NEW_STORIES_FEED_URL_INDEX],
                 story_key[NEW_STORIES_FEED_TITLE_INDEX],
                 story_key[NEW_STORIES_URL_INDEX],
                 story_title_with_contents)
            story_sans_timestamp_as_str = \
                DELIMITER.join(story_sans_timestamp_as_tuple)
        else:
            story_timestamp = stories_dict[story_key]
            story_sans_timestamp_as_str = DELIMITER.join(story_key)
        story_timestamp_as_str = DELIMITER + str(story_timestamp)
        story_as_str = story_sans_timestamp_as_str + story_timestamp_as_str
        output_stream.write(story_as_str + "\n")
        stories_dict[story_key] = row_num
        row_num += 1
    output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (row_num, STORIES_DESCRIPTOR, PROCESSED_STORIES_FILE_PATH))
    report_time_elapsed(start_time)


def stem_processed_stories(input_file_path):
    """Stem the title of every story in the given processed stories file.

    Tokenize each story's title with WordPunctTokenizer, drop
    punctuation-only tokens, stem the remaining tokens with the Porter
    stemmer, and write the resulting stories to a new file at
    input_file_path + STEMMED_STORIES_EXTENSION.

    input_file_path, a str, is the path to a processed stories file.
    """
    start_time = time.time()
    if not isinstance(input_file_path, str):
        raise TypeError("Expected input_file_path to be of type str.")

    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stories_list = []
    prog = re.compile(r'\W+')  # matches punctuation-only tokens
    story_stream = open_safely(input_file_path)
    for story_as_str in story_stream:
        # rstrip, not [:-1], so a final line without a newline is not clipped
        story_as_list = story_as_str.rstrip("\n").lower().split(DELIMITER)
        story_title = story_as_list[NEW_STORIES_TITLE_INDEX]
        tok_contents = tokenizer.tokenize(story_title)
        stem_contents = [stemmer.stem(word) for word in tok_contents
                         if prog.match(word) is None]
        story_as_list[NEW_STORIES_TITLE_INDEX] = " ".join(stem_contents)
        stories_list.append(story_as_list)
    
    story_stream.close()
    output_file_path = input_file_path + STEMMED_STORIES_EXTENSION
    write_2d_iterable(stories_list, output_file_path)
    print("Output stemmed stories to %s" % output_file_path)
    report_time_elapsed(start_time)
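
# A quick sketch of the stemming transform above (assumes NLTK is installed;
# exact stems can vary with NLTK version):
#
#   >>> tokens = WordPunctTokenizer().tokenize("running faster, jumping")
#   >>> [PorterStemmer().stem(t) for t in tokens
#   ...  if re.match(r'\W+', t) is None]
#   ['run', 'faster', 'jump']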


def _clean_data(input_file_path, num_fields, timestamp_index, data_descriptor,
                insert_data_fn, stories_dict, callback_data=None):
    """Clean every row of the raw data file at input_file_path.

    Delegate per-row validation and storage to _clean_row and
    insert_data_fn, then report how many of the rows read were discarded.
    """
    start_time = time.time()
    stories_dict_already_built = (len(stories_dict) > 0)
    num_rows = 0
    input_stream = open_safely(input_file_path)
    for row in input_stream:
        num_rows += 1
        row_without_newline = row.rstrip("\n")
        _clean_row(row_without_newline, num_fields, timestamp_index,
                   insert_data_fn, stories_dict, callback_data)
    
    input_stream.close()
    
    if stories_dict_already_built:
        # We just cleaned user reads or clickthroughs.
        num_valid_rows = len(callback_data)
    else:
        # We just cleaned stories.
        num_valid_rows = len(stories_dict)
    
    num_invalid_rows = num_rows - num_valid_rows
    # Avoid a ZeroDivisionError if the input file was empty.
    discard_rate = (100.0 * num_invalid_rows / num_rows) if num_rows else 0.0
    print("Read a total of %d %s, %d (%.2f%%) of which were discarded." %
          (num_rows, data_descriptor, num_invalid_rows, discard_rate))
    report_time_elapsed(start_time)
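
# insert_data_fn is supplied by each caller of _clean_data; a minimal,
# hypothetical sketch of a stories callback, assuming the key is every field
# except the timestamp and that the most recent timestamp wins:
#
# def _insert_story(fields, timestamp_index, stories_dict, callback_data):
#     story_key = tuple(fields[:timestamp_index])
#     timestamp = int(fields[timestamp_index])
#     stories_dict[story_key] = max(stories_dict.get(story_key, timestamp),
#                                   timestamp)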


def _write_user_ids(user_ids_list):
    """Write the sorted original user IDs to USER_IDS_FILE_PATH, one per
    line; each ID's zero-based line number is its new processed ID.
    """
    start_time = time.time()
    output_stream = open_safely(USER_IDS_FILE_PATH, "w")
    for user_id in user_ids_list:
        output_stream.write(user_id + "\n")
    output_stream.close()
    num_users = len(user_ids_list)
    print(("Wrote %d cleaned and sorted original 38-character hexadecimal %s " +
           "to %s") % (num_users, USER_IDS_DESCRIPTOR, USER_IDS_FILE_PATH))
    report_time_elapsed(start_time)


def _write_events(events_list, output_file_path, event_descriptor):
    """Write the given events to output_file_path, one DELIMITER-separated
    event per line.
    """
    start_time = time.time()
    output_stream = open_safely(output_file_path, "w")
    for event in events_list:
        output_stream.write(DELIMITER.join(map(str, event)) + "\n")
    output_stream.close()
    num_events = len(events_list)
    print("Wrote %d cleaned and sorted %s to %s" %
          (num_events, event_descriptor, output_file_path))
    report_time_elapsed(start_time)
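

# create_fixtures below calls an event writer with a story-ID map and a
# minimum user ID, which the three-argument _write_events above does not
# accept.  The helper below is a minimal sketch of that writer under these
# assumptions: story_id_dict maps each original story ID to its new,
# contiguous ID, and shifting user IDs down by min_user_id yields the
# 0, 1, 2, etc. numbering described in the create_fixtures docstring.
def _write_fixture_events(events_list, output_file_path, story_id_dict,
                          min_user_id):
    output_stream = open_safely(output_file_path, "w")
    for event in events_list:
        event = list(event)  # copy so the caller's events are not mutated
        event[EVENTS_USER_ID_INDEX] -= min_user_id
        event[EVENTS_STORY_ID_INDEX] = \
            story_id_dict[event[EVENTS_STORY_ID_INDEX]]
        output_stream.write(DELIMITER.join(map(str, event)) + "\n")
    output_stream.close()
    print("Wrote %d remapped events to %s" %
          (len(events_list), output_file_path))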


def create_fixtures(min_user_id, max_user_id):
    """Create processed Pulse log files with data only for the given users.

    Assume processed data is available in PROCESSED_DATA_DIRECTORY.  Include
    only events performed by the given users and stories referenced in such
    events.  Reassign story IDs to account for the omission of other stories.
    Reassign the given user IDs to 0, 1, 2, etc. to account for the omission
    of other users.  Place output in a directory named Fixtures for Users
    min_user_id-max_user_id within PROCESSED_DATA_DIRECTORY, creating such a
    directory if it does not already exist.
    
    min_user_id, an int, is the smallest user ID to include in the output, and
    is in processed form (i.e., 0, 1, 2) rather than the original 38-character
    hexadecimal format.
    max_user_id, an int, is the largest user ID to include in the output, and
    is in processed form (i.e., 0, 1, 2) rather than the original 38-character
    hexadecimal format.
    """
    start_time = time.time()
    if not isinstance(min_user_id, int) or not isinstance(max_user_id, int):
        raise TypeError("min_user_id and max_user_id must both be of type int.")
    if min_user_id > max_user_id:
        raise ValueError("min_user_id is %d but must be less than or equal "
                         "to max_user_id, which is %d." %
                         (min_user_id, max_user_id))
    if min_user_id < 0:
        raise ValueError("min_user_id is %d, but user IDs must be "
                         "non-negative." % min_user_id)
    reads_list = _read_events(min_user_id, max_user_id,
                              PROCESSED_READS_FILE_PATH)
    clickthroughs_list = _read_events(min_user_id, max_user_id,
                                      PROCESSED_CLICKTHROUGHS_FILE_PATH)
    max_user_id_found = _get_largest_user_id(reads_list, clickthroughs_list)
    if max_user_id_found is None:
        raise LookupError("No user IDs in the range [%d, %d] were found in "
                          "the processed data." % (min_user_id, max_user_id))
    if max_user_id_found < max_user_id:
        raise LookupError("max_user_id is %d, but the largest user ID in "
                          "the processed data is %d." %
                          (max_user_id, max_user_id_found))
    story_ids = frozenset([event[EVENTS_STORY_ID_INDEX]
                           for event in reads_list + clickthroughs_list])
    stories_list, story_id_dict = _read_stories(story_ids)
    output_directory = "%sFixtures for Users %d-%d/" % \
        (PROCESSED_DATA_DIRECTORY, min_user_id, max_user_id)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_reads_path = output_directory + READS_FILENAME
    _write_fixture_events(reads_list, output_reads_path, story_id_dict,
                          min_user_id)
    output_clickthroughs_path = output_directory + CLICKTHROUGHS_FILENAME
    _write_fixture_events(clickthroughs_list, output_clickthroughs_path,
                          story_id_dict, min_user_id)
    output_stories_path = output_directory + STORIES_FILENAME
    write_iterable(stories_list, output_stories_path, "")
    print("Output fixtures in directory: %s" % output_directory)
    report_time_elapsed(start_time)
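
# Example usage (paths and filenames come from this module's configuration):
#
#   >>> create_fixtures(0, 9)
#
# would write remapped reads, clickthroughs, and stories files into
# "Fixtures for Users 0-9/" under PROCESSED_DATA_DIRECTORY.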


def get_user_ids(reads_list, clickthroughs_list):
    """Reassign the user IDs in the given events to 0, 1, 2, etc.

    Collect the distinct original 38-character hexadecimal user IDs, sort
    them, rewrite each event's user ID as its index in that sorted order,
    and return the sorted list of original user IDs.
    """
    start_time = time.time()
    user_ids_set = set()
    for read in reads_list:
        user_ids_set.add(read[EVENTS_USER_ID_INDEX])
    for clickthrough in clickthroughs_list:
        user_ids_set.add(clickthrough[EVENTS_USER_ID_INDEX])
    user_ids_list = sorted(user_ids_set)
    user_ids_dict = {original_user_id: new_user_id
                     for (new_user_id, original_user_id)
                     in enumerate(user_ids_list)}
    _reassign_user_ids(user_ids_dict, reads_list)
    _reassign_user_ids(user_ids_dict, clickthroughs_list)
    print("Reassigned %s from original values to 0, 1, 2, etc." % \
          USER_IDS_DESCRIPTOR)
    report_time_elapsed(start_time)
    return user_ids_list
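
# _reassign_user_ids is defined elsewhere in this module; a minimal sketch of
# the in-place rewrite it is assumed to perform (hypothetical body):
#
# def _reassign_user_ids(user_ids_dict, events_list):
#     for event in events_list:
#         event[EVENTS_USER_ID_INDEX] = \
#             user_ids_dict[event[EVENTS_USER_ID_INDEX]]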