def stem_processed_stories(input_file_path):
    """
    """
    start_time = time.time()
    if not isinstance(input_file_path, str):
        raise TypeError("Expected input_file_path to be of type str.")
    
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stories_list = []
    prog = re.compile(r'\W+')
    story_stream = open_safely(input_file_path)
    for story_as_str in story_stream:
        story_as_list = story_as_str[:-1].lower().split(DELIMITER)
        story_title = story_as_list[NEW_STORIES_TITLE_INDEX]
        tok_contents = tokenizer.tokenize(story_title)
        stem_contents = [stemmer.stem(word) for word in tok_contents
                         if prog.match(word) is None]
        story_as_list[NEW_STORIES_TITLE_INDEX] = " ".join(stem_contents)
        stories_list.append(story_as_list)
    
    story_stream.close()
    output_file_path = input_file_path + STEMMED_STORIES_EXTENSION
    write_2d_iterable(stories_list, output_file_path)
    print("Output stemmed stories to %s" % output_file_path)
    report_time_elapsed(start_time)
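

# A minimal, self-contained sketch of the per-title stemming performed by
# stem_processed_stories, assuming the NLTK tokenizer and stemmer used above
# are available. The function below is illustrative only and is not called
# anywhere in this module.
def _stem_title_sketch(title):
    import re
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import WordPunctTokenizer

    non_word = re.compile(r"\W+")
    tokens = WordPunctTokenizer().tokenize(title.lower())
    # Drop tokens that begin with a non-word character (punctuation), then stem.
    return " ".join(PorterStemmer().stem(token) for token in tokens
                    if non_word.match(token) is None)
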
def _write_stories(stories_dict):
    """
    Write the cleaned, sorted stories to PROCESSED_STORIES_FILE_PATH and
    replace each story's value in stories_dict with its row number in that
    file.
    """
    start_time = time.time()
    sorted_stories = sorted(stories_dict.keys())
    row_num = 0
    output_stream = open_safely(PROCESSED_STORIES_FILE_PATH, "w")
    for story_key in sorted_stories:
        if FETCH_FULL_STORIES:
            story_timestamp, story_contents = stories_dict[story_key]
            story_title_with_contents = story_key[NEW_STORIES_TITLE_INDEX] + \
                " " + story_contents
            story_sans_timestamp_as_tuple = \
                (story_key[NEW_STORIES_FEED_URL_INDEX],
                 story_key[NEW_STORIES_FEED_TITLE_INDEX],
                 story_key[NEW_STORIES_URL_INDEX],
                 story_title_with_contents)
            story_sans_timestamp_as_str = \
                DELIMITER.join(story_sans_timestamp_as_tuple)
        else:
            story_timestamp = stories_dict[story_key]
            story_sans_timestamp_as_str = DELIMITER.join(story_key)
        story_timestamp_as_str = DELIMITER + str(story_timestamp)
        story_as_str = story_sans_timestamp_as_str + story_timestamp_as_str
        output_stream.write(story_as_str + "\n")
        stories_dict[story_key] = row_num
        row_num += 1
    output_stream.close()
    print("Wrote %d cleaned and sorted %s to %s" %
          (row_num, STORIES_DESCRIPTOR, PROCESSED_STORIES_FILE_PATH))
    report_time_elapsed(start_time)
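

# Illustrative note on the output of _write_stories: each line is the story key
# fields joined by DELIMITER, with the timestamp appended as the final field.
# Assuming a tab delimiter and the field order used in the FETCH_FULL_STORIES
# branch above, a written line would look roughly like
#   feed_url<TAB>feed_title<TAB>story_url<TAB>stemmed title words<TAB>1322697600
# (the concrete delimiter and field order come from the module's constants).
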
def _clean_data(input_file_path, num_fields, timestamp_index, data_descriptor,
                insert_data_fn, stories_dict, callback_data=None):
    """
    Clean every row of input_file_path with _clean_row and report how many
    rows were discarded. If stories_dict is already populated, the rows being
    cleaned are user reads or clickthroughs and the valid rows are collected in
    callback_data; otherwise the rows are stories and they populate
    stories_dict.
    """
    start_time = time.time()
    stories_dict_already_built = (len(stories_dict) > 0)
    num_rows = 0
    input_stream = open_safely(input_file_path)
    for row in input_stream:
        num_rows += 1
        row_without_newline = row[:-1]
        _clean_row(row_without_newline, num_fields, timestamp_index,
                   insert_data_fn, stories_dict, callback_data)
    
    input_stream.close()
    
    if stories_dict_already_built:
        # We just cleaned user reads or clickthroughs.
        num_valid_rows = len(callback_data)
    else:
        # We just cleaned stories.
        num_valid_rows = len(stories_dict)
    
    num_invalid_rows = num_rows - num_valid_rows
    discard_rate = float(100 * num_invalid_rows) / float(num_rows)
    print("Read a total of %d %s, %d (%.2f%%) of which were discarded." %
          (num_rows, data_descriptor, num_invalid_rows, discard_rate))
    report_time_elapsed(start_time)
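

# Hypothetical usage sketch for _clean_data (the argument values, file paths,
# and insert callbacks below are placeholders, not the module's real constants
# or functions): the stories pass populates stories_dict, and a later pass over
# reads or clickthroughs reuses that dict and collects its valid rows in
# callback_data.
#
#   stories_dict = {}
#   _clean_data(STORIES_FILE_PATH, num_fields=5, timestamp_index=4,
#               data_descriptor="stories", insert_data_fn=_insert_story,
#               stories_dict=stories_dict)
#   reads = []
#   _clean_data(READS_FILE_PATH, num_fields=3, timestamp_index=2,
#               data_descriptor="user reads", insert_data_fn=_insert_read,
#               stories_dict=stories_dict, callback_data=reads)
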
def _read_stories():
    """
    Read the processed stories from STORIES_FILE_PATH and return them as a
    list of tuples, with the timestamp field converted to an int.
    """
    stories = []
    story_stream = open_safely(STORIES_FILE_PATH)
    for story_as_str in story_stream:
        story_as_list = story_as_str[:-1].lower().split(DELIMITER)
        time_first_read = int(story_as_list[STORIES_TIMESTAMP_INDEX])
        story_as_list[STORIES_TIMESTAMP_INDEX] = time_first_read
        stories.append(tuple(story_as_list))
    story_stream.close()
    return stories


def _write_user_ids(user_ids_list):
    """Write the cleaned, sorted user IDs to USER_IDS_FILE_PATH, one per line."""
    start_time = time.time()
    output_stream = open_safely(USER_IDS_FILE_PATH, "w")
    for user_id in user_ids_list:
        output_stream.write(user_id + "\n")
    output_stream.close()
    num_users = len(user_ids_list)
    print(("Wrote %d cleaned and sorted original 38-character hexadecimal %s " +
           "to %s") % (num_users, USER_IDS_DESCRIPTOR, USER_IDS_FILE_PATH))
    report_time_elapsed(start_time)


def _write_events(events_list, output_file_path, event_descriptor):
    """Write the cleaned, sorted events to output_file_path, one per line."""
    start_time = time.time()
    output_stream = open_safely(output_file_path, "w")
    for event in events_list:
        output_stream.write(DELIMITER.join(map(str, event)) + "\n")
    output_stream.close()
    num_events = len(events_list)
    print("Wrote %d cleaned and sorted %s to %s" %
          (num_events, event_descriptor, output_file_path))
    report_time_elapsed(start_time)


def classify(version):
    """
    Train and evaluate a per-user, per-day tf-idf classifier over story titles
    and write the resulting precision, recall, and F_1 scores to a file in
    OUTPUT_DIRECTORY.
    """
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.mkdir(OUTPUT_DIRECTORY)
    ignore = False
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                        level=logging.INFO)

    random.seed()

    ############################################

    stories = _read_stories()
    events = _read_events()
    # NUM_USERS_TO_ANALYZE = 500

    user_list = [[] for i in range(NUM_USERS_TO_ANALYZE)]
    day = 0
    max_day = 30
    curr_day = EARLIEST_ACCEPTABLE_TIMESTAMP
    curr_day += SECONDS_IN_DAY
    reselect_by_user = [[] for i in range(NUM_USERS_TO_ANALYZE)]
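
    # Slide a one-day window across the first max_day days: each iteration
    # rebuilds the vocabulary and tf-idf model from every story published up to
    # curr_day, trains a per-user model, and evaluates its predictions.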
    while day < max_day:
        corpus_dict = corpora.Dictionary(
            story[NEW_STORIES_TITLE_INDEX].split()
            for story in stories
            if story[STORIES_TIMESTAMP_INDEX] <= curr_day
        )
        # Collect the ids of stop words and of words that appear only once.
        stop_ids = [corpus_dict.token2id[stopword] for stopword in STOPLIST
                    if stopword in corpus_dict.token2id]
        once_ids = [tokenid for tokenid, docfreq in corpus_dict.dfs.items()
                    if docfreq == 1]
        # An alternative (currently unused) approach would replace each
        # once-occurring token with "UNK" instead of filtering it out:
        # for tokenid in once_ids:
        #     corpus_dict[tokenid] = "UNK"

        # Remove the stop words and the words that appear only once.
        corpus_dict.filter_tokens(stop_ids + once_ids)

        # remove gaps in id sequence after words that were removed
        corpus_dict.compactify()
        ####################
        #      tf-idf      #
        ####################

        tfidf = _build_tfidf_model(corpus_dict, stories, curr_day)
        for user_id in range(NUM_USERS_TO_ANALYZE):
            # user_id+=904
            user_tfidf, pos_tfidf, num_pos_train, num_neg_train, to_ignore = _tfidf(
                tfidf, corpus_dict, stories, events, user_id, curr_day, reselect_by_user[user_id], False
            )
            # reselect_by_user = [[] for i in range(NUM_USERS_TO_ANALYZE)]
            if user_tfidf != []:
                # modelsvm = train(labels, corpus_tfidf)
                to_predict, other_tfidf, num_pos_predict, num_neg_predict, chosen_stories = _tfidf(
                    tfidf, corpus_dict, stories, events, user_id, curr_day, [], True
                )

                if to_predict != []:
                    p_labs, p_vals, labels_predict = _train_and_predict(
                        user_tfidf,
                        pos_tfidf,
                        to_predict,
                        num_pos_train,
                        num_neg_train,
                        num_pos_predict,
                        num_neg_predict,
                        version,
                        ignore,
                    )
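                    # Stories with a negative true label that the model
                    # nonetheless predicted as positive are "reselected": if
                    # such a story's third field (presumably its timestamp)
                    # falls within the next day, it is queued in
                    # reselect_by_user to be offered to this user again.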
                    reselect = []
                    num_bool = True
                    for i in range(len(p_labs)):
                        if labels_predict[i] == -1:
                            if num_bool:
                                num_pos_predict = i
                                num_bool = False
                            if p_labs[i] == 1:
                                next_day = curr_day + SECONDS_IN_DAY
                                if chosen_stories[i - num_pos_predict][2] <= next_day:
                                    reselect += [chosen_stories[i - num_pos_predict]]
                    reselect_by_user[user_id] += reselect
                    p, r, f = _p_r_f_one(labels_predict, p_labs)
                    user_list[user_id].append((p, r, f, day))

        curr_day += SECONDS_IN_DAY
        day += 1

    user_a_p = 0
    user_a_r = 0
    user_a_f = 0
    skipped = 0
    print("Read stories from %s" % STORIES_FILE_PATH)
    print("Read events from %s" % EVENTS_FILE_PATH)
    print("%d users were analyzed" % NUM_USERS_TO_ANALYZE)
    output_file_name = "reselect.py %s %s %d %d output written at %d.txt" % (
        version,
        sys.argv[2],
        KERNEL_NUMBER,
        NUM_USERS_TO_ANALYZE,
        time.time(),
    )
    output_file_path = OUTPUT_DIRECTORY + output_file_name
    print("Outputting precision, recall, and f_1 scores to %s" % output_file_path)
    user_id = 0
    output_stream = open_safely(output_file_path, "w")
    for user in user_list:
        av_p = 0
        av_r = 0
        for results in user:
            av_p += results[0]
            av_r += results[1]
            f_1 = results[2]
            day = results[3]
            output_stream.write("%.3f\t%.3f\t%.3f\t%d\t%d\n" % (results[0], results[1], f_1, day, user_id))
        if len(user) > 0:
            av_p = av_p / float(len(user))
            av_r = av_r / float(len(user))
            denominator = av_p + av_r
            if denominator == 0.0:
                av_f = 0.0
            else:
                av_f = (2 * av_p * av_r) / denominator
            user_a_p += av_p
            user_a_r += av_r
        else:
            skipped += 1
        user_id += 1
    user_a_p = user_a_p / float(NUM_USERS_TO_ANALYZE - skipped)
    user_a_r = user_a_r / float(NUM_USERS_TO_ANALYZE - skipped)
    denominator = user_a_p + user_a_r
    if denominator == 0.0:
        user_a_f = 0.0
    else:
        user_a_f = (2 * user_a_p * user_a_r) / denominator
    output_stream.write("%.3f\t%.3f\t%.3f\t-1\t-1\n" % (user_a_p, user_a_r, user_a_f))
    output_stream.close()
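

# A minimal sketch of the positive-class precision/recall/F_1 computation that
# _p_r_f_one (defined elsewhere in this module) presumably performs; the name
# and exact behaviour below are assumptions, and this sketch is not called
# anywhere here. Labels are assumed to be +1 (positive) and -1 (negative), as
# in classify above.
def _p_r_f_one_sketch(true_labels, predicted_labels):
    pairs = list(zip(true_labels, predicted_labels))
    # Count true positives, false positives, and false negatives for label +1.
    tp = sum(1 for t, p in pairs if t == 1 and p == 1)
    fp = sum(1 for t, p in pairs if t == -1 and p == 1)
    fn = sum(1 for t, p in pairs if t == 1 and p == -1)
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    f_one = ((2 * precision * recall) / (precision + recall)
             if (precision + recall) else 0.0)
    return precision, recall, f_one
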
def _read_events():
    """
    Read the cleaned events from EVENTS_FILE_PATH and return them as a list of
    tuples of ints.
    """
    event_stream = open_safely(EVENTS_FILE_PATH)
    events = [tuple(map(int, event[:-1].split(DELIMITER)))
              for event in event_stream]
    event_stream.close()
    return events
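

# A minimal sketch, using the standard gensim API, of what _build_tfidf_model
# (defined elsewhere in this module) presumably does with the dictionary built
# in classify: turn the title of every story published up to curr_day into a
# bag-of-words vector and fit a TfidfModel on those vectors. The constants
# referenced are the module's; the function name and exact behaviour here are
# assumptions, and this sketch is not called anywhere in this file.
def _build_tfidf_model_sketch(corpus_dict, stories, curr_day):
    from gensim import models

    # Bag-of-words corpus over the titles of stories seen so far.
    bow_corpus = [corpus_dict.doc2bow(story[NEW_STORIES_TITLE_INDEX].split())
                  for story in stories
                  if story[STORIES_TIMESTAMP_INDEX] <= curr_day]
    return models.TfidfModel(bow_corpus)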