예제 #1
0
파일: index.py 프로젝트: kaiserahmed/CS3245
def main():
    """
    Entry point: build the index from every file found in ``dir_to_index``.

    First pass: directory entries are visited in ascending numeric order of
    their names, so every name must parse as an int. Each file's lines are
    stripped and joined with single spaces; the resulting text is handed to
    get_each_file_term_frequency() and that result is fed to index_content().

    Second pass: for every pointer stored in ``dictionary``, each posting in
    ``postings[pointer]`` gets SearchIndex.cal_log_tfs(posting[1]) appended
    (posting[1] is presumably the raw term frequency -- confirm against
    index_content()).

    Finally create_files() is called with the number of files processed.
    """
    doc_ids = sorted(os.listdir(dir_to_index), key=int)

    # Pass 1: read each document and index its term frequencies.
    for doc_id in doc_ids:
        with open(os.path.join(dir_to_index, doc_id), 'r') as handle:
            stripped_lines = [line.strip() for line in handle.readlines()]
        text = " ".join(stripped_lines)
        frequencies = get_each_file_term_frequency(text, doc_id)
        index_content(frequencies, doc_id)

    # Pass 2: extend every posting in place with its computed weight.
    for _term, pointer in dictionary.iteritems():
        for posting in postings[pointer]:
            weight = SearchIndex.cal_log_tfs(posting[1])
            posting.append(weight)

    create_files(len(doc_ids))