def main():
    """
    Entry point: index every file in ``dir_to_index`` and emit the results.

    First pass: each file is read, its lines are stripped and joined with
    single spaces, and the resulting text is handed to
    ``get_each_file_term_frequency`` and then ``index_content``.

    Second pass: every posting entry reachable through ``dictionary`` gets
    ``SearchIndex.cal_log_tfs(entry[1])`` appended to it as its weight.

    Finally ``create_files`` is called with the number of files processed
    (presumably it writes the dictionary and postings files -- confirm
    against its definition).

    Raises:
        ValueError: if a name in ``dir_to_index`` is not an integer string,
            because the listing is sorted numerically with ``key=int``.
    """
    # Numeric sort so that e.g. "10" comes after "9" rather than after "1".
    doc_names = sorted(os.listdir(dir_to_index), key=int)

    for doc_name in doc_names:
        filepath = os.path.join(dir_to_index, doc_name)
        with open(filepath, 'r') as f:
            # Iterate the file lazily instead of materialising readlines().
            content = " ".join(line.strip() for line in f)
        term_freq = get_each_file_term_frequency(content, doc_name)
        index_content(term_freq, doc_name)

    # Second pass: only the postings pointers are needed, not the terms.
    for pointer in dictionary.itervalues():
        for entry in postings[pointer]:
            entry.append(SearchIndex.cal_log_tfs(entry[1]))

    create_files(len(doc_names))