class Preprocessor(object):
    """Pipe the lines produced by a reader through a filter hook into a writer.

    ``Reader`` and ``Writer`` are defined elsewhere in the project. This class
    only relies on ``include``/``get_lines``/``close`` on the reader and on
    ``write_line``/``close`` on the writer.

    Instances can be used as context managers, in which case ``close()`` is
    called on exit::

        with Preprocessor(src, dst, includes) as pp:
            pp.run()
    """

    def __init__(self, source_filename, target_filename, include_filenames):
        """Open the reader and the writer and prepare the line stream.

        :param source_filename: passed to ``Reader`` as the main input
        :param target_filename: passed to ``Writer`` as the output
        :param include_filenames: iterable of names, each handed to
            ``Reader.include`` in order
        """
        self.reader = Reader(source_filename)
        for filename in include_filenames:
            self.reader.include(filename)
        self.writer = Writer(target_filename)
        self.defines = {}
        # Per the original author's note, get_lines() is a generator.
        self.lines = self.reader.get_lines()

    def __enter__(self):
        """Return the preprocessor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close reader and writer; exceptions from the body are not suppressed."""
        self.close()

    def define(self, line):
        """Placeholder hook: currently does nothing."""
        pass

    def include(self, line):
        """Placeholder hook: currently does nothing."""
        pass

    def process(self, line):
        """Tell ``run`` whether ``line`` should be written to the output.

        The current implementation accepts every line.
        """
        return True

    def run(self):
        """Write every line accepted by ``process`` to the writer."""
        for line in self.lines:
            if self.process(line):
                self.writer.write_line(line)

    def close(self):
        """Close the reader and then the writer.

        The writer is closed even when closing the reader raises, so a failure
        on the input side cannot leak the output handle. The reader's
        exception still propagates to the caller.
        """
        try:
            self.reader.close()
        finally:
            self.writer.close()
def __init__(self, source_filename, target_filename, include_filenames):
    """Set up the reader, the writer and the stream of input lines.

    :param source_filename: name handed to ``Reader`` as the main input
    :param target_filename: name handed to ``Writer`` as the output
    :param include_filenames: iterable of names, each registered on the
        reader through ``include`` in the order given
    """
    # Keep a local alias so the reader does not have to be looked up on
    # ``self`` for every call below.
    self.reader = reader = Reader(source_filename)
    for include_name in include_filenames:
        reader.include(include_name)

    self.writer = Writer(target_filename)
    self.defines = dict()
    # Per the original author's note, get_lines() is a generator.
    self.lines = reader.get_lines()
def indexing(self, files, ignore_case=True, ignore_stop_words=True, stemming=True, use_weights=True,
             title_weight=5, date_weight=2, memory_limit=50, use_vbytes=True):
    """
    Launch the indexing of a list of files.

    Progress is exposed through ``self.current_status``. When the indexing is
    done, the inverted file is pickled to ``self.PICKLES[0]`` and the
    doc-id-to-filename map to ``self.PICKLES[1]``.

    :param files: the paths to the files to index
    :param ignore_case: should case be ignored in the indexing?
    :param ignore_stop_words: should stop words be ignored?
    :param stemming: should the tokens be stemmed?
    :param use_weights: should words be weighted differently depending on their position in the document?
    :param title_weight: weight for words in the title
    :param date_weight: weight for words in the date
    :param memory_limit: limit on the memory before a flush to a temp file
    :param use_vbytes: use variable bytes for the final posting list?
    :return: None, once the indexing is finished
    """
    SC.new_indexing()
    self.current_status = "Indexing - Starting"
    # Double-underscore attribute: Python name-mangles it with the name of
    # the enclosing class.
    self.__id_to_filename = SortedDict()
    self.inv_file = InvertedFile(use_vbytes, memory_limit)

    for path in files:
        self.current_status = "Indexing - {}".format(path)
        file_docs = Reader.read_file(path, ignore_case, ignore_stop_words, stemming,
                                     use_weights, title_weight, date_weight)
        for doc in file_docs:
            # Remember which input file each document id came from.
            self.__id_to_filename[int(doc.doc_id())] = path
            self.inv_file.add_document(doc)

    self.current_status = "Indexing - Making the inverted file"
    self.inv_file.gen_pl_file()

    self.current_status = "Indexing - Saving to pickle file"
    with open(self.PICKLES[0], "wb") as inv_file_pickle:
        pickle.dump(self.inv_file, inv_file_pickle)
    with open(self.PICKLES[1], "wb") as id_map_pickle:
        pickle.dump(self.__id_to_filename, id_map_pickle)

    self.current_status = "Indexing - Finished - You can query"
    SC.last_indexing().stop()
    SC.last_indexing().log(files, ignore_case, ignore_stop_words, stemming, use_weights,
                           title_weight, date_weight, memory_limit, use_vbytes)