class DirectIndexWriter:
    """Store per-document sentence lists in a shelve-backed direct index.

    Each document is reduced to plain text, split into sentences, and saved
    in the shelf under the document's name as a list of ``Sentence`` objects.

    The writer owns an open shelf. Release it with :meth:`close` or by using
    the instance as a context manager; ``__del__`` remains only as a
    best-effort fallback.
    """

    def __init__(self, index_path, splitter=lambda a: re.split("[.!?]", a)):
        """Open the index and set up the text-processing helpers.

        Args:
            index_path: Path passed to ``shelve.open``.
            splitter: Callable taking the extracted text and returning an
                iterable of sentence strings. The default splits on ``.``,
                ``!`` and ``?``; the pieces are not stripped and may include
                empty strings (e.g. after a trailing terminator).

        Raises:
            Whatever ``shelve.open`` raises when the database cannot be
            opened.
        """
        # NOTE(review): flag "w" opens an *existing* database for reading and
        # writing and does not create a missing one -- confirm that callers
        # create the index beforehand.
        self.index_shlv = shelve.open(index_path, "w")
        # presumably text inside these tags is left out of the extracted
        # text -- verify against HTMLTextExtractor.
        self.text_extractor = HTMLTextExtractor(
            skip_tags=("a", "style", "script", "noindex", "title")
        )
        self.splitter = splitter
        # The two helpers below are stored but not used by the methods
        # visible in this class.
        self.word_tokenizer = nltk.tokenize.word_tokenize
        self.morph = pymorphy2.MorphAnalyzer()

    def add_to_index(self, name, markup):
        """Extract text from *markup* and store its sentences under *name*.

        Args:
            name: Shelf key for the document.
            markup: Document markup handed to the text extractor.
        """
        text = self.text_extractor.extract(markup)
        # Each sentence is paired with its ordinal position in the document.
        self.index_shlv[name] = [
            Sentence(sentence_text, pos)
            for pos, sentence_text in enumerate(self.splitter(text))
        ]

    def close(self):
        """Close the underlying shelf if it was ever opened.

        Safe to call on an instance whose ``__init__`` failed before the
        shelf attribute was assigned.
        """
        shelf = getattr(self, "index_shlv", None)
        if shelf is not None:
            shelf.close()

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the shelf on leaving the ``with`` block; never suppresses."""
        self.close()

    def __del__(self):
        """Best-effort cleanup when the writer is garbage-collected."""
        # Delegates to close() so a partially constructed instance does not
        # raise AttributeError from the finalizer.
        self.close()
def __init__(self, index_path, splitter=lambda a: re.split("[.!?]", a)):
    """Open the index shelf and attach the text-processing helpers.

    Args:
        index_path: Path passed to ``shelve.open`` (opened with flag "w").
        splitter: Callable that turns a text into an iterable of sentence
            strings; the default splits on ``.``, ``!`` and ``?``.
    """
    # The shelf is opened first, exactly as before, so a failure here
    # happens before any other helper is constructed.
    self.index_shlv = shelve.open(index_path, "w")

    # Tag names handed to the extractor as ``skip_tags``.
    ignored_tags = ("a", "style", "script", "noindex", "title")
    self.text_extractor = HTMLTextExtractor(skip_tags=ignored_tags)

    self.splitter = splitter
    self.word_tokenizer = nltk.tokenize.word_tokenize
    self.morph = pymorphy2.MorphAnalyzer()