示例#1
0
class DirectIndexWriter:
    """Store per-document sentence lists in a shelve-backed index.

    ``add_to_index`` extracts text from markup, splits it with ``splitter``
    and writes the resulting ``Sentence`` objects to the shelf under the
    document name.

    The shelf is released by ``close()``, by leaving a ``with`` block, or,
    as a last resort, when the object is finalized.
    """

    def __init__(self, index_path, splitter=lambda a: re.split(r"[.!?]", a)):
        """Open the shelf at ``index_path`` and set up the text helpers.

        Args:
            index_path: Path handed to ``shelve.open``.
            splitter: Callable taking the extracted text and returning an
                iterable of sentence strings. The default splits on
                ``.``, ``!`` and ``?``.
        """
        # Flag "w" opens an *existing* database for reading and writing;
        # the underlying dbm module raises if the file is missing.
        self.index_shlv = shelve.open(index_path, "w")
        self.text_extractor = HTMLTextExtractor(skip_tags=("a", "style", "script", "noindex", "title"))
        self.splitter = splitter
        self.word_tokenizer = nltk.tokenize.word_tokenize
        self.morph = pymorphy2.MorphAnalyzer()

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the shelf; exceptions from the ``with`` body propagate."""
        self.close()
        return False

    def add_to_index(self, name, markup):
        """Extract, split and store the sentences of one document.

        Args:
            name: Shelf key under which the sentence list is stored.
            markup: Markup passed to ``self.text_extractor.extract``.
        """
        text = self.text_extractor.extract(markup)
        # Each sentence remembers its ordinal position within the document.
        sentences = [Sentence(sentence_text, pos) for pos, sentence_text in enumerate(self.splitter(text))]
        self.index_shlv[name] = sentences

    def close(self):
        """Close the underlying shelf; safe to call repeatedly.

        Also safe on an instance whose ``__init__`` failed before the shelf
        was opened, in which case there is nothing to close.
        """
        shelf = getattr(self, "index_shlv", None)
        if shelf is not None:
            shelf.close()

    def __del__(self):
        # __del__ also runs for objects whose __init__ raised, so it must
        # not assume that index_shlv exists.
        self.close()
示例#2
0
 def __init__(self, index_path, splitter=lambda a: re.split("[.!?]", a)):
     """Open the index shelf and prepare the text-processing helpers.

     ``index_path`` is passed to ``shelve.open`` with flag ``"w"``;
     ``splitter`` is stored as-is for later use on extracted text.
     """
     # Open the shelf first, so nothing else is set up if opening fails.
     self.index_shlv = shelve.open(index_path, flag="w")

     # Tags handed to the extractor as ``skip_tags``.
     ignored_tags = ("a", "style", "script", "noindex", "title")
     self.text_extractor = HTMLTextExtractor(skip_tags=ignored_tags)

     self.splitter = splitter
     self.word_tokenizer = nltk.tokenize.word_tokenize
     self.morph = pymorphy2.MorphAnalyzer()