def accept(self, consumer_input: PipedInput):
     """Lemmatize the input text and keep only recognised-language tokens.

     The text is lower-cased and passed to ``self.mystem.lemmatize``. Each
     returned token is stripped of surrounding whitespace and kept only if
     ``is_russian``, ``is_belarusian`` or ``is_english`` accepts it.

     Returns the result of ``consumer_input.new`` with ``text`` set to the
     kept tokens joined by single spaces.
     """
     lowered = consumer_input.get_text().lower()
     stripped = (lemma.strip() for lemma in self.mystem.lemmatize(lowered))
     kept = [
         lemma for lemma in stripped
         if is_russian(lemma) or is_belarusian(lemma) or is_english(lemma)
     ]
     return consumer_input.new(text=" ".join(kept))
 def accept(self, consumer_input: PipedInput):
     """Stem every recognised-language token of the input text.

     The text is lower-cased and split with ``word_tokenize``; each token
     is stripped of surrounding whitespace. Tokens accepted by
     ``is_russian`` or ``is_belarusian`` go through
     ``self.russian_stemmer``; tokens accepted by ``is_english`` go through
     ``self.english_stemmer``.

     Returns the result of ``consumer_input.new`` with ``text`` set to the
     stems joined by single spaces.
     """
     lowered = consumer_input.get_text().lower()
     stems = []
     for raw_word in word_tokenize(lowered):
         word = raw_word.strip()
         # The two checks are independent (not if/elif): a token accepted
         # by both branches contributes one stem from each stemmer.
         if is_russian(word) or is_belarusian(word):
             stems.append(self.russian_stemmer.stem(word))
         if is_english(word):
             stems.append(self.english_stemmer.stem(word))
     return consumer_input.new(text=" ".join(stems))
 def accept(self, consumer_input: PipedInput):
     """Decode the JSON-encoded metadata of the input.

     Returns the result of ``consumer_input.new`` with ``meta`` set to the
     object produced by ``json.loads`` on ``consumer_input.get_meta()``.
     """
     decoded_meta = json.loads(consumer_input.get_meta())
     return consumer_input.new(meta=decoded_meta)
 def accept(self, consumer_input: PipedInput):
     """Prefix the text with the title and serialise the metadata.

     The new text is the metadata's ``'title'`` entry, the separator
     ``" . "``, and the original text, concatenated in that order.

     Returns the result of ``consumer_input.new`` with that text and with
     ``meta`` set to the ``json.dumps`` encoding of the metadata.
     """
     title = consumer_input.get_meta()['title']
     prefixed = title + " . " + consumer_input.get_text()
     encoded_meta = json.dumps(consumer_input.get_meta())
     return consumer_input.new(text=prefixed, meta=encoded_meta)
예제 #5
0
 def accept(self, consumer_input: PipedInput):
     """Apply ``self.filter_stopwords`` to both the title and the text.

     The title is replaced on a ``copy(...)`` of the metadata rather than
     on the object returned by ``consumer_input.get_meta()``.

     Returns the result of ``consumer_input.new`` with the filtered text
     and the updated metadata copy.
     """
     updated_meta = copy(consumer_input.get_meta())
     updated_meta["title"] = self.filter_stopwords(updated_meta["title"])
     filtered_text = self.filter_stopwords(consumer_input.get_text())
     return consumer_input.new(text=filtered_text, meta=updated_meta)
예제 #6
0
 def accept(self, consumer_input: PipedInput):
     """Apply ``self.lemmatize`` to both the title and the text.

     The title is replaced on a ``copy(...)`` of the metadata rather than
     on the object returned by ``consumer_input.get_meta()``.

     Returns the result of ``consumer_input.new`` with the lemmatized text
     and the updated metadata copy.
     """
     updated_meta = copy(consumer_input.get_meta())
     updated_meta["title"] = self.lemmatize(updated_meta["title"])
     lemmatized_text = self.lemmatize(consumer_input.get_text())
     return consumer_input.new(text=lemmatized_text, meta=updated_meta)
예제 #7
0
 def accept(self, consumer_input: PipedInput):
     """Decode the JSON metadata and use its ``"url"`` entry as document id.

     Returns the result of ``consumer_input.new`` with ``doc_id`` set to
     the decoded metadata's ``"url"`` value and ``meta`` set to the decoded
     metadata itself.
     """
     decoded_meta = json.loads(consumer_input.get_meta())
     url = decoded_meta["url"]
     return consumer_input.new(doc_id=url, meta=decoded_meta)