def accept(self, consumer_input: PipedInput):
    """Lemmatize the input text and keep only recognized-language tokens.

    Lowercases the text, runs it through ``self.mystem.lemmatize``
    (presumably a pymystem3 Mystem instance — TODO confirm), strips each
    resulting token, and keeps only tokens that pass one of the
    Russian / Belarusian / English checks.

    Returns a new ``PipedInput`` whose text is the surviving tokens
    joined by single spaces; meta and other fields are untouched.
    """
    tokens = self.mystem.lemmatize(consumer_input.get_text().lower())
    # Comprehension instead of the original append loop (ruff PERF401):
    # strip first, then apply the language filter to the stripped token.
    kept = [
        stripped
        for stripped in (token.strip() for token in tokens)
        if is_russian(stripped) or is_belarusian(stripped) or is_english(stripped)
    ]
    return consumer_input.new(text=" ".join(kept))
def accept(self, consumer_input: PipedInput):
    """Tokenize the lowercased text and stem every recognized token.

    Russian/Belarusian tokens are stemmed with ``self.russian_stemmer``,
    English tokens with ``self.english_stemmer``; tokens matching
    neither language check are dropped.  Returns a new ``PipedInput``
    with the stemmed tokens joined by single spaces.
    """
    lowered = consumer_input.get_text().lower()
    stems = []
    for word in word_tokenize(lowered):
        word = word.strip()
        # Two independent checks on purpose: a token that somehow
        # passes both produces two stems, matching the original logic.
        if is_russian(word) or is_belarusian(word):
            stems.append(self.russian_stemmer.stem(word))
        if is_english(word):
            stems.append(self.english_stemmer.stem(word))
    return consumer_input.new(text=" ".join(stems))
def accept(self, consumer_input: PipedInput):
    """Deserialize the raw (JSON string) meta field into a Python object."""
    parsed_meta = json.loads(consumer_input.get_meta())
    return consumer_input.new(meta=parsed_meta)
def accept(self, consumer_input: PipedInput):
    """Prepend the meta title to the document text.

    Produces ``"<title> . <text>"`` as the new text and re-serializes
    the meta mapping back to a JSON string.
    """
    title = consumer_input.get_meta()["title"]
    combined = title + " . " + consumer_input.get_text()
    serialized_meta = json.dumps(consumer_input.get_meta())
    return consumer_input.new(text=combined, meta=serialized_meta)
def accept(self, consumer_input: PipedInput):
    """Strip stopwords from both the document text and the meta title.

    Shallow-copies the meta mapping so the incoming input's meta is not
    mutated, runs ``self.filter_stopwords`` over its ``title`` and over
    the main text, and returns a new ``PipedInput`` carrying both.
    """
    filtered_meta = copy(consumer_input.get_meta())
    filtered_meta["title"] = self.filter_stopwords(filtered_meta["title"])
    filtered_text = self.filter_stopwords(consumer_input.get_text())
    return consumer_input.new(text=filtered_text, meta=filtered_meta)
def accept(self, consumer_input: PipedInput):
    """Lemmatize both the document text and the meta title.

    Shallow-copies the meta mapping so the incoming input's meta stays
    untouched, applies ``self.lemmatize`` to its ``title`` and to the
    main text, and returns a new ``PipedInput`` carrying both.
    """
    lemmatized_meta = copy(consumer_input.get_meta())
    lemmatized_meta["title"] = self.lemmatize(lemmatized_meta["title"])
    lemmatized_text = self.lemmatize(consumer_input.get_text())
    return consumer_input.new(text=lemmatized_text, meta=lemmatized_meta)
def accept(self, consumer_input: PipedInput):
    """Parse the JSON meta and promote its "url" field to the document id."""
    parsed = json.loads(consumer_input.get_meta())
    # The URL doubles as a unique document identifier downstream —
    # presumably guaranteed present in meta; verify against producers.
    return consumer_input.new(doc_id=parsed["url"], meta=parsed)