Пример #1
0
 def get_stopwords(self):
     """Return the stop words for the configured language.

     The language code is read from ``self.args`` under the ``'lang_code'``
     key (``None`` is passed on if the key is absent) and handed to
     ``StopWords.get_stop_words``.
     """
     return StopWords.get_stop_words(self.args.get('lang_code'))
Пример #2
0
            grams = self.generate_ngrams(tokens, 3)
            grams.extend(self.generate_ngrams(tokens, 2))
            for word in grams:
                if word not in stop_tokens:
                    doc_grams.append(space_join(word))

        pattern = r'<VERB>?<ADV>*<VERB>+'
        doc = textacy.Doc(sentence, lang=model)
        lists = textacy.extract.pos_regex_matches(doc, pattern)
        verbs_list = []
        for l in lists:
            verb_tokens = l.lemma_.split()
            for verb in verb_tokens:
                if verb not in stop_tokens and self.is_valid_word(verb):
                    verbs_list.append(verb)
        return doc_grams, unigrams, verbs_list

    def generate_ngrams(self, tokens, n):
        """Return the n-grams of ``tokens`` as a list.

        Delegates to the module-level ``ngrams`` helper and materializes
        its result so callers get a list they can extend or re-iterate.
        NOTE(review): ``ngrams`` is imported outside the visible portion of
        this file (presumably ``nltk.util.ngrams``) - confirm.
        """
        gram_iter = ngrams(tokens, n)
        return list(gram_iter)


if __name__ == "__main__":
    # Manual smoke run: normalize one sample question and print whatever
    # PhraseFinder.find_phrases returns for it.
    from StopWords import StopWords
    from StringProcessor import StringProcessor

    sample_question = 'How does the e-monies NEFT service differ from RGTS and EFT?'

    # Normalize the raw text and load the stop words, both for English.
    normalized_question = StringProcessor().normalize(sample_question, 'en')
    english_stop_words = StopWords.get_stop_words('en')

    finder = PhraseFinder()
    print(finder.find_phrases(normalized_question, english_stop_words))