def __get_keywords_from_text(text):
    """Count the multi-character tokens of *text*, most frequent first.

    The text is passed through ``ViTokenizer.tokenize`` and the result is
    then split with ``ViTokenizer.spacy_tokenize``; only the first element
    of that call's return value is used (presumably the list of words --
    confirm against the pyvi API).

    Tokens whose length is 1 or less are discarded before counting.

    Returns:
        A plain ``dict`` mapping token -> occurrence count, with entries
        inserted in order of descending count. Tokens with equal counts
        keep their first-seen order.
    """
    segmented = ViTokenizer.tokenize(text)
    words = ViTokenizer.spacy_tokenize(segmented)[0]
    # Single-character tokens are dropped before counting.
    frequencies = Counter(word for word in words if len(word) > 1)
    # most_common() sorts by count, descending, with a stable sort, so ties
    # stay in first-seen order.
    return dict(frequencies.most_common())
def make_doc(self, text):
    """Build a ``Doc`` from raw *text*.

    When ``self.Defaults.use_pyvi`` is truthy, the text is segmented with
    ``pyvi.ViTokenizer.spacy_tokenize`` and the resulting words/spaces are
    used directly.

    Otherwise the text is run through ``self.tokenizer`` and every token is
    broken into its individual characters: each character becomes one word,
    and only the last character of a token carries that token's trailing
    whitespace flag.

    Args:
        text: The raw string to convert.

    Returns:
        A ``Doc`` constructed with ``self.vocab`` and the computed
        ``words`` / ``spaces`` lists.

    Raises:
        ImportError: If ``use_pyvi`` is enabled but ``pyvi`` cannot be
            imported.
    """
    if self.Defaults.use_pyvi:
        # Imported lazily so pyvi stays an optional dependency.
        try:
            from pyvi import ViTokenizer
        except ImportError:
            msg = ("Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                   "or install it https://pypi.python.org/pypi/pyvi")
            raise ImportError(msg)
        words, spaces = ViTokenizer.spacy_tokenize(text)
        return Doc(self.vocab, words=words, spaces=spaces)

    # Fallback: character-level segmentation on top of the base tokenizer.
    # The tokenizer is invoked exactly once; the previous implementation
    # called it twice and discarded the first result.
    words = []
    spaces = []
    for token in self.tokenizer(text):
        token_text = token.text
        words.extend(token_text)
        spaces.extend([False] * len(token_text))
        # Only the final character of the token may be followed by a space.
        spaces[-1] = bool(token.whitespace_)
    return Doc(self.vocab, words=words, spaces=spaces)