from collections import defaultdict

from nltk.tokenize import word_tokenize

import simple  # assumed project-local helper module providing clean_words


def get_word_ngram(text, n=2, clean=False):
    """Count word n-grams in `text`, optionally cleaning the tokens first."""
    ngrams = defaultdict(int)
    words = word_tokenize(text)
    if clean:
        words = simple.clean_words(words)
    # Slide a window of size n over the token list and tally each n-gram.
    for i in range(len(words) - n + 1):
        ng = tuple(words[i:i + n])
        ngrams[ng] += 1
    return ngrams
def get_words(text):
    """Return the raw tokens of `text` along with their cleaned counterparts."""
    words = word_tokenize(text)
    clean_words = simple.clean_words(words)
    return words, clean_words
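
# Illustrative usage sketch (not part of the original module): counts bigrams in a
# short sentence. Assumes NLTK tokenizer data (e.g. "punkt") has been downloaded.
if __name__ == "__main__":
    sample = "the quick brown fox jumps over the lazy dog"
    bigrams = get_word_ngram(sample, n=2)
    print(bigrams[("quick", "brown")])  # -> 1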