示例#1
0
def get_word_ngram(text, n=2, clean=False):
    """Count word n-gram frequencies in *text*.

    Tokenizes *text* with ``word_tokenize``, optionally normalizes the
    tokens via ``simple.clean_words``, then tallies every contiguous
    run of *n* tokens.

    :param text: raw input string to tokenize.
    :param n: n-gram size (default 2, i.e. bigrams).
    :param clean: when True, pass tokens through ``simple.clean_words``
        before counting.
    :returns: ``defaultdict(int)`` mapping each n-gram (a tuple of
        tokens) to its occurrence count; empty when the text yields
        fewer than *n* tokens.
    """
    words = word_tokenize(text)
    if clean:
        words = simple.clean_words(words)
    counts = defaultdict(int)
    # Slide a window of width n over the token list; range is empty
    # (and we return no entries) when len(words) < n.
    last_start = len(words) - n
    for start in range(last_start + 1):
        counts[tuple(words[start:start + n])] += 1
    return counts
示例#2
0
def get_word_ngram(text, n=2, clean=False):
    """Return a frequency table of the word n-grams of *text*.

    :param text: input string; tokenized with ``word_tokenize``.
    :param n: number of consecutive tokens per n-gram (default 2).
    :param clean: if True, tokens are filtered through
        ``simple.clean_words`` before n-grams are formed.
    :returns: ``defaultdict(int)`` keyed by token tuples of length *n*,
        valued by occurrence counts.
    """
    tokens = word_tokenize(text)
    if clean:
        tokens = simple.clean_words(tokens)
    freq = defaultdict(int)
    # Walk the token stream; stop once a full window of n tokens no
    # longer fits (handles the short-text case with no iterations).
    i = 0
    while i + n <= len(tokens):
        freq[tuple(tokens[i:i + n])] += 1
        i += 1
    return freq
示例#3
0
def get_words(text):
    """Tokenize *text* and return both the raw and cleaned token lists.

    :param text: input string to tokenize with ``word_tokenize``.
    :returns: tuple ``(raw_tokens, cleaned_tokens)`` where the cleaned
        list is produced by ``simple.clean_words``.
    """
    raw = word_tokenize(text)
    return raw, simple.clean_words(raw)
示例#4
0
def get_words(text):
    """Return the tokens of *text* paired with their cleaned form.

    :param text: input string; split into tokens via ``word_tokenize``.
    :returns: 2-tuple of (tokens as produced by the tokenizer,
        the same tokens after ``simple.clean_words``).
    """
    tokens = word_tokenize(text)
    cleaned = simple.clean_words(tokens)
    return (tokens, cleaned)