Exemplo n.º 1
0
def extract_word_frequencies(text, is_markdown=True):
    """Parse given text and return a dictionary of word frequencies.

    The text is optionally stripped of markdown, then has its contractions
    expanded and punctuation removed, and is lowercased before being
    tokenized. Every token is lemmatized; a lemma is counted only if it is
    longer than two characters and is not an English stopword.

    :param text: the text to analyse
    :param is_markdown: if True, parse text from markdown
    :returns: a ``defaultdict(int)`` mapping each counted lemma to the
        number of times it occurred
    """
    word_freqs = defaultdict(int)

    if is_markdown:
        text = strip_markdown(text)

    # expand contractions
    text = contraction_expander.replace(text)

    # remove remaining punctuation
    text = strip_punct(text)

    # convert text to lowercase
    text = text.lower()

    tokens = wordpunct_tokenize(text)
    if not tokens:
        # Nothing to count, so skip fetching the stopword list entirely.
        return word_freqs

    # Loop invariants: fetch the stopword list and build the lemmatizer once
    # instead of once per token. A frozenset gives constant-time membership
    # tests in place of a linear scan of the list.
    lemmatizer = WordNetLemmatizer()
    english_stopwords = frozenset(stopwords.words('english'))

    for token in tokens:
        # reduce the token to its lemma (this is lemmatization, not stemming)
        lemma = lemmatizer.lemmatize(token)

        # cheap length check first, then the stopword lookup
        if len(lemma) > 2 and lemma not in english_stopwords:
            word_freqs[lemma] += 1

    return word_freqs
Exemplo n.º 2
0
def test_strip_punct():
    """Check ``utils.strip_punct`` against the expected output for MOCK_TEXT_3."""
    # The expected text is assembled from two adjacent literals; the second
    # one carries the dash characters and ends with two spaces.
    want = (
        "I cant believe that she was a sea shell seller "
        u"—— I really cant  "
    )
    got = utils.strip_punct(MOCK_TEXT_3)
    assert got == want