def extract_word_frequencies(text, is_markdown=True):
    """Parse given text and return a dictionary of word frequencies.

    The text is passed through ``strip_markdown`` (only when
    ``is_markdown`` is true), ``contraction_expander.replace`` and
    ``strip_punct``, then lower-cased and tokenized with
    ``wordpunct_tokenize``.  Each token is lemmatized with
    ``WordNetLemmatizer``; a lemma is counted only if it is not in the
    English stopword list and is longer than two characters.

    :param text: the text to analyse
    :param is_markdown: if True, parse text from markdown
    :returns: a ``defaultdict(int)`` mapping each counted lemma to the
        number of times it occurred
    """
    word_freqs = defaultdict(int)

    if is_markdown:
        text = strip_markdown(text)

    # expand contractions
    text = contraction_expander.replace(text)

    # remove remaining punctuation
    text = strip_punct(text)

    # convert text to lowercase
    text = text.lower()

    tokens = wordpunct_tokenize(text)
    if not tokens:
        # Nothing to count; skip building the lookup helpers below.
        return word_freqs

    # Build the lemmatizer and the stopword lookup once instead of once per
    # token; a frozenset gives constant-time membership tests.
    lemmatizer = WordNetLemmatizer()
    english_stopwords = frozenset(stopwords.words('english'))

    for token in tokens:
        # lemmatize token (reduce it to its WordNet base form)
        lemma = lemmatizer.lemmatize(token)
        if lemma not in english_stopwords and len(lemma) > 2:
            word_freqs[lemma] += 1

    return word_freqs
def test_strip_punct():
    """Check ``utils.strip_punct`` output for ``MOCK_TEXT_3``."""
    actual = utils.strip_punct(MOCK_TEXT_3)
    # Two adjacent literals are kept separate so the second retains its
    # explicit unicode prefix.
    want = (
        "I cant believe that she was a sea shell seller "
        u"—— I really cant "
    )
    assert actual == want