def test_preprocess_words_gives_bigrams():
    """With bigrams=True, adjacent words are joined into space-separated pairs."""
    result = preprocess_words(
        ["alright", "welcome", "everyone"],
        noop_stemmer(),
        noop_lemmatizer(),
        [],
        bigrams=True,
    )
    assert set(result) == {"alright welcome", "welcome everyone"}
def test_preprocess_words_stems_words():
    """Every word is passed through the stemmer's stem() method."""
    fake_stemmer = Mock()
    # Map exactly one word so the assertion proves stem() was applied per-word.
    fake_stemmer.stem = lambda word: "1" if word == "aaaa" else word
    result = preprocess_words(["aaaa", "bbbb"], fake_stemmer, noop_lemmatizer(), [])
    assert list(result) == ["1", "bbbb"]
def test_preprocess_words_lemmatizes_words():
    """Every word is passed through the lemmatizer's lemmatize() method."""
    fake_lemmatizer = Mock()
    # Map exactly one word so the assertion proves lemmatize() was applied per-word.
    fake_lemmatizer.lemmatize = lambda word, **kwargs: "a" if word == "bbbb" else word
    result = preprocess_words(["bbbb", "dddd"], noop_stemmer(), fake_lemmatizer, [])
    assert list(result) == ["a", "dddd"]
def test_preprocess_words_removes_stopwords():
    """Words appearing in the stopword list are filtered out."""
    result = preprocess_words(
        ["Test", "YOLO"], noop_stemmer(), noop_lemmatizer(), ["YOLO"]
    )
    assert list(result) == ["Test"]
def test_preprocess_words_removes_lt_3_char_words():
    """Words shorter than three characters are dropped from the output."""
    result = preprocess_words(["help", "me"], noop_stemmer(), noop_lemmatizer(), [])
    assert list(result) == ["help"]
def test_preprocess_words_handles_getting_bigrams_from_empty_word_list():
    """Bigram mode on an empty word list yields an empty result, not an error."""
    result = preprocess_words([], noop_stemmer(), noop_lemmatizer(), [], bigrams=True)
    assert list(result) == []