import tokenization


def test_stop_word_filter():
    # An explicit stop word list is normalized to a set and filtered verbatim.
    stop = tokenization.StopWordFilter(["the", "of"])
    assert isinstance(stop.stop_words_, set)
    assert stop(["and", "the", "they", "of"]) == ["and", "they"]

    # With the "nltk" list and a standardizer, the stop words themselves are
    # stemmed, so "yourselves" is stored as "yourselv" and matches stemmed input.
    standardizer = tokenization.Standardizer("porter_stemmer")
    stop = tokenization.StopWordFilter("nltk", standardizer)
    assert "yourselv" in stop.stop_words_
    assert stop(standardizer("do it yourselves computers".split())) == ["comput"]
def test_standardizer():
    # The Porter stemmer lowercases each token and strips suffixes.
    standardizer = tokenization.Standardizer("porter_stemmer")
    assert standardizer(["EatIng", "Cheese"]) == ["eat", "chees"]
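# --- Hypothetical sketch (assumption): a minimal implementation of the
# `tokenization` module that would satisfy the tests above, built on NLTK's
# PorterStemmer and English stop word corpus. Only the tested surface
# (Standardizer, StopWordFilter, and the stop_words_ attribute) is taken from
# the tests; the internals are an illustrative guess, not the real module.

from nltk.corpus import stopwords  # requires nltk.download("stopwords")
from nltk.stem.porter import PorterStemmer


class Standardizer:
    """Lowercase each token, then stem it with the named algorithm."""

    def __init__(self, algorithm):
        if algorithm != "porter_stemmer":  # only the tested algorithm is sketched
            raise ValueError(f"unsupported algorithm: {algorithm!r}")
        self._stemmer = PorterStemmer()

    def __call__(self, tokens):
        return [self._stemmer.stem(token.lower()) for token in tokens]


class StopWordFilter:
    """Drop stop words given an explicit list or the name of a known corpus."""

    def __init__(self, stop_words, standardizer=None):
        if stop_words == "nltk":
            stop_words = stopwords.words("english")
        if standardizer is not None:
            # Stem the stop words so they match standardized input tokens,
            # e.g. "yourselves" -> "yourselv".
            stop_words = standardizer(stop_words)
        self.stop_words_ = set(stop_words)

    def __call__(self, tokens):
        return [token for token in tokens if token not in self.stop_words_]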