def test_get_standardizing_inverse():
    """Check a few entries of the standardizing inverse for VOCABULARY_FILE.

    The inverse is built with ``standardize_text`` configured to use
    ``stemming="porter_stemmer"``; each standardized key below is expected
    to map back to the listed vocabulary entry.
    """

    def stem_and_standardize(raw_text):
        return tokenization.standardize_text(
            raw_text, stemming="porter_stemmer"
        )

    inverse = tokenization.get_standardizing_inverse(
        VOCABULARY_FILE, stem_and_standardize
    )

    # Standardized form -> expected original vocabulary entry.
    expected_pairs = (
        ("memori", "memory"),
        ("work memori", "working memory"),
        ("nerv", "nerves"),
    )
    for standardized, original in expected_pairs:
        assert inverse[standardized] == original
def test_standardize_text():
    """Check ``standardize_text`` with default arguments on a messy sample.

    The expected result is lowercase, omits "a" and "the", and has the
    punctuation and mixed whitespace of the input replaced by single spaces.
    """
    raw = "One a the Word abcd-eft: --\nhello\t 1240"
    expected = "one word abcd eft hello 1240"

    standardized = tokenization.standardize_text(raw)

    assert standardized == expected