def test_count_vectorizer_fit_transform():
    """fit_transform on two token lists returns one index->count dict per document.

    The first expected dict also carries an explicit zero at the last
    vocabulary index (presumably to pin the output dimensionality -- confirm
    against CountVectorizer.transform).
    """
    vectorizer = CountVectorizer((1, 2))
    counts = vectorizer.fit_transform([["a", "b"], ["b", "c"]])

    vocab = vectorizer.vocabulary
    assert vocab

    # Key order mirrors the original literal on purpose: should the last
    # index ever coincide with an earlier key, the later entry must still win.
    expected_first = {
        vocab["a"]: 1,
        vocab["b"]: 1,
        vocab["a b"]: 1,
        len(vocab) - 1: 0,
    }
    expected_second = {
        vocab["b"]: 1,
        vocab["c"]: 1,
        vocab["b c"]: 1,
    }
    assert counts == [expected_first, expected_second]
def test_naive_bayes_from_file(tmp_path):
    """A scorer pickled into a bz2 file can be loaded via from_model_file."""
    model_file = tmp_path / "model.pkl"
    pipeline = CTParsePipeline(CountVectorizer((1, 1)), MultinomialNaiveBayes())
    scorer = NaiveBayesScorer(pipeline)

    # Write the pickled scorer through a bz2-compressed stream.
    with bz2.open(model_file, "w") as handle:
        pickle.dump(scorer, handle)

    restored = NaiveBayesScorer.from_model_file(model_file)
    assert restored
def train_naive_bayes(X: Sequence[Sequence[str]], y: Sequence[bool]) -> CTParsePipeline:
    """Train a naive bayes model for NaiveBayesScorer.

    Args:
        X: token sequences, one per sample.
        y: one label per sample; truthy labels are encoded as 1 and falsy
            labels as -1 before fitting.

    Returns:
        Whatever ``CTParsePipeline.fit`` returns for the assembled pipeline.
    """
    # Encode the labels as +1 / -1 first, exactly as the pipeline is fed.
    signed_labels = [1 if label else -1 for label in y]

    # Count n-grams of length 1 to 3 and classify with alpha=1.0.
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    classifier = MultinomialNaiveBayes(alpha=1.0)
    return CTParsePipeline(vectorizer, classifier).fit(X, signed_labels)
def test_save_naive_bayes(tmp_path):
    """Smoke test: saving a freshly built, unfitted pipeline must not raise."""
    pipeline = CTParsePipeline(CountVectorizer((1, 1)), MultinomialNaiveBayes())
    target = tmp_path / "model.pkl"
    save_naive_bayes(pipeline, target)
def test_count_vectorizer_transform_no_fit():
    """Calling transform on a vectorizer that was never fitted raises ValueError."""
    unfitted = CountVectorizer((1, 2))
    documents = [["a"]]
    with pytest.raises(ValueError):
        unfitted.transform(documents)
def test_count_vectorizer_fit_and_transform():
    """fit followed by transform maps a known token to its vocabulary index.

    The expected dict also carries an explicit zero at the last vocabulary
    index (presumably to pin the output dimensionality -- confirm against
    CountVectorizer.transform).
    """
    cv = CountVectorizer((1, 2))
    cv = cv.fit([["a", "b", "c"], ["c", "d"]])
    assert cv.vocabulary

    # Derive the last index from the fitted vocabulary instead of hard-coding
    # 6, so the expectation stays correct if the fixture documents change.
    last_index = len(cv.vocabulary) - 1
    assert cv.transform([["b"]]) == [{cv.vocabulary["b"]: 1, last_index: 0}]
def test_ngrams(ngrams, doc, result):
    """_create_ngrams on a single wrapped document yields ``[result]``.

    NOTE(review): ngrams/doc/result are presumably supplied by a parametrize
    decorator that is not visible here -- confirm.
    """
    produced = CountVectorizer._create_ngrams(ngrams, [doc])
    expected = [result]
    assert produced == expected