def test_fit_with_df_input_without_column_arg(self, example_train_df): """ In case we give no column argument to the initalizer, the input during fit should be a pd.Series. Otherwise raise TypeError. """ transformer = PandasTfidfVectorizer() with pytest.raises(TypeError): transformer.fit(example_train_df)
def test_missing_values_fit(self, example_missing_values_df): """ Tests the case where there are missing values in the training data. Should return a ValueError. """ transformer = PandasTfidfVectorizer(column="text") with pytest.raises(ValueError): transformer.fit(example_missing_values_df)
def test_fit_with_series_input_with_column_arg(self, example_series): """ In case we do give a value for the column keyword argument, the input should be a pd.DataFrame. Otherwise, return a TypeError. """ transformer = PandasTfidfVectorizer(column="text") with pytest.raises(TypeError): transformer.fit(example_series)
def test_example(self, example_train_df): """ Tests a simple example. """ transformer = PandasTfidfVectorizer(column="text") transformer.fit(example_train_df) transformed = transformer.transform(example_train_df) expected = pd.DataFrame({ "num": pd.Series([3, 4, 4]), "animal": pd.Series([0.0, 1.0, 0.0]), "house": pd.Series([1.0, 0.0, 1.0]), }) # The column order shouldnt matter (therefore we sort them) pd.testing.assert_frame_equal(transformed.sort_index(axis=1), expected.sort_index(axis=1))
def test_series_input(self, example_series): """ In case we don't give a value for the column keyword argument, the input should be a pandas series or np.ndarray. Otherwise, return a TypeError. """ transformer = PandasTfidfVectorizer() transformer.fit(example_series) transformed = transformer.transform(example_series) expected = pd.DataFrame({ "animal": pd.Series([0.0, 1.0, 0.0]), "house": pd.Series([1.0, 0.0, 1.0]), }) pd.testing.assert_frame_equal(transformed.sort_index(axis=1), expected.sort_index(axis=1))
def test_clone(self): """ Test clone """ transformer = PandasTfidfVectorizer(column="test", max_features=123) cloned = clone(transformer) assert transformer.column == cloned.column assert transformer.max_features == cloned.max_features
def test_grid_search(self, example_train_df_binary): """Tests for grid search compatibility.""" pipe = Pipeline([("tfidf", PandasTfidfVectorizer()), ("model", LogisticRegression())]) param_grid = { "tfidf__max_features": [5, 15], } X = example_train_df_binary["text"] y = example_train_df_binary["y"] search = GridSearchCV(pipe, param_grid) search.fit(X, y)
def test_missing_column(self, example_train_df, example_test_df_diff_column): """ Test transformer when test set does not have the required columns. In that case, it should return a KeyError """ transformer = PandasTfidfVectorizer(column="text") transformer.fit(example_train_df) with pytest.raises(KeyError): transformer.transform(example_test_df_diff_column)