def test_fit_with_df_input_without_column_arg(self, example_train_df):
        """Fitting on a DataFrame without a ``column`` argument must fail.

        When the vectorizer is created without a ``column`` argument, ``fit``
        expects a ``pd.Series``; handing it a DataFrame instead should raise
        ``TypeError``.
        """
        # Built outside the ``raises`` block so that only ``fit`` is checked.
        vectorizer = PandasTfidfVectorizer()

        with pytest.raises(TypeError):
            vectorizer.fit(example_train_df)
    def test_missing_values_fit(self, example_missing_values_df):
        """Fitting on training data with missing values must fail.

        The ``example_missing_values_df`` fixture supplies the training frame
        (presumably with missing entries in its ``text`` column); ``fit`` is
        expected to raise ``ValueError`` for it.
        """
        # Built outside the ``raises`` block so that only ``fit`` is checked.
        vectorizer = PandasTfidfVectorizer(column="text")

        with pytest.raises(ValueError):
            vectorizer.fit(example_missing_values_df)
    def test_fit_with_series_input_with_column_arg(self, example_series):
        """Fitting on a Series while ``column`` is set must fail.

        When a value is given for the ``column`` keyword argument, ``fit``
        expects a ``pd.DataFrame``; handing it a Series instead should raise
        ``TypeError``.
        """
        # Built outside the ``raises`` block so that only ``fit`` is checked.
        vectorizer = PandasTfidfVectorizer(column="text")

        with pytest.raises(TypeError):
            vectorizer.fit(example_series)
    def test_example(self, example_train_df):
        """Fit and transform the same small frame and compare the output.

        The expected result holds a ``num`` column next to one column each
        for ``animal`` and ``house``.
        """
        vectorizer = PandasTfidfVectorizer(column="text")
        vectorizer.fit(example_train_df)
        actual = vectorizer.transform(example_train_df)

        expected_columns = {
            "num": pd.Series([3, 4, 4]),
            "animal": pd.Series([0.0, 1.0, 0.0]),
            "house": pd.Series([1.0, 0.0, 1.0]),
        }
        expected = pd.DataFrame(expected_columns)

        # Column order is not part of the contract, so both frames are
        # sorted by column label before being compared.
        actual_sorted = actual.sort_index(axis=1)
        expected_sorted = expected.sort_index(axis=1)
        pd.testing.assert_frame_equal(actual_sorted, expected_sorted)
    def test_series_input(self, example_series):
        """Fit and transform a Series when no ``column`` argument is given.

        Without a ``column`` argument the vectorizer is fitted on, and applied
        to, the series directly. The expected result is a frame with one
        column each for ``animal`` and ``house``.
        """
        vectorizer = PandasTfidfVectorizer()
        vectorizer.fit(example_series)
        actual = vectorizer.transform(example_series)

        expected_columns = {
            "animal": pd.Series([0.0, 1.0, 0.0]),
            "house": pd.Series([1.0, 0.0, 1.0]),
        }
        expected = pd.DataFrame(expected_columns)

        # Both frames are sorted by column label so that the comparison does
        # not depend on column order.
        actual_sorted = actual.sort_index(axis=1)
        expected_sorted = expected.sort_index(axis=1)
        pd.testing.assert_frame_equal(actual_sorted, expected_sorted)
    def test_clone(self):
        """Check that ``clone`` carries over the constructor parameters.

        A vectorizer built with explicit ``column`` and ``max_features``
        values is cloned; the copy must expose the same value for each.
        """
        original = PandasTfidfVectorizer(column="test", max_features=123)
        duplicate = clone(original)

        for attribute in ("column", "max_features"):
            assert getattr(original, attribute) == getattr(duplicate, attribute)
    def test_grid_search(self, example_train_df_binary):
        """Check that the vectorizer can be tuned inside a ``GridSearchCV``.

        The vectorizer (created without a ``column`` argument) is the first
        step of a pipeline that ends in a ``LogisticRegression``; the grid
        varies ``max_features`` of the vectorizer. The ``text`` column of the
        fixture is used as input and its ``y`` column as target.

        Besides running the search without an error, the test verifies that
        both candidates were evaluated and that the selected ``max_features``
        comes from the grid.
        """
        pipe = Pipeline([("tfidf", PandasTfidfVectorizer()),
                         ("model", LogisticRegression())])
        max_features_grid = [5, 15]
        param_grid = {
            "tfidf__max_features": max_features_grid,
        }

        X = example_train_df_binary["text"]
        y = example_train_df_binary["y"]

        search = GridSearchCV(pipe, param_grid)
        search.fit(X, y)

        # Without these checks the test would pass as long as ``fit`` does
        # not raise, even if the parameter grid was never applied.
        assert len(search.cv_results_["params"]) == len(max_features_grid)
        assert search.best_params_["tfidf__max_features"] in max_features_grid
    def test_missing_column(self, example_train_df,
                            example_test_df_diff_column):
        """Transforming a frame that lacks the fitted column must fail.

        The vectorizer is fitted on the ``text`` column of the training
        frame. Transforming a test frame that does not have the required
        column should then raise ``KeyError``.
        """
        vectorizer = PandasTfidfVectorizer(column="text")
        # Fitting happens outside the ``raises`` block: only ``transform``
        # is expected to fail.
        vectorizer.fit(example_train_df)

        with pytest.raises(KeyError):
            vectorizer.transform(example_test_df_diff_column)