Пример #1
0
    def test_convert_column_id_to_name(self, col_id_list,
                                       expected_columns_name):
        """
        Check that column ids are converted to the expected column names.

        The container type of the result is compared with the type of
        ``expected_columns_name`` so that the type assertion can fail.
        """
        dataset = Dataset(df_object=DataFrameMock.df_multi_type(10))

        columns_name = dataset.convert_column_id_to_name(col_id_list)

        # ``assert type(x)`` is always truthy and therefore verified nothing;
        # compare against the container type of the expected value instead.
        assert isinstance(columns_name, type(expected_columns_name))
        assert columns_name == expected_columns_name
Пример #2
0
    def test_get_categorical_cols(self, request, sample_size,
                                  expected_categ_cols):
        """
        Check ``_get_categorical_cols`` on all columns of the mock dataframe.

        The dataframe is built by ``DataFrameMock.df_categorical_cols`` and
        the result must be a set equal to ``expected_categ_cols``.
        """
        # Arrange
        categorical_df = DataFrameMock.df_categorical_cols(sample_size)
        dataset = Dataset(df_object=categorical_df)

        # Act
        found_cols = dataset._get_categorical_cols(
            col_list=categorical_df.columns)

        # Assert
        assert isinstance(found_cols, set)
        assert found_cols == expected_categ_cols
Пример #3
0
    def test_nan_columns(self, request, nan_ratio, n_columns,
                         expected_nan_columns):
        """
        Check ``nan_columns`` with a threshold just below ``nan_ratio``.

        The dataframe comes from ``DataFrameMock.df_many_nans`` and the
        method is queried with ``nan_ratio - 0.01``.
        """
        # Arrange
        threshold = nan_ratio - 0.01
        dataset = Dataset(
            df_object=DataFrameMock.df_many_nans(nan_ratio, n_columns))

        # Act
        found = dataset.nan_columns(threshold)

        # Assert
        assert len(found) == len(expected_nan_columns)
        assert isinstance(found, set)
        assert found == expected_nan_columns
Пример #4
0
    def test_contains_duplicated_features(self, request, duplicated_cols_count,
                                          expected_contains_dupl_cols_bool):
        """
        Check the boolean returned by ``check_duplicated_features``.

        The dataframe is built by ``DataFrameMock.df_duplicated_columns``
        with ``duplicated_cols_count``; the result must be a ``bool``
        identical to the expected flag.
        """
        # Arrange
        mock_df = DataFrameMock.df_duplicated_columns(duplicated_cols_count)
        dataset = Dataset(df_object=mock_df)

        # Act
        has_duplicates = dataset.check_duplicated_features()

        # Assert
        assert isinstance(has_duplicates, bool)
        assert has_duplicates is expected_contains_dupl_cols_bool
Пример #5
0
def dataset_with_operations(fillna_col0_col1, fillna_col1_col4) -> Dataset:
    """
    Build a Dataset on which two operations have already been tracked.

    Parameters
    ----------
    fillna_col0_col1
        First operation passed to ``track_history``.
    fillna_col1_col4
        Second operation passed to ``track_history``.

    Returns
    -------
    Dataset
        Dataset built from ``DataFrameMock.df_generic(10)`` after
        ``track_history`` has been called with both operations, in order.
    """
    tracked_dataset = Dataset(df_object=DataFrameMock.df_generic(10))

    for operation in (fillna_col0_col1, fillna_col1_col4):
        tracked_dataset.track_history(operation)

    return tracked_dataset
Пример #6
0
    def it_can_apply_fillna(self, request, columns, derived_columns,
                            expected_new_columns, expected_inplace):
        """
        Check ``FillNA._apply`` with ``pd.Series.fillna`` mocked.

        ``trousse.dataset.get_df_from_csv`` is patched to return the mock
        dataframe. The test verifies that a distinct Dataset is returned,
        that the expected columns are present, and how ``fillna`` was called.
        """
        # Arrange
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = nan_df
        dataset = Dataset(data_file="fake/path0")
        fillna_mock_ = method_mock(request, pd.Series, "fillna")
        fillna_mock_.return_value = pd.Series([0] * 100)
        operation = fop.FillNA(columns=columns,
                               derived_columns=derived_columns,
                               value=0)

        # Act
        result = operation._apply(dataset)

        # Assert: a new Dataset holding the expected columns is returned
        assert result is not None
        assert result is not dataset
        assert isinstance(result, Dataset)
        for expected_col in expected_new_columns:
            assert expected_col in result.data.columns
        csv_loader_.assert_called_once_with("fake/path0")

        # Assert: ``fillna`` is called once per column; inspect the first call
        fillna_calls = fillna_mock_.call_args_list
        assert len(fillna_calls) == len(columns)
        first_args, first_kwargs = fillna_calls[0]
        pd.testing.assert_series_equal(first_args[0], nan_df[columns[0]])
        assert first_kwargs == {"inplace": expected_inplace}
Пример #7
0
    def it_can_apply_replace_strings(self, request, columns, derived_columns,
                                     expected_new_columns, expected_inplace):
        """
        Check ``ReplaceSubstrings._apply`` with ``Series.str.replace`` mocked.

        ``trousse.dataset.get_df_from_csv`` is patched to return the mock
        dataframe. The test verifies that a distinct Dataset is returned,
        that the expected columns are present, and the arguments of the
        first ``replace`` call.
        """
        # Arrange
        source_df = DataFrameMock.df_generic(sample_size=100)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = source_df
        dataset = Dataset(data_file="fake/path0")
        str_replace_ = function_mock(request, "pandas.Series.str.replace")
        str_replace_.return_value = pd.Series([0] * 100)
        operation = fop.ReplaceSubstrings(
            columns=columns,
            derived_columns=derived_columns,
            replacement_map={"a": "b"},
        )

        # Act
        result = operation._apply(dataset)

        # Assert: a new Dataset holding the expected columns is returned
        assert result is not None
        assert result is not dataset
        assert isinstance(result, Dataset)
        for expected_col in expected_new_columns:
            assert expected_col in result.data.columns
        csv_loader_.assert_called_once_with("fake/path0")

        # Assert: one ``replace`` call per column; inspect the first call
        replace_calls = str_replace_.call_args_list
        assert len(replace_calls) == len(columns)
        first_args, first_kwargs = replace_calls[0]
        pd.testing.assert_series_equal(first_args[0][:],
                                       source_df[columns[0]])
        assert first_kwargs == {"pat": "a", "repl": "b"}
Пример #8
0
    def it_can_apply_ordinal_encoder(
        self,
        request,
        columns,
        derived_columns,
        expected_new_columns,
    ):
        """
        Check ``OrdinalEncoder._apply`` with sklearn ``fit_transform`` mocked.

        ``trousse.dataset.get_df_from_csv`` is patched to return the mock
        dataframe. The test verifies that a distinct Dataset is returned,
        that the expected columns are present, and the frame passed to the
        first ``fit_transform`` call.
        """
        # Arrange
        source_df = DataFrameMock.df_generic(sample_size=100)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = source_df
        dataset = Dataset(data_file="fake/path0")
        fit_transform_ = method_mock(request, sk_preproc.OrdinalEncoder,
                                     "fit_transform")
        fit_transform_.return_value = pd.Series(range(100))
        operation = fop.OrdinalEncoder(columns=columns,
                                       derived_columns=derived_columns)

        # Act
        result = operation._apply(dataset)

        # Assert: a new Dataset holding the expected columns is returned
        assert result is not None
        assert result is not dataset
        assert isinstance(result, Dataset)
        for expected_col in expected_new_columns:
            assert expected_col in result.data.columns
        csv_loader_.assert_called_once_with("fake/path0")

        # Assert: one ``fit_transform`` call per column; the checked frame is
        # the positional argument at index 1 of the first call
        fit_calls = fit_transform_.call_args_list
        assert len(fit_calls) == len(columns)
        first_positional = fit_calls[0][0]
        pd.testing.assert_frame_equal(first_positional[1],
                                      source_df[[columns[0]]])
Пример #9
0
    def it_knows_its_operations_history(self, request):
        """
        Check that a newly created Dataset exposes an empty history.

        ``trousse.dataset.get_df_from_csv`` is replaced through
        ``function_mock`` before the Dataset is created.
        """
        # Arrange
        function_mock(request, "trousse.dataset.get_df_from_csv")
        new_dataset = Dataset(data_file="fake/path")

        # Act
        operations = new_dataset.operations_history

        # Assert
        assert isinstance(operations, OperationsList)
        assert len(operations) == 0
Пример #10
0
    def it_knows_its_metadata_cols(self, metadata_cols):
        """
        Check that ``metadata_cols`` is exposed as a set.

        The Dataset is built from ``DataFrameMock.df_multi_type(10)`` and
        the property must equal ``set(metadata_cols)``.
        """
        df = DataFrameMock.df_multi_type(10)
        dataset = Dataset(df_object=df, metadata_cols=metadata_cols)

        metadata_cols_ = dataset.metadata_cols

        # ``isinstance`` is the idiomatic type check and matches the other
        # set-returning tests in this file.
        assert isinstance(metadata_cols_, set)
        assert metadata_cols_ == set(metadata_cols)
Пример #11
0
def test_ordinal_encode_column(csv, column, derived_column, expected_csv):
    """
    Check ``ffx._ordinal_encode_column`` against a stored expectation.

    The encoded dataframe must equal the frame loaded from ``expected_csv``
    and the returned new columns must equal ``derived_column``.
    """
    # Arrange
    source = Dataset(data_file=csv)
    expected_frame = load_expectation(expected_csv, type_="csv")

    # Act: the helper returns three values; the middle one is not checked
    result_frame, _unused, created_cols = ffx._ordinal_encode_column(
        source.data, column, False)

    # Assert
    pd.testing.assert_frame_equal(result_frame, expected_frame)
    assert derived_column == created_cols
Пример #12
0
    def it_knows_how_to_track_history(self, request, metadata_cols,
                                      derived_columns, expected_metadata_cols):
        """
        Check ``track_history`` against the expected metadata columns.

        ``OperationsList.__iadd__`` is mocked, so the test only verifies
        that it is called once with the tracked operation.
        """
        # Arrange
        iadd_ = method_mock(request, OperationsList, "__iadd__")
        generic_df = DataFrameMock.df_generic(10)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = generic_df
        dataset = Dataset(data_file="fake/path", metadata_cols=metadata_cols)
        operation = fop.FillNA(
            columns=["metadata_num_col"],
            derived_columns=derived_columns,
            value=0,
        )

        # Act
        dataset.track_history(operation)

        # Assert
        assert dataset.metadata_cols == expected_metadata_cols
        iadd_.assert_called_once_with(ANY, operation)
Пример #13
0
def test_ordinal_encoder(csv, columns, derived_columns, expected_csv):
    """
    Check that calling ``fop.OrdinalEncoder`` yields the expected data.

    The ``data`` of the returned Dataset must equal the frame loaded from
    ``expected_csv``.
    """
    # Arrange
    source = Dataset(data_file=csv)
    expected_frame = load_expectation(expected_csv, type_="csv")
    encoder_op = fop.OrdinalEncoder(
        columns=columns,
        derived_columns=derived_columns,
    )

    # Act
    result = encoder_op(source)

    # Assert
    pd.testing.assert_frame_equal(result.data, expected_frame)
Пример #14
0
    def test_to_be_fixed_cols(self):
        """
        Check that ``to_be_fixed_cols`` reports only ``mixed_type_col``.

        The Dataset is built from ``DataFrameMock.df_multi_type(10)``.
        """
        df = DataFrameMock.df_multi_type(10)
        dataset = Dataset(df_object=df)

        to_be_fixed_cols = dataset.to_be_fixed_cols

        # ``isinstance`` is the idiomatic type check and matches the other
        # set-returning tests in this file.
        assert isinstance(to_be_fixed_cols, set)
        assert len(to_be_fixed_cols) == 1
        assert to_be_fixed_cols == {"mixed_type_col"}
Пример #15
0
def test_replace_substrings(csv, columns, derived_columns, expected_csv):
    """
    Check that calling ``fop.ReplaceSubstrings`` yields the expected data.

    The operation is built with the replacement map ``{"r": "c"}`` and the
    ``data`` of the returned Dataset must equal the frame loaded from
    ``expected_csv``.
    """
    # Arrange
    source = Dataset(data_file=csv)
    expected_frame = load_expectation(expected_csv, type_="csv")
    replace_op = fop.ReplaceSubstrings(
        columns=columns,
        derived_columns=derived_columns,
        replacement_map={"r": "c"},
    )

    # Act
    result = replace_op(source)

    # Assert
    pd.testing.assert_frame_equal(result.data, expected_frame)
Пример #16
0
    def it_can_fillna_with_template_call(self, request):
        """
        Check that calling a ``FillNA`` instance delegates to ``_apply``.

        ``FillNA._apply`` and ``Dataset.track_history`` are mocked. The test
        verifies each is called once and that the value returned by
        ``_apply`` is the one handed back to the caller.
        """
        # Arrange
        apply_mock_ = method_mock(request, fop.FillNA, "_apply")
        track_mock_ = method_mock(request, Dataset, "track_history")
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = nan_df
        source = Dataset(data_file="fake/path0")
        applied = Dataset(data_file="fake/path0")
        apply_mock_.return_value = applied
        operation = fop.FillNA(
            columns=["nan_0"],
            derived_columns=["filled_nan_0"],
            value=0,
        )

        # Act
        result = operation(source)

        # Assert
        apply_mock_.assert_called_once_with(operation, source)
        track_mock_.assert_called_once_with(result, operation)
        assert result is applied
Пример #17
0
    def test_trivial_columns(self, request, n_columns,
                             expected_trivial_columns):
        """
        Check the ``trivial_columns`` property of a Dataset.

        The dataframe is built by ``DataFrameMock.df_trivial(n_columns)``
        and the property must be a set equal to the expected columns.
        """
        # Arrange
        dataset = Dataset(df_object=DataFrameMock.df_trivial(n_columns))

        # Act
        found = dataset.trivial_columns

        # Assert
        assert len(found) == len(expected_trivial_columns)
        assert isinstance(found, set)
        assert found == expected_trivial_columns
Пример #18
0
    def test_constant_columns(self, request, n_columns,
                              expected_constant_columns):
        """
        Check the ``constant_cols`` property of a Dataset.

        The dataframe is built by ``DataFrameMock.df_same_value(n_columns)``
        and the property must be a set equal to the expected columns.
        """
        # Arrange
        dataset = Dataset(df_object=DataFrameMock.df_same_value(n_columns))

        # Act
        found = dataset.constant_cols

        # Assert
        assert len(found) == len(expected_constant_columns)
        assert isinstance(found, set)
        assert found == expected_constant_columns
Пример #19
0
    def it_knows_its_data(self, request):
        """
        Check that ``Dataset.data`` exposes the loaded dataframe.

        ``trousse.dataset.get_df_from_csv`` is patched to return a generic
        mock dataframe, which ``data`` must equal.
        """
        # Arrange
        loaded_df = DataFrameMock.df_generic(10)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = loaded_df
        dataset = Dataset(data_file="fake/path")

        # Act
        exposed = dataset.data

        # Assert
        assert isinstance(exposed, pd.DataFrame)
        pd.testing.assert_frame_equal(exposed, loaded_df)
Пример #20
0
def test_one_hot_encode_column(csv, column, drop_one_new_column,
                               expected_new_cols, expected_csv):
    """
    Check ``ffx._one_hot_encode_column`` against a stored expectation.

    The returned new columns, the encoded dataframe (compared without
    dtype checking) and the type of the returned encoder are verified.
    """
    # Arrange
    source = Dataset(data_file=csv)
    expected_frame = load_expectation(expected_csv, type_="csv")

    # Act
    result_frame, fitted_encoder, created_cols = ffx._one_hot_encode_column(
        source.data, column, drop_one_new_column)

    # Assert
    assert expected_new_cols == created_cols
    pd.testing.assert_frame_equal(result_frame,
                                  expected_frame,
                                  check_dtype=False)
    assert isinstance(fitted_encoder, sk_preproc.OneHotEncoder)
Пример #21
0
    def it_knows_its_feature_cols(self, metadata_cols, feature_cols,
                                  expected_feature_cols):
        """
        Check that ``feature_cols`` is exposed as the expected set.

        The Dataset is built from ``DataFrameMock.df_multi_type(10)`` with
        the given ``metadata_cols`` and ``feature_cols``.
        """
        df = DataFrameMock.df_multi_type(10)
        dataset = Dataset(df_object=df,
                          metadata_cols=metadata_cols,
                          feature_cols=feature_cols)

        feature_cols_ = dataset.feature_cols

        # ``isinstance`` is the idiomatic type check and matches the other
        # set-returning tests in this file.
        assert isinstance(feature_cols_, set)
        assert feature_cols_ == expected_feature_cols
Пример #22
0
    def it_can_encode_with_template_call(self, request):
        """
        Check that calling an ``OrdinalEncoder`` delegates to ``_apply``.

        ``OrdinalEncoder._apply`` and ``Dataset.track_history`` are mocked.
        The test verifies each is called once and that the value returned
        by ``_apply`` is the one handed back to the caller.
        """
        # Arrange
        apply_mock_ = method_mock(request, fop.OrdinalEncoder, "_apply")
        track_mock_ = method_mock(request, Dataset, "track_history")
        generic_df = DataFrameMock.df_generic(sample_size=100)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = generic_df
        source = Dataset(data_file="fake/path0")
        applied = Dataset(data_file="fake/path0")
        apply_mock_.return_value = applied
        operation = fop.OrdinalEncoder(columns=["exam_num_col_0"],
                                       derived_columns=["exam_str_col_0"])

        # Act
        result = operation(source)

        # Assert
        apply_mock_.assert_called_once_with(operation, source)
        track_mock_.assert_called_once_with(result, operation)
        assert result is applied
Пример #23
0
    def test_med_exam_col_list(self, feature_cols, expected_med_exam_col_list):
        """
        Check the ``med_exam_col_list`` property of a Dataset.

        The Dataset is built from ``DataFrameMock.df_multi_type`` with
        ``metadata_num_col`` as the only metadata column; the property must
        be a set equal to the expected value.
        """
        # Arrange
        mock_df = DataFrameMock.df_multi_type(sample_size=200)
        dataset = Dataset(df_object=mock_df,
                          metadata_cols=("metadata_num_col", ),
                          feature_cols=feature_cols)

        # Act
        found = dataset.med_exam_col_list

        # Assert
        assert isinstance(found, set)
        assert found == expected_med_exam_col_list
Пример #24
0
    def it_knows_its_bool_columns(self, request):
        """
        Check that ``bool_columns`` is read from ``_columns_type``.

        ``Dataset._columns_type`` is mocked to return a ``_ColumnListByType``
        with two bool columns and the Dataset initializer is mocked out.
        """
        _columns_type = property_mock(request, Dataset, "_columns_type")
        column_list_by_type = _ColumnListByType(bool_cols={"bool0", "bool1"})
        _columns_type.return_value = column_list_by_type
        initializer_mock(request, Dataset)
        dataset = Dataset(data_file="fake/path")

        bool_columns_ = dataset.bool_columns

        # ``isinstance`` is the idiomatic type check and matches the other
        # set-returning tests in this file.
        assert isinstance(bool_columns_, set)
        assert bool_columns_ == {"bool0", "bool1"}
        _columns_type.assert_called_once()
Пример #25
0
    def it_knows_its_num_categorical_columns(self, request):
        """
        Check that ``num_categorical_columns`` is read from ``_columns_type``.

        ``Dataset._columns_type`` is mocked to return a ``_ColumnListByType``
        with two numerical categorical columns and the Dataset initializer
        is mocked out.
        """
        _columns_type = property_mock(request, Dataset, "_columns_type")
        column_list_by_type = _ColumnListByType(
            num_categorical_cols={"numcat0", "numcat1"})
        _columns_type.return_value = column_list_by_type
        initializer_mock(request, Dataset)
        dataset = Dataset(data_file="fake/path")

        num_categorical_columns_ = dataset.num_categorical_columns

        # ``isinstance`` is the idiomatic type check and matches the other
        # set-returning tests in this file.
        assert isinstance(num_categorical_columns_, set)
        assert num_categorical_columns_ == {"numcat0", "numcat1"}
        _columns_type.assert_called_once()
Пример #26
0
    def it_knows_its_med_exam_col_list(self, request):
        """
        Check that ``med_exam_col_list`` is read from ``_columns_type``.

        ``Dataset._columns_type`` is mocked to return a ``_ColumnListByType``
        with two entries in ``med_exam_col_list`` and the Dataset
        initializer is mocked out.
        """
        _columns_type = property_mock(request, Dataset, "_columns_type")
        column_list_by_type = _ColumnListByType(
            med_exam_col_list={"med0", "med1"})
        _columns_type.return_value = column_list_by_type
        initializer_mock(request, Dataset)
        dataset = Dataset(data_file="fake/path")

        med_exam_col_list_ = dataset.med_exam_col_list

        # ``isinstance`` is the idiomatic type check and matches the other
        # set-returning tests in this file.
        assert isinstance(med_exam_col_list_, set)
        assert med_exam_col_list_ == {"med0", "med1"}
        _columns_type.assert_called_once()
Пример #27
0
    def it_can_replace_with_template_call(self, request):
        """
        Check that calling a ``ReplaceSubstrings`` delegates to ``_apply``.

        ``ReplaceSubstrings._apply`` and ``Dataset.track_history`` are
        mocked. The test verifies each is called once and that the value
        returned by ``_apply`` is the one handed back to the caller.
        """
        # Arrange
        apply_mock_ = method_mock(request, fop.ReplaceSubstrings, "_apply")
        track_mock_ = method_mock(request, Dataset, "track_history")
        generic_df = DataFrameMock.df_generic(sample_size=100)
        csv_loader_ = function_mock(request,
                                    "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = generic_df
        source = Dataset(data_file="fake/path0")
        applied = Dataset(data_file="fake/path0")
        apply_mock_.return_value = applied
        operation = fop.ReplaceSubstrings(columns=["exam_num_col_0"],
                                          derived_columns=["exam_str_col_0"],
                                          replacement_map={"a": "b"})

        # Act
        result = operation(source)

        # Assert
        apply_mock_.assert_called_once_with(operation, source)
        track_mock_.assert_called_once_with(result, operation)
        assert result is applied
Пример #28
0
    def test_column_list_by_type(self, feature_cols,
                                 expected_column_list_type):
        """
        Check the ``_columns_type`` property of a Dataset.

        The Dataset is built from ``DataFrameMock.df_multi_type`` with
        ``metadata_num_col`` as the only metadata column; the property must
        be a ``_ColumnListByType`` equal to the expected value.
        """
        # Arrange
        mock_df = DataFrameMock.df_multi_type(sample_size=200)
        dataset = Dataset(df_object=mock_df,
                          metadata_cols=("metadata_num_col", ),
                          feature_cols=feature_cols)

        # Act
        columns_by_type = dataset._columns_type

        # Assert
        assert isinstance(columns_by_type, _ColumnListByType)
        assert columns_by_type == expected_column_list_type
Пример #29
0
    def test_track_history_with_no_derived_columns(
        self,
        request,
        metadata_columns,
        original_columns,
        expected_metadata_cols,
    ):
        """
        Check ``track_history`` for an operation without derived columns.

        The operation must appear in the history of every original column
        and ``metadata_cols`` must equal the expected value.
        """
        # Arrange
        dataset = Dataset(df_object=DataFrameMock.df_generic(10),
                          metadata_cols=metadata_columns)
        operation = fop.FillNA(
            columns=original_columns,
            derived_columns=None,
            value=0,
        )

        # Act
        dataset.track_history(operation)

        # Assert: the operation is recorded under each original column
        for source_col in original_columns:
            assert operation in dataset.operations_history[source_col]
        assert dataset.metadata_cols == expected_metadata_cols
Пример #30
0
def test_fillna(request, columns, derived_columns, expected_df):
    """
    Check that calling ``fop.FillNA`` with ``value=1`` yields expected data.

    ``trousse.dataset.get_df_from_csv`` is patched to return the mock
    dataframe. The result must be a different Dataset whose ``data`` equals
    ``expected_df``.
    """
    # Arrange
    nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    csv_loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    csv_loader_.return_value = nan_df
    source = Dataset(data_file="fake/path0")
    operation = fop.FillNA(
        columns=columns,
        derived_columns=derived_columns,
        value=1,
    )

    # Act
    result = operation(source)

    # Assert
    assert result is not source
    pd.testing.assert_frame_equal(result.data, expected_df)