def test_convert_column_id_to_name(self, col_id_list, expected_columns_name):
    """Column ids are converted to the expected column names.

    Parameters come from pytest parametrization: ``col_id_list`` is the list
    of positional column ids, ``expected_columns_name`` the names they map to.
    """
    df = DataFrameMock.df_multi_type(10)
    dataset = Dataset(df_object=df)

    columns_name = dataset.convert_column_id_to_name(col_id_list)

    # The original ``assert type(columns_name)`` was vacuous — a type object
    # is always truthy. Check the container type against the expectation's
    # type instead, then compare contents.
    assert isinstance(columns_name, type(expected_columns_name))
    assert columns_name == expected_columns_name
def test_get_categorical_cols(self, request, sample_size, expected_categ_cols):
    """``_get_categorical_cols`` identifies the categorical columns of a frame."""
    df_categ = DataFrameMock.df_categorical_cols(sample_size)
    dataset = Dataset(df_object=df_categ)

    result = dataset._get_categorical_cols(col_list=df_categ.columns)

    assert isinstance(result, set)
    assert result == expected_categ_cols
def test_nan_columns(self, request, nan_ratio, n_columns, expected_nan_columns):
    """``nan_columns`` returns the columns whose NaN ratio exceeds the threshold."""
    df = DataFrameMock.df_many_nans(nan_ratio, n_columns)
    dataset = Dataset(df_object=df)

    # Query slightly below the generated ratio so the NaN columns qualify.
    result = dataset.nan_columns(nan_ratio - 0.01)

    assert isinstance(result, set)
    assert len(result) == len(expected_nan_columns)
    assert result == expected_nan_columns
def test_contains_duplicated_features(self, request, duplicated_cols_count, expected_contains_dupl_cols_bool):
    """``check_duplicated_features`` flags frames containing duplicated columns."""
    dataset = Dataset(
        df_object=DataFrameMock.df_duplicated_columns(duplicated_cols_count)
    )

    has_duplicates = dataset.check_duplicated_features()

    assert isinstance(has_duplicates, bool)
    assert has_duplicates is expected_contains_dupl_cols_bool
def dataset_with_operations(fillna_col0_col1, fillna_col1_col4) -> Dataset:
    """Build a Dataset whose ``operations_history`` is already populated.

    Returns
    -------
    Dataset
        Dataset instance containing FeatureOperation instances in the
        `operations_history` attribute
    """
    ds = Dataset(df_object=DataFrameMock.df_generic(10))
    for operation in (fillna_col0_col1, fillna_col1_col4):
        ds.track_history(operation)
    return ds
def it_can_apply_fillna(self, request, columns, derived_columns, expected_new_columns, expected_inplace):
    """FillNA._apply returns a new Dataset and delegates to ``pd.Series.fillna``."""
    df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    # Patch the CSV loader so Dataset(data_file=...) receives the mock frame.
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = df
    dataset = Dataset(data_file="fake/path0")
    # Stub pandas' fillna so the delegation (args/kwargs) can be observed.
    pd_fillna_ = method_mock(request, pd.Series, "fillna")
    pd_fillna_.return_value = pd.Series([0] * 100)
    fillna = fop.FillNA(columns=columns, derived_columns=derived_columns, value=0)

    filled_dataset = fillna._apply(dataset)

    # _apply must not mutate in place: a distinct Dataset instance is returned.
    assert filled_dataset is not None
    assert filled_dataset is not dataset
    assert isinstance(filled_dataset, Dataset)
    for col in expected_new_columns:
        assert col in filled_dataset.data.columns
    get_df_from_csv_.assert_called_once_with("fake/path0")
    # One fillna call per requested column, receiving that column's series
    # and the expected ``inplace`` keyword.
    assert len(pd_fillna_.call_args_list) == len(columns)
    pd.testing.assert_series_equal(pd_fillna_.call_args_list[0][0][0], df[columns[0]])
    assert pd_fillna_.call_args_list[0][1] == {"inplace": expected_inplace}
def it_can_apply_replace_strings(self, request, columns, derived_columns, expected_new_columns, expected_inplace):
    """ReplaceSubstrings._apply returns a new Dataset and delegates to ``str.replace``."""
    # NOTE(review): ``expected_inplace`` is never used in this test body —
    # looks like a copy-paste leftover from the fillna test; confirm against
    # the parametrize decorator.
    df = DataFrameMock.df_generic(sample_size=100)
    # Patch the CSV loader so Dataset(data_file=...) receives the mock frame.
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = df
    dataset = Dataset(data_file="fake/path0")
    # Stub pandas' str.replace so the delegation can be observed.
    pd_str_replace_ = function_mock(request, "pandas.Series.str.replace")
    pd_str_replace_.return_value = pd.Series([0] * 100)
    replace_substrings = fop.ReplaceSubstrings(
        columns=columns, derived_columns=derived_columns, replacement_map={"a": "b"})

    replaced_dataset = replace_substrings._apply(dataset)

    # _apply must not mutate in place: a distinct Dataset instance is returned.
    assert replaced_dataset is not None
    assert replaced_dataset is not dataset
    assert isinstance(replaced_dataset, Dataset)
    for col in expected_new_columns:
        assert col in replaced_dataset.data.columns
    get_df_from_csv_.assert_called_once_with("fake/path0")
    # One str.replace call per requested column, with the mapped pat/repl pair.
    assert len(pd_str_replace_.call_args_list) == len(columns)
    pd.testing.assert_series_equal(
        pd_str_replace_.call_args_list[0][0][0][:], df[columns[0]])
    assert pd_str_replace_.call_args_list[0][1] == {
        "pat": "a",
        "repl": "b",
    }
def it_can_apply_ordinal_encoder(
    self,
    request,
    columns,
    derived_columns,
    expected_new_columns,
):
    """OrdinalEncoder._apply returns a new Dataset and calls sklearn's ``fit_transform``."""
    df = DataFrameMock.df_generic(sample_size=100)
    # Patch the CSV loader so Dataset(data_file=...) receives the mock frame.
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = df
    dataset = Dataset(data_file="fake/path0")
    # Stub sklearn's fit_transform so the delegation can be observed.
    sk_fit_transform_ = method_mock(request, sk_preproc.OrdinalEncoder,
                                    "fit_transform")
    sk_fit_transform_.return_value = pd.Series(range(100))
    ordinal_encoder = fop.OrdinalEncoder(
        columns=columns,
        derived_columns=derived_columns,
    )

    encoded_dataset = ordinal_encoder._apply(dataset)

    # _apply must not mutate in place: a distinct Dataset instance is returned.
    assert encoded_dataset is not None
    assert encoded_dataset is not dataset
    assert isinstance(encoded_dataset, Dataset)
    for col in expected_new_columns:
        assert col in encoded_dataset.data.columns
    get_df_from_csv_.assert_called_once_with("fake/path0")
    assert len(sk_fit_transform_.call_args_list) == len(columns)
    # Positional arg index 1: arg 0 is the encoder instance, since
    # method_mock patches the unbound method.
    pd.testing.assert_frame_equal(
        sk_fit_transform_.call_args_list[0][0][1], df[[columns[0]]])
def it_knows_its_operations_history(self, request):
    """A freshly constructed Dataset starts with an empty OperationsList."""
    function_mock(request, "trousse.dataset.get_df_from_csv")

    history = Dataset(data_file="fake/path").operations_history

    assert isinstance(history, OperationsList)
    assert len(history) == 0
def it_knows_its_metadata_cols(self, metadata_cols):
    """``metadata_cols`` is exposed as the set of configured metadata columns."""
    df = DataFrameMock.df_multi_type(10)
    dataset = Dataset(df_object=df, metadata_cols=metadata_cols)

    metadata_cols_ = dataset.metadata_cols

    # isinstance instead of ``type(...) == set`` — idiomatic type check,
    # consistent with the isinstance checks used elsewhere in this suite.
    assert isinstance(metadata_cols_, set)
    assert metadata_cols_ == set(metadata_cols)
def test_ordinal_encode_column(csv, column, derived_column, expected_csv):
    """``_ordinal_encode_column`` produces the expected encoded frame and columns."""
    dataset = Dataset(data_file=csv)
    expected_df = load_expectation(expected_csv, type_="csv")

    encoded_df, _, new_cols = ffx._ordinal_encode_column(dataset.data, column, False)

    assert derived_column == new_cols
    pd.testing.assert_frame_equal(encoded_df, expected_df)
def it_knows_how_to_track_history(self, request, metadata_cols, derived_columns, expected_metadata_cols):
    """``track_history`` updates metadata_cols and appends to operations_history."""
    # Patch OperationsList.__iadd__ before the Dataset is built so the
    # history append can be observed on the instance it creates.
    operations_list_iadd_ = method_mock(request, OperationsList, "__iadd__")
    expected_df = DataFrameMock.df_generic(10)
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = expected_df
    dataset = Dataset(data_file="fake/path", metadata_cols=metadata_cols)
    feat_op = fop.FillNA(columns=["metadata_num_col"],
                         derived_columns=derived_columns, value=0)

    dataset.track_history(feat_op)

    assert dataset.metadata_cols == expected_metadata_cols
    # ANY stands in for the OperationsList instance (the ``self`` argument of
    # the patched unbound __iadd__).
    operations_list_iadd_.assert_called_once_with(ANY, feat_op)
def test_ordinal_encoder(csv, columns, derived_columns, expected_csv):
    """Calling OrdinalEncoder on a Dataset yields the expected encoded frame."""
    expected_df = load_expectation(expected_csv, type_="csv")
    dataset = Dataset(data_file=csv)
    encoder = fop.OrdinalEncoder(columns=columns, derived_columns=derived_columns)

    encoded_dataset = encoder(dataset)

    pd.testing.assert_frame_equal(encoded_dataset.data, expected_df)
def test_to_be_fixed_cols(self):
    """``to_be_fixed_cols`` flags exactly the mixed-type column."""
    df = DataFrameMock.df_multi_type(10)
    dataset = Dataset(df_object=df)

    to_be_fixed_cols = dataset.to_be_fixed_cols

    # isinstance instead of ``type(...) == set`` — idiomatic type check,
    # consistent with the isinstance checks used elsewhere in this suite.
    assert isinstance(to_be_fixed_cols, set)
    assert len(to_be_fixed_cols) == 1
    assert to_be_fixed_cols == {"mixed_type_col"}
def test_replace_substrings(csv, columns, derived_columns, expected_csv):
    """Calling ReplaceSubstrings on a Dataset yields the expected frame."""
    expected_df = load_expectation(expected_csv, type_="csv")
    dataset = Dataset(data_file=csv)
    replacer = fop.ReplaceSubstrings(
        columns=columns,
        derived_columns=derived_columns,
        replacement_map={"r": "c"},
    )

    replaced_dataset = replacer(dataset)

    pd.testing.assert_frame_equal(replaced_dataset.data, expected_df)
def it_can_fillna_with_template_call(self, request):
    """FillNA.__call__ delegates to _apply and tracks history on the result."""
    # Patch _apply and track_history before the datasets are built so the
    # template-method calls can be observed.
    _apply_ = method_mock(request, fop.FillNA, "_apply")
    track_history_ = method_mock(request, Dataset, "track_history")
    df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = df
    dataset_in = Dataset(data_file="fake/path0")
    dataset_out = Dataset(data_file="fake/path0")
    _apply_.return_value = dataset_out
    fillna = fop.FillNA(columns=["nan_0"], derived_columns=["filled_nan_0"],
                        value=0)

    filled_dataset = fillna(dataset_in)

    _apply_.assert_called_once_with(fillna, dataset_in)
    track_history_.assert_called_once_with(filled_dataset, fillna)
    # The template method must return exactly what _apply produced.
    assert filled_dataset is dataset_out
def test_trivial_columns(self, request, n_columns, expected_trivial_columns):
    """``trivial_columns`` reports exactly the expected trivial columns."""
    dataset = Dataset(df_object=DataFrameMock.df_trivial(n_columns))

    trivial = dataset.trivial_columns

    assert isinstance(trivial, set)
    assert len(trivial) == len(expected_trivial_columns)
    assert trivial == expected_trivial_columns
def test_constant_columns(self, request, n_columns, expected_constant_columns):
    """``constant_cols`` reports exactly the expected constant columns."""
    dataset = Dataset(df_object=DataFrameMock.df_same_value(n_columns))

    constant = dataset.constant_cols

    assert isinstance(constant, set)
    assert len(constant) == len(expected_constant_columns)
    assert constant == expected_constant_columns
def it_knows_its_data(self, request):
    """The ``data`` property exposes the frame loaded from the CSV path."""
    expected_df = DataFrameMock.df_generic(10)
    loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    loader_.return_value = expected_df
    dataset = Dataset(data_file="fake/path")

    data = dataset.data

    assert isinstance(data, pd.DataFrame)
    pd.testing.assert_frame_equal(data, expected_df)
def test_one_hot_encode_column(csv, column, drop_one_new_column, expected_new_cols, expected_csv):
    """``_one_hot_encode_column`` produces the expected frame, encoder, and columns."""
    dataset = Dataset(data_file=csv)
    expected_df = load_expectation(expected_csv, type_="csv")

    encoded_df, encoder, new_cols = ffx._one_hot_encode_column(
        dataset.data, column, drop_one_new_column
    )

    assert isinstance(encoder, sk_preproc.OneHotEncoder)
    assert expected_new_cols == new_cols
    pd.testing.assert_frame_equal(encoded_df, expected_df, check_dtype=False)
def it_knows_its_feature_cols(self, metadata_cols, feature_cols, expected_feature_cols):
    """``feature_cols`` resolves to the expected set for the given configuration."""
    df = DataFrameMock.df_multi_type(10)
    dataset = Dataset(
        df_object=df, metadata_cols=metadata_cols, feature_cols=feature_cols
    )

    feature_cols_ = dataset.feature_cols

    # isinstance instead of ``type(...) == set`` — idiomatic type check,
    # consistent with the isinstance checks used elsewhere in this suite.
    assert isinstance(feature_cols_, set)
    assert feature_cols_ == expected_feature_cols
def it_can_encode_with_template_call(self, request):
    """OrdinalEncoder.__call__ delegates to _apply and tracks history."""
    # Patch _apply and track_history before the datasets are built so the
    # template-method calls can be observed.
    _apply_ = method_mock(request, fop.OrdinalEncoder, "_apply")
    track_history_ = method_mock(request, Dataset, "track_history")
    df = DataFrameMock.df_generic(sample_size=100)
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = df
    dataset_in = Dataset(data_file="fake/path0")
    dataset_out = Dataset(data_file="fake/path0")
    _apply_.return_value = dataset_out
    ordinal_encoder = fop.OrdinalEncoder(
        columns=["exam_num_col_0"],
        derived_columns=["exam_str_col_0"],
    )

    replaced_dataset = ordinal_encoder(dataset_in)

    _apply_.assert_called_once_with(ordinal_encoder, dataset_in)
    track_history_.assert_called_once_with(replaced_dataset, ordinal_encoder)
    # The template method must return exactly what _apply produced.
    assert replaced_dataset is dataset_out
def test_med_exam_col_list(self, feature_cols, expected_med_exam_col_list):
    """``med_exam_col_list`` matches the expectation for the given feature_cols."""
    dataset = Dataset(
        df_object=DataFrameMock.df_multi_type(sample_size=200),
        metadata_cols=("metadata_num_col",),
        feature_cols=feature_cols,
    )

    result = dataset.med_exam_col_list

    assert isinstance(result, set)
    assert result == expected_med_exam_col_list
def it_knows_its_bool_columns(self, request):
    """``bool_columns`` delegates to ``_columns_type`` and returns its bool_cols."""
    _columns_type = property_mock(request, Dataset, "_columns_type")
    _columns_type.return_value = _ColumnListByType(bool_cols={"bool0", "bool1"})
    # Skip the real __init__ so no data file is actually loaded.
    initializer_mock(request, Dataset)
    dataset = Dataset(data_file="fake/path")

    bool_columns_ = dataset.bool_columns

    # isinstance instead of ``type(...) == set`` — idiomatic type check,
    # consistent with the isinstance checks used elsewhere in this suite.
    assert isinstance(bool_columns_, set)
    assert bool_columns_ == {"bool0", "bool1"}
    _columns_type.assert_called_once()
def it_knows_its_num_categorical_columns(self, request):
    """``num_categorical_columns`` delegates to ``_columns_type``."""
    _columns_type = property_mock(request, Dataset, "_columns_type")
    _columns_type.return_value = _ColumnListByType(
        num_categorical_cols={"numcat0", "numcat1"}
    )
    # Skip the real __init__ so no data file is actually loaded.
    initializer_mock(request, Dataset)
    dataset = Dataset(data_file="fake/path")

    num_categorical_columns_ = dataset.num_categorical_columns

    # isinstance instead of ``type(...) == set`` — idiomatic type check,
    # consistent with the isinstance checks used elsewhere in this suite.
    assert isinstance(num_categorical_columns_, set)
    assert num_categorical_columns_ == {"numcat0", "numcat1"}
    _columns_type.assert_called_once()
def it_knows_its_med_exam_col_list(self, request):
    """``med_exam_col_list`` delegates to ``_columns_type``."""
    _columns_type = property_mock(request, Dataset, "_columns_type")
    _columns_type.return_value = _ColumnListByType(
        med_exam_col_list={"med0", "med1"}
    )
    # Skip the real __init__ so no data file is actually loaded.
    initializer_mock(request, Dataset)
    dataset = Dataset(data_file="fake/path")

    med_exam_col_list_ = dataset.med_exam_col_list

    # isinstance instead of ``type(...) == set`` — idiomatic type check,
    # consistent with the isinstance checks used elsewhere in this suite.
    assert isinstance(med_exam_col_list_, set)
    assert med_exam_col_list_ == {"med0", "med1"}
    _columns_type.assert_called_once()
def it_can_replace_with_template_call(self, request):
    """ReplaceSubstrings.__call__ delegates to _apply and tracks history."""
    # Patch _apply and track_history before the datasets are built so the
    # template-method calls can be observed.
    _apply_ = method_mock(request, fop.ReplaceSubstrings, "_apply")
    track_history_ = method_mock(request, Dataset, "track_history")
    df = DataFrameMock.df_generic(sample_size=100)
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = df
    dataset_in = Dataset(data_file="fake/path0")
    dataset_out = Dataset(data_file="fake/path0")
    _apply_.return_value = dataset_out
    replace_substrings = fop.ReplaceSubstrings(
        columns=["exam_num_col_0"],
        derived_columns=["exam_str_col_0"],
        replacement_map={"a": "b"},
    )

    replaced_dataset = replace_substrings(dataset_in)

    _apply_.assert_called_once_with(replace_substrings, dataset_in)
    track_history_.assert_called_once_with(replaced_dataset, replace_substrings)
    # The template method must return exactly what _apply produced.
    assert replaced_dataset is dataset_out
def test_column_list_by_type(self, feature_cols, expected_column_list_type):
    """``_columns_type`` classifies columns into the expected _ColumnListByType."""
    dataset = Dataset(
        df_object=DataFrameMock.df_multi_type(sample_size=200),
        metadata_cols=("metadata_num_col",),
        feature_cols=feature_cols,
    )

    result = dataset._columns_type

    assert isinstance(result, _ColumnListByType)
    assert result == expected_column_list_type
def test_track_history_with_no_derived_columns(
    self,
    request,
    metadata_columns,
    original_columns,
    expected_metadata_cols,
):
    """Tracking an operation without derived columns tags every original column."""
    dataset = Dataset(
        df_object=DataFrameMock.df_generic(10),
        metadata_cols=metadata_columns,
    )
    operation = fop.FillNA(columns=original_columns, derived_columns=None, value=0)

    dataset.track_history(operation)

    # The operation must be recorded against each original column.
    assert all(
        operation in dataset.operations_history[column]
        for column in original_columns
    )
    assert dataset.metadata_cols == expected_metadata_cols
def test_fillna(request, columns, derived_columns, expected_df):
    """FillNA fills NaNs with 1 and returns a new Dataset with the expected data."""
    mock_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    loader_.return_value = mock_df
    dataset = Dataset(data_file="fake/path0")
    fill_operation = fop.FillNA(
        columns=columns, derived_columns=derived_columns, value=1
    )

    filled_dataset = fill_operation(dataset)

    assert filled_dataset is not dataset
    pd.testing.assert_frame_equal(filled_dataset.data, expected_df)