def it_can_get_operations_from_original_column(self, request):
    """Check ``operations_from_original_column`` keeps only matching inputs.

    ``OperationsList.__getitem__`` is mocked to return three operations; only
    the two that list "col4" among their ``columns`` are expected back.
    """
    op_list = OperationsList()
    getitem_ = method_mock(request, OperationsList, "__getitem__")
    col4_to_col1 = fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0)
    col1_to_col4 = fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0)
    col4_in_place = fop.FillNA(columns=["col4"], derived_columns=None, value=0)
    getitem_.return_value = [col4_to_col1, col1_to_col4, col4_in_place]

    found = op_list.operations_from_original_column("col4")

    assert type(found) is list
    assert found == [col4_to_col1, col4_in_place]
def it_can_apply_fillna(self, request, columns, derived_columns,
                        expected_new_columns, expected_inplace):
    """Check ``FillNA._apply`` returns a new Dataset and calls ``Series.fillna``.

    ``pd.Series.fillna`` and the CSV loader are mocked, so the test only
    verifies the call arguments and the columns of the returned dataset.
    """
    source_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = source_df
    dataset = Dataset(data_file="fake/path0")
    pd_fillna_ = method_mock(request, pd.Series, "fillna")
    pd_fillna_.return_value = pd.Series([0] * 100)
    operation = fop.FillNA(
        columns=columns, derived_columns=derived_columns, value=0
    )

    result = operation._apply(dataset)

    assert result is not None
    assert result is not dataset
    assert isinstance(result, Dataset)
    assert all(col in result.data.columns for col in expected_new_columns)
    get_df_from_csv_.assert_called_once_with("fake/path0")
    fillna_calls = pd_fillna_.call_args_list
    assert len(fillna_calls) == len(columns)
    # Each recorded call unpacks into (positional args, keyword args).
    first_args, first_kwargs = fillna_calls[0]
    pd.testing.assert_series_equal(first_args[0], source_df[columns[0]])
    assert first_kwargs == {"inplace": expected_inplace}
def but_it_raises_runtimeerror_with_multiple_operations_found(self, request):
    """Check that two operations deriving the same column raise RuntimeError."""
    op_list = OperationsList()
    derived_lookup_ = method_mock(
        request, OperationsList, "operations_from_derived_column"
    )
    # Both mocked operations claim to have generated "col1".
    derived_lookup_.return_value = [
        fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0),
        fop.FillNA(columns=["col2"], derived_columns=["col1"], value=0),
    ]
    expected_message = (
        "Multiple FeatureOperation found that generated column "
        "col1... the pipeline is compromised"
    )

    with pytest.raises(RuntimeError) as err:
        op_list.original_columns_from_derived_column("col1")

    assert isinstance(err.value, RuntimeError)
    assert str(err.value) == expected_message
def it_knows_if_equal(self, other, expected_equal):
    """Check ``FillNA.__eq__`` returns a plain bool with the expected value."""
    reference_op = fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0)

    equal = reference_op == other

    assert type(equal) is bool
    assert equal == expected_equal
def it_knows_its_str(self):
    """Check the multi-line string representation of a ``FillNA`` operation."""
    expected_text = (
        "FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\tderived_columns=['col1'],\n)"
    )
    operation = fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0)

    rendered = str(operation)

    assert type(rendered) is str
    assert rendered == expected_text
def and_it_validates_its_arguments(self, request):
    """Check that constructing ``FillNA`` calls both single-element validators."""
    columns_validator_ = method_mock(
        request, fop.FillNA, "_validate_single_element_columns"
    )
    derived_columns_validator_ = method_mock(
        request, fop.FillNA, "_validate_single_element_derived_columns"
    )

    operation = fop.FillNA(columns=["nan"], derived_columns=["filled"], value=0)

    columns_validator_.assert_called_once_with(operation, ["nan"])
    derived_columns_validator_.assert_called_once_with(operation, ["filled"])
def it_construct_from_args(self, request):
    """Check ``FillNA(...)`` forwards its keyword arguments to ``__init__``."""
    init_mock_ = initializer_mock(request, fop.FillNA)

    operation = fop.FillNA(columns=["nan"], derived_columns=["filled"], value=0)

    init_mock_.assert_called_once_with(
        ANY, columns=["nan"], derived_columns=["filled"], value=0
    )
    assert isinstance(operation, fop.FillNA)
def it_can_get_original_columns_from_derived_column(self, request):
    """Check the input columns of the single generating operation are returned."""
    op_list = OperationsList()
    derived_lookup_ = method_mock(
        request, OperationsList, "operations_from_derived_column"
    )
    generating_op = fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0)
    derived_lookup_.return_value = [generating_op]

    sources = op_list.original_columns_from_derived_column("col1")

    assert type(sources) is list
    assert sources == ["col0"]
def test_track_history_on_previous_one(self, request, dataset_with_operations):
    """Check a tracked operation is appended after the existing history."""
    new_op = fop.FillNA(columns=["col1"], derived_columns=["col5"], value=0)

    dataset_with_operations.track_history(new_op)

    history = dataset_with_operations.operations_history
    tracked = history[2]
    # Check if the previous operations are still present
    assert len(dataset_with_operations.operations_history) == 3
    assert isinstance(tracked, fop.FillNA)
    assert tracked.columns == ["col1"]
    assert tracked.derived_columns == ["col5"]
def test_fillna(request, columns, derived_columns, expected_df):
    """Apply ``FillNA`` (value=1) to a mocked dataset and compare the frame."""
    nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    csv_loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    csv_loader_.return_value = nan_df
    dataset = Dataset(data_file="fake/path0")
    operation = fop.FillNA(
        columns=columns, derived_columns=derived_columns, value=1
    )

    result = operation(dataset)

    assert result is not dataset
    pd.testing.assert_frame_equal(result.data, expected_df)
def it_knows_how_to_track_history(self, request, metadata_cols, derived_columns,
                                  expected_metadata_cols):
    """Check ``track_history`` updates metadata columns and records the operation.

    ``OperationsList.__iadd__`` is mocked, so recording is verified through
    the mock call rather than through the stored history.
    """
    iadd_ = method_mock(request, OperationsList, "__iadd__")
    generic_df = DataFrameMock.df_generic(10)
    csv_loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    csv_loader_.return_value = generic_df
    dataset = Dataset(data_file="fake/path", metadata_cols=metadata_cols)
    operation = fop.FillNA(
        columns=["metadata_num_col"], derived_columns=derived_columns, value=0
    )

    dataset.track_history(operation)

    assert dataset.metadata_cols == expected_metadata_cols
    iadd_.assert_called_once_with(ANY, operation)
def it_can_fillna_with_template_call(self, request):
    """Check calling a ``FillNA`` runs ``_apply`` and then tracks the history."""
    apply_mock_ = method_mock(request, fop.FillNA, "_apply")
    track_history_ = method_mock(request, Dataset, "track_history")
    nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    csv_loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    csv_loader_.return_value = nan_df
    dataset_in = Dataset(data_file="fake/path0")
    dataset_out = Dataset(data_file="fake/path0")
    apply_mock_.return_value = dataset_out
    operation = fop.FillNA(
        columns=["nan_0"], derived_columns=["filled_nan_0"], value=0
    )

    result = operation(dataset_in)

    apply_mock_.assert_called_once_with(operation, dataset_in)
    track_history_.assert_called_once_with(result, operation)
    assert result is dataset_out
def test_track_history_with_no_derived_columns(
    self,
    request,
    metadata_columns,
    original_columns,
    expected_metadata_cols,
):
    """Check an in-place operation is tracked under each of its input columns."""
    dataset = Dataset(
        df_object=DataFrameMock.df_generic(10),
        metadata_cols=metadata_columns,
    )
    in_place_op = fop.FillNA(columns=original_columns, derived_columns=None, value=0)

    dataset.track_history(in_place_op)

    # Check if the operation is added to each column
    assert all(
        in_place_op in dataset.operations_history[column]
        for column in original_columns
    )
    assert dataset.metadata_cols == expected_metadata_cols
class DescribeTrousse:
    """Unit tests for ``fop.Trousse`` (construction, call chaining, str)."""

    @pytest.mark.parametrize(
        "operations",
        [
            (),
            (fop.FillNA(columns=["nan"], value=0),),
            (
                fop.ReplaceStrings(
                    columns=["exam_num_col_0"],
                    derived_columns=["replaced_exam_num_col_0"],
                    replacement_map={"a": "b"},
                ),
                fop.FillNA(columns=["nan"], value=0),
            ),
        ],
    )
    def it_contructs_from_args(self, request, operations):
        """Check the operations are forwarded positionally to ``__init__``."""
        init_mock_ = initializer_mock(request, fop.Trousse)

        pipeline = fop.Trousse(*operations)

        init_mock_.assert_called_once_with(ANY, *operations)
        assert isinstance(pipeline, fop.Trousse)

    def it_knows_its_operations(
        self,
        replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
        fillna_col0_col1,
    ):
        """Check ``operations`` exposes the constructor arguments as a tuple."""
        expected_operations = (
            replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
            fillna_col0_col1,
        )
        pipeline = fop.Trousse(
            replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
            fillna_col0_col1,
        )

        stored = pipeline.operations

        assert type(stored) is tuple
        assert stored == expected_operations

    def it_knows_how_to_call(
        self,
        request,
        replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
        fillna_col0_col1,
    ):
        """Check each operation receives the previous operation's output."""
        dataset_in = instance_mock(request, Dataset, "in")
        after_replace = instance_mock(request, Dataset, "1")
        after_fillna = instance_mock(request, Dataset, "2")
        replacestrings_call_ = method_mock(request, fop.ReplaceStrings, "__call__")
        replacestrings_call_.return_value = after_replace
        fillna_call_ = method_mock(request, fop.FillNA, "__call__")
        fillna_call_.return_value = after_fillna
        pipeline = fop.Trousse(
            replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
            fillna_col0_col1,
        )

        result = pipeline(dataset_in)

        replacestrings_call_.assert_called_once_with(
            replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b, dataset_in
        )
        fillna_call_.assert_called_once_with(fillna_col0_col1, after_replace)
        assert isinstance(result, Dataset)
        assert result == after_fillna

    def it_knows_its_str(self, fillna_col0_col1, fillna_col1_col4):
        """Check the string form lists every contained operation."""
        expected_text = (
            "Trousse: (FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\t"
            "derived_columns=['col1'],\n), FillNA(\n\tcolumns=['col1']"
            ",\n\tvalue=0,\n\tderived_columns=['col4'],\n))"
        )
        pipeline = fop.Trousse(fillna_col0_col1, fillna_col1_col4)

        rendered = str(pipeline)

        assert type(rendered) is str
        assert rendered == expected_text
class DescribeFillNa:
    """Unit tests for ``fop.FillNA`` (construction, apply, call, eq, str)."""

    def it_construct_from_args(self, request):
        """Check ``FillNA(...)`` forwards its keyword arguments to ``__init__``."""
        init_mock_ = initializer_mock(request, fop.FillNA)

        operation = fop.FillNA(columns=["nan"], derived_columns=["filled"], value=0)

        init_mock_.assert_called_once_with(
            ANY, columns=["nan"], derived_columns=["filled"], value=0
        )
        assert isinstance(operation, fop.FillNA)

    def and_it_validates_its_arguments(self, request):
        """Check that construction calls both single-element validators."""
        columns_validator_ = method_mock(
            request, fop.FillNA, "_validate_single_element_columns"
        )
        derived_columns_validator_ = method_mock(
            request, fop.FillNA, "_validate_single_element_derived_columns"
        )

        operation = fop.FillNA(columns=["nan"], derived_columns=["filled"], value=0)

        columns_validator_.assert_called_once_with(operation, ["nan"])
        derived_columns_validator_.assert_called_once_with(operation, ["filled"])

    @pytest.mark.parametrize(
        "columns, derived_columns, expected_new_columns, expected_inplace",
        [
            (["nan_0"], ["filled_nan_0"], ["filled_nan_0"], False),
            (["nan_0"], None, [], True),
        ],
    )
    def it_can_apply_fillna(self, request, columns, derived_columns,
                            expected_new_columns, expected_inplace):
        """Check ``_apply`` returns a new Dataset and calls ``Series.fillna``.

        ``pd.Series.fillna`` and the CSV loader are mocked, so only the call
        arguments and the columns of the returned dataset are verified.
        """
        source_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
        get_df_from_csv_.return_value = source_df
        dataset = Dataset(data_file="fake/path0")
        pd_fillna_ = method_mock(request, pd.Series, "fillna")
        pd_fillna_.return_value = pd.Series([0] * 100)
        operation = fop.FillNA(
            columns=columns, derived_columns=derived_columns, value=0
        )

        result = operation._apply(dataset)

        assert result is not None
        assert result is not dataset
        assert isinstance(result, Dataset)
        assert all(col in result.data.columns for col in expected_new_columns)
        get_df_from_csv_.assert_called_once_with("fake/path0")
        fillna_calls = pd_fillna_.call_args_list
        assert len(fillna_calls) == len(columns)
        # Each recorded call unpacks into (positional args, keyword args).
        first_args, first_kwargs = fillna_calls[0]
        pd.testing.assert_series_equal(first_args[0], source_df[columns[0]])
        assert first_kwargs == {"inplace": expected_inplace}

    def it_can_fillna_with_template_call(self, request):
        """Check calling a ``FillNA`` runs ``_apply`` and then tracks history."""
        apply_mock_ = method_mock(request, fop.FillNA, "_apply")
        track_history_ = method_mock(request, Dataset, "track_history")
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_loader_ = function_mock(request, "trousse.dataset.get_df_from_csv")
        csv_loader_.return_value = nan_df
        dataset_in = Dataset(data_file="fake/path0")
        dataset_out = Dataset(data_file="fake/path0")
        apply_mock_.return_value = dataset_out
        operation = fop.FillNA(
            columns=["nan_0"], derived_columns=["filled_nan_0"], value=0
        )

        result = operation(dataset_in)

        apply_mock_.assert_called_once_with(operation, dataset_in)
        track_history_.assert_called_once_with(result, operation)
        assert result is dataset_out

    @pytest.mark.parametrize(
        "other, expected_equal",
        [
            (fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0), True),
            (fop.FillNA(columns=["col9"], derived_columns=["col1"], value=0), False),
            (fop.FillNA(columns=["col0"], derived_columns=["col2"], value=1), False),
            (dict(), False),
        ],
    )
    def it_knows_if_equal(self, other, expected_equal):
        """Check ``__eq__`` returns a plain bool with the expected value."""
        reference_op = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )

        equal = reference_op == other

        assert type(equal) is bool
        assert equal == expected_equal

    def it_knows_its_str(self):
        """Check the multi-line string representation of the operation."""
        expected_text = (
            "FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\tderived_columns=['col1'],\n)"
        )
        operation = fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0)

        rendered = str(operation)

        assert type(rendered) is str
        assert rendered == expected_text
class DescribeOperationsList:
    """Unit tests for ``OperationsList``.

    Several tests set the private ``_operations_list`` and
    ``_operations_by_column`` attributes directly so that the method under
    test is exercised without going through ``+=``.
    """

    def it_can_construct_itself(self, request):
        """Check ``OperationsList()`` calls ``__init__`` with no arguments."""
        _init_ = initializer_mock(request, OperationsList)
        operations_list = OperationsList()
        _init_.assert_called_once_with(ANY)
        assert isinstance(operations_list, OperationsList)

    def it_can_iadd_first_featop(self, request, fillna_col0_col1):
        """Check ``+=`` on an empty list indexes the operation under both columns."""
        op_list = OperationsList()
        op_list += fillna_col0_col1
        assert op_list._operations_list == [fillna_col0_col1]
        for column in ["col0", "col1"]:
            assert op_list._operations_by_column[column] == [fillna_col0_col1]

    def it_can_iadd_next_featop(self, request, fillna_col0_col1, fillna_col4_none):
        """Check ``+=`` appends a second operation without touching other columns."""
        op_list = OperationsList()
        # Pre-populate the private state as if fillna_col0_col1 had been added.
        op_list._operations_list = [fillna_col0_col1]
        for column in ["col0", "col1"]:
            op_list._operations_by_column[column] = [fillna_col0_col1]
        op_list += fillna_col4_none
        assert op_list._operations_list == [fillna_col0_col1, fillna_col4_none]
        assert op_list._operations_by_column["col0"] == [fillna_col0_col1]
        assert op_list._operations_by_column["col1"] == [
            fillna_col0_col1,
        ]
        assert op_list._operations_by_column["col4"] == [fillna_col4_none]

    def it_can_getitem_from_int(self, fillna_col0_col1, fillna_col1_col4):
        """Check integer indexing returns the operation at that position."""
        op_list = OperationsList()
        op_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        op_list._operations_by_column["col0"] = [fillna_col0_col1]
        op_list._operations_by_column["col1"] = [
            fillna_col0_col1, fillna_col1_col4
        ]
        op_list._operations_by_column["col4"] = [fillna_col1_col4]
        feat_op0_ = op_list[0]
        feat_op1_ = op_list[1]
        assert isinstance(feat_op0_, fop.FillNA)
        assert isinstance(feat_op1_, fop.FillNA)
        assert feat_op0_ == fillna_col0_col1
        assert feat_op1_ == fillna_col1_col4

    def it_can_getitem_from_str(self, fillna_col0_col1, fillna_col1_col4):
        """Check string indexing returns the operations stored for that column."""
        op_list = OperationsList()
        op_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        op_list._operations_by_column["col0"] = [fillna_col0_col1]
        op_list._operations_by_column["col1"] = [
            fillna_col0_col1, fillna_col1_col4
        ]
        op_list._operations_by_column["col4"] = [fillna_col1_col4]
        feat_op_col0 = op_list["col0"]
        feat_op_col1 = op_list["col1"]
        assert isinstance(feat_op_col0, list)
        assert isinstance(feat_op_col1, list)
        assert feat_op_col0 == [fillna_col0_col1]
        assert feat_op_col1 == [fillna_col0_col1, fillna_col1_col4]

    def but_it_raisestypeerror_with_wrong_type(self):
        """Check indexing with a set raises TypeError with the expected message."""
        op_list = OperationsList()
        with pytest.raises(TypeError) as err:
            op_list[{"wrong"}]
        assert isinstance(err.value, TypeError)
        assert "Cannot get FeatureOperation with a label of type set" == str(
            err.value)

    @pytest.mark.parametrize(
        "column, operations_from_original_column_return_value, "
        "expected_derived_columns",
        [
            (
                "col0",
                [
                    fop.FillNA(
                        columns=["col0"], derived_columns=["col1"], value=0)
                ],
                ["col1"],
            ),
            (
                "col1",
                [
                    fop.FillNA(
                        columns=["col1"], derived_columns=["col4"], value=0),
                    fop.FillNA(
                        columns=["col1"], derived_columns=["col2"], value=0),
                ],
                ["col4", "col2"],
            ),
            (
                "col4",
                [
                    fop.FillNA(columns=["col4"], derived_columns=None, value=0),
                ],
                [],
            ),
        ],
    )
    def it_can_get_derived_columns_from_col(
        self,
        request,
        column,
        operations_from_original_column_return_value,
        expected_derived_columns,
    ):
        """Check derived columns are collected from the mocked operations.

        The last parametrized case shows that an operation with
        ``derived_columns=None`` contributes nothing to the result.
        """
        op_list = OperationsList()
        operations_from_original_column_ = method_mock(
            request, OperationsList, "operations_from_original_column")
        operations_from_original_column_.return_value = (
            operations_from_original_column_return_value)
        derived_columns = op_list.derived_columns_from_col(column)
        assert type(derived_columns) == list
        assert derived_columns == expected_derived_columns

    def it_can_get_operations_from_derived_column(self, request):
        """Check only the operation that lists "col4" as derived is returned."""
        op_list = OperationsList()
        getitem_ = method_mock(request, OperationsList, "__getitem__")
        fop0 = fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0)
        fop1 = fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0)
        fop2 = fop.FillNA(columns=["col4"], derived_columns=None, value=0)
        getitem_.return_value = [fop0, fop1, fop2]
        operations = op_list.operations_from_derived_column("col4")
        assert type(operations) == list
        assert operations == [fop1]

    def it_can_get_operations_from_original_column(self, request):
        """Check only the operations that list "col4" as input are returned."""
        op_list = OperationsList()
        getitem_ = method_mock(request, OperationsList, "__getitem__")
        fop0 = fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0)
        fop1 = fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0)
        fop2 = fop.FillNA(columns=["col4"], derived_columns=None, value=0)
        getitem_.return_value = [fop0, fop1, fop2]
        operations = op_list.operations_from_original_column("col4")
        assert type(operations) == list
        assert operations == [fop0, fop2]

    def it_can_get_original_columns_from_derived_column(self, request):
        """Check the input columns of the single generating operation are returned."""
        op_list = OperationsList()
        operations_from_derived_column_ = method_mock(
            request, OperationsList, "operations_from_derived_column")
        operations_from_derived_column_.return_value = [
            fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0)
        ]
        original_columns = op_list.original_columns_from_derived_column("col1")
        assert type(original_columns) == list
        assert original_columns == ["col0"]

    def but_it_raises_runtimeerror_with_multiple_operations_found(
            self, request):
        """Check two operations deriving the same column raise RuntimeError."""
        op_list = OperationsList()
        _operations_from_derived_column_ = method_mock(
            request, OperationsList, "operations_from_derived_column")
        _operations_from_derived_column_.return_value = [
            fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0),
            fop.FillNA(columns=["col2"], derived_columns=["col1"], value=0),
        ]
        with pytest.raises(RuntimeError) as err:
            op_list.original_columns_from_derived_column("col1")
        assert isinstance(err.value, RuntimeError)
        assert ("Multiple FeatureOperation found that generated column "
                "col1... the pipeline is compromised") == str(err.value)

    def but_it_raises_runtimeerror_with_zero_operations_found(self, request):
        """Check that no operation deriving the column raises RuntimeError."""
        op_list = OperationsList()
        _operations_from_derived_column_ = method_mock(
            request, OperationsList, "operations_from_derived_column")
        _operations_from_derived_column_.return_value = []
        with pytest.raises(RuntimeError) as err:
            op_list.original_columns_from_derived_column("col1")
        assert isinstance(err.value, RuntimeError)
        assert ("No FeatureOperation found that generated column "
                "col1... the pipeline is compromised") == str(err.value)

    def it_knows_its_len(self, fillna_col0_col1, fillna_col1_col4):
        """Check ``len()`` returns an int equal to the number of operations."""
        op_list = OperationsList()
        op_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        len_ = len(op_list)
        assert type(len_) == int
        assert len_ == 2

    def it_can_be_iterated_over(self, fillna_col0_col1, fillna_col1_col4,
                                fillna_col1_col2):
        """Check iteration yields the operations in stored order."""
        op_list = OperationsList()
        op_list._operations_list = [
            fillna_col0_col1,
            fillna_col1_col4,
            fillna_col1_col2,
        ]
        operations = []
        for operation in op_list:
            operations.append(operation)
        assert isinstance(op_list, Iterable)
        assert operations == [
            fillna_col0_col1,
            fillna_col1_col4,
            fillna_col1_col2,
        ]

    @pytest.mark.parametrize(
        "other_operation_list, expected_equal",
        [
            (
                [
                    fop.FillNA(
                        columns=["col0"], derived_columns=["col1"], value=0),
                    fop.FillNA(
                        columns=["col1"], derived_columns=["col4"], value=0),
                ],
                True,
            ),
            ([fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0)
              ], False),
            ([], False),
        ],
    )
    def it_knows_if_equal(self, other_operation_list, expected_equal,
                          fillna_col0_col1, fillna_col1_col4):
        """Check two ``OperationsList`` compare equal only with equal contents."""
        op_list = OperationsList()
        op_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        other = OperationsList()
        other._operations_list = other_operation_list
        equal = op_list == other
        assert type(equal) == bool
        assert equal == expected_equal

    def it_knows_not_equal_not_operations_list(self):
        """Check comparing a dict with an ``OperationsList`` yields ``False``."""
        # The dict is the left operand of the comparison here.
        equal = dict() == OperationsList()
        assert type(equal) == bool
        assert equal is False

    def it_knows_its_str(self, fillna_col0_col1, fillna_col1_col4):
        """Check the string form is a bracketed, comma-separated operation list."""
        op_list = OperationsList()
        op_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        _str = str(op_list)
        assert type(_str) == str
        assert _str == (
            "[FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\tderived_columns=['col1'],\n),"
            " FillNA(\n\tcolumns=['col1'],\n\tvalue=0,\n\tderived_columns=['col4'],\n)]"
        )
def fillna_col1_col2():
    """Build a ``FillNA`` (value 0) reading "col1" and deriving "col2"."""
    operation = fop.FillNA(columns=["col1"], derived_columns=["col2"], value=0)
    return operation
def fillna_col4_none():
    """Build a ``FillNA`` (value 0) on "col4" with no derived columns."""
    operation = fop.FillNA(columns=["col4"], derived_columns=None, value=0)
    return operation