def it_can_get_operations_from_original_column(self, request):
        """Look up the operations that take "col4" as an input column.

        ``OperationsList.__getitem__`` is mocked to return three ``FillNA``
        operations. The test expects back the two that list "col4" in
        ``columns``, and not the one that lists it only in ``derived_columns``.
        """
        operations_list = OperationsList()
        getitem_mock = method_mock(request, OperationsList, "__getitem__")
        col4_to_col1 = fop.FillNA(
            columns=["col4"], derived_columns=["col1"], value=0
        )
        col1_to_col4 = fop.FillNA(
            columns=["col1"], derived_columns=["col4"], value=0
        )
        col4_no_derived = fop.FillNA(
            columns=["col4"], derived_columns=None, value=0
        )
        getitem_mock.return_value = [col4_to_col1, col1_to_col4, col4_no_derived]

        found = operations_list.operations_from_original_column("col4")

        assert type(found) is list
        assert found == [col4_to_col1, col4_no_derived]
Пример #2
0
    def it_can_apply_fillna(
        self,
        request,
        columns,
        derived_columns,
        expected_new_columns,
        expected_inplace,
    ):
        """Check ``FillNA._apply`` on a dataset loaded through a mocked CSV reader.

        ``pd.Series.fillna`` is replaced by a mock, so the test verifies how it
        is called (once per input column, on the first input column, with the
        expected ``inplace`` flag) rather than the filled values. The arguments
        are expected to come from a parametrization not visible in this block.
        """
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_reader_mock = function_mock(
            request, "trousse.dataset.get_df_from_csv"
        )
        csv_reader_mock.return_value = nan_df
        dataset = Dataset(data_file="fake/path0")
        series_fillna_mock = method_mock(request, pd.Series, "fillna")
        series_fillna_mock.return_value = pd.Series([0] * 100)
        operation = fop.FillNA(
            columns=columns, derived_columns=derived_columns, value=0
        )

        result = operation._apply(dataset)

        assert result is not None
        assert result is not dataset
        assert isinstance(result, Dataset)
        for new_column in expected_new_columns:
            assert new_column in result.data.columns
        csv_reader_mock.assert_called_once_with("fake/path0")
        assert series_fillna_mock.call_count == len(columns)
        # A recorded call unpacks into its positional and keyword arguments.
        first_args, first_kwargs = series_fillna_mock.call_args_list[0]
        pd.testing.assert_series_equal(first_args[0], nan_df[columns[0]])
        assert first_kwargs == {"inplace": expected_inplace}
    def but_it_raises_runtimeerror_with_multiple_operations_found(
        self, request
    ):
        """Expect a ``RuntimeError`` when two operations generated "col1".

        ``operations_from_derived_column`` is mocked to return two ``FillNA``
        operations that both declare "col1" as their derived column.
        """
        operations_list = OperationsList()
        derived_lookup_mock = method_mock(
            request, OperationsList, "operations_from_derived_column"
        )
        derived_lookup_mock.return_value = [
            fop.FillNA(columns=[source], derived_columns=["col1"], value=0)
            for source in ("col0", "col2")
        ]
        expected_message = (
            "Multiple FeatureOperation found that generated column "
            "col1... the pipeline is compromised"
        )

        with pytest.raises(RuntimeError) as exc_info:
            operations_list.original_columns_from_derived_column("col1")

        raised = exc_info.value
        assert isinstance(raised, RuntimeError)
        assert expected_message == str(raised)
Пример #4
0
    def it_knows_if_equal(self, other, expected_equal):
        """Compare a reference ``FillNA`` (col0 -> col1, value 0) with ``other``.

        ``other`` and ``expected_equal`` are expected to come from a
        parametrization that is not visible in this block.
        """
        reference = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )

        outcome = reference == other

        assert type(outcome) is bool
        assert outcome == expected_equal
Пример #5
0
    def it_knows_its_str(self):
        """``str()`` of a ``FillNA`` lists columns, value and derived columns."""
        operation = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )
        expected_text = (
            "FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\tderived_columns=['col1'],\n)"
        )

        rendered = str(operation)

        assert type(rendered) is str
        assert rendered == expected_text
Пример #6
0
    def and_it_validates_its_arguments(self, request):
        """Constructing a ``FillNA`` calls both single-element validators once.

        Both validator methods are mocked; each must receive the new instance
        together with the corresponding list passed to the constructor.
        """
        columns_validator_mock = method_mock(
            request, fop.FillNA, "_validate_single_element_columns"
        )
        derived_validator_mock = method_mock(
            request, fop.FillNA, "_validate_single_element_derived_columns"
        )

        operation = fop.FillNA(
            columns=["nan"], derived_columns=["filled"], value=0
        )

        columns_validator_mock.assert_called_once_with(operation, ["nan"])
        derived_validator_mock.assert_called_once_with(operation, ["filled"])
Пример #7
0
    def it_construct_from_args(self, request):
        """The ``FillNA`` initializer receives the keyword arguments unchanged."""
        init_mock = initializer_mock(request, fop.FillNA)
        constructor_kwargs = {
            "columns": ["nan"],
            "derived_columns": ["filled"],
            "value": 0,
        }

        operation = fop.FillNA(**constructor_kwargs)

        # ANY stands in for the instance passed as ``self``.
        init_mock.assert_called_once_with(
            ANY, columns=["nan"], derived_columns=["filled"], value=0
        )
        assert isinstance(operation, fop.FillNA)
    def it_can_get_original_columns_from_derived_column(self, request):
        """Resolve "col1" back to the input columns of the operation behind it.

        ``operations_from_derived_column`` is mocked to return a single
        ``FillNA`` with ``columns=["col0"]``, so ``["col0"]`` is expected.
        """
        operations_list = OperationsList()
        derived_lookup_mock = method_mock(
            request, OperationsList, "operations_from_derived_column"
        )
        generating_operation = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )
        derived_lookup_mock.return_value = [generating_operation]

        sources = operations_list.original_columns_from_derived_column("col1")

        assert type(sources) is list
        assert sources == ["col0"]
Пример #9
0
    def test_track_history_on_previous_one(self, request,
                                           dataset_with_operations):
        """Track a new operation on a dataset that already has a history.

        ``dataset_with_operations`` is a fixture defined outside this block;
        the assertions imply it starts with two tracked operations, so the new
        ``FillNA`` must end up third (index 2).
        """
        feat_op = fop.FillNA(columns=["col1"],
                             derived_columns=["col5"],
                             value=0)

        dataset_with_operations.track_history(feat_op)

        # Check if the previous operations are still present. This runs before
        # the lookup below so that a history of unexpected size is reported by
        # this assertion rather than by a failing index access.
        assert len(dataset_with_operations.operations_history) == 3
        added_op = dataset_with_operations.operations_history[2]
        assert isinstance(added_op, fop.FillNA)
        assert added_op.columns == ["col1"]
        assert added_op.derived_columns == ["col5"]
Пример #10
0
def test_fillna(request, columns, derived_columns, expected_df):
    """Apply ``FillNA`` with ``value=1`` and compare the resulting frame.

    The dataset is built from a mocked CSV reader; the operation must return
    a new ``Dataset`` whose data equals ``expected_df``. The arguments are
    expected to come from a parametrization not visible in this block.
    """
    nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
    csv_reader_mock = function_mock(request, "trousse.dataset.get_df_from_csv")
    csv_reader_mock.return_value = nan_df
    source_dataset = Dataset(data_file="fake/path0")
    operation = fop.FillNA(
        columns=columns, derived_columns=derived_columns, value=1
    )

    result = operation(source_dataset)

    assert result is not source_dataset
    pd.testing.assert_frame_equal(result.data, expected_df)
Пример #11
0
    def it_knows_how_to_track_history(
        self,
        request,
        metadata_cols,
        derived_columns,
        expected_metadata_cols,
    ):
        """``track_history`` updates ``metadata_cols`` and appends the operation.

        ``OperationsList.__iadd__`` is mocked, so the test only checks that it
        is called once with the tracked operation. The arguments are expected
        to come from a parametrization not visible in this block.
        """
        iadd_mock = method_mock(request, OperationsList, "__iadd__")

        generic_df = DataFrameMock.df_generic(10)
        csv_reader_mock = function_mock(
            request, "trousse.dataset.get_df_from_csv"
        )
        csv_reader_mock.return_value = generic_df
        dataset = Dataset(data_file="fake/path", metadata_cols=metadata_cols)
        operation = fop.FillNA(
            columns=["metadata_num_col"],
            derived_columns=derived_columns,
            value=0,
        )

        dataset.track_history(operation)

        assert dataset.metadata_cols == expected_metadata_cols
        # ANY stands in for the ``OperationsList`` instance passed as ``self``.
        iadd_mock.assert_called_once_with(ANY, operation)
Пример #12
0
    def it_can_fillna_with_template_call(self, request):
        """Calling a ``FillNA`` delegates to ``_apply`` and then tracks history.

        ``_apply`` and ``Dataset.track_history`` are both mocked; the call must
        return exactly the dataset that ``_apply`` produced.
        """
        apply_mock = method_mock(request, fop.FillNA, "_apply")
        track_history_mock = method_mock(request, Dataset, "track_history")
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_reader_mock = function_mock(
            request, "trousse.dataset.get_df_from_csv"
        )
        csv_reader_mock.return_value = nan_df
        input_dataset = Dataset(data_file="fake/path0")
        applied_dataset = Dataset(data_file="fake/path0")
        apply_mock.return_value = applied_dataset
        operation = fop.FillNA(
            columns=["nan_0"], derived_columns=["filled_nan_0"], value=0
        )

        result = operation(input_dataset)

        apply_mock.assert_called_once_with(operation, input_dataset)
        track_history_mock.assert_called_once_with(result, operation)
        assert result is applied_dataset
Пример #13
0
    def test_track_history_with_no_derived_columns(
        self,
        request,
        metadata_columns,
        original_columns,
        expected_metadata_cols,
    ):
        """Track a ``FillNA`` that has no derived columns.

        The operation must be retrievable from the history under each of its
        original columns, and ``metadata_cols`` must match the expectation.
        The arguments are expected to come from a parametrization not visible
        in this block.
        """
        dataset = Dataset(
            df_object=DataFrameMock.df_generic(10),
            metadata_cols=metadata_columns,
        )
        operation = fop.FillNA(
            columns=original_columns, derived_columns=None, value=0
        )

        dataset.track_history(operation)

        # Check if the operation is added to each column
        for source_column in original_columns:
            assert operation in dataset.operations_history[source_column]
        assert dataset.metadata_cols == expected_metadata_cols
Пример #14
0
class DescribeTrousse:
    """Unit tests for ``fop.Trousse``."""

    @pytest.mark.parametrize(
        "operations",
        [
            (),
            (fop.FillNA(columns=["nan"], value=0),),
            (
                fop.ReplaceStrings(
                    columns=["exam_num_col_0"],
                    derived_columns=["replaced_exam_num_col_0"],
                    replacement_map={"a": "b"},
                ),
                fop.FillNA(columns=["nan"], value=0),
            ),
        ],
    )
    def it_contructs_from_args(self, request, operations):
        """The initializer receives the operations as positional arguments."""
        init_mock = initializer_mock(request, fop.Trousse)

        pipeline = fop.Trousse(*operations)

        # ANY stands in for the instance passed as ``self``.
        init_mock.assert_called_once_with(ANY, *operations)
        assert isinstance(pipeline, fop.Trousse)

    def it_knows_its_operations(
        self,
        replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
        fillna_col0_col1,
    ):
        """``operations`` exposes the constructor arguments as a tuple."""
        replace_op = replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b
        pipeline = fop.Trousse(replace_op, fillna_col0_col1)

        stored = pipeline.operations

        assert type(stored) is tuple
        assert stored == (replace_op, fillna_col0_col1)

    def it_knows_how_to_call(
        self,
        request,
        replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b,
        fillna_col0_col1,
    ):
        """Calling a ``Trousse`` feeds each operation the previous output.

        ``__call__`` is mocked on both operation classes: the first operation
        must receive the input dataset, the second the first one's result, and
        the second one's result is what the ``Trousse`` returns.
        """
        replace_op = replacestrings_exam_num_col_0_replaced_exam_num_col_0_a_b
        initial_dataset = instance_mock(request, Dataset, "in")
        after_replace = instance_mock(request, Dataset, "1")
        after_fillna = instance_mock(request, Dataset, "2")
        replace_call_mock = method_mock(
            request, fop.ReplaceStrings, "__call__"
        )
        replace_call_mock.return_value = after_replace
        fillna_call_mock = method_mock(request, fop.FillNA, "__call__")
        fillna_call_mock.return_value = after_fillna
        pipeline = fop.Trousse(replace_op, fillna_col0_col1)

        result = pipeline(initial_dataset)

        replace_call_mock.assert_called_once_with(replace_op, initial_dataset)
        fillna_call_mock.assert_called_once_with(
            fillna_col0_col1, after_replace
        )
        assert isinstance(result, Dataset)
        assert result == after_fillna

    def it_knows_its_str(self, fillna_col0_col1, fillna_col1_col4):
        """``str()`` renders "Trousse: " followed by the operations tuple."""
        pipeline = fop.Trousse(fillna_col0_col1, fillna_col1_col4)
        expected_text = (
            "Trousse: (FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\t"
            "derived_columns=['col1'],\n), FillNA(\n\tcolumns=['col1']"
            ",\n\tvalue=0,\n\tderived_columns=['col4'],\n))"
        )

        rendered = str(pipeline)

        assert type(rendered) is str
        assert rendered == expected_text
Пример #15
0
class DescribeFillNa:
    """Unit tests for ``fop.FillNA``."""

    def it_construct_from_args(self, request):
        """The initializer receives the keyword arguments unchanged."""
        init_mock = initializer_mock(request, fop.FillNA)

        operation = fop.FillNA(
            columns=["nan"], derived_columns=["filled"], value=0
        )

        # ANY stands in for the instance passed as ``self``.
        init_mock.assert_called_once_with(
            ANY, columns=["nan"], derived_columns=["filled"], value=0
        )
        assert isinstance(operation, fop.FillNA)

    def and_it_validates_its_arguments(self, request):
        """Construction calls both single-element validators exactly once."""
        columns_validator_mock = method_mock(
            request, fop.FillNA, "_validate_single_element_columns"
        )
        derived_validator_mock = method_mock(
            request, fop.FillNA, "_validate_single_element_derived_columns"
        )

        operation = fop.FillNA(
            columns=["nan"], derived_columns=["filled"], value=0
        )

        columns_validator_mock.assert_called_once_with(operation, ["nan"])
        derived_validator_mock.assert_called_once_with(operation, ["filled"])

    @pytest.mark.parametrize(
        "columns, derived_columns, expected_new_columns, expected_inplace",
        [
            (["nan_0"], ["filled_nan_0"], ["filled_nan_0"], False),
            (["nan_0"], None, [], True),
        ],
    )
    def it_can_apply_fillna(
        self,
        request,
        columns,
        derived_columns,
        expected_new_columns,
        expected_inplace,
    ):
        """Check ``_apply`` on a dataset loaded through a mocked CSV reader.

        ``pd.Series.fillna`` is replaced by a mock, so the test verifies how it
        is called (once per input column, on the first input column, with the
        expected ``inplace`` flag) rather than the filled values.
        """
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_reader_mock = function_mock(
            request, "trousse.dataset.get_df_from_csv"
        )
        csv_reader_mock.return_value = nan_df
        dataset = Dataset(data_file="fake/path0")
        series_fillna_mock = method_mock(request, pd.Series, "fillna")
        series_fillna_mock.return_value = pd.Series([0] * 100)
        operation = fop.FillNA(
            columns=columns, derived_columns=derived_columns, value=0
        )

        result = operation._apply(dataset)

        assert result is not None
        assert result is not dataset
        assert isinstance(result, Dataset)
        for new_column in expected_new_columns:
            assert new_column in result.data.columns
        csv_reader_mock.assert_called_once_with("fake/path0")
        assert series_fillna_mock.call_count == len(columns)
        # A recorded call unpacks into its positional and keyword arguments.
        first_args, first_kwargs = series_fillna_mock.call_args_list[0]
        pd.testing.assert_series_equal(first_args[0], nan_df[columns[0]])
        assert first_kwargs == {"inplace": expected_inplace}

    def it_can_fillna_with_template_call(self, request):
        """Calling a ``FillNA`` delegates to ``_apply`` and then tracks history.

        ``_apply`` and ``Dataset.track_history`` are both mocked; the call must
        return exactly the dataset that ``_apply`` produced.
        """
        apply_mock = method_mock(request, fop.FillNA, "_apply")
        track_history_mock = method_mock(request, Dataset, "track_history")
        nan_df = DataFrameMock.df_many_nans(nan_ratio=0.5, n_columns=3)
        csv_reader_mock = function_mock(
            request, "trousse.dataset.get_df_from_csv"
        )
        csv_reader_mock.return_value = nan_df
        input_dataset = Dataset(data_file="fake/path0")
        applied_dataset = Dataset(data_file="fake/path0")
        apply_mock.return_value = applied_dataset
        operation = fop.FillNA(
            columns=["nan_0"], derived_columns=["filled_nan_0"], value=0
        )

        result = operation(input_dataset)

        apply_mock.assert_called_once_with(operation, input_dataset)
        track_history_mock.assert_called_once_with(result, operation)
        assert result is applied_dataset

    @pytest.mark.parametrize(
        "other, expected_equal",
        [
            (
                fop.FillNA(columns=["col0"], derived_columns=["col1"], value=0),
                True,
            ),
            (
                fop.FillNA(columns=["col9"], derived_columns=["col1"], value=0),
                False,
            ),
            (
                fop.FillNA(columns=["col0"], derived_columns=["col2"], value=1),
                False,
            ),
            ({}, False),
        ],
    )
    def it_knows_if_equal(self, other, expected_equal):
        """Compare a reference ``FillNA`` (col0 -> col1, value 0) with ``other``."""
        reference = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )

        outcome = reference == other

        assert type(outcome) is bool
        assert outcome == expected_equal

    def it_knows_its_str(self):
        """``str()`` lists columns, value and derived columns."""
        operation = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )
        expected_text = (
            "FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\tderived_columns=['col1'],\n)"
        )

        rendered = str(operation)

        assert type(rendered) is str
        assert rendered == expected_text
Пример #16
0
class DescribeOperationsList:
    """Unit tests for ``OperationsList``."""

    def it_can_construct_itself(self, request):
        """The initializer is called once with no arguments besides ``self``."""
        init_mock = initializer_mock(request, OperationsList)

        created = OperationsList()

        init_mock.assert_called_once_with(ANY)
        assert isinstance(created, OperationsList)

    def it_can_iadd_first_featop(self, request, fillna_col0_col1):
        """``+=`` on an empty list stores the operation under both its columns."""
        operations_list = OperationsList()

        operations_list += fillna_col0_col1

        assert operations_list._operations_list == [fillna_col0_col1]
        by_column = operations_list._operations_by_column
        for column in ("col0", "col1"):
            assert by_column[column] == [fillna_col0_col1]

    def it_can_iadd_next_featop(self, request, fillna_col0_col1,
                                fillna_col4_none):
        """``+=`` on a pre-filled list appends without touching other columns."""
        operations_list = OperationsList()
        operations_list._operations_list = [fillna_col0_col1]
        for column in ("col0", "col1"):
            # A fresh list per column, so the entries never share storage.
            operations_list._operations_by_column[column] = [fillna_col0_col1]

        operations_list += fillna_col4_none

        assert operations_list._operations_list == [
            fillna_col0_col1,
            fillna_col4_none,
        ]
        by_column = operations_list._operations_by_column
        assert by_column["col0"] == [fillna_col0_col1]
        assert by_column["col1"] == [fillna_col0_col1]
        assert by_column["col4"] == [fillna_col4_none]

    def it_can_getitem_from_int(self, fillna_col0_col1, fillna_col1_col4):
        """Integer subscripts return the operation at that position."""
        operations_list = OperationsList()
        operations_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        column_index = {
            "col0": [fillna_col0_col1],
            "col1": [fillna_col0_col1, fillna_col1_col4],
            "col4": [fillna_col1_col4],
        }
        for column, column_operations in column_index.items():
            operations_list._operations_by_column[column] = column_operations

        first = operations_list[0]
        second = operations_list[1]

        assert isinstance(first, fop.FillNA)
        assert isinstance(second, fop.FillNA)
        assert first == fillna_col0_col1
        assert second == fillna_col1_col4

    def it_can_getitem_from_str(self, fillna_col0_col1, fillna_col1_col4):
        """String subscripts return the list of operations for that column."""
        operations_list = OperationsList()
        operations_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        column_index = {
            "col0": [fillna_col0_col1],
            "col1": [fillna_col0_col1, fillna_col1_col4],
            "col4": [fillna_col1_col4],
        }
        for column, column_operations in column_index.items():
            operations_list._operations_by_column[column] = column_operations

        for_col0 = operations_list["col0"]
        for_col1 = operations_list["col1"]

        assert isinstance(for_col0, list)
        assert isinstance(for_col1, list)
        assert for_col0 == [fillna_col0_col1]
        assert for_col1 == [fillna_col0_col1, fillna_col1_col4]

    def but_it_raisestypeerror_with_wrong_type(self):
        """Subscripting with a ``set`` raises ``TypeError`` naming the type."""
        operations_list = OperationsList()
        unsupported_label = {"wrong"}

        with pytest.raises(TypeError) as exc_info:
            operations_list[unsupported_label]

        raised = exc_info.value
        assert isinstance(raised, TypeError)
        assert "Cannot get FeatureOperation with a label of type set" == str(
            raised
        )

    @pytest.mark.parametrize(
        "column, operations_from_original_column_return_value, "
        "expected_derived_columns",
        [
            (
                "col0",
                [
                    fop.FillNA(
                        columns=["col0"], derived_columns=["col1"], value=0
                    ),
                ],
                ["col1"],
            ),
            (
                "col1",
                [
                    fop.FillNA(
                        columns=["col1"], derived_columns=["col4"], value=0
                    ),
                    fop.FillNA(
                        columns=["col1"], derived_columns=["col2"], value=0
                    ),
                ],
                ["col4", "col2"],
            ),
            (
                "col4",
                [
                    fop.FillNA(
                        columns=["col4"], derived_columns=None, value=0
                    ),
                ],
                [],
            ),
        ],
    )
    def it_can_get_derived_columns_from_col(
        self,
        request,
        column,
        operations_from_original_column_return_value,
        expected_derived_columns,
    ):
        """Collect the derived columns of the operations applied to ``column``.

        ``operations_from_original_column`` is mocked to return the
        parametrized operations.
        """
        operations_list = OperationsList()
        original_lookup_mock = method_mock(
            request, OperationsList, "operations_from_original_column"
        )
        original_lookup_mock.return_value = (
            operations_from_original_column_return_value
        )

        derived = operations_list.derived_columns_from_col(column)

        assert type(derived) is list
        assert derived == expected_derived_columns

    def it_can_get_operations_from_derived_column(self, request):
        """Look up the operations that produced "col4" as a derived column.

        ``__getitem__`` is mocked to return three operations; only the one
        listing "col4" in ``derived_columns`` is expected back.
        """
        operations_list = OperationsList()
        getitem_mock = method_mock(request, OperationsList, "__getitem__")
        col4_to_col1 = fop.FillNA(
            columns=["col4"], derived_columns=["col1"], value=0
        )
        col1_to_col4 = fop.FillNA(
            columns=["col1"], derived_columns=["col4"], value=0
        )
        col4_no_derived = fop.FillNA(
            columns=["col4"], derived_columns=None, value=0
        )
        getitem_mock.return_value = [col4_to_col1, col1_to_col4, col4_no_derived]

        found = operations_list.operations_from_derived_column("col4")

        assert type(found) is list
        assert found == [col1_to_col4]

    def it_can_get_operations_from_original_column(self, request):
        """Look up the operations that take "col4" as an input column.

        ``__getitem__`` is mocked to return three operations; the two listing
        "col4" in ``columns`` are expected back.
        """
        operations_list = OperationsList()
        getitem_mock = method_mock(request, OperationsList, "__getitem__")
        col4_to_col1 = fop.FillNA(
            columns=["col4"], derived_columns=["col1"], value=0
        )
        col1_to_col4 = fop.FillNA(
            columns=["col1"], derived_columns=["col4"], value=0
        )
        col4_no_derived = fop.FillNA(
            columns=["col4"], derived_columns=None, value=0
        )
        getitem_mock.return_value = [col4_to_col1, col1_to_col4, col4_no_derived]

        found = operations_list.operations_from_original_column("col4")

        assert type(found) is list
        assert found == [col4_to_col1, col4_no_derived]

    def it_can_get_original_columns_from_derived_column(self, request):
        """Resolve "col1" back to the input columns of the operation behind it."""
        operations_list = OperationsList()
        derived_lookup_mock = method_mock(
            request, OperationsList, "operations_from_derived_column"
        )
        generating_operation = fop.FillNA(
            columns=["col0"], derived_columns=["col1"], value=0
        )
        derived_lookup_mock.return_value = [generating_operation]

        sources = operations_list.original_columns_from_derived_column("col1")

        assert type(sources) is list
        assert sources == ["col0"]

    def but_it_raises_runtimeerror_with_multiple_operations_found(
        self, request
    ):
        """Expect a ``RuntimeError`` when two operations generated "col1"."""
        operations_list = OperationsList()
        derived_lookup_mock = method_mock(
            request, OperationsList, "operations_from_derived_column"
        )
        derived_lookup_mock.return_value = [
            fop.FillNA(columns=[source], derived_columns=["col1"], value=0)
            for source in ("col0", "col2")
        ]
        expected_message = (
            "Multiple FeatureOperation found that generated column "
            "col1... the pipeline is compromised"
        )

        with pytest.raises(RuntimeError) as exc_info:
            operations_list.original_columns_from_derived_column("col1")

        raised = exc_info.value
        assert isinstance(raised, RuntimeError)
        assert expected_message == str(raised)

    def but_it_raises_runtimeerror_with_zero_operations_found(self, request):
        """Expect a ``RuntimeError`` when no operation generated "col1"."""
        operations_list = OperationsList()
        derived_lookup_mock = method_mock(
            request, OperationsList, "operations_from_derived_column"
        )
        derived_lookup_mock.return_value = []
        expected_message = (
            "No FeatureOperation found that generated column "
            "col1... the pipeline is compromised"
        )

        with pytest.raises(RuntimeError) as exc_info:
            operations_list.original_columns_from_derived_column("col1")

        raised = exc_info.value
        assert isinstance(raised, RuntimeError)
        assert expected_message == str(raised)

    def it_knows_its_len(self, fillna_col0_col1, fillna_col1_col4):
        """``len()`` reports the number of stored operations."""
        operations_list = OperationsList()
        operations_list._operations_list = [fillna_col0_col1, fillna_col1_col4]

        size = len(operations_list)

        assert type(size) is int
        assert size == 2

    def it_can_be_iterated_over(self, fillna_col0_col1, fillna_col1_col4,
                                fillna_col1_col2):
        """Iterating yields the stored operations in order."""
        stored = [fillna_col0_col1, fillna_col1_col4, fillna_col1_col2]
        operations_list = OperationsList()
        operations_list._operations_list = stored

        visited = [operation for operation in operations_list]

        assert isinstance(operations_list, Iterable)
        assert visited == [
            fillna_col0_col1,
            fillna_col1_col4,
            fillna_col1_col2,
        ]

    @pytest.mark.parametrize(
        "other_operation_list, expected_equal",
        [
            (
                [
                    fop.FillNA(
                        columns=["col0"], derived_columns=["col1"], value=0
                    ),
                    fop.FillNA(
                        columns=["col1"], derived_columns=["col4"], value=0
                    ),
                ],
                True,
            ),
            (
                [
                    fop.FillNA(
                        columns=["col0"], derived_columns=["col1"], value=0
                    ),
                ],
                False,
            ),
            ([], False),
        ],
    )
    def it_knows_if_equal(self, other_operation_list, expected_equal,
                          fillna_col0_col1, fillna_col1_col4):
        """Compare two ``OperationsList`` objects through their operations."""
        reference = OperationsList()
        reference._operations_list = [fillna_col0_col1, fillna_col1_col4]
        candidate = OperationsList()
        candidate._operations_list = other_operation_list

        outcome = reference == candidate

        assert type(outcome) is bool
        assert outcome == expected_equal

    def it_knows_not_equal_not_operations_list(self):
        """A ``dict`` compared with an ``OperationsList`` is not equal."""
        outcome = {} == OperationsList()

        assert type(outcome) is bool
        assert outcome is False

    def it_knows_its_str(self, fillna_col0_col1, fillna_col1_col4):
        """``str()`` renders the stored operations like a list literal."""
        operations_list = OperationsList()
        operations_list._operations_list = [fillna_col0_col1, fillna_col1_col4]
        expected_text = (
            "[FillNA(\n\tcolumns=['col0'],\n\tvalue=0,\n\tderived_columns=['col1'],\n),"
            " FillNA(\n\tcolumns=['col1'],\n\tvalue=0,\n\tderived_columns=['col4'],\n)]"
        )

        rendered = str(operations_list)

        assert type(rendered) is str
        assert rendered == expected_text
Пример #17
0
def fillna_col1_col2():
    """Build a ``FillNA`` that reads "col1" and derives "col2" with value 0.

    NOTE(review): presumably registered as a pytest fixture; the decorator is
    not visible in this snippet.
    """
    operation = fop.FillNA(
        columns=["col1"],
        derived_columns=["col2"],
        value=0,
    )
    return operation
Пример #18
0
def fillna_col4_none():
    """Build a ``FillNA`` on "col4" with value 0 and no derived columns.

    NOTE(review): presumably registered as a pytest fixture; the decorator is
    not visible in this snippet.
    """
    operation = fop.FillNA(
        columns=["col4"],
        derived_columns=None,
        value=0,
    )
    return operation