import os
import unittest
from unittest.mock import MagicMock, call, mock_open, patch

import pandas as pd
import xarray as xr
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# NOTE: the exact import locations below are assumed from the patch targets used
# in these tests and may differ slightly between pyWATTS versions.
from pywatts.core.computation_mode import ComputationMode
from pywatts.core.pipeline import Pipeline
from pywatts.core.run_setting import RunSetting
from pywatts.core.start_step import StartStep
from pywatts.core.step import Step
from pywatts.modules import MissingValueDetector, SKLearnWrapper
from pywatts.summaries import RMSE

# `pipeline_json` is the expected serialized form of the pipeline used by the
# (de)serialization tests; it is defined alongside this test module and is not
# reproduced here.

class TestPipeline(unittest.TestCase):
    @patch("pywatts.core.pipeline.FileManager")
    def setUp(self, fm_mock) -> None:
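        # FileManager is patched only while setUp runs, so the Pipeline created
        # here already holds the mocked file manager.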
        self.fm_mock = fm_mock()
        self.pipeline = Pipeline()

    def tearDown(self) -> None:
        self.pipeline = None

    def test_add_input_as_positional(self):
        # Should fail with a better error message
        SKLearnWrapper(LinearRegression())(x=self.pipeline["input"])

    def test_add_only_module(self):
        SKLearnWrapper(LinearRegression())(x=self.pipeline["input"])
        # One module step plus one start step
        self.assertEqual(len(self.pipeline.id_to_step), 2)

    def test_add_module_which_is_not_in_a_list(self):
        wrapper = SKLearnWrapper(
            LinearRegression())(input=self.pipeline["input"])
        SKLearnWrapper(LinearRegression())(x=wrapper)
        # Two module steps plus one start step
        self.assertEqual(len(self.pipeline.id_to_step), 3)

    def test_add_pipeline_without_index(self):
        # This should raise an exception, since a bare pipeline input might provide multiple columns of the input DataFrame
        with self.assertRaises(Exception) as context:
            SKLearnWrapper(StandardScaler())(
                x=self.pipeline)  # This should fail
        self.assertEqual(
            "Adding a pipeline as input might be ambigious. Specifiy the desired column of your dataset by using pipeline[<column_name>]",
            str(context.exception))

    def test_add_module_with_inputs(self):
        scaler1 = SKLearnWrapper(StandardScaler())(x=self.pipeline["x"])
        scaler2 = SKLearnWrapper(StandardScaler())(x=self.pipeline["test1"])
        SKLearnWrapper(LinearRegression())(input_1=scaler1, input_2=scaler2)

        # Three module steps plus two start steps (x and test1)
        self.assertEqual(5, len(self.pipeline.id_to_step))

    def test_add_module_with_one_input_without_a_list(self):
        scaler = SKLearnWrapper(StandardScaler())(input=self.pipeline["test"])
        SKLearnWrapper(LinearRegression())(input=scaler)

        # Two module steps plus one start step
        self.assertEqual(3, len(self.pipeline.id_to_step))

    @patch('pywatts.core.pipeline.FileManager')
    @patch('pywatts.core.pipeline.json')
    @patch("builtins.open", new_callable=mock_open)
    def test_to_folder(self, mock_file, json_mock, fm_mock):
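        # to_folder should pickle both wrapped sklearn modules and write the
        # pipeline description to pipeline.json.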
        scaler = SKLearnWrapper(StandardScaler())(input=self.pipeline["input"])
        SKLearnWrapper(LinearRegression())(x=scaler)
        fm_mock_object = MagicMock()
        fm_mock.return_value = fm_mock_object
        fm_mock_object.get_path.side_effect = [
            os.path.join('test_pipeline', 'StandardScaler.pickle'),
            os.path.join('test_pipeline', 'LinearRegression.pickle'),
            os.path.join('test_pipeline', 'pipeline.json'),
        ]

        self.pipeline.to_folder("test_pipeline")

        calls_open = [
            call(os.path.join('test_pipeline', 'StandardScaler.pickle'), 'wb'),
            call(os.path.join('test_pipeline', 'LinearRegression.pickle'),
                 'wb'),
            call(os.path.join('test_pipeline', 'pipeline.json'), 'w')
        ]
        mock_file.assert_has_calls(calls_open, any_order=True)
        args, kwargs = json_mock.dump.call_args
        assert kwargs["obj"]["id"] == pipeline_json["id"]
        assert kwargs["obj"]["name"] == pipeline_json["name"]

        assert kwargs["obj"]["modules"] == pipeline_json["modules"]
        assert kwargs["obj"]["steps"] == pipeline_json["steps"]

    @patch('pywatts.core.pipeline.FileManager')
    @patch('pywatts.modules.sklearn_wrapper.pickle')
    @patch('pywatts.core.pipeline.json')
    @patch("builtins.open", new_callable=mock_open)
    @patch('pywatts.core.pipeline.os.path.isdir')
    def test_from_folder(self, isdir_mock, mock_file, json_mock, pickle_mock,
                         fm_mock):
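        # from_folder should unpickle both modules and rebuild the steps
        # described in pipeline_json.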
        scaler = StandardScaler()
        linear_regression = LinearRegression()

        isdir_mock.return_value = True
        json_mock.load.return_value = pipeline_json

        pickle_mock.load.side_effect = [scaler, linear_regression]

        pipeline = Pipeline.from_folder("test_pipeline")
        calls_open = [
            call(os.path.join("test_pipeline", "StandardScaler.pickle"), "rb"),
            call(os.path.join("test_pipeline", "LinearRegression.pickle"),
                 "rb"),
            call(os.path.join("test_pipeline", "pipeline.json"), "r")
        ]

        mock_file.assert_has_calls(calls_open, any_order=True)

        json_mock.load.assert_called_once()
        assert pickle_mock.load.call_count == 2

        isdir_mock.assert_called_once()
        self.assertEqual(3, len(pipeline.id_to_step))

    def test_module_naming_conflict(self):
        # This test should check that modules with the same name do not lead to an error.
        # TODO: What exactly should this test assert?
        # self.fail()
        pass

    def test_add_with_target(self):
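        # One module step plus two start steps (input and target)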
        SKLearnWrapper(LinearRegression())(input=self.pipeline["input"],
                                           target=self.pipeline["target"])
        self.assertEqual(3, len(self.pipeline.id_to_step))

    def test_multiple_same_module(self):
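        # Reusing the same module instance creates separate steps that share a
        # single module object.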
        reg_module = SKLearnWrapper(module=LinearRegression())
        reg_one = reg_module(x=self.pipeline["test"],
                             target=self.pipeline["target"])
        reg_two = reg_module(x=self.pipeline["test2"],
                             target=self.pipeline["target"])
        detector = MissingValueDetector()
        detector(dataset=reg_one)
        detector(dataset=reg_two)

        # Three start steps (test, test2, target), two regressor steps, and two detector steps
        self.assertEqual(7, len(self.pipeline.id_to_step))
        modules = []
        for element in self.pipeline.id_to_step.values():
            if isinstance(element, Step) and element.module not in modules:
                modules.append(element.module)
        # One SKLearn wrapper and one missing value detector
        self.assertEqual(2, len(modules))

        self.pipeline.train(
            pd.DataFrame(
                {
                    "test": [1, 2, 2, 3, 4],
                    "test2": [2, 2, 2, 2, 2],
                    "target": [2, 2, 4, 4, -5]
                },
                index=pd.DatetimeIndex(
                    pd.date_range('2000-01-01', freq='24H', periods=5))))

    @patch('pywatts.core.pipeline.Pipeline._create_summary')
    @patch('pywatts.core.pipeline.FileManager')
    def test_add_pipeline_to_pipeline_and_train(self, fm_mock,
                                                create_summary_mock):
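        # Training the outer pipeline should run all steps, including the
        # sub-pipeline, in FitTransform mode and create a summary for both
        # pipelines.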
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()

        detector(dataset=sub_pipeline["regression"])

        regressor = SKLearnWrapper(LinearRegression(), name="regression")(
            x=self.pipeline["test"], target=self.pipeline["target"])
        sub_pipeline(regression=regressor)

        summary_formatter_mock = MagicMock()
        data = pd.DataFrame(
            {"test": [24, 24], "target": [12, 24]},
            index=pd.to_datetime(['2015-06-03 00:00:00', '2015-06-03 01:00:00']))
        self.pipeline.train(data, summary_formatter=summary_formatter_mock)

        for step in self.pipeline.id_to_step.values():
            assert step.current_run_setting.computation_mode == ComputationMode.FitTransform

        create_summary_mock.assert_has_calls(
            [call(summary_formatter_mock),
             call(summary_formatter_mock)])

    @patch('pywatts.core.pipeline.FileManager')
    def test_add_pipeline_to_pipeline_and_test(self, fm_mock):
        # Add some steps to the pipeline

        # Assert that the computation is set to fit_transform if the ComputationMode was default

        step = MagicMock()
        step.computation_mode = ComputationMode.Default
        step.finished = False
        time = pd.date_range('2000-01-01', freq='24H', periods=7)

        ds = xr.Dataset({'foo': ('time', [2, 3, 4, 5, 6, 7, 8]), 'time': time})

        subpipeline = Pipeline()
        subpipeline.add(module=step)

        # BUG: In step_factory.py -> create_step the file_manager of the pipeline is accessed
        # and the pipeline is None...
        # subpipeline(self.pipeline)

        # self.pipeline.test(ds)

        # step.set_computation_mode.assert_called_once_with(ComputationMode.Transform)

        # step.reset.assert_called_once()

    @patch("pywatts.core.pipeline.FileManager")
    @patch('pywatts.core.pipeline.json')
    @patch("builtins.open", new_callable=mock_open)
    def test_add_pipeline_to_pipeline_and_save(self, open_mock, json_mock,
                                               fm_mock):
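        # A pipeline containing a sub-pipeline should be serialized into two
        # pipeline descriptions, hence two json.dump calls.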
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()
        detector(dataset=sub_pipeline["regressor"])

        regressor = SKLearnWrapper(LinearRegression())(x=self.pipeline["test"])
        sub_pipeline(regression=regressor)

        self.pipeline.to_folder(path="path")

        self.assertEqual(json_mock.dump.call_count, 2)

    def create_summary_in_subpipelines(self):
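        # Placeholder: lacks the test_ prefix, so unittest does not collect it yet.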
        assert False

    @patch('pywatts.core.pipeline.FileManager')
    def test__collect_batch_results_naming_conflict(self, fm_mock):
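        # Two steps share the name "step"; the second result is expected under
        # the suffixed key "step_1" instead of overwriting the first.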
        step_one = MagicMock()
        step_one.name = "step"
        step_two = MagicMock()
        step_two.name = "step"
        result_step_one = MagicMock()
        result_step_two = MagicMock()
        merged_result = {"step": result_step_one, "step_1": result_step_two}

        step_one.get_result.return_value = {"step": result_step_one}
        step_two.get_result.return_value = {"step_1": result_step_two}

        result = self.pipeline._collect_results([step_one, step_two])

        # Assert that the steps are called correctly.
        step_one.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)
        step_two.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)

        # Assert return value is correct
        self.assertEqual(merged_result, result)

    @patch("pywatts.core.pipeline.FileManager")
    def test_get_params(self, fm_mock):
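        # get_params should expose the configured batch size.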
        result = Pipeline(batch=pd.Timedelta("1h")).get_params()
        self.assertEqual(result, {"batch": pd.Timedelta("1h")})

    def test_set_params(self):
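        # set_params should update the batch size.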
        self.pipeline.set_params(batch=pd.Timedelta("2h"))
        self.assertEqual(self.pipeline.get_params(),
                         {"batch": pd.Timedelta("2h")})

    def test__collect_batch_results(self):
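        # Results from different steps should be merged into one dictionary
        # keyed by step name.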
        step_one = MagicMock()
        step_one.name = "step_one"
        step_two = MagicMock()
        step_two.name = "step_two"
        result_step_one = MagicMock()
        result_step_two = MagicMock()
        merged_result = {
            "step_one": result_step_one,
            "step_two": result_step_two
        }

        step_one.get_result.return_value = {"step_one": result_step_one}
        step_two.get_result.return_value = {"step_two": result_step_two}

        result = self.pipeline._collect_results([step_one, step_two])

        # Assert that the steps are called correctly.
        step_one.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)
        step_two.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)

        # Assert return value is correct
        self.assertEqual(merged_result, result)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.xr.concat")
    def test_batched_pipeline(self, concat_mock, fm_mock):
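        # With a 24h batch over four days of data, every batch should be
        # requested from the step once and the partial results concatenated.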
        # Add some steps to the pipeline

        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        # Assert that the computation is set to fit_transform if the ComputationMode was default
        first_step = MagicMock()
        first_step.run_setting = RunSetting(ComputationMode.Default)
        first_step.finished = False
        first_step.further_elements.side_effect = [
            True, True, True, True, False
        ]

        first_step.get_result.return_value = {"one": da}
        self.pipeline.set_params(batch=pd.Timedelta("24h"))
        self.pipeline.add(module=first_step)

        data = pd.DataFrame({
            "test": [1, 2, 2, 3],
            "test2": [2, 2, 2, 2]
        },
                            index=pd.DatetimeIndex(
                                pd.date_range('2000-01-01',
                                              freq='24H',
                                              periods=4)))
        self.pipeline.test(data)

        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)
        calls = [
            call(pd.Timestamp('2000-01-01 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-02 00:00:00', freq='24H'),
                 return_all=True),
            call(pd.Timestamp('2000-01-02 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-03 00:00:00', freq='24H'),
                 return_all=True),
            call(pd.Timestamp('2000-01-03 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-04 00:00:00', freq='24H'),
                 return_all=True),
            call(pd.Timestamp('2000-01-04 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-05 00:00:00', freq='24H'),
                 return_all=True),
        ]
        first_step.get_result.assert_has_calls(calls, any_order=True)
        self.assertEqual(concat_mock.call_count, 3)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.xr.concat")
    def test_batch_2H_transform(self, concat_mock, fm_mock):
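        # A 2h batch over a 7-hour input yields four batches: four get_result
        # calls, five further_elements checks, and three concatenations.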
        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})
        pipeline = Pipeline(batch=pd.Timedelta("2h"))
        step_one = MagicMock()
        step_one.get_result.return_value = {"step": da}
        step_one.name = "step"
        result_mock = MagicMock()
        concat_mock.return_value = result_mock
        pipeline.start_steps["foo"] = StartStep("foo"), None
        pipeline.start_steps["foo"][0].last = False
        step_one.further_elements.side_effect = [True, True, True, True, False]
        pipeline.add(module=step_one, input_ids=[1])

        result = pipeline.transform(foo=da)

        self.assertEqual(concat_mock.call_count, 3)
        self.assertEqual(step_one.get_result.call_count, 4)
        self.assertEqual(step_one.further_elements.call_count, 5)
        self.assertEqual({"step": result_mock}, result)

    @patch('pywatts.core.pipeline.FileManager')
    @patch("pywatts.core.pipeline._get_time_indexes", return_value=["time"])
    def test_transform_pipeline(self, get_time_indexes_mock, fm_mock):
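        # Without a batch size, transform should request the result exactly
        # once, starting at the first time index of the input.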
        input_mock = MagicMock()
        input_mock.indexes = {"time": ["20.12.2020"]}
        step_two = MagicMock()
        result_mock = MagicMock()
        step_two.name = "mock"
        step_two.get_result.return_value = {"mock": result_mock}
        self.pipeline.add(module=step_two, input_ids=[1])

        result = self.pipeline.transform(x=input_mock)

        step_two.get_result.assert_called_once_with("20.12.2020",
                                                    None,
                                                    return_all=True)
        get_time_indexes_mock.assert_called_once_with({"x": input_mock})
        self.assertEqual({"mock": result_mock}, result)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.Pipeline.from_folder")
    def test_load(self, from_folder_mock, fm_mock):
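        # load should delegate to from_folder using the stored pipeline_path.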
        created_pipeline = MagicMock()
        from_folder_mock.return_value = created_pipeline
        pipeline = Pipeline.load({
            'name': 'Pipeline',
            'class': 'Pipeline',
            'module': 'pywatts.core.pipeline',
            'pipeline_path': 'save_path'
        })

        from_folder_mock.assert_called_once_with("save_path")
        self.assertEqual(created_pipeline, pipeline)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.Pipeline.to_folder")
    @patch("pywatts.core.pipeline.os")
    def test_save(self, os_mock, to_folder_mock, fm_mock):
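        # save should write the pipeline to <basic_path>/Pipeline and return
        # its JSON description including the batch size.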
        os_mock.path.join.return_value = "save_path"
        os_mock.path.isdir.return_value = False
        sub_pipeline = Pipeline(batch=pd.Timedelta("1h"))
        detector = MissingValueDetector()
        detector(dataset=sub_pipeline["test"])
        fm_mock = MagicMock()
        fm_mock.basic_path = "path_to_save"
        result = sub_pipeline.save(fm_mock)

        to_folder_mock.assert_called_once_with("save_path")
        os_mock.path.join.assert_called_once_with("path_to_save", "Pipeline")
        self.assertEqual(
            {
                'name': 'Pipeline',
                'class': 'Pipeline',
                'module': 'pywatts.core.pipeline',
                'params': {
                    'batch': '0 days 01:00:00'
                },
                'pipeline_path': 'save_path'
            }, result)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.xr.concat")
    def test_batch_1_transform(self, concat_mock, fm_mock):
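        # A 1h batch over a 7-hour input yields seven batches: seven get_result
        # calls, eight further_elements checks, and six concatenations.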
        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})
        pipeline = Pipeline(batch=pd.Timedelta("1h"))
        step_one = MagicMock()
        step_one.get_result.return_value = {"step": da}
        step_one.name = "step"
        result_mock = MagicMock()
        concat_mock.return_value = result_mock
        pipeline.start_steps["foo"] = StartStep("foo"), None
        pipeline.start_steps["foo"][0].last = False
        step_one.further_elements.side_effect = [
            True, True, True, True, True, True, True, False
        ]
        pipeline.add(module=step_one, input_ids=[1])

        result = pipeline.transform(foo=da)

        self.assertEqual(concat_mock.call_count, 6)
        self.assertEqual(step_one.get_result.call_count, 7)
        self.assertEqual(step_one.further_elements.call_count, 8)
        self.assertEqual({"step": result_mock}, result)

    @patch('pywatts.core.pipeline.FileManager')
    def test_test(self, fm_mock):
        # Add some steps to the pipeline

        # Assert that the computation is set to fit_transform if the ComputationMode was default
        first_step = MagicMock()
        first_step.computation_mode = ComputationMode.Default
        first_step.finished = False
        time = pd.date_range('2000-01-01', freq='1H', periods=7)

        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        first_step.get_result.return_value = {"first": da}
        second_step = MagicMock()
        second_step.computation_mode = ComputationMode.Train
        second_step.finished = False
        second_step.get_result.return_value = {"Second": da}

        self.pipeline.add(module=first_step)
        self.pipeline.add(module=second_step)

        self.pipeline.test(
            pd.DataFrame({
                "test": [1, 2, 2, 3, 4],
                "test2": [2, 2, 2, 2, 2]
            },
                         index=pd.DatetimeIndex(
                             pd.date_range('2000-01-01', freq='24H',
                                           periods=5))))

        first_step.get_result.assert_called_once_with(
            pd.Timestamp('2000-01-01 00:00:00', freq='24H'), None, return_all=True)
        second_step.get_result.assert_called_once_with(
            pd.Timestamp('2000-01-01 00:00:00', freq='24H'), None, return_all=True)

        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)
        second_step.set_run_setting.assert_called_once()
        self.assertEqual(
            second_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)

        first_step.reset.assert_called_once()
        second_step.reset.assert_called_once()

    @patch('pywatts.core.pipeline.FileManager')
    def test_train(self, fmmock):
        # Add some steps to the pipeline
        time = pd.date_range('2000-01-01', freq='1H', periods=7)

        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        # Assert that the computation is set to fit_transform if the ComputationMode was default
        first_step = MagicMock()
        first_step.computation_mode = ComputationMode.Default
        first_step.finished = False
        first_step.get_result.return_value = {"first": da}

        second_step = MagicMock()
        second_step.computation_mode = ComputationMode.Train
        second_step.finished = False
        second_step.get_result.return_value = {"second": da}

        self.pipeline.add(module=first_step)
        self.pipeline.add(module=second_step)

        data = pd.DataFrame({
            "test": [1, 2, 2, 3, 4],
            "test2": [2, 2, 2, 2, 2]
        },
                            index=pd.DatetimeIndex(
                                pd.date_range('2000-01-01',
                                              freq='24H',
                                              periods=5)))
        result, summary = self.pipeline.train(data, summary=True)

        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.FitTransform)
        second_step.set_run_setting.assert_called_once()
        self.assertEqual(
            second_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.FitTransform)

        first_step.get_result.assert_called_once_with(
            pd.Timestamp('2000-01-01 00:00:00', freq='24H'), None, return_all=True)
        second_step.get_result.assert_called_once_with(
            pd.Timestamp('2000-01-01 00:00:00', freq='24H'), None, return_all=True)

        first_step.reset.assert_called_once()
        second_step.reset.assert_called_once()
        xr.testing.assert_equal(result["second"], da)

    @patch("builtins.open", new_callable=mock_open)
    def test_horizon_greater_one_regression_inclusive_summary_file(
            self, open_mock):
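        # End-to-end training with a regressor that has a multi-dimensional
        # target plus an RMSE summary; the rendered summary should be written
        # to summary.md.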
        lin_reg = LinearRegression()
        self.fm_mock.get_path.return_value = "summary_path"

        multi_regressor = SKLearnWrapper(lin_reg)(
            foo=self.pipeline["foo"],
            target=self.pipeline["target"],
            target2=self.pipeline["target2"])
        RMSE()(y=self.pipeline["target"], prediction=multi_regressor["target"])

        time = pd.date_range('2000-01-01', freq='24H', periods=5)

        foo = xr.DataArray([1, 2, 3, 4, 5],
                           dims=["time"],
                           coords={'time': time})
        target = xr.DataArray([[2, 3], [2, 4], [2, 5], [2, 6], [2, 7]],
                              dims=["time", "horizon"],
                              coords={
                                  'time': time,
                                  "horizon": [1, 2]
                              })
        target2 = xr.DataArray([3, 3, 3, 3, 3],
                               dims=["time"],
                               coords={'time': time})

        ds = xr.Dataset({'foo': foo, "target": target, "target2": target2})

        result, summary = self.pipeline.train(ds, summary=True)

        self.assertTrue("Training Time" in summary)
        self.assertTrue("RMSE" in summary)

        self.fm_mock.get_path.assert_called_once_with("summary.md")
        open_mock().__enter__.return_value.write.assert_called_once_with(
            summary)

        self.assertTrue("target" in result.keys())