Example #1
    def test_release_at_earliest_opportunity(self, is_async):
        runner = ParallelRunner(is_async=is_async)
        log = runner._manager.list()

        pipeline = Pipeline([
            node(source, None, "first"),
            node(identity, "first", "second"),
            node(sink, "second", None),
        ])
        # pylint: disable=no-member
        catalog = DataCatalog({
            "first": runner._manager.LoggingDataSet(log, "first"),
            "second": runner._manager.LoggingDataSet(log, "second"),
        })
        runner.run(pipeline, catalog)

        # we want to see "release first" before "load second"
        assert list(log) == [
            ("load", "first"),
            ("release", "first"),
            ("load", "second"),
            ("release", "second"),
        ]
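These release-order tests rely on a few helpers that are not shown here. A minimal sketch of what they could look like, assuming source/identity/sink are trivial node functions and LoggingDataSet simply records ("load", name) / ("release", name) tuples in a shared list (the real fixtures in the test module may differ):

from kedro.io import AbstractDataSet


def source():
    # produces a dummy value for the first node
    return "stuff"


def identity(arg):
    # passes data through unchanged
    return arg


def sink(arg):  # pylint: disable=unused-argument
    # consumes data and produces no output
    pass


class LoggingDataSet(AbstractDataSet):
    """Assumed helper: records every load/release call in a shared log."""

    def __init__(self, log, name, value=None):
        self.log = log
        self.name = name
        self.value = value

    def _load(self):
        self.log.append(("load", self.name))
        return self.value

    def _save(self, data):
        self.value = data

    def _release(self):
        self.log.append(("release", self.name))
        self.value = None

    def _describe(self):
        return {"name": self.name}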
Example #2
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleLocalDataSet(
            filepath=str(tmp_path / "data.pkl"), backend="pickle"
        )
        catalog = DataCatalog(
            data_sets={
                "spark_in": spark_in,
                "pickle": pickle_data,
                "spark_out": spark_out,
            }
        )
        pipeline = Pipeline(
            [
                node(identity, "spark_in", "pickle"),
                node(identity, "pickle", "spark_out"),
            ]
        )
        runner = ParallelRunner()

        pattern = (
            r"The following data_sets cannot be "
            r"serialized: \['spark_in', 'spark_out'\]"
        )
        with pytest.raises(AttributeError, match=pattern):
            runner.run(pipeline, catalog)
Example #3
 def test_parallel_runner(self, spark_in, spark_out):
     """Test ParallelRunner with SparkDataSet load and save.
     """
     catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
     pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
     runner = ParallelRunner()
     result = runner.run(pipeline, catalog)
     # 'spark_out' is saved in 'tmp_path/input', so the result of run should be empty
     assert not result
Example #4
    def test_parallel_runner(self, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet load and save.
        """
        catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
        pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
        runner = ParallelRunner()

        pattern = (
            r"The following data_sets cannot be "
            r"serialized: \['spark_in', 'spark_out'\]"
        )
        with pytest.raises(AttributeError, match=pattern):
            runner.run(pipeline, catalog)
Example #5
    def test_release_at_earliest_opportunity(self):
        manager = ParallelRunnerManager()
        manager.start()
        log = manager.list()

        pipeline = Pipeline(
            [
                node(source, None, "first"),
                node(identity, "first", "second"),
                node(sink, "second", None),
            ]
        )
        catalog = DataCatalog(
            {
                "first": manager.LoggingDataSet(log, "first"),
                "second": manager.LoggingDataSet(log, "second"),
            }
        )
        ParallelRunner().run(pipeline, catalog)

        # we want to see "release first" before "load second"
        assert list(log) == [
            ("load", "first"),
            ("release", "first"),
            ("load", "second"),
            ("release", "second"),
        ]
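For manager.LoggingDataSet(...) to work, the dataset class has to be registered on ParallelRunnerManager before the manager is started, so that worker processes receive proxies to a single shared instance. A sketch of that wiring, assuming the LoggingDataSet helper sketched above (the registration call is the standard multiprocessing-manager pattern, not necessarily the project's exact code):

from kedro.runner.parallel_runner import ParallelRunnerManager

# Registering the class lets it be constructed inside the manager process;
# callers get back proxy objects that are safe to share across workers.
ParallelRunnerManager.register("LoggingDataSet", LoggingDataSet)

manager = ParallelRunnerManager()
manager.start()
log = manager.list()                          # process-safe shared list
first = manager.LoggingDataSet(log, "first")  # proxy to one shared dataset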
Example #6
    def test_on_node_error_hook_is_called_with_parallel_runner(
            self, tmp_path, mocker, logging_hooks):
        log_records = []

        class LogHandler(logging.Handler):  # pylint: disable=abstract-method
            def handle(self, record):
                log_records.append(record)

        broken_context_with_hooks = _create_broken_context_with_hooks(
            tmp_path, mocker, logging_hooks)
        mocker.patch(
            "kedro.framework.context.context.load_context",
            return_value=broken_context_with_hooks,
        )
        logs_queue_listener = QueueListener(logging_hooks.queue, LogHandler())
        logs_queue_listener.start()

        with pytest.raises(ValueError, match="broken"):
            broken_context_with_hooks.run(runner=ParallelRunner(max_workers=2),
                                          node_names=["node1", "node2"])
        logs_queue_listener.stop()

        on_node_error_records = [
            r for r in log_records if r.funcName == "on_node_error"
        ]
        assert len(on_node_error_records) == 2

        for call_record in on_node_error_records:
            self._assert_hook_call_record_has_expected_parameters(
                call_record,
                ["error", "node", "catalog", "inputs", "is_async", "run_id"],
            )
            expected_error = ValueError("broken")
            assert_exceptions_equal(call_record.error, expected_error)
Example #7
    def test_on_node_error_hook_parallel_runner(self, tmp_path, logging_hooks):
        session = KedroSession.create(MOCK_PACKAGE_NAME, tmp_path)
        log_records = []

        class LogHandler(logging.Handler):  # pylint: disable=abstract-method
            def handle(self, record):
                log_records.append(record)

        logs_queue_listener = QueueListener(logging_hooks.queue, LogHandler())
        logs_queue_listener.start()

        with pytest.raises(ValueError, match="broken"):
            try:
                session.run(runner=ParallelRunner(max_workers=2),
                            node_names=["node1", "node2"])
            finally:
                logs_queue_listener.stop()

        on_node_error_records = [
            r for r in log_records if r.funcName == "on_node_error"
        ]
        assert len(on_node_error_records) == 2

        for call_record in on_node_error_records:
            _assert_hook_call_record_has_expected_parameters(
                call_record,
                ["error", "node", "catalog", "inputs", "is_async", "run_id"],
            )
            expected_error = ValueError("broken")
            assert_exceptions_equal(call_record.error, expected_error)
Example #8
    def test_before_and_after_node_run_hooks_are_called_with_parallel_runner(
            self, context_with_hooks, dummy_dataframe, logs_queue):
        log_records = []

        class LogHandler(logging.Handler):  # pylint: disable=abstract-method
            def handle(self, record):
                log_records.append(record)

        logs_queue_listener = QueueListener(logs_queue, LogHandler())
        logs_queue_listener.start()
        context_with_hooks.catalog.save("cars", dummy_dataframe)
        context_with_hooks.catalog.save("boats", dummy_dataframe)
        context_with_hooks.run(runner=ParallelRunner(),
                               node_names=["node1", "node2"])
        logs_queue_listener.stop()

        before_node_run_log_records = [
            r for r in log_records if r.funcName == "before_node_run"
        ]
        assert len(before_node_run_log_records) == 2
        for record in before_node_run_log_records:
            assert record.getMessage() == "About to run node"
            assert record.node.name in ["node1", "node2"]
            assert set(record.inputs.keys()) <= {"cars", "boats"}

        after_node_run_log_records = [
            r for r in log_records if r.funcName == "after_node_run"
        ]
        assert len(after_node_run_log_records) == 2
        for record in after_node_run_log_records:
            assert record.getMessage() == "Ran node"
            assert record.node.name in ["node1", "node2"]
            assert set(record.outputs.keys()) <= {"planes", "ships"}
Example #9
    def test_before_and_after_dataset_saved_hooks_parallel_runner(
            self, mock_session, logs_listener, dummy_dataframe):
        context = mock_session.load_context()
        catalog = context.catalog
        catalog.save("cars", dummy_dataframe)
        catalog.save("boats", dummy_dataframe)

        mock_session.run(runner=ParallelRunner(),
                         node_names=["node1", "node2"])

        before_dataset_saved_log_records = [
            r for r in logs_listener.logs
            if r.funcName == "before_dataset_saved"
        ]
        assert len(before_dataset_saved_log_records) == 2
        for record in before_dataset_saved_log_records:
            assert record.getMessage() == "Before dataset saved"
            assert record.dataset_name in ["planes", "ships"]
            assert record.data.to_dict() == dummy_dataframe.to_dict()

        after_dataset_saved_log_records = [
            r for r in logs_listener.logs
            if r.funcName == "after_dataset_saved"
        ]
        assert len(after_dataset_saved_log_records) == 2
        for record in after_dataset_saved_log_records:
            assert record.getMessage() == "After dataset saved"
            assert record.dataset_name in ["planes", "ships"]
            assert record.data.to_dict() == dummy_dataframe.to_dict()
Example #10
    def test_before_and_after_node_run_hooks_parallel_runner(
            self, mock_session, logs_listener, dummy_dataframe):
        context = mock_session.load_context()
        catalog = context.catalog
        catalog.save("cars", dummy_dataframe)
        catalog.save("boats", dummy_dataframe)

        mock_session.run(runner=ParallelRunner(),
                         node_names=["node1", "node2"])

        before_node_run_log_records = [
            r for r in logs_listener.logs if r.funcName == "before_node_run"
        ]
        assert len(before_node_run_log_records) == 2
        for record in before_node_run_log_records:
            assert record.getMessage() == "About to run node"
            assert record.node.name in ["node1", "node2"]
            assert set(record.inputs.keys()) <= {"cars", "boats"}

        after_node_run_log_records = [
            r for r in logs_listener.logs if r.funcName == "after_node_run"
        ]
        assert len(after_node_run_log_records) == 2
        for record in after_node_run_log_records:
            assert record.getMessage() == "Ran node"
            assert record.node.name in ["node1", "node2"]
            assert set(record.outputs.keys()) <= {"planes", "ships"}
Example #11
 def test_memory_data_set_input(self, is_async, fan_out_fan_in):
     pipeline = Pipeline([fan_out_fan_in])
     catalog = DataCatalog({"A": MemoryDataSet("42")})
     result = ParallelRunner(is_async=is_async).run(pipeline, catalog)
     assert "Z" in result
     assert len(result["Z"]) == 3
     assert result["Z"] == ("42", "42", "42")
Example #12
    def test_specified_max_workers_bellow_cpu_cores_count(
        self,
        is_async,
        mocker,
        fan_out_fan_in,
        catalog,
        cpu_cores,
        user_specified_number,
        expected_number,
    ):  # pylint: disable=too-many-arguments
        """
        The system has 2 cores, but we initialize the runner with max_workers=4.
        `fan_out_fan_in` pipeline needs 3 processes.
        A pool with 3 workers should be used.
        """
        mocker.patch("os.cpu_count", return_value=cpu_cores)

        executor_cls_mock = mocker.patch(
            "kedro.runner.parallel_runner.ProcessPoolExecutor",
            wraps=ProcessPoolExecutor,
        )

        catalog.add_feed_dict(dict(A=42))
        result = ParallelRunner(max_workers=user_specified_number,
                                is_async=is_async).run(fan_out_fan_in, catalog)
        assert result == {"Z": (42, 42, 42)}

        executor_cls_mock.assert_called_once_with(max_workers=expected_number)
Example #13
 def test_decorate_pipeline(self, is_async, fan_out_fan_in, catalog):
     catalog.add_feed_dict(dict(A=42))
     result = ParallelRunner(is_async=is_async).run(
         fan_out_fan_in.decorate(log_time), catalog)
     assert "Z" in result
     assert len(result["Z"]) == 3
     assert result["Z"] == (42, 42, 42)
Example #14
 def test_parallel_runner(self, is_async, spark_in):
     """Test ParallelRunner with SparkDataSet fails.
     """
     catalog = DataCatalog(data_sets={"spark_in": spark_in})
     pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
     pattern = r"The following data_sets cannot be serialized: \['spark_in'\]"
     with pytest.raises(AttributeError, match=pattern):
         ParallelRunner(is_async=is_async).run(pipeline, catalog)
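The AttributeError asserted in this and the earlier Spark examples comes from an up-front check rather than from the worker processes: before scheduling anything, the runner verifies that every catalog entry can be pickled and reports the offenders by name. A rough sketch of such a check (the helper name and the use of the catalog's internal mapping are assumptions):

from pickle import PicklingError, dumps


def _raise_if_unserializable(catalog):
    """Collect catalog entries that cannot be pickled and report them."""
    unserializable = []
    for name, data_set in sorted(catalog._data_sets.items()):  # pylint: disable=protected-access
        try:
            dumps(data_set)
        except (AttributeError, PicklingError):
            unserializable.append(name)
    if unserializable:
        raise AttributeError(
            "The following data_sets cannot be serialized: {}".format(unserializable)
        )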
Example #15
    def test_parallel_run_arg(self, dummy_context, dummy_dataframe, caplog):
        dummy_context.catalog.save("cars", dummy_dataframe)
        dummy_context.run(runner=ParallelRunner())

        log_msgs = [record.getMessage() for record in caplog.records]
        log_names = [record.name for record in caplog.records]
        assert "kedro.runner.parallel_runner" in log_names
        assert "Pipeline execution completed successfully." in log_msgs
Example #16
 def test_decorated_nodes(self, is_async, decorated_fan_out_fan_in,
                          catalog):
     catalog.add_feed_dict(dict(A=42))
     result = ParallelRunner(is_async=is_async).run(
         decorated_fan_out_fan_in, catalog)
     assert "Z" in result
     assert len(result["Z"]) == 3
     assert result["Z"] == (42, 42, 42)
Example #17
 def test_node_returning_none(self, is_async):
     pipeline = Pipeline(
         [node(identity, "A", "B"),
          node(return_none, "B", "C")])
     catalog = DataCatalog({"A": MemoryDataSet("42")})
     pattern = "Saving `None` to a `DataSet` is not allowed"
     with pytest.raises(DataSetError, match=pattern):
         ParallelRunner(is_async=is_async).run(pipeline, catalog)
Example #18
 def test_memory_data_set_output(self, is_async, fan_out_fan_in):
     """ParallelRunner does not support output to externally
     created MemoryDataSets.
     """
     pipeline = Pipeline([fan_out_fan_in])
     catalog = DataCatalog({"C": MemoryDataSet()}, dict(A=42))
     with pytest.raises(AttributeError, match="['C']"):
         ParallelRunner(is_async=is_async).run(pipeline, catalog)
Example #19
    def test_release_transcoded(self, is_async):
        runner = ParallelRunner(is_async=is_async)
        log = runner._manager.list()

        pipeline = Pipeline(
            [node(source, None, "ds@save"),
             node(sink, "ds@load", None)])
        catalog = DataCatalog({
            "ds@save": LoggingDataSet(log, "save"),
            "ds@load": LoggingDataSet(log, "load"),
        })

        ParallelRunner().run(pipeline, catalog)

        # we want to see both datasets being released
        assert list(log) == [("release", "save"), ("load", "load"),
                             ("release", "load")]
Example #20
    def test_dont_release_inputs_and_outputs(self, is_async):
        runner = ParallelRunner(is_async=is_async)
        log = runner._manager.list()

        pipeline = Pipeline(
            [node(identity, "in", "middle"), node(identity, "middle", "out")]
        )
        catalog = DataCatalog(
            {
                "in": runner._manager.LoggingDataSet(log, "in", "stuff"),
                "middle": runner._manager.LoggingDataSet(log, "middle"),
                "out": runner._manager.LoggingDataSet(log, "out"),
            }
        )
        ParallelRunner().run(pipeline, catalog)

        # we don't want to see release in or out in here
        assert list(log) == [("load", "in"), ("load", "middle"), ("release", "middle")]
Example #21
    def test_max_worker_windows(self, mocker):
        """The ProcessPoolExecutor on Python 3.7+
        has a quirk with the max worker number on Windows
        and requires it to be <=61"""
        mocker.patch("os.cpu_count", return_value=100)
        mocker.patch("sys.platform", "win32")

        parallel_runner = ParallelRunner()
        assert parallel_runner._max_workers == _MAX_WINDOWS_WORKERS
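Read together with the parametrized max-workers test above, the expected behaviour is: take the user-supplied value if given, otherwise os.cpu_count(), cap it at 61 on Windows, and never keep more workers than the pipeline has runnable nodes. A sketch of that resolution (the function name is illustrative, not the runner's actual code):

import os
import sys

_MAX_WINDOWS_WORKERS = 61  # hard limit of ProcessPoolExecutor on Windows


def _resolve_max_workers(user_specified, required_processes):
    max_workers = user_specified or os.cpu_count() or 1
    if sys.platform == "win32":
        max_workers = min(max_workers, _MAX_WINDOWS_WORKERS)
    # a pool larger than the number of runnable nodes would only idle
    return min(max_workers, required_processes)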
Example #22
    def test_count_multiple_loads(self):
        runner = ParallelRunner()
        log = runner._manager.list()

        pipeline = Pipeline([
            node(source, None, "dataset"),
            node(sink, "dataset", None, name="bob"),
            node(sink, "dataset", None, name="fred"),
        ])
        catalog = DataCatalog(
            {"dataset": runner._manager.LoggingDataSet(log, "dataset")})
        runner.run(pipeline, catalog)

        # we want to see the release after both the loads
        assert list(log) == [
            ("load", "dataset"),
            ("load", "dataset"),
            ("release", "dataset"),
        ]
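The ordering asserted across the release tests points to reference counting: the runner tracks how many nodes still need each dataset, releases it only after the last consumer has loaded it, and never touches datasets that are pipeline inputs or outputs. A sketch of that bookkeeping with hypothetical helper names:

from collections import Counter


def _count_loads(pipeline):
    # how many nodes consume each dataset
    return Counter(ds for pipeline_node in pipeline.nodes for ds in pipeline_node.inputs)


def _maybe_release(name, load_counts, pipeline, catalog):
    # release a dataset once its last consumer has loaded it,
    # but leave the pipeline's own inputs and outputs alone
    load_counts[name] -= 1
    if load_counts[name] > 0:
        return
    if name not in pipeline.inputs() and name not in pipeline.outputs():
        catalog.release(name)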
Example #23
    def test_correct_input_update_parallel(self, tmp_path, dummy_dataframe):
        session = KedroSession.create(MOCK_PACKAGE_NAME, tmp_path)
        context = session.load_context()
        catalog = context.catalog
        catalog.save("cars", dummy_dataframe)
        catalog.save("boats", dummy_dataframe)

        result = session.run(runner=ParallelRunner())
        assert isinstance(result["planes"], MockDatasetReplacement)
        assert isinstance(result["ships"], pd.DataFrame)
Example #24
    def test_memory_dataset_not_serializable(self, is_async, catalog):
        """Memory dataset cannot be serializable because of data it stores."""
        data = return_not_serializable(None)
        pipeline = Pipeline([node(return_not_serializable, "A", "B")])
        catalog.add_feed_dict(feed_dict=dict(A=42))
        pattern = (
            fr"{str(data.__class__)} cannot be serialized. ParallelRunner implicit "
            fr"memory datasets can only be used with serializable data")

        with pytest.raises(DataSetError, match=pattern):
            ParallelRunner(is_async=is_async).run(pipeline, catalog)
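return_not_serializable is not shown; any node function whose return value cannot be pickled will do. A minimal stand-in (an assumption, not the project's fixture) returns a lambda, which the pickle module refuses to serialize:

def return_not_serializable(arg):  # pylint: disable=unused-argument
    # lambdas cannot be pickled, so storing this result in the implicit
    # shared-memory dataset "B" triggers the serialization error under test
    return lambda x: x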
Example #25
    def test_parallel_run_arg(self, dummy_context, dummy_dataframe, caplog, mocker):
        mocker.patch(
            "kedro.framework.context.context.load_context", return_value=dummy_context
        )
        dummy_context.catalog.save("cars", dummy_dataframe)
        dummy_context.run(runner=ParallelRunner())

        log_msgs = [record.getMessage() for record in caplog.records]
        log_names = [record.name for record in caplog.records]
        assert "kedro.runner.parallel_runner" in log_names
        assert "Pipeline execution completed successfully." in log_msgs
Example #26
    def test_parallel_runner_with_memory_dataset(self, spark_in, spark_out,
                                                 sample_spark_df):
        """Run ParallelRunner with SparkDataSet -> MemoryDataSet -> SparkDataSet.
        """
        catalog = DataCatalog(data_sets={
            "spark_in": spark_in,
            "spark_out": spark_out
        })
        pipeline = Pipeline([
            node(identity, "spark_in", "memory"),
            node(identity, "memory", "spark_out"),
        ])
        runner = ParallelRunner()

        pattern = (
            r"{0} cannot be serialized. ParallelRunner implicit memory datasets "
            r"can only be used with serializable data".format(
                str(sample_spark_df.__class__)))
        with pytest.raises(DataSetError, match=pattern):
            runner.run(pipeline, catalog)
Example #27
    def test_parallel_runner(self, is_async):
        """Test that ParallelRunner fails with DeltaTableDataSet."""
        def no_output(x):
            _ = x + 1  # pragma: no cover

        delta_ds = DeltaTableDataSet(filepath="")
        catalog = DataCatalog(data_sets={"delta_in": delta_ds})
        pipeline = Pipeline([node(no_output, "delta_in", None)])
        pattern = (r"The following data sets cannot be used with "
                   r"multiprocessing: \['delta_in'\]")
        with pytest.raises(AttributeError, match=pattern):
            ParallelRunner(is_async=is_async).run(pipeline, catalog)
Example #28
 def run(
         self,
         *args,  # type: Any
         runner=None,  # type: Union[AbstractRunner, str]
         **kwargs,  # type: Any
 ):
     # type: (...) -> Dict[str, Any]
     if isinstance(runner, str):
         assert runner in {"ParallelRunner", "SequentialRunner"}
         runner = (ParallelRunner()
                   if runner == "ParallelRunner" else SequentialRunner())
     return super().run(*args, runner=runner, **kwargs)
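A usage sketch for the override above, assuming context is an instance of the class that defines this run(): passing the runner by name keeps the call site import-free, while passing an instance still works because non-string values fall through unchanged.

# the two calls below are equivalent given the run() override above
context.run(runner="ParallelRunner")
context.run(runner=ParallelRunner())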
Example #29
    def test_broken_input_update_parallel(self, tmp_path, dummy_dataframe):
        session = KedroSession.create(MOCK_PACKAGE_NAME, tmp_path)
        context = session.load_context()
        catalog = context.catalog
        catalog.save("cars", dummy_dataframe)
        catalog.save("boats", dummy_dataframe)

        pattern = (
            "`before_node_run` must return either None or a dictionary "
            "mapping dataset names to updated values, got `MockDatasetReplacement`"
        )
        with pytest.raises(TypeError, match=re.escape(pattern)):
            session.run(runner=ParallelRunner())
Example #30
    def test_unable_to_schedule_all_nodes(self, mocker, is_async,
                                          fan_out_fan_in, catalog):
        """Test the error raised when `futures` variable is empty,
        but `todo_nodes` is not (can barely happen in real life).
        """
        catalog.add_feed_dict(dict(A=42))
        runner = ParallelRunner(is_async=is_async)

        real_node_deps = fan_out_fan_in.node_dependencies
        # construct deliberately unresolvable dependencies for all
        # pipeline nodes, so that none can be run
        fake_node_deps = {k: {"you_shall_not_pass"} for k in real_node_deps}
        # property mock requires patching a class, not an instance
        mocker.patch(
            "kedro.pipeline.Pipeline.node_dependencies",
            new_callable=mocker.PropertyMock,
            return_value=fake_node_deps,
        )

        pattern = "Unable to schedule new tasks although some nodes have not been run"
        with pytest.raises(RuntimeError, match=pattern):
            runner.run(fan_out_fan_in, catalog)
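The RuntimeError exercised here implies a scheduling loop of roughly this shape: keep submitting nodes whose dependencies have completed, and if nothing is in flight while work remains, fail loudly instead of spinning forever. A simplified sketch (not the runner's actual implementation; run_node is assumed to return the node it executed):

from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait


def _run_all(nodes, node_dependencies, run_node, max_workers):
    todo_nodes = set(nodes)
    done_nodes = set()
    futures = set()
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        while True:
            # submit every node whose dependencies have all finished
            ready = {n for n in todo_nodes if node_dependencies[n] <= done_nodes}
            todo_nodes -= ready
            futures |= {pool.submit(run_node, n) for n in ready}
            if not futures:
                if todo_nodes:
                    raise RuntimeError(
                        "Unable to schedule new tasks although some nodes "
                        "have not been run"
                    )
                break  # all nodes have run
            done, futures = wait(futures, return_when=FIRST_COMPLETED)
            done_nodes |= {future.result() for future in done}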