def test_release_at_earliest_opportunity(self, is_async):
    runner = ParallelRunner(is_async=is_async)
    log = runner._manager.list()
    pipeline = Pipeline(
        [
            node(source, None, "first"),
            node(identity, "first", "second"),
            node(sink, "second", None),
        ]
    )
    # pylint: disable=no-member
    catalog = DataCatalog(
        {
            "first": runner._manager.LoggingDataSet(log, "first"),
            "second": runner._manager.LoggingDataSet(log, "second"),
        }
    )
    runner.run(pipeline, catalog)

    # we want to see "release first" before "load second"
    assert list(log) == [
        ("load", "first"),
        ("release", "first"),
        ("load", "second"),
        ("release", "second"),
    ]
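# Several tests in this section rely on small module-level helper callables
# (source, identity, sink, return_none, return_not_serializable) whose
# definitions are not shown here. A minimal sketch, assuming the simplest
# behaviour consistent with how the tests use them (these bodies are
# illustrative assumptions, not the verbatim originals):
def source():
    return "stuff"


def identity(arg):
    return arg


def sink(arg):
    # intentionally consumes its input and produces nothing
    pass


def return_none(arg):
    return None


def return_not_serializable(arg):
    return lambda x: x  # a lambda cannot be pickled, defeating serialization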
def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleLocalDataSet(
        filepath=str(tmp_path / "data.pkl"), backend="pickle"
    )
    catalog = DataCatalog(
        data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        }
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = (
        r"The following data_sets cannot be "
        r"serialized: \['spark_in', 'spark_out'\]"
    )
    with pytest.raises(AttributeError, match=pattern):
        runner.run(pipeline, catalog)
def test_parallel_runner(self, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet load and save."""
    catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
    pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
    runner = ParallelRunner()
    result = runner.run(pipeline, catalog)
    # 'spark_out' is saved in 'tmp_path/input', so the result of the run should be empty
    assert not result
def test_parallel_runner(self, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet load and save."""
    catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
    pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
    runner = ParallelRunner()
    pattern = (
        r"The following data_sets cannot be "
        r"serialized: \['spark_in', 'spark_out'\]"
    )
    with pytest.raises(AttributeError, match=pattern):
        runner.run(pipeline, catalog)
def test_release_at_earliest_opportunity(self):
    manager = ParallelRunnerManager()
    manager.start()
    log = manager.list()
    pipeline = Pipeline(
        [
            node(source, None, "first"),
            node(identity, "first", "second"),
            node(sink, "second", None),
        ]
    )
    catalog = DataCatalog(
        {
            "first": manager.LoggingDataSet(log, "first"),
            "second": manager.LoggingDataSet(log, "second"),
        }
    )
    ParallelRunner().run(pipeline, catalog)

    # we want to see "release first" before "load second"
    assert list(log) == [
        ("load", "first"),
        ("release", "first"),
        ("load", "second"),
        ("release", "second"),
    ]
def test_on_node_error_hook_is_called_with_parallel_runner(
    self, tmp_path, mocker, logging_hooks
):
    log_records = []

    class LogHandler(logging.Handler):  # pylint: disable=abstract-method
        def handle(self, record):
            log_records.append(record)

    broken_context_with_hooks = _create_broken_context_with_hooks(
        tmp_path, mocker, logging_hooks
    )
    mocker.patch(
        "kedro.framework.context.context.load_context",
        return_value=broken_context_with_hooks,
    )
    logs_queue_listener = QueueListener(logging_hooks.queue, LogHandler())
    logs_queue_listener.start()
    with pytest.raises(ValueError, match="broken"):
        broken_context_with_hooks.run(
            runner=ParallelRunner(max_workers=2), node_names=["node1", "node2"]
        )
    logs_queue_listener.stop()

    on_node_error_records = [
        r for r in log_records if r.funcName == "on_node_error"
    ]
    assert len(on_node_error_records) == 2

    for call_record in on_node_error_records:
        self._assert_hook_call_record_has_expected_parameters(
            call_record,
            ["error", "node", "catalog", "inputs", "is_async", "run_id"],
        )
        expected_error = ValueError("broken")
        assert_exceptions_equal(call_record.error, expected_error)
def test_on_node_error_hook_parallel_runner(self, tmp_path, logging_hooks):
    session = KedroSession.create(MOCK_PACKAGE_NAME, tmp_path)
    log_records = []

    class LogHandler(logging.Handler):  # pylint: disable=abstract-method
        def handle(self, record):
            log_records.append(record)

    logs_queue_listener = QueueListener(logging_hooks.queue, LogHandler())
    logs_queue_listener.start()
    with pytest.raises(ValueError, match="broken"):
        try:
            session.run(
                runner=ParallelRunner(max_workers=2), node_names=["node1", "node2"]
            )
        finally:
            logs_queue_listener.stop()

    on_node_error_records = [
        r for r in log_records if r.funcName == "on_node_error"
    ]
    assert len(on_node_error_records) == 2

    for call_record in on_node_error_records:
        _assert_hook_call_record_has_expected_parameters(
            call_record,
            ["error", "node", "catalog", "inputs", "is_async", "run_id"],
        )
        expected_error = ValueError("broken")
        assert_exceptions_equal(call_record.error, expected_error)
def test_before_and_after_node_run_hooks_are_called_with_parallel_runner(
    self, context_with_hooks, dummy_dataframe, logs_queue
):
    log_records = []

    class LogHandler(logging.Handler):  # pylint: disable=abstract-method
        def handle(self, record):
            log_records.append(record)

    logs_queue_listener = QueueListener(logs_queue, LogHandler())
    logs_queue_listener.start()
    context_with_hooks.catalog.save("cars", dummy_dataframe)
    context_with_hooks.catalog.save("boats", dummy_dataframe)
    context_with_hooks.run(runner=ParallelRunner(), node_names=["node1", "node2"])
    logs_queue_listener.stop()

    before_node_run_log_records = [
        r for r in log_records if r.funcName == "before_node_run"
    ]
    assert len(before_node_run_log_records) == 2
    for record in before_node_run_log_records:
        assert record.getMessage() == "About to run node"
        assert record.node.name in ["node1", "node2"]
        assert set(record.inputs.keys()) <= {"cars", "boats"}

    after_node_run_log_records = [
        r for r in log_records if r.funcName == "after_node_run"
    ]
    assert len(after_node_run_log_records) == 2
    for record in after_node_run_log_records:
        assert record.getMessage() == "Ran node"
        assert record.node.name in ["node1", "node2"]
        assert set(record.outputs.keys()) <= {"planes", "ships"}
def test_before_and_after_dataset_saved_hooks_parallel_runner(
    self, mock_session, logs_listener, dummy_dataframe
):
    context = mock_session.load_context()
    catalog = context.catalog
    catalog.save("cars", dummy_dataframe)
    catalog.save("boats", dummy_dataframe)

    mock_session.run(runner=ParallelRunner(), node_names=["node1", "node2"])

    before_dataset_saved_log_records = [
        r for r in logs_listener.logs if r.funcName == "before_dataset_saved"
    ]
    assert len(before_dataset_saved_log_records) == 2
    for record in before_dataset_saved_log_records:
        assert record.getMessage() == "Before dataset saved"
        assert record.dataset_name in ["planes", "ships"]
        assert record.data.to_dict() == dummy_dataframe.to_dict()

    after_dataset_saved_log_records = [
        r for r in logs_listener.logs if r.funcName == "after_dataset_saved"
    ]
    assert len(after_dataset_saved_log_records) == 2
    for record in after_dataset_saved_log_records:
        assert record.getMessage() == "After dataset saved"
        assert record.dataset_name in ["planes", "ships"]
        assert record.data.to_dict() == dummy_dataframe.to_dict()
def test_before_and_after_node_run_hooks_parallel_runner(
    self, mock_session, logs_listener, dummy_dataframe
):
    context = mock_session.load_context()
    catalog = context.catalog
    catalog.save("cars", dummy_dataframe)
    catalog.save("boats", dummy_dataframe)

    mock_session.run(runner=ParallelRunner(), node_names=["node1", "node2"])

    before_node_run_log_records = [
        r for r in logs_listener.logs if r.funcName == "before_node_run"
    ]
    assert len(before_node_run_log_records) == 2
    for record in before_node_run_log_records:
        assert record.getMessage() == "About to run node"
        assert record.node.name in ["node1", "node2"]
        assert set(record.inputs.keys()) <= {"cars", "boats"}

    after_node_run_log_records = [
        r for r in logs_listener.logs if r.funcName == "after_node_run"
    ]
    assert len(after_node_run_log_records) == 2
    for record in after_node_run_log_records:
        assert record.getMessage() == "Ran node"
        assert record.node.name in ["node1", "node2"]
        assert set(record.outputs.keys()) <= {"planes", "ships"}
def test_memory_data_set_input(self, is_async, fan_out_fan_in):
    pipeline = Pipeline([fan_out_fan_in])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    result = ParallelRunner(is_async=is_async).run(pipeline, catalog)
    assert "Z" in result
    assert len(result["Z"]) == 3
    assert result["Z"] == ("42", "42", "42")
def test_specified_max_workers_below_cpu_cores_count(
    self,
    is_async,
    mocker,
    fan_out_fan_in,
    catalog,
    cpu_cores,
    user_specified_number,
    expected_number,
):  # pylint: disable=too-many-arguments
    """The pool size should never exceed the number of processes the pipeline
    needs. For example, if the system has 2 cores but the runner is initialized
    with max_workers=4 and the `fan_out_fan_in` pipeline needs 3 processes,
    a pool with 3 workers should be used.
    """
    mocker.patch("os.cpu_count", return_value=cpu_cores)
    executor_cls_mock = mocker.patch(
        "kedro.runner.parallel_runner.ProcessPoolExecutor",
        wraps=ProcessPoolExecutor,
    )
    catalog.add_feed_dict(dict(A=42))
    result = ParallelRunner(
        max_workers=user_specified_number, is_async=is_async
    ).run(fan_out_fan_in, catalog)
    assert result == {"Z": (42, 42, 42)}

    executor_cls_mock.assert_called_once_with(max_workers=expected_number)
def test_decorate_pipeline(self, is_async, fan_out_fan_in, catalog):
    catalog.add_feed_dict(dict(A=42))
    result = ParallelRunner(is_async=is_async).run(
        fan_out_fan_in.decorate(log_time), catalog
    )
    assert "Z" in result
    assert len(result["Z"]) == 3
    assert result["Z"] == (42, 42, 42)
def test_parallel_runner(self, is_async, spark_in):
    """Test ParallelRunner with SparkDataSet fails."""
    catalog = DataCatalog(data_sets={"spark_in": spark_in})
    pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
    pattern = r"The following data_sets cannot be serialized: \['spark_in'\]"
    with pytest.raises(AttributeError, match=pattern):
        ParallelRunner(is_async=is_async).run(pipeline, catalog)
def test_parallel_run_arg(self, dummy_context, dummy_dataframe, caplog):
    dummy_context.catalog.save("cars", dummy_dataframe)
    dummy_context.run(runner=ParallelRunner())

    log_msgs = [record.getMessage() for record in caplog.records]
    log_names = [record.name for record in caplog.records]
    assert "kedro.runner.parallel_runner" in log_names
    assert "Pipeline execution completed successfully." in log_msgs
def test_decorated_nodes(self, is_async, decorated_fan_out_fan_in, catalog):
    catalog.add_feed_dict(dict(A=42))
    result = ParallelRunner(is_async=is_async).run(
        decorated_fan_out_fan_in, catalog
    )
    assert "Z" in result
    assert len(result["Z"]) == 3
    assert result["Z"] == (42, 42, 42)
def test_node_returning_none(self, is_async):
    pipeline = Pipeline([node(identity, "A", "B"), node(return_none, "B", "C")])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    pattern = "Saving `None` to a `DataSet` is not allowed"
    with pytest.raises(DataSetError, match=pattern):
        ParallelRunner(is_async=is_async).run(pipeline, catalog)
def test_memory_data_set_output(self, is_async, fan_out_fan_in):
    """ParallelRunner does not support output to externally
    created MemoryDataSets.
    """
    pipeline = Pipeline([fan_out_fan_in])
    catalog = DataCatalog({"C": MemoryDataSet()}, dict(A=42))
    # escape the brackets so the regex matches the literal dataset list
    with pytest.raises(AttributeError, match=r"\['C'\]"):
        ParallelRunner(is_async=is_async).run(pipeline, catalog)
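# Context for the test above: ParallelRunner executes nodes in separate
# processes, so data saved into a plain (externally created) MemoryDataSet
# would exist only in the worker's address space and never reach the parent
# process; that is why the runner rejects such outputs with an AttributeError
# listing the offending dataset names.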
def test_release_transcoded(self, is_async):
    runner = ParallelRunner(is_async=is_async)
    log = runner._manager.list()
    pipeline = Pipeline(
        [node(source, None, "ds@save"), node(sink, "ds@load", None)]
    )
    catalog = DataCatalog(
        {
            "ds@save": LoggingDataSet(log, "save"),
            "ds@load": LoggingDataSet(log, "load"),
        }
    )
    # use the parametrized runner so the is_async flag is actually exercised
    runner.run(pipeline, catalog)

    # we want to see both datasets being released
    assert list(log) == [("release", "save"), ("load", "load"), ("release", "load")]
def test_dont_release_inputs_and_outputs(self, is_async):
    runner = ParallelRunner(is_async=is_async)
    log = runner._manager.list()
    pipeline = Pipeline(
        [node(identity, "in", "middle"), node(identity, "middle", "out")]
    )
    catalog = DataCatalog(
        {
            "in": runner._manager.LoggingDataSet(log, "in", "stuff"),
            "middle": runner._manager.LoggingDataSet(log, "middle"),
            "out": runner._manager.LoggingDataSet(log, "out"),
        }
    )
    # use the parametrized runner so the is_async flag is actually exercised
    runner.run(pipeline, catalog)

    # we don't want to see "release in" or "release out" in here
    assert list(log) == [("load", "in"), ("load", "middle"), ("release", "middle")]
def test_max_worker_windows(self, mocker):
    """The ProcessPoolExecutor on Python 3.7+ has a quirk with the max
    worker number on Windows and requires it to be <= 61.
    """
    mocker.patch("os.cpu_count", return_value=100)
    mocker.patch("sys.platform", "win32")

    parallel_runner = ParallelRunner()
    assert parallel_runner._max_workers == _MAX_WINDOWS_WORKERS
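# A minimal sketch of the worker-count clamping the test above exercises,
# assuming the runner derives its pool size from os.cpu_count() and caps it on
# Windows. The helper name is illustrative and not part of the runner's API;
# only the 61-worker Windows limit of ProcessPoolExecutor is a known fact.
def _resolve_max_workers_sketch(requested=None):
    # imports kept local so the sketch is self-contained
    import os
    import sys

    max_workers = requested or os.cpu_count() or 1
    if sys.platform == "win32":
        # ProcessPoolExecutor on Windows rejects max_workers > 61
        max_workers = min(max_workers, 61)
    return max_workers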
def test_count_multiple_loads(self):
    runner = ParallelRunner()
    log = runner._manager.list()
    pipeline = Pipeline(
        [
            node(source, None, "dataset"),
            node(sink, "dataset", None, name="bob"),
            node(sink, "dataset", None, name="fred"),
        ]
    )
    catalog = DataCatalog(
        {"dataset": runner._manager.LoggingDataSet(log, "dataset")}
    )
    runner.run(pipeline, catalog)

    # we want to see the release happen after both loads
    assert list(log) == [
        ("load", "dataset"),
        ("load", "dataset"),
        ("release", "dataset"),
    ]
def test_correct_input_update_parallel(self, tmp_path, dummy_dataframe):
    session = KedroSession.create(MOCK_PACKAGE_NAME, tmp_path)
    context = session.load_context()
    catalog = context.catalog
    catalog.save("cars", dummy_dataframe)
    catalog.save("boats", dummy_dataframe)

    result = session.run(runner=ParallelRunner())
    assert isinstance(result["planes"], MockDatasetReplacement)
    assert isinstance(result["ships"], pd.DataFrame)
def test_memory_dataset_not_serializable(self, is_async, catalog):
    """A memory dataset cannot be serialized because of the data it stores."""
    data = return_not_serializable(None)
    pipeline = Pipeline([node(return_not_serializable, "A", "B")])
    catalog.add_feed_dict(feed_dict=dict(A=42))
    pattern = (
        fr"{str(data.__class__)} cannot be serialized. ParallelRunner implicit "
        fr"memory datasets can only be used with serializable data"
    )
    with pytest.raises(DataSetError, match=pattern):
        ParallelRunner(is_async=is_async).run(pipeline, catalog)
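# For context on the test above: objects that hold a lambda, an open file
# handle, a lock, etc. cannot be pickled, so ParallelRunner cannot move them
# between worker processes through its implicit MemoryDataSets; that is what
# the DataSetError asserted above reports.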
def test_parallel_run_arg(self, dummy_context, dummy_dataframe, caplog, mocker):
    mocker.patch(
        "kedro.framework.context.context.load_context", return_value=dummy_context
    )
    dummy_context.catalog.save("cars", dummy_dataframe)
    dummy_context.run(runner=ParallelRunner())

    log_msgs = [record.getMessage() for record in caplog.records]
    log_names = [record.name for record in caplog.records]
    assert "kedro.runner.parallel_runner" in log_names
    assert "Pipeline execution completed successfully." in log_msgs
def test_parallel_runner_with_memory_dataset(
    self, spark_in, spark_out, sample_spark_df
):
    """Run ParallelRunner with SparkDataSet -> MemoryDataSet -> SparkDataSet."""
    catalog = DataCatalog(
        data_sets={"spark_in": spark_in, "spark_out": spark_out}
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "memory"),
            node(identity, "memory", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = (
        r"{0} cannot be serialized. ParallelRunner implicit memory datasets "
        r"can only be used with serializable data".format(
            str(sample_spark_df.__class__)
        )
    )
    with pytest.raises(DataSetError, match=pattern):
        runner.run(pipeline, catalog)
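# Context for the test above: a pyspark DataFrame is a thin wrapper around
# state living in the JVM (via the SparkContext), so it cannot be pickled and
# shipped between Python worker processes; Spark data therefore has to cross
# process boundaries through a file-based dataset rather than a MemoryDataSet.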
def test_parallel_runner(self, is_async):
    """Test ParallelRunner with DeltaTableDataSet fails."""

    def no_output(x):
        _ = x + 1  # pragma: no cover

    delta_ds = DeltaTableDataSet(filepath="")
    catalog = DataCatalog(data_sets={"delta_in": delta_ds})
    pipeline = Pipeline([node(no_output, "delta_in", None)])
    pattern = (
        r"The following data sets cannot be used with "
        r"multiprocessing: \['delta_in'\]"
    )
    with pytest.raises(AttributeError, match=pattern):
        ParallelRunner(is_async=is_async).run(pipeline, catalog)
def run(
    self,
    *args,  # type: Any
    runner=None,  # type: Union[AbstractRunner, str]
    **kwargs,  # type: Any
):
    # type: (...) -> Dict[str, Any]
    if isinstance(runner, str):
        assert runner in {"ParallelRunner", "SequentialRunner"}
        runner = (
            ParallelRunner() if runner == "ParallelRunner" else SequentialRunner()
        )
    return super().run(*args, runner=runner, **kwargs)
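# Usage sketch for the wrapper above (the call sites are illustrative, not from
# the source): the runner argument may be either a runner instance or one of
# the two supported class names as a string, e.g.
#
#     context.run(runner="ParallelRunner")      # dispatched to ParallelRunner()
#     context.run(runner=SequentialRunner())    # instances are passed through
#
# Any other string fails the assert, and runner=None is forwarded unchanged so
# the parent class applies its own default.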
def test_broken_input_update_parallel(self, tmp_path, dummy_dataframe):
    session = KedroSession.create(MOCK_PACKAGE_NAME, tmp_path)
    context = session.load_context()
    catalog = context.catalog
    catalog.save("cars", dummy_dataframe)
    catalog.save("boats", dummy_dataframe)

    pattern = (
        "`before_node_run` must return either None or a dictionary "
        "mapping dataset names to updated values, got `MockDatasetReplacement`"
    )
    with pytest.raises(TypeError, match=re.escape(pattern)):
        session.run(runner=ParallelRunner())
def test_unable_to_schedule_all_nodes(
    self, mocker, is_async, fan_out_fan_in, catalog
):
    """Test the error raised when the `futures` variable is empty while
    `todo_nodes` is not (which should rarely happen in practice).
    """
    catalog.add_feed_dict(dict(A=42))
    runner = ParallelRunner(is_async=is_async)

    real_node_deps = fan_out_fan_in.node_dependencies
    # construct deliberately unresolvable dependencies for all
    # pipeline nodes, so that none can be run
    fake_node_deps = {k: {"you_shall_not_pass"} for k in real_node_deps}
    # property mock requires patching a class, not an instance
    mocker.patch(
        "kedro.pipeline.Pipeline.node_dependencies",
        new_callable=mocker.PropertyMock,
        return_value=fake_node_deps,
    )

    pattern = "Unable to schedule new tasks although some nodes have not been run"
    with pytest.raises(RuntimeError, match=pattern):
        runner.run(fan_out_fan_in, catalog)