def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleLocalDataSet(
        filepath=str(tmp_path / "data.pkl"), backend="pickle"
    )
    catalog = DataCatalog(
        data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        }
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = (
        r"The following data_sets cannot be "
        r"serialized: \['spark_in', 'spark_out'\]"
    )
    with pytest.raises(AttributeError, match=pattern):
        runner.run(pipeline, catalog)

def test_release_at_earliest_opportunity(self, is_async):
    runner = ParallelRunner(is_async=is_async)
    log = runner._manager.list()
    pipeline = Pipeline(
        [
            node(source, None, "first"),
            node(identity, "first", "second"),
            node(sink, "second", None),
        ]
    )
    # pylint: disable=no-member
    catalog = DataCatalog(
        {
            "first": runner._manager.LoggingDataSet(log, "first"),
            "second": runner._manager.LoggingDataSet(log, "second"),
        }
    )
    runner.run(pipeline, catalog)
    # we want to see "release first" before "load second"
    assert list(log) == [
        ("load", "first"),
        ("release", "first"),
        ("load", "second"),
        ("release", "second"),
    ]

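# The runner tests in this module rely on a few module-level helpers that are
# not shown here. The following is a minimal sketch of what they could look
# like, modelled on how `runner._manager.LoggingDataSet` and the `source`,
# `identity` and `sink` callables are used in the tests; the exact definitions
# (and the `ParallelRunnerManager` registration) are assumptions, not the
# canonical implementations.
import pytest

from kedro.io import AbstractDataSet, DataCatalog, DataSetError
from kedro.pipeline import Pipeline, node
from kedro.runner import ParallelRunner
from kedro.runner.parallel_runner import ParallelRunnerManager


def source():
    return "stuff"


def identity(arg):
    return arg


def sink(arg):  # pylint: disable=unused-argument
    pass


class LoggingDataSet(AbstractDataSet):
    """Record every load and release in a shared log, so tests can assert
    on the order in which data sets are consumed and freed."""

    def __init__(self, log, name, value=None):
        self.log = log
        self.name = name
        self.value = value

    def _load(self):
        self.log.append(("load", self.name))
        return self.value

    def _save(self, data):
        self.value = data

    def _release(self):
        self.log.append(("release", self.name))
        self.value = None

    def _describe(self):
        return {}


# Registering the class with the manager is what makes
# `runner._manager.LoggingDataSet(...)` return proxy objects that can be
# shared safely across the worker processes spawned by ParallelRunner.
ParallelRunnerManager.register("LoggingDataSet", LoggingDataSet)  # pylint: disable=no-member
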
def test_parallel_runner(self, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet load and save."""
    catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
    pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
    runner = ParallelRunner()
    pattern = (
        r"The following data_sets cannot be "
        r"serialized: \['spark_in', 'spark_out'\]"
    )
    with pytest.raises(AttributeError, match=pattern):
        runner.run(pipeline, catalog)

def test_parallel_runner(self, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet load and save."""
    catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
    pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
    runner = ParallelRunner()
    result = runner.run(pipeline, catalog)
    # 'spark_out' is saved in 'tmp_path/input', so the result of the run
    # should be empty
    assert not result

def test_count_multiple_loads(self):
    runner = ParallelRunner()
    log = runner._manager.list()
    pipeline = Pipeline(
        [
            node(source, None, "dataset"),
            node(sink, "dataset", None, name="bob"),
            node(sink, "dataset", None, name="fred"),
        ]
    )
    catalog = DataCatalog(
        {"dataset": runner._manager.LoggingDataSet(log, "dataset")}
    )
    runner.run(pipeline, catalog)
    # we want the release to happen after both loads
    assert list(log) == [
        ("load", "dataset"),
        ("load", "dataset"),
        ("release", "dataset"),
    ]

def test_parallel_runner_with_memory_dataset(
    self, spark_in, spark_out, sample_spark_df
):
    """Run ParallelRunner with SparkDataSet -> MemoryDataSet -> SparkDataSet."""
    catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "memory"),
            node(identity, "memory", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = (
        r"{0} cannot be serialized. ParallelRunner implicit memory datasets "
        r"can only be used with serializable data".format(
            str(sample_spark_df.__class__)
        )
    )
    with pytest.raises(DataSetError, match=pattern):
        runner.run(pipeline, catalog)

def test_unable_to_schedule_all_nodes(self, mocker, is_async, fan_out_fan_in, catalog):
    """Test the error raised when `futures` variable is empty,
    but `todo_nodes` is not (can barely happen in real life).
    """
    catalog.add_feed_dict(dict(A=42))
    runner = ParallelRunner(is_async=is_async)

    real_node_deps = fan_out_fan_in.node_dependencies
    # construct deliberately unresolvable dependencies for all
    # pipeline nodes, so that none can be run
    fake_node_deps = {k: {"you_shall_not_pass"} for k in real_node_deps}
    # property mock requires patching a class, not an instance
    mocker.patch(
        "kedro.pipeline.Pipeline.node_dependencies",
        new_callable=mocker.PropertyMock,
        return_value=fake_node_deps,
    )

    pattern = "Unable to schedule new tasks although some nodes have not been run"
    with pytest.raises(RuntimeError, match=pattern):
        runner.run(fan_out_fan_in, catalog)

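# `fan_out_fan_in` and `catalog` are pytest fixtures. A plausible sketch,
# reusing the `identity` helper sketched earlier and adding a hypothetical
# `fan_in` aggregator, could be:
def fan_in(*args):
    return args


@pytest.fixture
def fan_out_fan_in():
    return Pipeline(
        [
            node(identity, "A", "B"),
            node(identity, "B", "C"),
            node(identity, "B", "D"),
            node(identity, "B", "E"),
            node(fan_in, ["C", "D", "E"], "Z"),
        ]
    )


@pytest.fixture
def catalog():
    return DataCatalog()
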
def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleDataSet(filepath=str(tmp_path / "data.pkl"))
    catalog = DataCatalog(
        data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        }
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = r"Failed while saving data to data set PickleDataSet"
    with pytest.raises(DataSetError, match=pattern):
        runner.run(pipeline, catalog)

def test_parallel_runner_with_pickle_dataset(
    self, tmp_path, spark_in, spark_out, sample_spark_df
):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleLocalDataSet(
        filepath=str(tmp_path / "data.pkl"), backend="pickle"
    )
    catalog = DataCatalog(
        data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        }
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = (
        r"{0} cannot be serialized. {1} can only be used with "
        r"serializable data".format(
            str(sample_spark_df.__class__), str(pickle_data.__class__.__name__)
        )
    )
    with pytest.raises(DataSetError, match=pattern):
        runner.run(pipeline, catalog)

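# The Spark tests above depend on `spark_in`, `spark_out` and
# `sample_spark_df` fixtures. A minimal sketch follows; the import path for
# SparkDataSet differs between kedro versions (`kedro.contrib.io.pyspark` in
# 0.15.x, `kedro.extras.datasets.spark` from 0.16), and the file paths and
# sample data here are illustrative assumptions.
from pyspark.sql import SparkSession

from kedro.extras.datasets.spark import SparkDataSet


@pytest.fixture
def sample_spark_df():
    spark = SparkSession.builder.getOrCreate()
    return spark.createDataFrame([("Alex", 31), ("Bob", 12)], ["name", "age"])


@pytest.fixture
def spark_in(tmp_path, sample_spark_df):
    # pre-populate the input location so the pipeline has something to load
    spark_in = SparkDataSet(filepath=str(tmp_path / "input"))
    spark_in.save(sample_spark_df)
    return spark_in


@pytest.fixture
def spark_out(tmp_path):
    # illustrative path only; the real fixture may write elsewhere
    return SparkDataSet(filepath=str(tmp_path / "output"))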