Example #1
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleLocalDataSet(
            filepath=str(tmp_path / "data.pkl"), backend="pickle"
        )
        catalog = DataCatalog(
            data_sets={
                "spark_in": spark_in,
                "pickle": pickle_data,
                "spark_out": spark_out,
            }
        )
        pipeline = Pipeline(
            [
                node(identity, "spark_in", "pickle"),
                node(identity, "pickle", "spark_out"),
            ]
        )
        runner = ParallelRunner()

        pattern = (
            r"The following data_sets cannot be "
            r"serialized: \['spark_in', 'spark_out'\]"
        )
        with pytest.raises(AttributeError, match=pattern):
            runner.run(pipeline, catalog)
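
These snippets pass plain functions to `node`, and the `spark_in`/`spark_out` fixtures are defined elsewhere in the test module. A minimal sketch of the `identity` helper they all rely on (the fixture bodies are not shown in these excerpts and are left out here too):

    def identity(arg):
        """Return the input unchanged; a trivial node function."""
        return arg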
Example #2
    def test_release_at_earliest_opportunity(self, is_async):
        runner = ParallelRunner(is_async=is_async)
        log = runner._manager.list()

        pipeline = Pipeline([
            node(source, None, "first"),
            node(identity, "first", "second"),
            node(sink, "second", None),
        ])
        # pylint: disable=no-member
        catalog = DataCatalog({
            "first": runner._manager.LoggingDataSet(log, "first"),
            "second": runner._manager.LoggingDataSet(log, "second"),
        })
        runner.run(pipeline, catalog)

        # we want to see "release first" before "load second"
        assert list(log) == [
            ("load", "first"),
            ("release", "first"),
            ("load", "second"),
            ("release", "second"),
        ]
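
`runner._manager` is ParallelRunner's multiprocessing manager, so `LoggingDataSet` instances created through it append to one list shared across worker processes. A minimal sketch of the helpers this test assumes, inferred from the assertions above (the real test module may define them differently):

    from kedro.io import AbstractDataSet

    def source():
        """Produce a value for the first dataset."""
        return "stuff"

    def sink(arg):
        """Consume a value and produce nothing."""

    class LoggingDataSet(AbstractDataSet):
        """Record every load and release call in a shared log list."""

        def __init__(self, log, name, value=None):
            self.log = log
            self.name = name
            self.value = value

        def _load(self):
            self.log.append(("load", self.name))
            return self.value

        def _save(self, data):
            self.value = data

        def _release(self):
            self.log.append(("release", self.name))
            self.value = None

        def _describe(self):
            return {"name": self.name}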
Example #3
    def test_parallel_runner(self, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet load and save.
        """
        catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
        pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
        runner = ParallelRunner()

        pattern = (
            r"The following data_sets cannot be "
            r"serialized: \['spark_in', 'spark_out'\]"
        )
        with pytest.raises(AttributeError, match=pattern):
            runner.run(pipeline, catalog)
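
ParallelRunner raises this `AttributeError` while validating that every catalog entry can be shipped to worker processes; Spark datasets hold references to a JVM-backed context that cannot be pickled. A rough sketch of such a validation step (hypothetical helper name; kedro's internal check may differ in detail):

    import pickle

    def unserializable_data_sets(catalog):
        """Return the sorted names of catalog datasets that fail to pickle."""
        bad = []
        for name, data_set in catalog._data_sets.items():  # private attr, sketch only
            try:
                pickle.dumps(data_set)
            except (AttributeError, TypeError, pickle.PicklingError):
                bad.append(name)
        return sorted(bad)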
Example #4
    def test_parallel_runner(self, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet load and save."""
        catalog = DataCatalog(data_sets={"spark_in": spark_in, "spark_out": spark_out})
        pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
        runner = ParallelRunner()
        result = runner.run(pipeline, catalog)
        # 'spark_out' is registered in the catalog (saved under 'tmp_path/input'),
        # so run() should return an empty dict
        assert not result
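
`run()` returns only the "free" outputs, i.e. datasets produced by the pipeline but not registered in the catalog. A minimal sketch of that contract using plain picklable data in place of Spark (assuming kedro's `MemoryDataSet`; the dataset names here are made up):

    from kedro.io import DataCatalog, MemoryDataSet

    catalog = DataCatalog(data_sets={"in": MemoryDataSet(data=42)})
    pipeline = Pipeline([node(identity, "in", "free_output")])
    result = ParallelRunner().run(pipeline, catalog)
    assert result == {"free_output": 42}  # unregistered outputs come back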
Example #5
    def test_count_multiple_loads(self):
        runner = ParallelRunner()
        log = runner._manager.list()

        pipeline = Pipeline([
            node(source, None, "dataset"),
            node(sink, "dataset", None, name="bob"),
            node(sink, "dataset", None, name="fred"),
        ])
        catalog = DataCatalog(
            {"dataset": runner._manager.LoggingDataSet(log, "dataset")}
        )
        runner.run(pipeline, catalog)

        # we want to see the release only after both loads
        assert list(log) == [
            ("load", "dataset"),
            ("load", "dataset"),
            ("release", "dataset"),
        ]
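
The single release happens only after both loads because the runner knows, from the pipeline definition, how many times each dataset will still be loaded, and releases it once that count drops to zero. A sketch of how such load counts can be derived (hypothetical helper, not kedro's exact internals):

    from collections import Counter
    from itertools import chain

    def load_counts(pipeline):
        """Count how many node inputs reference each dataset."""
        return Counter(chain.from_iterable(n.inputs for n in pipeline.nodes))

    # for the pipeline above this yields Counter({"dataset": 2}):
    # the runner can release "dataset" after its second load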
Example #6
    def test_parallel_runner_with_memory_dataset(self, spark_in, spark_out,
                                                 sample_spark_df):
        """Run ParallelRunner with SparkDataSet -> MemoryDataSet -> SparkDataSet.
        """
        catalog = DataCatalog(data_sets={
            "spark_in": spark_in,
            "spark_out": spark_out
        })
        pipeline = Pipeline([
            node(identity, "spark_in", "memory"),
            node(identity, "memory", "spark_out"),
        ])
        runner = ParallelRunner()

        pattern = (
            r"{0} cannot be serialized. ParallelRunner implicit memory datasets "
            r"can only be used with serializable data".format(
                str(sample_spark_df.__class__)
            )
        )
        with pytest.raises(DataSetError, match=pattern):
            runner.run(pipeline, catalog)
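
For `memory`, which is not in the catalog, ParallelRunner substitutes an implicit shared-memory dataset whose save has to pickle the data so it can cross process boundaries, and a Spark DataFrame cannot be pickled. A rough sketch of how such a save could detect this and produce the message the test matches (paraphrased, not kedro's exact code):

    import pickle

    from kedro.io import DataSetError

    def save_to_shared_memory(shared_memory_dataset, data):
        """Save through a multiprocessing proxy; explain pickling failures."""
        try:
            shared_memory_dataset.save(data)  # pickles `data` under the hood
        except Exception as exc:
            try:
                pickle.dumps(data)
            except Exception:
                raise DataSetError(
                    "{0} cannot be serialized. ParallelRunner implicit memory "
                    "datasets can only be used with serializable data".format(
                        str(data.__class__)
                    )
                ) from exc
            raise  # data was picklable; re-raise the original error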
Example #7
    def test_unable_to_schedule_all_nodes(self, mocker, is_async,
                                          fan_out_fan_in, catalog):
        """Test the error raised when `futures` variable is empty,
        but `todo_nodes` is not (can barely happen in real life).
        """
        catalog.add_feed_dict(dict(A=42))
        runner = ParallelRunner(is_async=is_async)

        real_node_deps = fan_out_fan_in.node_dependencies
        # construct deliberately unresolvable dependencies for all
        # pipeline nodes, so that none can be run
        fake_node_deps = {k: {"you_shall_not_pass"} for k in real_node_deps}
        # property mock requires patching a class, not an instance
        mocker.patch(
            "kedro.pipeline.Pipeline.node_dependencies",
            new_callable=mocker.PropertyMock,
            return_value=fake_node_deps,
        )

        pattern = "Unable to schedule new tasks although some nodes have not been run"
        with pytest.raises(RuntimeError, match=pattern):
            runner.run(fan_out_fan_in, catalog)
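
The guard being tested sits in the scheduling loop: nodes are submitted once all of their dependencies have finished, and if nothing is in flight while unrun nodes remain, no progress is possible. A minimal sketch of that loop shape (paraphrasing the structure, not kedro's exact implementation):

    from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait

    def schedule_all(nodes, dependencies, run_node):
        """Run each node once its dependencies are done; raise if stuck."""
        todo, done_nodes, futures = set(nodes), set(), set()
        with ProcessPoolExecutor() as pool:
            while True:
                ready = {n for n in todo if dependencies[n] <= done_nodes}
                futures |= {pool.submit(run_node, n) for n in ready}
                todo -= ready
                if not futures:
                    if todo:  # nothing running, yet work remains
                        raise RuntimeError(
                            "Unable to schedule new tasks although some "
                            "nodes have not been run"
                        )
                    break  # every node has completed
                done, futures = wait(futures, return_when=FIRST_COMPLETED)
                done_nodes |= {f.result() for f in done}  # run_node returns its node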
Example #8
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleDataSet(filepath=str(tmp_path / "data.pkl"))
        catalog = DataCatalog(
            data_sets={
                "spark_in": spark_in,
                "pickle": pickle_data,
                "spark_out": spark_out,
            }
        )
        pipeline = Pipeline(
            [
                node(identity, "spark_in", "pickle"),
                node(identity, "pickle", "spark_out"),
            ]
        )
        runner = ParallelRunner()

        pattern = r"Failed while saving data to data set PickleDataSet"

        with pytest.raises(DataSetError, match=pattern):
            runner.run(pipeline, catalog)
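
Unlike the older `PickleLocalDataSet` examples, here the pickling failure surfaces through the generic save wrapper, so the test matches the standard "Failed while saving" message. A rough sketch of that wrapping, in the style of an `AbstractDataSet.save` (paraphrased, not kedro's exact code):

    from kedro.io import DataSetError

    class SafeSavingDataSet:
        """Sketch of the wrapping; `_save` is supplied by subclasses."""

        def save(self, data):
            try:
                self._save(data)
            except Exception as exc:
                message = "Failed while saving data to data set {}.\n{}".format(
                    str(self), str(exc)
                )
                raise DataSetError(message) from exc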
Example #9
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in,
                                                 spark_out, sample_spark_df):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleLocalDataSet(filepath=str(tmp_path / "data.pkl"),
                                         backend="pickle")
        catalog = DataCatalog(data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        })
        pipeline = Pipeline([
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ])
        runner = ParallelRunner()

        pattern = r"{0} cannot be serialized. {1} can only be used with serializable data".format(
            str(sample_spark_df.__class__),
            str(pickle_data.__class__.__name__))

        with pytest.raises(DataSetError, match=pattern):
            runner.run(pipeline, catalog)