Example #1
def test_model_packaging(tmp_path, pipeline_ml_obj):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(filepath=(tmp_path / "model.pkl").resolve().as_posix()),
    })

    catalog._data_sets["model"].save(2)  # emulate model fitting

    artifacts = pipeline_ml_obj.extract_pipeline_artifacts(catalog)

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                                     catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=artifacts,
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    loaded_model = mlflow.pyfunc.load_model(
        model_uri=(Path(r"runs:/") / run_id / "model").as_posix())
    assert loaded_model.predict(1) == 2
Example #2
    def run(self):
        """
        Run the workflow : run each config
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()

        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node, multiple_learning_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        result = runner.run(pipeline, data_catalog)
        if len(result) == 0:
            self.terminate()
Example #3
    def _run_one_task(self, config_filename):
        # create node from Task
        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
        config, log = self._init_config_log(
            config_filename, self.base_directory, self.config_directory
        )
        # Prepare a data catalog
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', config)
        data_catalog.save('log', log)
        data_catalog.save('base_directory', self.base_directory)
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_data_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        return log, config, data_catalog
Example #4
def test_model_packaging_missing_artifacts(tmp_path, pipeline_ml_obj):

    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(filepath=(tmp_path / "model.pkl").resolve().as_posix()),
    })

    kedro_model = KedroPipelineModel(pipeline_ml=pipeline_ml_obj,
                                     catalog=catalog)

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=kedro_model,
            artifacts=None,  # no artifacts provided
            conda_env={"python": "3.7.0"},
        )
        run_id = mlflow.active_run().info.run_id

    with pytest.raises(
            ValueError,
            match="Provided artifacts do not match catalog entries"):
        mlflow.pyfunc.load_model(model_uri=(Path(r"runs:/") / run_id /
                                            "model").as_posix())
Example #5
    def __init__(self, layers):
        # `tmp_path` is assumed to be available from the enclosing scope (e.g. a pytest fixture)
        self._data_sets = {
            "cat": PickleDataSet(filepath=str(tmp_path)),
            "parameters": MemoryDataSet({"name": "value"}),
            "params:rabbit": MemoryDataSet("value"),
        }
        self.layers = layers
Example #6
    def run(self):
        """
        Run all tasks
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet(),
            'dataset': MemoryDataSet(),
            'data': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_node = mls.sl.workflows.tasks.LearnTask.get_node()
        evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node()
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_node,
            evaluate_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        self.terminate()
Example #7
def dummy_catalog():
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return dummy_catalog
Example #8
def catalog_with_encoder():
    return DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": CSVDataSet("fake/path/to/encoder.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
Example #9
    def test_no_param_datasets_in_response(
        self, fake_cli_invoke, fake_load_context, mocker
    ):
        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
        mocked_context = fake_load_context.return_value
        catalog_data_sets = {
            "iris_data": CSVDataSet("test.csv"),
            "parameters": MemoryDataSet(),
            "params:data_ratio": MemoryDataSet(),
            "intermediate": MemoryDataSet(),
            "not_used": CSVDataSet("test2.csv"),
        }

        pl_obj_data_sets = catalog_data_sets.keys() - {"not_used"}
        mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets)
        mocked_context.pipelines.keys.return_value = (self.PIPELINE_NAME, )
        mocked_pl_obj = mocked_context.pipelines.get.return_value
        mocked_pl_obj.data_sets.return_value = pl_obj_data_sets

        result = fake_cli_invoke(["catalog", "list"])

        assert not result.exit_code
        # 'parameters' and 'params:data_ratio' should not appear in the response
        expected_dict = {
            "DataSets in 'pipeline' pipeline": {
                "Datasets mentioned in pipeline": {
                    "CSVDataSet": ["iris_data"],
                    "MemoryDataSet": ["intermediate"],
                },
                "Datasets not mentioned in pipeline": {
                    "CSVDataSet": ["not_used"]
                },
            }
        }
        yaml_dump_mock.assert_called_once_with(expected_dict)
Example #10
def set_catalog(context, key1, key2, key3, key4):
    ds1 = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
    ds2 = pd.DataFrame({"col1": [9, 8], "col2": [7, 6], "col3": [5, 4]})
    context.catalog = DataCatalog({
        key1: MemoryDataSet(ds1),
        key2: MemoryDataSet(),
        key3: MemoryDataSet(ds2),
        key4: MemoryDataSet(),
    })
Example #11
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(filepath=(tmp_path / "model.pkl").resolve().as_posix()),
    })
    return dummy_catalog
Example #12
def test_catalog_extraction_missing_inference_input(pipeline_ml_with_tag):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet()
    })
    with pytest.raises(
            KedroMlflowPipelineMLDatasetsError,
            match="since it is an input for inference pipeline",
    ):
        pipeline_ml_with_tag.extract_pipeline_catalog(catalog)
Example #13
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
Example #14
def catalog_with_stopwords():
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return catalog_with_stopwords
Example #15
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(1),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
Example #16
def catalog_with_stopwords(tmp_path):
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": PickleDataSet((tmp_path / "stopwords.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_stopwords
Example #17
def catalog_with_encoder(tmp_path):
    catalog_with_encoder = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": PickleDataSet((tmp_path / "encoder.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_encoder
Example #18
def test_catalog_extraction_unpersisted_inference_input(pipeline_ml_with_tag):
    catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": MemoryDataSet()
    })
    with pytest.raises(
            KedroMlflowPipelineMLDatasetsError,
            match="The datasets of the training pipeline must be persisted locally",
    ):
        pipeline_ml_with_tag.extract_pipeline_catalog(catalog)
Example #19
def dummy_catalog():

    catalog = DataCatalog(
        {
            "params:param1": 1,
            "foo": MemoryDataSet(),
            "bar": MemoryDataSet(),
            "parameters": {"param1": 1, "param2": 2},
        }
    )

    return catalog
Example #20
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": PickleDataSet(
            filepath=(tmp_path / "data" / "06_models" / "model.pkl").resolve().as_posix()
        ),
    })
    dummy_catalog._data_sets["model"].save(2)  # emulate model fitting

    return dummy_catalog
Example #21
def all_catalog(dataframex, dataframey, dataframey_bad):
    # create the DataFrames as they would sit in Kedro after being loaded into memory
    # https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html
    from kedro.io import DataCatalog, MemoryDataSet
    catalog = DataCatalog({
        "dataframex": MemoryDataSet(),
        "dataframey": MemoryDataSet(),
        "dataframey_bad": MemoryDataSet()
    })
    catalog.save("dataframex", dataframex)
    catalog.save("dataframey", dataframey)
    catalog.save("dataframey_bad", dataframey_bad)
    return catalog
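The example above relies only on DataCatalog.save() and DataCatalog.load(). A minimal, self-contained sketch of that round trip (assuming kedro exposes kedro.io.DataCatalog and kedro.io.MemoryDataSet as in the examples on this page):

from kedro.io import DataCatalog, MemoryDataSet
import pandas as pd

# register an empty in-memory dataset, then save into it and load back
catalog = DataCatalog({"dataframex": MemoryDataSet()})
catalog.save("dataframex", pd.DataFrame({"col1": [1, 2]}))
df = catalog.load("dataframex")
assert list(df["col1"]) == [1, 2]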
Example #22
def test_node_hook(tmp_path):
    mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True,
                                      recursive=True,
                                      sep="-")

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={
            "arg1": "params:param1",
            "arg2": "foo",
            "arg3": "parameters"
        },
        outputs="out",
    )
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {
            "param1": 1,
            "param2": 2
        },
    })
    node_inputs = {
        v: catalog._data_sets.get(v)
        for k, v in node_test._inputs.items()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }
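The assertion above suggests that MlflowNodeHook flattens nested parameter dictionaries with the configured separator before logging them. The hypothetical helper below only illustrates that flattening behaviour; it is not the hook's actual implementation:

def flatten_dict(d, sep="-", parent_key=""):
    """Recursively flatten a nested dict, joining keys with `sep`."""
    items = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            items.update(flatten_dict(value, sep=sep, parent_key=new_key))
        else:
            items[new_key] = value
    return items

assert flatten_dict({"parameters": {"param1": 1, "param2": 2}}) == {
    "parameters-param1": 1,
    "parameters-param2": 2,
}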
Example #23
    def test_exists(self, new_data):
        """Test `exists` method invocation"""
        data_set = MemoryDataSet()
        assert not data_set.exists()

        data_set.save(new_data)
        assert data_set.exists()
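Outside of a test class, the same exists()/save() behaviour can be exercised directly; a minimal sketch (assuming kedro.io.MemoryDataSet as used throughout these examples):

from kedro.io import MemoryDataSet

data_set = MemoryDataSet()          # starts empty, so exists() is False
assert not data_set.exists()

data_set.save({"rows": [1, 2, 3]})  # any Python object can be stored in memory
assert data_set.exists()
assert data_set.load() == {"rows": [1, 2, 3]}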
Example #24
def catalog_with_parameters():
    catalog_with_parameters = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "params:stopwords": MemoryDataSet(["Hello", "Hi"]),
        "params:penalty": MemoryDataSet(0.1),
        "model": CSVDataSet("fake/path/to/model.csv"),
        "params:threshold": MemoryDataSet(0.5),
    })
    return catalog_with_parameters
Example #25
    def test_memory_data_set_input(self, is_async, fan_out_fan_in):
        pipeline = Pipeline([fan_out_fan_in])
        catalog = DataCatalog({"A": MemoryDataSet("42")})
        result = ParallelRunner(is_async=is_async).run(pipeline, catalog)
        assert "Z" in result
        assert len(result["Z"]) == 3
        assert result["Z"] == ("42", "42", "42")
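The free outputs returned by a runner can also be seen with a much smaller pipeline; a minimal sketch using SequentialRunner (written under the same kedro Pipeline/node/runner API assumptions as the examples above, not taken from any test suite):

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def double(x):
    return x * 2


# "Z" is not registered in the catalog, so the runner returns it as a free output
pipeline = Pipeline([node(double, inputs="A", outputs="Z")])
catalog = DataCatalog({"A": MemoryDataSet(21)})
result = SequentialRunner().run(pipeline, catalog)
assert result["Z"] == 42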
Example #26
    def extract_pipeline_catalog(self, catalog: DataCatalog) -> DataCatalog:
        sub_catalog = DataCatalog()
        for data_set_name in self.inference.inputs():
            if data_set_name == self.input_name:
                # there is no obligation that this dataset is persisted
                # thus it is allowed to be an empty memory dataset
                data_set = catalog._data_sets.get(data_set_name) or MemoryDataSet()
                sub_catalog.add(data_set_name=data_set_name, data_set=data_set)
            else:
                try:
                    data_set = catalog._data_sets[data_set_name]
                    if isinstance(data_set, MemoryDataSet):
                        raise KedroMlflowPipelineMLDatasetsError("""
                                The datasets of the training pipeline must be persisted locally
                                to be used by the inference pipeline. You must enforce them as
                                non 'MemoryDataSet' in the 'catalog.yml'.
                                Dataset '{data_set_name}' is not persisted currently.
                                """.format(data_set_name=data_set_name))
                    sub_catalog.add(data_set_name=data_set_name,
                                    data_set=data_set)
                except KeyError:
                    raise KedroMlflowPipelineMLDatasetsError("""
                                The provided catalog must contain '{data_set_name}' data_set
                                since it is an input for inference pipeline.
                                """.format(data_set_name=data_set_name))

        return sub_catalog
Example #27
    def _init_dataset(self):

        if not getattr(self, "_ready", None):
            self._ready = True
            self.dataset_name = self.dataset_name or self._dataset_name
            _dataset = self.dataset
            if isinstance(self.dataset, str):
                dataset_dict = dataset_dicts.get(
                    self.dataset, {"type": "pickle.PickleDataSet"})
                dataset_dict["filepath"] = self.filepath = (
                    self.filepath or tempfile.gettempdir() + "/" +
                    self.dataset_name + "." + self.dataset)
                _dataset = dataset_dict

            if isinstance(_dataset, dict):
                self._dataset = AbstractDataSet.from_config(
                    self._dataset_name, _dataset)
            elif isinstance(_dataset, AbstractDataSet):
                self._dataset = _dataset
            else:
                raise ValueError(
                    "The argument type of `dataset` should be either a dict/YAML "
                    "representation of the dataset, or the actual dataset object."
                )

            _filepath = getattr(self._dataset, "_filepath", None)
            if _filepath:
                self.filepath = str(_filepath)

            if self.caching and (not self._running_parallel):
                self._cache = MemoryDataSet(copy_mode=self.copy_mode)
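The dict branch above hands the configuration to AbstractDataSet.from_config(), the same mechanism kedro uses for catalog.yml entries. A minimal sketch of that call (assuming a kedro version that ships kedro.extras.datasets.pickle.PickleDataSet, so the "pickle.PickleDataSet" type string resolves):

import tempfile

from kedro.io import AbstractDataSet

data_set = AbstractDataSet.from_config(
    "my_dataset",
    {
        "type": "pickle.PickleDataSet",
        "filepath": tempfile.gettempdir() + "/my_dataset.pickle",
    },
)
data_set.save({"weights": [0.1, 0.2]})
assert data_set.load() == {"weights": [0.1, 0.2]}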
Example #28
def create_master_table(shuttles: pd.DataFrame, companies: pd.DataFrame,
                        reviews: pd.DataFrame) -> [pd.DataFrame, pd.DataFrame]:
    """Combines all data to create a master table.

        Args:
            shuttles: Preprocessed data for shuttles.
            companies: Preprocessed data for companies.
            reviews: Source data for reviews.
        Returns:
            Master table and an in-memory dataset marking the ML input as ready.

    """
    rated_shuttles = shuttles.merge(reviews,
                                    left_on="id",
                                    right_on="shuttle_id")

    with_companies = rated_shuttles.merge(companies,
                                          left_on="company_id",
                                          right_on="id")

    master_table = with_companies.drop(["shuttle_id", "company_id"], axis=1)
    master_table = master_table.dropna()
    input_ml_data = pd.DataFrame({'state': 'ready'}, index=[0])
    input_ml = MemoryDataSet(data=input_ml_data)
    return master_table, input_ml
Example #29
    def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):

        self.propensity_model = PickleLocalDataSet(
            filepath=propensity_model_filename, version=None)
        self.uplift_models_dict = PickleLocalDataSet(
            filepath=uplift_models_filename, version=None)
        self.df_03 = CSVLocalDataSet(
            filepath=df_filename,
            load_args=dict(index_col=["partition", "index"],
                           float_precision="high"),
            save_args=dict(index=True, float_format="%.16e"),
            version=None,
        )
        self.treated__sim_eval_df = CSVLocalDataSet(
            filepath=treated_sim_eval_filename, version=None)
        self.untreated__sim_eval_df = CSVLocalDataSet(
            filepath=untreated_sim_eval_filename, version=None)
        self.estimated_effect_df = CSVLocalDataSet(
            filepath=estimated_effect_filename, version=None)
        self.args_raw = args_raw
Example #30
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog(
        {
            "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
            "params:unused_param": MemoryDataSet("blah"),
            "data": MemoryDataSet(),
            "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
            "my_metrics": MlflowMetricsDataSet(),
            "another_metrics": MlflowMetricsDataSet(prefix="foo"),
            "my_metric": MlflowMetricDataSet(),
            "another_metric": MlflowMetricDataSet(key="foo"),
            "my_metric_history": MlflowMetricHistoryDataSet(),
            "another_metric_history": MlflowMetricHistoryDataSet(key="bar"),
        }
    )
    return dummy_catalog