Example #1
    def _run_one_task(self, config_filename):
        # create node from Task
        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
        config, log = self._init_config_log(
            config_filename, self.base_directory, self.config_directory)
        # Prepare a data catalog
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', config)
        data_catalog.save('log', log)
        data_catalog.save('base_directory', self.base_directory)
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_data_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        return log, config, data_catalog
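The example above depends on the project-specific mls task classes, but the underlying Kedro pattern is generic: register datasets in a DataCatalog, wrap functions as nodes, assemble a Pipeline, and execute it with SequentialRunner. Below is a minimal, self-contained sketch of that pattern, assuming Kedro's pre-0.19 API (kedro.io.MemoryDataSet); the functions and dataset names are made up for illustration.

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

def load_data(config):
    # produce some raw values based on the config (illustrative only)
    return list(range(config["n"]))

def prepare_data(raw):
    # trivial transformation standing in for real preparation
    return [value + 1 for value in raw]

catalog = DataCatalog({'config': MemoryDataSet(), 'raw': MemoryDataSet()})
catalog.save('config', {"n": 3})

pipeline = Pipeline([
    node(load_data, inputs='config', outputs='raw'),
    node(prepare_data, inputs='raw', outputs='prepared'),
])

# 'prepared' is not registered in the catalog, so the runner returns it
result = SequentialRunner().run(pipeline, catalog)
print(result['prepared'])  # [1, 2, 3]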
Example #2
    def _init_dataset(self):

        if not getattr(self, "_ready", None):
            self._ready = True
            self.dataset_name = self.dataset_name or self._dataset_name
            _dataset = self.dataset
            if isinstance(self.dataset, str):
                dataset_dict = dataset_dicts.get(
                    self.dataset, {"type": "pickle.PickleDataSet"})
                dataset_dict["filepath"] = self.filepath = (
                    self.filepath or tempfile.gettempdir() + "/" +
                    self.dataset_name + "." + self.dataset)
                _dataset = dataset_dict

            if isinstance(_dataset, dict):
                self._dataset = AbstractDataSet.from_config(
                    self._dataset_name, _dataset)
            elif isinstance(_dataset, AbstractDataSet):
                self._dataset = _dataset
            else:
                raise ValueError(
                    "The argument type of `dataset` should be either a dict/YAML "
                    "representation of the dataset, or the actual dataset object."
                )

            _filepath = getattr(self._dataset, "_filepath", None)
            if _filepath:
                self.filepath = str(_filepath)

            if self.caching and (not self._running_parallel):
                self._cache = MemoryDataSet(copy_mode=self.copy_mode)
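The dict branch above relies on AbstractDataSet.from_config() to turn a YAML-style dict into a concrete dataset. A minimal sketch of that call in isolation, assuming Kedro's pre-0.19 API; the dataset name and filepath are invented for illustration.

import tempfile
from kedro.io import AbstractDataSet

dataset = AbstractDataSet.from_config(
    'scores',  # hypothetical dataset name
    {'type': 'pickle.PickleDataSet',
     'filepath': tempfile.gettempdir() + '/scores.pickle'},
)
dataset.save({'accuracy': 0.9})
print(dataset.load())  # {'accuracy': 0.9}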
Example #3
    def run(self):
        """
        Run all tasks
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet(),
            'dataset': MemoryDataSet(),
            'data': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
        prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
        split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
        learn_node = mls.sl.workflows.tasks.LearnTask.get_node()
        evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node()
        # Assemble nodes into a pipeline
        pipeline = Pipeline([
            load_data_node, prepare_data_node, split_data_node, learn_node,
            evaluate_node
        ])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        runner.run(pipeline, data_catalog)
        self.terminate()
Example #4
def dummy_catalog():
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return dummy_catalog
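Catalog factories like dummy_catalog above are typically exposed as pytest fixtures and handed to a runner inside a test. A hedged sketch of that usage, again assuming Kedro's pre-0.19 API; the node and assertion are invented for illustration.

import pytest
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner

@pytest.fixture
def dummy_catalog():
    return DataCatalog({'raw_data': MemoryDataSet(1), 'data': MemoryDataSet()})

def add_one(x):
    return x + 1

def test_pipeline_fills_catalog(dummy_catalog):
    pipeline = Pipeline([node(add_one, inputs='raw_data', outputs='data')])
    SequentialRunner().run(pipeline, dummy_catalog)
    # the registered 'data' MemoryDataSet now holds the node's output
    assert dummy_catalog.load('data') == 2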
Example #5
    def run(self):
        """
        Run the workflow : run each config
        """
        # data
        data_catalog = DataCatalog({
            'config': MemoryDataSet(),
            'log': MemoryDataSet(),
            'base_directory': MemoryDataSet()
        })
        data_catalog.save('config', self.config)
        data_catalog.save('log', self.log)
        data_catalog.save('base_directory', self.base_directory)

        expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node()
        multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node()

        # Assemble nodes into a pipeline
        pipeline = Pipeline([expand_config_node, multiple_learning_node])
        # Create a runner to run the pipeline
        runner = SequentialRunner()
        # Run the pipeline
        result = runner.run(pipeline, data_catalog)
        if len(result) == 0:
            self.terminate()
Example #6
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
Example #7
def catalog_with_stopwords():
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return catalog_with_stopwords
Example #8
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(1),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
Example #9
def catalog_with_stopwords(tmp_path):
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": PickleDataSet((tmp_path / "stopwords.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_stopwords
Example #10
def catalog_with_encoder(tmp_path):
    catalog_with_encoder = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": PickleDataSet((tmp_path / "encoder.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_encoder
Example #11
def test_node_hook(tmp_path):
    mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True,
                                      recursive=True,
                                      sep="-")

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={
            "arg1": "params:param1",
            "arg2": "foo",
            "arg3": "parameters"
        },
        outputs="out",
    )
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {
            "param1": 1,
            "param2": 2
        },
    })
    node_inputs = {
        v: catalog._data_sets.get(v)
        for k, v in node_test._inputs.items()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }
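The final assertion checks that the hook flattened the nested "parameters" dictionary with the "-" separator before logging it to MLflow. A rough standalone sketch of that flattening with plain mlflow calls; flatten() here is a hypothetical helper, not kedro-mlflow's internal implementation.

import mlflow

def flatten(params, parent='', sep='-'):
    # recursively flatten nested dicts: {'a': {'b': 1}} -> {'a-b': 1}
    flat = {}
    for key, value in params.items():
        name = parent + sep + key if parent else key
        if isinstance(value, dict):
            flat.update(flatten(value, name, sep))
        else:
            flat[name] = value
    return flat

with mlflow.start_run():
    # logs param1=1, parameters-param1=1, parameters-param2=2
    mlflow.log_params(flatten({'param1': 1,
                               'parameters': {'param1': 1, 'param2': 2}}))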
Example #12
    def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):

        self.propensity_model = PickleLocalDataSet(
            filepath=propensity_model_filename, version=None)
        self.uplift_models_dict = PickleLocalDataSet(
            filepath=uplift_models_filename, version=None)
        self.df_03 = CSVLocalDataSet(
            filepath=df_filename,
            load_args=dict(index_col=["partition", "index"],
                           float_precision="high"),
            save_args=dict(index=True, float_format="%.16e"),
            version=None,
        )
        self.treated__sim_eval_df = CSVLocalDataSet(
            filepath=treated_sim_eval_filename, version=None)
        self.untreated__sim_eval_df = CSVLocalDataSet(
            filepath=untreated_sim_eval_filename, version=None)
        self.estimated_effect_df = CSVLocalDataSet(
            filepath=estimated_effect_filename, version=None)
        self.args_raw = args_raw
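CSVLocalDataSet and PickleLocalDataSet belong to the pre-0.16 kedro.io API that this example targets. A minimal save/load sketch with CSVLocalDataSet under that assumption; the filepath and DataFrame are made up for illustration.

import pandas as pd
from kedro.io import CSVLocalDataSet

dataset = CSVLocalDataSet(
    filepath='../data/07_model_output/df.csv',  # hypothetical path
    save_args=dict(index=True, float_format='%.16e'),
    version=None,
)
dataset.save(pd.DataFrame({'estimated_effect': [0.1, 0.2]}))
df = dataset.load()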
Example #13
def create_master_table(shuttles: pd.DataFrame, companies: pd.DataFrame,
                        reviews: pd.DataFrame) -> [pd.DataFrame, pd.DataFrame]:
    """Combines all data to create a master table.

        Args:
            shuttles: Preprocessed data for shuttles.
            companies: Preprocessed data for companies.
            reviews: Source data for reviews.
        Returns:
            Master table and a MemoryDataSet flagging the ML input as ready.

    """
    rated_shuttles = shuttles.merge(reviews,
                                    left_on="id",
                                    right_on="shuttle_id")

    with_companies = rated_shuttles.merge(companies,
                                          left_on="company_id",
                                          right_on="id")

    master_table = with_companies.drop(["shuttle_id", "company_id"], axis=1)
    master_table = master_table.dropna()
    input_ml_data = pd.DataFrame({'state': 'ready'}, index=[0])
    input_ml = MemoryDataSet(data=input_ml_data)
    return master_table, input_ml
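In a Kedro project a node function like create_master_table is registered in a pipeline definition. A hedged sketch of that wiring; the input dataset names are assumptions for illustration, not names taken from the source.

from kedro.pipeline import Pipeline, node

master_table_pipeline = Pipeline([
    node(
        create_master_table,
        inputs=['preprocessed_shuttles', 'preprocessed_companies', 'reviews'],
        outputs=['master_table', 'input_ml'],
    ),
])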