def _run_one_task(self, config_filename):
    # create node from Task
    load_data_node = mls.workflows.tasks.LoadDataTask.get_node()
    prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node()
    split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node()
    learn_data_node = mls.sl.workflows.tasks.LearnTask.get_node()
    config, log = self._init_config_log(config_filename,
                                        self.base_directory,
                                        self.config_directory)
    # Prepare a data catalog
    data_catalog = DataCatalog({'config': MemoryDataSet(),
                                'log': MemoryDataSet(),
                                'base_directory': MemoryDataSet()})
    data_catalog.save('config', config)
    data_catalog.save('log', log)
    data_catalog.save('base_directory', self.base_directory)
    # Assemble nodes into a pipeline
    pipeline = Pipeline([load_data_node,
                         prepare_data_node,
                         split_data_node,
                         learn_data_node])
    # Create a runner to run the pipeline
    runner = SequentialRunner()
    # Run the pipeline
    runner.run(pipeline, data_catalog)
    return log, config, data_catalog
def _init_dataset(self):
    if not getattr(self, "_ready", None):
        self._ready = True
        self.dataset_name = self.dataset_name or self._dataset_name
        _dataset = self.dataset
        if isinstance(self.dataset, str):
            # `dataset` was given as a string key: look it up in dataset_dicts,
            # default to a PickleDataSet, and build a filepath in the temp directory
            dataset_dict = dataset_dicts.get(
                self.dataset, {"type": "pickle.PickleDataSet"})
            dataset_dict["filepath"] = self.filepath = (
                self.filepath
                or tempfile.gettempdir() + "/" + self.dataset_name + "." + self.dataset)
            _dataset = dataset_dict
        if isinstance(_dataset, dict):
            # build the concrete dataset from its dict/YAML-style description
            self._dataset = AbstractDataSet.from_config(self._dataset_name, _dataset)
        elif isinstance(_dataset, AbstractDataSet):
            self._dataset = _dataset
        else:
            raise ValueError(
                "The argument type of `dataset` should be either a dict/YAML "
                "representation of the dataset, or the actual dataset object."
            )
        _filepath = getattr(self._dataset, "_filepath", None)
        if _filepath:
            self.filepath = str(_filepath)
        if self.caching and (not self._running_parallel):
            self._cache = MemoryDataSet(copy_mode=self.copy_mode)
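# A minimal, self-contained sketch (not part of the original class) of the
# AbstractDataSet.from_config pattern used in _init_dataset above: a dict describing
# the dataset is turned into a concrete dataset instance. The "example_dataset" name
# and the temp-file path are illustrative assumptions.
import tempfile

from kedro.io import AbstractDataSet


def _from_config_example():
    config = {
        "type": "pickle.PickleDataSet",
        "filepath": tempfile.gettempdir() + "/example_dataset.pickle",
    }
    dataset = AbstractDataSet.from_config("example_dataset", config)
    dataset.save({"fitted": True})
    assert dataset.load() == {"fitted": True}
    return dataset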
def run(self): """ Run all tasks """ # data data_catalog = DataCatalog({ 'config': MemoryDataSet(), 'log': MemoryDataSet(), 'base_directory': MemoryDataSet(), 'dataset': MemoryDataSet(), 'data': MemoryDataSet() }) data_catalog.save('config', self.config) data_catalog.save('log', self.log) data_catalog.save('base_directory', self.base_directory) load_data_node = mls.workflows.tasks.LoadDataTask.get_node() prepare_data_node = mls.sl.workflows.tasks.PrepareDataTask.get_node() split_data_node = mls.sl.workflows.tasks.SplitDataTask.get_node() learn_node = mls.sl.workflows.tasks.LearnTask.get_node() evaluate_node = mls.sl.workflows.tasks.EvaluateTask.get_node() # Assemble nodes into a pipeline pipeline = Pipeline([ load_data_node, prepare_data_node, split_data_node, learn_node, evaluate_node ]) # Create a runner to run the pipeline runner = SequentialRunner() # Run the pipeline runner.run(pipeline, data_catalog) self.terminate()
def dummy_catalog():
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return dummy_catalog
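# Hedged usage sketch (not from the original tests): wiring the dummy_catalog above into a
# trivial one-node pipeline. The `identity` function and the node wiring are illustrative
# assumptions; only the in-memory entries of the catalog are touched.
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def identity(x):
    return x


def run_identity_on_dummy_catalog():
    catalog = dummy_catalog()
    catalog.save("raw_data", 42)
    pipeline = Pipeline([node(identity, inputs="raw_data", outputs="data")])
    # "data" is registered in the catalog, so the runner stores the node output there
    # instead of returning it as a free output
    SequentialRunner().run(pipeline, catalog)
    assert catalog.load("data") == 42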
def run(self): """ Run the workflow : run each config """ # data data_catalog = DataCatalog({ 'config': MemoryDataSet(), 'log': MemoryDataSet(), 'base_directory': MemoryDataSet() }) data_catalog.save('config', self.config) data_catalog.save('log', self.log) data_catalog.save('base_directory', self.base_directory) expand_config_node = mls.sl.workflows.tasks.ExpandConfigTask.get_node() multiple_learning_node = mls.sl.workflows.tasks.MultipleLearningTask.get_node( ) # Assemble nodes into a pipeline pipeline = Pipeline([expand_config_node, multiple_learning_node]) # Create a runner to run the pipeline runner = SequentialRunner() # Run the pipeline result = runner.run(pipeline, data_catalog) if len(result) == 0: self.terminate()
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
def catalog_with_stopwords():
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": CSVDataSet("fake/path/to/stopwords.csv"),
        "model": CSVDataSet("fake/path/to/model.csv"),
    })
    return catalog_with_stopwords
def dummy_catalog(tmp_path):
    dummy_catalog = DataCatalog({
        "raw_data": MemoryDataSet(1),
        "params:unused_param": MemoryDataSet("blah"),
        "data": MemoryDataSet(),
        "model": PickleDataSet((tmp_path / "model.csv").as_posix()),
    })
    return dummy_catalog
def catalog_with_stopwords(tmp_path):
    catalog_with_stopwords = DataCatalog({
        "data": MemoryDataSet(),
        "cleaned_data": MemoryDataSet(),
        "stopwords_from_nltk": PickleDataSet(
            (tmp_path / "stopwords.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_stopwords
def catalog_with_encoder(tmp_path):
    catalog_with_encoder = DataCatalog({
        "raw_data": MemoryDataSet(),
        "data": MemoryDataSet(),
        "encoder": PickleDataSet((tmp_path / "encoder.pkl").resolve().as_posix()),
        "model": PickleDataSet((tmp_path / "model.pkl").resolve().as_posix()),
    })
    return catalog_with_encoder
def test_node_hook(tmp_path):
    mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True,
                                      recursive=True,
                                      sep="-")

    def fake_fun(arg1, arg2, arg3):
        return None

    node_test = node(
        func=fake_fun,
        inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
        outputs="out",
    )
    catalog = DataCatalog({
        "params:param1": 1,
        "foo": MemoryDataSet(),
        "bar": MemoryDataSet(),
        "parameters": {"param1": 1, "param2": 2},
    })
    node_inputs = {
        v: catalog._data_sets.get(v) for k, v in node_test._inputs.items()
    }

    mlflow_tracking_uri = (tmp_path / "mlruns").as_uri()
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    with mlflow.start_run():
        mlflow_node_hook.before_node_run(
            node=node_test,
            catalog=catalog,
            inputs=node_inputs,
            is_async=False,
            run_id="132",
        )
        run_id = mlflow.active_run().info.run_id

    mlflow_client = MlflowClient(mlflow_tracking_uri)
    current_run = mlflow_client.get_run(run_id)
    assert current_run.data.params == {
        "param1": "1",
        "parameters-param1": "1",
        "parameters-param2": "2",
    }
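# Illustrative helper (an assumption, not kedro-mlflow's actual implementation) showing the
# kind of flattening that yields the "parameters-param1" / "parameters-param2" keys asserted
# above when flatten_dict_params=True and sep="-".
def flatten_dict(d, sep="-", parent_key=""):
    items = {}
    for key, value in d.items():
        new_key = parent_key + sep + str(key) if parent_key else str(key)
        if isinstance(value, dict):
            # recurse into nested dicts, prefixing children with the parent key
            items.update(flatten_dict(value, sep=sep, parent_key=new_key))
        else:
            items[new_key] = value
    return items


assert flatten_dict({"parameters": {"param1": 1, "param2": 2}}) == {
    "parameters-param1": 1,
    "parameters-param2": 2,
}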
def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):
    self.propensity_model = PickleLocalDataSet(
        filepath=propensity_model_filename, version=None)
    self.uplift_models_dict = PickleLocalDataSet(
        filepath=uplift_models_filename, version=None)
    self.df_03 = CSVLocalDataSet(
        filepath=df_filename,
        load_args=dict(index_col=["partition", "index"], float_precision="high"),
        save_args=dict(index=True, float_format="%.16e"),
        version=None,
    )
    self.treated__sim_eval_df = CSVLocalDataSet(
        filepath=treated_sim_eval_filename, version=None)
    self.untreated__sim_eval_df = CSVLocalDataSet(
        filepath=untreated_sim_eval_filename, version=None)
    self.estimated_effect_df = CSVLocalDataSet(
        filepath=estimated_effect_filename, version=None)
    self.args_raw = args_raw
def create_master_table(shuttles: pd.DataFrame,
                        companies: pd.DataFrame,
                        reviews: pd.DataFrame) -> Tuple[pd.DataFrame, MemoryDataSet]:
    """Combines all data to create a master table.

    Args:
        shuttles: Preprocessed data for shuttles.
        companies: Preprocessed data for companies.
        reviews: Source data for reviews.
    Returns:
        The master table and a MemoryDataSet holding a one-row "ready" flag
        for the downstream ML input.
    """
    rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id")
    with_companies = rated_shuttles.merge(companies, left_on="company_id", right_on="id")
    master_table = with_companies.drop(["shuttle_id", "company_id"], axis=1)
    master_table = master_table.dropna()
    input_ml_data = pd.DataFrame({'state': 'ready'}, index=[0])
    input_ml = MemoryDataSet(data=input_ml_data)
    return master_table, input_ml
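# Hypothetical mini-run (not part of the original project) of create_master_table above,
# showing the two joins and the dropped key columns; the column names and values are made up.
from typing import Tuple

import pandas as pd
from kedro.io import MemoryDataSet

shuttles = pd.DataFrame({"id": [1], "company_id": [10], "passenger_capacity": [4]})
companies = pd.DataFrame({"id": [10], "company_rating": [0.9]})
reviews = pd.DataFrame({"shuttle_id": [1], "review_scores_rating": [90]})

master_table, input_ml = create_master_table(shuttles, companies, reviews)
# master_table keeps the merged columns (pandas suffixes the duplicated "id" join keys
# as id_x/id_y); input_ml is a MemoryDataSet wrapping the one-row {"state": "ready"} flag
assert input_ml.load()["state"].iloc[0] == "ready"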