def test_no_training_serve_skew():
    """
    Test for training-serving skew (feature engineering in training and
    serving should be the same)
    """
    dag = DAGSpec.find().to_dag()

    # load raw data
    get = pd.read_parquet(dag['get'].product)
    del get['target']

    # load feature vectors
    join = pd.read_parquet(dag['join'].product)
    del join['target']

    pipeline = InferencePipeline()

    # make predictions using the online pipeline (if the training set is
    # large, you can take a random sample)
    online = [
        pipeline.predict(get=get.loc[[idx]])['join'] for idx in join.index
    ]

    # cast to a data frame
    online_df = pd.concat(online)
    online_df.index = join.index

    # compare data frames
    assert online_df.equals(join)
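# The comment above mentions taking a random sample when the training set is
# large. A minimal sketch of that variant, assuming the same products and
# InferencePipeline interface; the sample size (100) and seed are arbitrary
# choices, not something the original test prescribes.
def test_no_training_serve_skew_sampled():
    dag = DAGSpec.find().to_dag()

    get = pd.read_parquet(dag['get'].product)
    del get['target']

    join = pd.read_parquet(dag['join'].product)
    del join['target']

    # compare only a random subset of the feature vectors
    sample = join.sample(n=100, random_state=0)

    pipeline = InferencePipeline()
    online = [
        pipeline.predict(get=get.loc[[idx]])['join'] for idx in sample.index
    ]

    online_df = pd.concat(online)
    online_df.index = sample.index

    assert online_df.equals(sample)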
def test_export(mock_docker_calls, backup_packaged_project, monkeypatch, mode,
                args):
    load_tasks_mock = Mock(wraps=commons.load_tasks)
    monkeypatch.setattr(commons, 'load_tasks', load_tasks_mock)

    exporter = ArgoWorkflowsExporter(path_to_config='soopervisor.yaml',
                                     env_name='serve')
    exporter.add()
    exporter.export(mode=mode, until=None)

    yaml_str = Path('serve/argo.yaml').read_text()
    spec = yaml.safe_load(yaml_str)
    dag = DAGSpec.find().to_dag()

    load_tasks_mock.assert_called_once_with(mode=mode)

    # make sure the "source" key is represented in literal style
    # (https://yaml-multiline.info/) to make the generated script more
    # readable
    assert 'source: |' in yaml_str

    run_task_template = spec['spec']['templates'][0]
    tasks = spec['spec']['templates'][1]['dag']['tasks']

    assert run_task_template['script'][
        'source'] == 'ploomber task {{inputs.parameters.task_name}}' + args
    assert spec['spec']['volumes'] == []
    assert run_task_template['script']['volumeMounts'] == []
    assert Workflow.from_dict(copy(spec))
    assert set(spec) == {'apiVersion', 'kind', 'metadata', 'spec'}
    assert set(spec['metadata']) == {'generateName'}
    assert set(spec['spec']) == {'entrypoint', 'templates', 'volumes'}

    # should not change the working directory
    assert run_task_template['script']['workingDir'] is None

    assert run_task_template['script'][
        'image'] == 'your-repository/name:0.1dev'
    assert run_task_template['name'] == 'run-task'
    assert spec['metadata']['generateName'] == 'my-project-'

    # dependencies in the generated spec must match the dag structure
    assert all([
        set(dag[t['name']].upstream) == set(t['dependencies']) for t in tasks
    ])

    # tasks call the right template
    assert set(t['template'] for t in tasks) == {'run-task'}

    # check each task uses the right parameters
    assert all([
        t['arguments']['parameters'][0] == {
            'name': 'task_name',
            'value': t['name']
        } for t in tasks
    ])
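# For reference, a sketch of the same exporter flow outside of a test,
# assuming a project whose soopervisor.yaml defines a 'serve' target; this
# mirrors the calls exercised by test_export above rather than introducing
# any additional API.
exporter = ArgoWorkflowsExporter(path_to_config='soopervisor.yaml',
                                 env_name='serve')
exporter.add()  # scaffold the serve/ directory
exporter.export(mode='incremental', until=None)  # write serve/argo.yaml

# the generated spec can then be inspected or submitted to Argo
spec = yaml.safe_load(Path('serve/argo.yaml').read_text())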
def load_tasks(mode='incremental'):
    """Load task names and their upstream dependencies

    Parameters
    ----------
    mode : str, default='incremental'
        One of 'incremental' (only include tasks that are outdated with
        respect to the remote metadata), 'regular' (ignore status, submit
        all tasks and determine status at runtime) or 'force' (ignore
        status, submit all tasks and force execution regardless of status)

    Returns
    -------
    tasks : dict
        A dictionary with tasks (keys) and upstream dependencies (values)
        to submit
    args : list
        A list of arguments to pass to "ploomber task {name}"
    """
    valid = Mode.get_values()

    if mode not in valid:
        raise ValueError(f'mode must be one of {valid!r}')

    dag = DAGSpec.find().to_dag()

    if mode == 'incremental':
        dag.render(remote=True)

        tasks = []

        for name, task in dag.items():
            if task.exec_status != TaskStatus.Skipped:
                tasks.append(name)
    else:
        # forced rendering is faster; we only need this to ensure the
        # pipeline does not have any rendering problems before proceeding
        dag.render(force=True)

        tasks = list(dag.keys())

    out = {}

    for t in tasks:
        # only keep upstream dependencies that are also submitted
        out[t] = [name for name in dag[t].upstream.keys() if name in tasks]

    return out, [] if mode != 'force' else ['--force']
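# Usage sketch for load_tasks, run from a project root where DAGSpec.find()
# can locate the pipeline spec; the task names below are illustrative only.
tasks, args = load_tasks(mode='force')
# tasks maps each submitted task to the upstream tasks that were also
# submitted, e.g. {'get': [], 'features': ['get'], 'fit': ['features']}
# args is ['--force'] here, so each submission runs:
#   ploomber task <name> --force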
def __init__(self, path_to_config, env_name):
    # initialize configuration and run a few checks on it
    self._cfg = self.CONFIG_CLASS.from_file_with_root_key(
        path_to_config=path_to_config,
        env_name=env_name,
    )

    self._env_name = env_name

    # initialize dag (needed for validation)
    # TODO: load the corresponding env.{target-name}.yaml to simulate
    # what's going to happen
    self._dag = DAGSpec.find(lazy_import=True).to_dag().render(
        force=True, show_progress=False)

    # ensure that the project and the config make sense
    self.validate()

    # validate target-specific details
    self._validate(self._cfg, self._dag, self._env_name)
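# A minimal sketch of how a concrete target plugs into the __init__ above,
# assuming it belongs to an abstract exporter base class (named
# AbstractExporter here for illustration). MyTargetExporter and
# MyTargetConfig are hypothetical names; ArgoWorkflowsExporter above is
# presumably one such subclass.
class MyTargetExporter(AbstractExporter):
    # parses this target's section in soopervisor.yaml (hypothetical config)
    CONFIG_CLASS = MyTargetConfig

    @staticmethod
    def _validate(cfg, dag, env_name):
        # target-specific checks on the config and the rendered dag go here
        pass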
def test_pipeline():
    """
    This is a smoke test: it checks that the pipeline runs, but not its
    output.

    NOTE: it's common for pipelines to take hours to run; a way to keep this
    test feasible is to run it with a sample of the data and save the
    results in a different folder to prevent overwriting your real results.
    """
    # load dag
    dag = DAGSpec.find().to_dag()

    # change executor settings: you can use "pytest --pdb" to start a
    # debugging session if the test fails. Calling dag['task'].debug() is
    # another option
    dag.executor = Serial(build_in_subprocess=False, catch_exceptions=False)

    # a third approach for debugging is to use: import IPython; IPython.embed()
    # to start an interactive session at this point. To do so, you must call
    # "pytest -s"
    dag.build()
def test_train():
    """
    This is a smoke test: it only checks that the training pipeline runs
    (it doesn't check whether the output is correct). It passes a sample of
    the data to make it faster.
    """
    # load dag, redirecting products to a testing folder and enabling
    # sampling
    dag = DAGSpec.find(env={
        'products': '{{root}}/testing',
        'sample': True
    }).to_dag()

    # change executor settings: you can use "pytest --pdb" to start a
    # debugging session if the test fails. Calling dag['task'].debug() is
    # another option
    dag.executor = Serial(build_in_subprocess=False, catch_exceptions=False)

    # a third approach for debugging is to use: import IPython; IPython.embed()
    # to start an interactive session at this point. To do so, you must call
    # "pytest -s"
    dag.build()
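# The comments in the two tests above name three debugging approaches; a
# short sketch of the second one ('fit' is a placeholder task name):
dag = DAGSpec.find().to_dag()
dag.render()
dag['fit'].debug()  # starts an interactive debugging session for that task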
def dag_build():
    # build the full pipeline in the current process (no subprocesses),
    # which makes failures easier to debug
    dag = DAGSpec.find().to_dag()
    dag.executor = Serial(build_in_subprocess=False)
    dag.render().build()
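# dag_build can be wired into a pytest fixture so tests that inspect the
# pipeline's products run against a fresh build; the fixture name and
# session scope here are assumptions, not part of the original helper.
import pytest

@pytest.fixture(scope='session')
def built_dag():
    dag_build()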