def test_copy(copy):
    """The upstream object is shared with downstream tasks iff copy is off."""

    def _assign_upstream(upstream):
        _assign_upstream.obj = upstream
        return 42

    pipeline = DAG()
    root = PythonCallable(_root,
                          File('root.parquet'),
                          pipeline,
                          name='root',
                          serializer=serializer,
                          params={'input_data': {'x': [0, 0, 0]}})
    task = PythonCallable(_assign_upstream,
                          File('task.parquet'),
                          pipeline,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)
    root >> task

    in_memory = InMemoryDAG(pipeline)
    out = in_memory.build({'root': {'x': [1]}}, copy=copy)

    # if copy is disabled, _assign_upstream must have received the very same
    # object the root task returned; if copying, it must be a different one
    shared = _assign_upstream.obj['root'] is out['root']
    assert shared is (not copy)
def test_in_memory_dag(dag):
    """Building an in-memory DAG returns each task's output by name."""
    in_memory = InMemoryDAG(dag)
    result = in_memory.build({'root': {'x': [1, 2, 3]}})

    # root forwards the input, task adds one to each element
    assert result['root']['x'].tolist() == [1, 2, 3]
    assert result['task']['x'].tolist() == [2, 3, 4]
def make_predict():
    """Build an in-memory prediction pipeline from a previously trained model.

    Re-uses the feature-engineering code from training and appends a final
    prediction task, returning the whole thing as an ``InMemoryDAG``.
    """
    # load the model generated by the training graph
    with open(Path('output', 'model.pickle'), 'rb') as f:
        model = pickle.load(f)

    prediction_dag = DAG()

    # "get" forwards whatever value is passed when calling .build(); the
    # "preprocessor" argument runs arbitrary logic (parsing, validation)
    # on that input before forwarding it
    input_data_passer(dag=prediction_dag,
                      name='get',
                      preprocessor=validate_input_data)

    # same feature-engineering code used for training
    add_features(prediction_dag)

    # final task: executes the "predict" function, extra arguments are
    # supplied via "params"
    predict_task = in_memory_callable(predict,
                                      dag=prediction_dag,
                                      name='predict',
                                      params=dict(model=model))

    # predict only after all features are joined
    prediction_dag['join'] >> predict_task

    # convert the batch-processing pipeline to an in-memory one
    return InMemoryDAG(prediction_dag)
def test_input_data_passer():
    """input_data_passer forwards its input value to downstream tasks."""
    pipeline = DAG()
    root = input_data_passer(pipeline, name='root')
    task = PythonCallable(_add_one,
                          File('task.parquet'),
                          pipeline,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)
    root >> task

    result = InMemoryDAG(pipeline).build({'root': 1})
    assert result == {'root': 1, 'task': 2}
def test_in_memory_callable():
    """in_memory_callable executes a function with upstream + params."""
    pipeline = DAG()

    def add_some(upstream, to_add):
        return upstream['root'] + to_add

    root = input_data_passer(pipeline, name='root')
    task = in_memory_callable(add_some,
                              pipeline,
                              name='task',
                              params=dict(to_add=2))
    root >> task

    result = InMemoryDAG(pipeline).build({'root': 1})
    assert result == {'root': 1, 'task': 3}
def test_error_if_a_task_returns_none():
    """Building fails with a descriptive error when a callable returns None."""
    pipeline = DAG()
    PythonCallable(_return_none,
                   File('root.parquet'),
                   pipeline,
                   name='root',
                   params={'input_data': None},
                   serializer=serializer)
    in_memory = InMemoryDAG(pipeline)

    with pytest.raises(ValueError) as excinfo:
        in_memory.build({'root': None})

    expected = ('All callables in a InMemoryDAG must return a value. '
                'Callable "_return_none", from task "root" returned None')
    assert str(excinfo.value) == expected
def __init__(self): dag = self.init_dag_from_partial(self.get_partial()) # TODO: add support for manually specifying upstream dependencies upstream = { name: dag[name].source.extract_upstream() for name in dag._iter() } # names of all tasks used as upstream upstream_tasks = chain(*upstream.values()) # find tasks that are declared as upstream but do not exist in the dag missing = set(upstream_tasks) - set(dag) for name in missing: input_data_passer(dag, name=name) # TODO: maybe delete all upstream dependencies and set them again # (raise a warning if there are some upstream dependencies?) # this doesn't happen when we get a yaml file because we control # that using extract_upstream=False but might happen if we receive # a DAG object already # the dag is complete now, set all upstream dependencies for name in dag._iter(): for dependency in upstream.get(name, []): dag[name].set_upstream(dag[dependency]) # get all terminal nodes and make them a dependency of the node terminal_current = [ name for name, degree in dag._G.out_degree() if not degree ] # TODO: extract upstream and make sure they match with the ones in # terminal_current terminal = in_memory_callable(self.terminal_task, dag, name='terminal', params=self.terminal_params()) for dependency in terminal_current: terminal.set_upstream(dag[dependency]) self.in_memory = InMemoryDAG(dag)
def test_error_if_non_compatible_tasks():
    """InMemoryDAG rejects DAGs containing non-PythonCallable tasks."""
    pipeline = DAG()
    ShellScript('touch {{product}}', File('file.txt'), pipeline, name='task')

    with pytest.raises(TypeError) as excinfo:
        InMemoryDAG(pipeline)

    expected = ('All tasks in the DAG must be PythonCallable, '
                'got unallowed types: ShellScript')
    assert str(excinfo.value) == expected
class OnlineDAG(abc.ABC):
    """
    Execute partial DAGs in-memory. This is an abstract class; to use it,
    create a subclass and provide the required static methods. See here for
    a complete example:
    https://github.com/ploomber/projects/blob/master/ml-online/src/ml_online/infer.py
    """

    # FIXME: add a way to customize
    def __init__(self):
        """Assemble a complete in-memory DAG from the partial spec.

        Adds input-passer tasks for missing upstream dependencies and a
        final ``terminal_task`` fed by every terminal node.
        """
        dag = self.init_dag_from_partial(self.get_partial())

        # TODO: add support for manually specifying upstream dependencies
        # task name -> upstream names declared in each task's source
        upstream = {
            name: dag[name].source.extract_upstream()
            for name in dag._iter()
        }

        # names of all tasks used as upstream
        upstream_tasks = chain(*upstream.values())

        # find tasks that are declared as upstream but do not exist in the
        # dag; each becomes an input passer so callers supply its value
        missing = set(upstream_tasks) - set(dag)

        for name in missing:
            input_data_passer(dag, name=name)

        # TODO: maybe delete all upstream dependencies and set them again
        # (raise a warning if there are some upstream dependencies?)
        # this doesn't happen when we get a yaml file because we control
        # that using extract_upstream=False but might happen if we receive
        # a DAG object already

        # the dag is complete now, set all upstream dependencies
        for name in dag._iter():
            for dependency in upstream.get(name, []):
                dag[name].set_upstream(dag[dependency])

        # get all terminal nodes (out-degree 0) and make them a dependency
        # of the terminal task
        terminal_current = [
            name for name, degree in dag._G.out_degree() if not degree
        ]

        # TODO: extract upstream and make sure they match with the ones in
        # terminal_current
        terminal = in_memory_callable(self.terminal_task,
                                      dag,
                                      name='terminal',
                                      params=self.terminal_params())

        for dependency in terminal_current:
            terminal.set_upstream(dag[dependency])

        self.in_memory = InMemoryDAG(dag)

    @classmethod
    def init_dag_from_partial(cls, partial):
        """Initialize the partial returned by get_partial()

        Parameters
        ----------
        partial : str, pathlib.Path or ploomber.DAG
            Path to a partial spec (YAML) or an already-initialized DAG

        Returns
        -------
        ploomber.DAG
            The initialized DAG

        Raises
        ------
        TypeError
            If ``partial`` is not a str, pathlib.Path or DAG
        """
        if isinstance(partial, (str, Path)):
            with open(partial) as f:
                tasks = yaml.safe_load(f)

            # cannot extract upstream because this is an incomplete DAG
            meta = {'extract_product': False, 'extract_upstream': False}

            spec = DAGSpec(
                {
                    'tasks': tasks,
                    'meta': meta
                },
                parent_path=Path(partial).parent,
            )

            return spec.to_dag()
        elif isinstance(partial, DAG):
            return partial
        else:
            raise TypeError(f'Expected {cls.__name__}.get_partial() to '
                            'return a str, pathlib.Path or ploomber.DAG, '
                            f'got {type(partial).__name__}')

    def predict(self, **kwargs):
        """
        Run the DAG

        Parameters
        ----------
        **kwargs
            One parameter per root task (task with no upstream
            dependencies) in the partial DAG.

        Returns
        -------
        A dictionary with {task_name: returned_value}
        """
        return self.in_memory.build(kwargs)

    # NOTE: abc.abstractstaticmethod is deprecated since Python 3.3; stack
    # @staticmethod over @abc.abstractmethod instead (same semantics)
    @staticmethod
    @abc.abstractmethod
    def get_partial():
        """
        Must return the location of a partial dag (str or pathlib.Path)
        """
        pass

    @staticmethod
    @abc.abstractmethod
    def terminal_task(upstream, model):
        """
        Last function to execute. The ``upstream`` parameter contains the
        output of all tasks that have no downstream dependencies
        """
        pass

    @staticmethod
    @abc.abstractmethod
    def terminal_params():
        """
        Must return a dictionary with parameters passed to ``terminal_task``
        """
        pass
def test_error_input_data(input_data, dag):
    """Invalid input data makes build() raise KeyError."""
    in_memory = InMemoryDAG(dag)

    with pytest.raises(KeyError):
        in_memory.build(input_data)