def make_predict():
    """Assemble and return an in-memory prediction DAG.

    Loads the model produced by the training pipeline from disk and wires
    it into a DAG that runs entirely in memory via ``InMemoryDAG``.
    """
    dag_pred = DAG()

    # "get" is a pass-through task: it forwards whatever value is given to
    # .build(). The preprocessor hook runs arbitrary parsing/validation on
    # the raw input before it enters the pipeline
    input_data_passer(dag=dag_pred,
                      name='get',
                      preprocessor=validate_input_data)

    # reuse the exact feature-engineering code from the training pipeline
    add_features(dag_pred)

    # load the model artifact written by the training graph
    # NOTE: pickle.load is acceptable here because the file is produced by
    # our own training run, not untrusted input
    with open(Path('output', 'model.pickle'), 'rb') as f:
        model = pickle.load(f)

    # final task: in_memory_callable executes the given function, passing
    # extra keyword arguments through "params"
    task_predict = in_memory_callable(predict,
                                      dag=dag_pred,
                                      name='predict',
                                      params=dict(model=model))

    # prediction runs after the feature-join task
    dag_pred['join'] >> task_predict

    # wrap the batch-processing pipeline so it can execute in memory
    return InMemoryDAG(dag_pred)
def __init__(self): dag = self.init_dag_from_partial(self.get_partial()) # TODO: add support for manually specifying upstream dependencies upstream = { name: dag[name].source.extract_upstream() for name in dag._iter() } # names of all tasks used as upstream upstream_tasks = chain(*upstream.values()) # find tasks that are declared as upstream but do not exist in the dag missing = set(upstream_tasks) - set(dag) for name in missing: input_data_passer(dag, name=name) # TODO: maybe delete all upstream dependencies and set them again # (raise a warning if there are some upstream dependencies?) # this doesn't happen when we get a yaml file because we control # that using extract_upstream=False but might happen if we receive # a DAG object already # the dag is complete now, set all upstream dependencies for name in dag._iter(): for dependency in upstream.get(name, []): dag[name].set_upstream(dag[dependency]) # get all terminal nodes and make them a dependency of the node terminal_current = [ name for name, degree in dag._G.out_degree() if not degree ] # TODO: extract upstream and make sure they match with the ones in # terminal_current terminal = in_memory_callable(self.terminal_task, dag, name='terminal', params=self.terminal_params()) for dependency in terminal_current: terminal.set_upstream(dag[dependency]) self.in_memory = InMemoryDAG(dag)
def test_input_data_passer():
    """A pass-through input task feeds a downstream PythonCallable."""
    dag = DAG()

    # "root" forwards whatever value .build() receives
    passer = input_data_passer(dag, name='root')
    downstream = PythonCallable(_add_one,
                                File('task.parquet'),
                                dag,
                                name='task',
                                unserializer=unserializer,
                                serializer=serializer)
    passer >> downstream

    in_memory = InMemoryDAG(dag)

    # the input value appears under "root"; _add_one's result under "task"
    assert in_memory.build({'root': 1}) == {'root': 1, 'task': 2}
def test_in_memory_callable():
    """in_memory_callable runs a plain function with extra params."""
    dag = DAG()

    # receives upstream outputs as a dict plus the extra "to_add" param
    def add_some(upstream, to_add):
        return upstream['root'] + to_add

    source = input_data_passer(dag, name='root')
    adder = in_memory_callable(add_some,
                               dag,
                               name='task',
                               params=dict(to_add=2))
    source >> adder

    in_memory = InMemoryDAG(dag)

    # 1 (input) + 2 (param) == 3
    assert in_memory.build({'root': 1}) == {'root': 1, 'task': 3}