示例#1
0
def make_predict():
    """Build an in-memory prediction DAG that reuses the trained model.

    Returns
    -------
    InMemoryDAG
        A pipeline that accepts raw input via ``.build({'get': ...})`` and
        produces predictions with the previously serialized model.
    """
    dag_pred = DAG()

    # "get" is a pass-through task: it forwards whatever value is supplied
    # to .build(). The "preprocessor" callable may perform arbitrary logic
    # such as parsing or validating that raw input.
    input_data_passer(dag=dag_pred,
                      name='get',
                      preprocessor=validate_input_data)

    # feature engineering is shared with the training pipeline
    add_features(dag_pred)

    # deserialize the model produced by the training DAG
    # NOTE: pickle is only safe here because the file comes from our own
    # training run — never unpickle untrusted data
    model = pickle.loads(Path('output', 'model.pickle').read_bytes())

    # wrap the predict function as a task; extra keyword arguments for it
    # are supplied through "params"
    predict_task = in_memory_callable(predict,
                                      dag=dag_pred,
                                      name='predict',
                                      params=dict(model=model))

    # prediction runs once the joined features are available
    dag_pred['join'] >> predict_task

    # expose the batch-processing pipeline as an in-memory one
    return InMemoryDAG(dag_pred)
示例#2
0
    def __init__(self):
        """Assemble the partial DAG into a runnable in-memory pipeline."""
        dag = self.init_dag_from_partial(self.get_partial())

        # TODO: add support for manually specifying upstream dependencies
        upstream = {
            task_name: dag[task_name].source.extract_upstream()
            for task_name in dag._iter()
        }

        # every task referenced as upstream but absent from the dag gets a
        # pass-through input task, so callers can inject values at build time
        declared = set(chain.from_iterable(upstream.values()))
        for task_name in declared - set(dag):
            input_data_passer(dag, name=task_name)

        # TODO: maybe delete all upstream dependencies and set them again
        # (raise a warning if there are some upstream dependencies?)
        # this doesn't happen when we get a yaml file because we control
        # that using extract_upstream=False but might happen if we receive
        # a DAG object already
        # the dag is complete now — wire every declared dependency
        for task_name in dag._iter():
            for dep in upstream.get(task_name, []):
                dag[task_name].set_upstream(dag[dep])

        # current sink nodes (no outgoing edges) all feed the terminal task
        sinks = [name for name, degree in dag._G.out_degree() if not degree]

        # TODO: extract upstream and make sure they match with the ones in
        # terminal_current
        terminal = in_memory_callable(self.terminal_task,
                                      dag,
                                      name='terminal',
                                      params=self.terminal_params())

        for dep in sinks:
            terminal.set_upstream(dag[dep])

        self.in_memory = InMemoryDAG(dag)
示例#3
0
def test_input_data_passer():
    """input_data_passer forwards the value given to .build() downstream."""
    dag = DAG()

    root = input_data_passer(dag, name='root')
    task = PythonCallable(_add_one,
                          File('task.parquet'),
                          dag,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)

    root >> task

    in_memory = InMemoryDAG(dag)
    out = in_memory.build({'root': 1})

    # the root value is passed through and incremented by the task
    assert out == {'root': 1, 'task': 2}
示例#4
0
def test_in_memory_callable():
    """in_memory_callable wraps a plain function as an in-memory task."""
    dag = DAG()

    def _add(upstream, to_add):
        # upstream results are exposed as a mapping keyed by task name
        return upstream['root'] + to_add

    root = input_data_passer(dag, name='root')
    task = in_memory_callable(_add,
                              dag,
                              name='task',
                              params=dict(to_add=2))

    root >> task

    in_memory = InMemoryDAG(dag)
    out = in_memory.build({'root': 1})

    # 1 (root) + 2 (params) == 3
    assert out == {'root': 1, 'task': 3}