Example #1
def _call_node_run(
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async: bool,
    run_id: str = None,
) -> Dict[str, Any]:
    hook_manager = get_hook_manager()
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    return outputs
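
The hook calls above (after_node_run, on_node_error) are dispatched to whichever hook implementations are registered with the plugin manager returned by get_hook_manager(). Below is a minimal sketch of a hook class that could receive those calls, assuming Kedro's hook_impl marker from kedro.framework.hooks; registration details depend on the Kedro version, and the class itself is illustrative, not library code.

import logging

from kedro.framework.hooks import hook_impl


class NodeLoggingHooks:
    """Illustrative hooks; pluggy allows accepting a subset of each spec's arguments."""

    @hook_impl
    def after_node_run(self, node, outputs, is_async, run_id):
        # Invoked after node.run() succeeds in the runner code above.
        logging.getLogger(__name__).info(
            "Node %s produced %s (async=%s, run_id=%s)",
            node.name, list(outputs), is_async, run_id,
        )

    @hook_impl
    def on_node_error(self, error, node, run_id):
        # Invoked when node.run() raises in the runner code above.
        logging.getLogger(__name__).error(
            "Node %s failed in run %s: %s", node.name, run_id, error
        )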
Example #2
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )

    for name, data in outputs.items():
        catalog.save(name, data)
    return node
Example #3
    def _copy_node(node: Node) -> Node:
        new_namespace = node.namespace
        if namespace:
            new_namespace = (f"{namespace}.{node.namespace}"
                             if node.namespace else namespace)

        return node._copy(
            inputs=_process_dataset_names(node._inputs),
            outputs=_process_dataset_names(node._outputs),
            namespace=new_namespace,
        )
Example #4
def test_operator_arguments(mocker):
    # The Nodes
    first_node = Node(lambda: None, [], "a")
    last_node = Node(lambda: None, [], "b")

    # get turned into tasks and then into operators by the runner
    operator = mocker.patch("kedro_airflow.runner.PythonOperator")

    def operator_arguments(task_id):
        args = {"lambda-none-a": {"retries": 1}}
        return args.get(task_id, {})

    # actually call the runner to do the conversion
    dag = Mock()
    pipeline = Pipeline([first_node, last_node])
    catalog = DataCatalog({"a": None, "b": None})
    AirflowRunner(dag, None, operator_arguments).run(pipeline, catalog)

    # check the operator constructor calls
    operator.assert_has_calls(
        [
            call(
                dag=dag,
                provide_context=True,
                python_callable=ANY,
                task_id="lambda-none-a",
                retries=1,
            ),
            call(
                dag=dag,
                provide_context=True,
                python_callable=ANY,
                task_id="lambda-none-b",
            ),
        ],
        any_order=True,
    )
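
For reference, the operator_arguments argument passed to AirflowRunner above is a callable that maps a task id to extra keyword arguments for the corresponding PythonOperator. A hypothetical sketch of such a callback as it might appear in a user's DAG file follows; the task id and values are invented for illustration only.

from datetime import timedelta


def operator_arguments(task_id):
    # Per-task Airflow kwargs; unknown task ids fall back to the defaults.
    defaults = {"retries": 1, "retry_delay": timedelta(minutes=1)}
    overrides = {
        "train-model": {"retries": 3},  # invented task id, illustration only
    }
    return {**defaults, **overrides.get(task_id, {})}


# runner = AirflowRunner(dag, process_context, operator_arguments)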
Example #5
def _run_node_async(node: Node,
                    catalog: DataCatalog,
                    run_id: str = None) -> Node:
    with ThreadPoolExecutor() as pool:
        inputs = {
            name: pool.submit(catalog.load, name)
            for name in node.inputs
        }  # Python dict is thread-safe
        wait(inputs.values(), return_when=ALL_COMPLETED)
        inputs = {key: value.result() for key, value in inputs.items()}
        hook_manager = get_hook_manager()
        is_async = True
        hook_manager.hook.before_node_run(  # pylint: disable=no-member
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id)
        try:
            outputs = node.run(inputs)
        except Exception as exc:
            hook_manager.hook.on_node_error(  # pylint: disable=no-member
                error=exc,
                node=node,
                catalog=catalog,
                inputs=inputs,
                is_async=is_async,
                run_id=run_id,
            )
            raise exc
        hook_manager.hook.after_node_run(  # pylint: disable=no-member
            node=node,
            catalog=catalog,
            inputs=inputs,
            outputs=outputs,
            is_async=is_async,
            run_id=run_id,
        )

        save_futures = set()

        for name, data in outputs.items():
            save_futures.add(pool.submit(catalog.save, name, data))

        for future in as_completed(save_futures):
            exception = future.exception()
            if exception:
                raise exception
    return node
Example #6
def test_create_task():
    def func(a, b):
        return a + b

    orig_catalog = Mock()
    catalog = orig_catalog.shallow_copy()
    catalog.load.side_effect = [1, 2]
    process_context = Mock(return_value=catalog)
    node = Node(func, ["ds_a", "ds_b"], "ds_c")

    task = AirflowRunner(None, process_context,
                         None).create_task(node, orig_catalog)
    task(param=123)
    process_context.assert_called_once_with(catalog, param=123)
    catalog.save.assert_called_once_with("ds_c", 3)
Example #7
def run_node(node: Node, catalog: DataCatalog) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.

    Returns:
        The node argument.

    """
    inputs = {name: catalog.load(name) for name in node.inputs}
    outputs = node.run(inputs)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
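
A small usage sketch for run_node, assuming an in-memory catalog built from Kedro's DataCatalog and MemoryDataSet and the node factory; the dataset names are illustrative.

from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node

# Illustrative only: run a single node against an in-memory catalog.
add = node(lambda a, b: a + b, inputs=["a", "b"], outputs="sum")
catalog = DataCatalog({
    "a": MemoryDataSet(1),
    "b": MemoryDataSet(2),
    "sum": MemoryDataSet(),
})

run_node(add, catalog)
assert catalog.load("sum") == 3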
Example #8
def test_run(mocker):  # pylint: disable=too-many-locals
    # The Nodes
    first_node = Node(lambda: None, [], "a")
    middle_node = Node(lambda a: None, ["a"], "b")
    last_node = Node(lambda b: None, ["b"], [])

    # get turned into tasks by create_task
    first_task = Mock()
    middle_task = Mock()
    last_task = Mock()
    create_task = mocker.patch(
        "kedro_airflow.runner.AirflowRunner.create_task")
    create_task.side_effect = lambda node, catalog: {
        first_node: first_task,
        middle_node: middle_task,
        last_node: last_task,
    }[node]

    # and tasks get turned into operators by the runner
    first_op = Mock()
    middle_op = Mock()
    last_op = Mock()
    operator = mocker.patch("kedro_airflow.runner.PythonOperator")
    operator.side_effect = lambda python_callable, **kwargs: {
        first_task: first_op,
        middle_task: middle_op,
        last_task: last_op,
    }[python_callable]

    def operator_arguments(task_id):
        args = {
            "lambda-none-a": {"retries": 1},
            "lambda-b-none": {"retries": 2},
        }
        return args.get(task_id, {})

    # actually call the runner to do the conversion
    dag = Mock()
    pipeline = Pipeline([first_node, last_node, middle_node])
    catalog = DataCatalog({
        "a": LambdaDataSet(load=None, save=None),
        "b": LambdaDataSet(load=None, save=None),
    })
    AirflowRunner(dag, None, operator_arguments).run(pipeline, catalog)

    # check the create task calls
    create_task.assert_has_calls(
        [
            call(first_node, catalog),
            call(middle_node, catalog),
            call(last_node, catalog),
        ],
        any_order=True,
    )

    # check the operator constructor calls
    operator.assert_has_calls(
        [
            call(
                dag=dag,
                provide_context=True,
                python_callable=first_task,
                task_id="lambda-none-a",
                retries=1,
            ),
            call(
                dag=dag,
                provide_context=True,
                python_callable=middle_task,
                task_id="lambda-a-b",
            ),
            call(
                dag=dag,
                provide_context=True,
                python_callable=last_task,
                task_id="lambda-b-none",
                retries=2,
            ),
        ],
        any_order=True,
    )

    # check the dependency hookup
    first_op.set_upstream.assert_not_called()
    middle_op.set_upstream.assert_called_once_with(first_op)
    last_op.set_upstream.assert_called_once_with(middle_op)
Example #9
def test_no_memory_datasets():
    pipeline = Pipeline([Node(lambda: None, [], "fred")])
    catalog = DataCatalog({"fred": MemoryDataSet()})
    with pytest.raises(ValueError, match="memory data sets: 'fred'"):
        AirflowRunner(None, None, {}).run(pipeline, catalog)
Example #10
def test_no_default_datasets():
    pipeline = Pipeline([Node(lambda: None, [], "fred")])
    catalog = DataCatalog()
    with pytest.raises(ValueError, match="'fred' is not registered"):
        AirflowRunner(None, None, {}).run(pipeline, catalog)