def _call_node_run(
    node: Node,
    catalog: DataCatalog,
    inputs: Dict[str, Any],
    is_async: bool,
    run_id: str = None,
) -> Dict[str, Any]:
    hook_manager = get_hook_manager()
    try:
        outputs = node.run(inputs)
    except Exception as exc:
        hook_manager.hook.on_node_error(  # pylint: disable=no-member
            error=exc,
            node=node,
            catalog=catalog,
            inputs=inputs,
            is_async=is_async,
            run_id=run_id,
        )
        raise exc
    hook_manager.hook.after_node_run(  # pylint: disable=no-member
        node=node,
        catalog=catalog,
        inputs=inputs,
        outputs=outputs,
        is_async=is_async,
        run_id=run_id,
    )
    return outputs
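# A hedged sketch of a hook implementation that the helpers in this module would
# trigger through the hook manager. It assumes kedro's public `hook_impl` decorator;
# the class name and the logging behaviour are illustrative, not part of this module.
import logging

from kedro.framework.hooks import hook_impl


class _LoggingNodeHooks:
    """Example hooks that log the node lifecycle events fired by ``_call_node_run``."""

    @hook_impl
    def before_node_run(self, node, inputs, is_async, run_id):
        logging.getLogger(__name__).info(
            "Starting %s (async=%s, run_id=%s)", node.name, is_async, run_id
        )

    @hook_impl
    def after_node_run(self, node, outputs, run_id):
        logging.getLogger(__name__).info(
            "Finished %s with outputs %s", node.name, list(outputs)
        )

    @hook_impl
    def on_node_error(self, error, node, run_id):
        logging.getLogger(__name__).error("Node %s failed: %s", node.name, error)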
def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    inputs = {name: catalog.load(name) for name in node.inputs}
    hook_manager = get_hook_manager()
    is_async = False
    hook_manager.hook.before_node_run(  # pylint: disable=no-member
        node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
    )
    outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
def _copy_node(node: Node) -> Node:
    # `namespace` and `_process_dataset_names` are closure variables supplied by
    # the enclosing modular-pipeline factory; this helper is not standalone.
    new_namespace = node.namespace
    if namespace:
        new_namespace = (
            f"{namespace}.{node.namespace}" if node.namespace else namespace
        )
    return node._copy(
        inputs=_process_dataset_names(node._inputs),
        outputs=_process_dataset_names(node._outputs),
        namespace=new_namespace,
    )
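# A hedged, self-contained stand-in for the two closure names used by `_copy_node`
# above. In the real modular-pipeline factory both come from the enclosing function;
# the example namespace and the prefixing rule below are assumptions for illustration
# only and may not match the library's exact behaviour.
namespace = "data_science"  # illustrative value; normally an argument of the enclosing factory


def _process_dataset_names(dataset_names):
    """Prefix dataset names with the namespace, e.g. "cars" -> "data_science.cars" (sketch)."""
    if dataset_names is None:
        return None
    if isinstance(dataset_names, str):
        return f"{namespace}.{dataset_names}"
    if isinstance(dataset_names, dict):
        return {key: f"{namespace}.{value}" for key, value in dataset_names.items()}
    return [f"{namespace}.{name}" for name in dataset_names]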
def test_operator_arguments(mocker):
    # The Nodes
    first_node = Node(lambda: None, [], "a")
    last_node = Node(lambda: None, [], "b")

    # get turned into tasks and then into operators by the runner
    operator = mocker.patch("kedro_airflow.runner.PythonOperator")

    def operator_arguments(task_id):
        args = {"lambda-none-a": {"retries": 1}}
        return args.get(task_id, {})

    # actually call the runner to do the conversion
    dag = Mock()
    pipeline = Pipeline([first_node, last_node])
    catalog = DataCatalog({"a": None, "b": None})
    AirflowRunner(dag, None, operator_arguments).run(pipeline, catalog)

    # check the operator constructor calls
    operator.assert_has_calls(
        [
            call(
                dag=dag,
                provide_context=True,
                python_callable=ANY,
                task_id="lambda-none-a",
                retries=1,
            ),
            call(
                dag=dag,
                provide_context=True,
                python_callable=ANY,
                task_id="lambda-none-b",
            ),
        ],
        any_order=True,
    )
def _run_node_async(node: Node, catalog: DataCatalog, run_id: str = None) -> Node:
    with ThreadPoolExecutor() as pool:
        inputs = {
            name: pool.submit(catalog.load, name) for name in node.inputs
        }  # Python dict is thread-safe
        wait(inputs.values(), return_when=ALL_COMPLETED)
        inputs = {key: value.result() for key, value in inputs.items()}
        hook_manager = get_hook_manager()
        is_async = True
        hook_manager.hook.before_node_run(  # pylint: disable=no-member
            node=node, catalog=catalog, inputs=inputs, is_async=is_async, run_id=run_id
        )
        outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id)

        save_futures = set()
        for name, data in outputs.items():
            save_futures.add(pool.submit(catalog.save, name, data))

        for future in as_completed(save_futures):
            exception = future.exception()
            if exception:
                raise exception
    return node
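# A hedged sketch of how a caller could choose between the sequential and
# asynchronous helpers above. The wrapper name and the `is_async` flag are
# illustrative assumptions, not necessarily the library's actual entry point.
def _run_node_with_hooks(
    node: Node, catalog: DataCatalog, is_async: bool = False, run_id: str = None
) -> Node:
    """Dispatch to the threaded or sequential runner helper (sketch only)."""
    if is_async:
        return _run_node_async(node, catalog, run_id)
    return _run_node_sequential(node, catalog, run_id)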
def test_create_task():
    def func(a, b):
        return a + b

    orig_catalog = Mock()
    catalog = orig_catalog.shallow_copy()
    catalog.load.side_effect = [1, 2]
    process_context = Mock(return_value=catalog)
    node = Node(func, ["ds_a", "ds_b"], "ds_c")
    task = AirflowRunner(None, process_context, None).create_task(node, orig_catalog)
    task(param=123)
    process_context.assert_called_once_with(catalog, param=123)
    catalog.save.assert_called_once_with("ds_c", 3)
def run_node(node: Node, catalog: DataCatalog) -> Node:
    """Run a single ``Node`` with inputs from and outputs to the ``catalog``.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.

    Returns:
        The node argument.

    """
    inputs = {name: catalog.load(name) for name in node.inputs}
    outputs = node.run(inputs)
    for name, data in outputs.items():
        catalog.save(name, data)
    return node
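# A minimal usage sketch for `run_node`, assuming kedro's public `node` factory,
# `DataCatalog` and `MemoryDataSet` are importable as below; the dataset names and
# the doubling function are illustrative.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node

example_catalog = DataCatalog(
    {
        "xs": MemoryDataSet([1, 2, 3]),
        "doubled": MemoryDataSet(),
    }
)
doubler = node(lambda xs: [x * 2 for x in xs], inputs="xs", outputs="doubled")
run_node(doubler, example_catalog)
assert example_catalog.load("doubled") == [2, 4, 6]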
def test_run(mocker):  # pylint: disable=too-many-locals
    # The Nodes
    first_node = Node(lambda: None, [], "a")
    middle_node = Node(lambda a: None, ["a"], "b")
    last_node = Node(lambda b: None, ["b"], [])

    # get turned into tasks by create_task
    first_task = Mock()
    middle_task = Mock()
    last_task = Mock()
    create_task = mocker.patch("kedro_airflow.runner.AirflowRunner.create_task")
    create_task.side_effect = lambda node, catalog: {
        first_node: first_task,
        middle_node: middle_task,
        last_node: last_task,
    }[node]

    # and tasks get turned into operators by the runner
    first_op = Mock()
    middle_op = Mock()
    last_op = Mock()
    operator = mocker.patch("kedro_airflow.runner.PythonOperator")
    operator.side_effect = lambda python_callable, **kwargs: {
        first_task: first_op,
        middle_task: middle_op,
        last_task: last_op,
    }[python_callable]

    def operator_arguments(task_id):
        args = {"lambda-none-a": {"retries": 1}, "lambda-b-none": {"retries": 2}}
        return args.get(task_id, {})

    # actually call the runner to do the conversion
    dag = Mock()
    pipeline = Pipeline([first_node, last_node, middle_node])
    catalog = DataCatalog(
        {
            "a": LambdaDataSet(load=None, save=None),
            "b": LambdaDataSet(load=None, save=None),
        }
    )
    AirflowRunner(dag, None, operator_arguments).run(pipeline, catalog)

    # check the create_task calls
    create_task.assert_has_calls(
        [
            call(first_node, catalog),
            call(middle_node, catalog),
            call(last_node, catalog),
        ],
        any_order=True,
    )

    # check the operator constructor calls
    operator.assert_has_calls(
        [
            call(
                dag=dag,
                provide_context=True,
                python_callable=first_task,
                task_id="lambda-none-a",
                retries=1,
            ),
            call(
                dag=dag,
                provide_context=True,
                python_callable=middle_task,
                task_id="lambda-a-b",
            ),
            call(
                dag=dag,
                provide_context=True,
                python_callable=last_task,
                task_id="lambda-b-none",
                retries=2,
            ),
        ],
        any_order=True,
    )

    # check the dependency hookup
    first_op.set_upstream.assert_not_called()
    middle_op.set_upstream.assert_called_once_with(first_op)
    last_op.set_upstream.assert_called_once_with(middle_op)
def test_no_memory_datasets():
    pipeline = Pipeline([Node(lambda: None, [], "fred")])
    catalog = DataCatalog({"fred": MemoryDataSet()})
    with pytest.raises(ValueError, match="memory data sets: 'fred'"):
        AirflowRunner(None, None, {}).run(pipeline, catalog)
def test_no_default_datasets():
    pipeline = Pipeline([Node(lambda: None, [], "fred")])
    catalog = DataCatalog()
    with pytest.raises(ValueError, match="'fred' is not registered"):
        AirflowRunner(None, None, {}).run(pipeline, catalog)