def parse_action_node(self, action_node: ET.Element):
    """
    Action nodes are the mechanism by which a workflow triggers the
    execution of a computation/processing task.

    Action nodes are required to have an action-choice (map-reduce, etc.),
    ok, and error node in the xml.
    """
    # The first child of the <action> element is the concrete operation
    # tag (e.g. <map-reduce>, <spark>, ...).
    operation_node = action_node[0]
    operation_tag = operation_node.tag

    mapper: BaseMapper
    if operation_tag in self.action_map:
        mapper_class = self.action_map[operation_tag]
        mapper = mapper_class(
            oozie_node=operation_node,
            name=action_node.attrib["name"],
            props=self.props,
            dag_name=self.workflow.dag_name,
            action_mapper=self.action_map,
            renderer=self.renderer,
            input_directory_path=self.workflow.input_directory_path,
            output_directory_path=self.workflow.output_directory_path,
            jar_files=self.workflow.jar_files,
            transformers=self.transformers,
        )
    else:
        # Unrecognized operations fall back to a placeholder dummy task.
        operation_tag = "unknown"
        mapper = DummyMapper(
            oozie_node=operation_node,
            name=action_node.attrib["name"],
            dag_name=self.workflow.dag_name,
            props=self.props,
        )

    parsed_node = OozieActionNode(mapper)

    # The schema requires both an <ok> and an <error> transition.
    ok_node = action_node.find("ok")
    if ok_node is None:
        raise Exception(f"Missing ok node in {action_node}")
    parsed_node.downstream_names.append(ok_node.attrib["to"])

    error_node = action_node.find("error")
    if error_node is None:
        raise Exception(f"Missing error node in {action_node}")
    parsed_node.error_downstream_name = error_node.attrib["to"]

    mapper.on_parse_node()
    logging.info(f"Parsed {mapper.name} as Action Node of type {operation_tag}.")
    self.workflow.nodes[mapper.name] = parsed_node
def test_convert(self, sort_imports_mock, autoflake_fix_file_mock, black_mock, parse_workflow_mock):
    """convert() should parse the workflow, then format, autoflake and sort imports."""
    # Given
    fake_workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes=dict(AAA=ParsedActionNode(DummyMapper(Element("dummy"), name="AAA"))),
        dependencies={"import AAAA"},
    )
    parse_workflow_mock.return_value = fake_workflow

    # When
    self.converter.convert()

    # Then
    parse_workflow_mock.assert_called_once_with()
    black_mock.format_file_in_place.assert_called_once_with(
        Path("/tmp/test_dag.py"), fast=mock.ANY, mode=mock.ANY, write_back=mock.ANY
    )
    expected_autoflake_args = AutoflakeArgs(
        remove_all_unused_imports=True,
        ignore_init_module_imports=False,
        remove_duplicate_keys=False,
        remove_unused_variables=True,
        in_place=True,
        imports=None,
        expand_star_imports=False,
        check=False,
    )
    autoflake_fix_file_mock.assert_called_once_with(
        "/tmp/test_dag.py", args=expected_autoflake_args, standard_out=sys.stdout
    )
    sort_imports_mock.assert_called_once_with("/tmp/test_dag.py")
def test_write_dag_file(self, render_template_mock):
    """render_workflow() should delegate to the template renderer and return its output."""
    task_relations = {Relation(from_task_id="TASK_1", to_task_id="TASK_2")}
    task_nodes = dict(TASK_1=ParsedActionNode(DummyMapper(Element("dummy"), name="TASK_1")))
    workflow = Workflow(
        input_directory_path="/tmp/input_directory",
        output_directory_path="/tmp/input_directory",
        dag_name="test_dag",
        relations=task_relations,
        nodes=task_nodes,
        dependencies={"import awesome_stuff"},
    )

    content = self.converter.render_workflow(workflow=workflow)

    render_template_mock.assert_called_once_with(
        dag_name="test_dag",
        dependencies={"import awesome_stuff"},
        nodes=[task_nodes["TASK_1"]],
        params={"user.name": "USER"},
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
        schedule_interval=None,
        start_days_ago=None,
        template_name="workflow.tpl",
    )
    self.assertEqual(content, "TEXT_CONTENT")
def parse_join_node(self, join_node):
    """
    Join nodes wait for the corresponding beginning fork node paths to finish.

    As the parser we are assuming the Oozie workflow follows the schema
    perfectly.
    """
    join_name = join_node.attrib["name"]
    mapper = DummyMapper(oozie_node=join_node, name=join_name, dag_name=self.workflow.dag_name)

    parsed_node = ParsedActionNode(mapper)
    # A join has exactly one outgoing transition, given by its "to" attribute.
    parsed_node.add_downstream_node_name(join_node.attrib["to"])

    mapper.on_parse_node()
    logging.info(f"Parsed {mapper.name} as Join Node.")
    self.workflow.nodes[join_name] = parsed_node
def parse_fork_node(self, root, fork_node):
    """
    Fork nodes need to be dummy operators with multiple parallel downstream
    tasks.

    This parses the fork node, the action nodes that it references and then
    the join node at the end. This will only parse well-formed xml-adhering
    workflows where all paths end at the join node.
    """
    fork_name = fork_node.attrib["name"]
    mapper = DummyMapper(oozie_node=fork_node, name=fork_name, dag_name=self.workflow.dag_name)
    parsed_node = ParsedActionNode(mapper)

    mapper.on_parse_node()
    logging.info(f"Parsed {mapper.name} as Fork Node.")

    # Resolve the start node of every parallel <path> branch up front.
    branch_starts = [
        xml_utils.find_node_by_name(root, child.attrib["start"])
        for child in fork_node
        if "path" in child.tag
    ]

    self.workflow.nodes[fork_name] = parsed_node

    for branch in branch_starts:
        branch_name = branch.attrib["name"]
        parsed_node.add_downstream_node_name(branch_name)
        logging.info(f"Added {mapper.name}'s downstream: {branch_name}")
        # Theoretically these will all be action nodes, however I don't
        # think that is guaranteed.
        # The end of the execution path has not been reached
        self.parse_node(root, branch)
        if branch_name not in self.workflow.nodes:
            root.remove(branch)
def _create_workflow(nodes=None):
    """Build a minimal Workflow fixture, optionally overriding its nodes mapping."""
    workflow_nodes = nodes
    if not workflow_nodes:
        # Default: a single dummy action node (built lazily, only when needed).
        workflow_nodes = dict(
            AAA=ParsedActionNode(DummyMapper(Element("dummy"), name="DAG_NAME_A", dag_name="DAG_NAME_B"))
        )
    return Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")},
        nodes=workflow_nodes,
        dependencies={"import IMPORT"},
    )
def _create_workflow():
    """Build a minimal Workflow fixture with a single dummy action node."""
    dummy_mapper = DummyMapper(Element("dummy"), name="DAG_NAME_A", dag_name="DAG_NAME_B")
    return Workflow(
        dag_name="DAG_NAME",
        input_directory_path="/tmp/input",
        output_directory_path="/tmp/output",
        task_group_relations={Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")},
        nodes={"AAA": OozieActionNode(dummy_mapper)},
        dependencies={"import IMPORT"},
    )
def test_create_dag_file(self, open_mock, _):
    """create_dag_file() should open the expected output path for writing."""
    # Given
    dummy_node = ParsedActionNode(DummyMapper(Element("dummy"), name="AAA"))
    workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes={"AAA": dummy_node},
        dependencies={"import AAAA"},
    )

    # When
    self.converter.create_dag_file(workflow)

    # Then
    open_mock.assert_called_once_with("/tmp/test_dag.py", "w")
def test_create_dag_file(self, black_mock, open_mock, _):
    """create_dag_file() should write the DAG file and format it with black."""
    workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes={"AAA": ParsedNode(DummyMapper(ET.Element("dummy"), name="AAA"))},
        dependencies={"import AAAA"},
    )

    self.converter.create_dag_file(workflow)

    open_mock.assert_called_once_with("/tmp/test_dag.py", "w")
    black_mock.format_file_in_place.assert_called_once_with(
        Path("/tmp/test_dag.py"), fast=mock.ANY, mode=mock.ANY, write_back=mock.ANY
    )