def test_on_parse_finish_decision_should_not_remove_end_node(self):
    """The end node must survive on_parse_finish when a decision node targets it.

    Only the relation coming from the non-decision node should be dropped.
    """
    workflow = Workflow(input_directory_path=None, output_directory_path=None, dag_name=None)
    mapper = self._get_end_mapper("end_task")
    workflow.nodes["first_task"] = ParsedNode(mock.Mock(spec=DecisionMapper, last_task_id="first_task"))
    workflow.nodes["second_task"] = ParsedNode(mock.Mock(spec=BaseMapper, last_task_id="second_task"))
    workflow.nodes["end_task"] = ParsedNode(mapper)
    workflow.relations = {
        Relation(from_task_id="first_task", to_task_id="end_task"),
        Relation(from_task_id="second_task", to_task_id="end_task"),
    }

    mapper.on_parse_finish(workflow)

    self.assertEqual({"first_task", "second_task", "end_task"}, set(workflow.nodes.keys()))
    self.assertEqual({Relation(from_task_id="first_task", to_task_id="end_task")}, workflow.relations)
class SubWorkflowTemplateTestCase(BaseTestCases.BaseTemplateTestCase):
    """Renders subworkflow.tpl with task-group parameters and validates the output."""

    TEMPLATE_NAME = "subworkflow.tpl"

    DEFAULT_TEMPLATE_PARAMS = dict(
        dependencies={"import awesome_stuff"},
        task_groups=[
            TaskGroup(
                name="AAA",
                tasks=[
                    Task(task_id="first_task", template_name="dummy.tpl"),
                    Task(task_id="second_task", template_name="dummy.tpl"),
                ],
                relations=[Relation(from_task_id="first_task", to_task_id="second_task")],
            )
        ],
        job_properties={"user.name": "USER"},
        config={"key": "value"},
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
    )

    def test_green_path(self):
        # Rendering must produce syntactically valid Python source.
        rendered = render_template(self.TEMPLATE_NAME, **self.DEFAULT_TEMPLATE_PARAMS)
        self.assertValidPython(rendered)
class SubWorkflowTemplateTestCase(TestCase, TemplateTestMixin):
    """Renders subworkflow.tpl with parsed-action-node parameters and validates the output."""

    TEMPLATE_NAME = "subworkflow.tpl"

    DEFAULT_TEMPLATE_PARAMS = dict(
        dependencies={"import awesome_stuff"},
        nodes=[
            ParsedActionNode(
                mock.MagicMock(spec=DummyMapper),
                tasks=[
                    Task(task_id="first_task", template_name="dummy.tpl"),
                    Task(task_id="second_task", template_name="dummy.tpl"),
                ],
                relations=[Relation(from_task_id="first_task", to_task_id="second_task")],
            )
        ],
        job_properties={"user.name": "USER"},
        config={"key": "value"},
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
    )

    def test_green_path(self):
        # Rendering must produce syntactically valid Python source.
        rendered = render_template(self.TEMPLATE_NAME, **self.DEFAULT_TEMPLATE_PARAMS)
        self.assertValidPython(rendered)
class SubWorkflowTemplateTestCase(TestCase, TemplateTestMixin):
    """Renders subworkflow.tpl with parsed-node/params parameters and validates the output."""

    TEMPLATE_NAME = "subworkflow.tpl"

    DEFAULT_TEMPLATE_PARAMS = dict(
        dependencies=["import awesome_stuff"],
        nodes=[
            ParsedNode(
                mock.MagicMock(spec=DummyMapper),
                tasks=[
                    Task(task_id="first_task", template_name="dummy.tpl"),
                    Task(task_id="second_task", template_name="dummy.tpl"),
                ],
                relations=[Relation(from_task_id="first_task", to_task_id="second_task")],
            )
        ],
        params={"user.name": "USER"},
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
        schedule_interval=None,
        start_days_ago=None,
    )

    def test_green_path(self):
        # Rendering must produce syntactically valid Python source.
        rendered = render_template(self.TEMPLATE_NAME, **self.DEFAULT_TEMPLATE_PARAMS)
        self.assertValidPython(rendered)
def test_write_dag_file(self, render_template_mock):
    """render_workflow() should hand every workflow attribute to the template renderer."""
    relations = {Relation(from_task_id="TASK_1", to_task_id="TASK_2")}
    nodes = dict(TASK_1=ParsedActionNode(DummyMapper(Element("dummy"), name="TASK_1")))
    dependencies = {"import awesome_stuff"}
    workflow = Workflow(
        input_directory_path="/tmp/input_directory",
        output_directory_path="/tmp/input_directory",
        dag_name="test_dag",
        relations=relations,
        nodes=nodes,
        dependencies=dependencies,
    )

    content = self.converter.render_workflow(workflow=workflow)

    render_template_mock.assert_called_once_with(
        dag_name="test_dag",
        dependencies={"import awesome_stuff"},
        nodes=[nodes["TASK_1"]],
        params={"user.name": "USER"},
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
        schedule_interval=None,
        start_days_ago=None,
        template_name="workflow.tpl",
    )
    self.assertEqual("TEXT_CONTENT", content)
def add_state_handler_if_needed(self):
    """Add extra tasks and relations that handle the error and ok flows.

    When an error path is specified, an error-handler task (triggered when
    any task in the group fails) is appended, plus an error relation from
    every task to it. When ok downstream names are also specified, an
    ok-handler task (triggered on success) is appended as well, linked from
    the last task. When no error path is specified, nothing is done.
    """
    if not self.error_downstream_name:
        # No error path configured - nothing to add.
        return

    error_handler_task_id = self.name + "_error"
    self.error_handler_task = Task(
        task_id=error_handler_task_id,
        template_name="dummy.tpl",
        trigger_rule=TriggerRule.ONE_FAILED,
    )
    # Every task in the group may fail, so each one feeds the error handler.
    self.relations.extend(
        Relation(from_task_id=task.task_id, to_task_id=error_handler_task_id, is_error=True)
        for task in self.tasks
    )

    if not self.downstream_names:
        return

    ok_handler_task_id = self.name + "_ok"
    self.ok_handler_task = Task(
        task_id=ok_handler_task_id,
        template_name="dummy.tpl",
        trigger_rule=TriggerRule.ONE_SUCCESS,
    )
    # Success flow only continues from the final task of the group.
    self.relations.append(Relation(from_task_id=self.tasks[-1].task_id, to_task_id=ok_handler_task_id))
def test_on_parse_finish(self):
    """A kill node and all relations leading to it are removed on parse finish."""
    workflow = Workflow(input_directory_path=None, output_directory_path=None, dag_name=None)
    mapper = self._get_kill_mapper(name="fail_task")
    # BUG FIX: mock.Mock() has no `autospec` argument - passing autospec=BaseMapper
    # merely created an attribute named "autospec" on the mock and did NOT
    # constrain it to the BaseMapper interface. Use spec=, as done elsewhere
    # in this file (see the decision-node test).
    workflow.nodes["task"] = ParsedNode(mock.Mock(spec=BaseMapper))
    workflow.nodes["fail_task"] = ParsedNode(mapper)
    workflow.nodes["success_task"] = ParsedNode(mock.Mock(spec=BaseMapper))
    workflow.nodes["success_task"].set_is_ok(True)
    workflow.nodes["fail_task"].set_is_error(True)
    workflow.relations = {
        Relation(from_task_id="task", to_task_id="fail_task"),
        Relation(from_task_id="task", to_task_id="success_task"),
    }

    mapper.on_parse_finish(workflow)

    self.assertEqual({"task", "success_task"}, set(workflow.nodes.keys()))
    self.assertEqual({Relation(from_task_id="task", to_task_id="success_task")}, workflow.relations)
def test_multiple(self):
    """chain() links four tasks into three consecutive relations."""
    task_ids = ["task_1", "task_2", "task_3", "task_4"]
    relations = fs_mapper.chain([Task(task_id=tid, template_name="") for tid in task_ids])

    expected = [
        Relation(from_task_id="task_1", to_task_id="task_2"),
        Relation(from_task_id="task_2", to_task_id="task_3"),
        Relation(from_task_id="task_3", to_task_id="task_4"),
    ]
    self.assertEqual(expected, relations)
def test_to_tasks_and_relations_should_parse_prepare_element(self):
    """A <prepare> element yields an extra prepare task wired before the main task."""
    self.hive_node.append(ET.fromstring(FRAGMENT_QUERY))
    self.hive_node.append(ET.fromstring(FRAGMENT_PREPARE))
    mapper = self._get_hive_mapper(job_properties=self.job_properties, config=self.config)
    mapper.on_parse_node()

    tasks, relations = mapper.to_tasks_and_relations()

    self.assertEqual(2, len(tasks))
    expected_prepare_task = Task(
        task_id="test_id_prepare",
        template_name="prepare.tpl",
        template_params={
            "delete": "/user/TEST_USERNAME/TEST_EXAMPLE_ROOT/apps/pig/output",
            "mkdir": "/user/TEST_USERNAME/TEST_EXAMPLE_ROOT/apps/pig/created-folder",
        },
    )
    self.assertEqual(expected_prepare_task, tasks[0])
    self.assertEqual([Relation(from_task_id="test_id_prepare", to_task_id="test_id")], relations)
def test_task_and_relations(self):
    """DistCp mapping yields a prepare task plus the distcp task, linked by one relation."""
    # Given
    mapper = _get_distcp_mapper(
        self.distcp_node, job_properties=EXAMPLE_JOB_PROPERTIES, config=EXAMPLE_CONFIG_PROPERTIES
    )

    # When
    mapper.on_parse_node()
    tasks, relations = mapper.to_tasks_and_relations()

    # Then
    self.assertEqual(mapper.oozie_node, self.distcp_node)
    self.assertIsNotNone(tasks)
    self.assertIsNotNone(relations)
    self.assertEqual(2, len(tasks))
    self.assertEqual(1, len(relations))
    expected_tasks = [
        Task(
            task_id="distcp_prepare",
            template_name="prepare.tpl",
            trigger_rule="one_success",
            template_params={"delete": "/tmp/d_path", "mkdir": None},
        ),
        Task(
            task_id="distcp",
            template_name="distcp.tpl",
            trigger_rule="one_success",
            template_params={
                "props": PropertySet(
                    config={"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"},
                    job_properties={
                        "nameNode1": "hdfs://localhost:8081",
                        "nameNode2": "hdfs://localhost:8082",
                    },
                    action_node_properties={
                        "oozie.launcher.mapreduce.job.hdfs-servers": "{{nameNode1}} ,{{nameNode2}}"
                    },
                ),
                "distcp_command": "--class=org.apache.hadoop.tools.DistCp -- -update -skipcrccheck "
                "-strategy dynamic '{{nameNode1}}/path/to/input file.txt' "
                "'{{nameNode2}}/path/to/output-file.txt'",
            },
        ),
    ]
    self.assertEqual(expected_tasks, tasks)
    self.assertEqual([Relation(from_task_id=f"{mapper.name}_prepare", to_task_id=mapper.name)], relations)
def test_convert(self, sort_imports_mock, autoflake_fix_file_mock, black_mock, parse_workflow_mock):
    """convert() parses the workflow, then formats the output with black, autoflake and isort."""
    # Given
    workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes=dict(AAA=ParsedActionNode(DummyMapper(Element("dummy"), name="AAA"))),
        dependencies={"import AAAA"},
    )
    parse_workflow_mock.return_value = workflow

    # When
    self.converter.convert()

    # Then
    parse_workflow_mock.assert_called_once_with()
    black_mock.format_file_in_place.assert_called_once_with(
        Path("/tmp/test_dag.py"), fast=mock.ANY, mode=mock.ANY, write_back=mock.ANY
    )
    expected_autoflake_args = AutoflakeArgs(
        remove_all_unused_imports=True,
        ignore_init_module_imports=False,
        remove_duplicate_keys=False,
        remove_unused_variables=True,
        in_place=True,
        imports=None,
        expand_star_imports=False,
        check=False,
    )
    autoflake_fix_file_mock.assert_called_once_with(
        "/tmp/test_dag.py", args=expected_autoflake_args, standard_out=sys.stdout
    )
    sort_imports_mock.assert_called_once_with("/tmp/test_dag.py")
def test_convert_nodes(self):
    """convert_nodes() should attach each mapper's tasks and relations to its node."""
    tasks_1 = [
        Task(task_id="first_task", template_name="dummy.tpl"),
        Task(task_id="second_task", template_name="dummy.tpl"),
    ]
    relations_1 = {Relation(from_task_id="first_task", to_task_id="tasks_2")}
    tasks_2 = [Task(task_id="third_task", template_name="dummy.tpl")]
    # BUG FIX: `{}` is an empty *dict* literal, not an empty set. Relations are
    # modelled as sets everywhere else in this suite, so use set() for
    # type-consistent test data (the identity assertions below are unaffected).
    relations_2 = set()
    mapper_1 = mock.MagicMock(**{"to_tasks_and_relations.return_value": (tasks_1, relations_1)})
    mapper_2 = mock.MagicMock(**{"to_tasks_and_relations.return_value": (tasks_2, relations_2)})
    node_1 = ParsedNode(mapper=mapper_1)
    node_2 = ParsedNode(mapper=mapper_2)
    nodes = dict(TASK_1=node_1, TASK_2=node_2)

    self.converter.convert_nodes(nodes=nodes)

    # The exact objects returned by the mappers must be stored on the nodes.
    self.assertIs(node_1.tasks, tasks_1)
    self.assertIs(node_2.tasks, tasks_2)
    self.assertIs(node_1.relations, relations_1)
    self.assertIs(node_2.relations, relations_2)
def test_to_tasks_and_relations(self):
    """Shell mapping with prepare yields prepare + shell tasks linked by one relation."""
    job_properties = {"nameNode": "hdfs://localhost:9020/", "queueName": "default"}
    config = {"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"}
    mapper = self._get_shell_mapper(job_properties=job_properties, config=config)
    mapper.on_parse_node()

    tasks, relations = mapper.to_tasks_and_relations()

    expected_tasks = [
        Task(
            task_id="test_id_prepare",
            template_name="prepare.tpl",
            template_params={
                "delete": "//examples/output-data/demo/pig-node "
                "//examples/output-data/demo/pig-node2",
                "mkdir": "//examples/input-data/demo/pig-node "
                "//examples/input-data/demo/pig-node2",
            },
        ),
        Task(
            task_id="test_id",
            template_name="shell.tpl",
            template_params={
                "pig_command": "sh echo arg1 arg2",
                "action_node_properties": {
                    "mapred.job.queue.name": "default",
                    "mapred.map.output.compress": "false",
                },
            },
        ),
    ]
    self.assertEqual(expected_tasks, tasks)
    self.assertEqual(relations, [Relation(from_task_id="test_id_prepare", to_task_id="test_id")])
def to_tasks_and_relations(self):
    """Build the Airflow tasks and relations for this pig action.

    Always emits a prepare task followed by the pig task, with a single
    relation linking them in that order.
    """
    prepare_task_id = self.name + "_prepare"
    prepare_task = Task(
        task_id=prepare_task_id,
        template_name="prepare.tpl",
        trigger_rule=self.trigger_rule,
        template_params=dict(prepare_command=self.get_prepare_command(self.oozie_node, self.params)),
    )
    pig_task = Task(
        task_id=self.name,
        template_name="pig.tpl",
        trigger_rule=self.trigger_rule,
        template_params=dict(
            properties=self.properties,
            params_dict=self.params_dict,
            script_file_name=self.script_file_name,
        ),
    )
    return [prepare_task, pig_task], [Relation(from_task_id=prepare_task_id, to_task_id=self.name)]
def convert_relations(workflow: Workflow) -> None:
    """Populate workflow.relations from each node's downstream links.

    For every node, an ok-flow relation is added towards each downstream node,
    and an error-flow relation (is_error=True) towards the error downstream,
    if one exists.
    """
    logging.info("Converting relations between nodes.")
    for node in workflow.nodes.values():
        for downstream_name in node.get_downstreams():
            workflow.relations.add(
                Relation(
                    from_task_id=node.last_task_id,
                    to_task_id=workflow.nodes[downstream_name].first_task_id,
                )
            )
        error_name = node.get_error_downstream_name()
        if error_name:
            workflow.relations.add(
                Relation(
                    from_task_id=node.last_task_id,
                    to_task_id=workflow.nodes[error_name].first_task_id,
                    is_error=True,
                )
            )
def to_tasks_and_relations(self):
    """Build the Airflow tasks and relations for this mapreduce action.

    A prepare task is prepended (with a relation linking it to the main task)
    only when the Oozie node contains a <prepare> section.
    """
    main_task = Task(
        task_id=self.name,
        template_name="mapreduce.tpl",
        trigger_rule=self.trigger_rule,
        template_params=dict(
            properties=self.properties,
            params_dict=self.params_dict,
            hdfs_files=self.hdfs_files,
            hdfs_archives=self.hdfs_archives,
        ),
    )
    if not self.has_prepare(self.oozie_node):
        return [main_task], []

    prepare_task_id = self.name + "_prepare"
    prepare_task = Task(
        task_id=prepare_task_id,
        template_name="prepare.tpl",
        trigger_rule=self.trigger_rule,
        template_params=dict(prepare_command=self.get_prepare_command(self.oozie_node, self.params)),
    )
    return [prepare_task, main_task], [Relation(from_task_id=prepare_task_id, to_task_id=self.name)]
class WorkflowTemplateTestCase(TestCase, TemplateTestMixin):
    """Renders workflow.tpl with a full parameter set and validates the output."""

    TEMPLATE_NAME = "workflow.tpl"

    DEFAULT_TEMPLATE_PARAMS = dict(
        dag_name="test_dag",
        dependencies={"import awesome_stuff"},
        task_groups=[
            TaskGroup(
                name="TASK_GROUP",
                tasks=[
                    Task(task_id="first_task", template_name="dummy.tpl"),
                    Task(task_id="second_task", template_name="dummy.tpl"),
                ],
            )
        ],
        job_properties={"user.name": "USER"},
        config={},
        relations={Relation(from_task_id="TASK_1", to_task_id="TASK_2")},
        schedule_interval=3,
        start_days_ago=3,
        task_map={"oozie-task": ["airflow-task"]},
    )

    def test_green_path(self):
        # Rendering must produce syntactically valid Python source.
        rendered = render_template(self.TEMPLATE_NAME, **self.DEFAULT_TEMPLATE_PARAMS)
        self.assertValidPython(rendered)
def test_to_tasks_and_relations(self):
    """Shell mapping builds a prepare.sh invocation plus the shell task, linked once."""
    params = {
        "dataproc_cluster": "my-cluster",
        "gcp_region": "europe-west3",
        "nameNode": "hdfs://localhost:9020/",
    }
    mapper = self._get_shell_mapper(params=params)

    tasks, relations = mapper.to_tasks_and_relations()

    expected_prepare_command = (
        "$DAGS_FOLDER/../data/prepare.sh -c my-cluster -r europe-west3 "
        '-d "//examples/output-data/demo/pig-node //examples/output-data'
        '/demo/pig-node2" -m "//examples/input-data/demo/pig-node '
        '//examples/input-data/demo/pig-node2"'
    )
    expected_tasks = [
        Task(
            task_id="test_id_prepare",
            template_name="prepare.tpl",
            template_params={"prepare_command": expected_prepare_command},
        ),
        Task(
            task_id="test_id",
            template_name="shell.tpl",
            template_params={"pig_command": "sh 'echo arg1 arg2'"},
        ),
    ]
    self.assertEqual(tasks, expected_tasks)
    self.assertEqual(relations, [Relation(from_task_id="test_id_prepare", to_task_id="test_id")])
def to_tasks_and_relations(self):
    """Build the Airflow tasks and relations for this git action.

    A prepare task is prepended (with a relation linking it to the git task)
    only when the Oozie node contains a <prepare> section.
    """
    git_task = Task(
        task_id=self.name,
        template_name="git.tpl",
        template_params=dict(bash_command=self.bash_command),
    )
    if not self.has_prepare(self.oozie_node):
        return [git_task], []

    prepare_task_id = self.name + "_prepare"
    prepare_task = Task(
        task_id=prepare_task_id,
        template_name="prepare.tpl",
        template_params=dict(prepare_command=self.get_prepare_command(self.oozie_node, self.params)),
    )
    return [prepare_task, git_task], [Relation(from_task_id=prepare_task_id, to_task_id=self.name)]
def test_to_tasks_and_relations(self):
    """Pig mapping with prepare yields prepare + pig tasks linked by one relation."""
    job_properties = {"nameNode": "hdfs://"}
    config = {"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"}
    mapper = self._get_pig_mapper(job_properties=job_properties, config=config)
    mapper.on_parse_node()

    tasks, relations = mapper.to_tasks_and_relations()

    action_node_properties = {
        "mapred.job.queue.name": "${queueName}",
        "mapred.map.output.compress": "false",
    }
    expected_tasks = [
        Task(
            task_id="test_id_prepare",
            template_name="prepare.tpl",
            template_params={
                "delete": "/examples/output-data/demo/pig-node /examples/output-data/demo/pig-node2",
                "mkdir": "/examples/input-data/demo/pig-node /examples/input-data/demo/pig-node2",
            },
        ),
        Task(
            task_id="test_id",
            template_name="pig.tpl",
            template_params={
                "props": PropertySet(
                    config={"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"},
                    job_properties={"nameNode": "hdfs://"},
                    action_node_properties=action_node_properties,
                ),
                "params_dict": {
                    "INPUT": "/user/${wf:user()}/${examplesRoot}/input-data/text",
                    "OUTPUT": "/user/${wf:user()}/${examplesRoot}/output-data/demo/pig-node",
                },
                "script_file_name": "id.pig",
                "action_node_properties": action_node_properties,
            },
        ),
    ]
    self.assertEqual(tasks, expected_tasks)
    self.assertEqual([Relation(from_task_id="test_id_prepare", to_task_id="test_id")], relations)
def _get_relations(self):
    """Return the Airflow relations resulting from the mapping.

    :return: a single prepare->main relation when the node has a <prepare>
        section, otherwise an empty list.
    """
    if not self.has_prepare(self.oozie_node):
        return []
    return [Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)]
def convert_relations(self) -> None:
    """Populate workflow.task_group_relations from each group's downstream names.

    Ok-flow relations start at the group's last ok-flow task; the error-flow
    relation (is_error=True) starts at its last error-flow task.
    """
    logging.info("Converting relations between tasks groups.")
    task_groups = self.workflow.task_groups
    for task_group in task_groups.values():
        for downstream_name in task_group.downstream_names:
            self.workflow.task_group_relations.add(
                Relation(
                    from_task_id=task_group.last_task_id_of_ok_flow,
                    to_task_id=task_groups[downstream_name].first_task_id,
                )
            )
        error_name = task_group.error_downstream_name
        if error_name:
            self.workflow.task_group_relations.add(
                Relation(
                    from_task_id=task_group.last_task_id_of_error_flow,
                    to_task_id=task_groups[error_name].first_task_id,
                    is_error=True,
                )
            )
def test_convert_to_text_with_prepare_node(self):
    """Git mapping with prepare yields prepare + git tasks linked by one relation."""
    git_node = ET.fromstring(EXAMPLE_XML)
    mapper = self._get_git_mapper(git_node)
    mapper.on_parse_node()

    tasks, relations = mapper.to_tasks_and_relations()

    expected_tasks = [
        Task(
            task_id="test_id_prepare",
            template_name="prepare.tpl",
            trigger_rule="one_success",
            template_params={"delete": "/tmp/d_path", "mkdir": "/tmp/mk_path"},
        ),
        Task(
            task_id="test_id",
            template_name="git.tpl",
            trigger_rule="one_success",
            template_params={
                "git_uri": "https://github.com/apache/oozie",
                "git_branch": "{{branch}}",
                "destination_path": "/my_git_repo_directory",
                "key_path": "/awesome-key/",
                "props": PropertySet(
                    config={"dataproc_cluster": "my-cluster", "gcp_region": "europe-west3"},
                    job_properties={
                        "branch": "my-awesome-branch",
                        "nameNode": "hdfs://",
                        "userName": "******",
                        "examplesRoot": "examples",
                    },
                    action_node_properties={},
                ),
            },
        ),
    ]
    self.assertEqual(expected_tasks, tasks)
    self.assertEqual([Relation(from_task_id="test_id_prepare", to_task_id="test_id")], relations)
def test_prepend_task_empty_relations(self):
    """Prepending into an empty relation list creates exactly one linking relation."""
    first = Task(task_id=TEST_MAPPER_NAME + "_1", template_name="pig.tpl")
    second = Task(task_id=TEST_MAPPER_NAME + "_2", template_name="pig.tpl")

    tasks, relations = ActionMapper.prepend_task(task_to_prepend=first, tasks=[second], relations=[])

    self.assertEqual([first, second], tasks)
    self.assertEqual([Relation(from_task_id="mapper_name_1", to_task_id="mapper_name_2")], relations)
def create_relations(self) -> None:
    """
    Given a dictionary of task_ids and ParsedNodes, returns a set of logical connectives for each task in Airflow.

    :return: Set with strings of task's downstream nodes.
    """
    logging.info("Parsing relations between operators.")
    for p_node in self.workflow.nodes.values():
        # Ok flow: link this node's last task to each downstream's first task.
        for downstream in p_node.get_downstreams():
            relation = Relation(
                from_task_id=p_node.last_task_id,
                to_task_id=self.workflow.nodes[downstream].first_task_id
            )
            self.workflow.relations.add(relation)
        error_downstream = p_node.get_error_downstream_name()
        if error_downstream:
            # NOTE(review): unlike the standalone convert_relations() in this
            # codebase, this error relation is created WITHOUT is_error=True -
            # confirm whether this Relation version supports that flag and
            # whether the omission is intentional.
            relation = Relation(
                from_task_id=p_node.last_task_id,
                to_task_id=self.workflow.nodes[error_downstream].first_task_id,
            )
            self.workflow.relations.add(relation)
def test_on_parse_finish_decision_should_not_remove_end_node(self):
    """The end node must survive on_parse_finish when a decision node targets it.

    Only the relation coming from the non-decision node should be dropped.
    """
    workflow = Workflow(input_directory_path="", output_directory_path="", dag_name="DAG_NAME_B")
    mapper = self._get_end_mapper("end_task")
    workflow.nodes["first_task"] = ParsedActionNode(
        mock.Mock(spec=DecisionMapper), tasks=[self._get_dummy_task("first_task")]
    )
    workflow.nodes["second_task"] = ParsedActionNode(
        mock.Mock(spec=BaseMapper), tasks=[self._get_dummy_task("second_task")]
    )
    workflow.nodes["end_task"] = ParsedActionNode(mapper, tasks=[self._get_dummy_task("end_task")])
    workflow.relations = {
        Relation(from_task_id="first_task", to_task_id="end_task"),
        Relation(from_task_id="second_task", to_task_id="end_task"),
    }

    mapper.on_parse_finish(workflow)

    self.assertEqual({"first_task", "second_task", "end_task"}, set(workflow.nodes.keys()))
    self.assertEqual({Relation(from_task_id="first_task", to_task_id="end_task")}, workflow.relations)
def _create_workflow(nodes=None):
    """Build a minimal Workflow fixture.

    :param nodes: optional node mapping; when falsy, a single DummyMapper node
        named AAA is created instead.
    """
    if nodes:
        workflow_nodes = nodes
    else:
        # Default fixture node is built lazily, only when no nodes were supplied.
        workflow_nodes = dict(
            AAA=ParsedActionNode(DummyMapper(Element("dummy"), name="DAG_NAME_A", dag_name="DAG_NAME_B"))
        )
    return Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")},
        nodes=workflow_nodes,
        dependencies={"import IMPORT"},
    )
def _create_workflow():
    """Build a minimal Workflow fixture with one task group and one group relation."""
    task_group = TaskGroup(
        name="DAG_NAME_A",
        tasks=[Task(task_id="task_name", template_name="dummy.tpl")],
    )
    return Workflow(
        dag_name="DAG_NAME",
        input_directory_path="/tmp/input",
        output_directory_path="/tmp/output",
        task_group_relations={Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")},
        task_groups=dict(TASK_NAME=task_group),
        dependencies={"import IMPORT"},
    )
def _create_workflow():
    """Build a minimal Workflow fixture with one DummyMapper action node."""
    node = OozieActionNode(DummyMapper(Element("dummy"), name="DAG_NAME_A", dag_name="DAG_NAME_B"))
    return Workflow(
        dag_name="DAG_NAME",
        input_directory_path="/tmp/input",
        output_directory_path="/tmp/output",
        task_group_relations={Relation(from_task_id="DAG_NAME_A", to_task_id="DAG_NAME_B")},
        nodes=dict(AAA=node),
        dependencies={"import IMPORT"},
    )
def test_create_dag_file(self, open_mock, _):
    """create_dag_file() should open the output DAG path for writing."""
    # Given
    workflow = Workflow(
        dag_name="A",
        input_directory_path="in_dir",
        output_directory_path="out_dir",
        relations={Relation(from_task_id="AAA", to_task_id="BBB")},
        nodes=dict(AAA=ParsedActionNode(DummyMapper(Element("dummy"), name="AAA"))),
        dependencies={"import AAAA"},
    )

    # When
    self.converter.create_dag_file(workflow)

    # Then
    open_mock.assert_called_once_with("/tmp/test_dag.py", "w")