def __init__(
    self,
    dag_name: str,
    input_directory_path: str,
    output_directory_path: str,
    action_mapper: Dict[str, Type[ActionMapper]],
    renderer: BaseRenderer,
    transformers: List[BaseWorkflowTransformer] = None,
    user: str = None,
    initial_props: PropertySet = None,
):
    self.workflow = Workflow(
        dag_name=dag_name,
        input_directory_path=input_directory_path,
        output_directory_path=output_directory_path,
    )
    self.renderer = renderer
    self.transformers = transformers or []
    # Propagate the configuration in case an initial property set is passed
    job_properties = {} if not initial_props else initial_props.job_properties
    job_properties["user.name"] = user or os.environ["USER"]
    self.props = PropertySet(job_properties=job_properties)
    self.property_parser = PropertyParser(props=self.props, workflow=self.workflow)
    self.parser = parser.OozieParser(
        props=self.props,
        action_mapper=action_mapper,
        renderer=self.renderer,
        workflow=self.workflow,
    )
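# A minimal standalone sketch (not from the source; the helper name is hypothetical)
# of the property-propagation logic above: an initial property set, when given, seeds
# job_properties, and "user.name" falls back to the invoking OS user.
import os

def build_job_properties(initial_props=None, user=None):
    # Copy the dict so the caller's PropertySet is not mutated; the code above
    # assigns the dict directly, so copying here is a deliberately safer variant.
    job_properties = {} if not initial_props else dict(initial_props.job_properties)
    job_properties["user.name"] = user or os.environ["USER"]
    return job_properties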
def setUp(self):
    props = PropertySet(job_properties={}, config={})
    workflow = Workflow(
        input_directory_path=EXAMPLE_DEMO_PATH,
        output_directory_path="/tmp",
        dag_name="DAG_NAME_B",
    )
    self.parser = parser.OozieParser(
        workflow=workflow,
        props=props,
        action_mapper=ACTION_MAP,
        renderer=mock.MagicMock(),
    )
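# Hedged sketch (assumed, not from the source): a test method in the same TestCase
# that drives the fixture above. parse_workflow() mutates the Workflow held by the
# parser, so assertions inspect parser.workflow afterwards, as the example-based
# tests further below do.
def test_parses_workflow_nodes(self):
    self.parser.parse_workflow()
    self.assertIn("end", self.parser.workflow.nodes)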
def setUp(self):
    params = {}
    self.parser = parser.OozieParser(
        input_directory_path=EXAMPLE_DEMO_PATH,
        output_directory_path="/tmp",
        params=params,
        action_mapper=ACTION_MAP,
        control_mapper=CONTROL_MAP,
    )
def __init__(
    self,
    dag_name: str,
    input_directory_path: str,
    output_directory_path: str,
    action_mapper: Dict[str, Type[ActionMapper]],
    control_mapper: Dict[str, Type[BaseMapper]],
    template_name: str = "workflow.tpl",
    user: str = None,
    start_days_ago: int = None,
    schedule_interval: str = None,
    output_dag_name: str = None,
):
    """
    :param dag_name: Desired output DAG name.
    :param input_directory_path: Oozie workflow directory.
    :param output_directory_path: Desired output directory.
    :param user: Username.  # TODO remove me and use real ${user} EL
    :param start_days_ago: Desired DAG start date, expressed as a number of days
        before the present day.
    :param schedule_interval: Desired DAG schedule interval, expressed as a number of days.
    """
    # Each OozieParser class corresponds to one workflow, where one can get
    # the workflow's required dependencies (imports), operator relations,
    # and operator execution sequence.
    self.input_directory_path = input_directory_path
    self.output_directory_path = output_directory_path
    self.start_days_ago = start_days_ago
    self.schedule_interval = schedule_interval
    self.dag_name = dag_name
    self.template_name = template_name
    self.configuration_properties_file = os.path.join(input_directory_path, CONFIGURATION_PROPERTIES)
    self.job_properties_file = os.path.join(input_directory_path, JOB_PROPERTIES)
    self.output_dag_name = (
        os.path.join(output_directory_path, output_dag_name)
        if output_dag_name
        else os.path.join(output_directory_path, self.dag_name) + ".py"
    )
    params = {"user.name": user or os.environ["USER"]}
    params = self.add_properties_to_params(params)
    params = el_utils.parse_els(self.configuration_properties_file, params)
    self.params = params
    self.parser = parser.OozieParser(
        input_directory_path=input_directory_path,
        output_directory_path=output_directory_path,
        params=params,
        dag_name=dag_name,
        action_mapper=action_mapper,
        control_mapper=control_mapper,
    )
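# Standalone restatement (hypothetical helper name, not from the source) of the
# output-path rule above: an explicit output_dag_name is joined verbatim, while a
# name derived from dag_name gets ".py" appended. Note the asymmetry: the explicit
# branch does not append ".py".
import os

def resolve_output_dag_path(output_directory_path: str, dag_name: str, output_dag_name: str = None) -> str:
    if output_dag_name:
        return os.path.join(output_directory_path, output_dag_name)
    return os.path.join(output_directory_path, dag_name) + ".py"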
def __init__(
    self,
    dag_name: str,
    input_directory_path: str,
    output_directory_path: str,
    action_mapper: Dict[str, Type[ActionMapper]],
    renderer: BaseRenderer,
    user: str = None,
    initial_props: PropertySet = None,
):
    """
    :param dag_name: Desired output DAG name.
    :param input_directory_path: Oozie workflow directory.
    :param output_directory_path: Desired output directory.
    :param user: Username.  # TODO remove me and use real ${user} EL
    :param initial_props: Initial property set to propagate into the parsed workflow.
    """
    # Each OozieParser class corresponds to one workflow, where one can get
    # the workflow's required dependencies (imports), operator relations,
    # and operator execution sequence.
    self.input_directory_path = input_directory_path
    self.output_directory_path = output_directory_path
    self.dag_name = dag_name
    self.config_file = os.path.join(input_directory_path, CONFIG)
    self.job_properties_file = os.path.join(input_directory_path, JOB_PROPS)
    self.renderer = renderer
    # Propagate the configuration in case an initial property set is passed
    self.job_properties = {} if not initial_props else initial_props.job_properties
    self.job_properties["user.name"] = user or os.environ["USER"]
    self.config: Dict[str, str] = {}
    self.props = PropertySet(
        job_properties=self.job_properties,
        config=self.config,
        action_node_properties={},
    )
    self.read_and_update_job_properties_replace_el()
    self.read_config_replace_el()
    self.parser = parser.OozieParser(
        input_directory_path=input_directory_path,
        output_directory_path=output_directory_path,
        props=self.props,
        action_mapper=action_mapper,
        renderer=self.renderer,
        dag_name=dag_name,
    )
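# A minimal sketch of the kind of ${name} EL substitution that
# read_and_update_job_properties_replace_el() implies (assumed behavior; the real
# o2a helpers also handle EL functions and nested references).
import re

def replace_el_vars(value: str, known: dict) -> str:
    # Replace each ${name} with its known value; unknown names are left untouched.
    return re.sub(r"\$\{([\w.]+)\}", lambda m: str(known.get(m.group(1), m.group(0))), value)

assert replace_el_vars("${nameNode}/user/${user.name}", {"nameNode": "hdfs://"}) == "hdfs:///user/${user.name}"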
class TestOozieExamples(unittest.TestCase):
    @parameterized.expand(
        [
            (
                WorkflowTestCase(
                    name="decision",
                    node_names={"start_node_1234", "decision-node", "first", "end", "kill"},
                    job_properties={"nameNode": "hdfs://"},
                    config={},
                ),
            ),
            (
                WorkflowTestCase(
                    name="demo",
                    node_names={
                        "start_node_1234",
                        "fork-node",
                        "pig-node",
                        "subworkflow-node",
                        "shell-node",
                        "join-node",
                        "decision-node",
                        "hdfs-node",
                        "end",
                        "fail",
                    },
                    job_properties={"nameNode": "hdfs://"},
                    config={},
                ),
            ),
            (
                WorkflowTestCase(
                    name="el",
                    node_names={"start_node_1234", "ssh", "end", "fail"},
                    job_properties={"hostname": "user@BBB", "nameNode": "hdfs://"},
                    config={},
                ),
            ),
            (
                WorkflowTestCase(
                    name="fs",
                    node_names={
                        "start_node_1234",
                        "end",
                        "fail",
                        "chmod",
                        "mkdir",
                        "fs-node",
                        "delete",
                        "move",
                        "touchz",
                        "chgrp",
                        "join",
                    },
                    job_properties={"hostname": "user@BBB", "nameNode": "hdfs://*****:*****@BBB"},
                    config={},
                ),
            ),
            (
                WorkflowTestCase(
                    name="subwf",
                    node_names={"start_node_1234", "end", "fail", "subworkflow-node"},
                    job_properties={},
                    config={},
                ),
            ),
            (
                WorkflowTestCase(
                    name="distcp",
                    node_names={"start_node_1234", "end", "fail", "distcp-node"},
                    job_properties={
                        "hostname": "AAAA@BBB",
                        "nameNode": "hdfs://",
                        "nameNode1": "hdfs://*****:*****@",
                    },
                    config={},
                ),
            ),
        ],
        name_func=lambda func, num, p: f"{func.__name__}_{num}_{p.args[0].name}",
    )
    @mock.patch("uuid.uuid4", return_value="1234")
    def test_parse_workflow_examples(self, case: WorkflowTestCase, _):
        workflow = Workflow(
            input_directory_path=path.join(EXAMPLES_PATH, case.name),
            output_directory_path="/tmp",
            dag_name="DAG_NAME_B",
        )
        current_parser = parser.OozieParser(
            workflow=workflow,
            props=PropertySet(job_properties=case.job_properties, config=case.config),
            action_mapper=ACTION_MAP,
            renderer=mock.MagicMock(),
        )
        current_parser.parse_workflow()
        self.assertEqual(case.node_names, set(current_parser.workflow.nodes.keys()))
        self.assertEqual(set(), current_parser.workflow.relations)
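# Hedged aside (my reading of the test's intent, not stated in the source): patching
# uuid.uuid4 to a fixed value is what makes synthetic node names such as
# "start_node_1234" deterministic enough to list literally in every node_names set.
from unittest import mock
import uuid

with mock.patch("uuid.uuid4", return_value="1234"):
    assert f"start_node_{uuid.uuid4()}" == "start_node_1234"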
class TestOozieExamples(unittest.TestCase):
    @parameterized.expand(
        [
            (
                WorkflowTestCase(
                    name="decision",
                    node_names={"decision_node", "first", "end", "kill"},
                    relations={
                        Relation(from_task_id="decision_node", to_task_id="end"),
                        Relation(from_task_id="decision_node", to_task_id="first"),
                        Relation(from_task_id="decision_node", to_task_id="kill"),
                    },
                    params={"nameNode": "hdfs://"},
                ),
            ),
            (
                WorkflowTestCase(
                    name="demo",
                    node_names={
                        "fork_node",
                        "pig_node",
                        "subworkflow_node",
                        "shell_node",
                        "join_node",
                        "decision_node",
                        "hdfs_node",
                        "end",
                    },
                    relations={
                        Relation(from_task_id="decision_node", to_task_id="end"),
                        Relation(from_task_id="decision_node", to_task_id="hdfs_node"),
                        Relation(from_task_id="fork_node", to_task_id="pig_node_prepare"),
                        Relation(from_task_id="fork_node", to_task_id="shell_node_prepare"),
                        Relation(from_task_id="fork_node", to_task_id="subworkflow_node"),
                        Relation(from_task_id="join_node", to_task_id="decision_node"),
                        Relation(from_task_id="pig_node", to_task_id="join_node"),
                        Relation(from_task_id="shell_node", to_task_id="join_node"),
                        Relation(from_task_id="subworkflow_node", to_task_id="join_node"),
                    },
                    params={"nameNode": "hdfs://", "dataproc_cluster": "AAA"},
                ),
            ),
            (
                WorkflowTestCase(
                    name="el",
                    node_names={"ssh"},
                    relations=set(),
                    params={"hostname": "AAAA@BBB", "nameNode": "hdfs://"},
                ),
            ),
            (
                WorkflowTestCase(
                    name="fs",
                    node_names={"chmod", "mkdir", "fs_node", "delete", "move", "touchz", "chgrp", "join"},
                    relations={
                        Relation(from_task_id="fs_node", to_task_id="chgrp_fs_0_mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="delete_fs_0_mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="chmod_fs_0_mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="touchz"),
                        Relation(from_task_id="fs_node", to_task_id="mkdir"),
                        Relation(from_task_id="fs_node", to_task_id="move_fs_0_mkdir"),
                        Relation(from_task_id="mkdir", to_task_id="join"),
                        Relation(from_task_id="delete_fs_1_delete", to_task_id="join"),
                        Relation(from_task_id="move_fs_1_move", to_task_id="join"),
                        Relation(from_task_id="touchz", to_task_id="join"),
                        Relation(from_task_id="chgrp_fs_1_chgrp", to_task_id="join"),
                        Relation(from_task_id="chmod_fs_7_chmod", to_task_id="join"),
                    },
                    params={"hostname": "AAAA@BBB", "nameNode": "hdfs://*****:*****@BBB"},
                ),
            ),
            (WorkflowTestCase(name="subwf", node_names={"subworkflow_node"}, relations=set(), params={}),),
        ],
        name_func=lambda func, num, p: f"{func.__name__}_{num}_{p.args[0].name}",
    )
    @mock.patch("o2a.mappers.base_mapper.BaseMapper.on_parse_finish", wraps=None)
    @mock.patch("uuid.uuid4", return_value="1234")
    def test_parse_workflow_examples(self, case: WorkflowTestCase, _, on_parse_finish_mock):
        current_parser = parser.OozieParser(
            input_directory_path=path.join(EXAMPLES_PATH, case.name),
            output_directory_path="/tmp",
            params=case.params,
            action_mapper=ACTION_MAP,
            control_mapper=CONTROL_MAP,
        )
        current_parser.parse_workflow()
        self.assertEqual(case.node_names, set(current_parser.workflow.nodes.keys()))
        self.assertEqual(case.relations, current_parser.workflow.relations)
        on_parse_finish_mock.assert_called()
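# Hedged demo with stand-in objects (not the real parameterized internals): the
# name_func above receives the test function, the case index, and a param object
# whose .args holds the expanded tuple, producing a readable per-case test name.
from types import SimpleNamespace

def test_parse_workflow_examples():  # stand-in for the real test method
    pass

name_func = lambda func, num, p: f"{func.__name__}_{num}_{p.args[0].name}"
param = SimpleNamespace(args=(SimpleNamespace(name="decision"),))
assert name_func(test_parse_workflow_examples, 0, param) == "test_parse_workflow_examples_0_decision"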