def test_replace_el(self):
    # Given: two EL variables available through the job properties.
    self.job_properties.update(var1="value1", var2="value2")
    # language=XML
    pig_node_xml = """
<pig>
    <file>/path/with/el/${var1}</file>
    <file>/path/with/el/${var2}</file>
    <file>/path/with/two/els/${var1}/${var2}</file>
</pig>
"""
    extractor = FileExtractor(oozie_node=ET.fromstring(pig_node_xml), props=self.props)

    # When
    extractor.parse_node()

    # Then: EL expressions are rewritten to Jinja-style templates.
    expected_hdfs_files = [
        "hdfs:///path/with/el/{{var1}}",
        "hdfs:///path/with/el/{{var2}}",
        "hdfs:///path/with/two/els/{{var1}}/{{var2}}",
    ]
    self.assertEqual(expected_hdfs_files, extractor.hdfs_files)
class MapReduceMapper(ActionMapper):
    """
    Converts a MapReduce Oozie node to an Airflow task.
    """

    def __init__(self, oozie_node: Element, name: str, dag_name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(
            self, oozie_node=oozie_node, name=name, dag_name=dag_name, props=props, **kwargs
        )
        # <param> name/value pairs extracted from the action node.
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        # Populated later by on_parse_node().
        self.name_node: Optional[str] = None
        self.hdfs_files: Optional[List[str]] = None
        self.hdfs_archives: Optional[List[str]] = None
        # Handles the action's optional <prepare> section.
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def on_parse_node(self):
        """Parses the name-node, <param> values, files and archives from the action node."""
        super().on_parse_node()
        self.name_node = get_tag_el_text(self.oozie_node, "name-node", props=self.props)
        self.params_dict = extract_param_values_from_action_node(self.oozie_node, props=self.props)
        # parse_node() returns (local, hdfs) pairs; only the HDFS paths are used here.
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def to_tasks_and_relations(self):
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        action_task = Task(
            task_id=self.name,
            template_name="mapreduce.tpl",
            template_params=dict(
                props=self.props,
                params_dict=self.params_dict,
                hdfs_files=self.hdfs_files,
                hdfs_archives=self.hdfs_archives,
                action_node_properties=self.props.action_node_properties,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        # Both directories must be non-empty strings before assets can be copied.
        if not input_directory_path:
            raise Exception("The input_directory_path should be set and is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set and is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}
def test_replace_el(self):
    # Given: EL variables merged over the default test parameters.
    params = {"var1": "value1", "var2": "value2", **self.default_params}
    # language=XML
    pig_node_xml = """
<pig>
    <file>/path/with/el/${var1}</file>
    <file>/path/with/el/${var2}</file>
    <file>/path/with/two/els/${var1}/${var2}</file>
</pig>
"""
    extractor = FileExtractor(oozie_node=ET.fromstring(pig_node_xml), params=params)

    # When
    extractor.parse_node()

    # Then: each EL expression is substituted with its parameter value.
    expected_hdfs_files = [
        "hdfs:///path/with/el/value1",
        "hdfs:///path/with/el/value2",
        "hdfs:///path/with/two/els/value1/value2",
    ]
    self.assertEqual(extractor.hdfs_files, expected_hdfs_files)
class SparkMapper(ActionMapper, PrepareMixin):
    """Maps Spark Action.

    Extracts the jar/class, job name, configuration properties, spark-opts,
    files/archives and application arguments from the Spark action XML and
    renders a ``spark.tpl`` task, preceded by a ``prepare.tpl`` task when the
    action contains a prepare section.
    """

    application_args: List[str]
    conf: Dict[str, str]
    hdfs_archives: List[str]
    hdfs_files: List[str]
    dataproc_jars: List[str]
    jars: List[str]

    def __init__(
        self,
        oozie_node: ET.Element,
        name: str,
        trigger_rule: str = TriggerRule.ALL_SUCCESS,
        params: Dict[str, str] = None,
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node, name, trigger_rule, **kwargs)
        self.params = params or {}
        self.trigger_rule = trigger_rule
        self.java_class = ""
        self.java_jar = ""
        self.job_name = None
        self.jars = []
        self.properties = {}
        self.application_args = []
        self.file_extractor = FileExtractor(oozie_node=oozie_node, params=self.params)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, params=self.params)
        self.prepare_command = None
        self.hdfs_files = []
        self.hdfs_archives = []
        self.dataproc_jars = []

    def on_parse_node(self):
        """Parses the Spark action node and populates the mapper's fields."""
        # BUG FIX: has_prepare is a method (it is invoked as
        # self.has_prepare(self.oozie_node) in _get_tasks/_get_relations below),
        # so the previous bare `if self.has_prepare:` was always truthy and a
        # prepare command was built even for actions without a <prepare> section.
        if self.has_prepare(self.oozie_node):
            self.prepare_command = self.get_prepare_command(oozie_node=self.oozie_node, params=self.params)
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()
        self.java_jar = self._get_or_default(self.oozie_node, SPARK_TAG_JAR, None, params=self.params)
        self.java_class = self._get_or_default(self.oozie_node, SPARK_TAG_CLASS, None, params=self.params)
        # When both a class and a jar are given, the jar is passed as an extra
        # dependency and the class becomes the entry point.
        if self.java_class and self.java_jar:
            self.dataproc_jars = [self.java_jar]
            self.java_jar = None
        self.job_name = self._get_or_default(self.oozie_node, SPARK_TAG_JOB_NAME, None, params=self.params)
        # Properties can come from referenced job-xml files ...
        job_xml_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_JOB_XML)
        for xml_file in job_xml_nodes:
            tree = ET.parse(xml_file.text)
            self.properties.update(self._parse_config_node(tree.getroot()))
        # ... from the inline <configuration> element ...
        config_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_CONFIGURATION)
        if config_nodes:
            self.properties.update(self._parse_config_node(config_nodes[0]))
        # ... and from the <spark-opts> element (later sources win).
        spark_opts = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_OPTS)
        if spark_opts:
            self.properties.update(self._parse_spark_opts(spark_opts[0]))
        app_args = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_ARGS)
        for arg in app_args:
            self.application_args.append(el_utils.replace_el_with_var(arg.text, self.params, quote=False))

    @staticmethod
    def _get_or_default(root: ET.Element, tag: str, default: str = None, params: Dict[str, str] = None):
        """
        If a node exists in the oozie_node with the tag specified in tag, it
        will attempt to replace the EL (if it exists) with the corresponding
        variable. If no EL var is found, it just returns text. However, if the
        tag is not found under oozie_node, then return default. If there are
        more than one with the specified tag, it uses the first one found.
        """
        var = xml_utils.find_nodes_by_tag(root, tag)
        if var:
            # Only check the first one
            return el_utils.replace_el_with_var(var[0].text, params=params, quote=False)
        return default

    @staticmethod
    def _parse_config_node(config_node: ET.Element) -> Dict[str, str]:
        """Turns a <configuration> element's <name>/<value> children into a dict."""
        conf_dict = {}
        for prop in config_node:
            name_node = prop.find(SPARK_TAG_NAME)
            value_node = prop.find(SPARK_TAG_VALUE)
            # Entries with a missing or empty name/value are skipped.
            if name_node is not None and name_node.text and value_node is not None and value_node.text:
                conf_dict[name_node.text] = value_node.text
        return conf_dict

    @staticmethod
    def _parse_spark_opts(spark_opts_node: ET.Element):
        """
        Some examples of the spark-opts element:
        --conf key1=value
        --conf key2="value1 value2"
        """
        conf = {}
        if spark_opts_node.text:
            spark_opts = spark_opts_node.text.split("--")[1:]
        else:
            raise ParseException("Spark opts node has no text: {}".format(spark_opts_node))
        clean_opts = [opt.strip() for opt in spark_opts]
        clean_opts_split = [opt.split(maxsplit=1) for opt in clean_opts]
        for spark_opt in clean_opts_split:
            # Can have multiple "--conf" in spark_opts
            if spark_opt[0] == "conf":
                key, _, value = spark_opt[1].partition("=")
                # Value is required
                if not value:
                    raise ParseException(
                        f"Incorrect parameter format. Expected format: key=value. Current value: {spark_opt}"
                    )
                # Delete surrounding quotes.  (The previous trailing `and value`
                # test was dead code: value is known to be truthy here.)
                if len(value) > 2 and value[0] in ["'", '"']:
                    value = value[1:-1]
                conf[key] = value
        return conf

    def _get_tasks(self):
        """
        Returns the list of Airflow tasks that are the result of mapping

        :return: list of Airflow tasks
        """
        action_task = Task(
            task_id=self.name,
            template_name="spark.tpl",
            trigger_rule=self.trigger_rule,
            template_params=dict(
                main_jar=self.java_jar,
                main_class=self.java_class,
                arguments=self.application_args,
                archives=self.hdfs_archives,
                files=self.hdfs_files,
                job_name=self.job_name,
                dataproc_spark_properties=self.properties,
                dataproc_spark_jars=self.dataproc_jars,
            ),
        )
        if not self.has_prepare(self.oozie_node):
            return [action_task]
        prepare_task = Task(
            task_id=self.name + "_prepare",
            template_name="prepare.tpl",
            template_params=dict(prepare_command=self.prepare_command),
        )
        return [prepare_task, action_task]

    def _get_relations(self):
        """
        Returns the list of Airflow relations that are the result of mapping

        :return: list of relations
        """
        return (
            [Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)]
            if self.has_prepare(self.oozie_node)
            else []
        )

    def to_tasks_and_relations(self):
        """Returns the (tasks, relations) pair for this action."""
        tasks = self._get_tasks()
        relations = self._get_relations()
        return tasks, relations

    def required_imports(self) -> Set[str]:
        # Bash are for the potential prepare statement
        return {
            "from airflow.contrib.operators import dataproc_operator",
            "from airflow.operators import bash_operator",
            "from airflow.operators import dummy_operator",
        }

    @property
    def first_task_id(self):
        # The prepare task, when present, is always first in _get_tasks().
        return self._get_tasks()[0].task_id
class MapReduceMapper(ActionMapper, PrepareMixin):
    """
    Converts a MapReduce Oozie node to an Airflow task.
    """

    def __init__(
        self,
        oozie_node: Element,
        name: str,
        trigger_rule: str = TriggerRule.ALL_SUCCESS,
        params: Dict[str, str] = None,
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, trigger_rule=trigger_rule, **kwargs)
        if params is None:
            params = dict()
        self.params = params
        # <configuration> properties and <param> name/value pairs from the action node.
        self.properties: Dict[str, str] = {}
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, params=params)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, params=params)
        # Populated later by on_parse_node().
        self.name_node = None
        self.hdfs_files = None
        self.hdfs_archives = None

    def on_parse_node(self):
        """Parses the name-node, config, params, files and archives from the action node."""
        name_node_text = self.oozie_node.find("name-node").text
        self.name_node = el_utils.replace_el_with_var(name_node_text, params=self.params, quote=False)
        self._parse_config()
        self._parse_params()
        # parse_node() returns (local, hdfs) pairs; only the HDFS paths are used here.
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def _parse_params(self):
        # Each <param> holds "key=value" text; EL expressions are resolved first.
        param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
        if param_nodes:
            self.params_dict = {}
            for node in param_nodes:
                param = el_utils.replace_el_with_var(node.text, params=self.params, quote=False)
                # maxsplit=1 so values may themselves contain "=".
                key, value = param.split("=", 1)
                self.params_dict[key] = value

    def to_tasks_and_relations(self):
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        tasks = [
            Task(
                task_id=self.name,
                template_name="mapreduce.tpl",
                trigger_rule=self.trigger_rule,
                template_params=dict(
                    properties=self.properties,
                    params_dict=self.params_dict,
                    hdfs_files=self.hdfs_files,
                    hdfs_archives=self.hdfs_archives,
                ),
            )
        ]
        relations = []
        if self.has_prepare(self.oozie_node):
            prepare_command = self.get_prepare_command(self.oozie_node, self.params)
            tasks.insert(
                0,
                Task(
                    task_id=self.name + "_prepare",
                    template_name="prepare.tpl",
                    trigger_rule=self.trigger_rule,
                    template_params=dict(prepare_command=prepare_command),
                ),
            )
            relations = [Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)]
        return tasks, relations

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        # Both directories must be non-empty strings before assets can be copied.
        if not input_directory_path:
            raise Exception("The input_directory_path should be set and is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set and is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}
class JavaMapper(ActionMapper):
    """
    Converts a Java Oozie action node to an Airflow task.
    """

    def __init__(
        self,
        oozie_node: Element,
        name: str,
        dag_name: str,
        props: PropertySet,
        jar_files: List[str],
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node=oozie_node, dag_name=dag_name, name=name, props=props, **kwargs)
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        # Populated later by on_parse_node() / _extract_java_data().
        self.main_class: Optional[str] = None
        self.java_opts: List[str] = []
        self.args: Optional[List[str]] = None
        self.hdfs_files: Optional[List[str]] = None
        self.hdfs_archives: Optional[List[str]] = None
        # Handles the action's optional <prepare> section.
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)
        self.jar_files: List[str] = jar_files if jar_files else []
        # Full HDFS paths for the jars, derived from the application path below.
        self.jar_files_in_hdfs: List[str] = []
        self._get_jar_files_in_hdfs_full_paths()

    def on_parse_node(self):
        """Parses files, archives and the Java-specific configuration of the node."""
        super().on_parse_node()
        # parse_node() returns (local, hdfs) pairs; only the HDFS paths are used here.
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()
        self._extract_java_data()

    def to_tasks_and_relations(self):
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        action_task = Task(
            task_id=self.name,
            template_name="java.tpl",
            template_params=dict(
                props=self.props,
                hdfs_files=self.hdfs_files,
                hdfs_archives=self.hdfs_archives,
                main_class=self.main_class,
                jar_files_in_hdfs=self.jar_files_in_hdfs,
                args=self.args,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}

    def _get_jar_files_in_hdfs_full_paths(self):
        # Jars are expected to live under <application-path>/<LIB_FOLDER>/ in HDFS.
        hdfs_app_prefix = self.props.job_properties["oozie.wf.application.path"]
        for file in self.jar_files:
            self.jar_files_in_hdfs.append(hdfs_app_prefix + "/" + LIB_FOLDER + "/" + file)

    def _extract_java_data(self):
        """Extracts Java node data."""
        root = self.oozie_node
        props = self.props
        # JVM options can come from well-known Hadoop properties ...
        if "mapred.child.java.opts" in props.merged:
            self.java_opts.extend(props.merged["mapred.child.java.opts"].split(" "))
        if "mapreduce.map.java.opts" in props.merged:
            self.java_opts.extend(props.merged["mapreduce.map.java.opts"].split(" "))
        self.main_class = xml_utils.get_tag_el_text(root=root, tag=TAG_MAIN_CLASS, props=props)
        # ... and from either a single <java-opts> string or repeated <java-opt> elements.
        java_opts_string = xml_utils.get_tag_el_text(root=root, tag=TAG_JAVA_OPTS, props=props)
        if java_opts_string:
            self.java_opts.extend(java_opts_string.split(" "))
        else:
            self.java_opts.extend(get_tags_el_array_from_text(root=root, tag=TAG_JAVA_OPT, props=props))
        self.args = get_tags_el_array_from_text(root=root, tag=TAG_ARG, props=props)
class PigMapper(ActionMapper):
    """
    Converts a Pig Oozie node to an Airflow task.
    """

    def __init__(self, oozie_node: Element, name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, props=props, **kwargs)
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        # NOTE: parsing happens eagerly in the constructor here, unlike the
        # mappers that defer it to on_parse_node().
        self._parse_oozie_node()
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def _parse_oozie_node(self):
        # Extracts resource-manager, name-node, script name, params, files and archives.
        self.resource_manager = get_tag_el_text(self.oozie_node, TAG_RESOURCE)
        self.name_node = get_tag_el_text(self.oozie_node, TAG_NAME)
        self.script_file_name = get_tag_el_text(self.oozie_node, TAG_SCRIPT)
        self.params_dict = extract_param_values_from_action_node(self.oozie_node)
        # Both the workflow-local paths and the HDFS paths are kept: the local
        # names are needed for symlink generation in _add_symlinks().
        self.files, self.hdfs_files = self.file_extractor.parse_node()
        self.archives, self.hdfs_archives = self.archive_extractor.parse_node()

    def to_tasks_and_relations(self):
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        action_task = Task(
            task_id=self.name,
            template_name="pig.tpl",
            template_params=dict(
                props=self.props,
                params_dict=self.params_dict,
                script_file_name=self.script_file_name,
                action_node_properties=self.props.action_node_properties,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    def _add_symlinks(self, destination_pig_file):
        # Prepends mapred symlink/cache directives so the copied script can
        # reference its files and archives by their local names.
        destination_pig_file.write("set mapred.create.symlink yes;\n")
        if self.files:
            destination_pig_file.write("set mapred.cache.file {};\n".format(",".join(self.hdfs_files)))
        if self.archives:
            destination_pig_file.write("set mapred.cache.archives {};\n".format(",".join(self.hdfs_archives)))

    def copy_extra_assets(self, input_directory_path: str, output_directory_path: str):
        """Copies the Pig script to the output directory, injecting cache directives."""
        self._validate_paths(input_directory_path, output_directory_path)
        source_pig_file_path = os.path.join(input_directory_path, self.script_file_name)
        destination_pig_file_path = os.path.join(output_directory_path, self.script_file_name)
        self._copy_pig_script_with_path_injection(destination_pig_file_path, source_pig_file_path)

    def _copy_pig_script_with_path_injection(self, destination_pig_file_path, source_pig_file_path):
        os.makedirs(os.path.dirname(destination_pig_file_path), exist_ok=True)
        with open(destination_pig_file_path, "w") as destination_pig_file:
            with open(source_pig_file_path, "r") as source_pig_file:
                pig_script = source_pig_file.read()
                # Symlink directives must precede the original script body.
                if self.files or self.archives:
                    self._add_symlinks(destination_pig_file)
                destination_pig_file.write(pig_script)

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        # Both directories must be non-empty strings before assets can be copied.
        if not input_directory_path:
            raise Exception(f"The input_directory_path should be set and is {input_directory_path}")
        if not output_directory_path:
            raise Exception(f"The output_directory_path should be set and is {output_directory_path}")

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}
class HiveMapper(ActionMapper):
    """
    Converts a Hive Oozie node to an Airflow task.
    """

    def __init__(self, oozie_node: Element, name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, props=props, **kwargs)
        # Populated later by on_parse_node(); exactly one of query/script will be set.
        self.variables: Optional[Dict[str, str]] = None
        self.query: Optional[str] = None
        self.script: Optional[str] = None
        self.hdfs_files: Optional[List[str]] = None
        self.hdfs_archives: Optional[List[str]] = None
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        # Handles the action's optional <prepare> section.
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def on_parse_node(self):
        """Parses the query/script, variables, files and archives from the action node.

        :raises ParseException: when neither or both of the query and script
            elements are present — they are mutually exclusive.
        """
        super().on_parse_node()
        self._parse_config()
        self.query = get_tag_el_text(self.oozie_node, TAG_QUERY)
        self.script = get_tag_el_text(self.oozie_node, TAG_SCRIPT)
        if not self.query and not self.script:
            raise ParseException(
                f"Action Configuration does not include {TAG_SCRIPT} or {TAG_QUERY} element"
            )

        if self.query and self.script:
            raise ParseException(
                f"Action Configuration include {TAG_SCRIPT} and {TAG_QUERY} element. "
                f"Only one can be set at the same time."
            )

        self.variables = extract_param_values_from_action_node(self.oozie_node)
        # parse_node() returns (local, hdfs) pairs; only the HDFS paths are used here.
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def to_tasks_and_relations(self):
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        action_task = Task(
            task_id=self.name,
            template_name="hive.tpl",
            template_params=dict(
                query=self.query,
                script=self.script,
                props=self.props,
                archives=self.hdfs_archives,
                files=self.hdfs_files,
                variables=self.variables,
            ),
        )
        tasks = [action_task]
        relations = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    def copy_extra_assets(self, input_directory_path: str, output_directory_path: str):
        """Copies the Hive script file, if any, next to the generated DAG."""
        # Inline queries need no extra assets.
        if not self.script:
            return
        source_script_file_path = os.path.join(input_directory_path, self.script)
        destination_script_file_path = os.path.join(output_directory_path, self.script)
        os.makedirs(os.path.dirname(destination_script_file_path), exist_ok=True)
        shutil.copy(source_script_file_path, destination_script_file_path)

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}
class SparkMapper(ActionMapper):
    """Maps Spark Action"""

    def __init__(self, oozie_node: ET.Element, name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, props=props, **kwargs)
        # Populated later by on_parse_node().
        self.java_class: Optional[str] = None
        self.java_jar: Optional[str] = None
        self.job_name: Optional[str] = None
        self.jars: List[str] = []
        self.application_args: List[str] = []
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        self.hdfs_files: List[str] = []
        self.hdfs_archives: List[str] = []
        self.dataproc_jars: List[str] = []
        self.spark_opts: Dict[str, str] = {}
        # Handles the action's optional <prepare> section.
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def on_parse_node(self):
        """Parses jar/class, job name, spark-opts, args, files and archives."""
        super().on_parse_node()
        # parse_node() returns (local, hdfs) pairs; only the HDFS paths are used here.
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()
        self.java_jar = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_JAR)
        self.java_class = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_CLASS)
        # When both a class and a jar are given, the jar is passed as an extra
        # dependency and the class becomes the entry point.
        if self.java_class and self.java_jar:
            self.dataproc_jars = [self.java_jar]
            self.java_jar = None

        self.job_name = get_tag_el_text(self.oozie_node, tag=SPARK_TAG_JOB_NAME)

        spark_opts = xml_utils.find_nodes_by_tag(self.oozie_node, SPARK_TAG_OPTS)
        if spark_opts:
            self.spark_opts.update(self._parse_spark_opts(spark_opts[0]))

        self.application_args = xml_utils.get_tags_el_array_from_text(self.oozie_node, tag=SPARK_TAG_ARG)

    @staticmethod
    def _parse_spark_opts(spark_opts_node: ET.Element):
        """
        Some examples of the spark-opts element:
        --conf key1=value
        --conf key2="value1 value2"
        """
        conf: Dict[str, str] = {}
        if spark_opts_node.text:
            # Each option starts with "--"; the leading empty split entry is dropped.
            spark_opts = spark_opts_node.text.split("--")[1:]
        else:
            raise ParseException(f"Spark opts node has no text: {spark_opts_node}")
        clean_opts = [opt.strip() for opt in spark_opts]
        clean_opts_split = [opt.split(maxsplit=1) for opt in clean_opts]

        for spark_opt in clean_opts_split:
            # Can have multiple "--conf" in spark_opts
            if spark_opt[0] == "conf":
                key, _, value = spark_opt[1].partition("=")
                # Value is required
                if not value:
                    raise ParseException(
                        f"Incorrect parameter format. Expected format: key=value. Current value: {spark_opt}"
                    )
                # Delete surrounding quotes
                # NOTE(review): the trailing `and value` is redundant (value is
                # known truthy here), and the closing quote is stripped without
                # checking it matches the opening one.
                if len(value) > 2 and value[0] in ["'", '"'] and value:
                    value = value[1:-1]
                conf[key] = value
            # TODO: parse also other options (like --executor-memory 20G --num-executors 50 and many more)
            # see: https://oozie.apache.org/docs/5.1.0/DG_SparkActionExtension.html#PySpark_with_Spark_Action

        return conf

    def to_tasks_and_relations(self):
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        action_task = Task(
            task_id=self.name,
            template_name="spark.tpl",
            template_params=dict(
                main_jar=self.java_jar,
                main_class=self.java_class,
                arguments=self.application_args,
                hdfs_archives=self.hdfs_archives,
                hdfs_files=self.hdfs_files,
                job_name=self.job_name,
                dataproc_spark_jars=self.dataproc_jars,
                spark_opts=self.spark_opts,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    def required_imports(self) -> Set[str]:
        # Bash are for the potential prepare statement
        return {
            "from airflow.contrib.operators import dataproc_operator",
            "from airflow.operators import bash_operator",
            "from airflow.operators import dummy_operator",
        }
class PigMapper(ActionMapper, PrepareMixin):
    """
    Converts a Pig Oozie node to an Airflow task.

    Extracts the resource-manager, name-node, script name, configuration and
    params from the action XML, and renders a ``prepare.tpl`` task followed
    by a ``pig.tpl`` task.
    """

    properties: Dict[str, str]
    params_dict: Dict[str, str]

    def __init__(
        self,
        oozie_node: Element,
        name: str,
        trigger_rule: str = TriggerRule.ALL_SUCCESS,
        params=None,
        **kwargs,
    ):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, trigger_rule=trigger_rule, **kwargs)
        if params is None:
            params = dict()
        self.params = params
        self.trigger_rule = trigger_rule
        self.properties = {}
        self.params_dict = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, params=params)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, params=params)
        # NOTE: parsing happens eagerly in the constructor for this mapper.
        self._parse_oozie_node()

    def _parse_oozie_node(self):
        # Extracts resource-manager, name-node and script, resolving EL expressions.
        res_man_text = self.oozie_node.find("resource-manager").text
        name_node_text = self.oozie_node.find("name-node").text
        script = self.oozie_node.find("script").text
        self.resource_manager = el_utils.replace_el_with_var(res_man_text, params=self.params, quote=False)
        self.name_node = el_utils.replace_el_with_var(name_node_text, params=self.params, quote=False)
        self.script_file_name = el_utils.replace_el_with_var(script, params=self.params, quote=False)
        self._parse_config()
        self._parse_params()
        # Both the workflow-local paths and the HDFS paths are kept: the local
        # names are needed for symlink generation in _add_symlinks().
        self.files, self.hdfs_files = self.file_extractor.parse_node()
        self.archives, self.hdfs_archives = self.archive_extractor.parse_node()

    def _parse_params(self):
        # Each <param> holds "key=value" text; EL expressions are resolved first.
        param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
        if param_nodes:
            self.params_dict = {}
            for node in param_nodes:
                param = el_utils.replace_el_with_var(node.text, params=self.params, quote=False)
                # BUG FIX: split on the first "=" only (maxsplit=1). A bare
                # split("=") raised ValueError for values that themselves
                # contain "=", and the MapReduce mapper already splits this way.
                key, value = param.split("=", 1)
                self.params_dict[key] = value

    def to_tasks_and_relations(self):
        """Returns the prepare and pig tasks plus the relation linking them."""
        prepare_command = self.get_prepare_command(self.oozie_node, self.params)
        tasks = [
            Task(
                task_id=self.name + "_prepare",
                template_name="prepare.tpl",
                trigger_rule=self.trigger_rule,
                template_params=dict(prepare_command=prepare_command),
            ),
            Task(
                task_id=self.name,
                template_name="pig.tpl",
                trigger_rule=self.trigger_rule,
                template_params=dict(
                    properties=self.properties,
                    params_dict=self.params_dict,
                    script_file_name=self.script_file_name,
                ),
            ),
        ]
        relations = [Relation(from_task_id=self.name + "_prepare", to_task_id=self.name)]
        return tasks, relations

    def _add_symlinks(self, destination_pig_file):
        # Prepends mapred symlink/cache directives so the copied script can
        # reference its files and archives by their local names.
        destination_pig_file.write("set mapred.create.symlink yes;\n")
        if self.files:
            destination_pig_file.write("set mapred.cache.file {};\n".format(",".join(self.hdfs_files)))
        if self.archives:
            destination_pig_file.write("set mapred.cache.archives {};\n".format(",".join(self.hdfs_archives)))

    def copy_extra_assets(self, input_directory_path: str, output_directory_path: str):
        """Copies the Pig script to the output directory, injecting cache directives."""
        self._validate_paths(input_directory_path, output_directory_path)
        source_pig_file_path = os.path.join(input_directory_path, self.script_file_name)
        destination_pig_file_path = os.path.join(output_directory_path, self.script_file_name)
        self._copy_pig_script_with_path_injection(destination_pig_file_path, source_pig_file_path)

    def _copy_pig_script_with_path_injection(self, destination_pig_file_path, source_pig_file_path):
        os.makedirs(os.path.dirname(destination_pig_file_path), exist_ok=True)
        with open(destination_pig_file_path, "w") as destination_pig_file:
            with open(source_pig_file_path, "r") as source_pig_file:
                pig_script = source_pig_file.read()
                # Symlink directives must precede the original script body.
                if self.files or self.archives:
                    self._add_symlinks(destination_pig_file)
                destination_pig_file.write(pig_script)

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        # Both directories must be non-empty strings before assets can be copied.
        if not input_directory_path:
            raise Exception("The input_directory_path should be set and is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set and is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}

    @property
    def first_task_id(self):
        # The prepare task is always emitted first by to_tasks_and_relations().
        return "{task_id}_prepare".format(task_id=self.name)
class MapReduceMapper(ActionMapper):
    """
    Converts a MapReduce Oozie node to an Airflow task.
    """

    def __init__(self, oozie_node: Element, name: str, dag_name: str, props: PropertySet, **kwargs):
        ActionMapper.__init__(self, oozie_node=oozie_node, name=name, dag_name=dag_name, props=props, **kwargs)
        # <param> name/value pairs extracted from the action node.
        self.params_dict: Dict[str, str] = {}
        self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
        self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
        # Populated later by on_parse_node().
        self.name_node = None
        self.hdfs_files = None
        self.hdfs_archives = None
        # Handles the action's optional <prepare> section.
        self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)

    def on_parse_node(self):
        """Parses the name-node, <param> values, files and archives from the action node."""
        super().on_parse_node()
        name_node_text = self.oozie_node.find("name-node").text
        self.name_node = el_utils.replace_el_with_var(name_node_text, props=self.props, quote=False)
        self._parse_params()
        # parse_node() returns (local, hdfs) pairs; only the HDFS paths are used here.
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()

    def _parse_params(self):
        # Each <param> holds "key=value" text; EL expressions are resolved first.
        param_nodes = xml_utils.find_nodes_by_tag(self.oozie_node, "param")
        if param_nodes:
            self.params_dict = {}
            for node in param_nodes:
                param = el_utils.replace_el_with_var(node.text, props=self.props, quote=False)
                # maxsplit=1 so values may themselves contain "=".
                key, value = param.split("=", 1)
                self.params_dict[key] = value

    def to_tasks_and_relations(self) -> Tuple[List[Task], List[Relation]]:
        """Returns the mapped tasks and relations; a prepare task is prepended when present."""
        action_task = Task(
            task_id=self.name,
            template_name="mapreduce.tpl",
            template_params=dict(
                props=self.props,
                params_dict=self.params_dict,
                hdfs_files=self.hdfs_files,
                hdfs_archives=self.hdfs_archives,
                action_node_properties=self.props.action_node_properties,
            ),
        )
        tasks = [action_task]
        relations: List[Relation] = []
        prepare_task = self.prepare_extension.get_prepare_task()
        if prepare_task:
            tasks, relations = self.prepend_task(prepare_task, tasks, relations)
        return tasks, relations

    @staticmethod
    def _validate_paths(input_directory_path, output_directory_path):
        # Both directories must be non-empty strings before assets can be copied.
        if not input_directory_path:
            raise Exception("The input_directory_path should be set and is {}".format(input_directory_path))
        if not output_directory_path:
            raise Exception("The output_directory_path should be set and is {}".format(output_directory_path))

    def required_imports(self) -> Set[str]:
        """Import statements that the generated Airflow DAG file needs for this task."""
        return {"from airflow.utils import dates", "from airflow.contrib.operators import dataproc_operator"}