Exemplo n.º 1
0
    def _parse_spark_opts(spark_opts_node: ET.Element):
        """
        Parses the text of a ``spark-opts`` element into a dict of Spark configuration entries.

        Only ``--conf key=value`` options are collected; every other option is ignored.

        Some examples of the spark-opts element:
        --conf key1=value
        --conf key2="value1 value2"

        :param spark_opts_node: the ``spark-opts`` XML element
        :return: dict mapping each configuration key to its value, with one pair of
            matching surrounding quotes removed from the value
        :raises ParseException: if the node has no text, or a ``--conf`` option is not
            in the ``key=value`` format (missing argument, empty key or empty value)
        """
        if not spark_opts_node.text:
            raise ParseException(f"Spark opts node has no text: {spark_opts_node}")

        conf = {}
        # Whatever precedes the first "--" is not an option, hence the [1:].
        # NOTE: a value that itself contains "--" is split here as well.
        for raw_opt in spark_opts_node.text.split("--")[1:]:
            spark_opt = raw_opt.strip().split(maxsplit=1)
            # Can have multiple "--conf" in spark_opts; empty and non-conf options are skipped
            if not spark_opt or spark_opt[0] != "conf":
                continue

            # A bare "--conf" has no argument at all; treat it like a malformed key=value
            assignment = spark_opt[1] if len(spark_opt) > 1 else ""
            key, _, value = assignment.partition("=")
            # Both key and value are required
            if not key or not value:
                raise ParseException(
                    f"Incorrect parameter format. Expected format: key=value. Current value: {spark_opt}"
                )
            # Delete surrounding quotes, but only when the value really is wrapped in a
            # matching pair - an unbalanced quote must not cost the value its last character
            if len(value) > 2 and value[0] in ("'", '"') and value[-1] == value[0]:
                value = value[1:-1]
            conf[key] = value

        return conf
Exemplo n.º 2
0
    def _parse_spark_opts(spark_opts_node: ET.Element):
        """
        Parses the text of a ``spark-opts`` element into a dict of Spark configuration entries.

        Only ``--conf key=value`` options are collected; every other option is ignored.

        Some examples of the spark-opts element:
        --conf key1=value
        --conf key2="value1 value2"

        :param spark_opts_node: the ``spark-opts`` XML element
        :return: dict mapping each configuration key to its value, with one pair of
            matching surrounding quotes removed from the value
        :raises ParseException: if the node has no text, or a ``--conf`` option is not
            in the ``key=value`` format (missing argument, empty key or empty value)
        """
        conf: Dict[str, str] = {}
        if not spark_opts_node.text:
            raise ParseException(f"Spark opts node has no text: {spark_opts_node}")

        # Whatever precedes the first "--" is not an option, hence the [1:].
        # NOTE: a value that itself contains "--" is split here as well.
        for raw_opt in spark_opts_node.text.split("--")[1:]:
            spark_opt = raw_opt.strip().split(maxsplit=1)
            # Can have multiple "--conf" in spark_opts; empty and non-conf options are skipped
            # TODO: parse also other options (like --executor-memory 20G --num-executors 50 and many more)
            #  see: https://oozie.apache.org/docs/5.1.0/DG_SparkActionExtension.html#PySpark_with_Spark_Action
            if not spark_opt or spark_opt[0] != "conf":
                continue

            # A bare "--conf" has no argument at all; treat it like a malformed key=value
            assignment = spark_opt[1] if len(spark_opt) > 1 else ""
            key, _, value = assignment.partition("=")
            # Both key and value are required
            if not key or not value:
                raise ParseException(
                    f"Incorrect parameter format. Expected format: key=value. Current value: {spark_opt}"
                )
            # Delete surrounding quotes, but only when the value really is wrapped in a
            # matching pair - an unbalanced quote must not cost the value its last character
            if len(value) > 2 and value[0] in ("'", '"') and value[-1] == value[0]:
                value = value[1:-1]
            conf[key] = value

        return conf
def extract_properties_from_configuration_node(
        config_node: ET.Element, props: PropertySet) -> Dict[str, str]:
    """
    Collects the name/value pairs declared by the ``property`` children of a ``configuration`` node.

    Every value is passed through ``el_utils.replace_el_with_var`` (unquoted) before it is stored.

    :param config_node: element whose ``property`` children are read
    :param props: property set forwarded to ``el_utils.replace_el_with_var``
    :return: dict mapping each property name to its processed value
    :raises ParseException: if a property lacks a ``name`` or ``value`` child, or either one is empty
    """
    collected: Dict[str, str] = {}
    for prop_element in config_node.findall(TAG_PROPERTY):
        key_element = prop_element.find(TAG_NAME)
        val_element = prop_element.find(TAG_VALUE)

        # Both children have to be present before their text can be inspected
        if key_element is None or val_element is None:
            raise ParseException(
                'Element "property" should have direct children elements: name, value. One of them does not '
                "exist. Make sure the configuration element is valid.")

        key_text, val_text = key_element.text, val_element.text

        if not key_text:
            raise ParseException(
                'Element "name" should have content, however its value is empty. Make sure the element has '
                "the correct content.")

        if not val_text:
            raise ParseException(
                'Element "value" should have content, however its value is empty. Make sure the element has '
                "the correct content.")

        collected[key_text] = el_utils.replace_el_with_var(val_text, props=props, quote=False)

    return collected
Exemplo n.º 4
0
def replace_url_el(url: str, props: PropertySet, allow_no_schema=False) -> str:
    """
    Translates the EL-expressions of a url with ``el_parser.translate`` and validates its scheme.

    The scheme is read from the name-node returned by ``_resolve_name_node`` when there is one,
    otherwise from the translated url itself. So, to validate a url that starts with a
    name-node reference, props should include the proper name-node value.

    :param url: url that may contain EL-expressions
    :param props: property set handed to ``_resolve_name_node``
    :param allow_no_schema: when True, a url without any scheme is accepted as well
    :return: the translated url
    :raises ParseException: if the scheme is not ``hdfs`` (or empty, when allowed)
    """
    translated_url = el_parser.translate(url)

    resolved_name_node, _ = _resolve_name_node(translated_url, props)
    scheme_source = resolved_name_node if resolved_name_node else translated_url
    parsed = urlparse(scheme_source)

    accepted_schemes = {"hdfs"}
    if allow_no_schema:
        accepted_schemes.add("")

    if parsed.scheme in accepted_schemes:
        return translated_url

    raise ParseException(
        "Unknown path format. The URL should be provided in the following format: "
        f"hdfs://localhost:9200/path. Current value: {translated_url}"
    )
Exemplo n.º 5
0
def normalize_path(url: str, props: PropertySet, allow_no_schema=False, translated=False) -> str:
    """
    Validates the scheme of a url and returns only its path part.

    Unless ``translated`` is True, the url is first run through ``el_parser.translate``.
    When ``_resolve_name_node`` returns a name-node, the scheme is read from that name-node and
    the result is the url with its first ``shift`` characters dropped (``shift`` being the second
    value returned by ``_resolve_name_node``). Otherwise both the scheme and the path come from
    parsing the url itself. To validate a url that starts with a name-node reference, props
    should include the proper name-node value.

    :param url: url to normalize
    :param props: property set handed to ``_resolve_name_node``
    :param allow_no_schema: when True, a url without any scheme is accepted as well
    :param translated: when True, the url is used as is instead of being translated
    :return: the path part of the url
    :raises ParseException: if the scheme is not ``hdfs`` (or empty, when allowed)
    """
    if translated:
        translated_url = url
    else:
        translated_url = el_parser.translate(url)

    resolved_name_node, offset = _resolve_name_node(translated_url, props)
    if resolved_name_node:
        parsed = urlparse(resolved_name_node)
        path_only = translated_url[offset:]
    else:
        parsed = urlparse(translated_url)
        path_only = parsed.path

    accepted_schemes = {"hdfs"}
    if allow_no_schema:
        accepted_schemes.add("")

    if parsed.scheme in accepted_schemes:
        return path_only

    raise ParseException(
        "Unknown path format. The URL should be provided in the following format: "
        f"hdfs://localhost:9200/path. Current value: {translated_url}"
    )
Exemplo n.º 6
0
def normalize_path(url, params, allow_no_schema=False):
    """
    Passes the url through ``replace_el_with_var``, validates its scheme and returns its path part.

    :param url: url to normalize
    :param params: forwarded to ``replace_el_with_var``
    :param allow_no_schema: when True, a url without any scheme is accepted as well
    :return: the ``path`` component of the parsed url
    :raises ParseException: if the scheme is not ``hdfs`` (or empty, when allowed)
    """
    expanded_url = replace_el_with_var(url, params=params, quote=False)
    parsed: ParseResult = urlparse(expanded_url)

    accepted_schemes = {"hdfs"}
    if allow_no_schema:
        accepted_schemes.add("")

    if parsed.scheme in accepted_schemes:
        return parsed.path

    raise ParseException(
        "Unknown path format. The URL should be provided in the following format: "
        f"hdfs://localhost:9200/path. Current value: {expanded_url}")
Exemplo n.º 7
0
    def on_parse_node(self):
        """
        Reads the query/script, parameter values, files and archives of the action node.

        Runs the parent hook and ``_parse_config`` first, then requires that exactly one of
        the query and script elements is set.

        :raises ParseException: if neither or both of the query and script elements are set
        """
        super().on_parse_node()
        self._parse_config()

        self.query = get_tag_el_text(self.oozie_node, TAG_QUERY)
        self.script = get_tag_el_text(self.oozie_node, TAG_SCRIPT)
        has_query = bool(self.query)
        has_script = bool(self.script)

        if not (has_query or has_script):
            raise ParseException(
                f"Action Configuration does not include {TAG_SCRIPT} or {TAG_QUERY} element"
            )
        if has_query and has_script:
            raise ParseException(
                f"Action Configuration include {TAG_SCRIPT} and {TAG_QUERY} element. "
                "Only one can be set at the same time.")

        self.variables = extract_param_values_from_action_node(self.oozie_node)
        # Only the second item returned by each extractor is kept
        _, self.hdfs_files = self.file_extractor.parse_node()
        _, self.hdfs_archives = self.archive_extractor.parse_node()
Exemplo n.º 8
0
def replace_url_el(url: str, props: PropertySet, allow_no_schema=False) -> str:
    """
    Passes the url through ``replace_el_with_var``, validates it with ``is_allowed_schema``
    and returns the processed url.

    :param url: url that may contain EL variables
    :param props: property set forwarded to ``replace_el_with_var``
    :param allow_no_schema: forwarded to ``is_allowed_schema``
    :return: the processed url
    :raises ParseException: if ``is_allowed_schema`` rejects the processed url
    """
    expanded_url = replace_el_with_var(url, props=props, quote=False)
    if is_allowed_schema(allow_no_schema, expanded_url):
        return expanded_url
    raise ParseException(
        "Unknown path format. The URL should be provided in the following format: "
        f"hdfs://localhost:9200/path. Current value: {expanded_url}")
Exemplo n.º 9
0
def normalize_path(url: str, props: PropertySet, allow_no_schema=False) -> str:
    """
    Passes the url through ``replace_el_with_var``, validates it with ``is_allowed_schema``
    and returns only the 'path' part of the url.
    Example: hdfs://localhost:8082/user/root --> /user/root

    :param url: url that may contain EL variables
    :param props: property set forwarded to ``replace_el_with_var``
    :param allow_no_schema: forwarded to ``is_allowed_schema``
    :return: the ``path`` component of the parsed url
    :raises ParseException: if ``is_allowed_schema`` rejects the processed url
    """
    expanded_url = replace_el_with_var(url, props=props, quote=False)
    # Parsed before validation, exactly as the path is only needed on success
    parsed: ParseResult = urlparse(expanded_url)
    if is_allowed_schema(allow_no_schema, expanded_url):
        return parsed.path
    raise ParseException(
        "Unknown path format. The URL should be provided in the following format: "
        f"hdfs://localhost:9200/path. Current value: {expanded_url}")
def extract_properties_from_job_xml_nodes(job_xml_nodes: List[ET.Element],
                                          input_directory_path: str):
    """
    Extracts configuration properties from ``job_xml`` nodes.

    The text of each node is taken as a file name relative to
    ``<input_directory_path>/<HDFS_FOLDER>``. That file is parsed as XML and its root is handed
    to ``extract_properties_from_configuration_node``. Results are merged in node order, so a
    later file overrides the same key from an earlier one.

    :param job_xml_nodes: the ``job-xml`` elements to process
    :param input_directory_path: directory that contains the ``HDFS_FOLDER`` directory
    :return: dict with the merged properties of all referenced files
    :raises ParseException: if a node has no text, or a referenced file has a root element
        without any children
    """
    properties_dict: Dict[str, str] = {}

    for xml_file in job_xml_nodes:
        file_name = xml_file.text
        if not file_name:
            raise ParseException(
                'Element "job-xml" should have content, however its value is empty. Make sure the element '
                "has the correct content.")
        file_path = path.join(input_directory_path, HDFS_FOLDER, file_name)
        config_node = ET.parse(file_path).getroot()
        # Explicit child count instead of ``not config_node``: testing the truth value of an
        # Element is deprecated, and what is being checked here is "root has no children".
        if len(config_node) == 0:
            raise ParseException(
                "A job-xml configuration node is specified in the workflow XML, however its value is empty. "
                "Make sure the path to a configuration file is valid.")
        properties_dict.update(extract_properties_from_configuration_node(config_node))

    return properties_dict