Example #1
    def _build_graph(graph_name, nodes):
        graph = nx.DiGraph()

        logger.debug('Creating graph %s from nodes %s',
                     graph_name, nodes.keys())

        graph.add_nodes_from(nodes.keys())

        graph.add_edges_from(
            (upstream_name, downstream_name)
            for (upstream_name, (downstream_node_names, _node)) in six.iteritems(nodes)
            for downstream_name in downstream_node_names)

        if not nx.algorithms.components.is_weakly_connected(graph):
            components = list(
                nx.algorithms.components.weakly_connected_components(graph))
            logger.warning(
                'Multiple connected components found for graph `%s`: %s',
                graph_name,
                components)

        if not nx.algorithms.dag.is_directed_acyclic_graph(graph):
            raise exceptions.CyclicWorkflowException(
                'Invalid graph `{}`: not a DAG!'.format(graph_name))

        logger.debug('Successfully created graph %s with nodes %s',
                     graph_name, graph.nodes)

        return graph
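
A minimal usage sketch of the adjacency shape this helper expects: keys of `nodes` are node names, values are `(downstream_node_names, node)` pairs. The names below are hypothetical, and plain networkx stands in for the surrounding module:

    import networkx as nx

    # Hypothetical mapping: name -> (downstream node names, node object)
    nodes = {
        'extract': (['transform'], object()),
        'transform': (['load'], object()),
        'load': ([], object()),
    }

    graph = nx.DiGraph()
    graph.add_nodes_from(nodes.keys())
    graph.add_edges_from(
        (upstream, downstream)
        for (upstream, (downstream_names, _node)) in nodes.items()
        for downstream in downstream_names)

    assert nx.algorithms.dag.is_directed_acyclic_graph(graph)
    print(list(nx.topological_sort(graph)))  # ['extract', 'transform', 'load']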
Example #2
    def _augment_pruned_sub_workflow_referrers(self, planned_prune_nodes,
                                               planned_keep_nodes):
        """ Determine which, if any, workflows will be fully deleted by the
            prune operation, identify all nodes that refer to those workflows,
            and schedule those nodes for deletion as well.

            :param planned_prune_nodes: set of nodes currently planned for removal.
                The contents of this set will be modified by this method.
            :type planned_prune_nodes: set<(string, string)>
            :param planned_keep_nodes: set of nodes currently planned for inclusion
                in the pruned workflow.  The contents of this set will be modified
                by this method.
            :type planned_keep_nodes: set<(string, string)>
        """

        all_workflow_names = frozenset(dag['name'] for dag in self.secondary)
        deleted_workflows = all_workflow_names - \
            frozenset(node[0] for node in planned_keep_nodes)

        referrer_map = self._build_referrer_map()

        for wf in deleted_workflows:
            referrers = referrer_map[wf]
            logger.debug(
                'Pruning nodes %s that refer to fully-deleted workflow %s',
                referrers, wf)

            planned_prune_nodes |= set(referrers)
            planned_keep_nodes -= set(referrers)

        return
Example #3
    def process_arg(self, arg, node, raw_args):
        try:
            regex = re.compile(self.pattern)
        except re.error:
            raise Exception(
                'Error compiling regex for `{}`: `{}` is an invalid pattern'.
                format(self.type, self.pattern))

        try:
            rendered_arg = self.render_template(arg, raw_args)
        except jinja2.exceptions.UndefinedError:
            logger.debug(
                'Could not render template `%s`; cannot verify that the argument '
                'matches the required pattern `%s`!', arg, regex.pattern)
            return arg

        if regex.match(rendered_arg):
            # return the original arg, not the rendered arg, because we are not
            # actually transforming anything, just validating
            return arg

        VERBATIM_REGEX = '<<.+>>'
        if re.compile(VERBATIM_REGEX).search(rendered_arg):
            logger.debug(
                'Argument generated from `%s` may not match the required pattern `%s`.',
                rendered_arg, regex.pattern)
            return arg

        raise Exception(
            'Invalid argument `{}`: does not match expected pattern `{}`'.
            format(rendered_arg, regex.pattern))
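
The early-return branch above fires when the template cannot be rendered. A small sketch of that failure mode, assuming Jinja2's StrictUndefined behavior (the real render_template implementation is not shown here):

    import jinja2

    template = jinja2.Template('{{ bucket }}/output',
                               undefined=jinja2.StrictUndefined)
    try:
        template.render()  # `bucket` is not defined
    except jinja2.exceptions.UndefinedError as exc:
        # The validator treats this as "cannot verify" and passes the raw
        # argument through unchanged instead of rejecting it
        print('render failed:', exc)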
Example #4
    def _augment_keep_nodes_list(keep_node_paths, planned_prune_nodes,
                                 planned_keep_nodes):
        """ Examines the paths to nodes in the keep_node_paths argument, making
            sure that elements of those paths will not be pruned out of the
            graph, making those nodes inaccessible.

            :param keep_node_paths: paths to the nodes we plan to keep
            :type keep_node_paths: list<list<(string, string)>>
            :param planned_prune_nodes: set of nodes currently planned for removal.
                The contents of this set will be modified by this method.
            :type planned_prune_nodes: set<(string, string)>
            :param planned_keep_nodes: set of nodes currently planned for inclusion
                in the pruned workflow.  The contents of this set will be modified
                by this method.
            :type planned_keep_nodes: set<(string, string)>
        """
        for path in keep_node_paths:
            for node in path:
                if node in planned_prune_nodes:
                    planned_prune_nodes.remove(node)
                    planned_keep_nodes.add(node)
                    logger.debug(
                        'Keeping node %s required for accessibility to node %s',
                        node, path[-1])

        return
Example #5
    def load_configs(self, config_paths):
        if not isinstance(config_paths, list):
            raise TypeError('Invalid `config_paths` argument: expected a '
                            'list<str>, got {}'.format(config_paths))

        registry = {}
        errors = {}

        for path in config_paths:
            logger.debug('Loading configs from path %s', path)
            for filename in os.listdir(path):
                full_path = os.path.abspath(os.path.join(path, filename))

                if os.path.splitext(filename)[1] not in ['.yaml', '.yml']:
                    raise Exception(
                        'Invalid file found: {} (must be a yaml config file)'.
                        format(full_path))
                config = self.load_from_file(full_path)

                name = config['name']

                if name in registry:
                    errors.setdefault(name, [registry[name]])
                    errors[name].append(config)
                    continue

                registry[name] = config

        if errors:
            raise Exception(
                'Errors found in loading registry: {}'.format(errors))

        return registry
Example #6
    def load_configs(self, config_paths):
        if not isinstance(config_paths, list):
            raise TypeError('Invalid `config_paths` argument: expected a '
                            'list<str>, got {}'.format(config_paths))

        registry = {}
        duplicates = {}

        for path in config_paths:
            logger.debug('Loading configs from path %s', path)
            for filename in os.listdir(path):
                full_path = os.path.abspath(os.path.join(path, filename))

                if os.path.splitext(filename)[1] not in ['.yaml', '.yml']:
                    raise Exception(
                        'Invalid file found: {} (must be a yaml config file)'.
                        format(full_path))
                config = self.load_from_file(full_path)

                name = config['name']

                if name in registry:
                    duplicates.setdefault(name, [registry[name]])
                    duplicates[name].append(config)
                    continue

                registry[name] = config

        if duplicates:
            raise DuplicateRegistryConfigName(
                'Duplicate names found while loading registry: `{}` '
                '(full configurations: {})'.format(
                    '`, `'.join(duplicates.keys()), duplicates))

        return registry
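
The duplicate-handling idiom above seeds the duplicates list with the original registry entry on the first collision. A toy run of the same pattern, with hypothetical configs:

    registry = {}
    duplicates = {}

    for config in [{'name': 'a', 'v': 1}, {'name': 'b', 'v': 2}, {'name': 'a', 'v': 3}]:
        name = config['name']
        if name in registry:
            # setdefault seeds the list with the original entry exactly once
            duplicates.setdefault(name, [registry[name]])
            duplicates[name].append(config)
            continue
        registry[name] = config

    print(duplicates)  # {'a': [{'name': 'a', 'v': 1}, {'name': 'a', 'v': 3}]}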
Example #7
    def _strip_workflow_nodes(self, workflow, graph):
        """ Compute a new workflow from the provided one, including only those
            nodes present in the provided graph, and with upstream dependencies
            set according to those in the graph.

            :param workflow: the workflow
            :type workflow: dict
            :param graph: the graph
            :type graph: nx.DiGraph
            :returns: a new workflow
            :rtype: dict
        """
        original_workflow_node_names = frozenset(
            wf['name'] for wf in self.get_all_nodes(workflow))
        keyed_nodes = {node.name: node for node in graph.nodes()}

        def strip_section_operators(operators):
            result = []
            for operator in operators:
                if operator['name'] not in keyed_nodes:
                    continue

                dependencies = [
                    node.name
                    for node in graph.predecessors(keyed_nodes[operator['name']])
                    if node.name in original_workflow_node_names
                ]

                new_operator = operator.copy()
                new_operator['upstream_dependencies'] = dependencies
                if not dependencies:
                    new_operator.pop('upstream_dependencies')

                # Remove any downstream dependencies that may have been specified
                # in the original graph, because we will use upstream dependencies
                # (arbitrarily) as the mechanism for specifying all dependencies
                if 'downstream_dependencies' in new_operator:
                    new_operator.pop('downstream_dependencies')

                result.append(new_operator)

            return result

        new_workflow = workflow.copy()

        for section_name in ['before', 'after', 'operators', 'generators', 'sub_dags']:
            if section_name not in workflow:
                continue

            new_section = strip_section_operators(workflow[section_name])
            if new_section:
                new_workflow[section_name] = new_section
                logger.debug('New workflow section %s: %s',
                             section_name, new_section)
            else:
                new_workflow.pop(section_name)
                logger.debug('Removing workflow section %s', section_name)

        return new_workflow
Example #8
    def insert_bundler_nodes(nodes, graph, fc_node_builder, default_bundler_node=None):
        """ Utility for inserting "bundler" nodes that help us express
            complex dependency settings that are not otherwise supported by
            Airflow.  In particular: suppose we have a node, N, that we would
            like to trigger whenever either (A and B) are complete or
            (C and D) are complete.  Airflow does not have any mechanism for
            expressing this directly.  Instead, we insert a bundler node X0
            that triggers when (A and B) are complete, and a bundler node X1
            that triggers when (C and D) are complete, and then we set N to
            trigger when either X0 or X1 is complete, using the ONE_SUCCESS
            trigger rule.

            The specific use case for this logic in boundary-layer is for
            creating shared resources when any resource-dependent operator is
            ready, based on its upstream dependencies being satisfied, even if
            the upstream dependencies of other resource-dependent operators
            may not yet be satisfied.

            :param nodes: the nodes whose upstream dependency sets are to be
                bundled
            :param graph: the graph
            :param fc_node_builder: callable for constructing new flow control
                nodes
            :param default_bundler_node: preexisting node to use as the bundler
                if there is only one bundle in the input.  This parameter is
                ignored if there is more than one bundle; in that case, or if
                this parameter is not provided at all, new flow control nodes
                will be inserted to act as bundler nodes.
        """

        bundles = _GraphUtil.get_distinct_upstream_dependency_sets(
            nodes, graph)

        bundler_nodes = _GraphUtil._create_bundler_nodes(
            bundles,
            fc_node_builder,
            default_bundler_node,
            )

        to_insert = [node for node in bundler_nodes.values()
                     if node != default_bundler_node]
        logger.debug('Inserting bundler nodes: %s', to_insert)
        graph.add_nodes_from(to_insert)

        for node in nodes:
            dep_set = _GraphUtil.upstream_dependency_set(node, graph)

            if not dep_set:
                # Some nodes may not have had any upstream dependencies, and
                # no bundler nodes would have been added for these
                continue

            bundler_node = bundler_nodes[dep_set]

            _GraphUtil.attach_flow_control_between(
                upstream=dep_set,
                downstream=frozenset([node]),
                graph=graph,
                fc_node=bundler_node)

        return bundler_nodes
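
A topology-only sketch of the transformation the docstring describes, using plain networkx with hypothetical string nodes (the ONE_SUCCESS trigger rule itself is an Airflow setting and is not modeled here):

    import networkx as nx

    # Goal: run N when (a AND b) complete, or when (c AND d) complete
    graph = nx.DiGraph([('a', 'N'), ('b', 'N'), ('c', 'N'), ('d', 'N')])

    # Insert one bundler node per distinct upstream dependency set ...
    graph.remove_edges_from([('a', 'N'), ('b', 'N'), ('c', 'N'), ('d', 'N')])
    graph.add_edges_from([('a', 'X0'), ('b', 'X0'),
                          ('c', 'X1'), ('d', 'X1'),
                          ('X0', 'N'), ('X1', 'N')])

    # ... so that N only needs ONE_SUCCESS across its two bundler parents
    print(sorted(graph.predecessors('N')))  # ['X0', 'X1']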
Example #9
    def load_config_registry(cls, subpath, factory):
        logger.debug('Configs path: %s', cls.configs_path)
        if not cls.configs_path:
            return None

        registry_config_path = os.path.join(cls.configs_path, subpath)
        if not os.path.isdir(registry_config_path):
            return None

        return factory([registry_config_path])
Example #10
    def get(self, item):
        if 'type' not in item:
            raise Exception(
                'Invalid item: no `type` specified for `{}`'.format(item))

        if item['type'] not in self.node_configs:
            logger.debug('Unrecognized type `%s` for item %s', item['type'],
                         item)
            return None

        return self.node_cls(config=self.node_configs[item['type']], item=item)
Example #11
    def load_from_file(self, filename):
        with open(filename) as _in:
            item = yaml.safe_load(_in)

        logger.debug('validating item %s against schema %s', item,
                     self.spec_schema_cls.__name__)

        loaded = self.spec_schema_cls().load(item)
        if loaded.errors:
            raise InvalidConfig('Invalid config spec in file {}: {}'.format(
                filename, loaded.errors))

        return loaded.data
Example #12
    def check_jsonschema(self, data):
        if 'parameters_jsonschema' not in data:
            return

        # Make sure that `properties` is present, because it's not actually
        # required to make a valid JSONSchema
        if 'properties' not in data.get('parameters_jsonschema', {}):
            logger.debug(
                'No `properties` defined in `parameters_jsonschema` for `%s`',
                data['name'])

        try:
            Draft4Validator.check_schema(data['parameters_jsonschema'])
        except Exception as e:
            raise ma.ValidationError('Invalid JSON schema: {}'.format(e),
                                     ['parameters_jsonschema'])
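
Note that Draft4Validator.check_schema validates the schema document itself, not data against it. A small sketch of both outcomes, with a hypothetical schema:

    from jsonschema import Draft4Validator, SchemaError

    valid = {'type': 'object',
             'properties': {'cluster_name': {'type': 'string'}}}
    Draft4Validator.check_schema(valid)  # passes silently

    try:
        # `properties` must be an object, so this schema is itself invalid
        Draft4Validator.check_schema({'properties': 'not-an-object'})
    except SchemaError as exc:
        print('invalid schema:', exc.message)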
Example #13
    def partition_actions(data):
        operators = [ac for ac in data['action'] if not isinstance(ac, OozieSubWorkflowBuilderBase)]
        sub_workflows = [ac for ac in data['action'] if isinstance(ac, OozieSubWorkflowBuilderBase)]

        logger.debug('Partitioned actions for workflow %s.  Operators are %s, sub_workflows are %s',
                     data['name'],
                     [operator.name for operator in operators],
                     [sub_workflow.name for sub_workflow in sub_workflows])

        assert len(operators + sub_workflows) == len(data['action']), \
            'Some actions were not partitioned!'

        return {
            'operators': operators,
            'sub_workflows': sub_workflows,
        }
Example #14
    def attach_flow_control_between(
            upstream,
            downstream,
            graph,
            fc_node=None,
            fc_node_builder=None):
        """ For each downstream node, break that node's connections with all of
            the upstream nodes provided, and insert a flow-control node between
            them.  If the fc_node argument is provided, then use the provided node
            for flow-control; otherwise, insert a new flow-control node.
        """
        if not upstream or not downstream:
            raise Exception('upstream and downstream node lists cannot be '
                            'empty (given upstream `{}`, downstream `{}`)'.format(
                                upstream, downstream))

        if not fc_node and not fc_node_builder:
            raise Exception(
                'Internal error: neither fc_node nor fc_node_builder argument '
                'was provided to the method attach_flow_control_between() '
                'but exactly one of these is required')

        if fc_node and fc_node_builder:
            raise Exception(
                'Internal error: both fc_node and fc_node_builder arguments '
                'were provided to the method attach_flow_control_between() '
                'but exactly one of these is required')

        if not fc_node:
            fc_node = fc_node_builder(deps=downstream)
            logger.debug('Inserting new flow control node %s between '
                         '{ %s } and { %s }', fc_node, upstream, downstream)

            graph.add_node(fc_node)
        else:
            logger.debug('Attaching existing node %s between '
                         '{ %s } and { %s }', fc_node, upstream, downstream)

        for node in downstream:
            if not upstream.issubset(_GraphUtil.upstream_dependency_set(node, graph)):
                raise InvalidFlowControlNode(
                    'Internal error: Invalid flow-control insertion. '
                    'Upstream nodes {} are not all upstream dependencies of '
                    'node {}'.format(upstream, node))

        # Break the edges between the upstream and downstream nodes
        break_edges = [(up, down) for up in upstream for down in downstream]
        logger.debug('removing edges: %s', break_edges)
        graph.remove_edges_from(break_edges)

        # Now insert the flow control node between the upstream and downstream
        # dependencies
        add_edges = [(up, fc_node) for up in upstream] + \
                    [(fc_node, down) for down in downstream]

        logger.debug('adding edges: %s', add_edges)
        graph.add_edges_from(add_edges)
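
The rewiring at the end reduces to one edge removal and one edge insertion pass. A standalone sketch on a toy graph (hypothetical node names):

    import networkx as nx

    graph = nx.DiGraph([('a', 'x'), ('b', 'x'), ('a', 'y'), ('b', 'y')])
    upstream, downstream, fc_node = {'a', 'b'}, {'x', 'y'}, 'fc-0'

    # Break the direct edges, then route everything through the fc node
    graph.add_node(fc_node)
    graph.remove_edges_from((up, down) for up in upstream for down in downstream)
    graph.add_edges_from([(up, fc_node) for up in upstream] +
                         [(fc_node, down) for down in downstream])

    print(sorted(graph.edges()))
    # [('a', 'fc-0'), ('b', 'fc-0'), ('fc-0', 'x'), ('fc-0', 'y')]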
Example #15
    def __init__(self):
        logger.debug('Loading plugin %s', self.name)

        if self.oozie_plugin_cls is not None and not \
                issubclass(self.oozie_plugin_cls, BaseOozieParserPlugin):
            raise Exception('Invalid oozie plugin: {}'.format(
                self.oozie_plugin_cls))

        self.operator_registry = self.load_config_registry(
            'operators', OperatorRegistry)
        self.subdag_registry = self.load_config_registry('subdags', SubdagRegistry)
        self.generator_registry = self.load_config_registry(
            'generators', GeneratorRegistry)
        self.resource_registry = self.load_config_registry(
            'resources', ResourceRegistry)
        self.property_preprocessor_registry = \
            PropertyPreprocessorRegistry(self.property_preprocessors)
Example #16
    def prune_node(node, graph):
        if node not in graph:
            raise Exception('Cannot prune node {}: not present in graph {}'.format(
                node, graph))

        upstream = list(graph.predecessors(node))
        downstream = list(graph.successors(node))

        logger.debug('Pruning node %s with upstream nodes %s and downstream nodes %s',
                     node, upstream, downstream)

        graph.add_edges_from((up, down)
                             for up in upstream for down in downstream)

        graph.remove_node(node)

        return graph
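
The effect of pruning is to splice a node's predecessors directly onto its successors. A standalone sketch of the same two operations:

    import networkx as nx

    graph = nx.DiGraph([('a', 'b'), ('b', 'c'), ('b', 'd')])

    # Pruning `b` reconnects its predecessors to its successors
    upstream = list(graph.predecessors('b'))
    downstream = list(graph.successors('b'))
    graph.add_edges_from((up, down) for up in upstream for down in downstream)
    graph.remove_node('b')

    print(sorted(graph.edges()))  # [('a', 'c'), ('a', 'd')]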
Example #17
    def _prune_workflow(self, workflow, is_primary, graph, prune_nodes):
        """ Produce the pruned workflow.  Works by pruning the provided graph
            using the provided prune_nodes argument, and then reconstructing
            the workflow to include only the nodes remaining after pruning,
            with their upstream dependencies set properly to those computed
            during pruning.

            :param workflow: the workflow to prune
            :type workflow: dict
            :param is_primary: whether or not this workflow is the primary
                workflow
            :type is_primary: boolean
            :param graph: the OperatorGraph associated with this workflow
            :type graph: OperatorGraph
            :param prune_nodes: the set of all nodes being pruned from all
                workflows, specified as (workflow_name, node_name) tuples
            :type prune_nodes: set<(string, string)>

            :returns: the pruned workflow
            :rtype: dict
        """
        prune_node_names = [
            node_name
            for (prune_workflow_name, node_name) in prune_nodes
            if (is_primary and prune_workflow_name is None) or
            (workflow['name'] == prune_workflow_name)
        ]

        logger.debug(
            'Graph for workflow %s (%s) before pruning: %s',
            workflow['name'],
            'primary' if is_primary else 'secondary',
            graph.graph.nodes())

        _GraphUtil.prune_nodes(
            graph=graph.graph,
            nodes=[graph.nodes[node_name] for node_name in prune_node_names])

        logger.debug(
            'Graph for workflow %s (%s) after pruning: %s',
            workflow['name'],
            'primary' if is_primary else 'secondary',
            graph.graph.nodes())

        return self._strip_workflow_nodes(workflow, graph.graph)
Example #18
    def render_operator(self, node):
        if node.type == NodeTypes.GENERATOR:
            template_filename = 'generator_operator.j2'
        elif node.type == NodeTypes.SUBDAG:
            template_filename = 'subdag_operator.j2'
        else:
            template_filename = 'operator.j2'

        template = self.get_jinja_template(template_filename)

        # Do not set upstream/downstream dependencies that involve generator nodes
        # at this stage; those are all set within the generator nodes, and if they are
        # set here, there will be python errors due to references to operators that
        # do not exist (generators do not correspond to operators)
        generator_nodes = frozenset(gen.name for gen in self.graph.graph.nodes
                                    if gen.type == NodeTypes.GENERATOR)

        upstream_deps = frozenset(
            dep.name for dep in self.graph.upstream_dependency_set(node))

        if generator_nodes & upstream_deps:
            logger.debug(
                'Not passing upstream generator dependencies `%s` to '
                'operator template for node `%s`',
                generator_nodes & upstream_deps, node.name)

        downstream_deps = frozenset(
            dep.name for dep in self.graph.downstream_dependency_set(node))

        if generator_nodes & downstream_deps:
            logger.debug(
                'Not passing downstream generator dependencies `%s` to '
                'operator template for node `%s`',
                generator_nodes & downstream_deps, node.name)

        sorted_operator_args = order_dict(node.operator_args)
        return template.render(
            node=node,
            args=sorted_operator_args,
            upstream_dependencies=list(upstream_deps - generator_nodes),
            downstream_dependencies=list(downstream_deps - generator_nodes),
        )
Example #19
    def attach_destroy_resource(resource, graph, fc_node_builder):
        """ Method to attach the destroy-resource node to the graph,
            which is simpler than the attachment of the create-resource node
            because it can be attached directly to the downstream boundary nodes
            without breaking any existing dependencies.
        """
        destroy_resource = resource.destroy_operator
        if not destroy_resource:
            return

        downstream_boundary = _GraphUtil.downstream_resource_boundary(
            resource.name,
            graph)

        logger.debug('downstream boundary nodes for resource `%s`: %s',
                     resource.name, downstream_boundary)

        nodes_to_add = [destroy_resource]

        if _GraphUtil.requires_destroy_resource_sentinel(downstream_boundary, graph):
            # The destroy_resource node is the only leaf node in the DAG.
            # Add a flow control node to the graph that will inherit its state
            # from the nodes that are upstream to the destroy_resource node.
            # This is because Airflow's DagRun final-state logic only looks
            # at leaf node states, and if the destroy_resource step runs
            # successfully after an upstream failure then the DagRun would
            # be marked as a success, which is presumably not the desired
            # behavior.

            if resource.disable_sentinel_node:
                logger.debug(
                    'Not adding a sentinel node for resource %s because '
                    'the DAG or the resource configuration specified '
                    'disable_sentinel_node == True',
                    destroy_resource.name)
            else:
                logger.debug(
                    '`%s` is the only leaf node in the DAG: adding a '
                    'sentinel node to propagate upstream failures to '
                    'the DagRun state',
                    destroy_resource.name)
                fc_node = fc_node_builder(
                    deps=downstream_boundary,
                    name=destroy_resource.name + '-sentinel')
                nodes_to_add.append(fc_node)

        graph.add_nodes_from(nodes_to_add)

        for node in downstream_boundary:
            logger.debug(
                'Adding edges between node %s and nodes %s', node, nodes_to_add)
            graph.add_edges_from((node, new_node) for new_node in nodes_to_add)
Example #20
    def _apply_preprocessors(self, args, preprocessors):
        """ Apply any necessary preprocessing to the alread-validated args.
            This must be the last step in case any preprocessors are defined on
            fields that are inserted by the schema defaults.
        """
        result = args.copy()

        for (property_name, preprocessor) in six.iteritems(preprocessors):
            if property_name not in args:
                continue

            processed_value = preprocessor.process_arg(args[property_name],
                                                       node=self,
                                                       raw_args=args)
            logger.debug(
                'Property `%s` raw value: `%s`, processed value: `%s`',
                property_name, args[property_name], processed_value)

            result[property_name] = processed_value

        return result
Example #21
    def __init__(self, dag, default_task_args=None):
        self.dag = dag

        self.fc_node_builder = functools.partial(
            util.make_flow_control_node,
            default_task_args=default_task_args or {})

        main_graph = _GraphUtil.build_subgraph(
            dag.get('operators', []) +
            dag.get('sub_dags', []) +
            dag.get('generators', []))

        before_graph = _GraphUtil.build_subgraph(dag.get('before', []))
        after_graph = _GraphUtil.build_subgraph(dag.get('after', []))

        resource_graph = _GraphUtil.build_subgraph(dag.get('resources', []))

        self.graph = nx.algorithms.operators.union_all([
            main_graph,
            before_graph,
            after_graph])

        self.nodes = {node.name: node for node in self.graph.nodes()}
        for upstream_node in _GraphUtil.get_downstream_surface(before_graph):
            for downstream_node in _GraphUtil.get_upstream_surface(main_graph):
                self.graph.add_edge(upstream_node, downstream_node)

        for downstream_node in _GraphUtil.get_upstream_surface(after_graph):
            for upstream_node in _GraphUtil.get_downstream_surface(main_graph):
                self.graph.add_edge(upstream_node, downstream_node)

        # Finally get all resources and figure out how to attach them to the
        # graph
        for resource in resource_graph:
            logger.debug('Attaching resource %s to graph', resource)
            _GraphUtil.attach_create_resource(resource, self.graph, self.fc_node_builder)
            _GraphUtil.attach_destroy_resource(resource, self.graph, self.fc_node_builder)

        self._attach_generator_flow_control()
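
The before/main/after stitching amounts to connecting the sinks of one subgraph to the sources of the next. A sketch with plain networkx and hypothetical node names (get_downstream_surface/get_upstream_surface are approximated here by degree checks):

    import networkx as nx

    before = nx.DiGraph([('init', 'stage')])
    main = nx.DiGraph([('extract', 'load')])
    after = nx.DiGraph([('report', 'cleanup')])

    graph = nx.union_all([before, main, after])

    # Sinks of `before` feed sources of `main`; sinks of `main` feed
    # sources of `after`
    for up_graph, down_graph in [(before, main), (main, after)]:
        for up in (n for n in up_graph if up_graph.out_degree(n) == 0):
            for down in (n for n in down_graph if down_graph.in_degree(n) == 0):
                graph.add_edge(up, down)

    print(list(nx.topological_sort(graph)))
    # ['init', 'stage', 'extract', 'load', 'report', 'cleanup']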
Example #22
    def build_subgraph(nodes):
        graph = nx.DiGraph()

        node_lookup = {node.name: node for node in nodes}

        # First insert every node into the graph
        graph.add_nodes_from(nodes)

        # Now insert every explicit edge into the graph, making sure that
        # any referenced node is actually present in the subgraph
        for node in nodes:
            logger.debug('Processing node %s: upstream %s downstream %s',
                         node, node.upstream_dependencies, node.downstream_dependencies)
            for upstream in node.upstream_dependencies:
                upstream_node = node_lookup.get(upstream)
                if not upstream_node:
                    raise Exception('Invalid reference {} <- {}: perhaps it '
                                    'crosses sub-graph boundaries?'.format(
                                        upstream,
                                        node.name))

                graph.add_edge(upstream_node, node)

            for downstream in node.downstream_dependencies:
                downstream_node = node_lookup.get(downstream)

                if not downstream_node:
                    raise Exception('Invalid reference {} -> {}: perhaps it '
                                    'crosses sub-graph boundaries?'.format(
                                        node.name,
                                        downstream))

                graph.add_edge(node, downstream_node)

        if not nx.algorithms.dag.is_directed_acyclic_graph(graph):
            raise CyclicWorkflowException('Invalid sub-graph: not a DAG!')

        return graph
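
A usage sketch with a minimal node type standing in for the real node objects; only the three attributes the function reads are modeled, and the edge-building loop is inlined so the example is self-contained:

    import collections
    import networkx as nx

    Node = collections.namedtuple(
        'Node', ['name', 'upstream_dependencies', 'downstream_dependencies'])

    nodes = [
        Node('extract', [], []),
        Node('transform', ['extract'], ['load']),
        Node('load', [], []),
    ]

    lookup = {node.name: node for node in nodes}
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes)
    for node in nodes:
        for upstream in node.upstream_dependencies:
            graph.add_edge(lookup[upstream], node)
        for downstream in node.downstream_dependencies:
            graph.add_edge(node, lookup[downstream])

    print([node.name for node in nx.topological_sort(graph)])
    # ['extract', 'transform', 'load']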
Example #23
    def _parse_all(
            self,
            primary_workflow_path_name,
            cluster_config,
            oozie_config):

        def _build_workflow_path(name):
            return os.path.join(name, 'workflow.xml')

        path = _build_workflow_path(primary_workflow_path_name)
        logger.debug('parsing primary workflow from path %s', path)
        primary = self._parse_workflow(path, cluster_config, oozie_config)

        sub_workflows = {}
        parsed_sub_workflow_targets = set()
        q = list(primary['sub_workflows'])

        while q:
            item = q.pop()
            swf_path_name = self._sub_workflow_target_name(item.get_action())
            if swf_path_name in parsed_sub_workflow_targets:
                continue

            parsed_sub_workflow_targets.add(swf_path_name)

            swf_path = _build_workflow_path(swf_path_name)

            logger.debug('parsing sub workflow from path %s', swf_path)
            swf = self._parse_workflow(
                swf_path, cluster_config, oozie_config)

            sub_workflows[swf['name']] = swf
            item.set_target_name(swf['name'])

            q += list(swf['sub_workflows'])

        return (primary, sub_workflows)
Example #24
    def attach_create_resource(resource, graph, fc_node_builder):
        create_resource_node = resource.create_operator
        graph.add_node(create_resource_node)

        upstream_boundary = \
            _GraphUtil.upstream_resource_boundary(resource.name, graph)

        logger.debug('upstream boundary nodes for resource `%s`: %s',
                     resource.name, upstream_boundary)

        bundler_nodes = _GraphUtil.insert_bundler_nodes(
            upstream_boundary,
            graph,
            fc_node_builder,
            create_resource_node)

        for bundler_node in bundler_nodes.values():
            if bundler_node == create_resource_node:
                continue

            graph.add_edge(bundler_node, create_resource_node)

        for node in upstream_boundary:
            dep_set = _GraphUtil.upstream_dependency_set(node, graph)
            assert len(dep_set) <= 1, \
                'Error: node {} has dependency set {}, but its ' \
                'dependencies should have been bundled!'.format(
                    node,
                    dep_set)

            bundler_node = list(dep_set)[0] if dep_set else None

            # If the create_resource_node is functioning as the bundler
            # node, then all necessary dependencies will have been set up
            # already.
            if bundler_node == create_resource_node:
                continue

            # Otherwise, we have to insert dependencies between the
            # create_resource_node and the downstream node.
            if not bundler_node:
                logger.debug(
                    'Adding edge between create_resource_node and dependency-free node %s',
                    node)
            else:
                logger.debug(
                    'Adding edge between create_resource_node and node %s with bundler node %s',
                    node,
                    bundler_node)

            graph.add_edge(create_resource_node, node)
Example #25
    def validate_and_resolve_properties(spec):
        secondary_lookup = {dag['name']: dag for dag in spec.secondary}
        default_task_args = spec.primary.get('default_task_args', {})

        # Construct sets of all of the resources created and requested,
        # so that we can check for unused resources and default args
        all_resources_created = set()
        all_resources_requested = set()
        all_defaults_used = set()

        def validate_dag(dag, execution_context):
            nodes = Workflow.get_all_nodes(dag)

            dag_resources = {
                resource.name: resource
                for resource in dag['resources']
            }

            Workflow.ensure_no_duplicate_names(
                dag['name'],
                [node.name for node in nodes] + list(dag_resources),
                list(execution_context.resources))

            for resource in dag_resources.values():
                (create_properties,
                 destroy_properties) = resource.resolve_properties(
                     execution_context=execution_context,
                     default_task_args=default_task_args,
                     base_operator_loader=plugins.manager.operators,
                     preprocessor_loader=plugins.manager.property_preprocessors,
                 )
                all_defaults_used.update(
                    set(create_properties.sources.default_task_args))
                all_defaults_used.update(
                    set(destroy_properties.sources.default_task_args))

            all_resources_created.update(set(dag_resources))

            all_resources_requested.update(
                set(resource for node in nodes
                    for resource in node.requires_resources))

            resources_available = execution_context.resources.copy()
            resources_available.update(dag_resources)

            missing_resources = {
                name: missing
                for (name, missing) in [(node.name,
                                         frozenset(node.requires_resources) -
                                         frozenset(resources_available))
                                        for node in nodes] if missing
            }

            if missing_resources:
                raise ValueError(
                    'Error in dag {}: Operators require resources '
                    'outside their local contexts: {}'.format(
                        dag['name'], missing_resources))

            for node in nodes:
                properties = node.resolve_properties(
                    execution_context=execution_context._replace(
                        resources=resources_available),
                    default_task_args=default_task_args,
                    base_operator_loader=plugins.manager.operators,
                    preprocessor_loader=plugins.manager.property_preprocessors,
                )

                all_defaults_used.update(
                    set(properties.sources.default_task_args))

            all_referrers = dag['sub_dags'] + dag['generators']
            for referrer in all_referrers:
                subdag = secondary_lookup[referrer.target]
                subdag_resources_available = {
                    name: resource
                    for (name, resource) in six.iteritems(resources_available)
                    if name in frozenset(referrer.requires_resources)
                }
                subdag_ctx = ExecutionContext(
                    referrer=referrer, resources=subdag_resources_available)
                validate_dag(subdag, subdag_ctx)

        validate_dag(spec.primary, ExecutionContext(referrer=None,
                                                    resources={}))

        unused_resources = all_resources_created - all_resources_requested

        if unused_resources:
            raise ValueError('Unused resources `{}`'.format(
                '`, `'.join(unused_resources)))

        unused_defaults = frozenset(default_task_args) - frozenset(
            all_defaults_used)

        if unused_defaults:
            logger.debug('Unused default task args: `%s`',
                         '`, `'.join(unused_defaults))
Example #26
    def _prune_paths(self, prune_nodes, keep_nodes,
                     allow_augmented_keep_nodes):
        """ Actually do the pruning.  There are a few steps here.

            First, we must figure out whether there are any nodes in the keep
            pile that require nodes from the prune pile, because those nodes
            are used in paths to the keep nodes (but only if
            allow_augmented_keep_nodes is True)

            Second, we figure out whether there are any nodes in the keep pile
            that refer to sub-workflows that are entirely in the prune pile,
            so that these referring nodes should also be pruned.

            Third, we figure out which workflows, if any, will become inaccessible
            due to the removal of referring nodes, and we discard these.

            Fourth, we prune all the specified nodes out of the graph.

            Finally, we delete any sub-workflows that end up empty after pruning.

            :param prune_nodes: paths to all of the nodes we plan to prune
            :type prune_nodes: list<list<(string, string)>>
            :param keep_nodes: paths to all of the nodes we plan to keep
            :type keep_nodes: list<list<(string, string)>>
            :param allow_augmented_keep_nodes: Whether to augment the list of
                keep_nodes by adding in any nodes required for access to nodes
                in the keep_nodes list.  For example, a node that lies on the
                only remaining path to a keep node must itself be kept.
        """

        # Use a set to denote the nodes that we are ultimately going
        # to prune
        planned_prune_nodes = set(path[-1] for path in prune_nodes)
        planned_keep_nodes = set(path[-1] for path in keep_nodes)

        # First step, remove items from planned_prune_nodes if necessary
        if allow_augmented_keep_nodes:
            self._augment_keep_nodes_list(keep_nodes, planned_prune_nodes,
                                          planned_keep_nodes)

        # Second step, augment planned_prune_nodes to reflect any fully-deleted
        # sub-workflows
        self._augment_pruned_sub_workflow_referrers(planned_prune_nodes,
                                                    planned_keep_nodes)

        # Third step, identify any sub-workflows that have become inaccessible
        # by removal of referring nodes
        inaccessible_workflows = self._find_inaccessible_workflows(
            planned_prune_nodes)

        # Fourth step, prune the graph
        logger.debug('going to prune away nodes: %s', planned_prune_nodes)

        keyed_graphs = self._build_keyed_graph_map()

        pruned_primary = self._prune_workflow(workflow=self.primary,
                                              is_primary=True,
                                              graph=keyed_graphs[None],
                                              prune_nodes=planned_prune_nodes)
        if self._workflow_is_empty(pruned_primary):
            raise Exception(
                'Pruning operation produced an empty primary workflow: {}'.
                format(pruned_primary))

        pruned_secondary = []
        for workflow in self.secondary:
            workflow_name = workflow['name']

            if workflow_name in inaccessible_workflows:
                logger.debug('Skipping inaccessible workflow %s',
                             workflow_name)
                continue

            pruned_workflow = self._prune_workflow(
                workflow=workflow,
                is_primary=False,
                graph=keyed_graphs[workflow_name],
                prune_nodes=planned_prune_nodes)

            # Fifth step, discard empty workflows
            if self._workflow_is_empty(pruned_workflow):
                logger.info(
                    'Pruning operation produced an empty sub-workflow: %s',
                    pruned_workflow['name'])
            else:
                pruned_secondary.append(pruned_workflow)

        logger.debug('pruned primary workflow: %s', pruned_primary)
        logger.debug('pruned secondary workflows: %s', pruned_secondary)

        return (pruned_primary, pruned_secondary)
Example #27
    def resolve_properties(self,
                           execution_context,
                           default_task_args=None,
                           base_operator_loader=None,
                           preprocessor_loader=None):
        """ Get the properties / arguments for the operator, and split them
            according to their source.  Specifically, properties are provided
            to the operator by the DAG config file, the resources
            available in the operator's context, any task defaults specified
            in the primary DAG, and the schema defaults, in that order of
            precedence.

            Once the properties are all resolved, this method then validates
            all of the resolved arguments against the task's schema.

            :param execution_context: the context in which this node is executed,
                specifically containing the available resources and the node
                that referred to this node, if any
            :type execution_context: boundary_layer.containers.ExecutionContext
            :param default_task_args: the default task args defined in the
                DAG
            :type default_task_args: dict
            :param base_operator_loader: A method that retrieves typed operators,
                equivalent to a Registry.get method
            :type base_operator_loader: callable
            :param preprocessor_loader: A method that retrieves typed preprocessors,
                equivalent to a Registry.get method
            :type preprocessor_loader: callable

            :returns: a mapping of property source to property key/value pairs
            :rtype: dict<dict<string, any>>
        """
        schema = self.get_schema(base_operator_loader)
        schema_properties = frozenset(schema.get('properties', {}).keys())

        self.set_default_task_args(default_task_args)

        (sources, property_values) = self._get_property_sources_and_values(
            schema_properties, execution_context)

        validated = validator.validate_and_fill_defaults(item=property_values,
                                                         schema=schema)

        for key in validated:
            if key in property_values:
                continue

            sources.schema.add(key)

        logger.debug('%s: validated partitioned properties: %s', self.name,
                     sources)

        preprocessors = self._load_preprocessors(base_operator_loader,
                                                 preprocessor_loader)

        self._preprocessor_imports = {
            pp_name: pp.imports()
            for (pp_name, pp) in six.iteritems(preprocessors)
        }

        preprocessed_values = self._apply_preprocessors(
            args=validated, preprocessors=preprocessors)

        if self._resolved_properties:
            if preprocessed_values != self._resolved_properties.values:
                raise Exception(
                    'resolve_properties() was already called for operator {}, '
                    'and different values were computed this time!  Found: {}, '
                    'expected: {}.  This was probably caused by repeated '
                    'references to a sub-dag or generator using different resource '
                    'contexts.  This is not presently supported!'.format(
                        self, preprocessed_values,
                        self._resolved_properties.values))
            else:
                logger.warning(
                    'resolve_properties() was already called for operator %s, '
                    'but no differences in the computed properties were found.',
                    self)

        self._resolved_properties = ResolvedProperties(
            sources=sources, values=preprocessed_values)

        return self._resolved_properties
Example #28
    def _get_property_sources_and_values(self, schema_properties,
                                         execution_context):
        """ For the provided set of properties, determine the values, and the
            sources of those values, for the current node.

            Value sources include:
                 - default task args defined in the DAG
                 - available resources
                 - properties defined in the DAG
                 - fixed values (set below)
                 - global defaults (set below)

            Note that this method does not validate these values against the
            schema, and therefore does not include any values that could be
            derived from the schema's default settings.

            :param schema_properties: the list of property names that are
                applicable to this node
            :type schema_properties: list<str>
            :param execution_context: the execution context
            :type execution_context: boundary_layer.containers.ExecutionContext
            :returns: sources and values as a 2-tuple
            :rtype: (PropertySources, dict)
        """

        sources = PropertySources(dag=set(),
                                  default_task_args=set(),
                                  resources=set(),
                                  schema=set(),
                                  global_defaults=set(),
                                  fixed_args=set())

        property_values = {}

        resource_args = self._get_resource_args(execution_context)

        global_defaults = self._get_global_defaults(execution_context)

        fixed_args = self._get_fixed_args()

        # make sure that the user has not tried to specify values for any
        # fixed args; this prevents the user from trying to attach operators
        # to a different DAG, for instance (which does not make sense because
        # there is only one DAG)
        invalid_properties = [
            property_name for property_name in fixed_args
            if property_name in self.properties
        ]

        if invalid_properties:
            raise Exception(
                'Illegal properties `{}` provided for operator `{}`: these '
                'properties are assigned fixed values by boundary-layer that '
                'cannot be overridden'.format('`, `'.join(invalid_properties),
                                              self))

        for property_name in schema_properties:
            if property_name in fixed_args:
                # Check fixed args first, because we do not allow these to be
                # set by the user
                value = fixed_args[property_name]
                logger.debug(
                    '%s: Inserting value `%s` for argument `%s` from fixed_args',
                    self.name, value, property_name)

                property_values[property_name] = value
                sources.fixed_args.add(property_name)
                continue

            if property_name in self.properties:
                logger.debug('%s: Property `%s` found in user-props',
                             self.name, property_name)

                property_values[property_name] = self.properties[property_name]
                sources.dag.add(property_name)
                continue

            resource_hits = resource_args.get(property_name, {})

            if len(resource_hits) > 1:
                raise ValueError(
                    'Error in operator {}: Multiple available resources '
                    'provide the argument {}: {}. Please specify a value or '
                    'limit the resource scope'.format(self.name, property_name,
                                                      resource_hits))

            if len(resource_hits) == 1:
                (resource_name, value) = resource_hits.popitem()

                logger.debug(
                    '%s: Inserting value `%s` for argument `%s` from resource `%s`',
                    self.name, value, property_name, resource_name)

                property_values[property_name] = value
                sources.resources.add(property_name)
                continue

            if property_name in self._default_task_args:
                value = self._default_task_args[property_name]
                logger.debug(
                    '%s: Inserting value `%s` for argument `%s` from default_task_args',
                    self.name, value, property_name)

                property_values[property_name] = value
                sources.default_task_args.add(property_name)
                continue

            if property_name in global_defaults:
                value = global_defaults[property_name]
                logger.debug(
                    '%s: Inserting value `%s` for argument `%s` from global defaults',
                    self.name, value, property_name)
                property_values[property_name] = value
                sources.global_defaults.add(property_name)
                continue

            logger.debug(
                '%s: No resources or defaults available for property `%s`',
                self.name, property_name)

        return (sources, property_values)
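
The loop above is a first-match-wins precedence chain. A condensed sketch of the same resolution order, with hypothetical names and resource lookups flattened to plain dicts:

    def resolve(name, fixed_args, user_props, resource_args,
                default_task_args, global_defaults):
        # Earlier sources win; fixed args can never be overridden
        for source in (fixed_args, user_props, resource_args,
                       default_task_args, global_defaults):
            if name in source:
                return source[name]
        return None

    print(resolve('project_id',
                  fixed_args={},
                  user_props={'project_id': 'from-dag'},
                  resource_args={},
                  default_task_args={'project_id': 'from-defaults'},
                  global_defaults={}))  # from-dag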