def _prune_unused_resources(dag):
    """Return ``dag`` without the resources that nothing requires.

    A resource is considered required when its name appears in the
    ``requires_resources`` list of any entry found under the ``before``,
    ``operators``, ``after``, ``subdags`` or ``generators`` keys of ``dag``.

    :param dag: the dag definition to inspect
    :type dag: dict
    :returns: ``dag`` itself when it defines no resources (missing or empty
        ``resources``); otherwise a shallow copy of ``dag`` whose
        ``resources`` list contains only the required resources, in their
        original order
    :rtype: dict
    """
    defined = dag.get('resources')
    if not defined:
        return dag

    # Gather every resource name referenced by any operator-like entry.
    required_names = set()
    for section in ('before', 'operators', 'after', 'subdags', 'generators'):
        for entry in dag.get(section, []):
            required_names.update(entry.get('requires_resources', []))

    kept = []
    for candidate in defined:
        if candidate['name'] in required_names:
            kept.append(candidate)
        else:
            logger.info(
                'Discarding unused resource %s from dag %s',
                candidate['name'],
                dag['name'])

    # Shallow copy: the input dag and its resource list are left untouched.
    pruned = dag.copy()
    pruned['resources'] = kept
    return pruned
def __init__(self, load_package_plugins=True, plugins=None):
    """Build the list of plugins held by this instance.

    :param load_package_plugins: when true, whatever
        ``self._load_package_plugins()`` returns is appended after the
        explicitly supplied plugins
    :type load_package_plugins: bool
    :param plugins: optional iterable of plugins to register first; a falsy
        value (``None`` or an empty collection) means "no explicit plugins"
    """
    # Always copy into a fresh list so the caller's collection is never
    # aliased by (or mutated through) ``self._plugins``.
    self._plugins = list(plugins or [])
    if load_package_plugins:
        self._plugins += self._load_package_plugins()

    plugin_names = [plugin.name for plugin in self._plugins]
    logger.info('Loaded plugins %s', ', '.join(plugin_names))
def cluster_config(self):
    """Return the cluster configuration provided by the loaded plugins.

    Every container in ``self._plugin_containers`` is asked for its plugin's
    cluster configuration exactly once. The first truthy configuration, in
    container order, is returned. When several plugins provide one, the
    choice is logged.

    NOTE(review): the log message reports the chosen container's
    ``priority``; presumably ``_plugin_containers`` is already ordered by
    priority -- confirm against the code that builds it.

    :returns: the first truthy value returned by a plugin's
        ``cluster_config()``
    :raises Exception: if no plugin provides a cluster configuration
    """
    # Capture each configuration alongside its container so the selected
    # plugin is not queried a second time for the return value.
    configured = []
    for container in self._plugin_containers:
        config = container.plugin.cluster_config()
        if config:
            configured.append((container, config))

    if not configured:
        raise Exception(
            'No cluster configurations found for oozie parser!')

    chosen_container, chosen_config = configured[0]
    if len(configured) > 1:
        logger.info(
            'Multiple cluster configurations found. Choosing configuration '
            'from plugin `%s`, with priority `%s`',
            chosen_container.name,
            chosen_container.priority)

    return chosen_config
def _unprune_referenced_sub_workflows(self, keep_paths, prune_paths):
    """
    Move pruned paths that pass through a kept node over to the keep pile.

    This method is only called when --only-nodes was specified. It lets a
    user name a sub-workflow referencing node as one of the --only-nodes
    arguments: every pruned path that contains the final node of some keep
    path is promoted to a keep path. Promotion is repeated until no further
    pruned path qualifies, so referring nodes nested inside a promoted
    sub-workflow pull in their own targets as well.

    :param keep_paths: the keep paths computed after partitioning the
        workflow
    :type keep_paths: list<list<(string, string)>>
    :param prune_paths: the prune paths computed after partitioning the
        workflow
    :type prune_paths: list<list<(string, string)>>

    :returns: an updated pair of (keep_paths, prune_paths); the input lists
        themselves are returned when nothing had to be promoted
    :rtype: (list<list<(string, string)>>, list<list<(string, string)>>)
    """
    while True:
        kept_leaves = frozenset([path[-1] for path in keep_paths])
        promoted = frozenset(
            position for (position, path) in enumerate(prune_paths)
            if any(node in kept_leaves for node in path))

        # Fixed point reached: no remaining pruned path touches a kept node.
        if not promoted:
            return (keep_paths, prune_paths)

        for position in promoted:
            leaf = prune_paths[position][-1]
            logger.info(
                "Keeping node %s.%s because it is downstream of an --only-nodes argument",
                leaf[0], leaf[1])

        newly_kept = [prune_paths[position] for position in promoted]
        still_pruned = []
        for (position, path) in enumerate(prune_paths):
            if position not in promoted:
                still_pruned.append(path)

        # Build new lists rather than mutating the caller's inputs.
        keep_paths = keep_paths + newly_kept
        prune_paths = still_pruned
def _prune_paths(self, prune_nodes, keep_nodes, allow_augmented_keep_nodes):
    """
    Carry out the pruning and return the pruned workflows.

    The work happens in stages:

    1. If ``allow_augmented_keep_nodes`` is true, work out whether any
       nodes in the prune pile are needed on paths to nodes in the keep
       pile, and adjust the planned sets accordingly.
    2. Work out whether any kept nodes refer to sub-workflows that are
       entirely in the prune pile, so that those referring nodes are
       pruned too.
    3. Work out which workflows become inaccessible once their referring
       nodes are removed; these are discarded.
    4. Prune the planned nodes out of the primary workflow and out of each
       remaining secondary workflow.
    5. Drop any secondary workflow that ends up empty after pruning.

    :param prune_nodes: paths to all of the nodes we plan to prune
    :type prune_nodes: list<list<(string, string)>>
    :param keep_nodes: paths to all of the nodes we plan to keep
    :type keep_nodes: list<list<(string, string)>>
    :param allow_augmented_keep_nodes: whether to augment the list of
        keep_nodes with any nodes required for access to nodes in the
        keep_nodes list
    :type allow_augmented_keep_nodes: bool

    :returns: a pair (pruned primary workflow, list of pruned non-empty
        secondary workflows)
    :raises Exception: if pruning leaves the primary workflow empty
    """
    # Working sets holding the final node of every path; the helper calls
    # below receive these sets and are expected to adjust them in place.
    to_prune = {path[-1] for path in prune_nodes}
    to_keep = {path[-1] for path in keep_nodes}

    # Stage 1: rescue pruned nodes that kept nodes depend on.
    if allow_augmented_keep_nodes:
        self._augment_keep_nodes_list(keep_nodes, to_prune, to_keep)

    # Stage 2: also prune referrers of fully-deleted sub-workflows.
    self._augment_pruned_sub_workflow_referrers(to_prune, to_keep)

    # Stage 3: sub-workflows no longer reachable from any referring node.
    unreachable = self._find_inaccessible_workflows(to_prune)

    # Stage 4: prune the graphs, primary workflow first.
    logger.debug('going to prune away nodes: %s', to_prune)
    graphs_by_name = self._build_keyed_graph_map()

    pruned_primary = self._prune_workflow(
        workflow=self.primary,
        is_primary=True,
        graph=graphs_by_name[None],  # the primary graph is keyed by None
        prune_nodes=to_prune)
    if self._workflow_is_empty(pruned_primary):
        message = 'Pruning operation produced an empty primary workflow: {}'.format(
            pruned_primary)
        raise Exception(message)

    pruned_secondary = []
    for sub_workflow in self.secondary:
        name = sub_workflow['name']
        if name in unreachable:
            logger.debug('Skipping inaccessible workflow %s', name)
            continue

        pruned = self._prune_workflow(
            workflow=sub_workflow,
            is_primary=False,
            graph=graphs_by_name[name],
            prune_nodes=to_prune)

        # Stage 5: keep only sub-workflows that still have content.
        if not self._workflow_is_empty(pruned):
            pruned_secondary.append(pruned)
            continue
        logger.info(
            'Pruning operation produced an empty sub-workflow: %s',
            pruned['name'])

    logger.debug('pruned primary workflow: %s', pruned_primary)
    logger.debug('pruned secondary workflows: %s', pruned_secondary)
    return (pruned_primary, pruned_secondary)