def _get_pipeline_subset_def(pipeline_def, solids_to_execute):
    """
    Construct a sub-pipeline of ``pipeline_def`` restricted to ``solids_to_execute``.

    Dependencies are rebuilt from the parent's dependency structure. A singular
    dependency is kept only when its upstream solid is also selected; a multi
    dependency is always recorded, with unselected upstream outputs filtered out.

    Args:
        pipeline_def (PipelineDefinition): the pipeline to take the subset from.
        solids_to_execute (Set[str]): names of the solids to keep.

    Returns:
        PipelineSubsetDefinition: the subset, built with the parent's name, mode
        definitions, tags and hook definitions.

    Raises:
        DagsterInvalidSubsetError: if a requested name is not a solid of the pipeline,
            or if building the subset raises DagsterInvalidDefinitionError.
    """
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)

    # fail on the first requested name the pipeline does not know about
    for requested_name in solids_to_execute:
        if pipeline_def.has_solid_named(requested_name):
            continue
        raise DagsterInvalidSubsetError(
            "Pipeline {pipeline_name} has no solid named {name}.".format(
                pipeline_name=pipeline_def.name, name=requested_name
            ),
        )

    selected_solids = [pipeline_def.solid_named(name) for name in solids_to_execute]
    subset_deps = {_dep_key_of(selected): {} for selected in selected_solids}
    dep_structure = pipeline_def.dependency_structure

    for selected in selected_solids:
        input_deps = subset_deps[_dep_key_of(selected)]
        for input_handle in selected.input_handles():
            input_name = input_handle.input_def.name
            if dep_structure.has_singular_dep(input_handle):
                upstream = dep_structure.get_singular_dep(input_handle)
                # drop the edge when its upstream solid is outside the selection
                if upstream.solid.name in solids_to_execute:
                    input_deps[input_name] = DependencyDefinition(
                        solid=upstream.solid.name, output=upstream.output_def.name
                    )
            elif dep_structure.has_multi_deps(input_handle):
                kept_upstreams = [
                    DependencyDefinition(
                        solid=upstream.solid.name, output=upstream.output_def.name
                    )
                    for upstream in dep_structure.get_multi_deps(input_handle)
                    if upstream.solid.name in solids_to_execute
                ]
                input_deps[input_name] = MultiDependencyDefinition(kept_upstreams)

    try:
        return PipelineSubsetDefinition(
            name=pipeline_def.name,  # should we change the name for subsetted pipeline?
            solid_defs=list({selected.definition for selected in selected_solids}),
            mode_defs=pipeline_def.mode_definitions,
            dependencies=subset_deps,
            _parent_pipeline_def=pipeline_def,
            tags=pipeline_def.tags,
            hook_defs=pipeline_def.hook_defs,
        )
    except DagsterInvalidDefinitionError as exc:
        # A subset can leave an input unsatisfied with no way to load it from config.
        # Surface that as a DagsterInvalidSubsetError instead of the underlying
        # DagsterInvalidDefinitionError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(solids_to_execute)} for pipeline "
            f"{pipeline_def.name} results in an invalid pipeline"
        ) from exc
def _resolve_solid_selection(self, solid_selection):
    """Resolve solid selection queries to the frozenset of solid names they select.

    For example, ``['foo_solid+']`` may resolve to ``{'foo_solid', 'bar_solid'}``.

    Args:
        solid_selection (List[str]): the solid selection queries to resolve.

    Returns:
        The non-empty result of ``parse_solid_selection`` for this object's definition.

    Raises:
        DagsterInvalidSubsetError: if the queries select no solids.
    """
    check.list_param(solid_selection, "solid_selection", of_type=str)

    resolved = parse_solid_selection(self.get_definition(), solid_selection)
    if resolved:
        return resolved

    raise DagsterInvalidSubsetError(
        "No qualified solids to execute found for solid_selection={requested}".format(
            requested=solid_selection
        )
    )
def parse_solid_selection(pipeline_def, solid_selection):
    """Resolve a list of solid selection queries to the set of selected solid names.

    Each query is either the name of a solid invocation or a name decorated with
    traversal markers. Only top-level solids are supported (per the original
    documentation; this is not checked in this function).

    Query syntax examples:
    - "some_solid": select "some_solid" itself
    - "*some_solid": select "some_solid" and all ancestors (upstream dependencies)
    - "some_solid*": select "some_solid" and all descendants (downstream dependencies)
    - "*some_solid*": select "some_solid" and all of its ancestors and descendants
    - "+some_solid": select "some_solid" and its ancestors at 1 level up
    - "some_solid+++": select "some_solid" and its descendants within 3 levels down

    Note:
    - A selection consisting of the single query "*" selects every node of the
      pipeline's graph.
    - A query that resolves to no solids is not skipped: it aborts the whole parse
      with a DagsterInvalidSubsetError.

    Args:
        pipeline_def (PipelineDefinition): the pipeline to execute.
        solid_selection (List[str]): a list of the solid selection queries (including single
            solid names) to execute.

    Returns:
        FrozenSet[str]: a frozenset of qualified deduplicated solid names.

    Raises:
        DagsterInvalidSubsetError: if any query resolves to an empty subset.
    """
    check.list_param(solid_selection, "solid_selection", of_type=str)

    # special case: select all
    if solid_selection == ["*"]:
        return frozenset(pipeline_def.graph.node_names())

    dep_graph = generate_dep_graph(pipeline_def)
    selected_names = set()

    for query in solid_selection:
        matched = clause_to_subset(dep_graph, query)
        if not matched:
            # wording depends on whether the target is a job (ops) or a pipeline (solids)
            raise DagsterInvalidSubsetError(
                "No qualified {node_type} to execute found for {selection_type}={requested}".format(
                    requested=solid_selection,
                    node_type="ops" if pipeline_def.is_job else "solids",
                    selection_type="op_selection" if pipeline_def.is_job else "solid_selection",
                )
            )
        selected_names.update(matched)

    return frozenset(selected_names)
def _resolve_solid_selection(self, solid_selection):
    """Resolve solid selection queries to the frozenset of solid names they select.

    For example, ``['foo_solid+']`` may resolve to ``{'foo_solid', 'bar_solid'}``.

    Args:
        solid_selection (List[str]): the solid selection queries to resolve.

    Returns:
        The non-empty result of ``parse_solid_selection`` for this object's definition.

    Raises:
        DagsterInvalidSubsetError: if the queries select nothing. The message uses
            op/op_selection wording when ``self._pipeline_def.is_job`` is true and
            solid/solid_selection wording otherwise.
    """
    check.list_param(solid_selection, "solid_selection", of_type=str)

    resolved = parse_solid_selection(self.get_definition(), solid_selection)
    if resolved:
        return resolved

    is_job = self._pipeline_def.is_job
    raise DagsterInvalidSubsetError(
        "No qualified {node_type} to execute found for {selection_type}={requested}".format(
            node_type="ops" if is_job else "solids",
            requested=solid_selection,
            selection_type="op_selection" if is_job else "solid_selection",
        )
    )
def parse_step_selection(step_deps, step_selection):
    """Resolve a list of step key selection queries to the set of selected step keys.

    The upstream dependency dictionary produced while building the execution plan is
    inverted to obtain downstream dependencies, and each query is then resolved
    against the resulting upstream/downstream graph.

    Args:
        step_deps (Dict[str, Set[str]]): a dictionary of execution step dependency where the key
            is a step key and the value is a set of direct upstream dependency of the step.
        step_selection (List[str]): a list of the step key selection queries (including single
            step key) to execute.

    Returns:
        FrozenSet[str]: a frozenset of qualified deduplicated step keys.

    Raises:
        DagsterExecutionStepNotFoundError: if the selection names a step key that is not
            a key of ``step_deps``.
        DagsterInvalidSubsetError: if any query resolves to an empty subset.
    """
    check.list_param(step_selection, "step_selection", of_type=str)

    # Invert step_deps. Every step is seeded with an empty set so that steps with
    # nothing downstream still appear as keys.
    downstream_deps = defaultdict(set, ((step_key, set()) for step_key in step_deps))
    for step_key, upstream_keys in step_deps.items():
        for upstream_key in upstream_keys:
            downstream_deps[upstream_key].add(step_key)

    dep_graph = {"upstream": step_deps, "downstream": downstream_deps}
    selected_keys = set()

    # reject unknown step keys before resolving any query
    requested_keys = parse_items_from_selection(step_selection)
    unknown_keys = [key for key in requested_keys if key not in step_deps]
    if unknown_keys:
        plural = "s" if len(unknown_keys) > 1 else ""
        raise DagsterExecutionStepNotFoundError(
            f"Step selection refers to unknown step{plural}: {', '.join(unknown_keys)}",
            step_keys=unknown_keys,
        )

    for query in step_selection:
        matched = clause_to_subset(dep_graph, query)
        if not matched:
            raise DagsterInvalidSubsetError(
                "No qualified steps to execute found for step_selection={requested}".format(
                    requested=step_selection
                ),
            )
        selected_keys.update(matched)

    return frozenset(selected_keys)
def get_subselected_graph_definition(
    graph: GraphDefinition,
    resolved_op_selection_dict: Dict,
    parent_handle: Optional[NodeHandle] = None,
) -> SubselectedGraphDefinition:
    """Build the subselected version of ``graph`` described by ``resolved_op_selection_dict``.

    Nodes are visited in topological order. Unselected nodes are dropped, and a
    selected graph node whose selection entry is not ``LeafNodeSelection`` is
    rebuilt recursively from that entry. Dependencies, input mappings and output
    mappings are kept only when they refer to selected nodes.

    Args:
        graph: the graph to subselect.
        resolved_op_selection_dict: keyed by the names of the selected nodes of
            ``graph``. For a graph node the value is either ``LeafNodeSelection``
            (the node is taken as a whole) or the nested selection passed to the
            recursive call.
        parent_handle: handle of the node enclosing ``graph``; used as the parent of
            the handles created for nested graph nodes.

    Returns:
        The ``SubselectedGraphDefinition`` built on top of ``graph``.

    Raises:
        DagsterInvalidSubsetError: if constructing the subselected graph raises
            DagsterInvalidDefinitionError.
    """
    deps: Dict[
        Union[str, NodeInvocation],
        Dict[str, IDependencyDefinition],
    ] = {}

    selected_nodes: List[Tuple[str, NodeDefinition]] = []
    dependency_structure = graph.dependency_structure

    for node in graph.solids_in_topological_order:
        # skip if the node isn't selected
        if node.name not in resolved_op_selection_dict:
            continue

        # rebuild graph if any nodes inside the graph are selected
        if node.is_graph and resolved_op_selection_dict[node.name] is not LeafNodeSelection:
            # the handle is only needed as the parent for the nested selection
            definition = get_subselected_graph_definition(
                node.definition,
                resolved_op_selection_dict[node.name],
                parent_handle=NodeHandle(node.name, parent=parent_handle),
            )
        # use definition if the node as a whole is selected. this includes selecting the entire graph
        else:
            definition = node.definition
        selected_nodes.append((node.name, definition))

        # build dependencies for the node. we do it for both cases because nested graphs can have
        # inputs and outputs too
        node_deps: Dict[str, IDependencyDefinition] = {}
        deps[_dep_key_of(node)] = node_deps
        for input_handle in node.input_handles():
            input_name = input_handle.input_def.name
            if dependency_structure.has_direct_dep(input_handle):
                output_handle = dependency_structure.get_direct_dep(input_handle)
                if output_handle.solid.name in resolved_op_selection_dict:
                    node_deps[input_name] = DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
            elif dependency_structure.has_dynamic_fan_in_dep(input_handle):
                output_handle = dependency_structure.get_dynamic_fan_in_dep(input_handle)
                if output_handle.solid.name in resolved_op_selection_dict:
                    node_deps[input_name] = DynamicCollectDependencyDefinition(
                        solid_name=output_handle.solid.name,
                        output_name=output_handle.output_def.name,
                    )
            elif dependency_structure.has_fan_in_deps(input_handle):
                output_handles = dependency_structure.get_fan_in_deps(input_handle)
                # entries that are not SolidOutputHandle instances are filtered out
                # together with outputs of unselected nodes
                multi_dependencies = [
                    DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
                    for output_handle in output_handles
                    if (
                        isinstance(output_handle, SolidOutputHandle)
                        and output_handle.solid.name in resolved_op_selection_dict
                    )
                ]
                node_deps[input_name] = MultiDependencyDefinition(
                    cast(
                        List[Union[DependencyDefinition, Type[MappedInputPlaceholder]]],
                        multi_dependencies,
                    )
                )
            # else input is unconnected

    # filter out unselected input/output mapping; the name set is built once
    # instead of once per mapping
    selected_node_names = {name for name, _ in selected_nodes}
    new_input_mappings = [
        input_mapping
        for input_mapping in graph._input_mappings  # pylint: disable=protected-access
        if input_mapping.maps_to.solid_name in selected_node_names
    ]
    new_output_mappings = [
        output_mapping
        for output_mapping in graph._output_mappings  # pylint: disable=protected-access
        if output_mapping.maps_from.solid_name in selected_node_names
    ]

    try:
        return SubselectedGraphDefinition(
            parent_graph_def=graph,
            dependencies=deps,
            node_defs=[definition for _, definition in selected_nodes],
            input_mappings=new_input_mappings,
            output_mappings=new_output_mappings,
        )
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(resolved_op_selection_dict)} for graph "
            f"{graph.name} results in an invalid graph."
        ) from exc
def _get_pipeline_subset_def(
    pipeline_def: PipelineDefinition,
    solids_to_execute: AbstractSet[str],
) -> "PipelineSubsetDefinition":
    """
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solids_to_execute.

    Dependencies are rebuilt from the dependency structure of the pipeline's graph.
    Direct and dynamic fan-in dependencies are kept only when their upstream solid
    is also selected; a fan-in dependency is always recorded, with unselected
    upstream outputs filtered out.

    Args:
        pipeline_def: the pipeline to take the subset from.
        solids_to_execute: names of the solids to keep.

    Returns:
        The subset, built with the parent's name, mode definitions, tags and hook
        definitions.

    Raises:
        DagsterInvalidSubsetError: if a requested name is not a solid of the pipeline's
            graph, or if building the subset raises DagsterInvalidDefinitionError.
    """
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)
    graph = pipeline_def.graph
    for solid_name in solids_to_execute:
        if not graph.has_solid_named(solid_name):
            raise DagsterInvalidSubsetError(
                "{target_type} {pipeline_name} has no {node_type} named {name}.".format(
                    target_type=pipeline_def.target_type,
                    pipeline_name=pipeline_def.name,
                    name=solid_name,
                    node_type="ops" if pipeline_def.is_job else "solids",
                ),
            )

    # go in topo order to ensure deps dict is ordered
    solids = [
        solid for solid in graph.solids_in_topological_order if solid.name in solids_to_execute
    ]

    deps: Dict[
        Union[str, NodeInvocation],
        Dict[str, IDependencyDefinition],
    ] = {_dep_key_of(solid): {} for solid in solids}

    # Every dependency query and lookup goes through this one structure, so the
    # has_* check and the matching get_* call can never disagree.
    dependency_structure = graph.dependency_structure

    for solid in solids:
        solid_deps = deps[_dep_key_of(solid)]
        for input_handle in solid.input_handles():
            input_name = input_handle.input_def.name
            if dependency_structure.has_direct_dep(input_handle):
                output_handle = dependency_structure.get_direct_dep(input_handle)
                if output_handle.solid.name in solids_to_execute:
                    solid_deps[input_name] = DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
            elif dependency_structure.has_dynamic_fan_in_dep(input_handle):
                output_handle = dependency_structure.get_dynamic_fan_in_dep(input_handle)
                if output_handle.solid.name in solids_to_execute:
                    solid_deps[input_name] = DynamicCollectDependencyDefinition(
                        solid_name=output_handle.solid.name,
                        output_name=output_handle.output_def.name,
                    )
            elif dependency_structure.has_fan_in_deps(input_handle):
                output_handles = dependency_structure.get_fan_in_deps(input_handle)
                solid_deps[input_name] = MultiDependencyDefinition(
                    [
                        DependencyDefinition(
                            solid=output_handle.solid.name, output=output_handle.output_def.name
                        )
                        for output_handle in output_handles
                        if output_handle.solid.name in solids_to_execute
                    ]
                )
            # else input is unconnected

    try:
        sub_pipeline_def = PipelineSubsetDefinition(
            name=pipeline_def.name,  # should we change the name for subsetted pipeline?
            solid_defs=list({solid.definition for solid in solids}),
            mode_defs=pipeline_def.mode_definitions,
            dependencies=deps,
            _parent_pipeline_def=pipeline_def,
            tags=pipeline_def.tags,
            hook_defs=pipeline_def.hook_defs,
        )

        return sub_pipeline_def
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(solids_to_execute)} for {pipeline_def.target_type} "
            f"{pipeline_def.name} results in an invalid {pipeline_def.target_type}"
        ) from exc