Пример #1
0
def _get_pipeline_subset_def(pipeline_def, solids_to_execute):
    """
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solids_to_execute.
    """

    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)

    for solid_name in solids_to_execute:
        if not pipeline_def.has_solid_named(solid_name):
            raise DagsterInvalidSubsetError(
                "Pipeline {pipeline_name} has no solid named {name}.".format(
                    pipeline_name=pipeline_def.name, name=solid_name
                ),
            )

    solids = list(map(pipeline_def.solid_named, solids_to_execute))
    deps = {_dep_key_of(solid): {} for solid in solids}

    for solid in solids:
        for input_handle in solid.input_handles():
            if pipeline_def.dependency_structure.has_singular_dep(input_handle):
                output_handle = pipeline_def.dependency_structure.get_singular_dep(input_handle)
                if output_handle.solid.name in solids_to_execute:
                    deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
            elif pipeline_def.dependency_structure.has_multi_deps(input_handle):
                output_handles = pipeline_def.dependency_structure.get_multi_deps(input_handle)
                deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition(
                    [
                        DependencyDefinition(
                            solid=output_handle.solid.name, output=output_handle.output_def.name
                        )
                        for output_handle in output_handles
                        if output_handle.solid.name in solids_to_execute
                    ]
                )

    try:
        sub_pipeline_def = PipelineSubsetDefinition(
            name=pipeline_def.name,  # should we change the name for subsetted pipeline?
            solid_defs=list({solid.definition for solid in solids}),
            mode_defs=pipeline_def.mode_definitions,
            dependencies=deps,
            _parent_pipeline_def=pipeline_def,
            tags=pipeline_def.tags,
            hook_defs=pipeline_def.hook_defs,
        )

        return sub_pipeline_def
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(solids_to_execute)} for pipeline "
            f"{pipeline_def.name} results in an invalid pipeline"
        ) from exc
Пример #2
0
 def _resolve_solid_selection(self, solid_selection):
     # resolve a list of solid selection queries to a frozenset of qualified solid names
     # e.g. ['foo_solid+'] to {'foo_solid', 'bar_solid'}
     check.list_param(solid_selection, "solid_selection", of_type=str)
     solids_to_execute = parse_solid_selection(self.get_definition(),
                                               solid_selection)
     if len(solids_to_execute) == 0:
         raise DagsterInvalidSubsetError(
             "No qualified solids to execute found for solid_selection={requested}"
             .format(requested=solid_selection))
     return solids_to_execute
Пример #3
0
def parse_solid_selection(pipeline_def, solid_selection):
    """Take pipeline definition and a list of solid selection queries (inlcuding names of solid
        invocations. See syntax examples below) and return a set of the qualified solid names.

    It currently only supports top-level solids.

    Query syntax examples:
    - "some_solid": select "some_solid" itself
    - "*some_solid": select "some_solid" and all ancestors (upstream dependencies)
    - "some_solid*": select "some_solid" and all descendants (downstream dependencies)
    - "*some_solid*": select "some_solid" and all of its ancestors and descendants
    - "+some_solid": select "some_solid" and its ancestors at 1 level up
    - "some_solid+++": select "some_solid" and its descendants within 3 levels down

    Note:
    - If one of the query clauses is invalid, we will skip that one and continue to parse the valid
        ones.

    Args:
        pipeline_def (PipelineDefinition): the pipeline to execute.
        solid_selection (List[str]): a list of the solid selection queries (including single solid
            names) to execute.

    Returns:
        FrozenSet[str]: a frozenset of qualified deduplicated solid names, empty if no qualified
            subset selected.
    """
    check.list_param(solid_selection, "solid_selection", of_type=str)

    # special case: select all
    if len(solid_selection) == 1 and solid_selection[0] == "*":
        return frozenset(pipeline_def.graph.node_names())

    graph = generate_dep_graph(pipeline_def)
    solids_set = set()

    # loop over clauses
    for clause in solid_selection:
        subset = clause_to_subset(graph, clause)
        if len(subset) == 0:
            raise DagsterInvalidSubsetError(
                "No qualified {node_type} to execute found for {selection_type}={requested}"
                .format(
                    requested=solid_selection,
                    node_type="ops" if pipeline_def.is_job else "solids",
                    selection_type="op_selection"
                    if pipeline_def.is_job else "solid_selection",
                ))
        solids_set.update(subset)

    return frozenset(solids_set)
Пример #4
0
 def _resolve_solid_selection(self, solid_selection):
     # resolve a list of solid selection queries to a frozenset of qualified solid names
     # e.g. ['foo_solid+'] to {'foo_solid', 'bar_solid'}
     check.list_param(solid_selection, "solid_selection", of_type=str)
     solids_to_execute = parse_solid_selection(self.get_definition(), solid_selection)
     if len(solids_to_execute) == 0:
         node_type = "ops" if self._pipeline_def.is_job else "solids"
         selection_type = "op_selection" if self._pipeline_def.is_job else "solid_selection"
         raise DagsterInvalidSubsetError(
             "No qualified {node_type} to execute found for {selection_type}={requested}".format(
                 node_type=node_type, requested=solid_selection, selection_type=selection_type
             )
         )
     return solids_to_execute
Пример #5
0
def parse_step_selection(step_deps, step_selection):
    """Take the dependency dictionary generated while building execution plan and a list of step key
     selection queries and return a set of the qualified step keys.

    It currently only supports top-level solids.

    Args:
        step_deps (Dict[str, Set[str]]): a dictionary of execution step dependency where the key is
            a step key and the value is a set of direct upstream dependency of the step.
        step_selection (List[str]): a list of the step key selection queries (including single
            step key) to execute.

    Returns:
        FrozenSet[str]: a frozenset of qualified deduplicated solid names, empty if no qualified
            subset selected.
    """
    check.list_param(step_selection, "step_selection", of_type=str)

    # reverse step_deps to get the downstream_deps
    # make sure we have all items as keys, including the ones without downstream dependencies
    downstream_deps = defaultdict(set, {k: set() for k in step_deps.keys()})
    for downstream_key, upstream_keys in step_deps.items():
        for step_key in upstream_keys:
            downstream_deps[step_key].add(downstream_key)

    # generate dep graph
    graph = {"upstream": step_deps, "downstream": downstream_deps}
    steps_set = set()

    step_keys = parse_items_from_selection(step_selection)
    invalid_keys = [key for key in step_keys if key not in step_deps]
    if invalid_keys:
        raise DagsterExecutionStepNotFoundError(
            f"Step selection refers to unknown step{'s' if len(invalid_keys)> 1 else ''}: {', '.join(invalid_keys)}",
            step_keys=invalid_keys,
        )

    # loop over clauses
    for clause in step_selection:
        subset = clause_to_subset(graph, clause)
        if len(subset) == 0:
            raise DagsterInvalidSubsetError(
                "No qualified steps to execute found for step_selection={requested}"
                .format(requested=step_selection), )
        steps_set.update(subset)

    return frozenset(steps_set)
Пример #6
0
def get_subselected_graph_definition(
    graph: GraphDefinition,
    resolved_op_selection_dict: Dict,
    parent_handle: Optional[NodeHandle] = None,
) -> SubselectedGraphDefinition:
    deps: Dict[
        Union[str, NodeInvocation],
        Dict[str, IDependencyDefinition],
    ] = {}

    selected_nodes: List[Tuple[str, NodeDefinition]] = []

    for node in graph.solids_in_topological_order:
        node_handle = NodeHandle(node.name, parent=parent_handle)
        # skip if the node isn't selected
        if node.name not in resolved_op_selection_dict:
            continue

        # rebuild graph if any nodes inside the graph are selected
        if node.is_graph and resolved_op_selection_dict[node.name] is not LeafNodeSelection:
            definition = get_subselected_graph_definition(
                node.definition,
                resolved_op_selection_dict[node.name],
                parent_handle=node_handle,
            )
        # use definition if the node as a whole is selected. this includes selecting the entire graph
        else:
            definition = node.definition
        selected_nodes.append((node.name, definition))

        # build dependencies for the node. we do it for both cases because nested graphs can have
        # inputs and outputs too
        deps[_dep_key_of(node)] = {}
        for input_handle in node.input_handles():
            if graph.dependency_structure.has_direct_dep(input_handle):
                output_handle = graph.dependency_structure.get_direct_dep(input_handle)
                if output_handle.solid.name in resolved_op_selection_dict:
                    deps[_dep_key_of(node)][input_handle.input_def.name] = DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
            elif graph.dependency_structure.has_dynamic_fan_in_dep(input_handle):
                output_handle = graph.dependency_structure.get_dynamic_fan_in_dep(input_handle)
                if output_handle.solid.name in resolved_op_selection_dict:
                    deps[_dep_key_of(node)][
                        input_handle.input_def.name
                    ] = DynamicCollectDependencyDefinition(
                        solid_name=output_handle.solid.name,
                        output_name=output_handle.output_def.name,
                    )
            elif graph.dependency_structure.has_fan_in_deps(input_handle):
                output_handles = graph.dependency_structure.get_fan_in_deps(input_handle)
                multi_dependencies = [
                    DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
                    for output_handle in output_handles
                    if (
                        isinstance(output_handle, SolidOutputHandle)
                        and output_handle.solid.name in resolved_op_selection_dict
                    )
                ]
                deps[_dep_key_of(node)][input_handle.input_def.name] = MultiDependencyDefinition(
                    cast(
                        List[Union[DependencyDefinition, Type[MappedInputPlaceholder]]],
                        multi_dependencies,
                    )
                )
            # else input is unconnected

    # filter out unselected input/output mapping
    new_input_mappings = list(
        filter(
            lambda input_mapping: input_mapping.maps_to.solid_name
            in [name for name, _ in selected_nodes],
            graph._input_mappings,  # pylint: disable=protected-access
        )
    )
    new_output_mappings = list(
        filter(
            lambda output_mapping: output_mapping.maps_from.solid_name
            in [name for name, _ in selected_nodes],
            graph._output_mappings,  # pylint: disable=protected-access
        )
    )

    try:
        return SubselectedGraphDefinition(
            parent_graph_def=graph,
            dependencies=deps,
            node_defs=[definition for _, definition in selected_nodes],
            input_mappings=new_input_mappings,
            output_mappings=new_output_mappings,
        )
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(resolved_op_selection_dict)} for graph "
            f"{graph.name} results in an invalid graph."
        ) from exc
Пример #7
0
def _get_pipeline_subset_def(
    pipeline_def: PipelineDefinition,
    solids_to_execute: AbstractSet[str],
) -> "PipelineSubsetDefinition":
    """
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solids_to_execute.
    """

    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.set_param(solids_to_execute, "solids_to_execute", of_type=str)
    graph = pipeline_def.graph
    for solid_name in solids_to_execute:
        if not graph.has_solid_named(solid_name):
            raise DagsterInvalidSubsetError(
                "{target_type} {pipeline_name} has no {node_type} named {name}."
                .format(
                    target_type=pipeline_def.target_type,
                    pipeline_name=pipeline_def.name,
                    name=solid_name,
                    node_type="ops" if pipeline_def.is_job else "solids",
                ), )

    # go in topo order to ensure deps dict is ordered
    solids = list(
        filter(lambda solid: solid.name in solids_to_execute,
               graph.solids_in_topological_order))

    deps: Dict[Union[str, NodeInvocation], Dict[str,
                                                IDependencyDefinition], ] = {
                                                    _dep_key_of(solid): {}
                                                    for solid in solids
                                                }

    for solid in solids:
        for input_handle in solid.input_handles():
            if graph.dependency_structure.has_direct_dep(input_handle):
                output_handle = pipeline_def.dependency_structure.get_direct_dep(
                    input_handle)
                if output_handle.solid.name in solids_to_execute:
                    deps[_dep_key_of(solid)][
                        input_handle.input_def.name] = DependencyDefinition(
                            solid=output_handle.solid.name,
                            output=output_handle.output_def.name)
            elif graph.dependency_structure.has_dynamic_fan_in_dep(
                    input_handle):
                output_handle = graph.dependency_structure.get_dynamic_fan_in_dep(
                    input_handle)
                if output_handle.solid.name in solids_to_execute:
                    deps[_dep_key_of(
                        solid)][input_handle.input_def.
                                name] = DynamicCollectDependencyDefinition(
                                    solid_name=output_handle.solid.name,
                                    output_name=output_handle.output_def.name,
                                )
            elif graph.dependency_structure.has_fan_in_deps(input_handle):
                output_handles = graph.dependency_structure.get_fan_in_deps(
                    input_handle)
                deps[_dep_key_of(solid)][
                    input_handle.input_def.name] = MultiDependencyDefinition([
                        DependencyDefinition(
                            solid=output_handle.solid.name,
                            output=output_handle.output_def.name)
                        for output_handle in output_handles
                        if output_handle.solid.name in solids_to_execute
                    ])
            # else input is unconnected

    try:
        sub_pipeline_def = PipelineSubsetDefinition(
            name=pipeline_def.
            name,  # should we change the name for subsetted pipeline?
            solid_defs=list({solid.definition
                             for solid in solids}),
            mode_defs=pipeline_def.mode_definitions,
            dependencies=deps,
            _parent_pipeline_def=pipeline_def,
            tags=pipeline_def.tags,
            hook_defs=pipeline_def.hook_defs,
        )

        return sub_pipeline_def
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(solids_to_execute)} for {pipeline_def.target_type} "
            f"{pipeline_def.name} results in an invalid {pipeline_def.target_type}"
        ) from exc