Example #1
    def subset_for_execution_from_existing_pipeline(self, solids_to_execute):
        # take a frozenset of resolved solid names from an existing pipeline
        # so there's no need to parse the selection
        check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)

        return self._subset_for_execution(solids_to_execute)
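
Every example on this page exercises `check.opt_set_param`. As a rough orientation, here is a minimal sketch of the helper's behavior as inferred from the call sites above; the real implementation lives in `dagster.check`, raises its own `CheckError` types, and handles frozensets and other details, so treat this only as an approximation.

def opt_set_param(obj, param_name, of_type=None):
    # None is normalized to an empty set so callers can iterate or .add() safely
    if obj is None:
        return set()
    if not isinstance(obj, (set, frozenset)):
        raise TypeError("Param {!r} must be a set, got {!r}".format(param_name, type(obj)))
    if of_type is not None and not all(isinstance(item, of_type) for item in obj):
        raise TypeError("Members of set param {!r} must be of type {!r}".format(param_name, of_type))
    return obj
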
Example #2
    def __init__(
            self,
            solid_defs,
            name=None,
            description=None,
            dependencies=None,
            mode_defs=None,
            preset_defs=None,
            tags=None,
            hook_defs=None,
            input_mappings=None,
            output_mappings=None,
            config_mapping=None,
            positional_inputs=None,
            _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        if not name:
            warnings.warn(
                "Pipeline must have a name. Names will be required starting in 0.10.0 or later."
            )
            name = _anonymous_pipeline_name()

        # These warnings check truthiness because the arguments are converted to []
        # higher in the stack in the decorator case

        if input_mappings:
            experimental_arg_warning("input_mappings", "PipelineDefinition")

        if output_mappings:
            experimental_arg_warning("output_mappings", "PipelineDefinition")

        if config_mapping is not None:
            experimental_arg_warning("config_mapping", "PipelineDefinition")

        if positional_inputs:
            experimental_arg_warning("positional_inputs", "PipelineDefinition")

        super(PipelineDefinition, self).__init__(
            name=name,
            description=description,
            dependencies=dependencies,
            node_defs=solid_defs,
            tags=check.opt_dict_param(tags, "tags", key_type=str),
            positional_inputs=positional_inputs,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            config_mapping=config_mapping,
        )

        self._current_level_node_defs = solid_defs
        self._tags = validate_tags(tags)

        mode_definitions = check.opt_list_param(mode_defs,
                                                "mode_defs",
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names.").format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dagster_type_dict = construct_dagster_type_dictionary(
            self._current_level_node_defs)

        self._hook_defs = check.opt_set_param(hook_defs,
                                              "hook_defs",
                                              of_type=HookDefinition)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs",
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names.").format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(
            self._mode_definitions,
            self._current_level_node_defs,
            self._dagster_type_dict,
            self._solid_dict,
            self._hook_defs,
        )

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict,
                         self._mode_definitions)

        # Recursively explore all nodes in this pipeline
        self._all_node_defs = _build_all_node_defs(
            self._current_level_node_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition)
        self._cached_run_config_schemas = {}
        self._cached_external_pipeline = None
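
A hedged construction sketch for the legacy constructor above; the solid, mode, and preset names here are hypothetical and not from the source.

from dagster import ModeDefinition, PipelineDefinition, PresetDefinition, solid

@solid
def emit_one(_):
    return 1

pipeline_def = PipelineDefinition(
    solid_defs=[emit_one],
    name="example_pipeline",
    mode_defs=[ModeDefinition(name="default")],
    preset_defs=[PresetDefinition(name="default", mode="default", run_config={})],
    tags={"owner": "data-team"},
)
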
Example #3
 def __init__(self, config_type, func, required_resource_keys):
     self._config_type = check.inst_param(config_type, "config_type",
                                          ConfigType)
     self._func = check.callable_param(func, "func")
     self._required_resource_keys = check.opt_set_param(
         required_resource_keys, "required_resource_keys", of_type=str)
Example #4
def define_dagstermill_op(
    name: str,
    notebook_path: str,
    input_defs: Optional[Sequence[InputDefinition]] = None,
    output_defs: Optional[Sequence[OutputDefinition]] = None,
    config_schema: Optional[Union[Any, Dict[str, Any]]] = None,
    required_resource_keys: Optional[Set[str]] = None,
    output_notebook_name: Optional[str] = None,
    asset_key_prefix: Optional[Union[List[str], str]] = None,
    description: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output
            of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed
            notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always
            created). It allows the downstream solids to access the executed notebook via a file
            object.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs,
                                      "input_defs",
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       "output_defs",
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 "required_resource_keys",
                                                 of_type=str)

    extra_output_defs = []
    if output_notebook_name is not None:
        required_resource_keys.add("output_notebook_io_manager")
        extra_output_defs.append(
            OutputDefinition(name=output_notebook_name,
                             io_manager_key="output_notebook_io_manager"))

    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix,
                                            "asset_key_prefix",
                                            of_type=str)

    default_description = f"This op is backed by the notebook at {notebook_path}"
    description = check.opt_str_param(description,
                                      "description",
                                      default=default_description)

    user_tags = validate_tags(tags)
    if tags is not None:
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    return OpDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_compute(
            "define_dagstermill_op",
            name,
            notebook_path,
            output_notebook_name,
            asset_key_prefix=asset_key_prefix,
        ),
        output_defs=output_defs + extra_output_defs,
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={
            **user_tags,
            **default_tags
        },
    )
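
A hedged usage sketch for the factory above; the notebook path, op name, and resource key are hypothetical, and the injected `output_notebook_name` output relies on an `output_notebook_io_manager` resource being provided on the job, as the code above implies.

from dagstermill import define_dagstermill_op  # assumed import path

clean_data_op = define_dagstermill_op(
    name="clean_data",
    notebook_path="notebooks/clean_data.ipynb",
    output_notebook_name="output_notebook",
    required_resource_keys={"warehouse"},
    asset_key_prefix="analytics",   # a plain str is wrapped into a single-element list
    tags={"team": "data"},
)
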
Example #5
 def __new__(cls, asset_key, partitions=None):
     asset_key = check.inst_param(asset_key, "asset_key", AssetKey)
     partitions = check.opt_set_param(partitions, "partitions", str)
     return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)
Example #6
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        input_hydration_config=None,
        output_materialization_config=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
    ):
        check.opt_str_param(key, 'key')
        check.opt_str_param(name, 'name')

        check.invariant(not (name is None and key is None),
                        'Must set key or name')

        if name is None:
            check.param_invariant(
                bool(key),
                'key',
                'If name is not provided, must provide key.',
            )
            self.key, self.name = key, None
        elif key is None:
            check.param_invariant(
                bool(name),
                'name',
                'If key is not provided, must provide name.',
            )
            self.key, self.name = name, name
        else:
            check.invariant(key and name)
            self.key, self.name = key, name

        self.description = check.opt_str_param(description, 'description')
        self.input_hydration_config = check.opt_inst_param(
            input_hydration_config, 'input_hydration_config',
            InputHydrationConfig)
        self.output_materialization_config = check.opt_inst_param(
            output_materialization_config,
            'output_materialization_config',
            OutputMaterializationConfig,
        )
        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            'serialization_strategy',
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            'required_resource_keys',
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   'type_check_fn')
        _validate_type_check_fn(self._type_check_fn, self.name)

        auto_plugins = check.opt_list_param(auto_plugins,
                                            'auto_plugins',
                                            of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin)
                for auto_plugin_type in auto_plugins),
            'auto_plugins',
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, 'is_builtin')
        check.invariant(
            self.display_name is not None,
            'All types must have a valid display name, got None for key {}'.
            format(key),
        )

        self.kind = check.inst_param(kind, 'kind', DagsterTypeKind)
Example #7
    def __init__(
        self,
        solid_defs,
        name=None,
        description=None,
        dependencies=None,
        mode_defs=None,
        preset_defs=None,
        tags=None,
        hook_defs=None,
        _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        if not name:
            warnings.warn(
                "Pipeline must have a name. Names will be required starting in 0.9.13 or later."
            )
        # name might be <<unnamed>> when constructing pipeline subsets
        elif name != "<<unnamed>>" and not is_valid_name(name):
            check_for_invalid_name_and_warn(name)

        self._name = check.opt_str_param(name, "name") or "<<unnamed>>"

        self._description = check.opt_str_param(description, "description")

        mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs), "solid_defs", of_type=ISolidDefinition
        )
        self._tags = validate_tags(tags)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError(
                    (
                        'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                        "Modes must have unique names."
                    ).format(mode_name=mode_def.name, pipeline_name=self._name)
                )
            seen_modes.add(mode_def.name)

        self._dependencies = validate_dependency_dict(dependencies)

        dependency_structure, solid_dict = create_execution_structure(
            self._current_level_solid_defs, self._dependencies, container_definition=None
        )

        self._solid_dict = solid_dict
        self._dependency_structure = dependency_structure

        # eager toposort solids to detect cycles
        self.solids_in_topological_order = self._solids_in_topological_order()

        self._dagster_type_dict = construct_dagster_type_dictionary(self._current_level_solid_defs)

        self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError(
                    (
                        'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                        "PresetDefinitions must have unique names."
                    ).format(name=preset.name, pipeline_name=self._name)
                )
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    (
                        'PresetDefinition "{name}" in "{pipeline_name}" '
                        'references mode "{mode}" which is not defined.'
                    ).format(name=preset.name, pipeline_name=self._name, mode=preset.mode)
                )
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(
            self._mode_definitions,
            self._current_level_solid_defs,
            self._solid_dict,
            self._hook_defs,
        )

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        self._all_solid_defs = _build_all_solid_defs(self._current_level_solid_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition
        )
        self._cached_run_config_schemas = {}
        self._cached_external_pipeline = None
Example #8
 def __init__(self, solid_def, given_alias=None, tags=None, hook_defs=None):
     self.solid_def = solid_def
     self.given_alias = check.opt_str_param(given_alias, "given_alias")
     self.tags = check.opt_inst_param(tags, "tags", frozentags)
     self.hook_defs = check.opt_set_param(hook_defs, "hook_defs",
                                          HookDefinition)
Example #9
    def __init__(
            self,
            solid_defs: List[NodeDefinition],
            name: str,
            description: Optional[str] = None,
            dependencies: Optional[Dict[Union[str, SolidInvocation],
                                        Dict[str,
                                             IDependencyDefinition]]] = None,
            mode_defs: Optional[List[ModeDefinition]] = None,
            preset_defs: Optional[List[PresetDefinition]] = None,
            tags: Dict[str, Any] = None,
            hook_defs: Optional[AbstractSet[HookDefinition]] = None,
            input_mappings: Optional[List[InputMapping]] = None,
            output_mappings: Optional[List[OutputMapping]] = None,
            config_mapping: Optional[ConfigMapping] = None,
            positional_inputs: List[str] = None,
            _parent_pipeline_def:
        Optional[
            "PipelineDefinition"] = None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        # These warnings check truthiness because the arguments are converted to []
        # higher in the stack in the decorator case

        if input_mappings:
            experimental_arg_warning("input_mappings", "PipelineDefinition")

        if output_mappings:
            experimental_arg_warning("output_mappings", "PipelineDefinition")

        if config_mapping is not None:
            experimental_arg_warning("config_mapping", "PipelineDefinition")

        if positional_inputs:
            experimental_arg_warning("positional_inputs", "PipelineDefinition")

        super(PipelineDefinition, self).__init__(
            name=name,
            description=description,
            dependencies=dependencies,
            node_defs=solid_defs,
            tags=check.opt_dict_param(tags, "tags", key_type=str),
            positional_inputs=positional_inputs,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            config_mapping=config_mapping,
        )

        self._current_level_node_defs = solid_defs
        self._tags = validate_tags(tags)

        mode_definitions = check.opt_list_param(mode_defs,
                                                "mode_defs",
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names.").format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._hook_defs = check.opt_set_param(hook_defs,
                                              "hook_defs",
                                              of_type=HookDefinition)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs",
                                                 PresetDefinition)
        self._preset_dict: Dict[str, PresetDefinition] = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names.").format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        self._resource_requirements = {
            mode_def.name: _checked_resource_reqs_for_mode(
                mode_def,
                self._current_level_node_defs,
                self._dagster_type_dict,
                self._solid_dict,
                self._hook_defs,
            )
            for mode_def in self._mode_definitions
        }

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict,
                         self._mode_definitions)

        # Recursively explore all nodes in this pipeline
        self._all_node_defs = _build_all_node_defs(
            self._current_level_node_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition)
        self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
        self._cached_external_pipeline = None
Example #10
    def to_job(
        self,
        name: Optional[str] = None,
        description: Optional[str] = None,
        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
        config: Union[ConfigMapping, Dict[str, Any],
                      "PartitionedConfig"] = None,
        tags: Optional[Dict[str, Any]] = None,
        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
        executor_def: Optional["ExecutorDefinition"] = None,
        hooks: Optional[AbstractSet[HookDefinition]] = None,
        op_retry_policy: Optional[RetryPolicy] = None,
        version_strategy: Optional[VersionStrategy] = None,
        op_selection: Optional[List[str]] = None,
        partitions_def: Optional["PartitionsDefinition"] = None,
    ) -> "JobDefinition":
        """
        Make this graph into an executable Job by providing the remaining components required for execution.

        Args:
            name (Optional[str]):
                The name for the Job. Defaults to the name of this graph.
            resource_defs (Optional[Dict[str, ResourceDefinition]]):
                Resources that are required by this graph for execution.
                If not defined, `io_manager` will default to filesystem.
            config:
                Describes how the job is parameterized at runtime.

                If no value is provided, then the schema for the job's run config is a standard
                format based on its solids and resources.

                If a dictionary is provided, then it must conform to the standard config schema, and
                it will be used as the job's run config for the job whenever the job is executed.
                The values provided will be viewable and editable in the Dagit playground, so be
                careful with secrets.

                If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is
                determined by the config mapping, and the mapping function should return
                configuration in the standard format to configure the job.

                If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config
                values that can parameterize the job, as well as a function for mapping those
                values to the base config. The values provided will be viewable and editable in the
                Dagit playground, so be careful with secrets.
            tags (Optional[Dict[str, Any]]):
                Arbitrary metadata for any execution of the Job.
                Values that are not strings will be json encoded and must meet the criteria that
                `json.loads(json.dumps(value)) == value`.  These tag values may be overwritten by tag
                values provided at invocation time.
            logger_defs (Optional[Dict[str, LoggerDefinition]]):
                A dictionary of string logger identifiers to their implementations.
            executor_def (Optional[ExecutorDefinition]):
                How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,
                which can be switched between multi-process and in-process modes of execution. The
                default mode of execution is multi-process.
            op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.
                Only used if retry policy is not defined on the op definition or op invocation.
            version_strategy (Optional[VersionStrategy]):
                Defines how each solid (and optionally, resource) in the job can be versioned. If
                provided, memoization will be enabled for this job.
            partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition
                keys that can parameterize the job. If this argument is supplied, the config
                argument can't also be supplied.

        Returns:
            JobDefinition
        """
        from .job_definition import JobDefinition
        from .partition import PartitionedConfig, PartitionsDefinition
        from .executor_definition import ExecutorDefinition, multi_or_in_process_executor

        job_name = check_valid_name(name or self.name)

        tags = check.opt_dict_param(tags, "tags", key_type=str)
        executor_def = check.opt_inst_param(
            executor_def,
            "executor_def",
            ExecutorDefinition,
            default=multi_or_in_process_executor)

        if resource_defs and "io_manager" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            resource_defs_with_defaults = merge_dicts(
                {"io_manager": default_job_io_manager}, resource_defs or {})

        hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)
        op_retry_policy = check.opt_inst_param(op_retry_policy,
                                               "op_retry_policy", RetryPolicy)
        op_selection = check.opt_list_param(op_selection,
                                            "op_selection",
                                            of_type=str)
        presets = []
        config_mapping = None
        partitioned_config = None

        if partitions_def:
            check.inst_param(partitions_def, "partitions_def",
                             PartitionsDefinition)
            check.invariant(
                config is None,
                "Can't supply both the 'config' and 'partitions_def' arguments"
            )
            partitioned_config = PartitionedConfig(partitions_def,
                                                   lambda _: {})

        if isinstance(config, ConfigMapping):
            config_mapping = config
        elif isinstance(config, PartitionedConfig):
            partitioned_config = config
        elif isinstance(config, dict):
            presets = [PresetDefinition(name="default", run_config=config)]
            # Using config mapping here is a trick to make it so that the preset will be used even
            # when no config is supplied for the job.
            config_mapping = _config_mapping_with_default_value(
                self._get_config_schema(resource_defs_with_defaults,
                                        executor_def, logger_defs),
                config,
                job_name,
                self.name,
            )
        elif config is not None:
            check.failed(
                f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "
                f"is an object of type {type(config)}")

        return JobDefinition(
            name=job_name,
            description=description or self.description,
            graph_def=self,
            mode_def=ModeDefinition(
                resource_defs=resource_defs_with_defaults,
                logger_defs=logger_defs,
                executor_defs=[executor_def],
                _config_mapping=config_mapping,
                _partitioned_config=partitioned_config,
            ),
            preset_defs=presets,
            tags=tags,
            hook_defs=hooks,
            version_strategy=version_strategy,
            op_retry_policy=op_retry_policy,
        ).get_job_def_for_op_selection(op_selection)
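
A hedged usage sketch of the dict `config` branch described in the docstring above; the graph, op, and config values are hypothetical, and the `{"ops": ...}` layout assumes the op-based run-config schema (older releases keyed it under `"solids"`).

from dagster import graph, op

@op(config_schema={"greeting": str})
def say_hello(context):
    context.log.info(context.op_config["greeting"])

@graph
def hello_graph():
    say_hello()

hello_job = hello_graph.to_job(
    name="hello_job",
    # a dict here becomes the default run config via the ConfigMapping trick above
    config={"ops": {"say_hello": {"config": {"greeting": "hi"}}}},
    tags={"team": "data"},
)
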
Example #11
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g.,
            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be an
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs,
                                      "input_defs",
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       "output_defs",
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 "required_resource_keys",
                                                 of_type=str)
    if output_notebook is not None:
        required_resource_keys.add("file_manager")
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix,
                                            "asset_key_prefix",
                                            of_type=str)

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name,
                                     notebook_path,
                                     output_notebook,
                                     asset_key_prefix=asset_key_prefix),
        output_defs=output_defs +
        ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
         if output_notebook else []),
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description="This solid is backed by the notebook at {path}".format(
            path=notebook_path),
        tags={
            "notebook_path": notebook_path,
            "kind": "ipynb"
        },
    )
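
A hedged usage sketch; the notebook path and names are hypothetical. With `output_notebook` set, the code above adds a `file_manager` resource requirement and an extra FileHandle-typed output for downstream solids.

from dagstermill import define_dagstermill_solid  # assumed import path

train_model_solid = define_dagstermill_solid(
    name="train_model",
    notebook_path="notebooks/train_model.ipynb",
    output_notebook="output_notebook",         # downstream solids receive a FileHandle
    required_resource_keys={"data_lake"},      # hypothetical resource key
    asset_key_prefix=["ml", "notebooks"],
)
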
Example #12
def _core_resource_initialization_event_generator(
    resource_defs: Dict[str, ResourceDefinition],
    resource_configs: Dict[str, ResourceConfig],
    resource_log_manager: DagsterLogManager,
    resource_managers: Deque[EventGenerationManager],
    execution_plan: Optional[ExecutionPlan],
    pipeline_run: Optional[PipelineRun],
    resource_keys_to_init: Optional[AbstractSet[str]],
    instance: Optional[DagsterInstance],
    emit_persistent_events: Optional[bool],
    pipeline_def_for_backwards_compat: Optional[PipelineDefinition],
):

    pipeline_name = None
    contains_generator = False
    if emit_persistent_events:
        check.invariant(
            pipeline_run and execution_plan,
            "If emit_persistent_events is enabled, then pipeline_run and execution_plan must be provided",
        )
        pipeline_name = cast(PipelineRun, pipeline_run).pipeline_name
    resource_keys_to_init = check.opt_set_param(resource_keys_to_init,
                                                "resource_keys_to_init")
    resource_instances: Dict[str, "InitializedResource"] = {}
    resource_init_times = {}
    try:
        if emit_persistent_events and resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                cast(str, pipeline_name),
                cast(ExecutionPlan, execution_plan),
                resource_log_manager,
                resource_keys_to_init,
            )

        resource_dependencies = _resolve_resource_dependencies(resource_defs)

        for level in toposort(resource_dependencies):
            for resource_name in level:
                resource_def = resource_defs[resource_name]
                if resource_name not in resource_keys_to_init:
                    continue

                resource_fn = cast(Callable[[InitResourceContext], Any],
                                   resource_def.resource_fn)
                resources = ScopedResourcesBuilder(resource_instances).build(
                    resource_def.required_resource_keys)
                resource_context = InitResourceContext(
                    resource_def=resource_def,
                    resource_config=resource_configs[resource_name].config,
                    pipeline_run=pipeline_run,
                    # Add tags with information about the resource
                    log_manager=resource_log_manager.with_tags(
                        resource_name=resource_name,
                        resource_fn_name=str(resource_fn.__name__),
                    ),
                    resources=resources,
                    instance=instance,
                    pipeline_def_for_backwards_compat=
                    pipeline_def_for_backwards_compat,
                )
                manager = single_resource_generation_manager(
                    resource_context, resource_name, resource_def)
                for event in manager.generate_setup_events():
                    if event:
                        yield event
                initialized_resource = check.inst(manager.get_object(),
                                                  InitializedResource)
                resource_instances[
                    resource_name] = initialized_resource.resource
                resource_init_times[
                    resource_name] = initialized_resource.duration
                contains_generator = contains_generator or initialized_resource.is_generator
                resource_managers.append(manager)

        if emit_persistent_events and resource_keys_to_init:
            yield DagsterEvent.resource_init_success(
                cast(str, pipeline_name),
                cast(ExecutionPlan, execution_plan),
                resource_log_manager,
                resource_instances,
                resource_init_times,
            )
        yield ScopedResourcesBuilder(resource_instances, contains_generator)
    except DagsterUserCodeExecutionError as dagster_user_error:
        # Can only end up in this state if we attempt to initialize a resource, so
        # resource_keys_to_init cannot be empty
        if emit_persistent_events:
            yield DagsterEvent.resource_init_failure(
                cast(str, pipeline_name),
                cast(ExecutionPlan, execution_plan),
                resource_log_manager,
                resource_keys_to_init,
                serializable_error_info_from_exc_info(
                    dagster_user_error.original_exc_info),
            )
        raise dagster_user_error
Example #13
def resource_initialization_event_generator(
    resource_defs: Dict[str, ResourceDefinition],
    resource_configs: Dict[str, ResourceConfig],
    log_manager: DagsterLogManager,
    execution_plan: Optional[ExecutionPlan],
    pipeline_run: Optional[PipelineRun],
    resource_keys_to_init: Optional[AbstractSet[str]],
    instance: Optional[DagsterInstance],
    emit_persistent_events: Optional[bool],
    pipeline_def_for_backwards_compat: Optional[PipelineDefinition],
):
    check.inst_param(log_manager, "log_manager", DagsterLogManager)
    resource_keys_to_init = check.opt_set_param(resource_keys_to_init,
                                                "resource_keys_to_init",
                                                of_type=str)
    check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.opt_inst_param(instance, "instance", DagsterInstance)

    if execution_plan and execution_plan.step_handle_for_single_step_plans():
        step = execution_plan.get_step(
            cast(
                StepHandleUnion,
                cast(ExecutionPlan,
                     execution_plan).step_handle_for_single_step_plans(),
            ))
        resource_log_manager = log_manager.with_tags(
            **cast(ExecutionStep, step).logging_tags)
    else:
        resource_log_manager = log_manager

    generator_closed = False
    resource_managers: Deque[EventGenerationManager] = deque()

    try:

        yield from _core_resource_initialization_event_generator(
            resource_defs=resource_defs,
            resource_configs=resource_configs,
            resource_log_manager=resource_log_manager,
            resource_managers=resource_managers,
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            resource_keys_to_init=resource_keys_to_init,
            instance=instance,
            emit_persistent_events=emit_persistent_events,
            pipeline_def_for_backwards_compat=pipeline_def_for_backwards_compat,
        )
    except GeneratorExit:
        # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed
        # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).
        generator_closed = True
        raise
    finally:
        if not generator_closed:
            error = None
            while len(resource_managers) > 0:
                manager = resource_managers.pop()
                try:
                    yield from manager.generate_teardown_events()
                except DagsterUserCodeExecutionError as dagster_user_error:
                    error = dagster_user_error
            if error and emit_persistent_events:
                yield DagsterEvent.resource_teardown_failure(
                    cast(PipelineRun, pipeline_run).pipeline_name,
                    cast(ExecutionPlan, execution_plan),
                    resource_log_manager,
                    resource_keys_to_init,
                    serializable_error_info_from_exc_info(
                        error.original_exc_info),
                )
Example #14
    def __init__(
        self,
        dagster_type=None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        is_required: bool = True,
        io_manager_key: Optional[str] = None,
        metadata: Optional[MetadataUserInput] = None,
        asset_key: Optional[Union[AssetKey, DynamicAssetKey]] = None,
        asset_partitions: Optional[Union[AbstractSet[str],
                                         Callable[["OutputContext"],
                                                  AbstractSet[str]]]] = None,
        asset_partitions_def: Optional["PartitionsDefinition"] = None
        # make sure new parameters are updated in combine_with_inferred below
    ):
        from dagster.core.definitions.partition import PartitionsDefinition

        self._name = check_valid_name(
            check.opt_str_param(name, "name", DEFAULT_OUTPUT))
        self._type_not_set = dagster_type is None
        self._dagster_type = resolve_dagster_type(dagster_type)
        self._description = check.opt_str_param(description, "description")
        self._is_required = check.bool_param(is_required, "is_required")
        self._io_manager_key = check.opt_str_param(
            io_manager_key,
            "io_manager_key",
            default="io_manager",
        )
        self._metadata = check.opt_dict_param(metadata,
                                              "metadata",
                                              key_type=str)
        self._metadata_entries = check.is_list(
            normalize_metadata(self._metadata, [], allow_invalid=True),
            MetadataEntry)

        if asset_key:
            experimental_arg_warning("asset_key", "OutputDefinition.__init__")

        if callable(asset_key):
            warnings.warn(
                "Passing a function as the `asset_key` argument to `Out` or `OutputDefinition` is "
                "deprecated behavior and will be removed in version 0.15.0.")
        else:
            check.opt_inst_param(asset_key, "asset_key", AssetKey)

        self._asset_key = asset_key

        if asset_partitions:
            experimental_arg_warning("asset_partitions",
                                     "OutputDefinition.__init__")
            check.param_invariant(
                asset_key is not None,
                "asset_partitions",
                'Cannot specify "asset_partitions" argument without also specifying "asset_key"',
            )

        self._asset_partitions_fn: Optional[Callable[["OutputContext"],
                                                     AbstractSet[str]]]
        if callable(asset_partitions):
            self._asset_partitions_fn = asset_partitions
        elif asset_partitions is not None:
            asset_partitions = check.opt_set_param(asset_partitions,
                                                   "asset_partitions", str)

            def _fn(_context: "OutputContext") -> AbstractSet[str]:
                return cast(AbstractSet[str], asset_partitions)  # mypy bug?

            self._asset_partitions_fn = _fn
        else:
            self._asset_partitions_fn = None

        if asset_partitions_def:
            experimental_arg_warning("asset_partitions_def",
                                     "OutputDefinition.__init__")
        self._asset_partitions_def = check.opt_inst_param(
            asset_partitions_def, "asset_partition_def", PartitionsDefinition)
Example #15
    def __new__(
        cls,
        pipeline_name=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        solid_selection=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot_id=None,
        execution_plan_snapshot_id=None,
        ## GRAVEYARD BELOW
        # see https://github.com/dagster-io/dagster/issues/2372 for explanation
        previous_run_id=None,
        selector=None,
        solid_subset=None,
    ):
        # a frozenset which contains the names of the solids to execute
        check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
        # a list of solid queries provided by the user
        # possible to be None when only solids_to_execute is set by the user directly
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

        check.opt_str_param(root_run_id, 'root_run_id')
        check.opt_str_param(parent_run_id, 'parent_run_id')

        check.invariant(
            (root_run_id is not None and parent_run_id is not None)
            or (root_run_id is None and parent_run_id is None),
            (
                'Must set both root_run_id and parent_run_id when creating a PipelineRun that '
                'belongs to a run group'
            ),
        )

        # Compatibility
        # ----------------------------------------------------------------------------------------
        # Historical runs may have previous_run_id set, in which case
        # that previous ID becomes both the root and the parent
        if previous_run_id:
            if not (parent_run_id and root_run_id):
                parent_run_id = previous_run_id
                root_run_id = previous_run_id

        check.opt_inst_param(selector, 'selector', ExecutionSelector)
        if selector:
            check.invariant(
                pipeline_name is None or selector.name == pipeline_name,
                (
                    'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                    'selector was passed with pipeline {selector_pipeline}'.format(
                        pipeline_name=pipeline_name, selector_pipeline=selector.name
                    )
                ),
            )
            if pipeline_name is None:
                pipeline_name = selector.name

            check.invariant(
                solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,
                (
                    'Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
                    'selector was passed with subset {selector_subset}'.format(
                        solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset
                    )
                ),
            )
            # for old runs that only have selector but no solids_to_execute
            if solids_to_execute is None:
                solids_to_execute = (
                    frozenset(selector.solid_subset) if selector.solid_subset else None
                )

        # for old runs that specified list-type solid_subset
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        if solid_subset:
            solids_to_execute = frozenset(solid_subset)
        # ----------------------------------------------------------------------------------------

        return super(PipelineRun, cls).__new__(
            cls,
            pipeline_name=check.opt_str_param(pipeline_name, 'pipeline_name'),
            run_id=check.opt_str_param(run_id, 'run_id', default=make_new_run_id()),
            environment_dict=check.opt_dict_param(
                environment_dict, 'environment_dict', key_type=str
            ),
            mode=check.opt_str_param(mode, 'mode'),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=check.opt_inst_param(
                status, 'status', PipelineRunStatus, PipelineRunStatus.NOT_STARTED
            ),
            tags=check.opt_dict_param(tags, 'tags', key_type=str),
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, 'pipeline_snapshot_id'),
            execution_plan_snapshot_id=check.opt_str_param(
                execution_plan_snapshot_id, 'execution_plan_snapshot_id'
            ),
        )
Example #16
    def __init__(
        self,
        solid_defs: Optional[List[NodeDefinition]] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        dependencies: Optional[Dict[Union[str, NodeInvocation],
                                    Dict[str, IDependencyDefinition]]] = None,
        mode_defs: Optional[List[ModeDefinition]] = None,
        preset_defs: Optional[List[PresetDefinition]] = None,
        tags: Dict[str, Any] = None,
        hook_defs: Optional[AbstractSet[HookDefinition]] = None,
        solid_retry_policy: Optional[RetryPolicy] = None,
        graph_def=None,
        _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
        version_strategy: Optional[VersionStrategy] = None,
    ):
        # If a graph is specified directly, use it
        if check.opt_inst_param(graph_def, "graph_def", GraphDefinition):
            self._graph_def = graph_def
            self._name = name or graph_def.name

        # Otherwise fallback to legacy construction
        else:
            if name is None:
                check.failed("name must be set provided")
            self._name = name

            if solid_defs is None:
                check.failed("solid_defs must be provided")

            self._graph_def = GraphDefinition(
                name=name,
                dependencies=dependencies,
                node_defs=solid_defs,
                input_mappings=None,
                output_mappings=None,
                config=None,
                description=None,
            )

        # tags and description can exist on the graph as well, but since the
        # same graph may appear in multiple pipelines/jobs, keep a separate layer here
        self._description = check.opt_str_param(description, "description")
        self._tags = validate_tags(tags)

        self._current_level_node_defs = self._graph_def.node_defs

        mode_definitions = check.opt_list_param(mode_defs,
                                                "mode_defs",
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names.").format(
                        mode_name=mode_def.name, pipeline_name=self.name))
            seen_modes.add(mode_def.name)

        self._hook_defs = check.opt_set_param(hook_defs,
                                              "hook_defs",
                                              of_type=HookDefinition)
        self._solid_retry_policy = check.opt_inst_param(
            solid_retry_policy, "solid_retry_policy", RetryPolicy)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs",
                                                 PresetDefinition)
        self._preset_dict: Dict[str, PresetDefinition] = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names.").format(
                        name=preset.name, pipeline_name=self.name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self.name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        self._resource_requirements = {
            mode_def.name: _checked_resource_reqs_for_mode(
                mode_def,
                self._current_level_node_defs,
                self._graph_def._dagster_type_dict,
                self._graph_def._node_dict,
                self._hook_defs,
                self._graph_def._dependency_structure,
            )
            for mode_def in self._mode_definitions
        }

        # Recursively explore all nodes in this pipeline
        self._all_node_defs = _build_all_node_defs(
            self._current_level_node_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition)
        self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
        self._cached_external_pipeline = None

        self.version_strategy = check.opt_inst_param(version_strategy,
                                                     "version_strategy",
                                                     VersionStrategy)

        if self.version_strategy is not None:
            experimental_class_warning("VersionStrategy")
Example #17
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
        check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    'Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} '
                    'that conflicts with solids_to_execute arg {solids_to_execute}'.format(
                        pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute
                )

        if execution_plan is None:
            execution_plan = create_execution_plan(
                pipeline_def,
                run_config=run_config,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
            )

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(mode, 'mode', default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_def.get_pipeline_snapshot_id()
            ),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
        )
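
A minimal usage sketch of the method above, assuming it lives on a DagsterInstance-like object; the pipeline, solid, and config values below are illustrative. Note that solids_to_execute carries the resolved frozenset of solid names, while solid_selection only forwards the raw user query.

from dagster import DagsterInstance, pipeline, solid

@solid
def add_one(context, num: int) -> int:
    return num + 1

@pipeline
def my_pipeline():
    add_one()

instance = DagsterInstance.ephemeral()
run = instance.create_run_for_pipeline(
    pipeline_def=my_pipeline,
    run_config={"solids": {"add_one": {"inputs": {"num": {"value": 1}}}}},
    solids_to_execute=frozenset(["add_one"]),  # resolved solid names (frozenset)
    solid_selection=["add_one"],               # raw user query, passed straight through
)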
Example #18
    def __new__(
        cls,
        pipeline_name=None,
        run_id=None,
        run_config=None,
        mode=None,
        solid_selection=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot_id=None,
        execution_plan_snapshot_id=None,
        external_pipeline_origin=None,
    ):
        check.invariant(
            (root_run_id is not None and parent_run_id is not None)
            or (root_run_id is None and parent_run_id is None),
            ("Must set both root_run_id and parent_run_id when creating a PipelineRun that "
             "belongs to a run group"),
        )
        # a frozenset which contains the names of the solids to execute
        check.opt_set_param(solids_to_execute,
                            "solids_to_execute",
                            of_type=str)
        # a list of solid queries provided by the user
        # possible to be None when only solids_to_execute is set by the user directly
        check.opt_list_param(solid_selection, "solid_selection", of_type=str)
        check.opt_list_param(step_keys_to_execute,
                             "step_keys_to_execute",
                             of_type=str)

        # Placing this with the other imports causes a cyclic import
        # https://github.com/dagster-io/dagster/issues/3181
        from dagster.core.host_representation.origin import ExternalPipelineOrigin

        if status == PipelineRunStatus.QUEUED:
            check.inst_param(
                external_pipeline_origin,
                "external_pipeline_origin",
                ExternalPipelineOrigin,
                "external_pipeline_origin is required for queued runs",
            )

        return super(PipelineRun, cls).__new__(
            cls,
            pipeline_name=check.opt_str_param(pipeline_name, "pipeline_name"),
            run_id=check.opt_str_param(run_id,
                                       "run_id",
                                       default=make_new_run_id()),
            run_config=check.opt_dict_param(run_config,
                                            "run_config",
                                            key_type=str),
            mode=check.opt_str_param(mode, "mode"),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=check.opt_inst_param(status, "status", PipelineRunStatus,
                                        PipelineRunStatus.NOT_STARTED),
            tags=check.opt_dict_param(tags,
                                      "tags",
                                      key_type=str,
                                      value_type=str),
            root_run_id=check.opt_str_param(root_run_id, "root_run_id"),
            parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),
            pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id,
                                                     "pipeline_snapshot_id"),
            execution_plan_snapshot_id=check.opt_str_param(
                execution_plan_snapshot_id, "execution_plan_snapshot_id"),
            external_pipeline_origin=check.opt_inst_param(
                external_pipeline_origin, "external_pipeline_origin",
                ExternalPipelineOrigin),
        )
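
A hedged construction sketch grounded in the invariant above: root_run_id and parent_run_id must be provided together (or both omitted), and a queued status additionally requires an ExternalPipelineOrigin. The identifiers below are illustrative, and the import path is assumed.

from dagster.core.storage.pipeline_run import PipelineRun  # assumed import path

retried_run = PipelineRun(
    pipeline_name="my_pipeline",
    mode="default",
    run_config={},
    root_run_id="11111111-aaaa-bbbb-cccc-222222222222",   # illustrative run ids
    parent_run_id="11111111-aaaa-bbbb-cccc-222222222222",
)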
Example #19
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
        check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute,
                            "solids_to_execute",
                            of_type=str)
        check.opt_list_param(solid_selection, "solid_selection", of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by IPipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "
                    "that conflicts with solids_to_execute arg {solids_to_execute}"
                    .format(
                        pipeline_solids_to_execute=str_format_list(
                            pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute)

        full_execution_plan = execution_plan or create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        check.invariant(
            len(full_execution_plan.step_keys_to_execute) == len(
                full_execution_plan.steps))

        if _is_memoized_run(tags):
            if step_keys_to_execute:
                raise DagsterInvariantViolationError(
                    "step_keys_to_execute parameter cannot be used in conjunction with memoized "
                    "pipeline runs.")

            step_keys_to_execute = self.resolve_unmemoized_steps(
                full_execution_plan,
                run_config=run_config,
                mode=mode,
            )  # TODO: tighter integration with existing step_keys_to_execute functionality

        subsetted_execution_plan = (
            full_execution_plan.build_subset_plan(step_keys_to_execute)
            if step_keys_to_execute else full_execution_plan)

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(
                mode, "mode", default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                subsetted_execution_plan,
                pipeline_def.get_pipeline_snapshot_id()),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(
            ),
        )
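
A hedged sketch of the memoization guard above, reusing the illustrative instance and my_pipeline from the sketch after Example #17; the memoization tag key is an assumption (it is whatever _is_memoized_run checks). Combining that tag with step_keys_to_execute is rejected.

from dagster import DagsterInvariantViolationError

try:
    instance.create_run_for_pipeline(
        pipeline_def=my_pipeline,
        run_config={"solids": {"add_one": {"inputs": {"num": {"value": 1}}}}},
        tags={"dagster/is_memoized_run": "true"},  # assumed tag key
        step_keys_to_execute=["add_one"],
    )
except DagsterInvariantViolationError as exc:
    print(exc)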
Example #20
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
    description=None,
    tags=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g.,
            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be an
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs,
                                      "input_defs",
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       "output_defs",
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 "required_resource_keys",
                                                 of_type=str)
    if output_notebook is not None:
        required_resource_keys.add("file_manager")
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix,
                                            "asset_key_prefix",
                                            of_type=str)

    default_description = f"This solid is backed by the notebook at {notebook_path}"
    description = check.opt_str_param(description,
                                      "description",
                                      default=default_description)

    user_tags = validate_tags(tags)
    if tags is not None:
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name,
                                     notebook_path,
                                     output_notebook,
                                     asset_key_prefix=asset_key_prefix),
        output_defs=output_defs +
        ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
         if output_notebook else []),
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={
            **user_tags,
            **default_tags
        },
    )
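
A hedged usage sketch of define_dagstermill_solid; the notebook path, input/output definitions, and names are illustrative. Setting output_notebook injects an extra FileHandle output and makes the solid require a "file_manager" resource.

from dagster import InputDefinition, Int, OutputDefinition

clean_data = define_dagstermill_solid(
    name="clean_data",
    notebook_path="notebooks/clean_data.ipynb",  # illustrative path
    input_defs=[InputDefinition("raw_count", Int)],
    output_defs=[OutputDefinition(Int, name="clean_count")],  # yielded via dagstermill.yield_result
    output_notebook="cleaned_notebook",
)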
Example #21
    def __init__(self, config_type, func, required_resource_keys):
        self._config_type = check.inst_param(config_type, 'config_type',
                                             ConfigType)
        self._func = check.callable_param(func, 'func')
        self._required_resource_keys = check.opt_set_param(
            required_resource_keys, 'required_resource_keys', of_type=str)
Example #22
def create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    metadata=None,
    description=None,
):
    metadata = check.opt_dict_param(metadata, 'metadata')
    input_tables = check.opt_list_param(input_tables,
                                        'input_tables',
                                        of_type=LakehouseTableInputDefinition)
    other_input_defs = check.opt_list_param(other_input_defs,
                                            'other_input_defs',
                                            of_type=InputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 'required_resource_keys',
                                                 of_type=str)

    table_type = define_python_dagster_type(python_type=ITableHandle,
                                            name=name,
                                            description=description)

    table_type_inst = table_type.inst()

    table_input_dict = {
        input_table.name: input_table
        for input_table in input_tables
    }
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs, ['context'])

    def _compute(context, inputs):
        '''
        Workhorse function of the lakehouse. The inputs are objects that inherit from ITableHandle.
        This compute_fn:
        (1) Iterates over the input tables and asks the lakehouse resource to
            hydrate their contents, or a representation of their contents
            (e.g. a pyspark dataframe), into memory for computation.
        (2) Passes those into the lakehouse table function, which does the actual work.
        (3) Passes the output of the lakehouse function to the lakehouse materialize function.
        (4) Yields a materialization if the lakehouse function returned one.

        There's an argument that the hydrate and materialize functions should return
        a stream of events, but that started to feel like I was implementing what should
        be a framework feature.
        '''
        check.inst_param(context.resources.lakehouse,
                         'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name))
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[
                    input_name] = context.resources.lakehouse.hydrate(
                        context,
                        input_type,
                        table_def_of_type(context.pipeline_def,
                                          input_type.name).metadata,
                        table_handle,
                        metadata,
                    )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables,
                                       **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type_inst, metadata, computed_output)

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(
            output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        metadata=metadata,
        description=description,
    )
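
A minimal sketch of a table built with the factory above, assuming a zero-input table (no LakehouseTableInputDefinition upstream); the name and contents are illustrative.

def _raw_events(context):
    # Illustrative lakehouse_fn: takes only `context` because there are no input tables.
    context.log.info('Producing raw_events')
    return [{'day': '2020-01-01', 'count': 3}]

raw_events_table = create_lakehouse_table_def(
    name='raw_events',
    lakehouse_fn=_raw_events,
    description='Illustrative table with hard-coded contents.',
)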
Example #23
    def __init__(self, name=None, required_resource_keys=None):
        self.name = check.opt_str_param(name, 'name')
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys, 'required_resource_keys')
Example #24
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        loader=None,
        materializer=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None), "Must set key or name")

        if name is None:
            check.param_invariant(
                bool(key), "key", "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            check.param_invariant(
                bool(name), "name", "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(
            materializer, "materializer", DagsterTypeMaterializer
        )

        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            "serialization_strategy",
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys, "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins
            ),
            "auto_plugins",
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
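
A hedged sketch of constructing such a type directly, assuming the constructor above is dagster's public DagsterType; type_check_fn receives a context and the value and returns a boolean (or a TypeCheck). Everything below is illustrative.

from dagster import DagsterType

NonEmptyDict = DagsterType(
    name="NonEmptyDict",
    description="A dict with at least one key.",
    type_check_fn=lambda _context, value: isinstance(value, dict) and len(value) > 0,
)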
Example #25
    def __init__(self,
                 name: Optional[str] = None,
                 required_resource_keys: Optional[Set[str]] = None):
        self.name = check.opt_str_param(name, "name")
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys, "required_resource_keys")