def subset_for_execution_from_existing_pipeline(self, solids_to_execute):
    # take a frozenset of resolved solid names from an existing pipeline
    # so there's no need to parse the selection
    check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
    return self._subset_for_execution(solids_to_execute)
def __init__( self, solid_defs, name=None, description=None, dependencies=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, input_mappings=None, output_mappings=None, config_mapping=None, positional_inputs=None, _parent_pipeline_def=None, # https://github.com/dagster-io/dagster/issues/2115 ): if not name: warnings.warn( "Pipeline must have a name. Names will be required starting in 0.10.0 or later." ) name = _anonymous_pipeline_name() # For these warnings they check truthiness because they get changed to [] higher # in the stack for the decorator case if input_mappings: experimental_arg_warning("input_mappings", "PipelineDefinition") if output_mappings: experimental_arg_warning("output_mappings", "PipelineDefinition") if config_mapping is not None: experimental_arg_warning("config_mapping", "PipelineDefinition") if positional_inputs: experimental_arg_warning("positional_inputs", "PipelineDefinition") super(PipelineDefinition, self).__init__( name=name, description=description, dependencies=dependencies, node_defs=solid_defs, tags=check.opt_dict_param(tags, "tags", key_type=str), positional_inputs=positional_inputs, input_mappings=input_mappings, output_mappings=output_mappings, config_mapping=config_mapping, ) self._current_level_node_defs = solid_defs self._tags = validate_tags(tags) mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition) if not mode_definitions: mode_definitions = [ModeDefinition()] self._mode_definitions = mode_definitions seen_modes = set() for mode_def in mode_definitions: if mode_def.name in seen_modes: raise DagsterInvalidDefinitionError(( 'Two modes seen with the name "{mode_name}" in "{pipeline_name}". ' "Modes must have unique names.").format( mode_name=mode_def.name, pipeline_name=self._name)) seen_modes.add(mode_def.name) self._dagster_type_dict = construct_dagster_type_dictionary( self._current_level_node_defs) self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition) self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition) self._preset_dict = {} for preset in self._preset_defs: if preset.name in self._preset_dict: raise DagsterInvalidDefinitionError(( 'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". ' "PresetDefinitions must have unique names.").format( name=preset.name, pipeline_name=self._name)) if preset.mode not in seen_modes: raise DagsterInvalidDefinitionError( ('PresetDefinition "{name}" in "{pipeline_name}" ' 'references mode "{mode}" which is not defined.').format( name=preset.name, pipeline_name=self._name, mode=preset.mode)) self._preset_dict[preset.name] = preset # Validate solid resource dependencies _validate_resource_dependencies( self._mode_definitions, self._current_level_node_defs, self._dagster_type_dict, self._solid_dict, self._hook_defs, ) # Validate unsatisfied inputs can be materialized from config _validate_inputs(self._dependency_structure, self._solid_dict, self._mode_definitions) # Recursively explore all nodes in the this pipeline self._all_node_defs = _build_all_node_defs( self._current_level_node_defs) self._parent_pipeline_def = check.opt_inst_param( _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition) self._cached_run_config_schemas = {} self._cached_external_pipeline = None
def __init__(self, config_type, func, required_resource_keys):
    self._config_type = check.inst_param(config_type, "config_type", ConfigType)
    self._func = check.callable_param(func, "func")
    self._required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys", of_type=str)
def define_dagstermill_op( name: str, notebook_path: str, input_defs: Optional[Sequence[InputDefinition]] = None, output_defs: Optional[Sequence[OutputDefinition]] = None, config_schema: Optional[Union[Any, Dict[str, Any]]] = None, required_resource_keys: Optional[Set[str]] = None, output_notebook_name: Optional[str] = None, asset_key_prefix: Optional[Union[List[str], str]] = None, description: Optional[str] = None, tags: Optional[Dict[str, Any]] = None, ): """Wrap a Jupyter notebook in a solid. Arguments: name (str): The name of the solid. notebook_path (str): Path to the backing notebook. input_defs (Optional[List[InputDefinition]]): The solid's inputs. output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should call :py:func:`~dagstermill.yield_result` to yield each of these outputs. required_resource_keys (Optional[Set[str]]): The string names of any required resources. output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always created). It allows the downstream solids to access the executed notebook via a file object. asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the asset keys for materialized notebooks. description (Optional[str]): If set, description used for solid. tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid. Dagster uses the tag keys `notebook_path` and `kind`, which cannot be overwritten by the user. Returns: :py:class:`~dagster.SolidDefinition` """ check.str_param(name, "name") check.str_param(notebook_path, "notebook_path") input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition) output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition) required_resource_keys = check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str) extra_output_defs = [] if output_notebook_name is not None: required_resource_keys.add("output_notebook_io_manager") extra_output_defs.append( OutputDefinition(name=output_notebook_name, io_manager_key="output_notebook_io_manager")) if isinstance(asset_key_prefix, str): asset_key_prefix = [asset_key_prefix] asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str) default_description = f"This op is backed by the notebook at {notebook_path}" description = check.opt_str_param(description, "description", default=default_description) user_tags = validate_tags(tags) if tags is not None: check.invariant( "notebook_path" not in tags, "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster", ) check.invariant( "kind" not in tags, "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster", ) default_tags = {"notebook_path": notebook_path, "kind": "ipynb"} return OpDefinition( name=name, input_defs=input_defs, compute_fn=_dm_compute( "define_dagstermill_op", name, notebook_path, output_notebook_name, asset_key_prefix=asset_key_prefix, ), output_defs=output_defs + extra_output_defs, config_schema=config_schema, required_resource_keys=required_resource_keys, description=description, tags={ **user_tags, **default_tags }, )
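# Hedged usage sketch (not from the source above): wiring define_dagstermill_op into a job.
# The notebook path and names are illustrative, and the example assumes dagstermill's
# local_output_notebook_io_manager resource satisfies the injected output's IO manager key.
from dagster import job
from dagstermill import define_dagstermill_op, local_output_notebook_io_manager

clean_data_op = define_dagstermill_op(
    name="clean_data",
    notebook_path="notebooks/clean_data.ipynb",  # hypothetical path
    output_notebook_name="output_notebook",
)

@job(resource_defs={"output_notebook_io_manager": local_output_notebook_io_manager})
def clean_data_job():
    clean_data_op()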
def __new__(cls, asset_key, partitions=None):
    asset_key = check.inst_param(asset_key, "asset_key", AssetKey)
    partitions = check.opt_set_param(partitions, "partitions", str)
    return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)
def __init__( self, type_check_fn, key=None, name=None, is_builtin=False, description=None, input_hydration_config=None, output_materialization_config=None, serialization_strategy=None, auto_plugins=None, required_resource_keys=None, kind=DagsterTypeKind.REGULAR, ): check.opt_str_param(key, 'key') check.opt_str_param(name, 'name') check.invariant(not (name is None and key is None), 'Must set key or name') if name is None: check.param_invariant( bool(key), 'key', 'If name is not provided, must provide key.', ) self.key, self.name = key, None elif key is None: check.param_invariant( bool(name), 'name', 'If key is not provided, must provide name.', ) self.key, self.name = name, name else: check.invariant(key and name) self.key, self.name = key, name self.description = check.opt_str_param(description, 'description') self.input_hydration_config = check.opt_inst_param( input_hydration_config, 'input_hydration_config', InputHydrationConfig) self.output_materialization_config = check.opt_inst_param( output_materialization_config, 'output_materialization_config', OutputMaterializationConfig, ) self.serialization_strategy = check.opt_inst_param( serialization_strategy, 'serialization_strategy', SerializationStrategy, PickleSerializationStrategy(), ) self.required_resource_keys = check.opt_set_param( required_resource_keys, 'required_resource_keys', ) self._type_check_fn = check.callable_param(type_check_fn, 'type_check_fn') _validate_type_check_fn(self._type_check_fn, self.name) auto_plugins = check.opt_list_param(auto_plugins, 'auto_plugins', of_type=type) check.param_invariant( all( issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins), 'auto_plugins', ) self.auto_plugins = auto_plugins self.is_builtin = check.bool_param(is_builtin, 'is_builtin') check.invariant( self.display_name is not None, 'All types must have a valid display name, got None for key {}'. format(key), ) self.kind = check.inst_param(kind, 'kind', DagsterTypeKind)
def __init__( self, solid_defs, name=None, description=None, dependencies=None, mode_defs=None, preset_defs=None, tags=None, hook_defs=None, _parent_pipeline_def=None, # https://github.com/dagster-io/dagster/issues/2115 ): if not name: warnings.warn( "Pipeline must have a name. Names will be required starting in 0.9.13 or later." ) # name might be <<unnamed>> when constructing pipeline subsets elif name != "<<unnamed>>" and not is_valid_name(name): check_for_invalid_name_and_warn(name) self._name = check.opt_str_param(name, "name") or "<<unnamed>>" self._description = check.opt_str_param(description, "description") mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition) if not mode_definitions: mode_definitions = [ModeDefinition()] self._mode_definitions = mode_definitions self._current_level_solid_defs = check.list_param( _check_solids_arg(self._name, solid_defs), "solid_defs", of_type=ISolidDefinition ) self._tags = validate_tags(tags) seen_modes = set() for mode_def in mode_definitions: if mode_def.name in seen_modes: raise DagsterInvalidDefinitionError( ( 'Two modes seen with the name "{mode_name}" in "{pipeline_name}". ' "Modes must have unique names." ).format(mode_name=mode_def.name, pipeline_name=self._name) ) seen_modes.add(mode_def.name) self._dependencies = validate_dependency_dict(dependencies) dependency_structure, solid_dict = create_execution_structure( self._current_level_solid_defs, self._dependencies, container_definition=None ) self._solid_dict = solid_dict self._dependency_structure = dependency_structure # eager toposort solids to detect cycles self.solids_in_topological_order = self._solids_in_topological_order() self._dagster_type_dict = construct_dagster_type_dictionary(self._current_level_solid_defs) self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition) self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition) self._preset_dict = {} for preset in self._preset_defs: if preset.name in self._preset_dict: raise DagsterInvalidDefinitionError( ( 'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". ' "PresetDefinitions must have unique names." ).format(name=preset.name, pipeline_name=self._name) ) if preset.mode not in seen_modes: raise DagsterInvalidDefinitionError( ( 'PresetDefinition "{name}" in "{pipeline_name}" ' 'references mode "{mode}" which is not defined.' ).format(name=preset.name, pipeline_name=self._name, mode=preset.mode) ) self._preset_dict[preset.name] = preset # Validate solid resource dependencies _validate_resource_dependencies( self._mode_definitions, self._current_level_solid_defs, self._solid_dict, self._hook_defs, ) # Validate unsatisfied inputs can be materialized from config _validate_inputs(self._dependency_structure, self._solid_dict) self._all_solid_defs = _build_all_solid_defs(self._current_level_solid_defs) self._parent_pipeline_def = check.opt_inst_param( _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition ) self._cached_run_config_schemas = {} self._cached_external_pipeline = None
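# Hedged sketch of the mode/preset validation above: a pipeline with two uniquely named modes
# and a preset that references a defined mode; duplicate mode names or a preset pointing at an
# unknown mode would raise DagsterInvalidDefinitionError. Names are illustrative.
from dagster import ModeDefinition, PipelineDefinition, PresetDefinition, solid

@solid
def say_hello(_context):
    return "hello"

hello_pipeline = PipelineDefinition(
    solid_defs=[say_hello],
    name="hello_pipeline",
    mode_defs=[ModeDefinition(name="dev"), ModeDefinition(name="prod")],
    preset_defs=[PresetDefinition(name="dev_preset", mode="dev", run_config={})],
)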
def __init__(self, solid_def, given_alias=None, tags=None, hook_defs=None):
    self.solid_def = solid_def
    self.given_alias = check.opt_str_param(given_alias, "given_alias")
    self.tags = check.opt_inst_param(tags, "tags", frozentags)
    self.hook_defs = check.opt_set_param(hook_defs, "hook_defs", HookDefinition)
def __init__(
    self,
    solid_defs: List[NodeDefinition],
    name: str,
    description: Optional[str] = None,
    dependencies: Optional[Dict[Union[str, SolidInvocation], Dict[str, IDependencyDefinition]]] = None,
    mode_defs: Optional[List[ModeDefinition]] = None,
    preset_defs: Optional[List[PresetDefinition]] = None,
    tags: Dict[str, Any] = None,
    hook_defs: Optional[AbstractSet[HookDefinition]] = None,
    input_mappings: Optional[List[InputMapping]] = None,
    output_mappings: Optional[List[OutputMapping]] = None,
    config_mapping: Optional[ConfigMapping] = None,
    positional_inputs: List[str] = None,
    _parent_pipeline_def: Optional["PipelineDefinition"] = None,  # https://github.com/dagster-io/dagster/issues/2115
):
    # For these warnings they check truthiness because they get changed to [] higher
    # in the stack for the decorator case
    if input_mappings:
        experimental_arg_warning("input_mappings", "PipelineDefinition")

    if output_mappings:
        experimental_arg_warning("output_mappings", "PipelineDefinition")

    if config_mapping is not None:
        experimental_arg_warning("config_mapping", "PipelineDefinition")

    if positional_inputs:
        experimental_arg_warning("positional_inputs", "PipelineDefinition")

    super(PipelineDefinition, self).__init__(
        name=name,
        description=description,
        dependencies=dependencies,
        node_defs=solid_defs,
        tags=check.opt_dict_param(tags, "tags", key_type=str),
        positional_inputs=positional_inputs,
        input_mappings=input_mappings,
        output_mappings=output_mappings,
        config_mapping=config_mapping,
    )

    self._current_level_node_defs = solid_defs
    self._tags = validate_tags(tags)

    mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)

    if not mode_definitions:
        mode_definitions = [ModeDefinition()]

    self._mode_definitions = mode_definitions

    seen_modes = set()
    for mode_def in mode_definitions:
        if mode_def.name in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names."
                ).format(mode_name=mode_def.name, pipeline_name=self._name)
            )
        seen_modes.add(mode_def.name)

    self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)

    self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)
    self._preset_dict: Dict[str, PresetDefinition] = {}
    for preset in self._preset_defs:
        if preset.name in self._preset_dict:
            raise DagsterInvalidDefinitionError(
                (
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names."
                ).format(name=preset.name, pipeline_name=self._name)
            )
        if preset.mode not in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'PresetDefinition "{name}" in "{pipeline_name}" '
                    'references mode "{mode}" which is not defined.'
                ).format(name=preset.name, pipeline_name=self._name, mode=preset.mode)
            )
        self._preset_dict[preset.name] = preset

    self._resource_requirements = {
        mode_def.name: _checked_resource_reqs_for_mode(
            mode_def,
            self._current_level_node_defs,
            self._dagster_type_dict,
            self._solid_dict,
            self._hook_defs,
        )
        for mode_def in self._mode_definitions
    }

    # Validate unsatisfied inputs can be materialized from config
    _validate_inputs(self._dependency_structure, self._solid_dict, self._mode_definitions)

    # Recursively explore all nodes in this pipeline
    self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)
    self._parent_pipeline_def = check.opt_inst_param(
        _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition
    )
    self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
    self._cached_external_pipeline = None
def to_job(
    self,
    name: Optional[str] = None,
    description: Optional[str] = None,
    resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
    config: Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"] = None,
    tags: Optional[Dict[str, Any]] = None,
    logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
    executor_def: Optional["ExecutorDefinition"] = None,
    hooks: Optional[AbstractSet[HookDefinition]] = None,
    op_retry_policy: Optional[RetryPolicy] = None,
    version_strategy: Optional[VersionStrategy] = None,
    op_selection: Optional[List[str]] = None,
    partitions_def: Optional["PartitionsDefinition"] = None,
) -> "JobDefinition":
    """
    Make this graph into an executable Job by providing remaining components required for execution.

    Args:
        name (Optional[str]): The name for the Job. Defaults to the name of this graph.
        resource_defs (Optional[Dict[str, ResourceDefinition]]): Resources that are required by
            this graph for execution. If not defined, `io_manager` will default to filesystem.
        config: Describes how the job is parameterized at runtime.
            If no value is provided, then the schema for the job's run config is a standard
            format based on its solids and resources.
            If a dictionary is provided, then it must conform to the standard config schema, and
            it will be used as the job's run config whenever the job is executed. The values
            provided will be viewable and editable in the Dagit playground, so be careful with
            secrets.
            If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run
            config is determined by the config mapping, which should return configuration in the
            standard format to configure the job.
            If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set
            of config values that can parameterize the job, as well as a function for mapping
            those values to the base config. The values provided will be viewable and editable in
            the Dagit playground, so be careful with secrets.
        tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the Job.
            Values that are not strings will be json encoded and must meet the criteria that
            `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag
            values provided at invocation time.
        logger_defs (Optional[Dict[str, LoggerDefinition]]): A dictionary of string logger
            identifiers to their implementations.
        executor_def (Optional[ExecutorDefinition]): How this Job will be executed. Defaults to
            :py:class:`multi_or_in_process_executor`, which can be switched between multi-process
            and in-process modes of execution. The default mode of execution is multi-process.
        op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.
            Only used if retry policy is not defined on the op definition or op invocation.
        version_strategy (Optional[VersionStrategy]): Defines how each solid (and optionally,
            resource) in the job can be versioned. If provided, memoization will be enabled for
            this job.
        partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys
            that can parameterize the job. If this argument is supplied, the config argument
            can't also be supplied.

    Returns:
        JobDefinition
    """
    from .job_definition import JobDefinition
    from .partition import PartitionedConfig, PartitionsDefinition
    from .executor_definition import ExecutorDefinition, multi_or_in_process_executor

    job_name = check_valid_name(name or self.name)

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    executor_def = check.opt_inst_param(
        executor_def, "executor_def", ExecutorDefinition, default=multi_or_in_process_executor
    )

    if resource_defs and "io_manager" in resource_defs:
        resource_defs_with_defaults = resource_defs
    else:
        resource_defs_with_defaults = merge_dicts(
            {"io_manager": default_job_io_manager}, resource_defs or {}
        )

    hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)
    op_retry_policy = check.opt_inst_param(op_retry_policy, "op_retry_policy", RetryPolicy)
    op_selection = check.opt_list_param(op_selection, "op_selection", of_type=str)
    presets = []
    config_mapping = None
    partitioned_config = None

    if partitions_def:
        check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)
        check.invariant(
            config is None, "Can't supply both the 'config' and 'partitions_def' arguments"
        )
        partitioned_config = PartitionedConfig(partitions_def, lambda _: {})

    if isinstance(config, ConfigMapping):
        config_mapping = config
    elif isinstance(config, PartitionedConfig):
        partitioned_config = config
    elif isinstance(config, dict):
        presets = [PresetDefinition(name="default", run_config=config)]
        # Using config mapping here is a trick to make it so that the preset will be used even
        # when no config is supplied for the job.
        config_mapping = _config_mapping_with_default_value(
            self._get_config_schema(resource_defs_with_defaults, executor_def, logger_defs),
            config,
            job_name,
            self.name,
        )
    elif config is not None:
        check.failed(
            f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "
            f"is an object of type {type(config)}"
        )

    return JobDefinition(
        name=job_name,
        description=description or self.description,
        graph_def=self,
        mode_def=ModeDefinition(
            resource_defs=resource_defs_with_defaults,
            logger_defs=logger_defs,
            executor_defs=[executor_def],
            _config_mapping=config_mapping,
            _partitioned_config=partitioned_config,
        ),
        preset_defs=presets,
        tags=tags,
        hook_defs=hooks,
        version_strategy=version_strategy,
        op_retry_policy=op_retry_policy,
    ).get_job_def_for_op_selection(op_selection)
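# Hedged usage sketch of to_job: turning a graph into a job with explicit tags and the default
# executor and io_manager; names are illustrative.
from dagster import graph, op

@op
def emit_one():
    return 1

@graph
def arithmetic_graph():
    emit_one()

arithmetic_job = arithmetic_graph.to_job(name="arithmetic_job", tags={"team": "data"})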
def define_dagstermill_solid( name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook=None, asset_key_prefix=None, ): """Wrap a Jupyter notebook in a solid. Arguments: name (str): The name of the solid. notebook_path (str): Path to the backing notebook. input_defs (Optional[List[InputDefinition]]): The solid's inputs. output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should call :py:func:`~dagstermill.yield_result` to yield each of these outputs. required_resource_keys (Optional[Set[str]]): The string names of any required resources. output_notebook (Optional[str]): If set, will be used as the name of an injected output of type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on the pipeline resources via the "file_manager" resource key, so, e.g., if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a : py:class:`~dagster_aws.s3.S3FileHandle`. asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the asset keys for materialized notebooks. Returns: :py:class:`~dagster.SolidDefinition` """ check.str_param(name, "name") check.str_param(notebook_path, "notebook_path") input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition) output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition) required_resource_keys = check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str) if output_notebook is not None: required_resource_keys.add("file_manager") if isinstance(asset_key_prefix, str): asset_key_prefix = [asset_key_prefix] asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str) return SolidDefinition( name=name, input_defs=input_defs, compute_fn=_dm_solid_compute(name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix), output_defs=output_defs + ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)] if output_notebook else []), config_schema=config_schema, required_resource_keys=required_resource_keys, description="This solid is backed by the notebook at {path}".format( path=notebook_path), tags={ "notebook_path": notebook_path, "kind": "ipynb" }, )
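# Hedged usage sketch: wrapping a notebook as a solid and capturing the executed notebook through
# the "file_manager" resource key described above. The notebook path is hypothetical, and the
# example assumes dagster's local_file_manager resource.
from dagster import ModeDefinition, local_file_manager, pipeline
from dagstermill import define_dagstermill_solid

clean_data = define_dagstermill_solid(
    name="clean_data",
    notebook_path="notebooks/clean_data.ipynb",  # hypothetical path
    output_notebook="output_notebook",
)

@pipeline(mode_defs=[ModeDefinition(resource_defs={"file_manager": local_file_manager})])
def clean_data_pipeline():
    clean_data()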
def _core_resource_initialization_event_generator( resource_defs: Dict[str, ResourceDefinition], resource_configs: Dict[str, ResourceConfig], resource_log_manager: DagsterLogManager, resource_managers: Deque[EventGenerationManager], execution_plan: Optional[ExecutionPlan], pipeline_run: Optional[PipelineRun], resource_keys_to_init: Optional[AbstractSet[str]], instance: Optional[DagsterInstance], emit_persistent_events: Optional[bool], pipeline_def_for_backwards_compat: Optional[PipelineDefinition], ): pipeline_name = None contains_generator = False if emit_persistent_events: check.invariant( pipeline_run and execution_plan, "If emit_persistent_events is enabled, then pipeline_run and execution_plan must be provided", ) pipeline_name = cast(PipelineRun, pipeline_run).pipeline_name resource_keys_to_init = check.opt_set_param(resource_keys_to_init, "resource_keys_to_init") resource_instances: Dict[str, "InitializedResource"] = {} resource_init_times = {} try: if emit_persistent_events and resource_keys_to_init: yield DagsterEvent.resource_init_start( cast(str, pipeline_name), cast(ExecutionPlan, execution_plan), resource_log_manager, resource_keys_to_init, ) resource_dependencies = _resolve_resource_dependencies(resource_defs) for level in toposort(resource_dependencies): for resource_name in level: resource_def = resource_defs[resource_name] if not resource_name in resource_keys_to_init: continue resource_fn = cast(Callable[[InitResourceContext], Any], resource_def.resource_fn) resources = ScopedResourcesBuilder(resource_instances).build( resource_def.required_resource_keys) resource_context = InitResourceContext( resource_def=resource_def, resource_config=resource_configs[resource_name].config, pipeline_run=pipeline_run, # Add tags with information about the resource log_manager=resource_log_manager.with_tags( resource_name=resource_name, resource_fn_name=str(resource_fn.__name__), ), resources=resources, instance=instance, pipeline_def_for_backwards_compat= pipeline_def_for_backwards_compat, ) manager = single_resource_generation_manager( resource_context, resource_name, resource_def) for event in manager.generate_setup_events(): if event: yield event initialized_resource = check.inst(manager.get_object(), InitializedResource) resource_instances[ resource_name] = initialized_resource.resource resource_init_times[ resource_name] = initialized_resource.duration contains_generator = contains_generator or initialized_resource.is_generator resource_managers.append(manager) if emit_persistent_events and resource_keys_to_init: yield DagsterEvent.resource_init_success( cast(str, pipeline_name), cast(ExecutionPlan, execution_plan), resource_log_manager, resource_instances, resource_init_times, ) yield ScopedResourcesBuilder(resource_instances, contains_generator) except DagsterUserCodeExecutionError as dagster_user_error: # Can only end up in this state if we attempt to initialize a resource, so # resource_keys_to_init cannot be empty if emit_persistent_events: yield DagsterEvent.resource_init_failure( cast(str, pipeline_name), cast(ExecutionPlan, execution_plan), resource_log_manager, resource_keys_to_init, serializable_error_info_from_exc_info( dagster_user_error.original_exc_info), ) raise dagster_user_error
def resource_initialization_event_generator( resource_defs: Dict[str, ResourceDefinition], resource_configs: Dict[str, ResourceConfig], log_manager: DagsterLogManager, execution_plan: Optional[ExecutionPlan], pipeline_run: Optional[PipelineRun], resource_keys_to_init: Optional[AbstractSet[str]], instance: Optional[DagsterInstance], emit_persistent_events: Optional[bool], pipeline_def_for_backwards_compat: Optional[PipelineDefinition], ): check.inst_param(log_manager, "log_manager", DagsterLogManager) resource_keys_to_init = check.opt_set_param(resource_keys_to_init, "resource_keys_to_init", of_type=str) check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan) check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun) check.opt_inst_param(instance, "instance", DagsterInstance) if execution_plan and execution_plan.step_handle_for_single_step_plans(): step = execution_plan.get_step( cast( StepHandleUnion, cast(ExecutionPlan, execution_plan).step_handle_for_single_step_plans(), )) resource_log_manager = log_manager.with_tags( **cast(ExecutionStep, step).logging_tags) else: resource_log_manager = log_manager generator_closed = False resource_managers: Deque[EventGenerationManager] = deque() try: yield from _core_resource_initialization_event_generator( resource_defs=resource_defs, resource_configs=resource_configs, resource_log_manager=resource_log_manager, resource_managers=resource_managers, execution_plan=execution_plan, pipeline_run=pipeline_run, resource_keys_to_init=resource_keys_to_init, instance=instance, emit_persistent_events=emit_persistent_events, pipeline_def_for_backwards_compat=pipeline_def_for_backwards_compat, ) except GeneratorExit: # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/). generator_closed = True raise finally: if not generator_closed: error = None while len(resource_managers) > 0: manager = resource_managers.pop() try: yield from manager.generate_teardown_events() except DagsterUserCodeExecutionError as dagster_user_error: error = dagster_user_error if error and emit_persistent_events: yield DagsterEvent.resource_teardown_failure( cast(PipelineRun, pipeline_run).pipeline_name, cast(ExecutionPlan, execution_plan), resource_log_manager, resource_keys_to_init, serializable_error_info_from_exc_info( error.original_exc_info), )
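# Hedged sketch of the dependency resolution performed above: "api_client" declares
# required_resource_keys={"base_url"}, so the toposort initializes "base_url" first and exposes
# it on init_context.resources when "api_client" is constructed. Names are illustrative.
from dagster import resource

@resource
def base_url(_init_context):
    return "https://example.com"

@resource(required_resource_keys={"base_url"})
def api_client(init_context):
    return {"endpoint": init_context.resources.base_url + "/api"}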
def __init__( self, dagster_type=None, name: Optional[str] = None, description: Optional[str] = None, is_required: bool = True, io_manager_key: Optional[str] = None, metadata: Optional[MetadataUserInput] = None, asset_key: Optional[Union[AssetKey, DynamicAssetKey]] = None, asset_partitions: Optional[Union[AbstractSet[str], Callable[["OutputContext"], AbstractSet[str]]]] = None, asset_partitions_def: Optional["PartitionsDefinition"] = None # make sure new parameters are updated in combine_with_inferred below ): from dagster.core.definitions.partition import PartitionsDefinition self._name = check_valid_name( check.opt_str_param(name, "name", DEFAULT_OUTPUT)) self._type_not_set = dagster_type is None self._dagster_type = resolve_dagster_type(dagster_type) self._description = check.opt_str_param(description, "description") self._is_required = check.bool_param(is_required, "is_required") self._io_manager_key = check.opt_str_param( io_manager_key, "io_manager_key", default="io_manager", ) self._metadata = check.opt_dict_param(metadata, "metadata", key_type=str) self._metadata_entries = check.is_list( normalize_metadata(self._metadata, [], allow_invalid=True), MetadataEntry) if asset_key: experimental_arg_warning("asset_key", "OutputDefinition.__init__") if callable(asset_key): warnings.warn( "Passing a function as the `asset_key` argument to `Out` or `OutputDefinition` is " "deprecated behavior and will be removed in version 0.15.0.") else: check.opt_inst_param(asset_key, "asset_key", AssetKey) self._asset_key = asset_key if asset_partitions: experimental_arg_warning("asset_partitions", "OutputDefinition.__init__") check.param_invariant( asset_key is not None, "asset_partitions", 'Cannot specify "asset_partitions" argument without also specifying "asset_key"', ) self._asset_partitions_fn: Optional[Callable[["OutputContext"], AbstractSet[str]]] if callable(asset_partitions): self._asset_partitions_fn = asset_partitions elif asset_partitions is not None: asset_partitions = check.opt_set_param(asset_partitions, "asset_partitions", str) def _fn(_context: "OutputContext") -> AbstractSet[str]: return cast(AbstractSet[str], asset_partitions) # mypy bug? self._asset_partitions_fn = _fn else: self._asset_partitions_fn = None if asset_partitions_def: experimental_arg_warning("asset_partitions_def", "OutputDefinition.__init__") self._asset_partitions_def = check.opt_inst_param( asset_partitions_def, "asset_partition_def", PartitionsDefinition)
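# Hedged sketch: an OutputDefinition bound to a non-default io_manager_key and a static asset_key,
# exercising the arguments validated above; names are illustrative.
from dagster import AssetKey, OutputDefinition, solid

@solid(
    output_defs=[
        OutputDefinition(
            name="result",
            io_manager_key="warehouse_io_manager",
            asset_key=AssetKey(["warehouse", "daily_counts"]),
        )
    ]
)
def compute_counts(_context):
    return 42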
def __new__( cls, pipeline_name=None, run_id=None, environment_dict=None, mode=None, solid_selection=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, pipeline_snapshot_id=None, execution_plan_snapshot_id=None, ## GRAVEYARD BELOW # see https://github.com/dagster-io/dagster/issues/2372 for explanation previous_run_id=None, selector=None, solid_subset=None, ): # a frozenset which contains the names of the solids to execute check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str) # a list of solid queries provided by the user # possible to be None when only solids_to_execute is set by the user directly check.opt_list_param(solid_selection, 'solid_selection', of_type=str) check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str) check.opt_str_param(root_run_id, 'root_run_id') check.opt_str_param(parent_run_id, 'parent_run_id') check.invariant( (root_run_id is not None and parent_run_id is not None) or (root_run_id is None and parent_run_id is None), ( 'Must set both root_run_id and parent_run_id when creating a PipelineRun that ' 'belongs to a run group' ), ) # Compatibility # ---------------------------------------------------------------------------------------- # Historical runs may have previous_run_id set, in which case # that previous ID becomes both the root and the parent if previous_run_id: if not (parent_run_id and root_run_id): parent_run_id = previous_run_id root_run_id = previous_run_id check.opt_inst_param(selector, 'selector', ExecutionSelector) if selector: check.invariant( pipeline_name is None or selector.name == pipeline_name, ( 'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: ' 'selector was passed with pipeline {selector_pipeline}'.format( pipeline_name=pipeline_name, selector_pipeline=selector.name ) ), ) if pipeline_name is None: pipeline_name = selector.name check.invariant( solids_to_execute is None or set(selector.solid_subset) == solids_to_execute, ( 'Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: ' 'selector was passed with subset {selector_subset}'.format( solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset ) ), ) # for old runs that only have selector but no solids_to_execute if solids_to_execute is None: solids_to_execute = ( frozenset(selector.solid_subset) if selector.solid_subset else None ) # for old runs that specified list-type solid_subset check.opt_list_param(solid_subset, 'solid_subset', of_type=str) if solid_subset: solids_to_execute = frozenset(solid_subset) # ---------------------------------------------------------------------------------------- return super(PipelineRun, cls).__new__( cls, pipeline_name=check.opt_str_param(pipeline_name, 'pipeline_name'), run_id=check.opt_str_param(run_id, 'run_id', default=make_new_run_id()), environment_dict=check.opt_dict_param( environment_dict, 'environment_dict', key_type=str ), mode=check.opt_str_param(mode, 'mode'), solid_selection=solid_selection, solids_to_execute=solids_to_execute, step_keys_to_execute=step_keys_to_execute, status=check.opt_inst_param( status, 'status', PipelineRunStatus, PipelineRunStatus.NOT_STARTED ), tags=check.opt_dict_param(tags, 'tags', key_type=str), root_run_id=root_run_id, parent_run_id=parent_run_id, pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, 'pipeline_snapshot_id'), execution_plan_snapshot_id=check.opt_str_param( execution_plan_snapshot_id, 'execution_plan_snapshot_id' ), )
def __init__(
    self,
    solid_defs: Optional[List[NodeDefinition]] = None,
    name: Optional[str] = None,
    description: Optional[str] = None,
    dependencies: Optional[Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]] = None,
    mode_defs: Optional[List[ModeDefinition]] = None,
    preset_defs: Optional[List[PresetDefinition]] = None,
    tags: Dict[str, Any] = None,
    hook_defs: Optional[AbstractSet[HookDefinition]] = None,
    solid_retry_policy: Optional[RetryPolicy] = None,
    graph_def=None,
    _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    version_strategy: Optional[VersionStrategy] = None,
):
    # If a graph is specified directly, use it
    if check.opt_inst_param(graph_def, "graph_def", GraphDefinition):
        self._graph_def = graph_def
        self._name = name or graph_def.name

    # Otherwise fall back to legacy construction
    else:
        if name is None:
            check.failed("name must be provided")
        self._name = name

        if solid_defs is None:
            check.failed("solid_defs must be provided")

        self._graph_def = GraphDefinition(
            name=name,
            dependencies=dependencies,
            node_defs=solid_defs,
            input_mappings=None,
            output_mappings=None,
            config=None,
            description=None,
        )

    # tags and description can exist on graph as well, but since
    # same graph may be in multiple pipelines/jobs, keep separate layer
    self._description = check.opt_str_param(description, "description")
    self._tags = validate_tags(tags)

    self._current_level_node_defs = self._graph_def.node_defs

    mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)

    if not mode_definitions:
        mode_definitions = [ModeDefinition()]

    self._mode_definitions = mode_definitions

    seen_modes = set()
    for mode_def in mode_definitions:
        if mode_def.name in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names."
                ).format(mode_name=mode_def.name, pipeline_name=self.name)
            )
        seen_modes.add(mode_def.name)

    self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)
    self._solid_retry_policy = check.opt_inst_param(
        solid_retry_policy, "solid_retry_policy", RetryPolicy
    )

    self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)
    self._preset_dict: Dict[str, PresetDefinition] = {}
    for preset in self._preset_defs:
        if preset.name in self._preset_dict:
            raise DagsterInvalidDefinitionError(
                (
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names."
                ).format(name=preset.name, pipeline_name=self.name)
            )
        if preset.mode not in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'PresetDefinition "{name}" in "{pipeline_name}" '
                    'references mode "{mode}" which is not defined.'
                ).format(name=preset.name, pipeline_name=self.name, mode=preset.mode)
            )
        self._preset_dict[preset.name] = preset

    self._resource_requirements = {
        mode_def.name: _checked_resource_reqs_for_mode(
            mode_def,
            self._current_level_node_defs,
            self._graph_def._dagster_type_dict,
            self._graph_def._node_dict,
            self._hook_defs,
            self._graph_def._dependency_structure,
        )
        for mode_def in self._mode_definitions
    }

    # Recursively explore all nodes in this pipeline
    self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)
    self._parent_pipeline_def = check.opt_inst_param(
        _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition
    )
    self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
    self._cached_external_pipeline = None

    self.version_strategy = check.opt_inst_param(
        version_strategy, "version_strategy", VersionStrategy
    )

    if self.version_strategy is not None:
        experimental_class_warning("VersionStrategy")
def create_run_for_pipeline( self, pipeline_def, execution_plan=None, run_id=None, run_config=None, mode=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, solid_selection=None, ): from dagster.core.execution.api import create_execution_plan from dagster.core.execution.plan.plan import ExecutionPlan from dagster.core.snap import snapshot_from_execution_plan check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition) check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan) # note that solids_to_execute is required to execute the solid subset, which is the # frozenset version of the previous solid_subset. # solid_selection is not required and will not be converted to solids_to_execute here. # i.e. this function doesn't handle solid queries. # solid_selection is only used to pass the user queries further down. check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str) check.opt_list_param(solid_selection, 'solid_selection', of_type=str) if solids_to_execute: if isinstance(pipeline_def, PipelineSubsetDefinition): # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline check.invariant( solids_to_execute == pipeline_def.solids_to_execute, 'Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} ' 'that conflicts with solids_to_execute arg {solids_to_execute}'.format( pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute), solids_to_execute=str_format_list(solids_to_execute), ), ) else: # for cases when `create_run_for_pipeline` is directly called pipeline_def = pipeline_def.get_pipeline_subset_def( solids_to_execute=solids_to_execute ) if execution_plan is None: execution_plan = create_execution_plan( pipeline_def, run_config=run_config, mode=mode, step_keys_to_execute=step_keys_to_execute, ) return self.create_run( pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config, mode=check.opt_str_param(mode, 'mode', default=pipeline_def.get_default_mode_name()), solid_selection=solid_selection, solids_to_execute=solids_to_execute, step_keys_to_execute=step_keys_to_execute, status=status, tags=tags, root_run_id=root_run_id, parent_run_id=parent_run_id, pipeline_snapshot=pipeline_def.get_pipeline_snapshot(), execution_plan_snapshot=snapshot_from_execution_plan( execution_plan, pipeline_def.get_pipeline_snapshot_id() ), parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(), )
def __new__( cls, pipeline_name=None, run_id=None, run_config=None, mode=None, solid_selection=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, pipeline_snapshot_id=None, execution_plan_snapshot_id=None, external_pipeline_origin=None, ): check.invariant( (root_run_id is not None and parent_run_id is not None) or (root_run_id is None and parent_run_id is None), ("Must set both root_run_id and parent_run_id when creating a PipelineRun that " "belongs to a run group"), ) # a frozenset which contains the names of the solids to execute check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str) # a list of solid queries provided by the user # possible to be None when only solids_to_execute is set by the user directly check.opt_list_param(solid_selection, "solid_selection", of_type=str) check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str) # Placing this with the other imports causes a cyclic import # https://github.com/dagster-io/dagster/issues/3181 from dagster.core.host_representation.origin import ExternalPipelineOrigin if status == PipelineRunStatus.QUEUED: check.inst_param( external_pipeline_origin, "external_pipeline_origin", ExternalPipelineOrigin, "external_pipeline_origin is required for queued runs", ) return super(PipelineRun, cls).__new__( cls, pipeline_name=check.opt_str_param(pipeline_name, "pipeline_name"), run_id=check.opt_str_param(run_id, "run_id", default=make_new_run_id()), run_config=check.opt_dict_param(run_config, "run_config", key_type=str), mode=check.opt_str_param(mode, "mode"), solid_selection=solid_selection, solids_to_execute=solids_to_execute, step_keys_to_execute=step_keys_to_execute, status=check.opt_inst_param(status, "status", PipelineRunStatus, PipelineRunStatus.NOT_STARTED), tags=check.opt_dict_param(tags, "tags", key_type=str, value_type=str), root_run_id=check.opt_str_param(root_run_id, "root_run_id"), parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"), pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, "pipeline_snapshot_id"), execution_plan_snapshot_id=check.opt_str_param( execution_plan_snapshot_id, "execution_plan_snapshot_id"), external_pipeline_origin=check.opt_inst_param( external_pipeline_origin, "external_pipeline_origin", ExternalPipelineOrigin), )
def create_run_for_pipeline( self, pipeline_def, execution_plan=None, run_id=None, run_config=None, mode=None, solids_to_execute=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, solid_selection=None, ): from dagster.core.execution.api import create_execution_plan from dagster.core.execution.plan.plan import ExecutionPlan from dagster.core.snap import snapshot_from_execution_plan check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition) check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan) # note that solids_to_execute is required to execute the solid subset, which is the # frozenset version of the previous solid_subset. # solid_selection is not required and will not be converted to solids_to_execute here. # i.e. this function doesn't handle solid queries. # solid_selection is only used to pass the user queries further down. check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str) check.opt_list_param(solid_selection, "solid_selection", of_type=str) if solids_to_execute: if isinstance(pipeline_def, PipelineSubsetDefinition): # for the case when pipeline_def is created by IPipeline or ExternalPipeline check.invariant( solids_to_execute == pipeline_def.solids_to_execute, "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} " "that conflicts with solids_to_execute arg {solids_to_execute}" .format( pipeline_solids_to_execute=str_format_list( pipeline_def.solids_to_execute), solids_to_execute=str_format_list(solids_to_execute), ), ) else: # for cases when `create_run_for_pipeline` is directly called pipeline_def = pipeline_def.get_pipeline_subset_def( solids_to_execute=solids_to_execute) full_execution_plan = execution_plan or create_execution_plan( pipeline_def, run_config=run_config, mode=mode, ) check.invariant( len(full_execution_plan.step_keys_to_execute) == len( full_execution_plan.steps)) if _is_memoized_run(tags): if step_keys_to_execute: raise DagsterInvariantViolationError( "step_keys_to_execute parameter cannot be used in conjunction with memoized " "pipeline runs.") step_keys_to_execute = self.resolve_unmemoized_steps( full_execution_plan, run_config=run_config, mode=mode, ) # TODO: tighter integration with existing step_keys_to_execute functionality subsetted_execution_plan = ( full_execution_plan.build_subset_plan(step_keys_to_execute) if step_keys_to_execute else full_execution_plan) return self.create_run( pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config, mode=check.opt_str_param( mode, "mode", default=pipeline_def.get_default_mode_name()), solid_selection=solid_selection, solids_to_execute=solids_to_execute, step_keys_to_execute=step_keys_to_execute, status=status, tags=tags, root_run_id=root_run_id, parent_run_id=parent_run_id, pipeline_snapshot=pipeline_def.get_pipeline_snapshot(), execution_plan_snapshot=snapshot_from_execution_plan( subsetted_execution_plan, pipeline_def.get_pipeline_snapshot_id()), parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot( ), )
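# Hedged usage sketch: creating (but not launching) a run record for a full pipeline on an
# ephemeral instance, which exercises the plan-snapshotting path above; names are illustrative.
from dagster import DagsterInstance, pipeline, solid

@solid
def do_nothing(_context):
    return None

@pipeline
def tiny_pipeline():
    do_nothing()

instance = DagsterInstance.ephemeral()
run = instance.create_run_for_pipeline(pipeline_def=tiny_pipeline, run_config={})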
def define_dagstermill_solid( name, notebook_path, input_defs=None, output_defs=None, config_schema=None, required_resource_keys=None, output_notebook=None, asset_key_prefix=None, description=None, tags=None, ): """Wrap a Jupyter notebook in a solid. Arguments: name (str): The name of the solid. notebook_path (str): Path to the backing notebook. input_defs (Optional[List[InputDefinition]]): The solid's inputs. output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should call :py:func:`~dagstermill.yield_result` to yield each of these outputs. required_resource_keys (Optional[Set[str]]): The string names of any required resources. output_notebook (Optional[str]): If set, will be used as the name of an injected output of type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on the pipeline resources via the "file_manager" resource key, so, e.g., if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a : py:class:`~dagster_aws.s3.S3FileHandle`. asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the asset keys for materialized notebooks. description (Optional[str]): If set, description used for solid. tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid. Dagster uses the tag keys `notebook_path` and `kind`, which cannot be overwritten by the user. Returns: :py:class:`~dagster.SolidDefinition` """ check.str_param(name, "name") check.str_param(notebook_path, "notebook_path") input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition) output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition) required_resource_keys = check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str) if output_notebook is not None: required_resource_keys.add("file_manager") if isinstance(asset_key_prefix, str): asset_key_prefix = [asset_key_prefix] asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str) default_description = f"This solid is backed by the notebook at {notebook_path}" description = check.opt_str_param(description, "description", default=default_description) user_tags = validate_tags(tags) if tags is not None: check.invariant( "notebook_path" not in tags, "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster", ) check.invariant( "kind" not in tags, "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster", ) default_tags = {"notebook_path": notebook_path, "kind": "ipynb"} return SolidDefinition( name=name, input_defs=input_defs, compute_fn=_dm_solid_compute(name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix), output_defs=output_defs + ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)] if output_notebook else []), config_schema=config_schema, required_resource_keys=required_resource_keys, description=description, tags={ **user_tags, **default_tags }, )
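# Hedged sketch of the tag handling above: user-supplied tags are merged with the reserved
# "notebook_path" and "kind" tags, which must not be passed by the caller; values are illustrative.
from dagstermill import define_dagstermill_solid

analyze = define_dagstermill_solid(
    name="analyze",
    notebook_path="notebooks/analyze.ipynb",  # hypothetical path
    description="Analyzes the cleaned data.",
    tags={"team": "analytics"},  # passing "notebook_path" or "kind" here would fail the invariant
)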
def __init__(self, config_type, func, required_resource_keys):
    self._config_type = check.inst_param(config_type, 'config_type', ConfigType)
    self._func = check.callable_param(func, 'func')
    self._required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str)
def create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    metadata=None,
    description=None,
):
    metadata = check.opt_dict_param(metadata, 'metadata')
    input_tables = check.opt_list_param(
        input_tables, 'input_tables', of_type=LakehouseTableInputDefinition)
    other_input_defs = check.opt_list_param(
        other_input_defs, 'other_input_defs', of_type=InputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str)

    table_type = define_python_dagster_type(
        python_type=ITableHandle, name=name, description=description)
    table_type_inst = table_type.inst()

    table_input_dict = {input_table.name: input_table for input_table in input_tables}
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs, ['context'])

    def _compute(context, inputs):
        '''Workhorse function of lakehouse.

        The inputs are something that inherits from ITableHandle. This compute_fn:
        (1) Iterates over input tables and asks the lakehouse resource to hydrate their contents,
            or a representation of their contents (e.g. a pyspark dataframe), into memory for
            computation.
        (2) Passes those into the lakehouse table function, which does the actual work.
        (3) Passes the output of the lakehouse function to the lakehouse materialize function.
        (4) Yields a materialization if the lakehouse function returned one.

        There's an argument that the hydrate and materialize functions should return a stream of
        events, but that started to feel like implementing what should be a framework feature.
        '''
        check.inst_param(context.resources.lakehouse, 'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name))
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[input_name] = context.resources.lakehouse.hydrate(
                    context,
                    input_type,
                    table_def_of_type(context.pipeline_def, input_type.name).metadata,
                    table_handle,
                    metadata,
                )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables, **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type_inst, metadata, computed_output)

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        metadata=metadata,
        description=description,
    )
def __init__(self, name=None, required_resource_keys=None):
    self.name = check.opt_str_param(name, 'name')
    self.required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys')
def __init__( self, type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, materializer=None, serialization_strategy=None, auto_plugins=None, required_resource_keys=None, kind=DagsterTypeKind.REGULAR, ): check.opt_str_param(key, "key") check.opt_str_param(name, "name") check.invariant(not (name is None and key is None), "Must set key or name") if name is None: check.param_invariant( bool(key), "key", "If name is not provided, must provide key.", ) self.key, self._name = key, None elif key is None: check.param_invariant( bool(name), "name", "If key is not provided, must provide name.", ) self.key, self._name = name, name else: check.invariant(key and name) self.key, self._name = key, name self.description = check.opt_str_param(description, "description") self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader) self.materializer = check.opt_inst_param( materializer, "materializer", DagsterTypeMaterializer ) self.serialization_strategy = check.opt_inst_param( serialization_strategy, "serialization_strategy", SerializationStrategy, PickleSerializationStrategy(), ) self.required_resource_keys = check.opt_set_param( required_resource_keys, "required_resource_keys", ) self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn") _validate_type_check_fn(self._type_check_fn, self._name) auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type) check.param_invariant( all( issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins ), "auto_plugins", ) self.auto_plugins = auto_plugins self.is_builtin = check.bool_param(is_builtin, "is_builtin") check.invariant( self.display_name is not None, "All types must have a valid display name, got None for key {}".format(key), ) self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
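# Hedged sketch: a custom DagsterType whose type_check_fn accepts only positive integers; the
# display name is derived from the provided name, satisfying the invariant checked above.
from dagster import DagsterType

def _is_positive_int(_context, value):
    return isinstance(value, int) and value > 0

PositiveInt = DagsterType(name="PositiveInt", type_check_fn=_is_positive_int)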
def __init__(self, name: Optional[str] = None, required_resource_keys: Optional[Set[str]] = None):
    self.name = check.opt_str_param(name, "name")
    self.required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys")