def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    environment_dict=None,
    handle_kwargs=None,
    run_config_kwargs=None,
    solid_subset=None,
    solid_handle_kwargs=None,
):
    '''Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the ``injected
    parameters`` cell of a dagstermill output notebook. Users should not call this function
    interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.

    Note that ``handle_kwargs``, ``run_config_kwargs`` and ``solid_handle_kwargs`` default to
    ``None`` in the signature but are validated as dicts, so they must be supplied.

    Returns:
        The ``DagstermillExecutionContext`` that is also stored on ``self.context``.

    Raises:
        DagstermillError: If a handle cannot be built from ``handle_kwargs``.
    '''
    check.opt_str_param(output_log_path, 'output_log_path')
    check.opt_str_param(marshal_dir, 'marshal_dir')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.dict_param(run_config_kwargs, 'run_config_kwargs')
    check.dict_param(handle_kwargs, 'handle_kwargs')
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')

    try:
        handle = load_handle.handle_for_pipeline_cli_args(
            handle_kwargs, use_default_repository_yaml=False
        )
    except (check.CheckError, load_handle.CliUsageError) as err:
        six.raise_from(
            DagstermillError(
                'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                'through dagster-graphql, or in-memory after loading it through an '
                'ExecutionTargetHandle.'
            ),
            err,
        )

    # Build the full definition from the handle, type-check it, then narrow to the subset.
    built_definition = handle.build_pipeline_definition()
    definition_label = 'pipeline_def (from handle {handle_dict})'.format(
        handle_dict=handle.data._asdict()
    )
    checked_definition = check.inst_param(built_definition, definition_label, PipelineDefinition)
    pipeline_def = checked_definition.build_sub_pipeline(solid_subset)

    solid_def = pipeline_def.get_solid(SolidHandle.from_dict(solid_handle_kwargs))

    base_run_config = RunConfig(**run_config_kwargs)
    # since we are rehydrating the SqliteEventSink we will skip the db init
    event_sink = SqliteEventSink(output_log_path, skip_db_init=True)
    run_config = base_run_config.with_event_sink(event_sink)

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        run_config,
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(pipeline_context)
        return self.context
def __new__(cls, data):
    '''Create the entry data, requiring ``data`` to be a dict with string keys.'''
    validated_data = check.dict_param(data, 'data', key_type=str)
    return super(JsonMetadataEntryData, cls).__new__(cls, validated_data)
def _cli_get_user_process_api(kwargs):
    '''Select the user process API from CLI kwargs.

    Returns ``UserProcessApi.GRPC`` when ``kwargs`` has a truthy ``'grpc'`` entry and
    ``UserProcessApi.CLI`` otherwise.
    '''
    check.dict_param(kwargs, 'kwargs')
    if kwargs.get('grpc'):
        return UserProcessApi.GRPC
    return UserProcessApi.CLI
def test_dict_param():
    """check.dict_param accepts dicts (including frozendict) and rejects non-dict values."""
    assert check.dict_param({}, "dict_param") == {}
    assert check.dict_param(frozendict(), "dict_param") == {}

    populated = {"a": 2}
    assert check.dict_param(populated, "dict_param") == populated

    # None, ints, strings and lists must all be rejected.
    for not_a_dict in (None, 0, 1, "foo", ["foo"], []):
        with pytest.raises(ParameterCheckError):
            check.dict_param(not_a_dict, "dict_param")
def pipeline_initialization_event_generator(
    execution_plan,
    environment_dict,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    system_storage_data=None,
    raise_on_error=False,
):
    '''Generator that sets up resources, yields the pipeline context, then tears down.

    On the success path it yields, in order: the resource setup events, the constructed
    pipeline context, and the resource teardown events.

    If a ``DagsterError`` is raised before the pipeline context exists, a
    ``pipeline_init_failure`` event is yielded instead, followed by teardown events when a
    resources manager had already been created; the error is re-raised afterwards only if
    ``raise_on_error`` is set. If the error is raised after the context exists, it is always
    re-raised.
    '''
    execution_plan = check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()
    environment_dict = check.dict_param(environment_dict, 'environment_dict', key_type=str)
    pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    instance = check.inst_param(instance, 'instance', DagsterInstance)
    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, 'scoped_resources_builder_cm'
    )
    system_storage_data = check.opt_inst_param(
        system_storage_data, 'system_storage_data', SystemStorageData
    )
    raise_on_error = check.bool_param(raise_on_error, 'raise_on_error')

    # Both stay None until created, so the except block can tell how far setup got.
    pipeline_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan, environment_dict, pipeline_run, instance,
        )
        executor_config = create_executor_config(context_creation_data)
        log_manager = create_log_manager(context_creation_data)
        resources_manager = scoped_resources_builder_cm(
            execution_plan,
            context_creation_data.environment_config,
            context_creation_data.pipeline_run,
            log_manager,
            context_creation_data.resource_keys_to_init,
        )
        for event in resources_manager.generate_setup_events():
            yield event
        scoped_resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder
        )
        system_storage_data = create_system_storage_data(
            context_creation_data, system_storage_data, scoped_resources_builder
        )
        pipeline_context = construct_pipeline_execution_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            system_storage_data=system_storage_data,
            log_manager=log_manager,
            executor_config=executor_config,
            raise_on_error=raise_on_error,
        )
        _validate_plan_with_context(pipeline_context, execution_plan)

        # The consumer works with the context here; teardown runs once it resumes us.
        yield pipeline_context
        for event in resources_manager.generate_teardown_events():
            yield event
    except DagsterError as dagster_error:
        if pipeline_context is None:
            # Failure happened during initialization, before a context was available.
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists if is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error
                else sys.exc_info()
            )
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)
            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(instance, pipeline_run, pipeline_def),
            )
            # Resources may have been partially set up; give them a chance to clean up.
            if resources_manager:
                for event in resources_manager.generate_teardown_events():
                    yield event
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
def ensure_single_item(ddict):
    '''Return the sole ``(key, value)`` pair of a dict that must contain exactly one item.'''
    check.dict_param(ddict, 'ddict')
    has_exactly_one = len(ddict) == 1
    check.param_invariant(has_exactly_one, 'ddict', 'Expected dict with single item')
    items = list(ddict.items())
    return items[0]
def event_generator(
    self,
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    intermediate_storage=None,
    raise_on_error=False,
):
    """Generator that sets up resources, yields the execution context, then tears down.

    On the success path it yields, in order: the resource setup events, the context built by
    ``self.construct_context``, and the resource teardown events.

    If a ``DagsterError`` is raised before the execution context exists, a
    ``pipeline_init_failure`` event is yielded instead, followed by teardown events when a
    resources manager had already been created; the error is re-raised afterwards only if
    ``raise_on_error`` is set. If the error is raised after the context exists, it is always
    re-raised.
    """
    execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()
    run_config = check.dict_param(run_config, "run_config", key_type=str)
    pipeline_run = check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    instance = check.inst_param(instance, "instance", DagsterInstance)
    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, "scoped_resources_builder_cm"
    )
    # Report the real parameter name so a failed check points at the right argument.
    intermediate_storage = check.opt_inst_param(
        intermediate_storage, "intermediate_storage", IntermediateStorage
    )
    raise_on_error = check.bool_param(raise_on_error, "raise_on_error")

    # Both stay None until created, so the except block can tell how far setup got.
    execution_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
        )
        log_manager = create_log_manager(context_creation_data)
        resources_manager = scoped_resources_builder_cm(
            execution_plan,
            context_creation_data.environment_config,
            context_creation_data.pipeline_run,
            log_manager,
            context_creation_data.resource_keys_to_init,
            instance,
        )
        yield from resources_manager.generate_setup_events()
        scoped_resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder
        )
        intermediate_storage = create_intermediate_storage(
            context_creation_data,
            intermediate_storage,
            scoped_resources_builder,
        )
        execution_context = self.construct_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            log_manager=log_manager,
            intermediate_storage=intermediate_storage,
            raise_on_error=raise_on_error,
        )
        _validate_plan_with_context(execution_context, execution_plan)

        # The consumer works with the context here; teardown runs once it resumes us.
        yield execution_context
        yield from resources_manager.generate_teardown_events()
    except DagsterError as dagster_error:
        if execution_context is None:
            # Failure happened during initialization, before a context was available.
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists if is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error
                else sys.exc_info()
            )
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)
            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(
                    instance, pipeline_run, pipeline_def
                ),
            )
            # Resources may have been partially set up; give them a chance to clean up.
            if resources_manager:
                yield from resources_manager.generate_teardown_events()
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
def _execute_plan(self, execute_step_args_packed, executable_dict):
    """Execute the requested steps and return every resulting event in serialized form.

    The returned list starts with the engine event reported for this execution, followed by
    the events produced by ``execute_plan_iterator``, each passed through
    ``serialize_dagster_namedtuple``.
    """
    packed_args = check.dict_param(execute_step_args_packed, "execute_step_args_packed")
    execute_step_args = unpack_value(packed_args)
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(executable_dict, "executable_dict")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retry_mode = execute_step_args.retry_mode

    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.invariant(
        pipeline_run,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=execute_step_args.step_keys_to_execute,
        known_state=execute_step_args.known_state,
    )

    start_message = "Executing steps {} in celery worker".format(step_keys_str)
    start_event_data = EngineEventData(
        [
            EventMetadataEntry.text(step_keys_str, "step_keys"),
            EventMetadataEntry.text(self.request.hostname, "Celery worker"),
        ],
        marker_end=DELEGATE_MARKER,
    )
    engine_event = instance.report_engine_event(
        start_message,
        pipeline_run,
        start_event_data,
        CeleryExecutor,
        step_key=execution_plan.step_handle_for_single_step_plans().to_key(),
    )

    events = [engine_event]
    events.extend(
        execute_plan_iterator(
            execution_plan=execution_plan,
            pipeline=pipeline,
            pipeline_run=pipeline_run,
            instance=instance,
            retry_mode=retry_mode,
            run_config=pipeline_run.run_config,
        )
    )

    return [serialize_dagster_namedtuple(event) for event in events]
def _build_execution_param_list_for_backfill(
    instance,
    partition_data_list,
    backfill_id,
    backfill_params,
    pipeline_selector,
    external_partition_set,
):
    """Build one ``ExecutionParams`` per partition that should run in this backfill.

    Three modes are selected by ``backfill_params``:

    * neither ``fromFailure`` nor ``reexecutionSteps`` set: full pipeline execution for
      every partition;
    * ``fromFailure`` set: only partitions whose last run has status FAILURE, tagged with
      ``RESUME_RETRY_TAG`` and linked to that run;
    * otherwise (``reexecutionSteps`` set): only partitions whose last run has status
      SUCCESS, re-executing the listed steps and linked to that run.

    Partitions whose last run is missing or has a different status are skipped in the two
    re-execution modes.
    """
    check.inst_param(instance, "instance", DagsterInstance)
    check.list_param(
        partition_data_list, "partition_data_list", of_type=ExternalPartitionExecutionParamData
    )
    check.str_param(backfill_id, "backfill_id")
    check.dict_param(backfill_params, "backfill_params")
    check.inst_param(pipeline_selector, "pipeline_selector", PipelineSelector)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)

    backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
    execution_tags = {}
    for tag in backfill_params.get("tags", []):
        execution_tags[tag["key"]] = tag["value"]

    from_failure = backfill_params.get("fromFailure")
    reexecution_steps = backfill_params.get("reexecutionSteps")

    execution_param_list = []
    for partition_data in partition_data_list:
        tags = merge_dicts(merge_dicts(partition_data.tags, backfill_tags), execution_tags)

        if from_failure or reexecution_steps:
            # Re-execution modes depend on the outcome of the partition's last run.
            last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
            if from_failure:
                required_status = PipelineRunStatus.FAILURE
            else:
                # partial reexecution from success
                required_status = PipelineRunStatus.SUCCESS
            if not last_run or last_run.status != required_status:
                continue

            if from_failure:
                tags = merge_dicts(tags, {RESUME_RETRY_TAG: "true"})
                step_keys = None
            else:
                step_keys = reexecution_steps

            execution_metadata = ExecutionMetadata(
                run_id=None,
                tags=tags,
                root_run_id=last_run.root_run_id or last_run.run_id,
                parent_run_id=last_run.run_id,
            )
        else:
            # full pipeline execution
            execution_metadata = ExecutionMetadata(run_id=None, tags=tags)
            step_keys = None

        execution_param_list.append(
            ExecutionParams(
                selector=pipeline_selector,
                run_config=partition_data.run_config,
                mode=external_partition_set.mode,
                execution_metadata=execution_metadata,
                step_keys=step_keys,
            )
        )

    return execution_param_list
def build_resources(
    resources: Dict[str, Any],
    instance: Optional[DagsterInstance] = None,
    resource_config: Optional[Dict[str, Any]] = None,
    pipeline_run: Optional[PipelineRun] = None,
    log_manager: Optional[DagsterLogManager] = None,
) -> Generator[Resources, None, None]:
    """Context manager that yields resources using provided resource definitions and run config.

    This API allows for using resources in an independent context. Resources will be initialized
    with the provided run config, and optionally, pipeline_run. The resulting resources will be
    yielded on a dictionary keyed identically to that provided for `resources`. Upon exiting the
    context, resources will also be torn down safely.

    Args:
        resources (Dict[str, Any]): Resource instances or definitions to build. All
            required resource dependencies to a given resource must be contained within this
            dictionary, or the resource build will fail.
        instance (Optional[DagsterInstance]): The dagster instance configured to instantiate
            resources on.
        resource_config (Optional[Dict[str, Any]]): A dict representing the config to be
            provided to each resource during initialization and teardown.
        pipeline_run (Optional[PipelineRun]): The pipeline run to provide during resource
            initialization and teardown. If the provided resources require either the
            `pipeline_run` or `run_id` attributes of the provided context during resource
            initialization and/or teardown, this must be provided, or initialization will fail.
        log_manager (Optional[DagsterLogManager]): Log Manager to use during resource
            initialization. Defaults to system log manager.

    Examples:

    .. code-block:: python

        from dagster import resource, build_resources

        @resource
        def the_resource():
            return "foo"

        with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:
            assert resources.from_def == "foo"
            assert resources.from_val == "bar"

    """
    # Use the actual parameter name so a failed check points callers at ``resources``.
    resources = check.dict_param(resources, "resources", key_type=str)
    instance = check.opt_inst_param(instance, "instance", DagsterInstance)
    resource_config = check.opt_dict_param(resource_config, "resource_config", key_type=str)
    log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)

    resource_defs = wrap_resources_for_execution(resources)
    mapped_resource_config = _get_mapped_resource_config(resource_defs, resource_config)

    with ephemeral_instance_if_missing(instance) as dagster_instance:
        resources_manager = resource_initialization_manager(
            resource_defs=resource_defs,
            resource_configs=mapped_resource_config,
            log_manager=log_manager if log_manager else initialize_console_manager(pipeline_run),
            execution_plan=None,
            pipeline_run=pipeline_run,
            resource_keys_to_init=set(resource_defs.keys()),
            instance=dagster_instance,
            emit_persistent_events=False,
            pipeline_def_for_backwards_compat=None,
        )
        try:
            # Drain the setup generator; the events themselves are discarded here.
            list(resources_manager.generate_setup_events())
            instantiated_resources = check.inst(
                resources_manager.get_object(), ScopedResourcesBuilder
            )
            yield instantiated_resources.build(
                set(instantiated_resources.resource_instance_dict.keys())
            )
        finally:
            # Always drain teardown, even if setup or the caller's body raised.
            list(resources_manager.generate_teardown_events())
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    environment_dict=None,
    handle_kwargs=None,
    pipeline_run_dict=None,
    solid_subset=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
):
    '''Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the ``injected
    parameters`` cell of a dagstermill output notebook. Users should not call this function
    interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.

    Note that ``handle_kwargs``, ``pipeline_run_dict``, ``solid_handle_kwargs`` and
    ``instance_ref_dict`` default to ``None`` in the signature but are validated as dicts, so
    they must be supplied.

    Returns:
        The ``DagstermillExecutionContext`` that is also stored on ``self.context``.

    Raises:
        DagstermillError: If a handle cannot be built from ``handle_kwargs``, or if the
            ``DagsterInstance`` cannot be resolved from ``instance_ref_dict``.
    '''
    check.opt_str_param(output_log_path, 'output_log_path')
    check.opt_str_param(marshal_dir, 'marshal_dir')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
    check.dict_param(handle_kwargs, 'handle_kwargs')
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
    check.dict_param(instance_ref_dict, 'instance_ref_dict')

    try:
        handle = load_handle.handle_for_pipeline_cli_args(
            handle_kwargs, use_default_repository_yaml=False
        )
    except (check.CheckError, load_handle.UsageError) as err:
        six.raise_from(
            DagstermillError(
                'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                'through dagster-graphql, or in-memory after loading it through an '
                'ExecutionTargetHandle.'
            ),
            err,
        )

    try:
        instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
    except Exception as err:  # pylint: disable=broad-except
        six.raise_from(
            DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            ),
            err,
        )

    # Build the full definition from the handle, type-check it, then narrow to the subset.
    built_definition = handle.build_pipeline_definition()
    definition_label = 'pipeline_def (from handle {handle_dict})'.format(
        handle_dict=handle.data._asdict()
    )
    checked_definition = check.inst_param(built_definition, definition_label, PipelineDefinition)
    pipeline_def = checked_definition.build_sub_pipeline(solid_subset)

    solid_def = pipeline_def.get_solid(SolidHandle.from_dict(solid_handle_kwargs))

    pipeline_run = unpack_value(pipeline_run_dict)

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        pipeline_run,
        instance=instance,
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(pipeline_context)
        return self.context
def from_dict(config):
    """Build a ``ResourceConfig`` from a string-keyed dict.

    The dict's ``"config"`` entry is passed through as the resource config; ``None`` is used
    when the key is absent.
    """
    check.dict_param(config, "config", key_type=str)
    resource_config_value = config.get("config")
    return ResourceConfig(config=resource_config_value)
def dagster_event_from_dict(event_dict, pipeline_name):
    '''Convert an event dict (keyed by GraphQL-style field names) into a ``DagsterEvent``.

    The event type is looked up in ``HANDLED_EVENTS`` via ``event_dict['__typename']``, and
    the type-specific payload is rebuilt from the matching fields. Event types without a
    dedicated branch get ``event_specific_data=None``.

    Args:
        event_dict (dict): String-keyed dict describing the event.
        pipeline_name (str): Name of the pipeline the event belongs to.

    Returns:
        DagsterEvent: The reconstructed event, with ``logging_tags`` set to ``None``.

    Raises:
        Exception: If ``event_dict['__typename']`` is not present in ``HANDLED_EVENTS``.
    '''
    check.dict_param(event_dict, 'event_dict', key_type=str)
    check.str_param(pipeline_name, 'pipeline_name')

    def _metadata_entries():
        # Tolerate a missing/empty 'metadataEntries' uniformly across every event type.
        return list(event_metadata_entries(event_dict.get('metadataEntries')) or [])

    def _type_check_data():
        # Shared by STEP_OUTPUT and STEP_INPUT, which carry the same type-check payload.
        type_check = event_dict['typeCheck']
        return TypeCheckData(
            success=type_check['success'],
            label=type_check['label'],
            description=event_dict.get('description'),
            metadata_entries=_metadata_entries(),
        )

    # Get event_type
    event_type = HANDLED_EVENTS.get(event_dict['__typename'])
    if not event_type:
        raise Exception('unhandled event type %s' % event_dict['__typename'])

    # Get event_specific_data
    event_specific_data = None
    if event_type == DagsterEventType.STEP_OUTPUT:
        event_specific_data = StepOutputData(
            step_output_handle=StepOutputHandle(
                event_dict['step']['key'], event_dict['outputName']
            ),
            type_check_data=_type_check_data(),
        )

    elif event_type == DagsterEventType.STEP_INPUT:
        event_specific_data = StepInputData(
            input_name=event_dict['inputName'],
            type_check_data=_type_check_data(),
        )

    elif event_type == DagsterEventType.STEP_SUCCESS:
        event_specific_data = StepSuccessData(0.0)

    elif event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        event_specific_data = StepRetryData(
            error=error_from_data(event_dict['retryError']),
            seconds_to_wait=event_dict['secondsToWait'],
        )

    elif event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = event_dict['materialization']
        event_specific_data = StepMaterializationData(
            materialization=materialization_from_data(materialization)
        )

    elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = expectation_result_from_data(event_dict['expectationResult'])
        event_specific_data = StepExpectationResultData(expectation_result)

    elif event_type == DagsterEventType.STEP_FAILURE:
        error = error_from_data(event_dict['error'])
        failure_metadata = event_dict.get('failureMetadata')
        if failure_metadata:
            user_failure_data = UserFailureData(
                label=failure_metadata['label'],
                description=failure_metadata['description'],
                metadata_entries=_metadata_entries(),
            )
        else:
            user_failure_data = None
        event_specific_data = StepFailureData(error, user_failure_data)

    elif event_type == DagsterEventType.ENGINE_EVENT:
        engine_error = event_dict.get('engineError')
        event_specific_data = EngineEventData(
            metadata_entries=_metadata_entries(),
            marker_start=event_dict.get('markerStart'),
            marker_end=event_dict.get('markerEnd'),
            error=error_from_data(engine_error) if engine_error else None,
        )

    # We should update the GraphQL response so that clients don't need to do this handle parsing.
    # See: https://github.com/dagster-io/dagster/issues/1559
    handle = None
    step_key = None
    step_kind_value = None
    step = event_dict.get('step')
    if step:
        step_key = step['key']
        step_kind_value = step['kind']
        # 'solidHandleID' is dot-separated from outermost to innermost; nest handles in order.
        for solid_name in step['solidHandleID'].split('.'):
            handle = SolidHandle(solid_name, parent=handle)

    return DagsterEvent(
        event_type_value=event_type.value,
        pipeline_name=pipeline_name,
        step_key=step_key,
        solid_handle=handle,
        step_kind_value=step_kind_value,
        logging_tags=None,
        event_specific_data=event_specific_data,
    )
def __init__(self, run_id, tags, loggers):
    '''Validate and store the run id (str), tags (dict) and loggers (list of logging.Logger).'''
    checked_run_id = check.str_param(run_id, 'run_id')
    checked_tags = check.dict_param(tags, 'tags')
    checked_loggers = check.list_param(loggers, 'loggers', of_type=logging.Logger)

    self.run_id = checked_run_id
    self.tags = checked_tags
    self.loggers = checked_loggers
def test_dict_param():
    '''check.dict_param accepts dicts and rejects non-dict values.'''
    assert check.dict_param({}, 'dict_param') == {}

    populated = {'a': 2}
    assert check.dict_param(populated, 'dict_param') == populated

    # None, ints, strings and lists must all be rejected.
    for not_a_dict in (None, 0, 1, 'foo', ['foo'], []):
        with pytest.raises(ParameterCheckError):
            check.dict_param(not_a_dict, 'dict_param')
def __new__(cls, run_id, tags):
    '''Create the metadata from an optional str ``run_id`` and a str-to-str ``tags`` dict.'''
    checked_run_id = check.opt_str_param(run_id, 'run_id')
    checked_tags = check.dict_param(tags, 'tags', key_type=str, value_type=str)
    return super(ExecutionMetadata, cls).__new__(cls, checked_run_id, checked_tags)
def __init__(self, *args, **kwargs):
    '''Initialise through the parent constructor, then require str keys and str values.'''
    parent = super(frozentags, self)
    parent.__init__(*args, **kwargs)
    # Validate the constructed mapping itself rather than the raw constructor arguments.
    check.dict_param(
        self,
        'self',
        key_type=str,
        value_type=str,
    )
def _execute_step_docker(
    self,
    execute_step_args_packed,
    docker_config,
):
    """Run step execution in a Docker container.

    Unpacks the step arguments, picks the image (``docker_config["image"]`` if set, otherwise
    the container image recorded on the repository origin), optionally logs in to the
    configured registry, reports an engine event, and runs ``dagster api execute_step`` in
    the container.

    Returns:
        list: The serialized engine event followed by each non-empty line of the container's
        output.

    Raises:
        Exception: If no docker image is specified by either source.
        docker.errors.ContainerError: Re-raised after an engine event describing the failure
            has been reported.
    """
    execute_step_args = unpack_value(
        check.dict_param(execute_step_args_packed, "execute_step_args_packed")
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(docker_config, "docker_config")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    command = "dagster api execute_step {}".format(json.dumps(input_json))

    # An explicitly configured image wins; otherwise fall back to the repository's image.
    docker_image = (
        docker_config.get("image")
        or execute_step_args.pipeline_origin.repository_origin.container_image
    )
    if not docker_image:
        raise Exception("No docker image specified by either the job or the repository")

    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=execute_step_args.step_keys_to_execute[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    # Only the explicitly listed variables are copied from this worker's environment.
    docker_env = {
        env_name: os.getenv(env_name) for env_name in docker_config.get("env_vars") or []
    }

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )
        res = docker_response.decode("utf-8")
    except docker.errors.ContainerError as err:
        # NOTE(review): docker-py appears to expose ContainerError.stderr as bytes — confirm
        # that EventMetadataEntry.text accepts it here.
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )
        raise

    # ``decode`` always returns a str, so no None check is needed; keep non-empty lines only.
    serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
def test_dict_param(kwargs, should_succeed):
    """check.dict_param returns ``kwargs["obj"]`` on success and raises CheckError otherwise."""
    if not should_succeed:
        with pytest.raises(CheckError):
            check.dict_param(**kwargs, param_name="name")
        return

    assert check.dict_param(**kwargs, param_name="name") == kwargs["obj"]
def _input_schema(_context, value):
    '''Return ``value`` unchanged once it is a dict whose keys are exactly ``field_names``.'''
    check.dict_param(value, 'value')
    provided_keys = set(value.keys())
    check.param_invariant(provided_keys == field_names, 'value')
    return value
def _expand_fields_dict(original_root, fields, stack):
    '''Convert every value of ``fields``, keyed by the same names.

    Each value is passed to ``_convert_potential_field`` together with ``stack`` extended by
    the field's name.
    '''
    check.dict_param(fields, 'fields')
    expanded = {}
    for field_name, potential_field in fields.items():
        expanded[field_name] = _convert_potential_field(
            original_root, potential_field, stack + [field_name]
        )
    return expanded
def graph(
    name: Optional[Union[Callable[..., Any], str]] = None,
    description: Optional[str] = None,
    input_defs: Optional[List[InputDefinition]] = None,
    output_defs: Optional[List[OutputDefinition]] = None,
    ins: Optional[Dict[str, GraphIn]] = None,
    out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = None,
    tags: Optional[Dict[str, Any]] = None,
    config: Optional[Union[ConfigMapping, Dict[str, Any]]] = None,
) -> Union[GraphDefinition, _Graph]:
    """Create a graph with the specified parameters from the decorated composition function.

    Using this decorator allows you to build up a dependency graph by writing a
    function that invokes ops (or other graphs) and passes the output to subsequent invocations.

    Args:
        name (Optional[str]):
            The name of the graph. Must be unique within any :py:class:`RepositoryDefinition`
            containing the graph.
        description (Optional[str]):
            A human-readable description of the graph.
        input_defs (Optional[List[InputDefinition]]):
            Information about the inputs that this graph maps. Information provided here
            will be combined with what can be inferred from the function signature, with these
            explicit InputDefinitions taking precedence.

            Uses of inputs in the body of the decorated composition function will determine
            the :py:class:`InputMappings <InputMapping>` passed to the underlying
            :py:class:`GraphDefinition`.
        output_defs (Optional[List[OutputDefinition]]):
            Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.

            Uses of these outputs in the body of the decorated composition function, as well as the
            return value of the decorated function, will be used to infer the appropriate set of
            :py:class:`OutputMappings <OutputMapping>` for the underlying
            :py:class:`GraphDefinition`.

            To map multiple outputs, return a dictionary from the composition function.
        ins (Optional[Dict[str, GraphIn]]):
            Information about the inputs that this graph maps. Information provided here
            will be combined with what can be inferred from the function signature, with these
            explicit GraphIn taking precedence.
        out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):
            Information about the outputs that this graph maps. Information provided here will be
            combined with what can be inferred from the return type signature if the function does
            not use yield.

            To map multiple outputs, return a dictionary from the composition function.
        tags (Optional[Dict[str, Any]]):
            Arbitrary metadata for any execution run of the graph.
            Values that are not strings will be json encoded and must meet the criteria that
            `json.loads(json.dumps(value)) == value`.  These tag values may be overwritten by tag
            values provided at invocation time.
        config (Optional[Union[ConfigMapping, Dict[str, Any]]]):
            Either a :py:class:`ConfigMapping`, which is used as given, or a string-keyed dict,
            which is wrapped in a :py:class:`ConfigMapping` whose config function always
            returns that dict.
    """
    # Bare ``@graph`` usage: ``name`` is actually the decorated function.
    if callable(name):
        check.invariant(description is None)
        return _Graph()(name)

    if config is None or isinstance(config, ConfigMapping):
        # Nothing supplied, or an actual config mapping: pass it through untouched.
        config_mapping = config
    else:
        # A dictionary of config is provided, convert to config mapping.
        config_dict = check.dict_param(config, "config", key_type=str)
        config_mapping = ConfigMapping(config_fn=lambda _: config_dict, config_schema=None)

    return _Graph(
        name=name,
        description=description,
        input_defs=input_defs,
        output_defs=output_defs,
        ins=ins,
        out=out,
        tags=tags,
        config_mapping=config_mapping,
    )
def test_dict_param_with_type():
    """check.dict_param enforces key_type/value_type on contents and ignores them for {}."""
    str_to_int = {"str": 1}

    matching_type_kwargs = [
        {"key_type": str, "value_type": int},
        {"value_type": int},
        {"key_type": str},
        {},
    ]

    for type_kwargs in matching_type_kwargs:
        assert check.dict_param(str_to_int, "str_to_int", **type_kwargs)

    for type_kwargs in matching_type_kwargs:
        assert check.dict_param({}, "str_to_int", **type_kwargs) == {}

    class Wrong(object):
        pass

    # Any mismatch on the key type, the value type, or both must be rejected.
    mismatching_type_kwargs = [
        {"key_type": Wrong, "value_type": Wrong},
        {"key_type": Wrong, "value_type": int},
        {"key_type": str, "value_type": Wrong},
        {"key_type": Wrong},
        {"value_type": Wrong},
    ]

    for type_kwargs in mismatching_type_kwargs:
        with pytest.raises(CheckError):
            assert check.dict_param(str_to_int, "str_to_int", **type_kwargs)
def _validate_resource_dependencies(
    mode_definitions, node_defs, dagster_type_dict, solid_dict, pipeline_hook_defs
):
    """Verify that every mode provides each resource key required by the pipeline's parts.

    For each mode, the required resource keys of the following are compared against the
    keys of ``mode_def.resource_defs``, in this order: node definitions, dagster types
    (delegated to ``_validate_type_resource_deps_for_mode``), system storage definitions,
    intermediate storage definitions, hooks attached to solids, and pipeline-level hooks.

    Raises:
        DagsterInvalidDefinitionError: for the first required resource key that a mode
            does not provide.
    """
    check.list_param(mode_definitions, "mode_definitions", of_type=ModeDefinition)
    check.list_param(node_defs, "node_defs", of_type=NodeDefinition)
    check.dict_param(dagster_type_dict, "dagster_type_dict")
    check.dict_param(solid_dict, "solid_dict")
    check.set_param(pipeline_hook_defs, "pipeline_hook_defs", of_type=HookDefinition)

    # Message templates, one per kind of resource consumer.
    node_message = (
        'Resource "{resource}" is required by solid def {node_def_name}, but is not '
        'provided by mode "{mode_name}".'
    )
    system_storage_message = (
        "Resource '{resource}' is required by system storage "
        "'{storage_name}', but is not provided by mode '{mode_name}'."
    )
    intermediate_storage_message = (
        "Resource '{resource}' is required by intermediate storage "
        "'{storage_name}', but is not provided by mode '{mode_name}'."
    )
    hook_message = (
        'Resource "{resource}" is required by hook "{hook_name}", but is not '
        'provided by mode "{mode_name}".'
    )

    for mode_def in mode_definitions:
        available_keys = set(mode_def.resource_defs.keys())

        for node_def in node_defs:
            for resource_key in node_def.required_resource_keys:
                if resource_key in available_keys:
                    continue
                raise DagsterInvalidDefinitionError(
                    node_message.format(
                        resource=resource_key,
                        node_def_name=node_def.name,
                        mode_name=mode_def.name,
                    )
                )

        _validate_type_resource_deps_for_mode(mode_def, available_keys, dagster_type_dict)

        for storage_def in mode_def.system_storage_defs:
            for resource_key in storage_def.required_resource_keys:
                if resource_key in available_keys:
                    continue
                raise DagsterInvalidDefinitionError(
                    system_storage_message.format(
                        resource=resource_key,
                        storage_name=storage_def.name,
                        mode_name=mode_def.name,
                    )
                )

        # intermediate_storage_defs may be falsy (e.g. None); treat that as "none defined".
        for storage_def in mode_def.intermediate_storage_defs or []:
            for resource_key in storage_def.required_resource_keys:
                if resource_key in available_keys:
                    continue
                raise DagsterInvalidDefinitionError(
                    intermediate_storage_message.format(
                        resource=resource_key,
                        storage_name=storage_def.name,
                        mode_name=mode_def.name,
                    )
                )

        # Hooks attached to individual solids.
        for solid in solid_dict.values():
            for hook_def in solid.hook_defs:
                for resource_key in hook_def.required_resource_keys:
                    if resource_key in available_keys:
                        continue
                    raise DagsterInvalidDefinitionError(
                        hook_message.format(
                            resource=resource_key,
                            hook_name=hook_def.name,
                            mode_name=mode_def.name,
                        )
                    )

        # Hooks applied at the pipeline level.
        for hook_def in pipeline_hook_defs:
            for resource_key in hook_def.required_resource_keys:
                if resource_key in available_keys:
                    continue
                raise DagsterInvalidDefinitionError(
                    hook_message.format(
                        resource=resource_key,
                        hook_name=hook_def.name,
                        mode_name=mode_def.name,
                    )
                )
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    run_config=None,
    executable_dict=None,
    pipeline_run_dict=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
):
    """Reconstitutes a context for dagstermill-managed execution.

    This call appears in the ``injected parameters`` cell of a dagstermill output notebook,
    where it rebuilds the pipeline context. It is not meant to be called interactively,
    other than when debugging an output notebook.

    For interactive exploration and development, call :func:`dagstermill.get_context` in the
    ``parameters`` cell of the notebook instead; when dagstermill executes the notebook, that
    call is replaced by one to :func:`dagstermill.reconstitute_pipeline_context`.

    Returns the context object, which is also stored on ``self.context``.

    Raises:
        DagstermillError: if the DagsterInstance cannot be resolved from
            ``instance_ref_dict``.
    """
    check.opt_str_param(output_log_path, "output_log_path")
    check.opt_str_param(marshal_dir, "marshal_dir")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    # The remaining arguments are all required dicts; validate them in declaration order.
    for required_dict, param_name in (
        (pipeline_run_dict, "pipeline_run_dict"),
        (executable_dict, "executable_dict"),
        (solid_handle_kwargs, "solid_handle_kwargs"),
        (instance_ref_dict, "instance_ref_dict"),
    ):
        check.dict_param(required_dict, param_name)

    recon_pipeline = ReconstructablePipeline.from_dict(executable_dict)
    pipeline_def = recon_pipeline.get_definition()

    try:
        instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
    except Exception as err:  # pylint: disable=broad-except
        six.raise_from(
            DagstermillError(
                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
            ),
            err,
        )

    pipeline_run = unpack_value(pipeline_run_dict)

    handle = SolidHandle.from_dict(solid_handle_kwargs)
    solid_def = pipeline_def.get_solid(handle).definition

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline = recon_pipeline

    execution_plan = create_execution_plan(
        self.pipeline,
        run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    with scoped_pipeline_context(
        execution_plan,
        run_config,
        pipeline_run,
        instance,
        scoped_resources_builder_cm=self._setup_resources,
        # Set this flag even though we're not in test for clearer error reporting
        raise_on_error=True,
    ) as pipeline_context:
        # Config for this solid only; None when the run config has no entry for it.
        per_solid_config = run_config.get("solids", {}).get(solid_def.name, {}).get("config")
        resource_keys = get_required_resource_keys_to_init(
            execution_plan,
            pipeline_context.system_storage_def,
            pipeline_context.intermediate_storage_def,
        )
        self.context = DagstermillRuntimeExecutionContext(
            pipeline_context=pipeline_context,
            solid_config=per_solid_config,
            resource_keys_to_init=resource_keys,
            solid_name=solid_def.name,
        )

    return self.context
def _execute_step_k8s_job(
    self,
    execute_step_args_packed,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    job_wait_timeout,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod.

    Validates the arguments, creates a Kubernetes Job for the single step named in the
    unpacked ``ExecuteStepArgs``, waits for the job, and then collects Dagster events from
    the logs of the job's pods. Progress and failures are reported to the instance as
    engine events along the way.

    Args:
        execute_step_args_packed (dict): Packed ``ExecuteStepArgs``; unpacked with
            ``unpack_value``. Must name exactly one step key.
        job_config_dict: Passed to ``DagsterK8sJobConfig.from_dict``.
        job_namespace (str): Namespace in which the Kubernetes Job is created, awaited,
            and (if necessary) deleted.
        load_incluster_config (bool): When true, ``kubernetes.config.load_incluster_config()``
            is used; otherwise ``kubernetes.config.load_kube_config(kubeconfig_file)``.
        job_wait_timeout: Forwarded to ``wait_for_job_success`` as ``wait_timeout``.
        user_defined_k8s_config_dict: Passed to ``UserDefinedDagsterK8sConfig.from_dict``.
        kubeconfig_file (Optional[str]): Only used when ``load_incluster_config`` is false.

    Returns:
        list: The serialized events gathered by this task (engine events reported here,
        a step failure event if waiting on the job failed, and events filtered from the
        pod logs). An empty list is returned on the early exits: the run is not STARTED,
        job creation fails for a reason other than "Conflict", the run leaves STARTED
        while waiting, an unexpected API error occurs while waiting, or the job's pod
        names cannot be retrieved.
    """
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.invariant(
        len(execute_step_args.step_keys_to_execute) == 1,
        "Celery K8s task executor can only execute 1 step at a time",
    )

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")

    check.bool_param(load_incluster_config, "load_incluster_config")

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)

    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    # Exactly one step key is present (checked by the invariant above).
    step_key = execute_step_args.step_keys_to_execute[0]

    celery_worker_name = self.request.hostname
    # NOTE(review): presumably HOSTNAME is the pod name when the worker runs in
    # Kubernetes; the value is None if the variable is unset.
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry("Celery worker name", value=celery_worker_name),
                MetadataEntry("Celery worker Kubernetes Pod name", value=celery_pod_name),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    # Do not launch anything if the run is no longer in the STARTED state.
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because dagster run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step key", value=step_key),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)

    retry_state = execute_step_args.known_state.get_retry_state()

    # A non-zero attempt count is appended to the job and pod names.
    if retry_state.get_attempt_count(step_key):
        attempt_number = retry_state.get_attempt_count(step_key)
        job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-step-%s" % (k8s_name_key)
        pod_name = "dagster-step-%s" % (k8s_name_key)

    args = execute_step_args.get_command_args()

    job = construct_dagster_k8s_job(
        job_config,
        args,
        job_name,
        user_defined_k8s_config,
        pod_name,
        component="step_worker",
        labels={
            "dagster/job": execute_step_args.pipeline_origin.pipeline_name,
            "dagster/op": step_key,
            "dagster/run-id": execute_step_args.pipeline_run_id,
        },
    )

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    # From here on, use the name recorded on the constructed job object.
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry("Step key", value=step_key),
                MetadataEntry("Kubernetes Job name", value=job_name),
                MetadataEntry("Job image", value=job_config.job_image),
                MetadataEntry("Image pull policy", value=job_config.image_pull_policy),
                MetadataEntry("Image pull secrets", value=str(job_config.image_pull_secrets)),
                MetadataEntry(
                    "Service account name", value=str(job_config.service_account_name)
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)
    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so proceed and see if the existing job succeeded
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, proceeding with existing job.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                        MetadataEntry("Kubernetes Job name", value=job_name),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            # Any other API error: report it with the captured exception info and give up.
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=execute_step_args.pipeline_run_id,
            wait_timeout=job_wait_timeout,
        )
    except (DagsterK8sError, DagsterK8sTimeoutError) as err:
        # Record a step failure event, then fall through so pod logs are still collected.
        step_failure_event = construct_step_failure_event_and_handle(
            pipeline_run, step_key, err, instance=instance
        )
        events.append(step_failure_event)
    except DagsterK8sPipelineStatusException:
        # The run left the STARTED state while waiting: report, delete the job, and stop.
        instance.report_engine_event(
            "Terminating Kubernetes Job because dagster run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step key", value=step_key),
                    MetadataEntry("Kubernetes Job name", value=job_name),
                    MetadataEntry("Kubernetes Job namespace", value=job_namespace),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except (
        DagsterK8sUnrecoverableAPIError,
        DagsterK8sAPIRetryLimitExceeded,
        # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
        # a retry boundary. We still catch it here just in case we missed one so that we can
        # report it to the event log
        kubernetes.client.rest.ApiException,
    ) as err:
        instance.report_engine_event(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step key", value=step_key),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        instance.report_engine_event(
            "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step key", value=step_key),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([MetadataEntry("Pod names", value="\n".join(pod_names))]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    # Gather log lines from every pod; a failure on one pod is reported and skipped.
    logs = []
    for pod_name in pod_names:
        try:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                    job_name, pod_name, step_key
                ),
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )

    # Append the Dagster events found in the pod logs to those reported by this task.
    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def __init__(self, step_dict, deps):
    '''Validate and store the step mapping and its dependency mapping.

    ``step_dict`` must map str keys to ExecutionStep values and ``deps`` must map str
    keys to sets. ``self.steps`` holds the values of ``step_dict`` as a list.
    '''
    checked_steps = check.dict_param(
        step_dict, 'step_dict', value_type=ExecutionStep, key_type=str
    )
    self.step_dict = checked_steps

    checked_deps = check.dict_param(deps, 'deps', value_type=set, key_type=str)
    self.deps = checked_deps

    step_values = step_dict.values()
    self.steps = list(step_values)
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.

    Validates the arguments, launches a Kubernetes Job that runs the ``executePlan``
    GraphQL operation for the single step in ``step_keys``, waits for the job, and parses
    the resulting step events out of the logs of the job's pods.

    Args:
        instance_ref_dict (dict): Passed to ``InstanceRef.from_dict`` to rebuild the instance.
        step_keys (List[str]): Must contain exactly one step key.
        run_config (dict): Sent as ``runConfigData`` in the GraphQL variables.
        mode (str): Sent as ``mode`` in the GraphQL variables.
        repo_name (str): Repository name used in the pipeline selector.
        repo_location_name (str): Repository location name used in the pipeline selector.
        run_id (str): Id of the run to load from the instance.
        job_config_dict: Passed to ``DagsterK8sJobConfig.from_dict``.
        job_namespace (str): Namespace in which the Kubernetes Job is created.
        load_incluster_config (bool): When true, ``kubernetes.config.load_incluster_config()``
            is used; otherwise ``kubernetes.config.load_kube_config(kubeconfig_file)``.
        retries_dict (dict): Passed to ``Retries.from_config``.
        resources (Optional[Dict[str, dict]]): Forwarded to
            ``construct_dagster_graphql_k8s_job``.
        kubeconfig_file (Optional[str]): Only used when ``load_incluster_config`` is false.

    Returns:
        list: Serialized events produced by this task. An empty list is returned when the
        step is not executed because the run is not (or is no longer) in the STARTED
        state, so the result is always a list.
    '''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')

    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)

    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    # Exactly one step key is present (checked by the invariant above).
    step_key = step_keys[0]
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys'),]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        # Return an empty event list rather than None so the result is always a list.
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = _get_k8s_name_key(run_id, step_keys)

    retries = Retries.from_config(retries_dict)

    # A non-zero attempt count is appended to the job and pod names.
    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        job_name = 'dagster-job-%s' % (k8s_name_key)
        pod_name = 'dagster-job-%s' % (k8s_name_key)

    # Variables for the executePlan GraphQL operation run inside the job.
    variables = {
        'executionParams': {
            'runConfigData': run_config,
            'mode': mode,
            'selector': {
                'repositoryLocationName': repo_location_name,
                'repositoryName': repo_name,
                'pipelineName': pipeline_run.pipeline_name,
                'solidSelection': list(pipeline_run.solids_to_execute)
                if pipeline_run.solids_to_execute
                else None,
            },
            'executionMetadata': {'runId': run_id},
            'stepKeys': step_keys,
        },
        'retries': retries.to_graphql_input(),
    }
    args = ['-p', 'executePlan', '-v', seven.json.dumps(variables), '--remap-sigterm']

    job = construct_dagster_graphql_k8s_job(job_config, args, job_name, resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    # From here on, use the name recorded on the constructed job object.
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), 'Image pull secrets'
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

    try:
        wait_for_job_success(
            job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        # The run left the STARTED state while waiting: report, delete the job, and stop.
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        # Return an empty event list rather than None so the result is always a list.
        return []

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    # Collect log lines from every pod of the job. A distinct loop variable is used so
    # the requested pod_name above is not shadowed.
    logs = []
    for job_pod_name in pod_names:
        raw_logs = retrieve_pod_logs(job_pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    # Parse the GraphQL response out of the logs and convert it into step events.
    res = parse_raw_log_lines(logs)
    handle_execution_errors(res, 'executePlan')
    step_events = handle_execute_plan_result(res)

    events += step_events

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def create_execution_structure(solid_defs, dependencies_dict, container_definition):
    '''Build the solid dictionary and the dependency structure from a dependencies dict.

    The keys of ``dependencies_dict`` are either solid names or ``SolidInvocation``
    objects (which may carry an alias); the values map input names to dependency
    definitions. For example, given:

        dep_dict = {
            SolidInvocation('giver'): {},
            SolidInvocation('sleeper', alias='sleeper_1'): {
                'units': DependencyDefinition('giver', 'out_1')
            },
            SolidInvocation('sleeper', alias='sleeper_2'): {
                'units': DependencyDefinition('giver', 'out_2')
            },
            SolidInvocation('total'): {
                'in_1': DependencyDefinition('sleeper_1', 'total'),
                'in_2': DependencyDefinition('sleeper_2', 'total'),
            },
        }

    the resulting solid dictionary is keyed by alias:

        pipeline_solid_dict = {
            'giver': <dagster.core.definitions.dependency.Solid object>,
            'sleeper_1': <dagster.core.definitions.dependency.Solid object>,
            'sleeper_2': <dagster.core.definitions.dependency.Solid object>,
            'total': <dagster.core.definitions.dependency.Solid object>,
        }

    Returns:
        A ``(dependency_structure, pipeline_solid_dict)`` tuple, where the first element
        is a dagster.core.definitions.dependency.DependencyStructure object.
    '''
    from .solid import ISolidDefinition, CompositeSolidDefinition

    check.list_param(solid_defs, 'solid_defs', of_type=ISolidDefinition)
    check.dict_param(
        dependencies_dict,
        'dependencies_dict',
        key_type=six.string_types + (SolidInvocation,),
        value_type=dict,
    )
    # In the context of a pipeline there is no container, so None is allowed here.
    check.opt_inst_param(container_definition, 'container_definition', CompositeSolidDefinition)

    # The dependencies dict re-keyed by alias string instead of SolidInvocation.
    deps_by_alias = {}

    # Bookkeeping: solid name -> every alias used for it, and alias -> invocation / name.
    name_to_aliases = defaultdict(set)
    invocation_by_alias = {}
    name_by_alias = {}

    for raw_key, input_deps in dependencies_dict.items():
        # Plain string keys such as {'foo': ...} are shorthand for SolidInvocation('foo').
        invocation = raw_key if isinstance(raw_key, SolidInvocation) else SolidInvocation(raw_key)

        alias = invocation.alias or invocation.name

        name_to_aliases[invocation.name].add(alias)
        invocation_by_alias[alias] = invocation
        name_by_alias[alias] = invocation.name
        deps_by_alias[alias] = input_deps

        # Every solid referenced as an upstream dependency is registered under its own name.
        for dependency in input_deps.values():
            for upstream in dependency.get_definitions():
                name_to_aliases[upstream.solid].add(upstream.solid)

    pipeline_solid_dict = _build_pipeline_solid_dict(
        solid_defs, name_to_aliases, invocation_by_alias, container_definition
    )

    _validate_dependencies(deps_by_alias, pipeline_solid_dict, name_by_alias)

    dependency_structure = DependencyStructure.from_definitions(
        pipeline_solid_dict, deps_by_alias
    )

    return dependency_structure, pipeline_solid_dict
def __init__(self, ddict):
    '''Check that ``ddict`` maps str keys to Field values, then initialize the base class.'''
    check.dict_param(
        ddict,
        'ddict',
        value_type=Field,
        key_type=str,
    )
    base = super(FieldDefinitionDictionary, self)
    base.__init__(ddict)