def __init__(
    self,
    pipeline: IPipeline,
    environment_config: EnvironmentConfig,
    mode: Optional[str],
    step_keys_to_execute: Optional[List[str]],
    known_state,
):
    """Check the constructor arguments and set up empty plan-building state.

    Args:
        pipeline (IPipeline): Pipeline whose definition supplies the mode definition.
        environment_config (EnvironmentConfig): Stored on the instance after a type check.
        mode (Optional[str]): Name of the mode to look up on the pipeline definition. When
            ``None``, the definition's default mode is used instead.
        step_keys_to_execute (Optional[List[str]]): Type-checked and then stored exactly as
            passed in (``None`` stays ``None``).
        known_state: Stored on the instance without any check.
    """
    self.pipeline = check.inst_param(pipeline, "pipeline", IPipeline)
    self.environment_config = check.inst_param(
        environment_config, "environment_config", EnvironmentConfig
    )

    # These two checks only validate; their return values are deliberately not used.
    check.opt_str_param(mode, "mode")
    check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", str)
    self.step_keys_to_execute = step_keys_to_execute

    definition = pipeline.get_definition()
    if mode is None:
        self.mode_definition = definition.get_default_mode()
    else:
        self.mode_definition = definition.get_mode_definition(mode)

    # Containers that start empty and are filled in by the rest of the class.
    self._steps: Dict[str, ExecutionStepUnion] = OrderedDict()
    self.step_output_map: Dict[
        SolidOutputHandle, Union[StepOutputHandle, UnresolvedStepOutputHandle]
    ] = {}
    self.known_state = known_state
    self._seen_handles: Set[StepHandleUnion] = set()
def execute_run_iterator(
    pipeline: IPipeline, pipeline_run: PipelineRun, instance: DagsterInstance
) -> Iterator[DagsterEvent]:
    """Return an iterator over the events produced by executing an existing run.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute.
        instance (DagsterInstance): The instance used to report events and build the plan.

    Returns:
        Iterator[DagsterEvent]: If the run is already CANCELED, a generator yielding the single
        engine event reported for that case. Otherwise an iterator over an
        ``ExecuteRunWithPlanIterable`` built for the run.
    """
    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    if pipeline_run.status == PipelineRunStatus.CANCELED:
        # This can happen if the run was force-terminated while it was starting
        def gen_execute_on_cancel():
            yield instance.report_engine_event(
                "Not starting execution since the run was canceled before execution could start",
                pipeline_run,
            )

        return gen_execute_on_cancel()

    check.invariant(
        pipeline_run.status in (PipelineRunStatus.NOT_STARTED, PipelineRunStatus.STARTING),
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(
            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status
        ),
    )

    requested_solids = pipeline_run.solids_to_execute
    if requested_solids:
        if not isinstance(pipeline.get_definition(), PipelineSubsetDefinition):
            # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(requested_solids)
        else:
            # The pipeline is already a subset: it has to be the same subset the run asks for.
            check.invariant(
                requested_solids == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that conflicts "
                "with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(requested_solids),
                ),
            )

    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)

    context_manager = PipelineExecutionContextManager(
        pipeline=pipeline,
        execution_plan=execution_plan,
        pipeline_run=pipeline_run,
        instance=instance,
        run_config=pipeline_run.run_config,
        raise_on_error=False,
    )
    run_iterable = ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=pipeline_execution_iterator,
        execution_context_manager=context_manager,
    )
    return iter(run_iterable)
def _resolve_reexecute_step_selection(
    instance: DagsterInstance,
    pipeline: IPipeline,
    mode: Optional[str],
    run_config: Optional[dict],
    parent_pipeline_run: PipelineRun,
    step_selection: List[str],
) -> ExecutionPlan:
    """Build the execution plan for re-executing a selection of steps from a parent run.

    The parent run's plan is rebuilt first (using the parent's run config and the state derived
    from its logs) so that ``step_selection`` can be resolved against its step dependencies.
    The resulting step keys are then used to create the plan that is returned.

    Args:
        instance (DagsterInstance): Used to load the parent run's logs.
        pipeline (IPipeline): The pipeline; subset first if the parent run has a solid selection.
        mode (Optional[str]): Mode passed to both plan constructions.
        run_config (Optional[dict]): Run config for the returned plan.
        parent_pipeline_run (PipelineRun): The run being re-executed from.
        step_selection (List[str]): Selection passed to ``parse_step_selection``.

    Returns:
        ExecutionPlan: The plan restricted to the resolved step keys.
    """
    parent_solid_selection = parent_pipeline_run.solid_selection
    if parent_solid_selection:
        pipeline = pipeline.subset_for_execution(parent_solid_selection)

    parent_run_logs = instance.all_logs(parent_pipeline_run.run_id)

    parent_execution_plan = create_execution_plan(
        pipeline,
        parent_pipeline_run.run_config,
        mode,
        known_state=KnownExecutionState.derive_from_logs(parent_run_logs),
    )
    resolved_step_keys = parse_step_selection(
        parent_execution_plan.get_all_step_deps(), step_selection
    )

    return create_execution_plan(
        pipeline,
        run_config,
        mode,
        step_keys_to_execute=list(resolved_step_keys),
        known_state=KnownExecutionState.for_reexecution(parent_run_logs, resolved_step_keys),
    )
def rebuild_execution_plan_from_snapshot(
    pipeline: IPipeline,
    run_config: Optional[dict],
    mode: Optional[str],
    execution_plan_snapshot: ExecutionPlanSnapshot,
) -> ExecutionPlan:
    """Rebuild an ExecutionPlan from a snapshot instead of constructing it from scratch.

    Args:
        pipeline (IPipeline): The pipeline the plan belongs to.
        run_config (Optional[dict]): Run config used to build the environment config.
        mode (Optional[str]): Mode used to build the environment config.
        execution_plan_snapshot (ExecutionPlanSnapshot): The snapshot to rebuild from.

    Returns:
        ExecutionPlan: The result of ``ExecutionPlan.rebuild_from_snapshot``.
    """
    definition = pipeline.get_definition()
    # The environment config is still built fresh from the definition and run config.
    resolved_environment = EnvironmentConfig.build(definition, run_config, mode=mode)

    return ExecutionPlan.rebuild_from_snapshot(
        pipeline, definition.name, execution_plan_snapshot, resolved_environment
    )
def _check_persistent_storage_requirement(
    pipeline: IPipeline,
    mode_def: ModeDefinition,
    environment_config: EnvironmentConfig,
) -> None:
    """Raise if the configured executor needs persistent outputs that the setup cannot provide.

    Nothing happens unless the executor resolved from the config lists
    ``ExecutorRequirement.PERSISTENT_OUTPUTS`` among its requirements. When it does, the check
    passes if ``can_isolate_steps`` returns a truthy value for the pipeline definition and mode,
    or if the configured intermediate storage definition exists and is persistent.

    Args:
        pipeline (IPipeline): The pipeline being executed.
        mode_def (ModeDefinition): The mode in use.
        environment_config (EnvironmentConfig): Used to resolve the executor and storage defs.

    Raises:
        DagsterUnmetExecutorRequirementsError: If neither of the passing conditions holds.
    """
    from dagster.core.execution.context_creation_pipeline import executor_def_from_config

    pipeline_def = pipeline.get_definition()
    executor_def = executor_def_from_config(mode_def, environment_config)
    if ExecutorRequirement.PERSISTENT_OUTPUTS not in executor_def.requirements:
        return

    storage_def = environment_config.intermediate_storage_def_for_mode(mode_def)

    if can_isolate_steps(pipeline_def, mode_def):
        return
    if storage_def and storage_def.is_persistent:
        return

    raise DagsterUnmetExecutorRequirementsError(
        "You have attempted to use an executor that uses multiple processes, but your pipeline "
        "includes solid outputs that will not be stored somewhere where other processes can "
        "retrieve them. Please use a persistent IO manager for these outputs. E.g. with\n"
        ' @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])'
    )
def execute_run(
    pipeline: IPipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    raise_on_error: bool = False,
) -> PipelineExecutionResult:
    """Run an already-created pipeline run to completion and return its result.

    Blocking counterpart of execute_run_iterator: all events are collected into a list before
    the result object is constructed.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute.
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.

    Raises:
        DagsterInvariantViolationError: If a PipelineDefinition is passed directly, or if the
            run is already in the CANCELED state.
    """
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline."
        )

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    if pipeline_run.status == PipelineRunStatus.CANCELED:
        cancel_message = (
            "Not starting execution since the run was canceled before execution could start"
        )
        instance.report_engine_event(cancel_message, pipeline_run)
        raise DagsterInvariantViolationError(cancel_message)

    check.invariant(
        pipeline_run.status in (PipelineRunStatus.NOT_STARTED, PipelineRunStatus.STARTING),
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(
            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status
        ),
    )

    pipeline_def = pipeline.get_definition()
    requested_solids = pipeline_run.solids_to_execute
    if requested_solids:
        if not isinstance(pipeline_def, PipelineSubsetDefinition):
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(requested_solids)
        else:
            # Already a subset: it must match the subset recorded on the run.
            check.invariant(
                requested_solids == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(requested_solids),
                ),
            )

    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)

    if is_memoized_run(pipeline_run.tags):
        resolved_run_config = ResolvedRunConfig.build(
            pipeline.get_definition(), pipeline_run.run_config, pipeline_run.mode
        )
        execution_plan = resolve_memoized_execution_plan(
            execution_plan,
            pipeline.get_definition(),
            pipeline_run.run_config,
            instance,
            resolved_run_config,
        )

    # Shared dict: handed to the orchestration context and to the result object.
    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}

    orchestration_context_manager = PlanOrchestrationContextManager(
        context_event_generator=orchestration_context_event_generator,
        pipeline=pipeline,
        execution_plan=execution_plan,
        pipeline_run=pipeline_run,
        instance=instance,
        run_config=pipeline_run.run_config,
        raise_on_error=raise_on_error,
        executor_defs=None,
        output_capture=output_capture,
    )
    _execute_run_iterable = ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=pipeline_execution_iterator,
        execution_context_manager=orchestration_context_manager,
    )
    # Draining the iterable is what actually performs the execution.
    event_list = list(_execute_run_iterable)

    def _reconstruct_context():
        return scoped_pipeline_context(
            execution_plan,
            pipeline,
            pipeline_run.run_config,
            pipeline_run,
            instance,
        )

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        _reconstruct_context,
        output_capture=output_capture,
    )
def execute_run(
    pipeline: IPipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    raise_on_error: bool = False,
) -> PipelineExecutionResult:
    """Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.

    Raises:
        DagsterInvariantViolationError: If a PipelineDefinition is passed directly, or if the
            run is already in the CANCELED state.
    """
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline."
        )

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    if pipeline_run.status == PipelineRunStatus.CANCELED:
        message = "Not starting execution since the run was canceled before execution could start"
        instance.report_engine_event(
            message,
            pipeline_run,
        )
        raise DagsterInvariantViolationError(message)

    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED
        or pipeline_run.status == PipelineRunStatus.STARTING,
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(
            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status
        ),
    )

    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute
            )

    execution_plan = create_execution_plan(
        pipeline,
        run_config=pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    if is_memoized_run(pipeline_run.tags):
        execution_plan = resolve_memoized_execution_plan(execution_plan)

    _execute_run_iterable = _ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=_pipeline_execution_iterator,
        execution_context_manager=PipelineExecutionContextManager(
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            instance=instance,
            run_config=pipeline_run.run_config,
            raise_on_error=raise_on_error,
        ),
    )
    # Draining the iterable is what actually performs the execution.
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context  # None if we have a pipeline failure

    # workaround for mem_io_manager to work in reconstruct_context, e.g. result.result_for_solid
    # in-memory values dict will get lost when the resource is re-initiated in reconstruct_context
    # so instead of re-initiating every single resource, we pass the resource instances to
    # reconstruct_context directly to avoid re-building from resource def.
    resource_instances_to_override = {}
    if pipeline_context:
        resource_instances_to_override = {
            key: resource_instance
            for key, resource_instance in (
                pipeline_context.scoped_resources_builder.resource_instance_dict.items()
            )
            if isinstance(resource_instance, InMemoryIOManager)
        }

    def _reconstruct_context(hardcoded_resources_arg):
        # pipeline_context is None after a pipeline failure, so reading
        # `.intermediate_storage` unconditionally would raise AttributeError here instead of
        # letting the context be rebuilt.
        # NOTE(review): assumes scoped_pipeline_context accepts intermediate_storage=None
        # and falls back to its own default - confirm against its signature.
        intermediate_storage = pipeline_context.intermediate_storage if pipeline_context else None
        return scoped_pipeline_context(
            execution_plan,
            pipeline_run.run_config,
            pipeline_run,
            instance,
            intermediate_storage=intermediate_storage,
            resource_instances_to_override=hardcoded_resources_arg,
        )

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        _reconstruct_context,
        resource_instances_to_override=resource_instances_to_override,
    )
def execute_run_iterator(
    pipeline: IPipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    resume_from_failure: bool = False,
) -> Iterator[DagsterEvent]:
    """Return an iterator over the events produced by executing an existing run.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute.
        instance (DagsterInstance): The instance used to report events and build the plan.
        resume_from_failure (bool): When ``True`` the run is expected to be STARTED or STARTING
            (resuming after a run worker failure); otherwise NOT_STARTED or STARTING.

    Returns:
        Iterator[DagsterEvent]: A generator yielding a single reported engine event when the
        run is CANCELED, or when a fresh start finds the run in an unexpected state while run
        monitoring is enabled. Otherwise an iterator over an ``ExecuteRunWithPlanIterable``
        built for the run.

    Raises:
        Exception: On a fresh start, if the run is in an unexpected state and run monitoring
            is not enabled on the instance.
    """
    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    if pipeline_run.status == PipelineRunStatus.CANCELED:
        # This can happen if the run was force-terminated while it was starting
        def gen_execute_on_cancel():
            yield instance.report_engine_event(
                "Not starting execution since the run was canceled before execution could start",
                pipeline_run,
            )

        return gen_execute_on_cancel()

    if resume_from_failure:
        status_allows_resume = (
            pipeline_run.status == PipelineRunStatus.STARTED
            or pipeline_run.status == PipelineRunStatus.STARTING
        )
        check.invariant(
            status_allows_resume,
            desc="Run of {} ({}) in state {}, expected STARTED or STARTING because it's "
            "resuming from a run worker failure".format(
                pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status
            ),
        )
    elif pipeline_run.status not in (PipelineRunStatus.NOT_STARTED, PipelineRunStatus.STARTING):
        if instance.run_monitoring_enabled:
            # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since
            # the run monitoring daemon will also spin up a new pod
            def gen_ignore_duplicate_run_worker():
                yield instance.report_engine_event(
                    "Ignoring a duplicate run that was started from somewhere other than the run monitor daemon",
                    pipeline_run,
                )

            return gen_ignore_duplicate_run_worker()

        raise Exception(
            f"{pipeline_run.pipeline_name} ({pipeline_run.run_id}) started "
            f"a new run while the run was already in state {pipeline_run.status}. "
            "This most frequently happens when the run worker unexpectedly stops and is "
            "restarted by the cluster.",
        )

    requested_solids = pipeline_run.solids_to_execute
    if requested_solids:
        if not isinstance(pipeline.get_definition(), PipelineSubsetDefinition):
            # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(requested_solids)
        else:
            # Already a subset: it must match the subset recorded on the run.
            check.invariant(
                requested_solids == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that conflicts "
                "with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(requested_solids),
                ),
            )

    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)

    orchestration_context_manager = PlanOrchestrationContextManager(
        context_event_generator=orchestration_context_event_generator,
        pipeline=pipeline,
        execution_plan=execution_plan,
        pipeline_run=pipeline_run,
        instance=instance,
        run_config=pipeline_run.run_config,
        raise_on_error=False,
        executor_defs=None,
        output_capture=None,
        resume_from_failure=resume_from_failure,
    )
    run_iterable = ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=pipeline_execution_iterator,
        execution_context_manager=orchestration_context_manager,
    )
    return iter(run_iterable)