def _resolve_reexecute_step_selection(
    instance: DagsterInstance,
    pipeline: IPipeline,
    mode: Optional[str],
    run_config: Optional[dict],
    parent_pipeline_run: PipelineRun,
    step_selection: List[str],
) -> ExecutionPlan:
    if parent_pipeline_run.solid_selection:
        pipeline = pipeline.subset_for_execution(parent_pipeline_run.solid_selection)

    parent_logs = instance.all_logs(parent_pipeline_run.run_id)
    parent_plan = create_execution_plan(
        pipeline,
        parent_pipeline_run.run_config,
        mode,
        known_state=KnownExecutionState.derive_from_logs(parent_logs),
    )
    step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)
    execution_plan = create_execution_plan(
        pipeline,
        run_config,
        mode,
        step_keys_to_execute=list(step_keys_to_execute),
        known_state=KnownExecutionState.for_reexecution(parent_logs, step_keys_to_execute),
    )
    return execution_plan
def compute_step_keys_to_execute(graphene_info, external_pipeline, execution_params):
    check.inst_param(graphene_info, "graphene_info", ResolveInfo)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(execution_params, "execution_params", ExecutionParams)

    instance = graphene_info.context.instance

    if not execution_params.step_keys and is_resume_retry(execution_params):
        # Get step keys from parent_run_id if it's a resume/retry
        external_execution_plan = get_external_execution_plan_or_raise(
            graphene_info=graphene_info,
            external_pipeline=external_pipeline,
            mode=execution_params.mode,
            run_config=execution_params.run_config,
            step_keys_to_execute=None,
            known_state=None,
        )

        return get_retry_steps_from_execution_plan(
            instance, external_execution_plan, execution_params.execution_metadata.parent_run_id
        )
    else:
        known_state = None
        if execution_params.execution_metadata.parent_run_id:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(execution_params.execution_metadata.parent_run_id),
                execution_params.step_keys,
            )

        return execution_params.step_keys, known_state
def test_execute_step_verify_step_framework_error(mock_verify_step):
    with get_foo_pipeline_handle() as pipeline_handle:
        runner = CliRunner()

        mock_verify_step.side_effect = Exception("Unexpected framework error text")

        with instance_for_test(
            overrides={
                "compute_logs": {
                    "module": "dagster.core.storage.noop_compute_log_manager",
                    "class": "NoOpComputeLogManager",
                }
            }
        ) as instance:
            run = create_run_for_test(
                instance,
                pipeline_name="foo",
                run_id="new_run",
            )

            input_json = serialize_dagster_namedtuple(
                ExecuteStepArgs(
                    pipeline_origin=pipeline_handle.get_python_origin(),
                    pipeline_run_id=run.run_id,
                    step_keys_to_execute=["fake_step"],
                    instance_ref=instance.get_ref(),
                    should_verify_step=True,
                    known_state=KnownExecutionState(
                        {},
                        {
                            "blah": {"result": ["0", "1", "2"]},
                        },
                    ),
                )
            )

            result = runner.invoke(api.execute_step_command, [input_json])

            assert result.exit_code != 0

            # Framework error logged to event log
            logs = instance.all_logs(run.run_id)

            log_entry = logs[0]
            assert (
                log_entry.message
                == "An exception was thrown during step execution that is likely a framework error, rather than an error in user code."
            )
            assert log_entry.step_key == "fake_step"

            assert "Unexpected framework error text" in str(
                log_entry.dagster_event.event_specific_data.error
            )
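# Fixtures and imports assumed by the test above but not shown in the snippet: mock_verify_step is
# presumably injected by a mock.patch decorator on the test, and get_foo_pipeline_handle is a local
# test helper that yields a handle for the "foo" pipeline. The module paths below are assumptions
# and may differ between dagster versions:
#
#   from click.testing import CliRunner
#   from dagster.cli import api
#   from dagster.core.execution.plan.state import KnownExecutionState
#   from dagster.core.test_utils import create_run_for_test, instance_for_test
#   from dagster.grpc.types import ExecuteStepArgs
#   from dagster.serdes import serialize_dagster_namedtuple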
def compute_step_keys_to_execute(graphene_info, execution_params):
    check.inst_param(graphene_info, "graphene_info", ResolveInfo)
    check.inst_param(execution_params, "execution_params", ExecutionParams)

    instance = graphene_info.context.instance

    if not execution_params.step_keys and is_resume_retry(execution_params):
        # Get step keys from parent_run_id if it's a resume/retry
        return get_retry_steps_from_parent_run(
            instance, execution_params.execution_metadata.parent_run_id
        )
    else:
        known_state = None
        if execution_params.execution_metadata.parent_run_id and execution_params.step_keys:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(execution_params.execution_metadata.parent_run_id),
                execution_params.step_keys,
            )

        return execution_params.step_keys, known_state
def test_tags():
    known_state = KnownExecutionState(
        {},
        {
            emit.name: {"result": ["0", "1", "2"]},
        },
    )
    plan = create_execution_plan(dynamic_pipeline, known_state=known_state)

    assert plan.get_step_by_key(emit.name).tags == {"first": "1"}

    for mapping_key in range(3):
        assert plan.get_step_by_key(f"{multiply_inputs.name}[{mapping_key}]").tags == {
            "second": "2"
        }
        assert plan.get_step_by_key(f"{multiply_by_two.name}[{mapping_key}]").tags == {
            "third": "3"
        }
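# The test above relies on `dynamic_pipeline` and the solids `emit`, `multiply_inputs`, and
# `multiply_by_two`, which are defined elsewhere in the test module. A minimal sketch of what
# those fixtures are assumed to look like, with solid names and step tags inferred from the
# assertions; the solid bodies are placeholder assumptions, and DynamicOutput /
# DynamicOutputDefinition may live under dagster.experimental in older releases.
from dagster import DynamicOutput, DynamicOutputDefinition, pipeline, solid


@solid(output_defs=[DynamicOutputDefinition()], tags={"first": "1"})
def emit(_):
    # three dynamic outputs, so the plan resolves mapping keys "0", "1", "2"
    for i in range(3):
        yield DynamicOutput(value=i, mapping_key=str(i))


@solid(tags={"second": "2"})
def multiply_inputs(_, x):
    return x * 2


@solid(tags={"third": "3"})
def multiply_by_two(_, y):
    return y * 2


@pipeline
def dynamic_pipeline():
    # both downstream solids fan out over emit's dynamic output
    emit().map(lambda n: multiply_by_two(multiply_inputs(n)))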
def create_backfill_run(
    instance, repo_location, external_pipeline, external_partition_set, backfill_job, partition_data
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    check.inst_param(partition_data, "partition_data", ExternalPartitionExecutionParamData)

    full_external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        partition_data.run_config,
        external_partition_set.mode,
        step_keys_to_execute=None,
        known_state=None,
    )

    tags = merge_dicts(
        external_pipeline.tags,
        partition_data.tags,
        PipelineRun.tags_for_backfill_id(backfill_job.backfill_id),
        backfill_job.tags,
    )

    if not backfill_job.from_failure and not backfill_job.reexecution_steps:
        step_keys_to_execute = None
        parent_run_id = None
        root_run_id = None
        known_state = None
    elif backfill_job.from_failure:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        if not last_run or last_run.status != PipelineRunStatus.FAILURE:
            return None
        parent_run_id = last_run.run_id
        root_run_id = last_run.root_run_id or last_run.run_id
        tags = merge_dicts(
            tags,
            {
                RESUME_RETRY_TAG: "true",
                PARENT_RUN_ID_TAG: parent_run_id,
                ROOT_RUN_ID_TAG: root_run_id,
            },
        )
        step_keys_to_execute, known_state = get_retry_steps_from_execution_plan(
            instance, full_external_execution_plan, parent_run_id
        )
    elif backfill_job.reexecution_steps:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        parent_run_id = last_run.run_id if last_run else None
        root_run_id = (last_run.root_run_id or last_run.run_id) if last_run else None
        if parent_run_id and root_run_id:
            tags = merge_dicts(
                tags, {PARENT_RUN_ID_TAG: parent_run_id, ROOT_RUN_ID_TAG: root_run_id}
            )
        step_keys_to_execute = backfill_job.reexecution_steps
        if last_run and last_run.status == PipelineRunStatus.SUCCESS:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(parent_run_id),
                step_keys_to_execute,
            )
        else:
            known_state = None

    if step_keys_to_execute:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            partition_data.run_config,
            external_partition_set.mode,
            step_keys_to_execute=step_keys_to_execute,
            known_state=known_state,
        )
    else:
        external_execution_plan = full_external_execution_plan

    return instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=external_pipeline.name,
        run_id=make_new_run_id(),
        solids_to_execute=frozenset(external_partition_set.solid_selection)
        if external_partition_set.solid_selection
        else None,
        run_config=partition_data.run_config,
        mode=external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )
def get_known_state(self):
    return KnownExecutionState(
        previous_retry_attempts=self._retry_state.snapshot_attempts(),
        dynamic_mappings=dict(self._successful_dynamic_outputs),
    )
def create_backfill_run(
    instance, repo_location, external_pipeline, external_partition_set, backfill_job, partition_data
):
    from dagster.daemon.daemon import get_telemetry_daemon_session_id

    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    check.inst_param(partition_data, "partition_data", ExternalPartitionExecutionParamData)

    log_action(
        instance,
        BACKFILL_RUN_CREATED,
        metadata={
            "DAEMON_SESSION_ID": get_telemetry_daemon_session_id(),
            "repo_hash": hash_name(repo_location.name),
            "pipeline_name_hash": hash_name(external_pipeline.name),
        },
    )

    tags = merge_dicts(
        external_pipeline.tags,
        partition_data.tags,
        PipelineRun.tags_for_backfill_id(backfill_job.backfill_id),
        backfill_job.tags,
    )

    solids_to_execute = None
    solid_selection = None

    if not backfill_job.from_failure and not backfill_job.reexecution_steps:
        step_keys_to_execute = None
        parent_run_id = None
        root_run_id = None
        known_state = None
        if external_partition_set.solid_selection:
            solids_to_execute = frozenset(external_partition_set.solid_selection)
            solid_selection = external_partition_set.solid_selection
    elif backfill_job.from_failure:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        if not last_run or last_run.status != PipelineRunStatus.FAILURE:
            return None
        return instance.create_reexecuted_run_from_failure(
            last_run,
            repo_location,
            external_pipeline,
            tags=tags,
            run_config=partition_data.run_config,
            mode=external_partition_set.mode,
        )
    elif backfill_job.reexecution_steps:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        parent_run_id = last_run.run_id if last_run else None
        root_run_id = (last_run.root_run_id or last_run.run_id) if last_run else None
        if parent_run_id and root_run_id:
            tags = merge_dicts(
                tags, {PARENT_RUN_ID_TAG: parent_run_id, ROOT_RUN_ID_TAG: root_run_id}
            )
        step_keys_to_execute = backfill_job.reexecution_steps
        if last_run and last_run.status == PipelineRunStatus.SUCCESS:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(parent_run_id),
                step_keys_to_execute,
            )
        else:
            known_state = None
        if external_partition_set.solid_selection:
            solids_to_execute = frozenset(external_partition_set.solid_selection)
            solid_selection = external_partition_set.solid_selection

    external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        partition_data.run_config,
        external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        known_state=known_state,
        instance=instance,
    )

    return instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=external_pipeline.name,
        run_id=make_new_run_id(),
        solids_to_execute=solids_to_execute,
        run_config=partition_data.run_config,
        mode=external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
        solid_selection=solid_selection,
    )
def test_tags_to_dynamic_plan():
    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "500m", "memory": "128Mi"},
                        "limits": {"cpu": "1000m", "memory": "1Gi"},
                    }
                }
            }
        }
    )
    def multiply_inputs(_, x):
        return 2 * x

    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "250m", "memory": "64Mi"},
                        "limits": {"cpu": "500m", "memory": "2560Mi"},
                    }
                }
            }
        },
        output_defs=[DynamicOutputDefinition()],
    )
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @pipeline
    def k8s_ready():
        return emit().map(multiply_inputs)

    known_state = KnownExecutionState(
        {},
        {
            emit.name: {"result": ["0", "1", "2"]},
        },
    )
    plan = create_execution_plan(k8s_ready, known_state=known_state)

    emit_step = plan.get_step_by_key(emit.name)
    user_defined_k8s_config = get_user_defined_k8s_config(emit_step.tags)
    assert user_defined_k8s_config.container_config
    assert user_defined_k8s_config.container_config["resources"]

    resources = user_defined_k8s_config.container_config["resources"]
    assert resources["requests"]["cpu"] == "250m"
    assert resources["requests"]["memory"] == "64Mi"
    assert resources["limits"]["cpu"] == "500m"
    assert resources["limits"]["memory"] == "2560Mi"

    for mapping_key in range(3):
        multiply_inputs_step = plan.get_step_by_key(f"{multiply_inputs.name}[{mapping_key}]")
        dynamic_step_user_defined_k8s_config = get_user_defined_k8s_config(
            multiply_inputs_step.tags
        )

        assert dynamic_step_user_defined_k8s_config.container_config
        assert dynamic_step_user_defined_k8s_config.container_config["resources"]

        resources = dynamic_step_user_defined_k8s_config.container_config["resources"]
        assert resources["requests"]["cpu"] == "500m"
        assert resources["requests"]["memory"] == "128Mi"
        assert resources["limits"]["cpu"] == "1000m"
        assert resources["limits"]["memory"] == "1Gi"
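# Imports assumed by the test above but not shown in the snippet. The module paths below reflect
# where these names typically live in dagster / dagster-k8s and may differ between versions:
#
#   from dagster import DynamicOutput, DynamicOutputDefinition, pipeline, solid
#   from dagster.core.execution.api import create_execution_plan
#   from dagster.core.execution.plan.state import KnownExecutionState
#   from dagster_k8s.job import USER_DEFINED_K8S_CONFIG_KEY, get_user_defined_k8s_config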
def get_known_state(self):
    return KnownExecutionState(
        previous_retry_attempts=self._retry_state.snapshot_attempts(),
    )
def get_retry_steps_from_parent_run(
    instance, parent_run_id
) -> Tuple[List[str], Optional[KnownExecutionState]]:
    check.inst_param(instance, "instance", DagsterInstance)
    check.str_param(parent_run_id, "parent_run_id")

    parent_run = instance.get_run_by_id(parent_run_id)
    parent_run_logs = instance.all_logs(parent_run_id)

    execution_plan_snapshot = instance.get_execution_plan_snapshot(
        parent_run.execution_plan_snapshot_id
    )

    if not execution_plan_snapshot:
        raise DagsterExecutionPlanSnapshotNotFoundError(
            f"Could not load execution plan snapshot for run {parent_run_id}"
        )

    execution_plan = ExternalExecutionPlan(execution_plan_snapshot=execution_plan_snapshot)

    # keep track of steps with dicts that point:
    # * step_key -> set(step_handle) in the normal case
    # * unresolved_step_key -> set(resolved_step_handle, ...) for dynamic outputs
    all_steps_in_parent_run_logs: Dict[str, set] = defaultdict(set)
    failed_steps_in_parent_run_logs: Dict[str, set] = defaultdict(set)
    successful_steps_in_parent_run_logs: Dict[str, set] = defaultdict(set)
    interrupted_steps_in_parent_run_logs: Dict[str, set] = defaultdict(set)
    skipped_steps_in_parent_run_logs: Dict[str, set] = defaultdict(set)

    for record in parent_run_logs:
        if record.dagster_event and record.dagster_event.step_handle:
            step_handle = record.dagster_event.step_handle
            _update_tracking_dict(all_steps_in_parent_run_logs, step_handle)

            if record.dagster_event_type == DagsterEventType.STEP_FAILURE:
                _update_tracking_dict(failed_steps_in_parent_run_logs, step_handle)

            if record.dagster_event_type == DagsterEventType.STEP_SUCCESS:
                _update_tracking_dict(successful_steps_in_parent_run_logs, step_handle)

            if record.dagster_event_type == DagsterEventType.STEP_SKIPPED:
                _update_tracking_dict(skipped_steps_in_parent_run_logs, step_handle)

    for step_set in all_steps_in_parent_run_logs.values():
        for step_handle in step_set:
            if (
                not _in_tracking_dict(step_handle, failed_steps_in_parent_run_logs)
                and not _in_tracking_dict(step_handle, successful_steps_in_parent_run_logs)
                and not _in_tracking_dict(step_handle, skipped_steps_in_parent_run_logs)
            ):
                _update_tracking_dict(interrupted_steps_in_parent_run_logs, step_handle)

    to_retry = defaultdict(set)

    execution_deps = execution_plan.execution_deps()
    for step_snap in execution_plan.topological_steps():
        step_key = step_snap.key
        step_handle = StepHandle.parse_from_key(step_snap.key)

        if (
            parent_run.step_keys_to_execute
            and step_snap.key not in parent_run.step_keys_to_execute
        ):
            continue

        if step_snap.key in failed_steps_in_parent_run_logs:
            to_retry[step_key].update(failed_steps_in_parent_run_logs[step_key])

        # Interrupted steps can occur when graceful cleanup from a step failure fails to run,
        # and a step failure event is not generated
        if step_key in interrupted_steps_in_parent_run_logs:
            to_retry[step_key].update(interrupted_steps_in_parent_run_logs[step_key])

        # Missing steps did not execute, e.g. when a run was terminated
        if step_key not in all_steps_in_parent_run_logs:
            to_retry[step_key].add(step_handle)

        step_dep_keys = execution_deps[step_key]
        retrying_dep_keys = step_dep_keys.intersection(to_retry.keys())

        # this step is downstream of a step we are about to retry
        if retrying_dep_keys:
            for retrying_key in retrying_dep_keys:
                # If this step and its ancestor are both downstream of a dynamic output,
                # add resolved instances of this step for the retrying mapping keys
                if isinstance(step_handle, UnresolvedStepHandle) and all(
                    map(
                        lambda handle: isinstance(handle, ResolvedFromDynamicStepHandle),
                        to_retry[retrying_key],
                    )
                ):
                    for resolved_handle in to_retry[retrying_key]:
                        to_retry[step_key].add(step_handle.resolve(resolved_handle.mapping_key))
                else:
                    to_retry[step_key].add(step_handle)

    steps_to_retry = [
        step_handle.to_key() for step_set in to_retry.values() for step_handle in step_set
    ]

    return steps_to_retry, KnownExecutionState.for_reexecution(parent_run_logs, steps_to_retry)
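# _update_tracking_dict and _in_tracking_dict are referenced above but not shown. A sketch of what
# they are assumed to do, inferred from the tracking-dict comment and the call sites: handles
# resolved from a dynamic output are grouped under their unresolved step key, while every other
# handle is keyed by its own step key. The `unresolved_form` attribute is part of this assumption.
def _update_tracking_dict(tracking, handle):
    if isinstance(handle, ResolvedFromDynamicStepHandle):
        tracking[handle.unresolved_form.to_key()].add(handle)
    else:
        tracking[handle.to_key()].add(handle)


def _in_tracking_dict(handle, tracking):
    if isinstance(handle, ResolvedFromDynamicStepHandle):
        unresolved_key = handle.unresolved_form.to_key()
        return unresolved_key in tracking and handle in tracking[unresolved_key]
    return handle.to_key() in tracking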