def dagster_event_sequence_for_step(step_context, inputs, intermediates_manager):
    """Yield the sequence of dagster events produced by executing a single step.

    Events produced by the core step computation are forwarded unchanged; each
    STEP_OUTPUT event is additionally logged at info level. The computation is wrapped
    in an error boundary: an exception raised by it is translated into a step failure
    event, after which computation for the step halts.

    Args:
        step_context (SystemStepExecutionContext): Context of the step being executed.
        inputs (dict): Step inputs; keys must be strings.
        intermediates_manager (IntermediatesManager): Passed through to the core
            computation.

    Yields:
        The events of the core computation, followed by one step failure event if the
        computation raised.

    Raises:
        DagsterError: Re-raised after the failure event only when
            ``step_context.executor_config.raise_on_error`` is truthy; otherwise the
            generator simply finishes.
        BaseException: Any other exception is always re-raised after the failure event.
            ``GeneratorExit`` is propagated without emitting a failure event.
    """
    check.inst_param(step_context, 'step_context', SystemStepExecutionContext)
    check.dict_param(inputs, 'inputs', key_type=str)
    check.inst_param(intermediates_manager, 'intermediates_manager', IntermediatesManager)

    try:
        for step_event in check.generator(
            _core_dagster_event_sequence_for_step(step_context, inputs, intermediates_manager)
        ):
            if step_event.event_type is DagsterEventType.STEP_OUTPUT:
                step_context.log.info(
                    'Step {step} emitted {value} for output {output}'.format(
                        step=step_context.step.key,
                        value=step_event.step_output_data.value_repr,
                        output=step_event.step_output_data.output_name,
                    )
                )
            yield step_event

    except GeneratorExit:
        # The consumer closed this generator while it was suspended at a yield.
        # A generator must not yield again while handling GeneratorExit (Python raises
        # RuntimeError if it does), so let it propagate instead of reporting a failure.
        raise

    except DagsterError as dagster_error:
        user_facing_exc_info = (
            # pylint does not know original_exc_info exists if is_user_code_error is true
            # pylint: disable=no-member
            dagster_error.original_exc_info
            if dagster_error.is_user_code_error
            else sys.exc_info()
        )
        error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

        yield DagsterEvent.step_failure_event(
            step_context=step_context, step_failure_data=StepFailureData(error=error_info)
        )

        if step_context.executor_config.raise_on_error:
            raise dagster_error

        return

    except BaseException:
        # Catch-all boundary (previously a bare ``except:``): report the failure, then
        # re-raise the original exception unchanged.
        error_info = serializable_error_info_from_exc_info(sys.exc_info())

        yield DagsterEvent.step_failure_event(
            step_context=step_context, step_failure_data=StepFailureData(error=error_info)
        )

        raise
def check_step_health(
    self,
    step_contexts: List[IStepContext],
    known_state: KnownExecutionState,
):
    """Report a step failure if the Kubernetes job backing the step has failed.

    Exactly one step context is supported. The job is looked up by a name derived
    from the run id and the step key, inside ``self._job_namespace``.

    Args:
        step_contexts: Must contain exactly one step context.
        known_state: Not used by this implementation.

    Returns:
        A one-element list holding a step failure event (with no error info attached)
        when ``job.status.failed`` is truthy, otherwise an empty list.
    """
    assert len(step_contexts) == 1, "Checking multiple steps is not currently supported"
    step_context = step_contexts[0]

    run_id = self.pipeline_context.plan_data.pipeline_run.run_id
    k8s_name_key = get_k8s_job_name(run_id, step_context.step.key)
    job_name = "dagster-job-%s" % (k8s_name_key)

    batch_api = kubernetes.client.BatchV1Api()
    job = batch_api.read_namespaced_job(namespace=self._job_namespace, name=job_name)

    # Guard clause: nothing to report while the job has not failed.
    if not job.status.failed:
        return []

    failure_data = StepFailureData(error=None, user_failure_data=None)
    return [
        DagsterEvent.step_failure_event(
            step_context=step_context,
            step_failure_data=failure_data,
        )
    ]
def _step_failure_event_from_exc_info(step_context, exc_info, user_failure_data=None):
    """Build a step failure event for ``step_context`` from exception info.

    Args:
        step_context: Context of the failed step.
        exc_info: Exception info handed to ``serializable_error_info_from_exc_info``.
        user_failure_data: Optional user-supplied failure details attached to the event.

    Returns:
        The event produced by ``DagsterEvent.step_failure_event``.
    """
    serialized_error = serializable_error_info_from_exc_info(exc_info)
    failure_data = StepFailureData(
        error=serialized_error,
        user_failure_data=user_failure_data,
    )
    return DagsterEvent.step_failure_event(
        step_context=step_context, step_failure_data=failure_data
    )
def execute_step_in_memory(step_context, inputs, intermediates_manager):
    """Execute a single step and yield the dagster events it produces.

    Events produced by ``_execute_steps_core_loop`` are forwarded unchanged; each
    STEP_OUTPUT event is additionally logged at info level. An exception raised by the
    core loop is translated into a step failure event, after which the step halts.

    Args:
        step_context (SystemStepExecutionContext): Context of the step being executed.
        inputs (dict): Step inputs; keys must be strings.
        intermediates_manager (IntermediatesManager): Passed through to the core loop.

    Yields:
        The events of the core loop, followed by one step failure event if it raised.

    Raises:
        DagsterError: Re-raised after the failure event only when
            ``step_context.executor_config.raise_on_error`` is truthy; otherwise the
            generator simply finishes.
        BaseException: Any other exception is always re-raised after the failure event.
            ``GeneratorExit`` is propagated without emitting a failure event.
    """
    check.inst_param(step_context, 'step_context', SystemStepExecutionContext)
    check.dict_param(inputs, 'inputs', key_type=str)
    check.inst_param(intermediates_manager, 'intermediates_manager', IntermediatesManager)

    try:
        for step_event in check.generator(
            _execute_steps_core_loop(step_context, inputs, intermediates_manager)
        ):
            if step_event.event_type is DagsterEventType.STEP_OUTPUT:
                step_context.log.info(
                    'Step {step} emitted {value} for output {output}'.format(
                        step=step_context.step.key,
                        value=step_event.step_output_data.value_repr,
                        output=step_event.step_output_data.output_name,
                    )
                )
            yield step_event

    except GeneratorExit:
        # The consumer closed this generator while it was suspended at a yield.
        # A generator must not yield again while handling GeneratorExit (Python raises
        # RuntimeError if it does), so let it propagate instead of reporting a failure.
        raise

    except DagsterError as dagster_error:
        user_facing_exc_info = (
            # pylint does not know original_exc_info exists if is_user_code_error is true
            # pylint: disable=no-member
            dagster_error.original_exc_info
            if dagster_error.is_user_code_error
            else sys.exc_info()
        )
        error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

        yield DagsterEvent.step_failure_event(
            step_context=step_context, step_failure_data=StepFailureData(error=error_info)
        )

        if step_context.executor_config.raise_on_error:
            raise dagster_error

        return

    except BaseException:
        # Catch-all boundary (previously a bare ``except:``): report the failure, then
        # re-raise the original exception unchanged.
        error_info = serializable_error_info_from_exc_info(sys.exc_info())

        yield DagsterEvent.step_failure_event(
            step_context=step_context, step_failure_data=StepFailureData(error=error_info)
        )

        raise
def _step_failure_event_from_exc_info(
    step_context: SystemStepExecutionContext,
    exc_info: ExcInfo,
    user_failure_data: Optional[UserFailureData] = None,
):
    """Build a step failure event for ``step_context`` from exception info.

    Args:
        step_context: Context of the failed step.
        exc_info: Exception info handed to ``serializable_error_info_from_exc_info``.
        user_failure_data: Optional user-supplied failure details attached to the event.

    Returns:
        The event produced by ``DagsterEvent.step_failure_event``.
    """
    serialized_error = serializable_error_info_from_exc_info(exc_info)
    failure_data = StepFailureData(
        error=serialized_error,
        user_failure_data=user_failure_data,
    )
    return DagsterEvent.step_failure_event(
        step_context=step_context, step_failure_data=failure_data
    )
def step_failure_event_from_exc_info(
    step_context: "StepExecutionContext",
    exc_info: ExcInfo,
    user_failure_data: Optional[UserFailureData] = None,
    error_source: Optional[ErrorSource] = None,
):
    """Build a step failure event for ``step_context`` from exception info.

    Args:
        step_context: Context of the failed step.
        exc_info: Exception info handed to ``serializable_error_info_from_exc_info``.
        user_failure_data: Optional user-supplied failure details attached to the event.
        error_source: Optional classification of where the error originated.

    Returns:
        The event produced by ``DagsterEvent.step_failure_event``.
    """
    # Imported at call time rather than module level (presumably to avoid an import
    # cycle with dagster.core.events -- confirm before hoisting).
    from dagster.core.events import DagsterEvent

    serialized_error = serializable_error_info_from_exc_info(exc_info)
    failure_data = StepFailureData(
        error=serialized_error,
        user_failure_data=user_failure_data,
        error_source=error_source,
    )
    return DagsterEvent.step_failure_event(
        step_context=step_context, step_failure_data=failure_data
    )
def execute(self, pipeline_context, execution_plan):
    """Run the steps of ``execution_plan`` out of process and yield their events.

    At most ``self.max_concurrent`` step iterators are active at a time. Each iterator
    comes from ``self.execute_step_out_of_process`` and is polled round-robin with
    ``next``; every non-None event is yielded and fed to the active execution.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context of the running pipeline.
        execution_plan (ExecutionPlan): Plan whose steps are executed.

    Yields:
        An engine event announcing the start, the events produced by the steps
        (including engine/step-failure events synthesized for crashed child processes
        and skip events), and a final engine event reporting the total duration.

    Raises:
        DagsterSubprocessError: If, once the loop has finished, any entry of ``errors``
            is truthy.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # Upper bound on the number of step iterators polled concurrently.
    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess engine: parent process (pid: {pid})"
        .format(pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(
                retries=self.retries) as active_execution:
            active_iters = {}  # step key -> event iterator for a running step
            errors = {}  # handed to every step iterator; inspected after the loop
            term_events = {}  # step key -> multiprocessing.Event handed to that step
            stopping = False

            # Keep looping while there is work left to start (unless stopping) or
            # while any started iterator still has to be drained.
            while (not stopping
                   and not active_execution.is_complete) or active_iters:
                try:
                    # start iterators
                    while len(active_iters) < limit and not stopping:
                        steps = active_execution.get_steps_to_execute(
                            limit=(limit - len(active_iters)))

                        if not steps:
                            break

                        for step in steps:
                            step_context = pipeline_context.for_step(step)
                            term_events[step.key] = multiprocessing.Event()
                            active_iters[
                                step.
                                key] = self.execute_step_out_of_process(
                                    step_context, step, errors, term_events)

                    # process active iterators
                    # Finished keys are collected here and removed after the loop so
                    # that active_iters is not mutated while being iterated.
                    empty_iters = []
                    for key, step_iter in active_iters.items():
                        try:
                            event_or_none = next(step_iter)
                            # None is skipped without being yielded (presumably the
                            # child has nothing to report yet -- confirm).
                            if event_or_none is None:
                                continue
                            else:
                                yield event_or_none
                                active_execution.handle_event(
                                    event_or_none)

                        except ChildProcessCrashException as crash:
                            # Report the crash as an engine event plus a synthesized
                            # step failure, then retire the iterator.
                            serializable_error = serializable_error_info_from_exc_info(
                                sys.exc_info())
                            yield DagsterEvent.engine_event(
                                pipeline_context,
                                ("Multiprocess executor: child process for step {step_key} "
                                 "unexpectedly exited with code {exit_code}"
                                 ).format(step_key=key, exit_code=crash.exit_code),
                                EngineEventData.engine_error(
                                    serializable_error),
                                step_key=key,
                            )
                            step_failure_event = DagsterEvent.step_failure_event(
                                step_context=pipeline_context.for_step(
                                    active_execution.get_step_by_key(key)),
                                step_failure_data=StepFailureData(
                                    error=serializable_error, user_failure_data=None),
                            )
                            active_execution.handle_event(
                                step_failure_event)
                            yield step_failure_event
                            empty_iters.append(key)
                        except StopIteration:
                            empty_iters.append(key)

                    # clear and mark complete finished iterators
                    for key in empty_iters:
                        del active_iters[key]
                        # A finished step whose termination event is set switches the
                        # executor into stopping mode: no further steps are started.
                        if term_events[key].is_set():
                            stopping = True
                        del term_events[key]
                        active_execution.verify_complete(
                            pipeline_context, key)

                    # process skips from failures or uncovered inputs
                    for event in active_execution.skipped_step_events_iterator(
                            pipeline_context):
                        yield event

                # In the very small chance that we get interrupted in this coordination section and not
                # polling the subprocesses for events - try to clean up gracefully
                except KeyboardInterrupt:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes",
                        EngineEventData.interrupted(
                            list(term_events.keys())),
                    )
                    stopping = True
                    # Set every outstanding termination event; the loop keeps running
                    # until the remaining iterators are exhausted.
                    for event in term_events.values():
                        event.set()

            # Only truthy entries count as errors.
            errs = {pid: err for pid, err in errors.items() if err}
            if errs:
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}"
                    .format(error_list="\n".join([
                        "In process {pid}: {err}".format(
                            pid=pid, err=err.to_string())
                        for pid, err in errs.items()
                    ])),
                    subprocess_error_infos=list(errs.values()),
                )

    # Emitted after the timing scope has exited, then timer_result.millis is read.
    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess engine: parent process exiting after {duration} (pid: {pid})"
        .format(duration=format_duration(timer_result.millis), pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def _dagster_event_sequence_for_step(step_context, retries):
    """Yield the dagster events for one step, translating errors into events.

    The step is run either through its step launcher (when the context has one) or
    through ``core_dagster_event_sequence_for_step``. Exceptions escaping that
    computation are handled as follows:

    (1) Retry requested: user code raised RetryRequested. The step is put up for retry
        unless retries are disabled or the number of previous attempts has reached
        the request's ``max_retries``; in those cases a failure event is emitted.

    (2) Intentional failure: user code raised Failure, possibly with a description and
        metadata entries, which are attached to the failure event.

    (3) Unexpected user failure: user code raised an exception that was wrapped in a
        DagsterUserCodeExecutionError. The failure event is built from the
        ``original_exc_info`` stashed on the wrapper.

    (4) User error: the framework raised a DagsterError signalling a usage error or
        another problem not communicated by a user-thrown exception, e.g. a compute
        function yielding an object that is not a proper event.

    (5) Framework failure or interrupt: any other Exception or a KeyboardInterrupt.
        Either the framework has an internal error or a user code error boundary is
        missing. These always terminate the computation by re-raising.

    For cases (2)-(4) the exception is re-raised only when
    ``step_context.raise_on_error`` is set, which is meant for tests and ad-hoc runs;
    tools should leave it off and surface the failure events themselves.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.inst_param(retries, "retries", Retries)

    try:
        attempts_so_far = retries.get_attempt_count(step_context.step.key)
        step_events = (
            step_context.step_launcher.launch_step(step_context, attempts_so_far)
            if step_context.step_launcher
            else core_dagster_event_sequence_for_step(step_context, attempts_so_far)
        )

        for step_event in check.generator(step_events):
            yield step_event

    # case (1) in top comment
    except RetryRequested as retry_request:
        retry_error = serializable_error_info_from_exc_info(sys.exc_info())

        # Decide first whether the request turns into a failure (and with which
        # message) or into an actual retry.
        if retries.disabled:
            failure_message = "RetryRequested but retries are disabled"
        elif retries.get_attempt_count(step_context.step.key) >= retry_request.max_retries:
            failure_message = "Exceeded max_retries of {}".format(retry_request.max_retries)
        else:  # retries.enabled or retries.deferred, with attempts left
            failure_message = None

        if failure_message is None:
            retry_data = StepRetryData(
                error=retry_error,
                seconds_to_wait=retry_request.seconds_to_wait,
            )
            yield DagsterEvent.step_retry_event(step_context, retry_data)
        else:
            failure_error = SerializableErrorInfo(
                message=failure_message,
                stack=retry_error.stack,
                cls_name=retry_error.cls_name,
                cause=retry_error.cause,
            )
            yield DagsterEvent.step_failure_event(
                step_context=step_context,
                step_failure_data=StepFailureData(error=failure_error, user_failure_data=None),
            )

    # case (2) in top comment
    except Failure as failure:
        exc_info = sys.exc_info()
        user_failure_data = UserFailureData(
            label="intentional-failure",
            description=failure.description,
            metadata_entries=failure.metadata_entries,
        )
        yield _step_failure_event_from_exc_info(step_context, exc_info, user_failure_data)

        if step_context.raise_on_error:
            raise failure

    # case (3) in top comment
    except DagsterUserCodeExecutionError as dagster_user_error:
        original_exc_info = dagster_user_error.original_exc_info
        yield _step_failure_event_from_exc_info(step_context, original_exc_info)

        if step_context.raise_on_error:
            raise dagster_user_error.user_exception

    # case (4) in top comment
    except DagsterError as dagster_error:
        exc_info = sys.exc_info()
        yield _step_failure_event_from_exc_info(step_context, exc_info)

        if step_context.raise_on_error:
            raise dagster_error

    # case (5) in top comment
    except (Exception, KeyboardInterrupt) as unexpected_exception:  # pylint: disable=broad-except
        exc_info = sys.exc_info()
        yield _step_failure_event_from_exc_info(step_context, exc_info)

        raise unexpected_exception
def execute(self, pipeline_context, execution_plan):
    """Run the steps of ``execution_plan`` out of process and yield their events.

    At most ``self.max_concurrent`` step iterators are active at a time. Each iterator
    comes from ``self.execute_step_out_of_process`` and is polled round-robin with
    ``next``; every non-None event is yielded and fed to the active execution. At the
    top of every loop iteration the active execution is asked whether an interrupt
    arrived; if so, all outstanding termination events are set and no new steps are
    started.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context of the running pipeline.
        execution_plan (ExecutionPlan): Plan whose steps are executed.

    Yields:
        An engine event announcing the start, the events produced by the steps
        (including engine/step-failure events synthesized for crashed child processes
        and plan events), and a final engine event reporting the total duration.

    Raises:
        DagsterExecutionInterruptedError: If execution was stopped and every collected
            error has class name "DagsterExecutionInterruptedError" (this includes the
            case of no errors at all).
        DagsterSubprocessError: Otherwise, if any entry of ``errors`` is truthy.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # Upper bound on the number of step iterators polled concurrently.
    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess executor: parent process (pid: {pid})"
        .format(pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(
                retry_mode=self.retries) as active_execution:
            active_iters = {}  # step key -> event iterator for a running step
            errors = {}  # handed to every step iterator; inspected after the loop
            term_events = {}  # step key -> multiprocessing.Event handed to that step
            stopping = False

            # Keep looping while there is work left to start (unless stopping) or
            # while any started iterator still has to be drained.
            while (not stopping
                   and not active_execution.is_complete) or active_iters:
                if active_execution.check_for_interrupts():
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess executor: received termination signal - "
                        "forwarding to active child processes",
                        EngineEventData.interrupted(
                            list(term_events.keys())),
                    )
                    stopping = True
                    active_execution.mark_interrupted()
                    # Only the events are used here; `key` is rebound by later loops.
                    for key, event in term_events.items():
                        event.set()

                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(
                        limit=(limit - len(active_iters)))

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = multiprocessing.Event()
                        active_iters[
                            step.key] = self.execute_step_out_of_process(
                                step_context,
                                step,
                                errors,
                                term_events,
                                active_execution.get_known_state(),
                            )

                # process active iterators
                # Finished keys are collected here and removed after the loop so
                # that active_iters is not mutated while being iterated.
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        # None is skipped without being yielded (presumably the
                        # child has nothing to report yet -- confirm).
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)

                    except ChildProcessCrashException as crash:
                        # Report the crash as an engine event plus a synthesized
                        # step failure, then retire the iterator.
                        serializable_error = serializable_error_info_from_exc_info(
                            sys.exc_info())
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            ("Multiprocess executor: child process for step {step_key} "
                             "unexpectedly exited with code {exit_code}"
                             ).format(step_key=key, exit_code=crash.exit_code),
                            EngineEventData.engine_error(
                                serializable_error),
                            step_handle=active_execution.get_step_by_key(
                                key).handle,
                        )
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(
                                active_execution.get_step_by_key(key)),
                            step_failure_data=StepFailureData(
                                error=serializable_error, user_failure_data=None),
                        )
                        active_execution.handle_event(step_failure_event)
                        yield step_failure_event
                        empty_iters.append(key)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skipped and abandoned steps
                yield from active_execution.plan_events_iterator(
                    pipeline_context)

            # Only truthy entries count as errors.
            errs = {pid: err for pid, err in errors.items() if err}

            # After termination starts, raise an interrupted exception once all subprocesses
            # have finished cleaning up (and the only errors were from being interrupted)
            if (stopping and (not active_iters) and all([
                    err_info.cls_name == "DagsterExecutionInterruptedError"
                    for err_info in errs.values()
            ])):
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Multiprocess executor: interrupted all active child processes",
                    event_specific_data=EngineEventData(),
                )
                raise DagsterExecutionInterruptedError()
            elif errs:
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}"
                    .format(error_list="\n".join([
                        "In process {pid}: {err}".format(
                            pid=pid, err=err.to_string())
                        for pid, err in errs.items()
                    ])),
                    subprocess_error_infos=list(errs.values()),
                )

    # Emitted after the timing scope has exited, then timer_result.millis is read.
    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess executor: parent process exiting after {duration} (pid: {pid})"
        .format(duration=format_duration(timer_result.millis), pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def dagster_event_sequence_for_step(
    step_context: StepExecutionContext, force_local_execution: bool = False
) -> Iterator[DagsterEvent]:
    """
    Yield a sequence of dagster events for the given step with the step context.

    This function also processes errors. It handles a few error cases:

        (1) User code requests to be retried:
            A RetryRequested has been raised. We will either put the step into the
            up_for_retry state or a failure state depending on the number of previous attempts
            and the max_retries on the received RetryRequested.

        (2) User code fails successfully:
            The user-space code has raised a Failure which may have
            explicit metadata attached.

        (3) User code fails unexpectedly:
            The user-space code has raised an Exception. It has been
            wrapped in an exception derived from DagsterUserCodeException. In that
            case the original user exc_info is stashed on the exception
            as the original_exc_info property.

        (4) Execution interrupted:
            The run was interrupted in the middle of execution (typically by a
            termination request).

        (5) User error:
            The framework raised a DagsterError that indicates a usage error
            or some other error not communicated by a user-thrown exception. For example,
            if the user yields an object out of a compute function that is not a
            proper event (not an Output, ExpectationResult, etc).

        (6) Framework failure:
            An unexpected error occurred. This is a framework error. Either there
            has been an internal error in the framework OR we have forgotten to put a
            user code error boundary around invoked user-space code. These terminate
            the computation immediately (by re-raising).

    The "raised_dagster_errors" context manager can be used to force these errors to be
    re-raised and surfaced to the user. This is mostly to get sensible errors in test and
    ad-hoc contexts, rather than forcing the user to wade through the
    PipelineExecutionResult API in order to find the step that failed.

    For tools, however, this option should be false, and a sensible error message
    signaled to the user within that tool.

    When we launch a step that has a step launcher, we use this function on both the host
    process and the remote process. When we run the step in the remote process, to prevent an
    infinite loop of launching steps that then launch steps, and so on, the remote process will
    run this with the force_local_execution argument set to True.

    GeneratorExit (raised when the consumer closes this generator) is not treated as a
    step failure: it is propagated untouched, because a generator is not allowed to yield
    again while it is being closed.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)

    try:
        if step_context.step_launcher and not force_local_execution:
            # info all on step_context - should deprecate second arg
            step_events = step_context.step_launcher.launch_step(
                step_context, step_context.previous_attempt_count
            )
        else:
            step_events = core_dagster_event_sequence_for_step(step_context)

        for step_event in check.generator(step_events):
            yield step_event

    # case (1) in top comment
    except RetryRequested as retry_request:
        retry_err_info = serializable_error_info_from_exc_info(sys.exc_info())

        if step_context.retry_mode.disabled:
            fail_err = SerializableErrorInfo(
                message="RetryRequested but retries are disabled",
                stack=retry_err_info.stack,
                cls_name=retry_err_info.cls_name,
                cause=retry_err_info.cause,
            )
            step_context.capture_step_exception(retry_request)
            yield DagsterEvent.step_failure_event(
                step_context=step_context,
                step_failure_data=StepFailureData(error=fail_err, user_failure_data=None),
            )
        else:  # retries.enabled or retries.deferred
            prev_attempts = step_context.previous_attempt_count
            if prev_attempts >= retry_request.max_retries:
                fail_err = SerializableErrorInfo(
                    message=f"Exceeded max_retries of {retry_request.max_retries}\n",
                    stack=retry_err_info.stack,
                    cls_name=retry_err_info.cls_name,
                    cause=retry_err_info.cause,
                )
                step_context.capture_step_exception(retry_request)
                yield DagsterEvent.step_failure_event(
                    step_context=step_context,
                    step_failure_data=StepFailureData(
                        error=fail_err,
                        user_failure_data=None,
                        # set the flag to omit the outer stack if we have a cause to show
                        error_source=ErrorSource.USER_CODE_ERROR if fail_err.cause else None,
                    ),
                )
            else:
                yield DagsterEvent.step_retry_event(
                    step_context,
                    StepRetryData(
                        error=retry_err_info,
                        seconds_to_wait=retry_request.seconds_to_wait,
                    ),
                )

    # case (2) in top comment
    except Failure as failure:
        step_context.capture_step_exception(failure)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            UserFailureData(
                label="intentional-failure",
                description=failure.description,
                metadata_entries=failure.metadata_entries,
            ),
        )
        if step_context.raise_on_error:
            raise failure

    # case (3) in top comment
    except DagsterUserCodeExecutionError as dagster_user_error:
        step_context.capture_step_exception(dagster_user_error.user_exception)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.USER_CODE_ERROR,
        )

        if step_context.raise_on_error:
            raise dagster_user_error.user_exception

    # case (4) in top comment
    except (KeyboardInterrupt, DagsterExecutionInterruptedError) as interrupt_error:
        step_context.capture_step_exception(interrupt_error)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.INTERRUPT,
        )
        raise interrupt_error

    # case (5) in top comment
    except DagsterError as dagster_error:
        step_context.capture_step_exception(dagster_error)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.FRAMEWORK_ERROR,
        )

        if step_context.raise_on_error:
            raise dagster_error

    # The consumer closed this generator while it was suspended at a yield. Yielding a
    # failure event from here would make Python raise RuntimeError, so this must be
    # excluded from the catch-all below.
    except GeneratorExit:
        raise

    # case (6) in top comment
    except BaseException as unexpected_exception:
        step_context.capture_step_exception(unexpected_exception)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.UNEXPECTED_ERROR,
        )
        raise unexpected_exception
def execute(self, pipeline_context, execution_plan):
    """Run the steps of ``execution_plan`` in threads and yield their events.

    At most ``self.max_concurrent`` step iterators are active at a time. Each iterator
    comes from ``self.execute_step_in_thread`` and is polled round-robin with ``next``;
    every non-None event is yielded and fed to the active execution.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context of the running pipeline.
        execution_plan (ExecutionPlan): Plan whose steps are executed.

    Yields:
        An engine event announcing the start, the events produced by the steps
        (including engine/step-failure events synthesized for crashed threads and
        plan events), and a final engine event reporting the total duration.

    Raises:
        DagsterThreadError: If, once the loop has finished, any entry of ``errors``
            is truthy.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # Upper bound on the number of step iterators polled concurrently.
    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multithread executor (pid: {pid})".format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(os.getpid(), execution_plan.step_keys_to_execute),
    )

    with time_execution_scope() as timer_result:
        with execution_plan.start(retries=self.retries) as active_execution:
            active_iters = {}  # step key -> event iterator for a running step
            errors = {}  # handed to every step iterator; inspected after the loop

            # Keep looping while there is work left to start or while any started
            # iterator still has to be drained.
            while not active_execution.is_complete or active_iters:

                # start iterators
                while len(active_iters) < limit:
                    steps = active_execution.get_steps_to_execute(limit=(limit - len(active_iters)))

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        active_iters[step.key] = self.execute_step_in_thread(step.key, step_context, errors)

                # process active iterators
                # Finished keys are collected here and removed after the loop so
                # that active_iters is not mutated while being iterated.
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        # None is skipped without being yielded (presumably the
                        # thread has nothing to report yet -- confirm).
                        if event_or_none is None:
                            continue
                        yield event_or_none
                        active_execution.handle_event(event_or_none)

                    except ThreadCrashException:
                        # Report the crash as an engine event plus a synthesized
                        # step failure, then retire the iterator.
                        serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            f"Multithread executor: thread for step {key} exited unexpectedly",
                            EngineEventData.engine_error(serializable_error),
                        )
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(active_execution.get_step_by_key(key)),
                            step_failure_data=StepFailureData(error=serializable_error, user_failure_data=None),
                        )
                        active_execution.handle_event(step_failure_event)
                        yield step_failure_event
                        empty_iters.append(key)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skipped and abandoned steps
                for event in active_execution.plan_events_iterator(pipeline_context):
                    yield event

            # Only truthy entries count as errors.
            errs = {tid: err for tid, err in errors.items() if err}
            if errs:
                raise DagsterThreadError(
                    "During multithread execution errors occurred in threads:\n{error_list}".format(
                        error_list="\n".join(
                            [
                                "In thread {tid}: {err}".format(tid=tid, err=err.to_string())
                                for tid, err in errs.items()
                            ]
                        )
                    ),
                    thread_error_infos=list(errs.values()),
                )

    # Emitted after the timing scope has exited, then timer_result.millis is read.
    # NOTE(review): the start event above uses EngineEventData.in_process, while this
    # exit event uses EngineEventData.multiprocess -- confirm that is intentional.
    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multithread executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )