def execute(pipeline_context, execution_plan):
    """Execute the steps of ``execution_plan`` as Celery tasks, yielding events.

    This is a generator. On every tick it polls the outstanding Celery results
    (larger priority values first), yields the deserialized events of each
    finished task, yields the skip events reported by the active execution and,
    unless a worker error has been seen, submits the newly runnable steps.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context whose
            ``executor_config`` must be a ``CeleryConfig``.
        execution_plan (ExecutionPlan): The plan whose steps are executed.

    Yields:
        Events deserialized from worker results, skip events, and engine
        events describing task submission.

    Raises:
        DagsterSubprocessError: Once all outstanding results are drained, if
            fetching any worker result raised.
        Exception: Any error raised while submitting a task is re-raised after
            an engine error event has been yielded.
    """
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # NOTE(review): ``storage`` is None when the environment dict has no
    # 'storage' key, in which case the ``storage.get`` calls below raise
    # AttributeError -- confirm callers always configure storage.
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    app = make_app(celery_config)

    def priority_for_step(step):
        # Negated so that an ascending sort puts larger priorities first.
        return -1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)
        ) + -1 * _get_run_priority(pipeline_context)

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, celery.AsyncResult]
    step_errors = {}  # Dict[step_key, serializable error info]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    # Keep looping while there is work left to schedule, or while previously
    # submitted tasks are still outstanding (even when we are stopping).
    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(
            step_results.items(), key=lambda item: priority_for_key(item[0])
        ):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # Don't add any new steps if we are stopping. We still fall through to
        # the sleep below so that draining the outstanding results does not
        # turn into a busy loop that polls the result backend continuously.
        if not stopping:
            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get('dagster-celery/queue', task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                            step_key=step.key, queue=queue
                        ),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )
                    step_results[step.key] = _submit_task(app, pipeline_context, step, queue)
                except Exception:
                    # Surface the submission failure as an engine event, then propagate it.
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()),
                        ),
                    )
                    raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def inner_plan_execution_iterator(pipeline_context, execution_plan, retries):
    """Run the plan one step at a time in this process, yielding each event.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan to execute.
        retries (Retries): Retry configuration handed to the plan and to each step.

    Yields:
        Events from copying required intermediates, then for every step either
        a skip event (some inputs are uncovered) or the step's own events,
        followed by the skip events reported by the active execution.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.inst_param(retries, 'retries', Retries)

    for copy_event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
        yield copy_event

    # It would be good to implement a reference tracking algorithm here to
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    active_execution = execution_plan.start(retries=retries)

    while not active_execution.is_complete:
        step = active_execution.get_next_step()
        step_context = pipeline_context.for_step(step)

        absent_resources = [
            key
            for key in step_context.required_resource_keys
            if not hasattr(step_context.resources, key)
        ]
        check.invariant(
            not absent_resources,
            (
                'Expected step context for solid {solid_name} to have all required resources, but '
                'missing {missing_resources}.'
            ).format(solid_name=step_context.solid.name, missing_resources=absent_resources),
        )

        # capture all of the logs for this step
        with pipeline_context.instance.compute_log_manager.watch(
            step_context.pipeline_run, step_context.step.key
        ):
            uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                step_context, step
            )
            if not uncovered_inputs:
                step_events = check.generator(
                    _dagster_event_sequence_for_step(step_context, retries)
                )
                for produced_event in step_events:
                    check.inst(produced_event, DagsterEvent)
                    yield produced_event
                    active_execution.handle_event(produced_event)

                active_execution.verify_complete(pipeline_context, step.key)
            else:
                # In partial pipeline execution, we may end up here without having validated the
                # missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                skip_message = (
                    'Not all inputs covered for {step}. Not executing. Output missing for '
                    'inputs: {uncovered_inputs}'
                ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                step_context.log.info(skip_message)
                yield DagsterEvent.step_skipped_event(step_context)
                active_execution.mark_skipped(step.key)

        # process skips from failures or uncovered inputs
        for skip_event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield skip_event
def _type_check_data_from_event_dict(event_dict):
    """Build the TypeCheckData shared by STEP_OUTPUT and STEP_INPUT event dicts."""
    return TypeCheckData(
        success=event_dict['typeCheck']['success'],
        label=event_dict['typeCheck']['label'],
        description=event_dict.get('description'),
        metadata_entries=list(event_metadata_entries(event_dict.get('metadataEntries')) or []),
    )


def dagster_event_from_dict(event_dict, pipeline_name):
    """Reconstruct a DagsterEvent from a dict keyed by GraphQL-style field names.

    Args:
        event_dict (Dict[str, Any]): Event payload. Must contain ``__typename``
            and a ``step`` dict with ``key``, ``kind`` and ``solidHandleID``;
            the remaining keys read depend on the event type.
        pipeline_name (str): Name of the pipeline the event belongs to.

    Returns:
        DagsterEvent: The event, with ``logging_tags`` set to None.

    Raises:
        Exception: If ``__typename`` does not map to a handled event type.
    """
    check.dict_param(event_dict, 'event_dict', key_type=str)
    check.str_param(pipeline_name, 'pipeline_name')

    # Get event_type
    event_type = _handled_events().get(event_dict['__typename'])
    if not event_type:
        raise Exception('unhandled event type %s' % event_dict['__typename'])

    # Get event_specific_data
    event_specific_data = None
    if event_type == DagsterEventType.STEP_OUTPUT:
        event_specific_data = StepOutputData(
            step_output_handle=StepOutputHandle(
                event_dict['step']['key'], event_dict['outputName']
            ),
            type_check_data=_type_check_data_from_event_dict(event_dict),
        )

    elif event_type == DagsterEventType.STEP_INPUT:
        event_specific_data = StepInputData(
            input_name=event_dict['inputName'],
            type_check_data=_type_check_data_from_event_dict(event_dict),
        )

    elif event_type == DagsterEventType.STEP_SUCCESS:
        # The duration is not read from the dict; a placeholder of 0.0 is used.
        event_specific_data = StepSuccessData(0.0)

    elif event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = event_dict['materialization']
        event_specific_data = StepMaterializationData(
            materialization=materialization_from_data(materialization)
        )

    elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = expectation_result_from_data(event_dict['expectationResult'])
        event_specific_data = StepExpectationResultData(expectation_result)

    elif event_type == DagsterEventType.STEP_FAILURE:
        # Only the error message is read from the dict; stack and class name are dropped.
        error_info = SerializableErrorInfo(
            event_dict['error']['message'], stack=None, cls_name=None
        )
        # NOTE(review): the metadata entries are read from the top-level
        # 'metadataEntries' key rather than from 'failureMetadata' -- confirm
        # this matches the shape of the incoming payload.
        event_specific_data = StepFailureData(
            error_info,
            UserFailureData(
                label=event_dict['failureMetadata']['label'],
                description=event_dict['failureMetadata']['description'],
                metadata_entries=list(
                    event_metadata_entries(event_dict.get('metadataEntries')) or []
                ),
            )
            if event_dict.get('failureMetadata')
            else None,
        )

    # We should update the GraphQL response so that clients don't need to do this handle parsing.
    # See: https://github.com/dagster-io/dagster/issues/1559
    # The dotted handle ID is walked outermost-first, chaining each name to its parent.
    handle = None
    for handle_name in event_dict['step']['solidHandleID'].split('.'):
        handle = SolidHandle(handle_name, definition_name=None, parent=handle)

    return DagsterEvent(
        event_type_value=event_type.value,
        pipeline_name=pipeline_name,
        step_key=event_dict['step']['key'],
        solid_handle=handle,
        step_kind_value=event_dict['step']['kind'],
        logging_tags=None,
        event_specific_data=event_specific_data,
    )
def _dagster_event_sequence_for_step(
    step_context: StepExecutionContext,
) -> Iterator[DagsterEvent]:
    """
    Yield the dagster events for one step, translating raised errors into events.

    The step's events come either from the step launcher (when the context has
    one) or from the core in-process sequence. The following error cases are
    handled:

    (1) User code requests to be retried:
        A RetryRequested has been raised. The step is put in the up_for_retry
        state, or in a failure state when retries are disabled or the previous
        attempt count has reached the max_retries of the RetryRequested.

    (2) User code fails successfully:
        The user-space code has raised a Failure, which may carry explicit metadata.

    (3) User code fails unexpectedly:
        The user-space code has raised an Exception that was wrapped in an
        exception derived from DagsterUserCodeExecutionError; the original user
        exception is stashed on the wrapper.

    (4) Execution interrupted:
        The run was interrupted in the middle of execution (typically by a
        termination request).

    (5) User error:
        The framework raised a DagsterError that indicates a usage error or some
        other error not communicated by a user-thrown exception, e.g. the user
        yields an object out of a compute function that is not a proper event
        (not an Output, ExpectationResult, etc).

    (6) Framework failure:
        An unexpected error occurred. Either there has been an internal error in
        the framework OR a user code error boundary is missing around invoked
        user-space code. These terminate the computation immediately (by re-raising).

    Cases (2), (3) and (5) re-raise only when ``step_context.raise_on_error`` is
    set, which is mostly useful to get sensible errors in test and ad-hoc
    contexts rather than forcing the user to dig through the result API to find
    the failed step. Tools should leave it false and signal a sensible error
    message to the user themselves.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)

    try:
        launcher = step_context.step_launcher
        if launcher:
            # info all on step_context - should deprecate second arg
            event_source = launcher.launch_step(
                step_context, step_context.previous_attempt_count
            )
        else:
            event_source = core_dagster_event_sequence_for_step(step_context)

        for emitted in check.generator(event_source):
            yield emitted

    # case (1) in top comment
    except RetryRequested as retry_request:
        retry_err_info = serializable_error_info_from_exc_info(sys.exc_info())

        # Decide whether this retry request must be turned into a failure.
        if step_context.retry_mode.disabled:
            failure_message = "RetryRequested but retries are disabled"
        elif step_context.previous_attempt_count >= retry_request.max_retries:
            # retries.enabled or retries.deferred, but the budget is used up
            failure_message = "Exceeded max_retries of {}".format(retry_request.max_retries)
        else:
            failure_message = None

        if failure_message is None:
            yield DagsterEvent.step_retry_event(
                step_context,
                StepRetryData(
                    error=retry_err_info,
                    seconds_to_wait=retry_request.seconds_to_wait,
                ),
            )
        else:
            fail_err = SerializableErrorInfo(
                message=failure_message,
                stack=retry_err_info.stack,
                cls_name=retry_err_info.cls_name,
                cause=retry_err_info.cause,
            )
            step_context.capture_step_exception(retry_request)
            yield DagsterEvent.step_failure_event(
                step_context=step_context,
                step_failure_data=StepFailureData(error=fail_err, user_failure_data=None),
            )

    # case (2) in top comment
    except Failure as user_failure:
        step_context.capture_step_exception(user_failure)
        failure_metadata = UserFailureData(
            label="intentional-failure",
            description=user_failure.description,
            metadata_entries=user_failure.metadata_entries,
        )
        yield step_failure_event_from_exc_info(step_context, sys.exc_info(), failure_metadata)
        if step_context.raise_on_error:
            raise user_failure

    # case (3) in top comment
    except DagsterUserCodeExecutionError as wrapped_user_error:
        step_context.capture_step_exception(wrapped_user_error.user_exception)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.USER_CODE_ERROR,
        )
        if step_context.raise_on_error:
            # Surface the user's own exception rather than the framework wrapper.
            raise wrapped_user_error.user_exception

    # case (4) in top comment
    except (KeyboardInterrupt, DagsterExecutionInterruptedError) as interruption:
        step_context.capture_step_exception(interruption)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.INTERRUPT,
        )
        raise interruption

    # case (5) in top comment
    except DagsterError as framework_reported_error:
        step_context.capture_step_exception(framework_reported_error)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.FRAMEWORK_ERROR,
        )
        if step_context.raise_on_error:
            raise framework_reported_error

    # case (6) in top comment
    except Exception as unforeseen:  # pylint: disable=broad-except
        step_context.capture_step_exception(unforeseen)
        yield step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            error_source=ErrorSource.UNEXPECTED_ERROR,
        )
        raise unforeseen
def launch_step(self, step_handler_context: StepHandlerContext):
    """Create a Kubernetes job that executes a single step.

    Exactly one step key must be present in the execute-step args. The job
    image comes from the executor's job config, falling back to the container
    image of the pipeline's repository origin.

    Returns:
        A one-element list holding the engine event that announces the job.

    Raises:
        Exception: If neither the job config nor the repository origin
            provides an image.
    """
    run_args = step_handler_context.execute_step_args

    assert (
        len(run_args.step_keys_to_execute) == 1
    ), "Launching multiple steps is not currently supported"
    step_key = run_args.step_keys_to_execute[0]

    # The pod is named after the job.
    job_name = self._get_k8s_step_job_name(step_handler_context)
    pod_name = job_name

    args = run_args.get_command_args()

    job_config = self._job_config
    if not job_config.job_image:
        fallback_image = run_args.pipeline_origin.repository_origin.container_image
        job_config = job_config.with_image(fallback_image)

    if not job_config.job_image:
        raise Exception("No image included in either executor config or the job")

    step_tags = frozentags(step_handler_context.step_tags[step_key])
    user_defined_k8s_config = get_user_defined_k8s_config(step_tags)

    job_labels = {
        "dagster/job": run_args.pipeline_origin.pipeline_name,
        "dagster/op": step_key,
    }
    job = construct_dagster_k8s_job(
        job_config=job_config,
        args=args,
        job_name=job_name,
        pod_name=pod_name,
        component="step_worker",
        user_defined_k8s_config=user_defined_k8s_config,
        labels=job_labels,
    )

    # The announcement event is built before the job is actually created.
    launch_event = DagsterEvent(
        event_type_value=DagsterEventType.ENGINE_EVENT.value,
        pipeline_name=run_args.pipeline_origin.pipeline_name,
        step_key=step_key,
        message=f"Executing step {step_key} in Kubernetes job {job_name}",
        event_specific_data=EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
            ],
        ),
    )

    self._batch_api.create_namespaced_job(body=job, namespace=self._job_namespace)

    return [launch_event]
def execute(pipeline_context, execution_plan):
    """Execute the plan level by level with the multiprocess engine, yielding events.

    Steps of one level whose inputs are all covered are handed together to
    ``bounded_parallel_executor`` with the executor config's ``max_concurrent``
    as the limit. Steps with uncovered inputs are logged as errors and not run.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan to execute.

    Yields:
        An engine event announcing the parent process, the events from copying
        required intermediates, every step event, and a closing engine event
        carrying the total duration.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    step_levels = execution_plan.execution_step_levels()

    intermediates_manager = pipeline_context.intermediates_manager

    limit = pipeline_context.executor_config.max_concurrent

    step_key_set = {step.key for step in execution_plan.execution_steps()}

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=step_key_set
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
            yield event

        for step_level in step_levels:
            step_contexts_to_execute = []
            for step in step_level:
                step_context = pipeline_context.for_step(step)

                if not intermediates_manager.all_inputs_covered(step_context, step):
                    uncovered_inputs = intermediates_manager.uncovered_inputs(step_context, step)
                    # The first literal ends with a space so the two sentences
                    # do not run together in the rendered message.
                    step_context.log.error(
                        (
                            'Not all inputs covered for {step}. Not executing. '
                            'Output missing for inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                    )
                    continue

                step_contexts_to_execute.append(step_context)

            for step_event in bounded_parallel_executor(
                pipeline_context, step_contexts_to_execute, limit
            ):
                yield step_event

    # Emitted after the timing scope has closed, when the duration is read.
    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def core_dagster_event_sequence_for_step(
    step_context: SystemStepExecutionContext, prior_attempt_count: int
) -> Iterator[DagsterEvent]:
    """
    Run the step described by step_context and yield its DagsterEvents.

    The sequence is: a start (or restart) event, events produced while loading
    and type checking inputs, events for everything the compute function
    yields, and finally a success event carrying the measured duration.

    No exceptions that bubble up during the computation of the step are caught here.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.int_param(prior_attempt_count, "prior_attempt_count")

    if prior_attempt_count <= 0:
        yield DagsterEvent.step_start_event(step_context)
    else:
        yield DagsterEvent.step_restarted_event(step_context, prior_attempt_count)

    # Load every input that is not of the Nothing kind; loaders may interleave
    # events with the loaded value.
    inputs = {}
    for step_input in step_context.step.step_inputs:
        input_type = step_input.source.get_input_def(step_context.pipeline_def).dagster_type
        if input_type.kind == DagsterTypeKind.NOTHING:
            continue

        loaded_items = ensure_gen(step_input.source.load_input_object(step_context))
        for item in loaded_items:
            if not isinstance(item, DagsterEvent):
                check.invariant(step_input.name not in inputs)
                inputs[step_input.name] = item
            else:
                yield item

    for name, value in inputs.items():
        type_check_events = check.generator(
            _type_checked_event_sequence_for_input(step_context, name, value)
        )
        for type_check_event in type_check_events:
            yield type_check_event

    with time_execution_scope() as timer_result:
        user_event_sequence = check.generator(
            _user_event_sequence_for_step_compute_fn(step_context, inputs)
        )
        checked_user_events = check.generator(
            _step_output_error_checked_user_event_sequence(step_context, user_event_sequence)
        )

        # It is important for this loop to be indented within the
        # timer block above in order for time to be recorded accurately.
        for user_event in checked_user_events:
            if isinstance(user_event, (Output, DynamicOutput)):
                for output_event in _type_check_and_store_output(step_context, user_event):
                    yield output_event
            elif isinstance(user_event, (AssetMaterialization, Materialization)):
                yield DagsterEvent.step_materialization(step_context, user_event)
            elif isinstance(user_event, ExpectationResult):
                yield DagsterEvent.step_expectation_result(step_context, user_event)
            else:
                check.failed(
                    "Unexpected event {event}, should have been caught earlier".format(
                        event=user_event
                    )
                )

    success_data = StepSuccessData(duration_ms=timer_result.millis)
    yield DagsterEvent.step_success_event(step_context, success_data)
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    """Execute the plan's steps sequentially in the current process, yielding events.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run;
            its ``executor_config`` must be an ``ExecutorConfig``.
        execution_plan (ExecutionPlan): The plan to execute.
        step_keys_to_execute (Optional[List[str]]): When given and non-empty,
            only steps with these keys are run. None or an empty list runs
            every step.

    Yields:
        An opening engine event, then per step either a skip event (a
        dependency failed or was skipped, or inputs are uncovered) or the
        step's own events, and finally a closing engine event with the duration.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    if step_keys_to_execute is None:
        step_key_set = None
    else:
        step_key_set = set(step_keys_to_execute)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        # Keys of steps that failed or were skipped; anything depending on
        # them is skipped in turn.
        failed_or_skipped_steps = set()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for step_level in execution_plan.topological_step_levels():
            for step in step_level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                # capture all of the logs for this step
                with mirror_step_io(step_context):
                    failed_inputs = [
                        dependency_key
                        for step_input in step.step_inputs
                        for dependency_key in failed_or_skipped_steps.intersection(
                            step_input.dependency_keys
                        )
                    ]

                    if failed_inputs:
                        step_context.log.info(
                            (
                                'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                            ).format(step=step.key, failed_inputs=failed_inputs)
                        )
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                    else:
                        uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                            step_context, step
                        )
                        if uncovered_inputs:
                            # In partial pipeline execution, we may end up here without having
                            # validated the missing dependent outputs were optional
                            _assert_missing_inputs_optional(
                                uncovered_inputs, execution_plan, step.key
                            )

                            step_context.log.info(
                                (
                                    'Not all inputs covered for {step}. Not executing. Output missing for '
                                    'inputs: {uncovered_inputs}'
                                ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                            )
                            failed_or_skipped_steps.add(step.key)
                            yield DagsterEvent.step_skipped_event(step_context)
                        else:
                            for produced_event in check.generator(
                                dagster_event_sequence_for_step(step_context)
                            ):
                                check.inst(produced_event, DagsterEvent)
                                if produced_event.is_step_failure:
                                    failed_or_skipped_steps.add(step.key)

                                yield produced_event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
        ),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )
def test_fetch_records_by_update_timestamp(self, storage):
    """Run records can be filtered by ``updated_after`` and ordered by update timestamp."""
    assert storage
    self._skip_in_memory(storage)

    one, two, three = (make_new_run_id() for _ in range(3))

    # Add the runs in order: one (started), two (failed), three (started).
    initial_statuses = (
        (one, PipelineRunStatus.STARTED),
        (two, PipelineRunStatus.FAILURE),
        (three, PipelineRunStatus.STARTED),
    )
    for run_id, status in initial_statuses:
        storage.add_run(
            TestRunStorage.build_run(
                run_id=run_id, pipeline_name="some_pipeline", status=status
            )
        )

    # Three succeeds first; one fails only after two has failed and three has
    # succeeded, so the update order after two is [three, one].
    later_updates = (
        (three, DagsterEventType.PIPELINE_SUCCESS),
        (one, DagsterEventType.PIPELINE_FAILURE),
    )
    for run_id, event_type in later_updates:
        storage.handle_run_event(
            run_id,
            DagsterEvent(
                message="a message",
                event_type_value=event_type.value,
                pipeline_name="some_pipeline",
            ),
        )

    two_records = storage.get_run_records(
        filters=PipelineRunsFilter(run_ids=[two], updated_after=datetime(2020, 1, 1))
    )
    run_two_update_timestamp = two_records[0].update_timestamp

    updated_after_two = storage.get_run_records(
        filters=PipelineRunsFilter(updated_after=run_two_update_timestamp),
        order_by="update_timestamp",
        ascending=True,
    )
    assert [record.pipeline_run.run_id for record in updated_after_two] == [three, one]

    failed_after_two = storage.get_run_records(
        filters=PipelineRunsFilter(
            statuses=[PipelineRunStatus.FAILURE], updated_after=run_two_update_timestamp
        ),
    )
    assert [record.pipeline_run.run_id for record in failed_after_two] == [one]
def event_generator(
    self,
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    intermediate_storage=None,
    raise_on_error=False,
    resource_instances_to_override=None,
    output_capture=None,
):
    """Build the execution context, yielding resource events around it.

    The generator yields the resource setup events, then the constructed
    execution context itself, then the resource teardown events.

    Args:
        execution_plan (ExecutionPlan): Plan the context is created for.
        run_config (Dict[str, Any]): Run configuration, keyed by str.
        pipeline_run (PipelineRun): The run being executed.
        instance (DagsterInstance): The instance the run belongs to.
        scoped_resources_builder_cm (Callable): Factory for the resources
            manager that emits setup/teardown events.
        intermediate_storage (Optional[IntermediateStorage]): Passed through
            to ``create_intermediate_storage``.
        raise_on_error (bool): Re-raise a DagsterError from initialization
            after the init-failure event has been yielded.
        resource_instances_to_override (Optional[dict]): Forwarded to the
            resources manager.
        output_capture: Forwarded unchanged to ``construct_context``.

    Yields:
        Setup events, the execution context, teardown events; or, when a
        DagsterError is raised before the context exists, a pipeline init
        failure event followed by teardown events if the resources manager was
        created.

    Raises:
        DagsterError: When raised after the context was built (teardown
            failure), or during initialization when ``raise_on_error`` is set.
    """
    execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()

    run_config = check.dict_param(run_config, "run_config", key_type=str)
    pipeline_run = check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    instance = check.inst_param(instance, "instance", DagsterInstance)

    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, "scoped_resources_builder_cm"
    )
    # The name passed to the check must match the real parameter so that a
    # check failure points at the right argument.
    intermediate_storage = check.opt_inst_param(
        intermediate_storage, "intermediate_storage", IntermediateStorage
    )
    raise_on_error = check.bool_param(raise_on_error, "raise_on_error")
    resource_instances_to_override = check.opt_dict_param(
        resource_instances_to_override, "resource_instances_to_override"
    )

    # Both stay None until created; the except block uses them to tell how far
    # initialization got.
    execution_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
        )

        log_manager = create_log_manager(context_creation_data)
        resource_defs = execution_plan.pipeline_def.get_mode_definition(
            context_creation_data.environment_config.mode
        ).resource_defs
        resources_manager = scoped_resources_builder_cm(
            resource_defs=resource_defs,
            resource_configs=context_creation_data.environment_config.resources,
            log_manager=log_manager,
            execution_plan=execution_plan,
            pipeline_run=context_creation_data.pipeline_run,
            resource_keys_to_init=context_creation_data.resource_keys_to_init,
            instance=instance,
            resource_instances_to_override=resource_instances_to_override,
            emit_persistent_events=True,
        )
        yield from resources_manager.generate_setup_events()
        scoped_resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder
        )

        intermediate_storage = create_intermediate_storage(
            context_creation_data,
            intermediate_storage,
            scoped_resources_builder,
        )

        execution_context = self.construct_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            log_manager=log_manager,
            intermediate_storage=intermediate_storage,
            raise_on_error=raise_on_error,
            output_capture=output_capture,
        )

        _validate_plan_with_context(execution_context, execution_plan)

        yield execution_context
        yield from resources_manager.generate_teardown_events()
    except DagsterError as dagster_error:
        if execution_context is None:
            # Initialization failed before the context existed.
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists if is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error
                else sys.exc_info()
            )
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)
            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(
                    instance, pipeline_run, pipeline_def
                ),
            )
            if resources_manager:
                yield from resources_manager.generate_teardown_events()
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
def inner_plan_execution_iterator(pipeline_context, execution_plan):
    """Run the plan one step at a time in this process, yielding each event.

    Retry settings are read from ``pipeline_context.retries``. For every step
    the events yielded (the step's own events or a skip event, plus the plan
    events reported afterwards) are collected and handed to the hook trigger,
    whose events are yielded as well.

    Args:
        pipeline_context (SystemExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan to execute.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    retries = pipeline_context.retries

    yield from copy_required_intermediates_for_execution(pipeline_context, execution_plan)

    with execution_plan.start(retries=retries) as active_execution:

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = pipeline_context.for_step(step)
            # Everything yielded for this step, later passed to the hooks.
            collected_events = []

            absent_resources = [
                key
                for key in step_context.required_resource_keys
                if not hasattr(step_context.resources, key)
            ]
            check.invariant(
                not absent_resources,
                (
                    "Expected step context for solid {solid_name} to have all required resources, but "
                    "missing {missing_resources}."
                ).format(solid_name=step_context.solid.name, missing_resources=absent_resources),
            )

            # capture all of the logs for this step
            with pipeline_context.instance.compute_log_manager.watch(
                step_context.pipeline_run, step_context.step.key
            ):
                missing_input_sources = (
                    pipeline_context.intermediate_storage.get_missing_input_sources(
                        step_context, step
                    )
                )
                if not missing_input_sources:
                    step_events = check.generator(
                        _dagster_event_sequence_for_step(step_context, retries)
                    )
                    for produced_event in step_events:
                        check.inst(produced_event, DagsterEvent)
                        collected_events.append(produced_event)
                        yield produced_event
                        active_execution.handle_event(produced_event)

                    active_execution.verify_complete(pipeline_context, step.key)
                else:
                    # In partial pipeline execution, we may end up here without having validated
                    # the missing dependent outputs were optional
                    _assert_missing_sources_from_optional_outputs(
                        missing_input_sources, execution_plan, step.key
                    )
                    skip_message = (
                        "Not all inputs covered for {step}. Not executing. Sources missing: {missing_input_sources}"
                    ).format(missing_input_sources=missing_input_sources, step=step.key)
                    step_context.log.info(skip_message)

                    skipped_event = DagsterEvent.step_skipped_event(step_context)
                    collected_events.append(skipped_event)
                    yield skipped_event
                    active_execution.mark_skipped(step.key)

            # process skips from failures or uncovered inputs
            for plan_event in active_execution.plan_events_iterator(pipeline_context):
                collected_events.append(plan_event)
                yield plan_event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, collected_events):
                yield hook_event
def inner_plan_execution_iterator(
    pipeline_context: PlanExecutionContext, execution_plan: ExecutionPlan
) -> Iterator[DagsterEvent]:
    """Run the plan one step at a time in this process, yielding each event.

    For every step a log-capture event is yielded first, then the step's own
    events, then the plan events reported by the active execution, and finally
    the events produced by the hooks, which receive the step and plan events
    collected for that step.
    """
    check.inst_param(pipeline_context, "pipeline_context", PlanExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    with execution_plan.start(retry_mode=pipeline_context.retry_mode) as active_execution:

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            attempt_count = active_execution.retry_state.get_attempt_count(step.key)
            step_context = cast(
                StepExecutionContext,
                pipeline_context.for_step(step, attempt_count),
            )
            # Step and plan events for this step, later passed to the hooks.
            collected_events = []

            absent_resources = [
                key
                for key in step_context.required_resource_keys
                if not hasattr(step_context.resources, key)
            ]
            check.invariant(
                not absent_resources,
                (
                    "Expected step context for solid {solid_name} to have all required resources, but "
                    "missing {missing_resources}."
                ).format(solid_name=step_context.solid.name, missing_resources=absent_resources),
            )

            # capture all of the logs for this step
            with pipeline_context.instance.compute_log_manager.watch(
                step_context.pipeline_run, step_context.step.key
            ):
                # The capture event itself is not handed to the hooks.
                yield DagsterEvent.capture_logs(
                    step_context, log_key=step_context.step.key, steps=[step_context.step]
                )

                step_events = check.generator(_dagster_event_sequence_for_step(step_context))
                for produced_event in step_events:
                    check.inst(produced_event, DagsterEvent)
                    collected_events.append(produced_event)
                    yield produced_event
                    active_execution.handle_event(produced_event)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for plan_event in active_execution.plan_events_iterator(pipeline_context):
                collected_events.append(plan_event)
                yield plan_event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, collected_events):
                yield hook_event
def pipeline_initialization_event_generator(
    pipeline_def,
    environment_dict,
    pipeline_run,
    instance,
    execution_plan,
    scoped_resources_builder_cm,
    system_storage_data=None,
    raise_on_error=False,
):
    """Build the pipeline execution context, yielding setup and teardown events around it.

    The generator yields the resource setup events, then the constructed
    pipeline context itself, then the resource teardown events.

    If a ``DagsterError`` is raised before the context exists, a
    pipeline-init-failure event is yielded instead (followed by teardown events
    when a resources manager was already created), and the error is re-raised
    only when ``raise_on_error`` is true. A ``DagsterError`` raised after the
    context exists is always re-raised.
    """
    pipeline_def = check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    environment_dict = check.dict_param(environment_dict, 'environment_dict', key_type=str)
    pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    instance = check.inst_param(instance, 'instance', DagsterInstance)
    execution_plan = check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, 'scoped_resources_builder_cm'
    )
    system_storage_data = check.opt_inst_param(
        system_storage_data, 'system_storage_data', SystemStorageData
    )
    raise_on_error = check.bool_param(raise_on_error, 'raise_on_error')

    resources_manager = None
    pipeline_context = None

    try:
        creation_data = create_context_creation_data(
            pipeline_def,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan,
        )
        executor_config = create_executor_config(creation_data)
        log_manager = create_log_manager(creation_data)

        resources_manager = scoped_resources_builder_cm(
            execution_plan,
            creation_data.environment_config,
            creation_data.pipeline_run,
            log_manager,
            creation_data.resource_keys_to_init,
        )
        for setup_event in resources_manager.generate_setup_events():
            yield setup_event

        resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder
        )
        system_storage_data = create_system_storage_data(
            creation_data, system_storage_data, resources_builder
        )
        pipeline_context = construct_pipeline_execution_context(
            context_creation_data=creation_data,
            scoped_resources_builder=resources_builder,
            system_storage_data=system_storage_data,
            log_manager=log_manager,
            executor_config=executor_config,
            raise_on_error=raise_on_error,
        )
        _validate_plan_with_context(pipeline_context, execution_plan)

        yield pipeline_context

        for teardown_event in resources_manager.generate_teardown_events():
            yield teardown_event

    except DagsterError as dagster_error:
        if pipeline_context is not None:
            # pipeline teardown failure
            raise dagster_error

        # pylint does not know original_exc_info exists is is_user_code_error is true
        # pylint: disable=no-member
        if dagster_error.is_user_code_error:
            user_facing_exc_info = dagster_error.original_exc_info
        else:
            user_facing_exc_info = sys.exc_info()
        error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

        yield DagsterEvent.pipeline_init_failure(
            pipeline_name=pipeline_def.name,
            failure_data=PipelineInitFailureData(error=error_info),
            log_manager=_create_context_free_log_manager(
                instance, pipeline_run, pipeline_def
            ),
        )

        if resources_manager:
            for teardown_event in resources_manager.generate_teardown_events():
                yield teardown_event

        if raise_on_error:
            raise dagster_error
def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    """Submit plan steps as Celery tasks and relay the events the tasks return.

    Args:
        pipeline_context: a ``SystemPipelineExecutionContext`` whose
            ``executor_config`` is a ``CeleryConfig`` or ``CeleryK8sJobConfig``.
        execution_plan: the ``ExecutionPlan`` to run.
        step_execution_fn: callable invoked as
            ``step_execution_fn(app, pipeline_context, step, queue, priority)``;
            its return value is polled with ``ready()`` and read with ``get()``.

    Yields:
        Engine events for each submission, the deserialized events returned by
        finished tasks, and skipped-step events from the active execution.

    Raises:
        DagsterSubprocessError: after the loop ends, if any task result raised
            when it was read.
    """
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    def priority_for_step(step):
        # Negated so that a higher configured priority sorts first.
        return (
            -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
            + -1 * _get_run_priority(pipeline_context)
        )

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, celery.AsyncResult]
    step_errors = {}  # Dict[step_key, serializable error info]

    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    # Keep looping while there is work left to schedule, or while previously
    # submitted tasks are still outstanding (even if we are stopping).
    while (not active_execution.is_complete and not stopping) or step_results:
        results_to_pop = []

        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True

                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if not stopping:
            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                            step_key=step.key, queue=queue
                        ),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )

                    # Get the Celery priority for this step
                    priority = _get_step_priority(pipeline_context, step)

                    # Submit the Celery tasks
                    step_results[step.key] = step_execution_fn(
                        app, pipeline_context, step, queue, priority
                    )

                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()),
                        ),
                    )
                    raise

        # Sleep on every tick, including while stopping: otherwise the loop
        # busy-polls the outstanding results until the last task finishes.
        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def test_multiline_logging_complex():
    """Log a STEP_FAILURE event and check the key/value lines the logger emits.

    Every line after the first of the captured message is treated as one
    ``key = value`` pair. The simple pairs are compared verbatim, the generated
    id and timestamp are matched by regex, and the ``dagster_event`` value is
    parsed as JSON and compared with the expected serialized structure.
    """
    msg = 'DagsterEventType.STEP_FAILURE for step start.materialization.output.result.0'

    error_message = (
        "FileNotFoundError: [Errno 2] No such file or directory: '/path/to/file'\n"
    )
    stack_frames = [
        ' File "/Users/nate/src/dagster/python_modules/dagster/dagster/core/errors.py", line 186, in user_code_error_boundary\n yield\n',
        ' File "/Users/nate/src/dagster/python_modules/dagster/dagster/core/execution_plan/simple_engine.py", line 365, in _event_sequence_for_step_compute_fn\n for step_output in gen:\n',
        ' File "/Users/nate/src/dagster/python_modules/dagster/dagster/core/execution_plan/materialization_thunk.py", line 28, in _fn\n runtime_type.output_materialization_config.materialize_runtime_value(config_spec, runtime_value)\n',
        ' File "/Users/nate/src/dagster/python_modules/dagster/dagster/core/types/config_schema.py", line 93, in materialize_runtime_value\n return func(config_value, runtime_value)\n',
        ' File "/Users/nate/src/dagster/python_modules/dagster/dagster/core/types/config_schema.py", line 110, in _selector\n return func(selector_key, selector_value, runtime_value)\n',
        ' File "/Users/nate/src/dagster/python_modules/dagster/dagster/core/types/builtin_config_schemas.py", line 59, in _builtin_output_schema\n with open(json_file_path, \'w\') as ff:\n',
    ]

    kwargs = {
        'pipeline': 'example',
        'pipeline_name': 'example',
        'step_key': 'start.materialization.output.result.0',
        'solid': 'start',
        'solid_definition': 'emit_num',
        'dagster_event': DagsterEvent(
            event_type_value='STEP_FAILURE',
            pipeline_name='error_monster',
            step_key='start.materialization.output.result.0',
            solid_handle=SolidHandle('start', 'emit_num', None),
            step_kind_value='MATERIALIZATION_THUNK',
            logging_tags={
                'pipeline': 'error_monster',
                'step_key': 'start.materialization.output.result.0',
                'solid': 'start',
                'solid_definition': 'emit_num',
            },
            event_specific_data=StepFailureData(
                error=SerializableErrorInfo(
                    message=error_message,
                    # Pass a copy so the expected value below stays independent.
                    stack=list(stack_frames),
                    cls_name='FileNotFoundError',
                ),
                user_failure_data=None,
            ),
        ),
    }

    with _setup_logger(DAGSTER_DEFAULT_LOGGER) as (captured_results, logger):
        dl = DagsterLogManager('123', {}, [logger])
        dl.info(msg, **kwargs)

        # Drop the first line; each remaining line is one "key = value" pair.
        kv_pairs = set(captured_results[0].split('\n')[1:])

    expected_pairs = [
        ' orig_message = "DagsterEventType.STEP_FAILURE for step start.materialization.output.result.0"',
        ' run_id = "123"',
        ' pipeline = "example"',
        ' solid_definition = "emit_num"',
        ' pipeline_name = "example"',
        ' solid = "start"',
        ' step_key = "start.materialization.output.result.0"',
    ]
    for e in expected_pairs:
        assert e in kv_pairs

    assert _regex_match_kv_pair(r' log_message_id = "{0}"'.format(REGEX_UUID), kv_pairs)
    assert _regex_match_kv_pair(r' log_timestamp = "{0}"'.format(REGEX_TS), kv_pairs)

    expected_dagster_event = {
        'event_specific_data': [
            [
                error_message,
                stack_frames,
                'FileNotFoundError',
            ],
            None,  # user_failure_data
        ],
        'event_type_value': 'STEP_FAILURE',
        'message': None,
        'pipeline_name': 'error_monster',
        'solid_handle': ['start', 'emit_num', None],
        'step_key': 'start.materialization.output.result.0',
        'step_kind_value': 'MATERIALIZATION_THUNK',
        'logging_tags': {
            'pipeline': 'error_monster',
            'solid': 'start',
            'solid_definition': 'emit_num',
            'step_key': 'start.materialization.output.result.0',
        },
    }

    dagster_event_pair = [pair for pair in kv_pairs if 'dagster_event' in pair][0]
    # Take everything after the first '=' as the JSON payload. str.strip() with
    # a multi-character argument strips a *set of characters*, not a prefix,
    # and only happened to work because the payload starts with '{'.
    dagster_event = json.loads(dagster_event_pair.split('=', 1)[1])
    assert dagster_event == expected_dagster_event
def execute(self, plan_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):
    """Orchestrate ``execution_plan`` by delegating each step to the step handler.

    This is a generator. It yields engine events describing what the executor
    is doing, the events returned by ``self._pop_events`` (except step-skipped
    events, see below), and the plan events produced by the active execution.

    When ``plan_context.resume_from_failure`` is set, previously recorded
    events are replayed into the active execution first, and steps that may
    still be in flight are health-checked before the main loop starts.

    The generator returns early when the active execution reports an
    interrupt.
    """
    check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # Reset the event cursor at the start of every execution.
    # NOTE(review): presumably consumed by self._pop_events - confirm.
    self._event_cursor = -1  # pylint: disable=attribute-defined-outside-init

    yield DagsterEvent.engine_event(
        plan_context,
        f"Starting execution with step handler {self._step_handler.name}",
        EngineEventData(),
    )

    with execution_plan.start(retry_mode=self.retries) as active_execution:
        # Steps handed to the step handler that have not yet reported
        # success or failure, keyed by step key.
        running_steps: Dict[str, ExecutionStep] = {}

        if plan_context.resume_from_failure:
            yield DagsterEvent.engine_event(
                plan_context,
                "Resuming execution from failure",
                EngineEventData(),
            )

            prior_events = self._pop_events(
                plan_context.instance,
                plan_context.run_id,
            )

            # Re-emit everything that was already recorded for this run.
            for dagster_event in prior_events:
                yield dagster_event

            possibly_in_flight_steps = active_execution.rebuild_from_events(
                prior_events)
            for step in possibly_in_flight_steps:

                yield DagsterEvent.engine_event(
                    plan_context,
                    "Checking on status of possibly launched steps",
                    EngineEventData(),
                    step.handle,
                )

                # TODO: check if failure event included. For now, hacky assumption that
                # we don't log anything on successful check
                if self._step_handler.check_step_health(
                        self._get_step_handler_context(
                            plan_context, [step], active_execution)):
                    # health check failed, launch the step
                    self._log_new_events(
                        self._step_handler.launch_step(
                            self._get_step_handler_context(
                                plan_context, [step], active_execution)),
                        plan_context,
                        {
                            step.key: step
                            for step in possibly_in_flight_steps
                        },
                    )

                # Every possibly-in-flight step is tracked as running.
                running_steps[step.key] = step

        last_check_step_health_time = pendulum.now("UTC")

        # Order of events is important here. During an iteration, we call handle_event, then get_steps_to_execute,
        # then is_complete. get_steps_to_execute updates the state of ActiveExecution, and without it
        # is_complete can return true when we're just between steps.
        while not active_execution.is_complete:

            if active_execution.check_for_interrupts():
                if not plan_context.instance.run_will_resume(
                        plan_context.run_id):
                    # The run will not be resumed: ask the step handler to
                    # terminate every step that is still running.
                    yield DagsterEvent.engine_event(
                        plan_context,
                        "Executor received termination signal, forwarding to steps",
                        EngineEventData.interrupted(
                            list(running_steps.keys())),
                    )
                    active_execution.mark_interrupted()
                    for _, step in running_steps.items():
                        self._log_new_events(
                            self._step_handler.terminate_step(
                                self._get_step_handler_context(
                                    plan_context, [step], active_execution)),
                            plan_context,
                            running_steps,
                        )
                else:
                    # The run will be resumed: leave the steps alone and only
                    # record which ones were in flight.
                    yield DagsterEvent.engine_event(
                        plan_context,
                        "Executor received termination signal, not forwarding to steps because "
                        "run will be resumed",
                        EngineEventData(metadata_entries=[
                            EventMetadataEntry.text(
                                str(running_steps.keys()), "steps_in_flight")
                        ]),
                    )
                    active_execution.mark_interrupted()

                # Either way, an interrupt ends orchestration.
                return

            for dagster_event in self._pop_events(
                    plan_context.instance,
                    plan_context.run_id,
            ):  # type: ignore

                # STEP_SKIPPED events are only emitted by ActiveExecution, which already handles
                # and yields them.
                if dagster_event.is_step_skipped:
                    assert isinstance(dagster_event.step_key, str)
                    active_execution.verify_complete(
                        plan_context, dagster_event.step_key)
                else:
                    yield dagster_event
                    active_execution.handle_event(dagster_event)

                    # A terminal step event means the step is no longer running.
                    if dagster_event.is_step_success or dagster_event.is_step_failure:
                        assert isinstance(dagster_event.step_key, str)
                        del running_steps[dagster_event.step_key]
                        active_execution.verify_complete(
                            plan_context, dagster_event.step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(
                    plan_context):
                yield event

            # Health-check the running steps at most once per configured interval.
            curr_time = pendulum.now("UTC")
            if (curr_time - last_check_step_health_time).total_seconds(
            ) >= self._check_step_health_interval_seconds:
                last_check_step_health_time = curr_time
                for _, step in running_steps.items():
                    self._log_new_events(
                        self._step_handler.check_step_health(
                            self._get_step_handler_context(
                                plan_context, [step], active_execution)),
                        plan_context,
                        running_steps,
                    )

            # Launch whatever the plan says is ready to run now.
            for step in active_execution.get_steps_to_execute():
                running_steps[step.key] = step
                self._log_new_events(
                    self._step_handler.launch_step(
                        self._get_step_handler_context(
                            plan_context, [step], active_execution)),
                    plan_context,
                    running_steps,
                )

            time.sleep(self._sleep_seconds)
def host_mode_execution_context_event_generator(
    pipeline,
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    raise_on_error,
    executor_defs,
    output_capture,
    resume_from_failure: bool = False,
):
    """Yield a ``PlanOrchestrationContext`` for host-mode execution.

    A log manager is built from the default system loggers, the host-mode
    executor is resolved, and the orchestration context is yielded.

    If a ``DagsterError`` is raised before the context exists, a
    pipeline-failure event is logged and yielded instead, and the error is
    re-raised only when ``raise_on_error`` is true. A ``DagsterError`` raised
    after the context exists is always re-raised.

    ``output_capture`` must be ``None``.
    """
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.inst_param(pipeline, "pipeline", ReconstructablePipeline)
    check.dict_param(run_config, "run_config", key_type=str)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)
    executor_defs = check.list_param(
        executor_defs, "executor_defs", of_type=ExecutorDefinition
    )
    check.bool_param(raise_on_error, "raise_on_error")
    check.invariant(output_capture is None)

    execution_context = None

    loggers = []
    for system_logger_def, system_logger_config in default_system_loggers():
        init_context = InitLoggerContext(
            system_logger_config,
            pipeline_def=None,
            logger_def=system_logger_def,
            run_id=pipeline_run.run_id,
        )
        loggers.append(system_logger_def.logger_fn(init_context))

    log_manager = DagsterLogManager.create(
        loggers=loggers, pipeline_run=pipeline_run, instance=instance
    )

    try:
        executor = _get_host_mode_executor(pipeline, run_config, executor_defs, instance)
        plan_data = PlanData(
            pipeline=pipeline,
            pipeline_run=pipeline_run,
            instance=instance,
            execution_plan=execution_plan,
            raise_on_error=raise_on_error,
            retry_mode=executor.retries,
        )
        execution_context = PlanOrchestrationContext(
            plan_data=plan_data,
            log_manager=log_manager,
            executor=executor,
            output_capture=None,
            resume_from_failure=resume_from_failure,
        )

        yield execution_context

    except DagsterError as dagster_error:
        if execution_context is not None:
            # pipeline teardown failure
            raise dagster_error

        # pylint does not know original_exc_info exists is is_user_code_error is true
        # pylint: disable=no-member
        if dagster_error.is_user_code_error:
            user_facing_exc_info = dagster_error.original_exc_info  # type: ignore
        else:
            user_facing_exc_info = sys.exc_info()
        error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

        failure_message = (
            f'Pipeline failure during initialization for pipeline "{pipeline_run.pipeline_name}". '
            "This may be due to a failure in initializing the executor or one of the loggers."
        )
        failure_event = DagsterEvent.pipeline_failure(
            pipeline_context_or_name=pipeline_run.pipeline_name,
            context_msg=failure_message,
            error_info=error_info,
        )
        log_manager.log_dagster_event(
            level=logging.ERROR,
            msg=failure_event.message,
            dagster_event=failure_event,  # type: ignore
        )
        yield failure_event

        if raise_on_error:
            raise dagster_error
def core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    """Submit plan steps as Celery tasks and relay the events the tasks return.

    Args:
        pipeline_context: a ``SystemPipelineExecutionContext``; its ``executor``
            supplies ``app_args()`` and ``retries``.
        execution_plan: the ``ExecutionPlan`` to run; its artifacts must be
            persisted.
        step_execution_fn: callable invoked as
            ``step_execution_fn(app, pipeline_context, step, queue, priority)``;
            its return value is polled with ``ready()``, read with ``get()``
            and cancelled with ``revoke()`` on interrupt.

    Yields:
        Engine events (submission, revocation, interruption), the deserialized
        events returned by finished tasks, and plan events from the active
        execution.

    Raises:
        DagsterSubprocessError: after the loop ends, if any task result raised
            something other than ``TaskRevokedError`` when it was read.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.callable_param(step_execution_fn, "step_execution_fn")

    executor = pipeline_context.executor

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        execution_plan.artifacts_persisted,
        "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or "
        "similar system that allows files to be available to all nodes), S3, or GCS",
    )

    app = make_app(executor.app_args())

    def priority_for_step(step):
        # Negated so that a higher configured priority sorts first.
        return (
            -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
            + -1 * _get_run_priority(pipeline_context)
        )

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, celery.AsyncResult]
    step_errors = {}  # Dict[step_key, serializable error info]

    with execution_plan.start(
        retries=pipeline_context.executor.retries,
        sort_key_fn=priority_for_step,
    ) as active_execution:

        stopping = False

        # Keep looping while there is work left to schedule, or while
        # previously submitted tasks are still outstanding.
        while (not active_execution.is_complete and not stopping) or step_results:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Celery executor: received termination signal - revoking active tasks from workers",
                    EngineEventData.interrupted(list(step_results.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                for result in step_results.values():
                    result.revoke()

            results_to_pop = []

            for step_key, result in sorted(
                step_results.items(), key=lambda x: priority_for_key(x[0])
            ):
                if result.ready():
                    try:
                        step_events = result.get()
                    except TaskRevokedError:
                        # A revoked task is reported but not treated as an error.
                        step_events = []
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'celery task for running step "{step_key}" was revoked.'.format(
                                step_key=step_key,
                            ),
                            EngineEventData(marker_end=DELEGATE_MARKER),
                            step_handle=active_execution.get_step_by_key(step_key).handle,
                        )
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[step_key] = serializable_error_info_from_exc_info(
                            sys.exc_info()
                        )

                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context, step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if not (stopping or step_errors):
                # This is a slight refinement. If we have n workers idle and schedule m > n steps for
                # execution, the first n steps will be picked up by the idle workers in the order in
                # which they are scheduled (and the following m-n steps will be executed in priority
                # order, provided that it takes longer to execute a step than to schedule it). The test
                # case has m >> n to exhibit this behavior in the absence of this sort step.
                for step in active_execution.get_steps_to_execute():
                    try:
                        queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                                step_key=step.key, queue=queue
                            ),
                            EngineEventData(marker_start=DELEGATE_MARKER),
                            step_handle=step.handle,
                        )

                        # Get the Celery priority for this step
                        priority = _get_step_priority(pipeline_context, step)

                        # Submit the Celery tasks
                        step_results[step.key] = step_execution_fn(
                            app, pipeline_context, step, queue, priority
                        )

                    except Exception:
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            "Encountered error during celery task submission.",
                            event_specific_data=EngineEventData.engine_error(
                                serializable_error_info_from_exc_info(sys.exc_info()),
                            ),
                        )
                        raise

            # Sleep on every tick, including while stopping or after an error:
            # otherwise the loop busy-polls the outstanding results until the
            # last task finishes.
            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                "During celery execution errors occurred in workers:\n{error_list}".format(
                    error_list="\n".join(
                        [
                            "[{step}]: {err}".format(step=key, err=err.to_string())
                            for key, err in step_errors.items()
                        ]
                    )
                ),
                subprocess_error_infos=list(step_errors.values()),
            )
def execute(pipeline_context, execution_plan):
    """Run the plan's steps through ``execute_step_out_of_process``, yielding their events.

    At most ``pipeline_context.executor_config.max_concurrent`` step iterators
    are kept active at a time. Each pass of the main loop starts new iterators
    while there is capacity, pulls one item from every active iterator, retires
    the exhausted ones and yields skipped-step events.

    Yields:
        An engine event at start and at exit, every non-``None`` event produced
        by the step iterators, and skipped-step events.

    Raises:
        DagsterSubprocessError: if any entry in the shared ``errors`` dict is
            truthy once the loop has finished.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    # NOTE(review): assigned but never read in this function.
    intermediates_manager = pipeline_context.intermediates_manager

    # Maximum number of step iterators kept active at once.
    limit = pipeline_context.executor_config.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'
        .format(pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(),
            step_keys_to_execute=execution_plan.step_keys_to_execute),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:

        active_execution = execution_plan.start(
            retries=pipeline_context.executor_config.retries)
        active_iters = {}  # step key -> iterator from execute_step_out_of_process
        errors = {}  # passed to every step iterator; truthy values are raised at the end
        term_events = {}  # step key -> Event, set to ask that step to stop
        stopping = False

        # Keep going while there is work to start, or while iterators that were
        # already started still have to be drained.
        while (not stopping
               and not active_execution.is_complete) or active_iters:
            try:
                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(
                        limit=(limit - len(active_iters)))

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[
                            step.key] = get_multiprocessing_context(
                            ).Event()
                        active_iters[
                            step.key] = execute_step_out_of_process(
                                step_context, step, errors, term_events)

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        # Pull a single item per iterator per pass; None means
                        # there is nothing to report for this step right now.
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)

                    except StopIteration:
                        # Collected here and removed below, so the dict is not
                        # mutated while it is being iterated.
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    # A finished step whose termination event is set puts the
                    # whole engine into stopping mode.
                    if term_events[key].is_set():
                        stopping = True
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skips from failures or uncovered inputs
                for event in active_execution.skipped_step_events_iterator(
                        pipeline_context):
                    yield event

            # In the very small chance that we get interrupted in this coordination section and not
            # polling the subprocesses for events - try to clean up gracefully
            except KeyboardInterrupt:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                    EngineEventData.interrupted(list(term_events.keys())),
                )
                stopping = True
                for event in term_events.values():
                    event.set()

        # Only truthy entries count as errors.
        errs = {pid: err for pid, err in errors.items() if err}
        if errs:
            raise DagsterSubprocessError(
                'During multiprocess execution errors occurred in child processes:\n{error_list}'
                .format(error_list='\n'.join([
                    'In process {pid}: {err}'.format(pid=pid,
                                                     err=err.to_string())
                    for pid, err in errs.items()
                ])),
                subprocess_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'
        .format(duration=format_duration(timer_result.millis),
                pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def core_dagster_event_sequence_for_step(step_context, prior_attempt_count):
    """
    Execute the step within the step_context argument given the in-memory
    events. This function yields a sequence of DagsterEvents, but without
    catching any exceptions that have bubbled up during the computation
    of the step.

    Args:
        step_context (SystemStepExecutionContext): context of the step to run.
        prior_attempt_count (int): number of earlier attempts; a value above
            zero yields a step-restarted event instead of a step-start event.

    Yields:
        DagsterEvent: the start/restart event, one store-operation event per
        input loaded through an object or asset store, input type-check events,
        the events derived from what the compute function emits, and finally a
        step-success event carrying the measured duration.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.int_param(prior_attempt_count, "prior_attempt_count")

    if prior_attempt_count > 0:
        yield DagsterEvent.step_restarted_event(step_context, prior_attempt_count)
    else:
        yield DagsterEvent.step_start_event(step_context)

    inputs = {}
    for input_name, input_value in _input_values_from_intermediate_storage(step_context):
        if isinstance(input_value, ObjectStoreOperation):
            yield DagsterEvent.object_store_operation(
                step_context,
                ObjectStoreOperation.serializable(input_value, value_name=input_name),
            )
            inputs[input_name] = input_value.obj
        elif isinstance(input_value, MultipleStepOutputsListWrapper):
            # Fan-in input: emit one event per wrapped operation. The type
            # checks must look at each element (`op`), not at the wrapper;
            # testing the wrapper meant no event was ever emitted here.
            for op in input_value:
                if isinstance(op, ObjectStoreOperation):
                    yield DagsterEvent.object_store_operation(
                        step_context,
                        ObjectStoreOperation.serializable(op, value_name=input_name),
                    )
                elif isinstance(op, AssetStoreOperation):
                    yield DagsterEvent.asset_store_operation(step_context, op)
            inputs[input_name] = [op.obj for op in input_value]
        elif isinstance(input_value, AssetStoreOperation):
            yield DagsterEvent.asset_store_operation(step_context, input_value)
            inputs[input_name] = input_value.obj
        else:
            # Plain value: use it as-is.
            inputs[input_name] = input_value

    for input_name, input_value in inputs.items():
        for evt in check.generator(
            _type_checked_event_sequence_for_input(step_context, input_name, input_value)
        ):
            yield evt

    with time_execution_scope() as timer_result:
        user_event_sequence = check.generator(
            _user_event_sequence_for_step_compute_fn(step_context, inputs)
        )

        # It is important for this loop to be indented within the
        # timer block above in order for time to be recorded accurately.
        for user_event in check.generator(
            _step_output_error_checked_user_event_sequence(step_context, user_event_sequence)
        ):
            if isinstance(user_event, Output):
                for evt in _create_step_events_for_output(step_context, user_event):
                    yield evt
            elif isinstance(user_event, (AssetMaterialization, Materialization)):
                yield DagsterEvent.step_materialization(step_context, user_event)
            elif isinstance(user_event, ExpectationResult):
                yield DagsterEvent.step_expectation_result(step_context, user_event)
            else:
                check.failed(
                    "Unexpected event {event}, should have been caught earlier".format(
                        event=user_event
                    )
                )

    yield DagsterEvent.step_success_event(
        step_context, StepSuccessData(duration_ms=timer_result.millis)
    )
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    """Run the plan's steps in topological order, yielding their events.

    A step is skipped (a step-skipped event is yielded and nothing is run)
    when one of its upstream steps failed or was skipped, or when some of its
    inputs are not covered by the intermediates manager. When
    ``step_keys_to_execute`` is a non-empty list, steps whose key is not in it
    are passed over silently.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    if step_keys_to_execute is None:
        selected_keys = None
    else:
        selected_keys = set(step_keys_to_execute)

    check.param_invariant(
        isinstance(pipeline_context.executor_config, ExecutorConfig),
        'pipeline_context',
        'Expected executor_config to be ExecutorConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    # Keys of steps that failed or were skipped; anything downstream is skipped too.
    blocked_step_keys = set()

    levels = execution_plan.topological_step_levels()

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811

    for level in levels:
        for step in level:
            if selected_keys and step.key not in selected_keys:
                continue

            step_context = pipeline_context.for_step(step)

            failed_inputs = []
            for step_input in step.step_inputs:
                if (
                    step_input.is_from_output
                    and step_input.prev_output_handle.step_key in blocked_step_keys
                ):
                    failed_inputs.append(step_input.prev_output_handle.step_key)

            if failed_inputs:
                dependency_message = (
                    'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                ).format(step=step.key, failed_inputs=failed_inputs)
                step_context.log.info(dependency_message)
                blocked_step_keys.add(step.key)
                yield DagsterEvent.step_skipped_event(step_context)
                continue

            uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                step_context, step
            )
            if uncovered_inputs:
                # In partial pipeline execution, we may end up here without having validated the
                # missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                uncovered_message = (
                    'Not all inputs covered for {step}. Not executing. Output missing for '
                    'inputs: {uncovered_inputs}'
                ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                step_context.log.info(uncovered_message)
                blocked_step_keys.add(step.key)
                yield DagsterEvent.step_skipped_event(step_context)
                continue

            step_events = check.generator(dagster_event_sequence_for_step(step_context))
            for produced in step_events:
                check.inst(produced, DagsterEvent)
                if produced.is_step_failure:
                    blocked_step_keys.add(step.key)
                yield produced
def resource_initialization_event_generator(
    resource_defs: Dict[str, ResourceDefinition],
    resource_configs: Dict[str, ResourceConfig],
    log_manager: DagsterLogManager,
    execution_plan: Optional[ExecutionPlan],
    pipeline_run: Optional[PipelineRun],
    resource_keys_to_init: Optional[Set[str]],
    instance: Optional[DagsterInstance],
    resource_instances_to_override: Optional[Dict[str, "InitializedResource"]],
    emit_persistent_events: Optional[bool],
):
    """Initialize resources, then tear them down in reverse order when the generator finishes.

    Setup is delegated to ``_core_resource_initialization_event_generator``; everything it
    yields is passed through. Each manager it appends to the shared deque is torn down in
    LIFO order from the ``finally`` clause, unless this generator was closed early.

    Args:
        resource_defs: Resource definitions keyed by resource name.
        resource_configs: Resource configs keyed by resource name.
        log_manager: Base log manager; tagged with the step's logging tags when the plan
            reports a single-step handle.
        execution_plan: Optional execution plan.
        pipeline_run: Optional pipeline run.
        resource_keys_to_init: Names of the resources to initialize.
        instance: Optional Dagster instance.
        resource_instances_to_override: Optional pre-built resources, passed through to the
            core generator.
        emit_persistent_events: When falsy, no resource teardown-failure event is emitted,
            matching how the core generator gates its init events on this flag.

    Yields:
        Whatever the core generator and the teardown managers yield, plus at most one
        resource-teardown-failure event.
    """
    check.inst_param(log_manager, "log_manager", DagsterLogManager)
    resource_keys_to_init = check.opt_set_param(
        resource_keys_to_init, "resource_keys_to_init", of_type=str
    )
    check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.opt_inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.opt_inst_param(instance, "instance", DagsterInstance)
    check.opt_dict_param(resource_instances_to_override, "resource_instances_to_override")

    # Look the handle up once instead of once for the test and again for the lookup.
    single_step_handle = (
        execution_plan.step_handle_for_single_step_plans() if execution_plan else None
    )
    if single_step_handle:
        step = cast(ExecutionPlan, execution_plan).get_step(
            cast(StepHandleUnion, single_step_handle)
        )
        resource_log_manager = log_manager.with_tags(**cast(ExecutionStep, step).logging_tags)
    else:
        resource_log_manager = log_manager

    generator_closed = False
    resource_managers: Deque[EventGenerationManager] = deque()

    try:
        yield from _core_resource_initialization_event_generator(
            resource_defs=resource_defs,
            resource_configs=resource_configs,
            resource_log_manager=resource_log_manager,
            resource_managers=resource_managers,
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            resource_keys_to_init=resource_keys_to_init,
            instance=instance,
            resource_instances_to_override=resource_instances_to_override,
            emit_persistent_events=emit_persistent_events,
        )
    except GeneratorExit:
        # Shouldn't happen, but avoid a runtime exception in case this generator gets GC-ed
        # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/). Yielding from the
        # finally clause after GeneratorExit is not allowed, so teardown is skipped.
        generator_closed = True
        raise
    finally:
        if not generator_closed:
            error = None
            # Tear down in reverse order of initialization. Only the most recent teardown
            # error is kept for reporting.
            while resource_managers:
                manager = resource_managers.pop()
                try:
                    yield from manager.generate_teardown_events()
                except DagsterUserCodeExecutionError as dagster_user_error:
                    error = dagster_user_error
            # Persistent events need an execution plan, which is only guaranteed when
            # emit_persistent_events is set (the core generator enforces that invariant).
            if error and emit_persistent_events:
                yield DagsterEvent.resource_teardown_failure(
                    execution_plan,
                    resource_log_manager,
                    resource_keys_to_init,
                    serializable_error_info_from_exc_info(error.original_exc_info),
                )
def _core_resource_initialization_event_generator(
    execution_plan,
    environment_config,
    pipeline_run,
    resource_keys_to_init,
    resource_log_manager,
    resource_managers,
):
    """Set up the requested resources of the run's mode, in resource-name order.

    Yields the setup events of each resource, bracketed by resource-init start/success
    events when ``resource_keys_to_init`` is non-empty, and finally a
    ``ScopedResourcesBuilder`` wrapping the initialized resources. Every manager that
    completed setup is appended to ``resource_managers``.

    Raises:
        DagsterUserCodeExecutionError: Re-raised after a resource-init-failure event has
            been yielded.
    """
    pipeline_def = execution_plan.pipeline_def
    mode_definition = pipeline_def.get_mode_definition(pipeline_run.mode)

    built_resources = {}
    init_durations = {}

    try:
        if resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                execution_plan,
                resource_log_manager,
                resource_keys_to_init,
            )

        for resource_name, resource_def in sorted(mode_definition.resource_defs.items()):
            if resource_name not in resource_keys_to_init:
                continue

            # Tag log output with information about the resource being initialized.
            tagged_log_manager = resource_log_manager.with_tags(
                resource_name=resource_name,
                resource_fn_name=str(resource_def.resource_fn.__name__),
            )
            resource_context = InitResourceContext(
                pipeline_def=pipeline_def,
                resource_def=resource_def,
                resource_config=environment_config.resources.get(resource_name, {}).get(
                    "config"
                ),
                run_id=pipeline_run.run_id,
                log_manager=tagged_log_manager,
            )
            manager = single_resource_generation_manager(
                resource_context, resource_name, resource_def
            )
            # Setup may produce falsy placeholders; only real events are forwarded.
            for setup_event in manager.generate_setup_events():
                if setup_event:
                    yield setup_event

            initialized = check.inst(manager.get_object(), InitializedResource)
            built_resources[resource_name] = initialized.resource
            init_durations[resource_name] = initialized.duration
            resource_managers.append(manager)

        if resource_keys_to_init:
            yield DagsterEvent.resource_init_success(
                execution_plan, resource_log_manager, built_resources, init_durations
            )
        yield ScopedResourcesBuilder(built_resources)
    except DagsterUserCodeExecutionError as dagster_user_error:
        yield DagsterEvent.resource_init_failure(
            execution_plan,
            resource_log_manager,
            resource_keys_to_init,
            serializable_error_info_from_exc_info(dagster_user_error.original_exc_info),
        )
        raise dagster_user_error
def _core_resource_initialization_event_generator(
    resource_defs: Dict[str, ResourceDefinition],
    resource_configs: Dict[str, ResourceConfig],
    resource_log_manager: DagsterLogManager,
    resource_managers: Deque[EventGenerationManager],
    execution_plan: Optional[ExecutionPlan],
    pipeline_run: Optional[PipelineRun],
    resource_keys_to_init: Optional[Set[str]],
    instance: Optional[DagsterInstance],
    resource_instances_to_override: Optional[Dict[str, "InitializedResource"]],
    emit_persistent_events: Optional[bool],
):
    """Set up the requested resources in dependency order.

    Resources are visited level by level as produced by ``toposort`` over the result of
    ``_resolve_resource_dependencies``. Names present in ``resource_instances_to_override``
    are wrapped with ``ResourceDefinition.hardcoded_resource`` instead of using the entry
    from ``resource_defs``. Each manager that completed setup is appended to
    ``resource_managers``.

    Yields:
        Setup events of each resource, resource-init start/success events (only when
        ``emit_persistent_events`` is set and there are keys to initialize), and finally a
        ``ScopedResourcesBuilder`` over the initialized resources.

    Raises:
        DagsterUserCodeExecutionError: Re-raised after an optional resource-init-failure
            event.
    """
    if emit_persistent_events:
        check.invariant(
            execution_plan,
            "If emit_persistent_events is enabled, then execution_plan must be provided",
        )
    resource_instances_to_override = check.opt_dict_param(
        resource_instances_to_override, "resource_instances_to_override"
    )
    resource_keys_to_init = check.opt_set_param(resource_keys_to_init, "resource_keys_to_init")

    resource_instances: Dict[str, "InitializedResource"] = {}
    init_durations = {}

    try:
        if emit_persistent_events and resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                execution_plan,
                resource_log_manager,
                resource_keys_to_init,
            )

        dependency_graph = _resolve_resource_dependencies(resource_defs)

        for level in toposort(dependency_graph):
            for resource_name in level:
                # Prefer a caller-supplied instance over re-initializing from the definition.
                resource_def = (
                    ResourceDefinition.hardcoded_resource(
                        resource_instances_to_override[resource_name]
                    )
                    if resource_name in resource_instances_to_override
                    else resource_defs[resource_name]
                )

                if resource_name not in resource_keys_to_init:
                    continue

                # Tag log output with information about the resource being initialized.
                tagged_log_manager = resource_log_manager.with_tags(
                    resource_name=resource_name,
                    resource_fn_name=str(resource_def.resource_fn.__name__),
                )
                resource_context = InitResourceContext(
                    resource_def=resource_def,
                    resource_config=resource_configs[resource_name].config,
                    pipeline_run=pipeline_run,
                    log_manager=tagged_log_manager,
                    resource_instance_dict=resource_instances,
                    required_resource_keys=resource_def.required_resource_keys,
                    instance=instance,
                    pipeline_def_for_backwards_compat=(
                        execution_plan.pipeline_def if execution_plan else None
                    ),
                )
                manager = single_resource_generation_manager(
                    resource_context, resource_name, resource_def
                )
                # Setup may produce falsy placeholders; only real events are forwarded.
                for setup_event in manager.generate_setup_events():
                    if setup_event:
                        yield setup_event

                initialized = check.inst(manager.get_object(), InitializedResource)
                resource_instances[resource_name] = initialized.resource
                init_durations[resource_name] = initialized.duration
                resource_managers.append(manager)

        if emit_persistent_events and resource_keys_to_init:
            yield DagsterEvent.resource_init_success(
                execution_plan, resource_log_manager, resource_instances, init_durations
            )
        yield ScopedResourcesBuilder(resource_instances)
    except DagsterUserCodeExecutionError as dagster_user_error:
        # This state is only reachable while initializing a resource, so
        # resource_keys_to_init cannot be empty here.
        if emit_persistent_events:
            yield DagsterEvent.resource_init_failure(
                execution_plan,
                resource_log_manager,
                resource_keys_to_init,
                serializable_error_info_from_exc_info(dagster_user_error.original_exc_info),
            )
        raise dagster_user_error
def _dagster_event_sequence_for_step(step_context, retries):
    '''
    Yield a sequence of dagster events for the given step with the step context.

    The step is run through ``step_context.step_launcher`` when one is set, otherwise through
    ``core_dagster_event_sequence_for_step``. This function also processes errors. It handles
    a few error cases:

        (1) User code requests to be retried:
            A RetryRequested has been raised. A step failure event is yielded when retries are
            disabled or when the number of previous attempts has reached the max_retries on
            the received RetryRequested; otherwise a step retry event is yielded.

        (2) User code fails successfully:
            The user-space code has raised a Failure which may have
            explicit metadata attached.

        (3) User code fails unexpectedly:
            The user-space code has raised an Exception. It has been
            wrapped in an exception derived from DagsterUserCodeException. In that
            case the original user exc_info is stashed on the exception
            as the original_exc_info property.

        (4) User error:
            The framework raised a DagsterError that indicates a usage error
            or some other error not communicated by a user-thrown exception. For example,
            if the user yields an object out of a compute function that is not a
            proper event (not an Output, ExpectationResult, etc).

        (5) Framework failure or interrupt:
            An unexpected error occurred. This is a framework error. Either there
            has been an internal error in the framework OR we have forgotten to put a
            user code error boundary around invoked user-space code. These terminate
            the computation immediately (by re-raising).

    In cases (2), (3) and (4) a step failure event is always yielded first; the error is then
    re-raised only when ``step_context.raise_on_error`` is set (for case (3) the wrapped user
    exception is the one raised). Case (5) always re-raises after yielding the failure event.

    Args:
        step_context (SystemStepExecutionContext): Context of the step to execute.
        retries (Retries): Retry state, used for attempt counts and the disabled flag.

    Yields:
        The events of the step, followed by at most one failure or retry event.
    '''

    check.inst_param(step_context, 'step_context', SystemStepExecutionContext)
    check.inst_param(retries, 'retries', Retries)

    try:
        prior_attempt_count = retries.get_attempt_count(step_context.step.key)
        if step_context.step_launcher:
            step_events = step_context.step_launcher.launch_step(
                step_context, prior_attempt_count
            )
        else:
            step_events = core_dagster_event_sequence_for_step(step_context, prior_attempt_count)

        for step_event in check.generator(step_events):
            yield step_event

    # case (1) in top comment
    except RetryRequested as retry_request:
        retry_err_info = serializable_error_info_from_exc_info(sys.exc_info())

        if retries.disabled:
            fail_err = SerializableErrorInfo(
                message='RetryRequested but retries are disabled',
                stack=retry_err_info.stack,
                cls_name=retry_err_info.cls_name,
                cause=retry_err_info.cause,
            )
            yield DagsterEvent.step_failure_event(
                step_context=step_context,
                step_failure_data=StepFailureData(error=fail_err, user_failure_data=None),
            )
        else:  # retries.enabled or retries.deferred
            prev_attempts = retries.get_attempt_count(step_context.step.key)
            if prev_attempts >= retry_request.max_retries:
                fail_err = SerializableErrorInfo(
                    message='Exceeded max_retries of {}'.format(retry_request.max_retries),
                    stack=retry_err_info.stack,
                    cls_name=retry_err_info.cls_name,
                    cause=retry_err_info.cause,
                )
                yield DagsterEvent.step_failure_event(
                    step_context=step_context,
                    step_failure_data=StepFailureData(error=fail_err, user_failure_data=None),
                )
            else:
                yield DagsterEvent.step_retry_event(
                    step_context,
                    StepRetryData(
                        error=retry_err_info,
                        seconds_to_wait=retry_request.seconds_to_wait,
                    ),
                )

    # case (2) in top comment
    except Failure as failure:
        yield _step_failure_event_from_exc_info(
            step_context,
            sys.exc_info(),
            UserFailureData(
                label='intentional-failure',
                description=failure.description,
                metadata_entries=failure.metadata_entries,
            ),
        )
        if step_context.raise_on_error:
            raise failure

    # case (3) in top comment
    except DagsterUserCodeExecutionError as dagster_user_error:
        # Report the user's original traceback rather than the framework wrapper's.
        yield _step_failure_event_from_exc_info(
            step_context,
            dagster_user_error.original_exc_info,
        )
        if step_context.raise_on_error:
            raise dagster_user_error.user_exception

    # case (4) in top comment
    except DagsterError as dagster_error:
        yield _step_failure_event_from_exc_info(step_context, sys.exc_info())
        if step_context.raise_on_error:
            raise dagster_error

    # case (5) in top comment
    except (Exception, KeyboardInterrupt) as unexpected_exception:  # pylint: disable=broad-except
        yield _step_failure_event_from_exc_info(step_context, sys.exc_info())
        raise unexpected_exception
def execute(self, pipeline_context, execution_plan):
    """Drive the plan by running steps in child processes, up to ``self.max_concurrent`` at once.

    The parent polls one event at a time from each active child iterator, forwards it, and
    feeds it to the active execution. When an interrupt is detected the termination events
    of all active children are set and no new steps are started.

    Yields:
        Engine events from the parent and every event received from the children.

    Raises:
        DagsterExecutionInterruptedError: After an interrupt, once all children are done and
            every recorded child error is itself an interruption error.
        DagsterSubprocessError: When child processes recorded any other errors.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    max_children = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess executor: parent process (pid: {pid})".format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # A reference-tracking pass would let us garbage collect results that no remaining
    # step needs: https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retry_mode=self.retries) as active_execution:
            active_iters = {}
            errors = {}
            term_events = {}
            stopping = False

            while (not stopping and not active_execution.is_complete) or active_iters:
                if active_execution.check_for_interrupts():
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess executor: received termination signal - "
                        "forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events.keys())),
                    )
                    stopping = True
                    active_execution.mark_interrupted()
                    for term_event in term_events.values():
                        term_event.set()

                # Start new child iterators while there is spare capacity.
                while len(active_iters) < max_children and not stopping:
                    free_slots = max_children - len(active_iters)
                    steps = active_execution.get_steps_to_execute(limit=free_slots)

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = multiprocessing.Event()
                        active_iters[step.key] = self.execute_step_out_of_process(
                            step_context,
                            step,
                            errors,
                            term_events,
                            active_execution.get_known_state(),
                        )

                # Poll each active iterator once.
                finished_keys = []
                for key, step_iter in active_iters.items():
                    try:
                        polled = next(step_iter)
                        # None means the child had nothing to report on this poll.
                        if polled is not None:
                            yield polled
                            active_execution.handle_event(polled)

                    except ChildProcessCrashException as crash:
                        serializable_error = serializable_error_info_from_exc_info(
                            sys.exc_info()
                        )
                        crash_message = (
                            "Multiprocess executor: child process for step {step_key} "
                            "unexpectedly exited with code {exit_code}"
                        ).format(step_key=key, exit_code=crash.exit_code)
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            crash_message,
                            EngineEventData.engine_error(serializable_error),
                            step_handle=active_execution.get_step_by_key(key).handle,
                        )
                        # Record the crash as a failure of the step itself.
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(
                                active_execution.get_step_by_key(key)
                            ),
                            step_failure_data=StepFailureData(
                                error=serializable_error, user_failure_data=None
                            ),
                        )
                        active_execution.handle_event(step_failure_event)
                        yield step_failure_event
                        finished_keys.append(key)
                    except StopIteration:
                        finished_keys.append(key)

                # Drop finished iterators and mark their steps complete.
                for key in finished_keys:
                    del active_iters[key]
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # Process skipped and abandoned steps.
                yield from active_execution.plan_events_iterator(pipeline_context)

            child_errors = {pid: err for pid, err in errors.items() if err}

            # After termination starts, raise an interrupted exception once all subprocesses
            # have finished cleaning up (and the only errors were from being interrupted).
            only_interrupt_errors = (
                stopping
                and (not active_iters)
                and all(
                    err_info.cls_name == "DagsterExecutionInterruptedError"
                    for err_info in child_errors.values()
                )
            )
            if only_interrupt_errors:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Multiprocess executor: interrupted all active child processes",
                    event_specific_data=EngineEventData(),
                )
                raise DagsterExecutionInterruptedError()
            elif child_errors:
                error_lines = [
                    "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                    for pid, err in child_errors.items()
                ]
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list="\n".join(error_lines)
                    ),
                    subprocess_error_infos=list(child_errors.values()),
                )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def scoped_pipeline_context(
    pipeline_def,
    environment_dict,
    run_config,
    instance,
    system_storage_data=None,
    scoped_resources_builder_cm=create_resource_builder,
    raise_on_error=False,
):
    '''Build the pipeline execution context and yield it while its resources are in scope.

    Args:
        pipeline_def (PipelineDefinition): Pipeline being executed.
        environment_dict (Dict[str, Any]): Environment config, keyed by string.
        run_config (RunConfig): Run configuration.
        instance (DagsterInstance): Dagster instance.
        system_storage_data (Optional[SystemStorageData]): Passed through to
            ``create_system_storage_data``.
        scoped_resources_builder_cm: Context-manager factory that provides the scoped
            resources builder.
        raise_on_error (bool): Whether a DagsterError raised before the context was yielded
            is re-raised after the init failure event.

    Yields:
        The pipeline context on success, or a pipeline-init-failure event when a
        DagsterError occurs before the context has been yielded.
    '''
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    check.dict_param(environment_dict, 'environment_dict', key_type=str)
    check.inst_param(run_config, 'run_config', RunConfig)
    check.inst_param(instance, 'instance', DagsterInstance)
    check.opt_inst_param(system_storage_data, 'system_storage_data', SystemStorageData)

    context_creation_data = create_context_creation_data(
        pipeline_def, environment_dict, run_config, instance
    )
    executor_config = create_executor_config(context_creation_data)

    # From here on, a DagsterError results in a pipeline init failure event.
    pipeline_context = None

    try:
        executor_config.check_requirements(instance, context_creation_data.system_storage_def)
        log_manager = create_log_manager(context_creation_data)

        resources_cm = scoped_resources_builder_cm(
            context_creation_data.pipeline_def,
            context_creation_data.environment_config,
            context_creation_data.run_config,
            log_manager,
        )
        with resources_cm as scoped_resources_builder:
            system_storage_data = create_system_storage_data(
                context_creation_data, system_storage_data, scoped_resources_builder
            )
            pipeline_context = construct_pipeline_execution_context(
                context_creation_data=context_creation_data,
                scoped_resources_builder=scoped_resources_builder,
                system_storage_data=system_storage_data,
                log_manager=log_manager,
                executor_config=executor_config,
                raise_on_error=raise_on_error,
            )
            yield pipeline_context

    except DagsterError as dagster_error:
        # An error after the context was yielded leaves us in a problematic state:
        # just raise, without an init failure event.
        if pipeline_context is not None:
            raise dagster_error

        # pylint does not know original_exc_info exists if is_user_code_error is true
        # pylint: disable=no-member
        if dagster_error.is_user_code_error:
            user_facing_exc_info = dagster_error.original_exc_info
        else:
            user_facing_exc_info = sys.exc_info()

        error_info = serializable_error_info_from_exc_info(user_facing_exc_info)
        yield DagsterEvent.pipeline_init_failure(
            pipeline_name=pipeline_def.name,
            failure_data=PipelineInitFailureData(error=error_info),
            log_manager=_create_context_free_log_manager(instance, run_config, pipeline_def),
        )

        if raise_on_error:
            raise dagster_error
def orchestration_context_event_generator(
    pipeline: IPipeline,
    execution_plan: ExecutionPlan,
    run_config: Dict[str, Any],
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    raise_on_error: bool,
    executor_defs: Optional[List[ExecutorDefinition]],
    output_capture: Optional[Dict["StepOutputHandle", Any]],
) -> Generator[Union[DagsterEvent, PlanOrchestrationContext], None, None]:
    """Create the plan orchestration context and yield it.

    ``executor_defs`` must be ``None``. If a DagsterError is raised while creating the
    executor or context, validating the plan, or while the yielded context is in use, a
    pipeline failure event is logged and yielded instead; the error is re-raised only when
    ``raise_on_error`` is set.
    """
    check.invariant(executor_defs is None)

    context_creation_data = create_context_creation_data(
        pipeline,
        execution_plan,
        run_config,
        pipeline_run,
        instance,
    )
    log_manager = create_log_manager(context_creation_data)

    try:
        executor = create_executor(context_creation_data)
        plan_data = create_plan_data(context_creation_data, raise_on_error, executor.retries)
        execution_context = PlanOrchestrationContext(
            plan_data=plan_data,
            log_manager=log_manager,
            executor=executor,
            output_capture=output_capture,
        )
        _validate_plan_with_context(execution_context, execution_plan)

        yield execution_context
    except DagsterError as dagster_error:
        dagster_error = cast(DagsterUserCodeExecutionError, dagster_error)

        # pylint does not know original_exc_info exists if is_user_code_error is true
        # pylint: disable=no-member
        if dagster_error.is_user_code_error:
            user_facing_exc_info = dagster_error.original_exc_info
        else:
            user_facing_exc_info = sys.exc_info()
        error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

        failure_message = (
            f'Pipeline failure during initialization for pipeline "{pipeline_run.pipeline_name}". '
            "This may be due to a failure in initializing the executor or one of the loggers."
        )
        event = DagsterEvent.pipeline_failure(
            pipeline_context_or_name=pipeline_run.pipeline_name,
            context_msg=failure_message,
            error_info=error_info,
        )
        log_manager.error(
            event.message,
            dagster_event=event,
            pipeline_name=pipeline_run.pipeline_name,
        )
        yield event

        if raise_on_error:
            raise dagster_error
def pipeline_execution_iterator(
    pipeline_context: PlanOrchestrationContext, execution_plan: ExecutionPlan
) -> Iterator[DagsterEvent]:
    """Run a pipeline to completion through its executor.

    Yields a pipeline start event, every event produced by the executor, and one terminal
    event: success, failure, or canceled. The terminal event is always constructed, but it
    is not yielded when this generator was closed early.

    Args:
        pipeline_context (PlanOrchestrationContext): Context providing the executor, the
            instance, the run id and the ``raise_on_error`` flag.
        execution_plan (ExecutionPlan): The plan handed to the executor.
    """
    yield DagsterEvent.pipeline_start(pipeline_context)

    exception_info = None
    canceled_info = None
    failed_step_keys = []
    generator_closed = False

    try:
        executor_events = pipeline_context.executor.execute(pipeline_context, execution_plan)
        for event in executor_events:
            if event.is_step_failure:
                failed_step_keys.append(event.step_key)
            yield event
    except GeneratorExit:
        # Shouldn't happen, but avoid a runtime exception in case this generator gets GC-ed
        # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).
        generator_closed = True
        exception_info = serializable_error_info_from_exc_info(sys.exc_info())
        if pipeline_context.raise_on_error:
            raise
    except (KeyboardInterrupt, DagsterExecutionInterruptedError):
        canceled_info = serializable_error_info_from_exc_info(sys.exc_info())
        if pipeline_context.raise_on_error:
            raise
    except Exception:  # pylint: disable=broad-except
        exception_info = serializable_error_info_from_exc_info(sys.exc_info())
        if pipeline_context.raise_on_error:
            raise  # the finally block runs before this propagates
    finally:
        if canceled_info:
            # Tell a user-requested cancellation apart from an unexpected interrupt by
            # re-reading the run's current status.
            reloaded_run = pipeline_context.instance.get_run_by_id(pipeline_context.run_id)
            if reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELING:
                event = DagsterEvent.pipeline_canceled(pipeline_context, canceled_info)
            else:
                event = DagsterEvent.pipeline_failure(
                    pipeline_context,
                    "Execution was interrupted unexpectedly. "
                    "No user initiated termination request was found, treating as failure.",
                    canceled_info,
                )
        elif exception_info:
            event = DagsterEvent.pipeline_failure(
                pipeline_context,
                "An exception was thrown during execution.",
                exception_info,
            )
        elif failed_step_keys:
            event = DagsterEvent.pipeline_failure(
                pipeline_context,
                "Steps failed: {}.".format(failed_step_keys),
            )
        else:
            event = DagsterEvent.pipeline_success(pipeline_context)

        # A closed generator must not yield again.
        if not generator_closed:
            yield event
def check_step_health(self, step_handler_context: StepHandlerContext) -> List[DagsterEvent]:
    """Inspect the container running the first step key and report a failure if it is unhealthy.

    Returns an empty list when the container status is "running" or its return code is 0.
    Otherwise returns a single STEP_FAILURE event whose message describes the problem:
    the container lookup failed, the return code could not be fetched, or the return code
    was something other than 0.
    """
    step_args = step_handler_context.execute_step_args
    step_key = step_args.step_keys_to_execute[0]

    def _step_failure(message: str) -> List[DagsterEvent]:
        # Every unhealthy outcome is reported with the same event shape; only the message differs.
        return [
            DagsterEvent(
                event_type_value=DagsterEventType.STEP_FAILURE.value,
                pipeline_name=step_args.pipeline_origin.pipeline_name,
                step_key=step_key,
                message=message,
                event_specific_data=StepFailureData(
                    error=None,
                    user_failure_data=None,
                ),
            )
        ]

    client = self._get_client()
    container_name = self._get_container_name(
        step_args.pipeline_run_id,
        step_key,
    )

    try:
        container = client.containers.get(container_name)
    except Exception as e:
        return _step_failure(f"Error when checking on step container health: {e}")

    if container.status == "running":
        return []

    try:
        container_info = container.wait(timeout=0.1)
    except Exception as e:
        return _step_failure(
            f"Container status is {container.status}. Hit exception attempting to get its return code: {e}"
        )

    ret_code = container_info.get("StatusCode")
    if ret_code == 0:
        return []

    return _step_failure(
        f"Container status is {container.status}. Return code is {str(ret_code)}."
    )