def terminate(self, run_id):
    check.str_param(run_id, "run_id")
    run = self._instance.get_run_by_id(run_id)
    if not run:
        return False

    can_terminate = self.can_terminate(run_id)
    if not can_terminate:
        self._instance.report_engine_event(
            message="Unable to terminate run; can_terminate returned {}".format(can_terminate),
            pipeline_run=run,
            cls=self.__class__,
        )
        return False

    self._instance.report_run_canceling(run)

    job_name = get_job_name_from_run_id(
        run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)
    )

    try:
        termination_result = delete_job(job_name=job_name, namespace=self.job_namespace)
        if termination_result:
            self._instance.report_engine_event(
                message="Run was terminated successfully.",
                pipeline_run=run,
                cls=self.__class__,
            )
        else:
            self._instance.report_engine_event(
                message="Run was not terminated successfully; delete_job returned {}".format(
                    termination_result
                ),
                pipeline_run=run,
                cls=self.__class__,
            )
        return termination_result
    except Exception:
        self._instance.report_engine_event(
            message="Run was not terminated successfully; encountered error in delete_job",
            pipeline_run=run,
            engine_event_data=EngineEventData.engine_error(
                serializable_error_info_from_exc_info(sys.exc_info())
            ),
            cls=self.__class__,
        )
def _execute_schedule(graphene_info, external_pipeline, execution_params, errors):
    check.inst_param(external_pipeline, 'external_pipeline', ExternalPipeline)

    instance = graphene_info.context.instance
    mode, environment_dict = execution_params.mode, execution_params.environment_dict

    validation_result = validate_config_from_snap(
        external_pipeline.config_schema_snapshot,
        external_pipeline.root_config_key_for_mode(mode),
        environment_dict,
    )
    if validation_result.success:
        external_execution_plan = graphene_info.context.get_external_execution_plan(
            external_pipeline, environment_dict, mode, execution_params.step_keys
        )

    pipeline_run = instance.create_run(
        pipeline_name=external_pipeline.name,
        environment_dict=environment_dict,
        mode=mode,
        solid_subset=(
            execution_params.selector.solid_subset
            if execution_params.selector is not None
            else None
        ),
        tags=execution_params.execution_metadata.tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
    )

    # Inject errors into event log at this point
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message, pipeline_run, EngineEventData.engine_error(error)
            )

    # Launch run if run launcher is defined
    run_launcher = graphene_info.context.instance.run_launcher
    if run_launcher:
        result = _launch_pipeline_execution_for_created_run(graphene_info, pipeline_run.run_id)
    else:
        result = _start_pipeline_execution_for_created_run(graphene_info, pipeline_run.run_id)

    return pipeline_run, result
def _execute_schedule(graphene_info, pipeline_def, execution_params, errors):
    instance = graphene_info.context.instance

    execution_plan = None
    if is_config_valid(pipeline_def, execution_params.environment_dict, execution_params.mode):
        execution_plan = create_execution_plan(
            pipeline_def,
            execution_params.environment_dict,
            mode=execution_params.mode,
        )

    execution_plan_snapshot = None
    if execution_plan:
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id()
        )

    pipeline_run = instance.create_run(
        pipeline_name=pipeline_def.name,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
        solid_subset=(
            execution_params.selector.solid_subset
            if execution_params.selector is not None
            else None
        ),
        tags=execution_params.execution_metadata.tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
    )

    # Inject errors into event log at this point
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message, pipeline_run, EngineEventData.engine_error(error)
            )

    # Launch run if run launcher is defined
    run_launcher = graphene_info.context.instance.run_launcher
    if run_launcher:
        result = _launch_pipeline_execution_for_created_run(graphene_info, pipeline_run.run_id)
    else:
        result = _start_pipeline_execution_for_created_run(graphene_info, pipeline_run.run_id)

    return pipeline_run, result
def _execute_run_command_body(recon_pipeline, pipeline_run_id, instance, write_stream_fn):
    # we need to send back the fact that we have loaded the args so the calling
    # process knows it is safe to clean up the temp input file
    write_stream_fn(ExecuteRunArgsLoadComplete())

    pipeline_run = instance.get_run_by_id(pipeline_run_id)

    pid = os.getpid()
    instance.report_engine_event(
        "Started process for pipeline (pid: {pid}).".format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end="cli_api_subprocess_init"),
    )

    # Perform setup so that termination of the execution will unwind and report to the
    # instance correctly
    setup_windows_interrupt_support()

    try:
        for event in execute_run_iterator(recon_pipeline, pipeline_run, instance):
            write_stream_fn(event)
    except (KeyboardInterrupt, DagsterExecutionInterruptedError):
        instance.report_engine_event(
            message="Pipeline execution terminated by interrupt",
            pipeline_run=pipeline_run,
        )
        _report_run_failed_if_not_finished(instance, pipeline_run_id)
    except Exception:  # pylint: disable=broad-except
        instance.report_engine_event(
            "An exception was thrown during execution that is likely a framework error, "
            "rather than an error in user code.",
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
        )
        _report_run_failed_if_not_finished(instance, pipeline_run_id)
    finally:
        instance.report_engine_event(
            "Process for pipeline exited (pid: {pid}).".format(pid=pid),
            pipeline_run,
        )
def core_execute_run(recon_pipeline, pipeline_run, instance):
    check.inst_param(recon_pipeline, "recon_pipeline", ReconstructablePipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    try:
        yield from execute_run_iterator(recon_pipeline, pipeline_run, instance)
    except (KeyboardInterrupt, DagsterExecutionInterruptedError):
        yield from _report_run_failed_if_not_finished(instance, pipeline_run.run_id)
        yield instance.report_engine_event(
            message="Pipeline execution terminated by interrupt",
            pipeline_run=pipeline_run,
        )
    except Exception:  # pylint: disable=broad-except
        yield instance.report_engine_event(
            "An exception was thrown during execution that is likely a framework error, "
            "rather than an error in user code.",
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
        )
        yield from _report_run_failed_if_not_finished(instance, pipeline_run.run_id)
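# core_execute_run above re-yields events from an inner iterator and converts unexpected
# exceptions into a terminal engine-error event rather than letting them escape. A minimal,
# framework-free sketch of that pattern; EngineError and iterate_with_engine_errors are
# illustrative names only, not Dagster APIs.
import traceback


class EngineError:
    def __init__(self, message, error_info):
        self.message = message
        self.error_info = error_info


def iterate_with_engine_errors(inner_iterator):
    """Yield events from inner_iterator; on failure, yield one terminal error event."""
    try:
        yield from inner_iterator
    except Exception:  # intentionally broad: this is the framework boundary
        yield EngineError(
            "An exception was thrown during execution that is likely a framework error.",
            traceback.format_exc(),
        )


def _boom():
    yield "step_started"
    raise RuntimeError("simulated framework failure")


# Usage: consumers always see a well-formed event stream, even when execution blows up.
for event in iterate_with_engine_errors(_boom()):
    print(event)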
def start_run(self, execute_run_args):
    check.inst_param(execute_run_args, "execute_run_args", ExecuteExternalPipelineArgs)

    with DagsterInstance.from_ref(execute_run_args.instance_ref) as instance:
        try:
            res = self._query(
                "StartRun",
                api_pb2.StartRunRequest,
                serialized_execute_run_args=serialize_dagster_namedtuple(execute_run_args),
            )
            return deserialize_json_to_dagster_namedtuple(res.serialized_start_run_result)

        except Exception:  # pylint: disable=broad-except
            pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)
            instance.report_engine_event(
                message="Unexpected error in IPC client",
                pipeline_run=pipeline_run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            raise
def _create_sensor_run(
    context, instance, repo_location, external_sensor, external_pipeline, run_request
):
    execution_plan_errors = []
    execution_plan_snapshot = None

    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_request.run_config,
            external_sensor.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception:  # pylint: disable=broad-except
        execution_plan_errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(
        merge_dicts(pipeline_tags, run_request.tags),
        PipelineRun.tags_for_sensor(external_sensor),
    )
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    run = instance.create_run(
        pipeline_name=external_sensor.pipeline_name,
        run_id=None,
        run_config=run_request.run_config,
        mode=external_sensor.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_sensor.solid_selection,
        status=(
            PipelineRunStatus.FAILURE
            if len(execution_plan_errors) > 0
            else PipelineRunStatus.NOT_STARTED
        ),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(run)
        context.logger.error(
            "Failed to fetch execution plan for {sensor_name}: {error_string}".format(
                sensor_name=external_sensor.name,
                error_string="\n".join([error.to_string() for error in execution_plan_errors]),
            ),
        )
    return run
def _launch_run(
    instance, repo_location, external_schedule, external_pipeline, tick_context, run_request
):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_snapshot = None
    errors = []
    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        errors.extend(e.subprocess_error_infos)
    except Exception:  # pylint: disable=broad-except
        errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    tick_context.add_run(run_id=possibly_invalid_pipeline_run.run_id, run_key=run_request.run_key)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        tick_context.stream.send(
            ScheduledExecutionFailed(run_id=possibly_invalid_pipeline_run.run_id, errors=errors)
        )
        return

    try:
        launched_run = instance.submit_run(possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        tick_context.stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id,
                errors=[serializable_error_info_from_exc_info(sys.exc_info())],
            )
        )
        return

    tick_context.stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
def _launch_scheduled_execution(
    instance, repo_location, external_repo, external_schedule, tick, stream
):
    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=external_schedule.pipeline_name,
        solid_selection=external_schedule.solid_selection,
    )

    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )

    schedule_execution_data = repo_location.get_external_schedule_execution_data(
        instance=instance,
        repository_handle=external_repo.handle,
        schedule_name=external_schedule.name,
        schedule_execution_data_mode=ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION,
        scheduled_execution_time=None,  # No way to know this in general for this scheduler
    )

    run_config = {}
    schedule_tags = {}
    execution_plan_snapshot = None
    errors = []

    if isinstance(schedule_execution_data, ExternalScheduleExecutionErrorData):
        error = schedule_execution_data.error
        tick.update_with_status(ScheduleTickStatus.FAILURE, error=error)
        stream.send(ScheduledExecutionFailed(run_id=None, errors=[error]))
        return
    elif not schedule_execution_data.should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return
    else:
        run_config = schedule_execution_data.run_config
        schedule_tags = schedule_execution_data.tags
        try:
            external_execution_plan = repo_location.get_external_execution_plan(
                external_pipeline,
                run_config,
                external_schedule.mode,
                step_keys_to_execute=None,
            )
            execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
        except DagsterSubprocessError as e:
            errors.extend(e.subprocess_error_infos)
        except Exception:  # pylint: disable=broad-except
            errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS, run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(run_id=possibly_invalid_pipeline_run.run_id, errors=errors)
        )
        return

    try:
        launched_run = instance.launch_run(possibly_invalid_pipeline_run.run_id, external_pipeline)
    except Exception:  # pylint: disable=broad-except
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id,
                errors=[serializable_error_info_from_exc_info(sys.exc_info())],
            )
        )
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
def _start_pipeline_execution_for_created_run(graphene_info, run_id):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)

    instance = graphene_info.context.instance
    execution_manager_settings = instance.dagit_settings.get('execution_manager')
    if execution_manager_settings and execution_manager_settings.get('disabled'):
        return graphene_info.schema.type_named('StartPipelineRunDisabledError')()

    pipeline_run = instance.get_run_by_id(run_id)
    if not pipeline_run:
        return graphene_info.schema.type_named('PipelineRunNotFoundError')(run_id)

    pipeline_def = get_pipeline_def_from_selector(graphene_info, pipeline_run.selector)

    environment_schema = create_environment_schema(pipeline_def, pipeline_run.mode)
    validated_config = validate_config(
        environment_schema.environment_type, pipeline_run.environment_dict
    )
    if not validated_config.success:
        # If the config is invalid, we construct a DagsterInvalidConfigError exception and
        # insert it into the event log. We also return a PipelineConfigValidationInvalid user facing
        # graphql error.

        # We currently re-use the engine events machinery to add the error to the event log, but
        # may need to create a new event type and instance method to handle these errors.
        invalid_config_exception = DagsterInvalidConfigError(
            'Error in config for pipeline {}'.format(pipeline_def.name),
            validated_config.errors,
            pipeline_run.environment_dict,
        )

        instance.report_engine_event(
            str(invalid_config_exception.message),
            pipeline_run,
            EngineEventData.engine_error(
                SerializableErrorInfo(
                    invalid_config_exception.message,
                    [],
                    DagsterInvalidConfigError.__class__.__name__,
                    None,
                )
            ),
        )
        instance.report_run_failed(pipeline_run)

        return DauphinPipelineConfigValidationInvalid.for_validation_errors(
            pipeline_def.get_pipeline_index(), validated_config.errors
        )

    graphene_info.context.execution_manager.execute_pipeline(
        graphene_info.context.get_handle(),
        pipeline_def,
        pipeline_run,
        instance=instance,
    )
    return graphene_info.schema.type_named('StartPipelineRunSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(pipeline_run)
    )
def get_run_execution_info_for_created_run_or_error(
    graphene_info, repository_location_name, repository_name, run_id
):
    """
    A previously created run could have been created in a different process *or* during the
    launchScheduledRun call, where we want to keep a record of a run that was created but had
    invalid configuration.
    """
    check.inst_param(graphene_info, "graphene_info", ResolveInfo)
    check.str_param(repository_location_name, "repository_location_name")
    check.str_param(repository_name, "repository_name")
    check.str_param(run_id, "run_id")

    instance = graphene_info.context.instance

    pipeline_run = instance.get_run_by_id(run_id)
    if not pipeline_run:
        return graphene_info.schema.type_named("PipelineRunNotFoundError")(run_id)

    external_pipeline = get_external_pipeline_or_raise(
        graphene_info,
        _get_selector_with_workaround(
            graphene_info.context, repository_location_name, repository_name, pipeline_run
        ),
    )

    validated_config = validate_config_from_snap(
        external_pipeline.config_schema_snapshot,
        external_pipeline.root_config_key_for_mode(pipeline_run.mode),
        pipeline_run.run_config,
    )
    if not validated_config.success:
        # If the config is invalid, we construct a DagsterInvalidConfigError exception and
        # insert it into the event log. We also return a PipelineConfigValidationInvalid user facing
        # graphql error.

        # We currently re-use the engine events machinery to add the error to the event log, but
        # may need to create a new event type and instance method to handle these errors.
        invalid_config_exception = DagsterInvalidConfigError(
            "Error in config for pipeline {}".format(external_pipeline.name),
            validated_config.errors,
            pipeline_run.run_config,
        )

        instance.report_engine_event(
            str(invalid_config_exception.message),
            pipeline_run,
            EngineEventData.engine_error(
                SerializableErrorInfo(
                    invalid_config_exception.message,
                    [],
                    DagsterInvalidConfigError.__class__.__name__,
                    None,
                )
            ),
        )
        instance.report_run_failed(pipeline_run)

        return DauphinPipelineConfigValidationInvalid.for_validation_errors(
            external_pipeline, validated_config.errors
        )

    return RunExecutionInfo(external_pipeline, pipeline_run)
def _launch_pipeline_execution_for_created_run(graphene_info, run_id):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(run_id, 'run_id')

    # First retrieve the pipeline run
    instance = graphene_info.context.instance
    pipeline_run = instance.get_run_by_id(run_id)
    if not pipeline_run:
        return graphene_info.schema.type_named('PipelineRunNotFoundError')(run_id)

    external_pipeline = get_external_pipeline_or_raise(
        graphene_info, pipeline_run.selector.name, pipeline_run.selector.solid_subset
    )

    # Run config validation
    # If there are any config errors, then inject them into the event log
    validated_config = ensure_valid_config(
        external_pipeline, pipeline_run.mode, pipeline_run.environment_dict
    )
    if not validated_config.success:
        # If the config is invalid, we construct a DagsterInvalidConfigError exception and
        # insert it into the event log. We also return a PipelineConfigValidationInvalid user facing
        # graphql error.

        # We currently re-use the engine events machinery to add the error to the event log, but
        # may need to create a new event type and instance method to handle these errors.
        invalid_config_exception = DagsterInvalidConfigError(
            'Error in config for pipeline {}'.format(external_pipeline.name),
            validated_config.errors,
            pipeline_run.environment_dict,
        )

        instance.report_engine_event(
            str(invalid_config_exception.message),
            pipeline_run,
            EngineEventData.engine_error(
                SerializableErrorInfo(
                    invalid_config_exception.message,
                    [],
                    DagsterInvalidConfigError.__class__.__name__,
                    None,
                )
            ),
        )
        instance.report_run_failed(pipeline_run)

        return DauphinPipelineConfigValidationInvalid.for_validation_errors(
            external_pipeline, validated_config.errors
        )

    try:
        pipeline_run = instance.launch_run(pipeline_run.run_id)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(pipeline_run)

    return graphene_info.schema.type_named('LaunchPipelineRunSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(pipeline_run)
    )
def _create_scheduler_run(
    instance,
    logger,
    schedule_time_utc,
    repo_location,
    external_repo,
    external_schedule,
    external_pipeline,
    tick_holder,
):
    schedule_execution_data = repo_location.get_external_schedule_execution_data(
        instance=instance,
        repository_handle=external_repo.handle,
        schedule_name=external_schedule.name,
        schedule_execution_data_mode=ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION,
        scheduled_execution_datetime_utc=schedule_time_utc,
    )

    if isinstance(schedule_execution_data, ExternalScheduleExecutionErrorData):
        error = schedule_execution_data.error
        logger.error(
            "Failed to fetch schedule data for {schedule_name}: {error}".format(
                schedule_name=external_schedule.name, error=error.to_string()
            ),
        )
        tick_holder.update_with_status(ScheduleTickStatus.FAILURE, error=error)
        return None
    elif not schedule_execution_data.should_execute:
        logger.info(
            "should_execute returned False for {schedule_name}, skipping".format(
                schedule_name=external_schedule.name
            )
        )
        # Update tick to skipped state and return
        tick_holder.update_with_status(ScheduleTickStatus.SKIPPED)
        return None

    run_config = schedule_execution_data.run_config
    schedule_tags = schedule_execution_data.tags

    execution_plan_errors = []
    execution_plan_snapshot = None
    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception:  # pylint: disable=broad-except
        execution_plan_errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    tags[SCHEDULED_EXECUTION_TIME_TAG] = schedule_time_utc.isoformat()

    # If the run was scheduled correctly but there was an error creating its
    # run config, enter it into the run DB with a FAILURE status
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=(
            PipelineRunStatus.FAILURE
            if len(execution_plan_errors) > 0
            else PipelineRunStatus.NOT_STARTED
        ),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        logger.error(
            "Failed to fetch execution plan for {schedule_name}: {error_string}".format(
                schedule_name=external_schedule.name,
                error_string="\n".join([error.to_string() for error in execution_plan_errors]),
            ),
        )
    return possibly_invalid_pipeline_run
def in_mp_process(cls, handle, pipeline_run, instance_ref, term_event):
    """
    Execute pipeline using message queue as a transport
    """
    run_id = pipeline_run.run_id
    pipeline_name = pipeline_run.pipeline_name

    instance = DagsterInstance.from_ref(instance_ref)
    pid = os.getpid()
    instance.report_engine_event(
        'Started process for pipeline (pid: {pid}).'.format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end='dagit_subprocess_init'),
        cls,
    )

    start_termination_thread(term_event)

    try:
        handle.build_repository_definition()
        pipeline_def = handle.with_pipeline_name(pipeline_name).build_pipeline_definition()
    except Exception:  # pylint: disable=broad-except
        instance.report_engine_event(
            'Failed attempting to load pipeline "{}"'.format(pipeline_name),
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
            cls,
        )
        return

    try:
        event_list = []
        for event in execute_run_iterator(
            pipeline_def.build_sub_pipeline(pipeline_run.selector.solid_subset),
            pipeline_run,
            instance,
        ):
            event_list.append(event)
        return PipelineExecutionResult(pipeline_def, run_id, event_list, lambda: None)

    # Add a DagsterEvent for unexpected exceptions
    # Explicitly ignore KeyboardInterrupts since they are used for termination
    except DagsterSubprocessError as err:
        if not all(
            [err_info.cls_name == 'KeyboardInterrupt' for err_info in err.subprocess_error_infos]
        ):
            instance.report_engine_event(
                'An exception was thrown during execution that is likely a framework error, '
                'rather than an error in user code.',
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
                cls,
            )
    except Exception:  # pylint: disable=broad-except
        instance.report_engine_event(
            'An exception was thrown during execution that is likely a framework error, '
            'rather than an error in user code.',
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
            cls,
        )
    finally:
        instance.report_engine_event(
            'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
            pipeline_run,
            cls=cls,
        )
def execute(self, context):
    try:
        from dagster_graphql.client.mutations import (
            DagsterGraphQLClientError,
            handle_execution_errors,
            handle_execute_plan_result_raw,
        )
    except ImportError:
        raise AirflowException(
            'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
            'in your Airflow environment.'
        )

    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    try:
        if self.instance:
            run = self.instance.register_managed_run(
                pipeline_name=self.pipeline_name,
                run_id=self.run_id,
                environment_dict=self.environment_dict,
                mode=self.mode,
                solids_to_execute=None,
                step_keys_to_execute=None,
                tags=None,
                root_run_id=None,
                parent_run_id=None,
                pipeline_snapshot=self.pipeline_snapshot,
                execution_plan_snapshot=self.execution_plan_snapshot,
                parent_pipeline_snapshot=self.parent_pipeline_snapshot,
            )

        raw_res = super(DagsterDockerOperator, self).execute(context)
        self.log.info('Finished executing container.')

        res = parse_raw_log_lines(raw_res)

        try:
            handle_execution_errors(res, 'executePlan')
        except DagsterGraphQLClientError as err:
            if self.instance:
                self.instance.report_engine_event(
                    str(err),
                    run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                    self.__class__,
                )
            raise

        events = handle_execute_plan_result_raw(res)

        if self.instance:
            for event in events:
                self.instance.handle_new_event(event)

        events = [e.dagster_event for e in events]
        check_events_for_failures(events)
        check_events_for_skips(events)

        return events

    finally:
        self._run_id = None
def execute(self, context):
    try:
        from dagster_graphql.client.mutations import (
            DagsterGraphQLClientError,
            handle_execution_errors,
            handle_execute_plan_result_raw,
        )
    except ImportError:
        raise AirflowException(
            'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
            ' installed in your Airflow environment.'
        )

    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    # return to original execute code:
    try:
        client = kube_client.get_kube_client(
            in_cluster=self.in_cluster,
            cluster_context=self.cluster_context,
            config_file=self.config_file,
        )
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.query(context.get('ts')),
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context

        launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.xcom_push)
        try:
            if self.instance:
                tags = {AIRFLOW_EXECUTION_DATE_STR: context.get('ts')} if 'ts' in context else {}

                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    run_config=self.run_config,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=tags,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            # we won't use the "result", which is the pod's xcom json file
            (final_state, _) = launcher.run_pod(
                pod, startup_timeout=self.startup_timeout_seconds, get_logs=self.get_logs
            )

            # fetch the last line independently of whether logs were read
            # unbelievably, if you set tail_lines=1, the returned json has its double quotes
            # turned into unparseable single quotes
            res = None
            num_attempts = 0
            while not res and num_attempts < LOG_RETRIEVAL_MAX_ATTEMPTS:
                raw_res = client.read_namespaced_pod_log(
                    name=pod.name, namespace=pod.namespace, container='base'
                )
                res = parse_raw_log_lines(raw_res.split('\n'))
                time.sleep(LOG_RETRIEVAL_WAITS_BETWEEN_ATTEMPTS_SEC)
                num_attempts += 1

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError as err:
                self.instance.report_engine_event(
                    str(err),
                    run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                    self.__class__,
                )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)

            return events

        finally:
            self._run_id = None
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException('Pod returned a failure: {state}'.format(state=final_state))
        # note the lack of returning the default xcom
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    priority_for_step = lambda step: (
        -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: (
        priority_for_step(execution_plan.get_step_by_key(step_key))
    )
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )

                # Get the Celery priority for this step
                priority = _get_step_priority(pipeline_context, step)

                # Submit the Celery tasks
                step_results[step.key] = step_execution_fn(
                    app, pipeline_context, step, queue, priority
                )
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
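# The celery execution loop above orders both task submission and result polling by a
# negated priority, so that higher dagster-celery/priority tags sort first under ascending
# sorted(). A minimal, framework-free sketch of that sort-key trick; the tag name and the
# default priority are taken from the loop above, everything else is illustrative only.
TASK_DEFAULT_PRIORITY = 4


def priority_for_step(step_tags, run_priority=0):
    # Negate so that ascending sort order yields the highest-priority step first.
    return -1 * int(step_tags.get("dagster-celery/priority", TASK_DEFAULT_PRIORITY)) + -1 * run_priority


steps = {
    "load": {"dagster-celery/priority": "9"},
    "transform": {},
    "report": {"dagster-celery/priority": "1"},
}
ordered = sorted(steps, key=lambda key: priority_for_step(steps[key]))
print(ordered)  # ['load', 'transform', 'report'] - highest priority first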
def execute(pipeline_context, execution_plan):
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    app = make_app(celery_config)

    priority_for_step = lambda step: (
        -1 * int(step.tags.get('dagster-celery/priority', task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: (
        priority_for_step(execution_plan.get_step_by_key(step_key))
    )
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get('dagster-celery/queue', task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )
                step_results[step.key] = _submit_task(app, pipeline_context, step, queue)
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def _execute_run(request):
    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_execute_run_args
        )
        check.inst_param(execute_run_args, 'execute_run_args', ExecuteRunArgs)

        recon_pipeline = recon_pipeline_from_origin(execute_run_args.pipeline_origin)

        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)
        pid = os.getpid()
    except:  # pylint: disable=bare-except
        yield IPCErrorMessage(
            serializable_error_info=serializable_error_info_from_exc_info(sys.exc_info()),
            message='Error during RPC setup for ExecuteRun',
        )
        return

    yield instance.report_engine_event(
        'Started process for pipeline (pid: {pid}).'.format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end='cli_api_subprocess_init'),
    )

    # This is so nasty but seemingly unavoidable
    # https://amir.rachum.com/blog/2017/03/03/generator-cleanup/
    closed = False
    try:
        for event in execute_run_iterator(recon_pipeline, pipeline_run, instance):
            yield event
    except DagsterSubprocessError as err:
        if not all(
            [err_info.cls_name == 'KeyboardInterrupt' for err_info in err.subprocess_error_infos]
        ):
            yield instance.report_engine_event(
                'An exception was thrown during execution that is likely a framework error, '
                'rather than an error in user code.',
                pipeline_run,
                EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            instance.report_run_failed(pipeline_run)
    except GeneratorExit:
        closed = True
        raise
    except Exception:  # pylint: disable=broad-except
        yield instance.report_engine_event(
            'An exception was thrown during execution that is likely a framework error, '
            'rather than an error in user code.',
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(sys.exc_info())),
        )
        instance.report_run_failed(pipeline_run)
    finally:
        if not closed:
            yield instance.report_engine_event(
                'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
                pipeline_run,
            )
def core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.callable_param(step_execution_fn, "step_execution_fn")

    executor = pipeline_context.executor

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        execution_plan.artifacts_persisted,
        "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or "
        "similar system that allows files to be available to all nodes), S3, or GCS",
    )

    app = make_app(executor.app_args())

    priority_for_step = lambda step: (
        -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: (
        priority_for_step(execution_plan.get_step_by_key(step_key))
    )
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}

    with execution_plan.start(
        retries=pipeline_context.executor.retries,
        sort_key_fn=priority_for_step,
    ) as active_execution:

        stopping = False

        while (not active_execution.is_complete and not stopping) or step_results:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Celery executor: received termination signal - revoking active tasks from workers",
                    EngineEventData.interrupted(list(step_results.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                for result in step_results.values():
                    result.revoke()

            results_to_pop = []
            for step_key, result in sorted(
                step_results.items(), key=lambda x: priority_for_key(x[0])
            ):
                if result.ready():
                    try:
                        step_events = result.get()
                    except TaskRevokedError:
                        step_events = []
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'celery task for running step "{step_key}" was revoked.'.format(
                                step_key=step_key
                            ),
                            EngineEventData(marker_end=DELEGATE_MARKER),
                            step_handle=active_execution.get_step_by_key(step_key).handle,
                        )
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[step_key] = serializable_error_info_from_exc_info(
                            sys.exc_info()
                        )

                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context, step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping or step_errors:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                            step_key=step.key, queue=queue
                        ),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_handle=step.handle,
                    )

                    # Get the Celery priority for this step
                    priority = _get_step_priority(pipeline_context, step)

                    # Submit the Celery tasks
                    step_results[step.key] = step_execution_fn(
                        app, pipeline_context, step, queue, priority
                    )
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Encountered error during celery task submission.",
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()),
                        ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                "During celery execution errors occurred in workers:\n{error_list}".format(
                    error_list="\n".join(
                        [
                            "[{step}]: {err}".format(step=key, err=err.to_string())
                            for key, err in step_errors.items()
                        ]
                    )
                ),
                subprocess_error_infos=list(step_errors.values()),
            )
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    pipeline_name = pipeline_context.pipeline_def.name
    handle_dict = pipeline_context.execution_target_handle.to_dict()
    instance_ref_dict = pipeline_context.instance.get_ref().to_dict()
    environment_dict = dict(pipeline_context.environment_dict, execution={'in_process': {}})
    mode = pipeline_context.mode_def.name
    run_id = pipeline_context.pipeline_run.run_id

    app = make_app(celery_config)

    task_signatures = {}  # Dict[step_key, celery.Signature]
    apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

    priority_for_step = lambda step: (
        -1 * int(step.tags.get('dagster-celery/priority', task_default_priority))
    )
    priority_for_key = lambda step_key: (-1 * apply_kwargs[step_key]['priority'])
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        priority = int(step.tags.get('dagster-celery/priority', task_default_priority))
        queue = step.tags.get('dagster-celery/queue', task_default_queue)
        task = create_task(app)

        variables = {
            'executionParams': {
                'selector': {'name': pipeline_name},
                'environmentConfigData': environment_dict,
                'mode': mode,
                'executionMetadata': {'runId': run_id},
                'stepKeys': [step_key],
            }
        }
        task_signatures[step_key] = task.si(handle_dict, variables, instance_ref_dict)
        apply_kwargs[step_key] = {
            'priority': priority,
            'queue': queue,
            'routing_key': '{queue}.execute_query'.format(queue=queue),
        }

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_success = {}
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(sort_key_fn=priority_for_step)
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    if event.is_step_success:
                        step_success[step_key] = True
                    elif event.is_step_failure:
                        step_success[step_key] = False

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                was_success = step_success.get(step_key)
                if was_success == True:
                    active_execution.mark_success(step_key)
                elif was_success == False:
                    active_execution.mark_failed(step_key)
                else:
                    # check errors list?
                    pipeline_context.log.error(
                        'Step {key} finished without success or failure event, assuming failure.'.format(
                            key=step_key
                        )
                    )
                    active_execution.mark_failed(step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                step_results[step.key] = task_signatures[step.key].apply_async(
                    **apply_kwargs[step.key]
                )
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def _schedule_run_at_time(
    instance,
    logger,
    repo_location,
    schedule_state,
    schedule_time_utc,
    tick_holder,
    debug_crash_flags,
):
    schedule_name = schedule_state.name

    repo_dict = repo_location.get_repositories()
    check.invariant(
        len(repo_dict) == 1,
        "Reconstructed repository location should have exactly one repository",
    )
    external_repo = next(iter(repo_dict.values()))

    external_schedule = external_repo.get_external_schedule(schedule_name)

    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=external_schedule.pipeline_name,
        solid_selection=external_schedule.solid_selection,
    )

    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )

    # Rule out the case where the scheduler crashed between creating a run for this time
    # and launching it
    runs_filter = PipelineRunsFilter(
        tags=merge_dicts(
            PipelineRun.tags_for_schedule(schedule_state),
            {SCHEDULED_EXECUTION_TIME_TAG: schedule_time_utc.isoformat()},
        )
    )
    existing_runs = instance.get_runs(runs_filter)

    run_to_launch = None

    if len(existing_runs):
        check.invariant(len(existing_runs) == 1)

        run = existing_runs[0]

        if run.status != PipelineRunStatus.NOT_STARTED:
            # A run already exists and was launched for this time period,
            # but the scheduler must have crashed before the tick could be put
            # into a SUCCESS state
            logger.info(
                "Run {run_id} already completed for this execution of {schedule_name}".format(
                    run_id=run.run_id, schedule_name=schedule_state.name
                )
            )
            tick_holder.update_with_status(ScheduleTickStatus.SUCCESS, run_id=run.run_id)
            return
        else:
            logger.info(
                "Run {run_id} already created for this execution of {schedule_name}".format(
                    run_id=run.run_id, schedule_name=schedule_state.name
                )
            )
            run_to_launch = run
    else:
        run_to_launch = _create_scheduler_run(
            instance,
            logger,
            schedule_time_utc,
            repo_location,
            external_repo,
            external_schedule,
            external_pipeline,
            tick_holder,
        )
        _check_for_debug_crash(debug_crash_flags, "RUN_CREATED")

    if not run_to_launch:
        check.invariant(
            tick_holder.status != ScheduleTickStatus.STARTED
            and tick_holder.status != ScheduleTickStatus.SUCCESS
        )
        return

    if run_to_launch.status != PipelineRunStatus.FAILURE:
        try:
            instance.launch_run(run_to_launch.run_id, external_pipeline)
            logger.info(
                "Completed scheduled launch of run {run_id} for {schedule_name}".format(
                    run_id=run_to_launch.run_id, schedule_name=schedule_name
                )
            )
        except Exception as e:  # pylint: disable=broad-except
            if not isinstance(e, KeyboardInterrupt):
                error = serializable_error_info_from_exc_info(sys.exc_info())
                instance.report_engine_event(
                    error.message,
                    run_to_launch,
                    EngineEventData.engine_error(error),
                )
                instance.report_run_failed(run_to_launch)
                logger.error(
                    "Run {run_id} created successfully but failed to launch.".format(
                        run_id=run_to_launch.run_id
                    )
                )

    _check_for_debug_crash(debug_crash_flags, "RUN_LAUNCHED")

    tick_holder.update_with_status(ScheduleTickStatus.SUCCESS, run_id=run_to_launch.run_id)
    _check_for_debug_crash(debug_crash_flags, "TICK_SUCCESS")
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess executor: parent process (pid: {pid})".format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retry_mode=self.retries) as active_execution:
            active_iters = {}
            errors = {}
            term_events = {}
            stopping = False

            while (not stopping and not active_execution.is_complete) or active_iters:
                if active_execution.check_for_interrupts():
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess executor: received termination signal - "
                        "forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events.keys())),
                    )
                    stopping = True
                    active_execution.mark_interrupted()
                    for key, event in term_events.items():
                        event.set()

                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(limit=(limit - len(active_iters)))

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = multiprocessing.Event()
                        active_iters[step.key] = self.execute_step_out_of_process(
                            step_context,
                            step,
                            errors,
                            term_events,
                            active_execution.get_known_state(),
                        )

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)

                    except ChildProcessCrashException as crash:
                        serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            (
                                "Multiprocess executor: child process for step {step_key} "
                                "unexpectedly exited with code {exit_code}"
                            ).format(step_key=key, exit_code=crash.exit_code),
                            EngineEventData.engine_error(serializable_error),
                            step_handle=active_execution.get_step_by_key(key).handle,
                        )
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(
                                active_execution.get_step_by_key(key)
                            ),
                            step_failure_data=StepFailureData(
                                error=serializable_error, user_failure_data=None
                            ),
                        )
                        active_execution.handle_event(step_failure_event)
                        yield step_failure_event
                        empty_iters.append(key)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skipped and abandoned steps
                yield from active_execution.plan_events_iterator(pipeline_context)

            errs = {pid: err for pid, err in errors.items() if err}

            # After termination starts, raise an interrupted exception once all subprocesses
            # have finished cleaning up (and the only errors were from being interrupted)
            if (
                stopping
                and (not active_iters)
                and all(
                    [
                        err_info.cls_name == "DagsterExecutionInterruptedError"
                        for err_info in errs.values()
                    ]
                )
            ):
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Multiprocess executor: interrupted all active child processes",
                    event_specific_data=EngineEventData(),
                )
                raise DagsterExecutionInterruptedError()
            elif errs:
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list="\n".join(
                            [
                                "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                                for pid, err in errs.items()
                            ]
                        )
                    ),
                    subprocess_error_infos=list(errs.values()),
                )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
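The executor above forwards an interrupt by setting one multiprocessing.Event per child and letting each child wind down on its own. The following is a self-contained sketch of that cooperative-termination pattern only, not of the executor itself; worker and run_workers are names invented for the example.

import multiprocessing
import time


def worker(name, term_event):
    # Pretend to process work in small chunks, checking for termination between chunks.
    for i in range(100):
        if term_event.is_set():
            print(f"{name}: termination requested, cleaning up after chunk {i}")
            return
        time.sleep(0.05)
    print(f"{name}: finished normally")


def run_workers():
    term_events = {}
    procs = {}
    for name in ("step_a", "step_b"):
        term_events[name] = multiprocessing.Event()
        procs[name] = multiprocessing.Process(target=worker, args=(name, term_events[name]))
        procs[name].start()

    time.sleep(0.2)

    # Simulate the parent receiving an interrupt: forward it to every child.
    for event in term_events.values():
        event.set()

    for proc in procs.values():
        proc.join()


if __name__ == "__main__":
    run_workers()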
def _launch_scheduled_execution(instance, schedule_def, pipeline, tick, stream):
    pipeline_def = pipeline.get_definition()

    # Run should_execute and halt if it returns False
    schedule_context = ScheduleExecutionContext(instance)
    with user_code_error_boundary(
        ScheduleExecutionError,
        lambda: 'Error occurred during the execution of should_execute for schedule '
        '{schedule_name}'.format(schedule_name=schedule_def.name),
    ):
        should_execute = schedule_def.should_execute(schedule_context)

    if not should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return

    errors = []

    run_config = {}
    schedule_tags = {}
    try:
        with user_code_error_boundary(
            ScheduleExecutionError,
            lambda: 'Error occurred during the execution of run_config_fn for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            run_config = schedule_def.get_run_config(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    try:
        with user_code_error_boundary(
            ScheduleExecutionError,
            lambda: 'Error occurred during the execution of tags_fn for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            schedule_tags = schedule_def.get_tags(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    pipeline_tags = pipeline_def.tags or {}
    check_tags(pipeline_tags, 'pipeline_tags')
    tags = merge_dicts(pipeline_tags, schedule_tags)

    mode = schedule_def.mode

    execution_plan_snapshot = None
    try:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id()
        )
    except DagsterInvalidConfigError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=schedule_def.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=mode,
        solids_to_execute=pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS, run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(run_id=possibly_invalid_pipeline_run.run_id, errors=errors)
        )
        return

    # Otherwise the run should be valid, so let's launch it

    # Need an ExternalPipeline to launch, so make one here
    recon_repo = pipeline.get_reconstructable_repository()
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name
    ).get_full_external_pipeline(pipeline_def.name)

    try:
        launched_run = instance.launch_run(possibly_invalid_pipeline_run.run_id, external_pipeline)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(run_id=possibly_invalid_pipeline_run.run_id, errors=[error])
        )
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
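The function above keeps going after each user-code failure, accumulating serializable error records and only failing the run at the end. Here is a minimal, self-contained sketch of that error-collection shape; error_boundary, UserCodeError, and collect_schedule_data are stand-ins invented for the example, not Dagster's user_code_error_boundary.

import traceback
from contextlib import contextmanager


class UserCodeError(Exception):
    pass


@contextmanager
def error_boundary(msg):
    # Convert any exception raised by user code into a single, labeled error type.
    try:
        yield
    except Exception as exc:
        raise UserCodeError(f"{msg}: {exc}") from exc


def collect_schedule_data(run_config_fn, tags_fn):
    errors = []
    run_config, tags = {}, {}

    try:
        with error_boundary("Error occurred during the execution of run_config_fn"):
            run_config = run_config_fn()
    except UserCodeError:
        errors.append(traceback.format_exc())

    try:
        with error_boundary("Error occurred during the execution of tags_fn"):
            tags = tags_fn()
    except UserCodeError:
        errors.append(traceback.format_exc())

    return run_config, tags, errors


def bad_tags_fn():
    raise ValueError("tags_fn blew up")


if __name__ == "__main__":
    cfg, tags, errs = collect_schedule_data(run_config_fn=lambda: {"solids": {}}, tags_fn=bad_tags_fn)
    # The run config is still usable even though tags_fn failed; the caller can now
    # create the run record and immediately mark it failed, as the function above does.
    print(cfg, tags, len(errs))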
def execute_run(self, execute_run_args):
    check.inst_param(execute_run_args, "execute_run_args", ExecuteRunArgs)

    with DagsterInstance.from_ref(execute_run_args.instance_ref) as instance:
        try:
            pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)
            event_iterator = self._streaming_query(
                "ExecuteRun",
                api_pb2.ExecuteRunRequest,
                serialized_execute_run_args=serialize_dagster_namedtuple(execute_run_args),
            )
        except Exception as exc:  # pylint: disable=bare-except
            yield instance.report_engine_event(
                message="Unexpected error in IPC client",
                pipeline_run=pipeline_run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            raise exc

        try:
            for event in event_iterator:
                yield deserialize_json_to_dagster_namedtuple(
                    event.serialized_dagster_event_or_ipc_error_message
                )
        except KeyboardInterrupt:
            self.cancel_execution(CancelExecutionRequest(run_id=execute_run_args.pipeline_run_id))
            raise
        except grpc.RpcError as rpc_error:
            if (
                # posix
                "Socket closed" in rpc_error.debug_error_string()  # pylint: disable=no-member
                # windows
                or "Stream removed" in rpc_error.debug_error_string()  # pylint: disable=no-member
            ):
                yield instance.report_engine_event(
                    message="User process: GRPC server for {run_id} terminated unexpectedly".format(
                        run_id=pipeline_run.run_id
                    ),
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                )
                yield instance.report_run_failed(pipeline_run)
            else:
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                )
                raise rpc_error
        except Exception as exc:  # pylint: disable=bare-except
            yield instance.report_engine_event(
                message="Unexpected error in IPC client",
                pipeline_run=pipeline_run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            raise exc
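One detail of the client above worth isolating: when the local process is interrupted while draining the event stream, it first asks the remote side to cancel the run and then re-raises. The sketch below shows only that interrupt-handling shape with invented stand-ins (relay_events, fake_stream, send_cancel); the real client issues a CancelExecutionRequest over gRPC.

def relay_events(event_stream, send_cancel, run_id):
    try:
        for event in event_stream:
            yield event
    except KeyboardInterrupt:
        # Best effort: tell the server to stop the run, then propagate the interrupt
        # so the caller still sees Ctrl-C semantics.
        send_cancel(run_id)
        raise


if __name__ == "__main__":
    cancelled = []

    def fake_stream():
        yield {"event": "STEP_START"}
        raise KeyboardInterrupt()  # simulate Ctrl-C arriving mid-stream

    try:
        for evt in relay_events(fake_stream(), cancelled.append, run_id="abc123"):
            print(evt)
    except KeyboardInterrupt:
        pass

    assert cancelled == ["abc123"]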
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess engine: parent process (pid: {pid})".format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retries=self.retries) as active_execution:
            active_iters = {}
            errors = {}
            term_events = {}
            stopping = False

            while (not stopping and not active_execution.is_complete) or active_iters:
                try:
                    # start iterators
                    while len(active_iters) < limit and not stopping:
                        steps = active_execution.get_steps_to_execute(
                            limit=(limit - len(active_iters))
                        )

                        if not steps:
                            break

                        for step in steps:
                            step_context = pipeline_context.for_step(step)
                            term_events[step.key] = multiprocessing.Event()
                            active_iters[step.key] = self.execute_step_out_of_process(
                                step_context, step, errors, term_events
                            )

                    # process active iterators
                    empty_iters = []
                    for key, step_iter in active_iters.items():
                        try:
                            event_or_none = next(step_iter)
                            if event_or_none is None:
                                continue
                            else:
                                yield event_or_none
                                active_execution.handle_event(event_or_none)

                        except ChildProcessCrashException as crash:
                            serializable_error = serializable_error_info_from_exc_info(
                                sys.exc_info()
                            )
                            yield DagsterEvent.engine_event(
                                pipeline_context,
                                (
                                    "Multiprocess executor: child process for step {step_key} "
                                    "unexpectedly exited with code {exit_code}"
                                ).format(step_key=key, exit_code=crash.exit_code),
                                EngineEventData.engine_error(serializable_error),
                                step_key=key,
                            )
                            step_failure_event = DagsterEvent.step_failure_event(
                                step_context=pipeline_context.for_step(
                                    active_execution.get_step_by_key(key)
                                ),
                                step_failure_data=StepFailureData(
                                    error=serializable_error, user_failure_data=None
                                ),
                            )
                            active_execution.handle_event(step_failure_event)
                            yield step_failure_event
                            empty_iters.append(key)
                        except StopIteration:
                            empty_iters.append(key)

                    # clear and mark complete finished iterators
                    for key in empty_iters:
                        del active_iters[key]
                        if term_events[key].is_set():
                            stopping = True
                        del term_events[key]
                        active_execution.verify_complete(pipeline_context, key)

                    # process skips from failures or uncovered inputs
                    for event in active_execution.skipped_step_events_iterator(pipeline_context):
                        yield event

                # In the very small chance that we get interrupted in this coordination section and
                # not polling the subprocesses for events - try to clean up gracefully
                except KeyboardInterrupt:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events.keys())),
                    )
                    stopping = True
                    for event in term_events.values():
                        event.set()

            errs = {pid: err for pid, err in errors.items() if err}
            if errs:
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list="\n".join(
                            [
                                "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                                for pid, err in errs.items()
                            ]
                        )
                    ),
                    subprocess_error_infos=list(errs.values()),
                )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess engine: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def _create_scheduler_run(
    instance,
    logger,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_errors = []
    execution_plan_snapshot = None

    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception as e:  # pylint: disable=broad-except
        execution_plan_errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    tags[SCHEDULED_EXECUTION_TIME_TAG] = schedule_time.in_tz("UTC").isoformat()
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    # If the run was scheduled correctly but there was an error creating its
    # run config, enter it into the run DB with a FAILURE status
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=(
            PipelineRunStatus.FAILURE
            if len(execution_plan_errors) > 0
            else PipelineRunStatus.NOT_STARTED
        ),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        error_string = "\n".join([error.to_string() for error in execution_plan_errors])
        logger.error(f"Failed to fetch execution plan for {external_schedule.name}: {error_string}")

    return possibly_invalid_pipeline_run
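The run tags above are layered: schedule-provided tags override pipeline tags, then the scheduler stamps the scheduled execution time and, when present, a run key on top. A small sketch of that layering follows; merge_dicts here is a local stand-in for the Dagster utility of the same name, and the tag keys are illustrative.

from datetime import datetime, timezone

SCHEDULED_EXECUTION_TIME_TAG = "dagster/scheduled_execution_time"
RUN_KEY_TAG = "dagster/run_key"


def merge_dicts(left, right):
    # Later dict wins on key collisions, matching merge_dicts(pipeline_tags, schedule_tags).
    result = dict(left)
    result.update(right)
    return result


def build_run_tags(pipeline_tags, schedule_tags, schedule_time, run_key=None):
    tags = merge_dicts(pipeline_tags or {}, schedule_tags or {})
    tags[SCHEDULED_EXECUTION_TIME_TAG] = schedule_time.astimezone(timezone.utc).isoformat()
    if run_key:
        tags[RUN_KEY_TAG] = run_key
    return tags


if __name__ == "__main__":
    tags = build_run_tags(
        pipeline_tags={"team": "data", "priority": "low"},
        schedule_tags={"priority": "high"},  # overrides the pipeline's value
        schedule_time=datetime(2021, 1, 1, tzinfo=timezone.utc),
        run_key="2021-01-01",
    )
    print(tags)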
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    pipeline_name = pipeline_context.pipeline_def.name

    handle_dict = pipeline_context.execution_target_handle.to_dict()

    instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

    environment_dict = dict(pipeline_context.environment_dict, execution={'in_process': {}})

    mode = pipeline_context.mode_def.name

    run_id = pipeline_context.pipeline_run.run_id

    app = make_app(celery_config)

    pending_steps = execution_plan.execution_deps()

    task_signatures = {}  # Dict[step_key, celery.Signature]
    apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

    sort_by_priority = lambda step_key: (-1 * apply_kwargs[step_key]['priority'])

    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        priority = step.metadata.get('dagster-celery/priority', task_default_priority)
        queue = step.metadata.get('dagster-celery/queue', task_default_queue)

        task = create_task(app)

        variables = {
            'executionParams': {
                'selector': {'name': pipeline_name},
                'environmentConfigData': environment_dict,
                'mode': mode,
                'executionMetadata': {'runId': run_id},
                'stepKeys': [step_key],
            }
        }
        task_signatures[step_key] = task.si(handle_dict, variables, instance_ref_dict)
        apply_kwargs[step_key] = {
            'priority': priority,
            'queue': queue,
            'routing_key': '{queue}.execute_query'.format(queue=queue),
        }

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    completed_steps = set({})  # Set[step_key]

    while pending_steps or step_results:
        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: sort_by_priority(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                for step_event in step_events:
                    yield deserialize_json_to_dagster_namedtuple(step_event)
                results_to_pop.append(step_key)
                completed_steps.add(step_key)
        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]

        pending_to_pop = []
        for step_key, requirements in pending_steps.items():
            if requirements.issubset(completed_steps):
                pending_to_pop.append(step_key)

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        to_execute = sorted(pending_to_pop, key=sort_by_priority)
        for step_key in to_execute:
            try:
                step_results[step_key] = task_signatures[step_key].apply_async(
                    **apply_kwargs[step_key]
                )
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        for step_key in pending_to_pop:
            if step_key in pending_steps:
                del pending_steps[step_key]

        time.sleep(TICK_SECONDS)
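The sort step the comment above describes boils down to: of the steps whose dependencies are satisfied, submit the higher-priority ones first so idle workers pick them up in priority order. Below is a toy version of that selection; the step names, priorities, and ready_steps_in_priority_order helper are made up for illustration.

def ready_steps_in_priority_order(pending_steps, completed_steps, priorities):
    ready = [key for key, deps in pending_steps.items() if deps.issubset(completed_steps)]
    # Negate so that a larger priority value sorts first, mirroring sort_by_priority above.
    return sorted(ready, key=lambda key: -priorities[key])


if __name__ == "__main__":
    pending = {"load": set(), "transform": {"load"}, "backfill": set()}
    priorities = {"load": 5, "transform": 3, "backfill": 1}
    completed = set()

    batch = ready_steps_in_priority_order(pending, completed, priorities)
    print(batch)  # ['load', 'backfill'] -- higher priority first
    for key in batch:
        pending.pop(key)

    completed.add("load")
    print(ready_steps_in_priority_order(pending, completed, priorities))  # ['transform']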
def execute(self, pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multithread executor (pid: {pid})".format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(
            os.getpid(), execution_plan.step_keys_to_execute
        ),
    )

    with time_execution_scope() as timer_result:
        with execution_plan.start(retries=self.retries) as active_execution:
            active_iters = {}
            errors = {}

            while not active_execution.is_complete or active_iters:
                # start iterators
                while len(active_iters) < limit:
                    steps = active_execution.get_steps_to_execute(limit=(limit - len(active_iters)))

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        active_iters[step.key] = self.execute_step_in_thread(
                            step.key, step_context, errors
                        )

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        yield event_or_none
                        active_execution.handle_event(event_or_none)
                    except ThreadCrashException:
                        serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            f"Multithread executor: thread for step {key} exited unexpectedly",
                            EngineEventData.engine_error(serializable_error),
                        )
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(
                                active_execution.get_step_by_key(key)
                            ),
                            step_failure_data=StepFailureData(
                                error=serializable_error, user_failure_data=None
                            ),
                        )
                        active_execution.handle_event(step_failure_event)
                        yield step_failure_event
                        empty_iters.append(key)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skipped and abandoned steps
                for event in active_execution.plan_events_iterator(pipeline_context):
                    yield event

            errs = {tid: err for tid, err in errors.items() if err}
            if errs:
                raise DagsterThreadError(
                    "During multithread execution errors occurred in threads:\n{error_list}".format(
                        error_list="\n".join(
                            [
                                "In thread {tid}: {err}".format(tid=tid, err=err.to_string())
                                for tid, err in errs.items()
                            ]
                        )
                    ),
                    thread_error_infos=list(errs.values()),
                )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multithread executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.in_process(os.getpid()),
    )
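The executor above assumes an execute_step_in_thread helper that runs the step on a background thread and exposes a generator which yields events as they arrive and None while the thread is still working. Here is a self-contained sketch of that shape; run_step_in_thread, the sentinel, and the event payloads are invented for the example, not the real helper.

import queue
import threading
import time


def run_step_in_thread(step_key, step_fn):
    events = queue.Queue()

    def target():
        try:
            for event in step_fn():
                events.put(event)
        finally:
            events.put(("__done__", step_key))  # sentinel so the iterator knows the thread finished

    thread = threading.Thread(target=target, daemon=True)
    thread.start()

    def event_iter():
        while True:
            try:
                event = events.get(timeout=0.05)
            except queue.Empty:
                yield None  # still working; lets the coordinator poll other steps
                continue
            if event == ("__done__", step_key):
                return
            yield event

    return event_iter()


if __name__ == "__main__":
    def slow_step():
        time.sleep(0.1)
        yield ("STEP_OUTPUT", "result")
        yield ("STEP_SUCCESS", None)

    for evt in run_step_in_thread("my_step", slow_step):
        if evt is not None:
            print(evt)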