def sync_launch_scheduled_execution(schedule_origin, system_tz=None):
    """Synchronously launch a scheduled execution in a ``dagster api`` subprocess.

    Spawns ``<python> -m dagster api launch_scheduled_execution`` with the repo CLI
    args from ``schedule_origin``, then reads the unary IPC response back from a
    temp file.

    Args:
        schedule_origin (ExternalJobOrigin): Origin of the schedule/job to launch.
        system_tz (Optional[str]): If set, passed as ``--override-system-timezone``
            to the subprocess.

    Returns:
        ScheduledExecutionResult: The result deserialized from the temp file.

    Raises:
        DagsterSubprocessError: If the subprocess wrote an ``IPCErrorMessage``.
    """
    check.inst_param(schedule_origin, "schedule_origin", ExternalJobOrigin)

    with get_temp_file_name() as output_file:
        # Build the CLI invocation; the subprocess writes its result to output_file.
        parts = (
            [sys.executable, "-m", "dagster", "api", "launch_scheduled_execution", output_file,]
            + xplat_shlex_split(schedule_origin.get_repo_cli_args())
            + ["--schedule_name={}".format(schedule_origin.job_name)]
            + (["--override-system-timezone={}".format(system_tz)] if system_tz else [])
        )
        subprocess.check_call(parts)

        result = read_unary_response(output_file)
        if isinstance(result, ScheduledExecutionResult):
            return result
        elif isinstance(result, IPCErrorMessage):
            # Surface the subprocess-side error with its serialized traceback.
            error = result.serializable_error_info
            raise DagsterSubprocessError(
                "Error in API subprocess: {message}\n\n{err}".format(
                    message=result.message, err=error.to_string()
                ),
                subprocess_error_infos=[error],
            )
        else:
            check.failed("Unexpected result {}".format(result))
def sync_launch_scheduled_execution(schedule_origin):
    """Synchronously launch a scheduled execution in a ``dagster api`` subprocess.

    Older variant: runs the origin's own ``executable_path`` (rather than
    ``sys.executable``) and reads the unary IPC response from a temp file.

    Args:
        schedule_origin (SchedulePythonOrigin): Origin of the schedule to launch.

    Returns:
        ScheduledExecutionResult: The result deserialized from the temp file.

    Raises:
        DagsterSubprocessError: If the subprocess wrote an ``IPCErrorMessage``.
    """
    check.inst_param(schedule_origin, 'schedule_origin', SchedulePythonOrigin)

    with get_temp_file_name() as output_file:
        parts = ([
            schedule_origin.executable_path,
            '-m',
            'dagster',
            'api',
            'launch_scheduled_execution',
            output_file,
        ] + xplat_shlex_split(schedule_origin.get_repo_cli_args()) + [
            '--schedule_name={}'.format(schedule_origin.schedule_name),
        ])
        execute_command_in_subprocess(parts)

        result = read_unary_response(output_file)
        if isinstance(result, ScheduledExecutionResult):
            return result
        elif isinstance(result, IPCErrorMessage):
            # Surface the subprocess-side error with its serialized traceback.
            error = result.serializable_error_info
            raise DagsterSubprocessError(
                'Error in API subprocess: {message}\n\n{err}'.format(
                    message=result.message, err=error.to_string()),
                subprocess_error_infos=[error],
            )
        else:
            check.failed('Unexpected result {}'.format(result))
def sync_code():
    """Rsync local dagster source to a remote EMR node and pip-install it there.

    Reads ``AWS_EMR_PEM_FILE``, ``DAGSTER_DIR`` and ``AWS_EMR_NODE_ADDRESS`` from
    the environment.

    Raises:
        DagsterSubprocessError: If either the rsync or the remote pip install
            exits non-zero.
    """
    # Sync remote dagster packages with local dagster code
    sync_code_command = [
        'rsync',
        '-av',
        # FIX: was '-progress' (single dash), which rsync parses as the bundled
        # short options '-p -r -o -g -r -e ss' rather than the intended
        # progress display; '--progress' is the long option actually meant.
        '--progress',
        "--exclude='scala_modules/'",
        "--exclude='js_modules/'",
        "--exclude='.git/'",
        "--exclude='docs/'",
        '-e',
        '"ssh -i {aws_emr_pem_file}"'.format(aws_emr_pem_file=os.environ['AWS_EMR_PEM_FILE']),
        os.environ['DAGSTER_DIR'],
        os.environ['AWS_EMR_NODE_ADDRESS'] + ':~/',
    ]
    # NOTE(review): shell=True with a joined string is needed so the quoted
    # '-e "ssh -i ..."' argument survives; all inputs come from the environment,
    # not untrusted users.
    if (
        subprocess.call(
            ' '.join(sync_code_command), stdout=sys.stdout, stderr=sys.stderr, shell=True
        )
        != 0
    ):
        raise DagsterSubprocessError('Failed to sync code to EMR')

    # Install dagster packages on remote node as editable installs from the
    # source tree that was just synced.
    remote_install_dagster_packages_command = ['sudo', 'python3', '-m', 'pip', 'install'] + [
        token
        for package_subpath in ['dagster', 'libraries/dagster-pyspark']
        for token in ['-e', '/home/hadoop/dagster/python_modules/' + package_subpath]
    ]
    install_dagster_packages_command = [
        'ssh',
        '-i',
        os.environ['AWS_EMR_PEM_FILE'],
        os.environ['AWS_EMR_NODE_ADDRESS'],
        "'" + ' '.join(remote_install_dagster_packages_command) + "'",
    ]
    if (
        subprocess.call(
            ' '.join(install_dagster_packages_command),
            stdout=sys.stdout,
            stderr=sys.stderr,
            shell=True,
        )
        != 0
    ):
        raise DagsterSubprocessError('Failed to install dagster packages on EMR')
def sync_code():
    """Rsync local dagster source to a remote EMR node and pip-install it there.

    Reads ``AWS_EMR_PEM_FILE``, ``DAGSTER_DIR`` and ``AWS_EMR_NODE_ADDRESS`` from
    the environment.

    Raises:
        DagsterSubprocessError: If either the rsync or the remote pip install
            exits non-zero.
    """
    # Sync remote dagster packages with local dagster code
    sync_code_command = [
        "rsync",
        "-av",
        # FIX: was "-progress" (single dash), which rsync parses as the bundled
        # short options "-p -r -o -g -r -e ss" rather than the intended
        # progress display; "--progress" is the long option actually meant.
        "--progress",
        "--exclude='scala_modules/'",
        "--exclude='js_modules/'",
        "--exclude='.git/'",
        "--exclude='docs/'",
        "-e",
        '"ssh -i {aws_emr_pem_file}"'.format(aws_emr_pem_file=os.environ["AWS_EMR_PEM_FILE"]),
        os.environ["DAGSTER_DIR"],
        os.environ["AWS_EMR_NODE_ADDRESS"] + ":~/",
    ]
    # NOTE(review): shell=True with a joined string is needed so the quoted
    # '-e "ssh -i ..."' argument survives; all inputs come from the environment,
    # not untrusted users.
    if (
        subprocess.call(
            " ".join(sync_code_command), stdout=sys.stdout, stderr=sys.stderr, shell=True
        )
        != 0
    ):
        raise DagsterSubprocessError("Failed to sync code to EMR")

    # Install dagster packages on remote node as editable installs from the
    # source tree that was just synced.
    remote_install_dagster_packages_command = ["sudo", "python3", "-m", "pip", "install"] + [
        token
        for package_subpath in ["dagster", "libraries/dagster-pyspark"]
        for token in ["-e", "/home/hadoop/dagster/python_modules/" + package_subpath]
    ]
    install_dagster_packages_command = [
        "ssh",
        "-i",
        os.environ["AWS_EMR_PEM_FILE"],
        os.environ["AWS_EMR_NODE_ADDRESS"],
        "'" + " ".join(remote_install_dagster_packages_command) + "'",
    ]
    if (
        subprocess.call(
            " ".join(install_dagster_packages_command),
            stdout=sys.stdout,
            stderr=sys.stderr,
            shell=True,
        )
        != 0
    ):
        raise DagsterSubprocessError("Failed to install dagster packages on EMR")
def bounded_parallel_executor(pipeline_context, step_contexts, limit):
    """Yield events from executing steps in at most ``limit`` concurrent subprocesses.

    Steps are scheduled FIFO from ``step_contexts``; each child gets a termination
    Event that is set on interrupt so it can shut down.

    Args:
        pipeline_context: Execution context used to emit engine events.
        step_contexts: Iterable of step execution contexts to run.
        limit (int): Maximum number of concurrently active child processes.

    Raises:
        DagsterSubprocessError: If any child process reported an error.
    """
    pending_execution = list(step_contexts)
    active_iters = {}
    errors = {}
    term_events = {}
    stopping = False

    while (not stopping and pending_execution) or active_iters:
        try:
            # Top up the pool of active child-process iterators.
            while len(active_iters) < limit and pending_execution and not stopping:
                step_context = pending_execution.pop(0)
                step = step_context.step
                term_events[step.key] = get_multiprocessing_context().Event()
                active_iters[step.key] = execute_step_out_of_process(
                    step_context, step, errors, term_events
                )

            # Poll each active iterator for its next event.
            empty_iters = []
            for key, step_iter in active_iters.items():
                try:
                    event_or_none = next(step_iter)
                    if event_or_none is None:
                        continue
                    else:
                        yield event_or_none
                except StopIteration:
                    empty_iters.append(key)

            # Retire finished iterators; a set term event means we were asked to stop.
            for key in empty_iters:
                del active_iters[key]
                if term_events[key].is_set():
                    stopping = True
                del term_events[key]

        # In the very small chance that we get interrupted in this coordination section and not
        # polling the subprocesses for events - try to clean up gracefully
        except KeyboardInterrupt:
            yield DagsterEvent.engine_event(
                pipeline_context,
                'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                EngineEventData.interrupted(list(term_events.keys())),
            )
            # FIX: stop scheduling new steps after an interrupt; previously only the
            # term events were set, so pending steps could still be launched before
            # the first child finished (sibling variants of this loop set this flag).
            stopping = True
            for event in term_events.values():
                event.set()

    errs = {pid: err for pid, err in errors.items() if err}
    if errs:
        raise DagsterSubprocessError(
            'During multiprocess execution errors occured in child processes:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        'In process {pid}: {err}'.format(pid=pid, err=err.to_string())
                        for pid, err in errs.items()
                    ]
                )
            ),
            subprocess_error_infos=list(errs.values()),
        )
def sync_get_external_execution_plan_grpc(
    api_client,
    pipeline_origin,
    run_config,
    mode,
    pipeline_snapshot_id,
    solid_selection=None,
    step_keys_to_execute=None,
    known_state=None,
):
    """Fetch an ``ExecutionPlanSnapshot`` synchronously over gRPC.

    Args:
        api_client (DagsterGrpcClient): Client for the gRPC server.
        pipeline_origin (ExternalPipelineOrigin): Origin of the target pipeline.
        run_config (dict): Run configuration for the plan.
        mode (str): Mode name.
        pipeline_snapshot_id (str): Snapshot id the plan is built against.
        solid_selection (Optional[List[str]]): Optional solid subset.
        step_keys_to_execute (Optional[List[str]]): Optional step subset.
        known_state (Optional[KnownExecutionState]): Prior execution state, if any.

    Returns:
        ExecutionPlanSnapshot: The snapshot returned by the server.

    Raises:
        DagsterSubprocessError: If the server returned ``ExecutionPlanSnapshotErrorData``.
    """
    # Local import to avoid a module-level dependency on the grpc client.
    from dagster.grpc.client import DagsterGrpcClient

    check.inst_param(api_client, "api_client", DagsterGrpcClient)
    check.inst_param(pipeline_origin, "pipeline_origin", ExternalPipelineOrigin)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)
    check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")
    check.opt_inst_param(known_state, "known_state", KnownExecutionState)

    result = check.inst(
        api_client.execution_plan_snapshot(
            execution_plan_snapshot_args=ExecutionPlanSnapshotArgs(
                pipeline_origin=pipeline_origin,
                solid_selection=solid_selection,
                run_config=run_config,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
                pipeline_snapshot_id=pipeline_snapshot_id,
                known_state=known_state,
            )),
        (ExecutionPlanSnapshot, ExecutionPlanSnapshotErrorData),
    )

    if isinstance(result, ExecutionPlanSnapshotErrorData):
        raise DagsterSubprocessError(result.error.to_string(),
                                     subprocess_error_infos=[result.error])
    return result
def bounded_parallel_executor(step_contexts, limit):
    """Yield events from executing steps in at most ``limit`` concurrent subprocesses.

    Args:
        step_contexts: Iterable of step execution contexts to run.
        limit (int): Maximum number of concurrently active child processes.

    Raises:
        DagsterSubprocessError: If any child process reported an error.
    """
    pending_execution = list(step_contexts)
    active_iters = {}
    pid_tracker = {}

    while pending_execution or active_iters:
        # Top up the pool of active child-process iterators.
        while len(active_iters) < limit and pending_execution:
            # NOTE(review): pop() takes from the END (LIFO); sibling variants of
            # this loop use pop(0) for FIFO order — confirm which is intended.
            step_context = pending_execution.pop()
            step = step_context.step
            active_iters[step.key] = execute_step_out_of_process(step_context, step, pid_tracker)

        # Poll each active iterator for its next event.
        empty_iters = []
        for key, step_iter in active_iters.items():
            try:
                event_or_none = next(step_iter)
                if event_or_none is None:
                    continue
                else:
                    yield event_or_none
            except StopIteration:
                empty_iters.append(key)

        # Retire finished iterators.
        for key in empty_iters:
            del active_iters[key]

    errs = {pid: err for pid, err in pid_tracker.items() if err}
    if errs:
        raise DagsterSubprocessError(
            'During multiprocess execution errors occured in child processes:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        'In process {pid}: {err}'.format(pid=pid, err=err.to_string())
                        for pid, err in errs.items()
                    ]
                )
            ),
            subprocess_error_infos=list(errs.values()),
        )
def sync_get_external_execution_plan(
    pipeline_origin,
    run_config,
    mode,
    pipeline_snapshot_id,
    solid_selection=None,
    step_keys_to_execute=None,
):
    """Fetch an ``ExecutionPlanSnapshot`` synchronously via the unary CLI API.

    Args:
        pipeline_origin (PipelinePythonOrigin): Origin of the target pipeline.
        run_config (dict): Run configuration for the plan.
        mode (str): Mode name.
        pipeline_snapshot_id (str): Snapshot id the plan is built against.
        solid_selection (Optional[List[str]]): Optional solid subset.
        step_keys_to_execute (Optional[List[str]]): Optional step subset.

    Returns:
        ExecutionPlanSnapshot: The snapshot returned by the subprocess.

    Raises:
        DagsterSubprocessError: If the subprocess returned ``ExecutionPlanSnapshotErrorData``.
    """
    check.inst_param(pipeline_origin, "pipeline_origin", PipelinePythonOrigin)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.opt_list_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)
    check.str_param(pipeline_snapshot_id, "pipeline_snapshot_id")

    # Build the serializable args once, then shell out through the unary CLI API.
    snapshot_args = ExecutionPlanSnapshotArgs(
        pipeline_origin=pipeline_origin,
        solid_selection=solid_selection,
        run_config=run_config,
        mode=mode,
        step_keys_to_execute=step_keys_to_execute,
        pipeline_snapshot_id=pipeline_snapshot_id,
    )
    raw_result = execute_unary_api_cli_command(
        pipeline_origin.executable_path, "execution_plan", snapshot_args
    )
    result = check.inst(raw_result, (ExecutionPlanSnapshot, ExecutionPlanSnapshotErrorData))

    # An error payload from the subprocess is re-raised locally.
    if isinstance(result, ExecutionPlanSnapshotErrorData):
        raise DagsterSubprocessError(
            result.error.to_string(), subprocess_error_infos=[result.error]
        )
    return result
def sync_get_external_execution_plan_grpc(
    api_client,
    pipeline_origin,
    run_config,
    mode,
    pipeline_snapshot_id,
    solid_selection=None,
    step_keys_to_execute=None,
):
    """Fetch an ``ExecutionPlanSnapshot`` synchronously over gRPC (older variant).

    Args:
        api_client (DagsterGrpcClient): Client for the gRPC server.
        pipeline_origin (PipelineOrigin): Origin of the target pipeline.
        run_config (dict): Run configuration for the plan.
        mode (str): Mode name.
        pipeline_snapshot_id (str): Snapshot id the plan is built against.
        solid_selection (Optional[List[str]]): Optional solid subset.
        step_keys_to_execute (Optional[List[str]]): Optional step subset.

    Returns:
        ExecutionPlanSnapshot: The snapshot returned by the server.

    Raises:
        DagsterSubprocessError: If the server returned ``ExecutionPlanSnapshotErrorData``.
    """
    # Local import to avoid a module-level dependency on the grpc client.
    from dagster.grpc.client import DagsterGrpcClient

    check.inst_param(api_client, 'api_client', DagsterGrpcClient)
    check.inst_param(pipeline_origin, 'pipeline_origin', PipelineOrigin)
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)
    check.str_param(pipeline_snapshot_id, 'pipeline_snapshot_id')

    result = check.inst(
        api_client.execution_plan_snapshot(
            execution_plan_snapshot_args=ExecutionPlanSnapshotArgs(
                pipeline_origin=pipeline_origin,
                solid_selection=solid_selection,
                run_config=run_config,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
                pipeline_snapshot_id=pipeline_snapshot_id,
            )),
        (ExecutionPlanSnapshot, ExecutionPlanSnapshotErrorData),
    )

    if isinstance(result, ExecutionPlanSnapshotErrorData):
        raise DagsterSubprocessError(result.error.to_string(),
                                     subprocess_error_infos=[result.error])
    return result
def execute(self, pipeline_context, execution_plan):
    """Run the plan's steps in child processes, yielding DagsterEvents as they occur.

    Up to ``self.max_concurrent`` step iterators are active at once; each child
    gets a termination Event forwarded on KeyboardInterrupt. Errors collected
    from children are raised as a single DagsterSubprocessError at the end.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan whose steps are executed.

    Yields:
        DagsterEvent: Engine, step, and skip events.

    Raises:
        DagsterSubprocessError: If any child process recorded an error.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess engine: parent process (pid: {pid})".format(
            pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retries=self.retries) as active_execution:
            active_iters = {}   # step_key -> iterator of events from the child process
            errors = {}         # pid -> serializable error info from the child
            term_events = {}    # step_key -> Event used to ask the child to terminate
            stopping = False

            while (not stopping and not active_execution.is_complete) or active_iters:
                try:
                    # start iterators
                    while len(active_iters) < limit and not stopping:
                        steps = active_execution.get_steps_to_execute(
                            limit=(limit - len(active_iters)))
                        if not steps:
                            break
                        for step in steps:
                            step_context = pipeline_context.for_step(step)
                            term_events[step.key] = multiprocessing.Event()
                            active_iters[step.key] = self.execute_step_out_of_process(
                                step_context, step, errors, term_events)

                    # process active iterators
                    empty_iters = []
                    for key, step_iter in active_iters.items():
                        try:
                            event_or_none = next(step_iter)
                            if event_or_none is None:
                                continue
                            else:
                                yield event_or_none
                                active_execution.handle_event(event_or_none)
                        except ChildProcessCrashException as crash:
                            # The child died without reporting; synthesize a
                            # step-failure event so the plan can make progress.
                            serializable_error = serializable_error_info_from_exc_info(
                                sys.exc_info())
                            yield DagsterEvent.engine_event(
                                pipeline_context,
                                ("Multiprocess executor: child process for step {step_key} "
                                 "unexpectedly exited with code {exit_code}").format(
                                     step_key=key, exit_code=crash.exit_code),
                                EngineEventData.engine_error(serializable_error),
                                step_key=key,
                            )
                            step_failure_event = DagsterEvent.step_failure_event(
                                step_context=pipeline_context.for_step(
                                    active_execution.get_step_by_key(key)),
                                step_failure_data=StepFailureData(
                                    error=serializable_error, user_failure_data=None),
                            )
                            active_execution.handle_event(step_failure_event)
                            yield step_failure_event
                            empty_iters.append(key)
                        except StopIteration:
                            empty_iters.append(key)

                    # clear and mark complete finished iterators
                    for key in empty_iters:
                        del active_iters[key]
                        if term_events[key].is_set():
                            stopping = True
                        del term_events[key]
                        active_execution.verify_complete(pipeline_context, key)

                    # process skips from failures or uncovered inputs
                    for event in active_execution.skipped_step_events_iterator(pipeline_context):
                        yield event

                # In the very small chance that we get interrupted in this coordination section and not
                # polling the subprocesses for events - try to clean up gracefully
                except KeyboardInterrupt:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events.keys())),
                    )
                    stopping = True
                    for event in term_events.values():
                        event.set()

            errs = {pid: err for pid, err in errors.items() if err}
            if errs:
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list="\n".join([
                            "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ])),
                    subprocess_error_infos=list(errs.values()),
                )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess engine: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def execute(pipeline_context, execution_plan):
    """Run the plan's steps as Celery tasks, yielding DagsterEvents as they occur.

    Builds one task signature per step (with per-step priority/queue from step
    tags), then loops: harvest ready results, mark success/failure, emit skip
    events, and submit newly-ready steps in priority order.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run;
            its ``executor_config`` must be a ``CeleryConfig``.
        execution_plan (ExecutionPlan): The plan whose steps are executed.

    Yields:
        DagsterEvent: Step and engine events deserialized from workers.

    Raises:
        DagsterSubprocessError: If any worker recorded an error.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config),
    )

    celery_config = pipeline_context.executor_config
    storage = pipeline_context.environment_dict.get('storage')

    # A non-local broker/backend implies workers on other machines, which need
    # shared (S3/GCS) storage; purely local setups just can't use in-memory.
    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
            celery_config.backend and not is_local_uri(celery_config.backend)):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(broker=celery_config.broker,
                                            backend=celery_config.backend),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    pipeline_name = pipeline_context.pipeline_def.name
    handle_dict = pipeline_context.execution_target_handle.to_dict()
    instance_ref_dict = pipeline_context.instance.get_ref().to_dict()
    # Force in-process execution inside each worker task.
    environment_dict = dict(pipeline_context.environment_dict, execution={'in_process': {}})
    mode = pipeline_context.mode_def.name
    run_id = pipeline_context.pipeline_run.run_id

    app = make_app(celery_config)

    task_signatures = {}  # Dict[step_key, celery.Signature]
    apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

    # Negated so that higher 'dagster-celery/priority' sorts first.
    priority_for_step = lambda step: (-1 * int(
        step.tags.get('dagster-celery/priority', task_default_priority)))
    priority_for_key = lambda step_key: (-1 * apply_kwargs[step_key]['priority'])
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    # Pre-build one task signature (and its apply kwargs) per step.
    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        priority = int(step.tags.get('dagster-celery/priority', task_default_priority))
        queue = step.tags.get('dagster-celery/queue', task_default_queue)
        task = create_task(app)

        variables = {
            'executionParams': {
                'selector': {'name': pipeline_name},
                'environmentConfigData': environment_dict,
                'mode': mode,
                'executionMetadata': {'runId': run_id},
                'stepKeys': [step_key],
            }
        }
        task_signatures[step_key] = task.si(handle_dict, variables, instance_ref_dict)
        apply_kwargs[step_key] = {
            'priority': priority,
            'queue': queue,
            'routing_key': '{queue}.execute_query'.format(queue=queue),
        }

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_success = {}  # step_key -> True/False once a success/failure event is seen
    step_errors = {}   # step_key -> serializable error info from the worker
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(sort_key_fn=priority_for_step)
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:
        results_to_pop = []
        for step_key, result in sorted(
                step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception as e:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(
                        sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    if event.is_step_success:
                        step_success[step_key] = True
                    elif event.is_step_failure:
                        step_success[step_key] = False

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                was_success = step_success.get(step_key)
                if was_success == True:
                    active_execution.mark_success(step_key)
                elif was_success == False:
                    active_execution.mark_failed(step_key)
                else:
                    # check errors list?
                    pipeline_context.log.error(
                        'Step {key} finished without success or failure event, assuming failure.'
                        .format(key=step_key))
                    active_execution.mark_failed(step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # dont add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                step_results[step.key] = task_signatures[step.key].apply_async(
                    **apply_kwargs[step.key])
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.'.format(),
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()), ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occured in workers:\n{error_list}'.format(
                error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
            subprocess_error_infos=list(step_errors.values()),
        )
def execute(pipeline_context, execution_plan):
    """Run the plan's steps in child processes, yielding DagsterEvents as they occur.

    Up to ``executor_config.max_concurrent`` step iterators are active at once;
    each child gets a termination Event forwarded on KeyboardInterrupt. Errors
    collected from children are raised as one DagsterSubprocessError at the end.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan whose steps are executed.

    Yields:
        DagsterEvent: Engine, step, and skip events.

    Raises:
        DagsterSubprocessError: If any child process recorded an error.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    # NOTE(review): assigned but not referenced in this function body — confirm
    # whether it is still needed.
    intermediates_manager = pipeline_context.intermediates_manager

    limit = pipeline_context.executor_config.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'
        .format(pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        active_execution = execution_plan.start(
            retries=pipeline_context.executor_config.retries)

        active_iters = {}   # step_key -> iterator of events from the child process
        errors = {}         # pid -> serializable error info from the child
        term_events = {}    # step_key -> Event used to ask the child to terminate
        stopping = False

        while (not stopping and not active_execution.is_complete) or active_iters:
            try:
                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(
                        limit=(limit - len(active_iters)))
                    if not steps:
                        break
                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = get_multiprocessing_context().Event()
                        active_iters[step.key] = execute_step_out_of_process(
                            step_context, step, errors, term_events)

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    if term_events[key].is_set():
                        stopping = True
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skips from failures or uncovered inputs
                for event in active_execution.skipped_step_events_iterator(pipeline_context):
                    yield event

            # In the very small chance that we get interrupted in this coordination section and not
            # polling the subprocesses for events - try to clean up gracefully
            except KeyboardInterrupt:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                    EngineEventData.interrupted(list(term_events.keys())),
                )
                stopping = True
                for event in term_events.values():
                    event.set()

        errs = {pid: err for pid, err in errors.items() if err}
        if errs:
            raise DagsterSubprocessError(
                'During multiprocess execution errors occurred in child processes:\n{error_list}'
                .format(error_list='\n'.join([
                    'In process {pid}: {err}'.format(pid=pid, err=err.to_string())
                    for pid, err in errs.items()
                ])),
                subprocess_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'
        .format(duration=format_duration(timer_result.millis), pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def execute(self, pipeline_context, execution_plan):
    """Run the plan's steps in child processes, yielding DagsterEvents as they occur.

    Interrupt-aware variant: polls ``active_execution.check_for_interrupts()``
    each tick, forwards termination via per-step Events, and raises
    ``DagsterExecutionInterruptedError`` once all children have cleaned up after
    an interrupt (when their only errors were interruption errors).

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan whose steps are executed.

    Yields:
        DagsterEvent: Engine, step, skip, and abandonment events.

    Raises:
        DagsterExecutionInterruptedError: After a clean interrupt shutdown.
        DagsterSubprocessError: If any child process recorded a non-interrupt error.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess executor: parent process (pid: {pid})".format(
            pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retry_mode=self.retries) as active_execution:
            active_iters = {}   # step_key -> iterator of events from the child process
            errors = {}         # pid -> serializable error info from the child
            term_events = {}    # step_key -> Event used to ask the child to terminate
            stopping = False

            while (not stopping and not active_execution.is_complete) or active_iters:
                if active_execution.check_for_interrupts():
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess executor: received termination signal - "
                        "forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events.keys())),
                    )
                    stopping = True
                    active_execution.mark_interrupted()
                    for key, event in term_events.items():
                        event.set()

                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(
                        limit=(limit - len(active_iters)))
                    if not steps:
                        break
                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = multiprocessing.Event()
                        active_iters[step.key] = self.execute_step_out_of_process(
                            step_context,
                            step,
                            errors,
                            term_events,
                            active_execution.get_known_state(),
                        )

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)
                    except ChildProcessCrashException as crash:
                        # The child died without reporting; synthesize a
                        # step-failure event so the plan can make progress.
                        serializable_error = serializable_error_info_from_exc_info(
                            sys.exc_info())
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            ("Multiprocess executor: child process for step {step_key} "
                             "unexpectedly exited with code {exit_code}").format(
                                 step_key=key, exit_code=crash.exit_code),
                            EngineEventData.engine_error(serializable_error),
                            step_handle=active_execution.get_step_by_key(key).handle,
                        )
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(
                                active_execution.get_step_by_key(key)),
                            step_failure_data=StepFailureData(
                                error=serializable_error, user_failure_data=None),
                        )
                        active_execution.handle_event(step_failure_event)
                        yield step_failure_event
                        empty_iters.append(key)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skipped and abandoned steps
                yield from active_execution.plan_events_iterator(pipeline_context)

            errs = {pid: err for pid, err in errors.items() if err}

            # After termination starts, raise an interrupted exception once all subprocesses
            # have finished cleaning up (and the only errors were from being interrupted)
            if (stopping and (not active_iters) and all([
                    err_info.cls_name == "DagsterExecutionInterruptedError"
                    for err_info in errs.values()
            ])):
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Multiprocess executor: interrupted all active child processes",
                    event_specific_data=EngineEventData(),
                )
                raise DagsterExecutionInterruptedError()
            elif errs:
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list="\n".join([
                            "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ])),
                    subprocess_error_infos=list(errs.values()),
                )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    """Drive a run by submitting steps through ``step_execution_fn`` as Celery tasks.

    Each tick: check for interrupts (revoking active tasks if so), harvest ready
    results, emit their events, emit plan events, then submit newly-ready steps
    in priority order and sleep ``TICK_SECONDS``.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the run.
        execution_plan (ExecutionPlan): The plan whose steps are executed.
        step_execution_fn (callable): Called as ``(app, pipeline_context, step,
            queue, priority)``; returns a celery ``AsyncResult`` per step.

    Yields:
        DagsterEvent: Step and engine events deserialized from workers.

    Raises:
        DagsterSubprocessError: If any worker recorded an error.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.callable_param(step_execution_fn, "step_execution_fn")

    executor = pipeline_context.executor

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        execution_plan.artifacts_persisted,
        "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or "
        "similar system that allows files to be available to all nodes), S3, or GCS",
    )

    app = make_app(executor.app_args())

    # Negated so that higher tag/run priority sorts first.
    priority_for_step = lambda step: (-1 * int(
        step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority)
    ) + -1 * _get_run_priority(pipeline_context))
    priority_for_key = lambda step_key: (priority_for_step(
        execution_plan.get_step_by_key(step_key)))
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}   # step_key -> serializable error info from the worker

    with execution_plan.start(
            retries=pipeline_context.executor.retries,
            sort_key_fn=priority_for_step,
    ) as active_execution:

        stopping = False

        while (not active_execution.is_complete and not stopping) or step_results:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Celery executor: received termination signal - revoking active tasks from workers",
                    EngineEventData.interrupted(list(step_results.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                for result in step_results.values():
                    result.revoke()

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(), key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except TaskRevokedError:
                        # Revocation is an expected outcome of stopping; emit an
                        # engine event rather than recording an error.
                        step_events = []
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'celery task for running step "{step_key}" was revoked.'.format(
                                step_key=step_key, ),
                            EngineEventData(marker_end=DELEGATE_MARKER),
                            step_handle=active_execution.get_step_by_key(step_key).handle,
                        )
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[step_key] = serializable_error_info_from_exc_info(
                            sys.exc_info())
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context, step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping or step_errors:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                            step_key=step.key, queue=queue),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_handle=step.handle,
                    )

                    # Get the Celery priority for this step
                    priority = _get_step_priority(pipeline_context, step)

                    # Submit the Celery tasks
                    step_results[step.key] = step_execution_fn(
                        app, pipeline_context, step, queue, priority)
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Encountered error during celery task submission.".format(),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                "During celery execution errors occurred in workers:\n{error_list}".format(
                    error_list="\n".join([
                        "[{step}]: {err}".format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ])),
                subprocess_error_infos=list(step_errors.values()),
            )
def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    """Core event loop for Celery-backed execution of a Dagster execution plan.

    Repeatedly submits ready-to-execute steps as Celery tasks (via
    ``step_execution_fn``) and polls outstanding results, yielding the Dagster
    events reported back by the workers as they arrive.

    Args:
        pipeline_context (SystemPipelineExecutionContext): active pipeline context;
            its ``executor_config`` must be a ``CeleryConfig`` or ``CeleryK8sJobConfig``.
        execution_plan (ExecutionPlan): the plan whose steps are dispatched.
        step_execution_fn (callable): invoked as
            ``step_execution_fn(app, pipeline_context, step, queue, priority)``;
            must return a Celery ``AsyncResult``-like object supporting
            ``ready()`` and ``get()``.

    Yields:
        DagsterEvent: engine events for task submission plus all events
        deserialized from worker results.

    Raises:
        DagsterSubprocessError: after the loop drains, if any worker errored.
    """
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    # Lower numeric value == higher Celery priority, hence the negation of both
    # the per-step tag priority and the run-level priority.
    priority_for_step = lambda step: (
        -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: priority_for_step(execution_plan.get_step_by_key(step_key))
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[str (step key), celery.AsyncResult]
    step_errors = {}
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    # Keep looping while there is work to schedule, or results still in flight
    # (we must drain outstanding results even once stopping is set).
    while (not active_execution.is_complete and not stopping) or step_results:
        results_to_pop = []
        # Poll highest-priority results first so their downstream steps unblock sooner.
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )

                # Get the Celery priority for this step
                priority = _get_step_priority(pipeline_context, step)

                # Submit the Celery tasks
                step_results[step.key] = step_execution_fn(
                    app, pipeline_context, step, queue, priority
                )
            except Exception:
                # Fixed: original called `.format()` on a literal with no
                # placeholders (a no-op).
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def execute(pipeline_context, execution_plan):
    """Execute an execution plan on Celery workers, yielding Dagster events.

    Validates that the configured storage is compatible with the Celery broker
    and backend, then runs a submit/poll loop: ready steps are dispatched with
    ``_submit_task`` and finished worker results are deserialized and yielded.

    Args:
        pipeline_context (SystemPipelineExecutionContext): active pipeline context;
            its ``executor_config`` must be a ``CeleryConfig``.
        execution_plan (ExecutionPlan): the plan whose steps are dispatched.

    Yields:
        DagsterEvent: engine events for task submission plus all events
        deserialized from worker results.

    Raises:
        DagsterSubprocessError: after the loop drains, if any worker errored.
    """
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config
    # NOTE(review): assumes the 'storage' key is present in environment_dict;
    # if it is absent, `storage.get(...)` below raises AttributeError — confirm
    # upstream config validation guarantees it.
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        # Remote broker/backend: workers on other hosts need shared object storage.
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    app = make_app(celery_config)

    # Lower numeric value == higher Celery priority, hence the negation of both
    # the per-step tag priority and the run-level priority.
    priority_for_step = lambda step: (
        -1 * int(step.tags.get('dagster-celery/priority', task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: priority_for_step(execution_plan.get_step_by_key(step_key))
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[str (step key), celery.AsyncResult]
    step_errors = {}
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    # Keep looping while there is work to schedule, or results still in flight
    # (we must drain outstanding results even once stopping is set).
    while (not active_execution.is_complete and not stopping) or step_results:
        results_to_pop = []
        # Poll highest-priority results first so their downstream steps unblock sooner.
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # Fixed: original bound the exception (`as e`) but never used it.
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get('dagster-celery/queue', task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )
                step_results[step.key] = _submit_task(app, pipeline_context, step, queue)
            except Exception:
                # Fixed: original called `.format()` on a literal with no
                # placeholders (a no-op).
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )