def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    step_levels = execution_plan.execution_step_levels()

    intermediates_manager = pipeline_context.intermediates_manager

    limit = pipeline_context.executor_config.max_concurrent

    step_key_set = set(step.key for step in execution_plan.execution_steps())

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=step_key_set
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
            yield event

        for step_level in step_levels:
            step_contexts_to_execute = []
            for step in step_level:
                step_context = pipeline_context.for_step(step)

                if not intermediates_manager.all_inputs_covered(step_context, step):
                    uncovered_inputs = intermediates_manager.uncovered_inputs(step_context, step)
                    step_context.log.error(
                        (
                            'Not all inputs covered for {step}. Not executing. '
                            'Output missing for inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                    )
                    continue

                step_contexts_to_execute.append(step_context)

            for step_event in bounded_parallel_executor(step_contexts_to_execute, limit):
                yield step_event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
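
# `bounded_parallel_executor` is referenced above but not defined in this file. The
# sketch below is a plausible reconstruction, not the actual implementation: it
# assumes a hypothetical `execute_step_out_of_process(step_context, step)` that
# returns an iterator yielding DagsterEvents (or None while the child process is
# idle) and is exhausted once the child exits.
def bounded_parallel_executor_sketch(step_contexts, limit):
    pending = list(step_contexts)
    active_iters = {}  # step key -> event iterator backed by a child process

    while pending or active_iters:
        # top up to the concurrency limit before polling
        while pending and len(active_iters) < limit:
            step_context = pending.pop()
            active_iters[step_context.step.key] = execute_step_out_of_process(
                step_context, step_context.step
            )

        # poll each active iterator once, forwarding any events it produced
        finished_keys = []
        for key, step_iter in active_iters.items():
            try:
                event_or_none = next(step_iter)
                if event_or_none is not None:
                    yield event_or_none
            except StopIteration:
                finished_keys.append(key)

        # drop exhausted iterators so new steps can start
        for key in finished_keys:
            del active_iters[key]
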
def inner_plan_execution_iterator(pipeline_context, execution_plan, retries):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.inst_param(retries, 'retries', Retries)

    for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
        yield event

    # It would be good to implement a reference tracking algorithm here to
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    active_execution = execution_plan.start(retries=retries)
    while not active_execution.is_complete:
        step = active_execution.get_next_step()

        step_context = pipeline_context.for_step(step)
        check.invariant(
            all(
                hasattr(step_context.resources, resource_key)
                for resource_key in step_context.required_resource_keys
            ),
            'expected step context to have all required resources',
        )

        with pipeline_context.instance.compute_log_manager.watch(
            step_context.pipeline_run, step_context.step.key
        ):  # capture all of the logs for this step
            uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                step_context, step
            )
            if uncovered_inputs:
                # In partial pipeline execution, we may end up here without having validated the
                # missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                step_context.log.info(
                    (
                        'Not all inputs covered for {step}. Not executing. Output missing for '
                        'inputs: {uncovered_inputs}'
                    ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                )
                yield DagsterEvent.step_skipped_event(step_context)
                active_execution.mark_skipped(step.key)
            else:
                for step_event in check.generator(
                    dagster_event_sequence_for_step(step_context, retries)
                ):
                    check.inst(step_event, DagsterEvent)
                    yield step_event
                    active_execution.handle_event(step_event)

            active_execution.verify_complete(pipeline_context, step.key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event
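
# `_assert_missing_inputs_optional` is not shown in this file. A minimal sketch of
# what it plausibly does, assuming `uncovered_inputs` is a list of step output
# handles and `execution_plan.get_step_output(handle)` returns a definition with an
# `optional` flag (hedged; the real helper may differ in details):
def _assert_missing_inputs_optional_sketch(uncovered_inputs, execution_plan, step_key):
    non_optional = [
        handle
        for handle in uncovered_inputs
        if not execution_plan.get_step_output(handle).optional
    ]
    if non_optional:
        # a required upstream output is missing, so silently skipping would hide a bug
        raise DagsterStepOutputNotFoundError(
            'When executing {step} discovered required outputs missing from previous '
            'step: {non_optional}'.format(non_optional=non_optional, step=step_key),
            step_key=non_optional[0].step_key,
            output_name=non_optional[0].output_name,
        )
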
def inner_plan_execution_iterator(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    retries = pipeline_context.retries

    yield from copy_required_intermediates_for_execution(pipeline_context, execution_plan)

    with execution_plan.start(retries=retries) as active_execution:
        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = pipeline_context.for_step(step)
            step_event_list = []

            missing_resources = [
                resource_key
                for resource_key in step_context.required_resource_keys
                if not hasattr(step_context.resources, resource_key)
            ]
            check.invariant(
                len(missing_resources) == 0,
                (
                    "Expected step context for solid {solid_name} to have all required "
                    "resources, but missing {missing_resources}."
                ).format(solid_name=step_context.solid.name, missing_resources=missing_resources),
            )

            # capture all of the logs for this step
            with pipeline_context.instance.compute_log_manager.watch(
                step_context.pipeline_run, step_context.step.key
            ):
                for step_event in check.generator(
                    _dagster_event_sequence_for_step(step_context, retries)
                ):
                    check.inst(step_event, DagsterEvent)
                    step_event_list.append(step_event)
                    yield step_event
                    active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                step_event_list.append(event)
                yield event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, step_event_list):
                yield hook_event
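
# `_trigger_hook` is defined elsewhere. The sketch below captures the rough shape
# under loosely assumed names: hooks attached to the solid are looked up through the
# pipeline definition, each hook function receives the full list of step events, and
# hook execution is reported back as DagsterEvents. Treat every name here as an
# assumption rather than the real hook API.
def _trigger_hook_sketch(step_context, step_event_list):
    hook_defs = step_context.pipeline_def.get_all_hooks_for_handle(
        step_context.solid_handle
    )
    for hook_def in hook_defs:
        hook_context = step_context.for_hook(hook_def)  # assumed constructor
        hook_def.hook_fn(hook_context, step_event_list)
        yield DagsterEvent.hook_completed(step_context, hook_def)  # assumed event API
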
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(
            os.getpid(), execution_plan.step_keys_to_execute
        ),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
            yield event

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        active_execution = execution_plan.start()
        while not active_execution.is_complete:
            steps = active_execution.get_steps_to_execute(limit=1)
            check.invariant(
                len(steps) == 1, 'Invariant Violation: expected step to be available to execute'
            )
            step = steps[0]
            step_context = pipeline_context.for_step(step)
            check.invariant(
                all(
                    hasattr(step_context.resources, resource_key)
                    for resource_key in step_context.required_resource_keys
                ),
                'expected step context to have all required resources',
            )

            with mirror_step_io(step_context):  # capture all of the logs for this step
                uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                    step_context, step
                )
                if uncovered_inputs:
                    # In partial pipeline execution, we may end up here without having validated
                    # the missing dependent outputs were optional
                    _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                    step_context.log.info(
                        (
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                    )
                    yield DagsterEvent.step_skipped_event(step_context)
                    active_execution.mark_skipped(step.key)
                    continue

                for step_event in check.generator(dagster_event_sequence_for_step(step_context)):
                    check.inst(step_event, DagsterEvent)
                    yield step_event
                    active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(pipeline_context):
                yield event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
        ),
        event_specific_data=EngineEventData.in_process(
            os.getpid(), execution_plan.step_keys_to_execute
        ),
    )
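
# `mirror_step_io` is imported from the compute-log machinery; the real version
# tees the process streams into per-step log files. A simplified, self-contained
# sketch of the idea (`step_context` attribute names and the log path scheme here
# are assumptions):
import sys
from contextlib import contextmanager

@contextmanager
def mirror_step_io_sketch(step_context):
    log_path = '{run_id}_{step_key}.compute.log'.format(
        run_id=step_context.pipeline_run.run_id, step_key=step_context.step.key
    )
    original_stdout, original_stderr = sys.stdout, sys.stderr
    with open(log_path, 'a') as log_file:
        # redirect everything the step prints into the per-step log file
        sys.stdout = sys.stderr = log_file
        try:
            yield
        finally:
            # always restore the parent streams, even if the step raised
            sys.stdout, sys.stderr = original_stdout, original_stderr
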
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    step_levels = execution_plan.execution_step_levels()
    step_key_set = set(step.key for step_level in step_levels for step in step_level)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
            yield event

        failed_or_skipped_steps = set()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for step_level in step_levels:
            for step in step_level:
                step_context = pipeline_context.for_step(step)

                with mirror_step_io(step_context):  # capture all of the logs for this step
                    failed_inputs = []
                    for step_input in step.step_inputs:
                        failed_inputs.extend(
                            failed_or_skipped_steps.intersection(step_input.dependency_keys)
                        )

                    if failed_inputs:
                        step_context.log.info(
                            'Dependencies for step {step} failed: {failed_inputs}. Not executing.'.format(
                                step=step.key, failed_inputs=failed_inputs
                            )
                        )
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                        continue

                    uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                        step_context, step
                    )
                    if uncovered_inputs:
                        # In partial pipeline execution, we may end up here without having
                        # validated the missing dependent outputs were optional
                        _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                        step_context.log.info(
                            (
                                'Not all inputs covered for {step}. Not executing. Output missing '
                                'for inputs: {uncovered_inputs}'
                            ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                        )
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                        continue

                    for step_event in check.generator(
                        dagster_event_sequence_for_step(step_context)
                    ):
                        check.inst(step_event, DagsterEvent)
                        if step_event.is_step_failure:
                            failed_or_skipped_steps.add(step.key)
                        yield step_event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
        ),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )
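
# The `failed_or_skipped_steps` bookkeeping above can be exercised in isolation.
# A toy model of the same propagation, assuming each level is a list of
# (step_key, dependency_keys) pairs visited in topological order:
def propagate_skips(step_levels, failed_step_keys):
    failed_or_skipped = set(failed_step_keys)
    for level in step_levels:
        for step_key, dependency_keys in level:
            # a step is skipped as soon as any dependency has failed or been skipped
            if failed_or_skipped.intersection(dependency_keys):
                failed_or_skipped.add(step_key)
    return failed_or_skipped

# 'b' depends on the failed 'a', so 'b' and its downstream 'c' are skipped;
# the independent 'd' still runs
levels = [[('a', set())], [('b', {'a'})], [('c', {'b'})], [('d', set())]]
assert propagate_skips(levels, {'a'}) == {'a', 'b', 'c'}
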
def inner_plan_execution_iterator(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context", SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    retries = pipeline_context.retries

    for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
        yield event

    with execution_plan.start(retries=retries) as active_execution:
        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = pipeline_context.for_step(step)
            step_event_list = []

            missing_resources = [
                resource_key
                for resource_key in step_context.required_resource_keys
                if not hasattr(step_context.resources, resource_key)
            ]
            check.invariant(
                len(missing_resources) == 0,
                (
                    "Expected step context for solid {solid_name} to have all required "
                    "resources, but missing {missing_resources}."
                ).format(solid_name=step_context.solid.name, missing_resources=missing_resources),
            )

            with pipeline_context.instance.compute_log_manager.watch(
                step_context.pipeline_run, step_context.step.key
            ):  # capture all of the logs for this step
                uncovered_inputs = pipeline_context.intermediate_storage.uncovered_inputs(
                    step_context, step
                )
                if uncovered_inputs:
                    # In partial pipeline execution, we may end up here without having validated
                    # the missing dependent outputs were optional
                    _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                    step_context.log.info(
                        (
                            "Not all inputs covered for {step}. Not executing. Output missing "
                            "for inputs: {uncovered_inputs}"
                        ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                    )
                    step_event = DagsterEvent.step_skipped_event(step_context)
                    step_event_list.append(step_event)
                    yield step_event
                    active_execution.mark_skipped(step.key)
                else:
                    for step_event in check.generator(
                        _dagster_event_sequence_for_step(step_context, retries)
                    ):
                        check.inst(step_event, DagsterEvent)
                        step_event_list.append(step_event)
                        yield step_event
                        active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(pipeline_context):
                step_event_list.append(event)
                yield event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, step_event_list):
                yield hook_event

def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    intermediates_manager = pipeline_context.intermediates_manager

    limit = pipeline_context.executor_config.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
            yield event

        active_execution = execution_plan.start()
        active_iters = {}
        errors = {}
        term_events = {}
        stopping = False

        while (not stopping and not active_execution.is_complete) or active_iters:
            try:
                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(
                        limit=(limit - len(active_iters))
                    )

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = get_multiprocessing_context().Event()
                        active_iters[step.key] = execute_step_out_of_process(
                            step_context, step, errors, term_events
                        )

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    if term_events[key].is_set():
                        stopping = True
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skips from failures or uncovered inputs
                for event in active_execution.skipped_step_events_iterator(pipeline_context):
                    yield event

            # In the very small chance that we get interrupted in this coordination section
            # while not polling the subprocesses for events - try to clean up gracefully
            except KeyboardInterrupt:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                    EngineEventData.interrupted(list(term_events.keys())),
                )
                for event in term_events.values():
                    event.set()

        errs = {pid: err for pid, err in errors.items() if err}
        if errs:
            raise DagsterSubprocessError(
                'During multiprocess execution errors occurred in child processes:\n{error_list}'.format(
                    error_list='\n'.join(
                        [
                            'In process {pid}: {err}'.format(pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ]
                    )
                ),
                subprocess_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
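
# `execute_step_out_of_process` is referenced above but lives elsewhere; in dagster
# it delegates to the child-process command machinery. A simplified sketch of the
# pattern, assuming a hypothetical `run_step_in_child` entry point that executes the
# step in the child and puts events (then a sentinel) on a queue. Yielding None
# means "no event yet", which lets the coordination loop above keep polling the
# other children.
import multiprocessing
import queue as queue_module

_DONE_SENTINEL = 'done'  # hypothetical end-of-stream marker

def execute_step_out_of_process_sketch(step_context, step, errors, term_events):
    ctx = multiprocessing.get_context()
    event_queue = ctx.Queue()
    child = ctx.Process(
        target=run_step_in_child,  # hypothetical child entry point
        args=(step.key, event_queue, term_events[step.key]),
    )
    child.start()
    try:
        while True:
            try:
                item = event_queue.get(timeout=0.1)
            except queue_module.Empty:
                yield None  # nothing yet; give the caller a chance to poll others
                continue
            if item == _DONE_SENTINEL:
                return
            yield item
    finally:
        child.join()
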