Example #1
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        step_levels = execution_plan.execution_step_levels()

        intermediates_manager = pipeline_context.intermediates_manager

        limit = pipeline_context.executor_config.max_concurrent

        step_key_set = set(step.key for step in execution_plan.execution_steps())

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
                pid=os.getpid()
            ),
            event_specific_data=EngineEventData.multiprocess(
                os.getpid(), step_keys_to_execute=step_key_set
            ),
        )

        # It would be good to implement a reference tracking algorithm here so we could
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        with time_execution_scope() as timer_result:
            for event in copy_required_intermediates_for_execution(
                pipeline_context, execution_plan
            ):
                yield event

            for step_level in step_levels:
                step_contexts_to_execute = []
                for step in step_level:
                    step_context = pipeline_context.for_step(step)

                    if not intermediates_manager.all_inputs_covered(step_context, step):
                        uncovered_inputs = intermediates_manager.uncovered_inputs(
                            step_context, step
                        )
                        step_context.log.error(
                            (
                                'Not all inputs covered for {step}. Not executing. '
                                'Output missing for inputs: {uncovered_inputs}'
                            ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                        )
                        continue

                    step_contexts_to_execute.append(step_context)

                for step_event in bounded_parallel_executor(step_contexts_to_execute, limit):
                    yield step_event

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
                duration=format_duration(timer_result.millis), pid=os.getpid()
            ),
            event_specific_data=EngineEventData.multiprocess(os.getpid()),
        )
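
bounded_parallel_executor is called in Example #1 but not defined in the
excerpt. As a hedged illustration only, here is a minimal sketch of the
bounded-concurrency pattern its name and `limit` argument suggest, using the
standard library; bounded_parallel_executor_sketch and run_step are
hypothetical stand-ins, not Dagster APIs.

from concurrent.futures import ProcessPoolExecutor, as_completed

def bounded_parallel_executor_sketch(step_contexts, limit, run_step):
    # Run at most `limit` steps at a time and stream each step's events
    # back to the caller as its worker process finishes.
    with ProcessPoolExecutor(max_workers=limit) as pool:
        futures = [pool.submit(run_step, ctx) for ctx in step_contexts]
        for future in as_completed(futures):
            for event in future.result():
                yield event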
Example #2
def inner_plan_execution_iterator(pipeline_context, execution_plan, retries):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.inst_param(retries, 'retries', Retries)

    for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
        yield event

    # It would be good to implement a reference tracking algorithm here to
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    active_execution = execution_plan.start(retries=retries)
    while not active_execution.is_complete:
        step = active_execution.get_next_step()

        step_context = pipeline_context.for_step(step)
        check.invariant(
            all(
                hasattr(step_context.resources, resource_key)
                for resource_key in step_context.required_resource_keys
            ),
            'expected step context to have all required resources',
        )

        with pipeline_context.instance.compute_log_manager.watch(
            step_context.pipeline_run, step_context.step.key
        ):
            # capture all of the logs for this step
            uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                step_context, step
            )
            if uncovered_inputs:
                # In partial pipeline execution, we may end up here without having validated
                # that the missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                step_context.log.info(
                    (
                        'Not all inputs covered for {step}. Not executing. Output missing for '
                        'inputs: {uncovered_inputs}'
                    ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                )
                yield DagsterEvent.step_skipped_event(step_context)
                active_execution.mark_skipped(step.key)
            else:
                for step_event in check.generator(
                    dagster_event_sequence_for_step(step_context, retries)
                ):
                    check.inst(step_event, DagsterEvent)
                    yield step_event
                    active_execution.handle_event(step_event)

            active_execution.verify_complete(pipeline_context, step.key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event
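
The loop in Example #2 leans on active_execution to hand out only steps whose
upstreams have resolved, updating its state through handle_event and
mark_skipped. A toy sketch of that scheduling contract, inferred from the call
sites (not the real ExecutionPlan machinery):

class ToyActiveExecution:
    def __init__(self, deps):
        self._deps = deps          # step key -> set of upstream step keys
        self._resolved = set()     # steps that have succeeded or been skipped
        self._pending = set(deps)  # steps not yet handed out

    @property
    def is_complete(self):
        return not self._pending

    def get_next_step(self):
        # Hand out any pending step whose upstreams are all resolved.
        for key in sorted(self._pending):
            if self._deps[key] <= self._resolved:
                self._pending.discard(key)
                return key
        raise RuntimeError('no step is ready to execute')

    def mark_skipped(self, key):
        self._resolved.add(key)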
Example #3
def inner_plan_execution_iterator(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context",
                     SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    retries = pipeline_context.retries

    yield from copy_required_intermediates_for_execution(
        pipeline_context, execution_plan)

    with execution_plan.start(retries=retries) as active_execution:

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = pipeline_context.for_step(step)
            step_event_list = []

            missing_resources = [
                resource_key
                for resource_key in step_context.required_resource_keys
                if not hasattr(step_context.resources, resource_key)
            ]
            check.invariant(
                len(missing_resources) == 0,
                ("Expected step context for solid {solid_name} to have all required resources, but "
                 "missing {missing_resources}.").format(
                     solid_name=step_context.solid.name,
                     missing_resources=missing_resources),
            )

            # capture all of the logs for this step
            with pipeline_context.instance.compute_log_manager.watch(
                    step_context.pipeline_run, step_context.step.key):

                for step_event in check.generator(
                        _dagster_event_sequence_for_step(
                            step_context, retries)):
                    check.inst(step_event, DagsterEvent)
                    step_event_list.append(step_event)
                    yield step_event
                    active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(
                    pipeline_context):
                step_event_list.append(event)
                yield event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, step_event_list):
                yield hook_event
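
In Example #3, step_event_list accumulates everything a step emitted so that
_trigger_hook (not shown) can fire user hooks once the step has fully settled.
A speculative sketch of that pattern with hypothetical hook callables:

def trigger_hooks_sketch(step_context, step_event_list, hooks):
    # Once a step settles, give every hook the full event list so it can
    # decide, e.g. on failure, whether to emit its own events.
    failed = any(getattr(event, 'is_step_failure', False) for event in step_event_list)
    for hook in hooks:
        for hook_event in hook(step_context, step_event_list, failed):
            yield hook_event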
Example #4
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), execution_plan.step_keys_to_execute),
        )

        with time_execution_scope() as timer_result:
            check.param_invariant(
                isinstance(pipeline_context.executor_config, ExecutorConfig),
                'pipeline_context',
                'Expected executor_config to be ExecutorConfig got {}'.format(
                    pipeline_context.executor_config),
            )

            for event in copy_required_intermediates_for_execution(
                    pipeline_context, execution_plan):
                yield event

            # It would be good to implement a reference tracking algorithm here to
            # garbage collect results that are no longer needed by any steps
            # https://github.com/dagster-io/dagster/issues/811
            active_execution = execution_plan.start()
            while not active_execution.is_complete:

                steps = active_execution.get_steps_to_execute(limit=1)
                check.invariant(
                    len(steps) == 1,
                    'Invariant Violation: expected step to be available to execute'
                )
                step = steps[0]
                step_context = pipeline_context.for_step(step)
                check.invariant(
                    all(
                        hasattr(step_context.resources, resource_key) for
                        resource_key in step_context.required_resource_keys),
                    'expected step context to have all required resources',
                )

                with mirror_step_io(step_context):
                    # capture all of the logs for this step
                    uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                        step_context, step)
                    if uncovered_inputs:
                        # In partial pipeline execution, we may end up here without having
                        # validated that the missing dependent outputs were optional
                        _assert_missing_inputs_optional(
                            uncovered_inputs, execution_plan, step.key)

                        step_context.log.info((
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}').format(
                                uncovered_inputs=uncovered_inputs,
                                step=step.key))
                        yield DagsterEvent.step_skipped_event(step_context)
                        active_execution.mark_skipped(step.key)
                        continue

                    for step_event in check.generator(
                            dagster_event_sequence_for_step(step_context)):
                        check.inst(step_event, DagsterEvent)
                        yield step_event
                        active_execution.handle_event(step_event)

                    active_execution.verify_complete(pipeline_context,
                                                     step.key)

                # process skips from failures or uncovered inputs
                for event in active_execution.skipped_step_events_iterator(
                        pipeline_context):
                    yield event

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
                pid=os.getpid(),
                duration_ms=format_duration(timer_result.millis)),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), execution_plan.step_keys_to_execute),
        )
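
time_execution_scope appears in several of these engines but is never defined
here. A minimal sketch of a compatible timer context manager, assuming only
what the call sites show: the yielded object must expose .millis after the
block exits.

import time
from contextlib import contextmanager

class TimerResultSketch:
    def __init__(self):
        self.start = time.time()
        self.end = None

    @property
    def millis(self):
        # Elapsed wall-clock time in milliseconds, valid once the scope exits.
        return (self.end - self.start) * 1000

@contextmanager
def time_execution_scope():
    result = TimerResultSketch()
    yield result
    result.end = time.time()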
Example #5
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        step_levels = execution_plan.execution_step_levels()
        step_key_set = set(step.key for step_level in step_levels
                           for step in step_level)

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), step_key_set),
        )

        with time_execution_scope() as timer_result:
            check.param_invariant(
                isinstance(pipeline_context.executor_config, ExecutorConfig),
                'pipeline_context',
                'Expected executor_config to be ExecutorConfig got {}'.format(
                    pipeline_context.executor_config),
            )

            for event in copy_required_intermediates_for_execution(
                    pipeline_context, execution_plan):
                yield event

            failed_or_skipped_steps = set()

            # It would be good to implement a reference tracking algorithm here to
            # garbage collect results that are no longer needed by any steps
            # https://github.com/dagster-io/dagster/issues/811
            for step_level in step_levels:
                for step in step_level:
                    step_context = pipeline_context.for_step(step)

                    with mirror_step_io(step_context):
                        # capture all of the logs for this step

                        failed_inputs = []
                        for step_input in step.step_inputs:
                            failed_inputs.extend(
                                failed_or_skipped_steps.intersection(
                                    step_input.dependency_keys))

                        if failed_inputs:
                            step_context.log.info((
                                'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                            ).format(step=step.key,
                                     failed_inputs=failed_inputs))
                            failed_or_skipped_steps.add(step.key)
                            yield DagsterEvent.step_skipped_event(step_context)
                            continue

                        uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                            step_context, step)
                        if uncovered_inputs:
                            # In partial pipeline execution, we may end up here without having
                            # validated that the missing dependent outputs were optional
                            _assert_missing_inputs_optional(
                                uncovered_inputs, execution_plan, step.key)

                            step_context.log.info((
                                'Not all inputs covered for {step}. Not executing. Output missing for '
                                'inputs: {uncovered_inputs}').format(
                                    uncovered_inputs=uncovered_inputs,
                                    step=step.key))
                            failed_or_skipped_steps.add(step.key)
                            yield DagsterEvent.step_skipped_event(step_context)
                            continue

                        for step_event in check.generator(
                                dagster_event_sequence_for_step(step_context)):
                            check.inst(step_event, DagsterEvent)
                            if step_event.is_step_failure:
                                failed_or_skipped_steps.add(step.key)

                            yield step_event

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
                pid=os.getpid(),
                duration_ms=format_duration(timer_result.millis)),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), step_key_set),
        )
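
The failed_or_skipped_steps set in Example #5 cascades skips level by level: a
step is skipped as soon as any of its dependency keys intersects the set, and
its own key is then added so that its dependents skip too. A small worked
example with hypothetical step keys:

failed_or_skipped_steps = {'load'}                # 'load' has already failed
step_dependencies = {'transform': {'load'}, 'report': {'transform'}}

for step_key in ('transform', 'report'):          # topological order
    failed_inputs = failed_or_skipped_steps.intersection(step_dependencies[step_key])
    if failed_inputs:
        failed_or_skipped_steps.add(step_key)     # cascade to dependents

print(failed_or_skipped_steps)  # {'load', 'transform', 'report'}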
Example #6
def inner_plan_execution_iterator(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, "pipeline_context",
                     SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    retries = pipeline_context.retries

    for event in copy_required_intermediates_for_execution(
            pipeline_context, execution_plan):
        yield event

    with execution_plan.start(retries=retries) as active_execution:

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = pipeline_context.for_step(step)
            step_event_list = []

            missing_resources = [
                resource_key
                for resource_key in step_context.required_resource_keys
                if not hasattr(step_context.resources, resource_key)
            ]
            check.invariant(
                len(missing_resources) == 0,
                ("Expected step context for solid {solid_name} to have all required resources, but "
                 "missing {missing_resources}.").format(
                     solid_name=step_context.solid.name,
                     missing_resources=missing_resources),
            )

            with pipeline_context.instance.compute_log_manager.watch(
                    step_context.pipeline_run, step_context.step.key):
                # capture all of the logs for this step
                uncovered_inputs = pipeline_context.intermediate_storage.uncovered_inputs(
                    step_context, step)
                if uncovered_inputs:
                    # In partial pipeline execution, we may end up here without having
                    # validated that the missing dependent outputs were optional
                    _assert_missing_inputs_optional(uncovered_inputs,
                                                    execution_plan, step.key)

                    step_context.log.info((
                        "Not all inputs covered for {step}. Not executing. Output missing for "
                        "inputs: {uncovered_inputs}").format(
                            uncovered_inputs=uncovered_inputs, step=step.key))
                    step_event = DagsterEvent.step_skipped_event(step_context)
                    step_event_list.append(step_event)
                    yield step_event
                    active_execution.mark_skipped(step.key)
                else:
                    for step_event in check.generator(
                            _dagster_event_sequence_for_step(
                                step_context, retries)):
                        check.inst(step_event, DagsterEvent)
                        step_event_list.append(step_event)
                        yield step_event
                        active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                step_event_list.append(event)
                yield event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, step_event_list):
                yield hook_event
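
compute_log_manager.watch wraps each step so everything it prints can be
captured per step. A bare-bones sketch of that capture idea using only the
standard library; the real manager also persists and serves the streams rather
than merely redirecting them:

from contextlib import contextmanager, redirect_stdout, redirect_stderr

@contextmanager
def watch_sketch(log_path):
    # Redirect the step's stdout and stderr into a per-step log file.
    with open(log_path, 'w') as log_file, redirect_stdout(log_file), redirect_stderr(log_file):
        yield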
Example #7
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        intermediates_manager = pipeline_context.intermediates_manager

        limit = pipeline_context.executor_config.max_concurrent

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Executing steps using multiprocess engine: parent process (pid: {pid})'
            .format(pid=os.getpid()),
            event_specific_data=EngineEventData.multiprocess(
                os.getpid(),
                step_keys_to_execute=execution_plan.step_keys_to_execute),
        )

        # It would be good to implement a reference tracking algorithm here so we could
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        with time_execution_scope() as timer_result:

            for event in copy_required_intermediates_for_execution(
                    pipeline_context, execution_plan):
                yield event

            active_execution = execution_plan.start()
            active_iters = {}
            errors = {}
            term_events = {}
            stopping = False

            while (not stopping
                   and not active_execution.is_complete) or active_iters:
                try:
                    # start iterators
                    while len(active_iters) < limit and not stopping:
                        steps = active_execution.get_steps_to_execute(
                            limit=(limit - len(active_iters)))

                        if not steps:
                            break

                        for step in steps:
                            step_context = pipeline_context.for_step(step)
                            term_events[
                                step.key] = get_multiprocessing_context(
                                ).Event()
                            active_iters[
                                step.key] = execute_step_out_of_process(
                                    step_context, step, errors, term_events)

                    # process active iterators
                    empty_iters = []
                    for key, step_iter in active_iters.items():
                        try:
                            event_or_none = next(step_iter)
                            if event_or_none is None:
                                continue
                            else:
                                yield event_or_none
                                active_execution.handle_event(event_or_none)

                        except StopIteration:
                            empty_iters.append(key)

                    # clear and mark complete finished iterators
                    for key in empty_iters:
                        del active_iters[key]
                        if term_events[key].is_set():
                            stopping = True
                        del term_events[key]
                        active_execution.verify_complete(pipeline_context, key)

                    # process skips from failures or uncovered inputs
                    for event in active_execution.skipped_step_events_iterator(
                            pipeline_context):
                        yield event

                # On the small chance that we get interrupted in this coordination section
                # rather than while polling the subprocesses for events, try to clean up
                # gracefully
                except KeyboardInterrupt:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                        EngineEventData.interrupted(list(term_events.keys())),
                    )
                    for event in term_events.values():
                        event.set()

            errs = {pid: err for pid, err in errors.items() if err}
            if errs:
                raise DagsterSubprocessError(
                    'During multiprocess execution errors occurred in child processes:\n{error_list}'
                    .format(error_list='\n'.join([
                        'In process {pid}: {err}'.format(pid=pid,
                                                         err=err.to_string())
                        for pid, err in errs.items()
                    ])),
                    subprocess_error_infos=list(errs.values()),
                )

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'
            .format(duration=format_duration(timer_result.millis),
                    pid=os.getpid()),
            event_specific_data=EngineEventData.multiprocess(os.getpid()),
        )
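
On KeyboardInterrupt, the parent in Example #7 does not kill its children; it
sets a per-step term_event that each child is expected to poll. A toy,
self-contained sketch of that cooperative-shutdown handshake; the worker here
is a hypothetical stand-in, not execute_step_out_of_process:

import multiprocessing
import time

def worker(term_event):
    # Simulate a long-running step that polls for termination between
    # units of work, as the engine's child processes are expected to.
    for _ in range(100):
        if term_event.is_set():
            return                 # stop promptly once the parent asks
        time.sleep(0.05)

if __name__ == '__main__':
    term_event = multiprocessing.Event()
    child = multiprocessing.Process(target=worker, args=(term_event,))
    child.start()
    try:
        child.join()
    except KeyboardInterrupt:
        term_event.set()           # forward the interrupt to the child
        child.join()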