Пример #1
0
def validate_reexecution_memoization(plan_context: IPlanContext,
                                     execution_plan: ExecutionPlan) -> None:
    """Check that re-executing a subset of steps is backed by persisted artifacts.

    Returns without error when the run has no parent run id, when every step
    of the plan is selected for execution, or when
    ``execution_plan.artifacts_persisted`` is truthy.

    Raises:
        DagsterRunNotFoundError: The parent run id is not known to the instance.
        DagsterInvariantViolationError: Only part of the plan is selected and
            the plan's artifacts are not persisted.
    """
    parent_run_id = plan_context.pipeline_run.parent_run_id
    check.opt_str_param(parent_run_id, "parent_run_id")

    # No parent run recorded, so there is nothing to validate.
    if parent_run_id is None:
        return

    parent_exists = plan_context.instance.has_run(parent_run_id)
    if not parent_exists:
        not_found_message = (
            "Run id {} set as parent run id was not found in instance".format(
                parent_run_id))
        raise DagsterRunNotFoundError(not_found_message,
                                      invalid_run_id=parent_run_id)

    # A full pipeline re-execution is exempt; otherwise persisted artifacts
    # are required.
    selected_count = len(execution_plan.step_keys_to_execute)
    total_count = len(execution_plan.steps)
    if selected_count == total_count or execution_plan.artifacts_persisted:
        return

    raise DagsterInvariantViolationError(
        "Cannot perform reexecution with in-memory io managers.\n"
        "To enable reexecution, you can set a persistent io manager, such as the "
        'fs_io_manager, in the resource_defs argument on your ModeDefinition: resource_defs={"io_manager": fs_io_manager}'
    )
Пример #2
0
def _check_reexecution_config(pipeline_context, execution_plan, run_config):
    """Validate the re-execution settings of ``run_config``.

    Checks, in order: that run storage is set and persistent, that the
    previous run id exists in run storage, and that every step output handle
    named in the re-execution config refers to an existing step and output of
    ``execution_plan``.

    Raises:
        DagsterInvariantViolationError: Run storage is not persistent.
        DagsterRunNotFoundError: The previous run id is not in run storage.
        DagsterExecutionStepNotFoundError: A handle names a step that is not
            in the plan.
        DagsterStepOutputNotFoundError: A handle names an output its step
            does not have.
    """
    run_storage = pipeline_context.run_storage
    check.invariant(run_storage)

    if not run_storage.is_persistent:
        raise DagsterInvariantViolationError(
            'Cannot perform reexecution with non persistent run storage.')

    reexecution_config = run_config.reexecution_config
    previous_run_id = reexecution_config.previous_run_id

    if not run_storage.has_run(previous_run_id):
        message = 'Run id {} set as previous run id was not found in run storage'.format(
            previous_run_id)
        raise DagsterRunNotFoundError(message, invalid_run_id=previous_run_id)

    for handle in run_config.reexecution_config.step_output_handles:
        step_key = handle.step_key

        if not execution_plan.has_step(step_key):
            missing_step_message = (
                'Step {step_key} was specified as a step from a previous run. '
                'It does not exist.'
            ).format(step_key=step_key)
            raise DagsterExecutionStepNotFoundError(missing_step_message,
                                                    step_key=step_key)

        output_name = handle.output_name
        step = execution_plan.get_step_by_key(step_key)
        if step.has_step_output(output_name):
            continue

        missing_output_message = (
            'You specified a step_output_handle in the ReexecutionConfig that does '
            'not exist: Step {step_key} does not have output {output_name}.'
        ).format(step_key=step_key, output_name=output_name)
        raise DagsterStepOutputNotFoundError(
            missing_output_message,
            step_key=step_key,
            output_name=output_name,
        )
Пример #3
0
def validate_reexecution_memoization(plan_context, execution_plan):
    """Check that re-executing a subset of steps is backed by persisted artifacts.

    Returns without error when the run has no parent run id, when every step
    of the plan is selected for execution, or when
    ``execution_plan.artifacts_persisted`` is truthy.

    Raises:
        DagsterRunNotFoundError: The parent run id is not known to the instance.
        DagsterInvariantViolationError: Only part of the plan is selected and
            the plan's artifacts are not persisted.
    """
    check.inst_param(plan_context, "plan_context", IPlanContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    parent_run_id = plan_context.pipeline_run.parent_run_id
    check.opt_str_param(parent_run_id, "parent_run_id")

    # No parent run recorded, so there is nothing to validate.
    if parent_run_id is None:
        return

    if not plan_context.instance.has_run(parent_run_id):
        not_found_message = (
            "Run id {} set as parent run id was not found in instance".format(
                parent_run_id))
        raise DagsterRunNotFoundError(not_found_message,
                                      invalid_run_id=parent_run_id)

    # A full pipeline re-execution is exempt from the persistence check.
    runs_every_step = (len(execution_plan.step_keys_to_execute) == len(
        execution_plan.steps))
    if runs_every_step or execution_plan.artifacts_persisted:
        return

    storage_class_name = plan_context.intermediate_storage.__class__.__name__
    failure_template = (
        "Cannot perform reexecution with in-memory io managers.\n"
        "You may have configured non persistent intermediate storage `{}` for reexecution. "
        "Intermediate Storage is deprecated in 0.10.0 and will be removed in a future release."
    )
    raise DagsterInvariantViolationError(
        failure_template.format(storage_class_name))
Пример #4
0
def validate_reexecution_memoization(pipeline_context, execution_plan):
    """Check that re-executing a subset of steps uses a persistent intermediates manager.

    Returns without error when the run has no parent run id, when every step
    of the plan is selected for execution, or when the context's
    intermediates manager reports ``is_persistent``.

    Raises:
        DagsterRunNotFoundError: The parent run id is not known to the instance.
        DagsterInvariantViolationError: Only part of the plan is selected and
            the intermediates manager is not persistent.
    """
    check.inst_param(pipeline_context, 'pipeline_context',
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    parent_run_id = pipeline_context.pipeline_run.parent_run_id
    check.opt_str_param(parent_run_id, 'parent_run_id')

    # No parent run recorded, so there is nothing to validate.
    if parent_run_id is None:
        return

    if not pipeline_context.instance.has_run(parent_run_id):
        not_found_message = (
            'Run id {} set as parent run id was not found in instance'.format(
                parent_run_id))
        raise DagsterRunNotFoundError(not_found_message,
                                      invalid_run_id=parent_run_id)

    # A full pipeline re-execution is exempt from the persistence check.
    selected_count = len(execution_plan.step_keys_to_execute)
    if selected_count == len(execution_plan.steps):
        return

    if pipeline_context.intermediates_manager.is_persistent:
        return

    manager_class_name = pipeline_context.intermediates_manager.__class__.__name__
    raise DagsterInvariantViolationError(
        'Cannot perform reexecution with non persistent intermediates manager `{}`.'
        .format(manager_class_name))
Пример #5
0
def validate_reexecution_memoization(pipeline_context, execution_plan):
    """Check that re-executing a subset of steps can load prior outputs.

    Returns without error when the run has no parent run id, when every step
    of the plan is selected for execution, when the intermediate storage
    reports ``is_persistent``, or when
    ``check_all_asset_stores_non_mem_for_reexecution`` returns anything other
    than ``False``.

    Raises:
        DagsterRunNotFoundError: The parent run id is not known to the instance.
        DagsterInvariantViolationError: None of the conditions above hold.
    """
    check.inst_param(pipeline_context, "pipeline_context",
                     SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    parent_run_id = pipeline_context.pipeline_run.parent_run_id
    check.opt_str_param(parent_run_id, "parent_run_id")

    # No parent run recorded, so there is nothing to validate.
    if parent_run_id is None:
        return

    if not pipeline_context.instance.has_run(parent_run_id):
        not_found_message = (
            "Run id {} set as parent run id was not found in instance".format(
                parent_run_id))
        raise DagsterRunNotFoundError(not_found_message,
                                      invalid_run_id=parent_run_id)

    # A full pipeline re-execution is exempt from the checks below.
    if len(execution_plan.step_keys_to_execute) == len(execution_plan.steps):
        return

    # Remove this once intermediate storage is fully deprecated:
    # https://github.com/dagster-io/dagster/issues/3043
    if pipeline_context.intermediate_storage.is_persistent:
        return

    # Non-in-memory asset stores configured on the required steps are also
    # acceptable; only an explicit False from the helper is a failure.
    asset_stores_ok = check_all_asset_stores_non_mem_for_reexecution(
        pipeline_context, execution_plan)
    if asset_stores_ok is not False:
        return

    storage_class_name = pipeline_context.intermediate_storage.__class__.__name__
    failure_template = (
        "Cannot perform reexecution with in-memory asset stores.\n"
        "You may have configured non persistent intermediate storage `{}` for reexecution. "
        "Intermediate Storage is deprecated in 0.10.0 and will be removed in 0.11.0."
    )
    raise DagsterInvariantViolationError(
        failure_template.format(storage_class_name))
Пример #6
0
def validate_retry_memoization(pipeline_context, execution_plan):
    """Check that a retry over a subset of steps uses a persistent intermediates manager.

    Returns without error when the run has no parent run id, when every step
    of the plan is selected for execution, or when the context's
    intermediates manager reports ``is_persistent``.

    Raises:
        DagsterRunNotFoundError: The parent run id is not known to the instance.
        DagsterInvariantViolationError: Only part of the plan is selected and
            the intermediates manager is not persistent.
    """
    check.inst_param(pipeline_context, 'pipeline_context',
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    parent_run_id = pipeline_context.pipeline_run.parent_run_id
    check.opt_str_param(parent_run_id, 'parent_run_id')

    # No parent run recorded, so there is nothing to validate.
    if parent_run_id is None:
        return

    parent_exists = pipeline_context.instance.has_run(parent_run_id)
    if not parent_exists:
        not_found_message = (
            'Run id {} set as parent run id was not found in instance'.format(
                parent_run_id))
        raise DagsterRunNotFoundError(not_found_message,
                                      invalid_run_id=parent_run_id)

    # Short-term proxy for telling re-execution apart from retries:
    # resume/retry always covers a subset of the execution plan, while
    # re-execution always covers the full plan (until dagit enables
    # re-execution of subsets).
    covers_full_plan = (len(execution_plan.step_keys_to_execute) == len(
        execution_plan.steps))
    if covers_full_plan:
        return

    intermediates_manager = pipeline_context.intermediates_manager
    if intermediates_manager.is_persistent:
        return

    raise DagsterInvariantViolationError(
        'Cannot perform reexecution with non persistent intermediates manager `{}`.'
        .format(pipeline_context.intermediates_manager.__class__.__name__))
Пример #7
0
    def get_run_group(self, run_id: str) -> Optional[Tuple[str, Iterable[PipelineRun]]]:
        """Return the root run id and the runs of the group that ``run_id`` belongs to.

        The root is the run's ``root_run_id`` when set, otherwise the run
        itself. Group members are found through run tags: every row in the
        tags table whose key is ``ROOT_RUN_ID_TAG`` and whose value is the
        root run id.

        Args:
            run_id: Id of any run in the group.

        Returns:
            A ``(root_run_id, runs)`` tuple where ``runs`` starts with the
            root run, followed by the runs tagged with that root.

        Raises:
            DagsterRunNotFoundError: ``run_id`` or its root run cannot be
                loaded from storage.
        """
        check.str_param(run_id, "run_id")
        pipeline_run = self.get_run_by_id(run_id)
        if not pipeline_run:
            raise DagsterRunNotFoundError(
                f"Run {run_id} was not found in instance.", invalid_run_id=run_id
            )

        # find root_run
        root_run_id = pipeline_run.root_run_id if pipeline_run.root_run_id else pipeline_run.run_id
        root_run = self.get_run_by_id(root_run_id)
        if not root_run:
            # root_run is falsy in this branch, so report the id that was
            # looked up rather than the (missing) run object.
            raise DagsterRunNotFoundError(
                f"Run id {root_run_id} set as root run id for run {run_id} was not found in instance.",
                invalid_run_id=root_run_id,
            )

        # root_run_id to run_id 1:1 mapping
        # https://github.com/dagster-io/dagster/issues/2495
        # Note: we currently use tags to persist the run group info
        root_to_run = (
            db.select(
                [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]
            )
            .where(
                db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)
            )
            .alias("root_to_run")
        )
        # get run group
        run_group_query = (
            db.select([RunsTable.c.run_body])
            .select_from(
                root_to_run.join(
                    RunsTable,
                    root_to_run.c.run_id == RunsTable.c.run_id,
                    isouter=True,
                )
            )
            .alias("run_group")
        )

        with self.connect() as conn:
            res = conn.execute(run_group_query)
            run_group = self._rows_to_runs(res)

        return (root_run_id, [root_run] + run_group)
Пример #8
0
    def add_run_tags(self, run_id: str, new_tags: Dict[str, str]):
        """Merge ``new_tags`` into a stored run and mirror them in the tags table.

        The run row is rewritten with the merged tags, the partition columns
        taken from the merged tags, and a fresh update timestamp. Tag rows
        whose key already exists on the run are updated in place; the
        remaining keys are inserted as new rows.

        Args:
            run_id: Id of the run to tag.
            new_tags: Tag keys and values to add or overwrite.

        Raises:
            DagsterRunNotFoundError: No run with ``run_id`` can be loaded.
        """
        check.str_param(run_id, "run_id")
        check.dict_param(new_tags, "new_tags", key_type=str, value_type=str)

        run = self.get_run_by_id(run_id)
        if not run:
            raise DagsterRunNotFoundError(
                f"Run {run_id} was not found in instance.", invalid_run_id=run_id
            )

        previous_tags = run.tags or {}
        merged_tags = merge_dicts(previous_tags, new_tags)
        partition_name = merged_tags.get(PARTITION_NAME_TAG)
        partition_set_name = merged_tags.get(PARTITION_SET_TAG)

        with self.connect() as conn:
            retagged_run = run.with_tags(merge_dicts(previous_tags, new_tags))
            run_row_update = (
                RunsTable.update()  # pylint: disable=no-value-for-parameter
                .where(RunsTable.c.run_id == run_id)
                .values(
                    run_body=serialize_dagster_namedtuple(retagged_run),
                    partition=partition_name,
                    partition_set=partition_set_name,
                    update_timestamp=pendulum.now("UTC"),
                )
            )
            conn.execute(run_row_update)

            previous_keys = set(previous_tags)
            incoming_keys = set(new_tags)

            # Keys already present need an UPDATE; the rest need an INSERT.
            overwritten_keys = previous_keys & incoming_keys
            inserted_keys = incoming_keys.difference(overwritten_keys)

            for key in overwritten_keys:
                tag_row_update = (
                    RunTagsTable.update()  # pylint: disable=no-value-for-parameter
                    .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == key))
                    .values(value=new_tags[key])
                )
                conn.execute(tag_row_update)

            if inserted_keys:
                new_rows = [
                    {"run_id": run_id, "key": key, "value": new_tags[key]}
                    for key in inserted_keys
                ]
                conn.execute(
                    RunTagsTable.insert(),  # pylint: disable=no-value-for-parameter
                    new_rows,
                )
Пример #9
0
 def get_run_group(self,
                   run_id: str) -> Optional[Tuple[str, List[PipelineRun]]]:
     """Return the root run id and the runs of the group that ``run_id`` belongs to.

     The root is the run referenced by ``root_run_id`` when that is set,
     otherwise the run itself. Group members are all stored runs whose
     ``root_run_id`` equals the root's ``run_id``.

     Args:
         run_id: Id of any run in the group.

     Returns:
         A ``(root_run_id, runs)`` tuple where ``runs`` starts with the root
         run, or ``None`` when the referenced root run cannot be loaded.

     Raises:
         DagsterRunNotFoundError: ``run_id`` is not among the stored runs.
     """
     check.str_param(run_id, "run_id")
     pipeline_run = self._runs.get(run_id)
     if not pipeline_run:
         raise DagsterRunNotFoundError(
             f"Run {run_id} was not found in instance.",
             invalid_run_id=run_id)
     # if the run doesn't have root_run_id, itself is the root
     root_run = (self.get_run_by_id(pipeline_run.root_run_id)
                 if pipeline_run.root_run_id else pipeline_run)
     if not root_run:
         return None
     run_group = [root_run]
     for curr_run in self._runs.values():
         if curr_run.root_run_id == root_run.run_id:
             run_group.append(curr_run)
     # The group is keyed by the root's own run_id. When the requested run is
     # itself the root, its root_run_id is unset, so returning that field
     # would yield an empty id.
     return (root_run.run_id, run_group)
Пример #10
0
def validate_retry_memoization(pipeline_context, execution_plan):
    """Check that a plan referencing a previous run can be retried.

    Returns without error when ``execution_plan.previous_run_id`` is falsy.
    Otherwise the intermediates manager must be persistent and the previous
    run must exist in the instance; the checks run in that order.

    Raises:
        DagsterInvariantViolationError: The intermediates manager is not
            persistent.
        DagsterRunNotFoundError: The previous run id is not known to the
            instance.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    previous_run_id = execution_plan.previous_run_id
    if not previous_run_id:
        return

    intermediates_manager = pipeline_context.intermediates_manager
    if not intermediates_manager.is_persistent:
        manager_class_name = pipeline_context.intermediates_manager.__class__.__name__
        raise DagsterInvariantViolationError(
            'Cannot perform reexecution with non persistent intermediates manager `{}`.'.format(
                manager_class_name
            )
        )

    if pipeline_context.instance.has_run(previous_run_id):
        return

    raise DagsterRunNotFoundError(
        'Run id {} set as previous run id was not found in instance'.format(previous_run_id),
        invalid_run_id=previous_run_id,
    )
Пример #11
0
def _run_in_subprocess(
    serialized_execute_run_args,
    recon_pipeline,
    termination_event,
    subprocess_status_handler,
    run_event_handler,
):
    """Load a run from serialized arguments and execute it, reporting through the handlers.

    Setup (deserializing the arguments, building the instance, loading the
    run) happens first. If setup fails, an ``IPCErrorMessage`` followed by
    ``RunInSubprocessComplete`` is sent to ``subprocess_status_handler`` and
    the function returns. Otherwise ``StartRunInSubprocessSuccessful`` is
    sent, every event produced by ``_core_execute_run`` is passed to
    ``run_event_handler``, and ``RunInSubprocessComplete`` is sent at the end.

    Args:
        serialized_execute_run_args: JSON-serialized
            ``ExecuteExternalPipelineArgs``.
        recon_pipeline: Pipeline handle forwarded to ``_core_execute_run``.
        termination_event: Event handed to ``start_termination_thread``.
        subprocess_status_handler: Callable receiving process status messages.
        run_event_handler: Callable receiving run and engine events.
    """

    start_termination_thread(termination_event)

    # Bind before the try block: the except handler below inspects
    # ``instance``. If deserialization or ``from_ref`` failed while the name
    # was still unbound, the handler would raise UnboundLocalError.
    instance = None
    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            serialized_execute_run_args)
        check.inst_param(execute_run_args, "execute_run_args",
                         ExecuteExternalPipelineArgs)

        instance = DagsterInstance.from_ref(execute_run_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)

        if not pipeline_run:
            raise DagsterRunNotFoundError(
                "gRPC server could not load run {run_id} in order to execute it. Make sure that the gRPC server has access to your run storage."
                .format(run_id=execute_run_args.pipeline_run_id),
                invalid_run_id=execute_run_args.pipeline_run_id,
            )

        pid = os.getpid()

    except:  # pylint: disable=bare-except
        # Any setup failure is reported to the parent instead of propagating.
        serializable_error_info = serializable_error_info_from_exc_info(
            sys.exc_info())
        event = IPCErrorMessage(
            serializable_error_info=serializable_error_info,
            message="Error during RPC setup for executing run: {message}".
            format(message=serializable_error_info.message),
        )
        subprocess_status_handler(event)
        subprocess_status_handler(RunInSubprocessComplete())
        if instance is not None:
            instance.dispose()
        return

    subprocess_status_handler(StartRunInSubprocessSuccessful())

    run_event_handler(
        instance.report_engine_event(
            "Started process for pipeline (pid: {pid}).".format(pid=pid),
            pipeline_run,
            EngineEventData.in_process(pid,
                                       marker_end="cli_api_subprocess_init"),
        ))

    # This is so nasty but seemingly unavoidable
    # https://amir.rachum.com/blog/2017/03/03/generator-cleanup/
    closed = False
    try:
        for event in _core_execute_run(recon_pipeline, pipeline_run, instance):
            run_event_handler(event)
    except GeneratorExit:
        closed = True
        raise
    finally:
        # Skip the exit event when unwinding from GeneratorExit; completion
        # and instance cleanup always run.
        if not closed:
            run_event_handler(
                instance.report_engine_event(
                    "Process for pipeline exited (pid: {pid}).".format(
                        pid=pid),
                    pipeline_run,
                ))
        subprocess_status_handler(RunInSubprocessComplete())
        instance.dispose()