Example #1
def test_execute_execute_plan_mutation_raw():
    pipeline_name = "sleepy_pipeline"
    pipeline = sleepy_recon_pipeline()
    instance = DagsterInstance.local_temp()

    workspace = create_in_process_ephemeral_workspace(
        pointer=pipeline.repository.pointer)

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition())
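    # GraphQL variables for the executePlan mutation, targeting the in-process repository location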
    variables = {
        "executionParams": {
            "runConfigData": {},
            "mode": "default",
            "selector": {
                "repositoryLocationName": IN_PROCESS_NAME,
                "repositoryName": get_ephemeral_repository_name(pipeline_name),
                "pipelineName": pipeline_name,
            },
            "executionMetadata": {
                "runId": pipeline_run.run_id
            },
        }
    }
    result = execute_execute_plan_mutation_raw(workspace,
                                               variables,
                                               instance_ref=instance.get_ref())
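    # Collect (event type, step key) pairs from the event records yielded by the raw mutation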
    seen_events = set()
    for event in result:
        seen_events.add((event.dagster_event.event_type_value, event.step_key))

    assert seen_events == EXPECTED_EVENTS
Example #2
def invoke_steps_within_python_operator(invocation_args, ts, dag_run,
                                        **kwargs):  # pylint: disable=unused-argument
    mode = invocation_args.mode
    pipeline_name = invocation_args.pipeline_name
    step_keys = invocation_args.step_keys
    instance_ref = invocation_args.instance_ref
    run_config = invocation_args.run_config
    recon_repo = invocation_args.recon_repo
    pipeline_snapshot = invocation_args.pipeline_snapshot
    execution_plan_snapshot = invocation_args.execution_plan_snapshot
    parent_pipeline_snapshot = invocation_args.parent_pipeline_snapshot

    run_id = dag_run.run_id

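    # Build the GraphQL variables for the executePlan mutation and tag them with the Airflow timestamp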
    variables = construct_execute_plan_variables(recon_repo, mode, run_config,
                                                 pipeline_name, run_id,
                                                 step_keys)
    variables = add_airflow_tags(variables, ts)

    logging.info(
        'Executing GraphQL query: {query}\nwith variables:\n{variables}'.format(
            query=EXECUTE_PLAN_MUTATION,
            variables=seven.json.dumps(variables, indent=2),
        )
    )
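    # Recreate the DagsterInstance from its ref (when provided) and register the run before executing steps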
    instance = DagsterInstance.from_ref(instance_ref) if instance_ref else None
    if instance:
        instance.register_managed_run(
            pipeline_name=pipeline_name,
            run_id=run_id,
            run_config=run_config,
            mode=mode,
            solids_to_execute=None,
            step_keys_to_execute=None,
            tags=None,
            root_run_id=None,
            parent_run_id=None,
            pipeline_snapshot=pipeline_snapshot,
            execution_plan_snapshot=execution_plan_snapshot,
            parent_pipeline_snapshot=parent_pipeline_snapshot,
        )

    workspace = create_in_process_ephemeral_workspace(
        pointer=recon_repo.pointer)
    events = execute_execute_plan_mutation(
        workspace,
        variables,
        instance_ref=instance_ref,
    )
    check_events_for_failures(events)
    check_events_for_skips(events)
    return events
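For context, a minimal sketch (not from the source, assuming an Airflow 1.10-style PythonOperator) of how a callable like this might be wired into a DAG; invocation_args and its construction are assumed to be supplied by the surrounding integration code:

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(dag_id='dagster_execute_plan',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    execute_steps = PythonOperator(
        task_id='execute_steps',
        python_callable=invoke_steps_within_python_operator,
        # invocation_args is hypothetical here; in practice it would carry the
        # recon_repo, run_config, snapshots, and instance_ref unpacked above.
        op_kwargs={'invocation_args': invocation_args},
        provide_context=True,  # passes ts, dag_run, etc. into **kwargs (Airflow 1.x)
    )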
Example #3
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

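        # Lazily import and construct the Dask cluster backend requested by the executor config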
        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                "Must provide one of 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', "
                f"'slurm', 'oar', or 'kube', not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}
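            # Futures are keyed by step key so downstream steps can pass their upstream futures as dependencies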

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = (
                        pipeline_context.pipeline.get_reconstructable_repository()
                    )
                    variables = {
                        "executionParams": {
                            "selector": {
                                "pipelineName": pipeline_name,
                                "repositoryName":
                                recon_repo.get_definition().name,
                                "repositoryLocationName": "<<in_process>>",
                            },
                            "runConfigData": run_config,
                            "mode": pipeline_context.mode_def.name,
                            "executionMetadata": {
                                "runId": pipeline_context.pipeline_run.run_id
                            },
                            "stepKeys": [step.key],
                        }
                    }

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    workspace = create_in_process_ephemeral_workspace(
                        pointer=recon_repo.pointer)

                    future = client.submit(
                        query_on_dask_worker,
                        workspace,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # Tell Dask to await the step executions and retrieve their results on the client
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_interrupts_immediately, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
Example #4
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            'pipeline_context',
            'Expected executor to be DaskExecutor, got {}'.format(
                pipeline_context.executor),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.run_config.get('storage')
        check.invariant(storage,
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

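        # Lazily import and construct the Dask cluster backend requested by the executor config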
        cluster_type = self.cluster_type
        if cluster_type == 'local':
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'yarn':
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'ssh':
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'pbs':
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'moab':
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'oar':
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'kube':
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                "Must provide one of 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', "
                f"'slurm', 'oar', or 'kube', not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}
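            # Futures are keyed by step key so downstream steps can pass their upstream futures as dependencies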

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={'in_process': {}})
                    recon_repo = (
                        pipeline_context.pipeline.get_reconstructable_repository()
                    )
                    variables = {
                        'executionParams': {
                            'selector': {
                                'pipelineName': pipeline_name,
                                'repositoryName': recon_repo.get_definition().name,
                                'repositoryLocationName': '<<in_process>>',
                            },
                            'runConfigData': run_config,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    workspace = create_in_process_ephemeral_workspace(
                        pointer=recon_repo.pointer)

                    future = client.submit(
                        query_on_dask_worker,
                        workspace,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # Tell Dask to await the step executions and retrieve their results on the client
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
Example #5
def test_all_step_events():  # pylint: disable=too-many-locals
    instance = DagsterInstance.ephemeral()

    pipeline_def = define_test_events_pipeline()

    workspace = create_in_process_ephemeral_workspace(
        pointer=CodePointer.from_python_file(
            __file__,
            define_test_events_pipeline.__name__,
            working_directory=None))

    mode = pipeline_def.get_default_mode_name()
    execution_plan = create_execution_plan(pipeline_def, mode=mode)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def, execution_plan=execution_plan, mode=mode)
    step_levels = execution_plan.topological_step_levels()

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        "LogMessageEvent",
        "PipelineStartEvent",
        "PipelineSuccessEvent",
        "PipelineInitFailureEvent",
        "PipelineFailureEvent",
    }

    event_counts = defaultdict(int)
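    # Execute each step in its own executePlan request and decrement counts for the events returned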

    for step_level in step_levels:
        for step in step_level:

            variables = {
                "executionParams": {
                    "selector": {
                        "repositoryLocationName":
                        IN_PROCESS_NAME,
                        "repositoryName":
                        get_ephemeral_repository_name(pipeline_def.name),
                        "pipelineName":
                        pipeline_def.name,
                    },
                    "runConfigData": {
                        "storage": {
                            "filesystem": {}
                        }
                    },
                    "mode": mode,
                    "executionMetadata": {
                        "runId": pipeline_run.run_id
                    },
                    "stepKeys": [step.key],
                },
            }
            res = execute_query(
                workspace,
                EXECUTE_PLAN_MUTATION,
                variables,
                instance=instance,
            )

            # Decrement event_counts for every event record seen in the GraphQL response
            if not res.get("errors"):
                assert "data" in res, res
                assert "executePlan" in res["data"], res
                assert "stepEvents" in res["data"]["executePlan"], res
                step_events = res["data"]["executePlan"]["stepEvents"]

                events = [
                    dagster_event_from_dict(e, pipeline_def.name)
                    for e in step_events
                    if e["__typename"] not in ignored_events
                ]

                for event in events:
                    if event.step_key:
                        key = event.step_key + "." + event.event_type_value
                    else:
                        key = event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {
                    DagsterEventType(e.event_type_value)
                    for e in events
                }
            else:
                raise Exception(res["errors"])

    # Increment event_counts for every step or engine event record found in the run storage
    logs = instance.all_logs(pipeline_run.run_id)
    for log in logs:
        if not log.dagster_event or DagsterEventType(
                log.dagster_event.event_type_value
        ) not in STEP_EVENTS.union({DagsterEventType.ENGINE_EVENT}):
            continue
        if log.dagster_event.step_key:
            key = log.dagster_event.step_key + "." + log.dagster_event.event_type_value
        else:
            key = log.dagster_event.event_type_value
        event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    # Why are these retry events not handled? Because right now there is no way to configure retries
    # on executePlan -- this needs to change, and we should separate the ExecutionParams that get
    # sent to executePlan from those that get sent to startPipelineExecution and friends
    assert unhandled_events == {
        DagsterEventType.STEP_UP_FOR_RETRY, DagsterEventType.STEP_RESTARTED
    }