Example #1
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle_kwargs=None,
        run_config_kwargs=None,
        solid_subset=None,
        solid_handle_kwargs=None,
    ):
        '''Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(environment_dict,
                                                'environment_dict',
                                                key_type=str)
        check.dict_param(run_config_kwargs, 'run_config_kwargs')
        check.dict_param(handle_kwargs, 'handle_kwargs')
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')

        try:
            handle = load_handle.handle_for_pipeline_cli_args(
                handle_kwargs, use_default_repository_yaml=False)
        except (check.CheckError, load_handle.CliUsageError) as err:
            six.raise_from(
                DagstermillError(
                    'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                    'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                    'through dagster-graphql, or in-memory after loading it through an '
                    'ExecutionTargetHandle.'),
                err,
            )

        pipeline_def = check.inst_param(
            handle.build_pipeline_definition(),
            'pipeline_def (from handle {handle_dict})'.format(
                handle_dict=handle.data._asdict()),
            PipelineDefinition,
        ).build_sub_pipeline(solid_subset)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle)

        run_config = RunConfig(**run_config_kwargs)
        # since we are rehydrating the SqliteEventSink we will skip the db init
        run_config = run_config.with_event_sink(
            SqliteEventSink(output_log_path, skip_db_init=True))

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        with scoped_pipeline_context(
                self.pipeline_def,
                environment_dict,
                run_config,
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context)

        return self.context
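The docstring above points interactive users at dagstermill.get_context instead. A minimal sketch of a notebook ``parameters`` cell, under the assumption that the solid takes a small config dict (the config values and log message are illustrative, not from the original example):

# Hypothetical `parameters` cell. When dagstermill executes the notebook, this call is
# replaced by a call to reconstitute_pipeline_context(...) in the injected parameters
# cell of the output notebook.
import dagstermill

context = dagstermill.get_context(solid_config={'threshold': 0.5})
context.log.info('interactive run_id: {}'.format(context.run_id))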
Example #2
File: events.py  Project: zkan/dagster
    def __new__(cls, data):
        return super(JsonMetadataEntryData, cls).__new__(
            cls, check.dict_param(data, 'data', key_type=str)
        )
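A brief, illustrative construction of the metadata entry above (the payload is made up):

entry = JsonMetadataEntryData({'rows': 100, 'source': 'warehouse'})
assert entry.data == {'rows': 100, 'source': 'warehouse'}  # keys must be str, per the check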
Example #3
def _cli_get_user_process_api(kwargs):
    check.dict_param(kwargs, 'kwargs')
    return UserProcessApi.GRPC if kwargs.get('grpc') else UserProcessApi.CLI
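Illustrative calls showing how the helper maps a truthy grpc flag in the click kwargs onto the enum:

assert _cli_get_user_process_api({'grpc': True}) == UserProcessApi.GRPC
assert _cli_get_user_process_api({'grpc': False}) == UserProcessApi.CLI
assert _cli_get_user_process_api({}) == UserProcessApi.CLI  # default is the CLI API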
Example #4
def test_dict_param():
    assert check.dict_param({}, "dict_param") == {}
    assert check.dict_param(frozendict(), "dict_param") == {}
    ddict = {"a": 2}
    assert check.dict_param(ddict, "dict_param") == ddict

    with pytest.raises(ParameterCheckError):
        check.dict_param(None, "dict_param")

    with pytest.raises(ParameterCheckError):
        check.dict_param(0, "dict_param")

    with pytest.raises(ParameterCheckError):
        check.dict_param(1, "dict_param")

    with pytest.raises(ParameterCheckError):
        check.dict_param("foo", "dict_param")

    with pytest.raises(ParameterCheckError):
        check.dict_param(["foo"], "dict_param")

    with pytest.raises(ParameterCheckError):
        check.dict_param([], "dict_param")
Example #5
def pipeline_initialization_event_generator(
    execution_plan,
    environment_dict,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    system_storage_data=None,
    raise_on_error=False,
):
    execution_plan = check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()

    environment_dict = check.dict_param(environment_dict, 'environment_dict', key_type=str)
    pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    instance = check.inst_param(instance, 'instance', DagsterInstance)

    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, 'scoped_resources_builder_cm'
    )
    system_storage_data = check.opt_inst_param(
        system_storage_data, 'system_storage_data', SystemStorageData
    )
    raise_on_error = check.bool_param(raise_on_error, 'raise_on_error')

    pipeline_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan, environment_dict, pipeline_run, instance,
        )
        executor_config = create_executor_config(context_creation_data)
        log_manager = create_log_manager(context_creation_data)
        resources_manager = scoped_resources_builder_cm(
            execution_plan,
            context_creation_data.environment_config,
            context_creation_data.pipeline_run,
            log_manager,
            context_creation_data.resource_keys_to_init,
        )
        for event in resources_manager.generate_setup_events():
            yield event
        scoped_resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder
        )
        system_storage_data = create_system_storage_data(
            context_creation_data, system_storage_data, scoped_resources_builder
        )
        pipeline_context = construct_pipeline_execution_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            system_storage_data=system_storage_data,
            log_manager=log_manager,
            executor_config=executor_config,
            raise_on_error=raise_on_error,
        )

        _validate_plan_with_context(pipeline_context, execution_plan)

        yield pipeline_context
        for event in resources_manager.generate_teardown_events():
            yield event
    except DagsterError as dagster_error:
        if pipeline_context is None:
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists if is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error
                else sys.exc_info()
            )
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(instance, pipeline_run, pipeline_def),
            )
            if resources_manager:
                for event in resources_manager.generate_teardown_events():
                    yield event
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
Example #6
def ensure_single_item(ddict):
    check.dict_param(ddict, 'ddict')
    check.param_invariant(
        len(ddict) == 1, 'ddict', 'Expected dict with single item')
    return list(ddict.items())[0]
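A quick usage sketch of the helper above:

# Unpacks the sole (key, value) pair; a dict with zero or several items fails the
# param_invariant check instead of returning.
name, value = ensure_single_item({'only_key': 42})
assert (name, value) == ('only_key', 42)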
Example #7
    def event_generator(
        self,
        execution_plan,
        run_config,
        pipeline_run,
        instance,
        scoped_resources_builder_cm,
        intermediate_storage=None,
        raise_on_error=False,
    ):
        execution_plan = check.inst_param(execution_plan, "execution_plan",
                                          ExecutionPlan)
        pipeline_def = execution_plan.pipeline.get_definition()

        run_config = check.dict_param(run_config, "run_config", key_type=str)
        pipeline_run = check.inst_param(pipeline_run, "pipeline_run",
                                        PipelineRun)
        instance = check.inst_param(instance, "instance", DagsterInstance)

        scoped_resources_builder_cm = check.callable_param(
            scoped_resources_builder_cm, "scoped_resources_builder_cm")
        intermediate_storage = check.opt_inst_param(
            intermediate_storage, "intermediate_storage_data",
            IntermediateStorage)
        raise_on_error = check.bool_param(raise_on_error, "raise_on_error")

        execution_context = None
        resources_manager = None

        try:
            context_creation_data = create_context_creation_data(
                execution_plan,
                run_config,
                pipeline_run,
                instance,
            )

            log_manager = create_log_manager(context_creation_data)
            resources_manager = scoped_resources_builder_cm(
                execution_plan,
                context_creation_data.environment_config,
                context_creation_data.pipeline_run,
                log_manager,
                context_creation_data.resource_keys_to_init,
                instance,
            )
            yield from resources_manager.generate_setup_events()
            scoped_resources_builder = check.inst(
                resources_manager.get_object(), ScopedResourcesBuilder)

            intermediate_storage = create_intermediate_storage(
                context_creation_data,
                intermediate_storage,
                scoped_resources_builder,
            )

            execution_context = self.construct_context(
                context_creation_data=context_creation_data,
                scoped_resources_builder=scoped_resources_builder,
                log_manager=log_manager,
                intermediate_storage=intermediate_storage,
                raise_on_error=raise_on_error,
            )

            _validate_plan_with_context(execution_context, execution_plan)

            yield execution_context
            yield from resources_manager.generate_teardown_events()
        except DagsterError as dagster_error:
            if execution_context is None:
                user_facing_exc_info = (
                    # pylint does not know original_exc_info exists if is_user_code_error is true
                    # pylint: disable=no-member
                    dagster_error.original_exc_info
                    if dagster_error.is_user_code_error else sys.exc_info())
                error_info = serializable_error_info_from_exc_info(
                    user_facing_exc_info)

                yield DagsterEvent.pipeline_init_failure(
                    pipeline_name=pipeline_def.name,
                    failure_data=PipelineInitFailureData(error=error_info),
                    log_manager=_create_context_free_log_manager(
                        instance, pipeline_run, pipeline_def),
                )
                if resources_manager:
                    yield from resources_manager.generate_teardown_events()
            else:
                # pipeline teardown failure
                raise dagster_error

            if raise_on_error:
                raise dagster_error
Example #8
    def _execute_plan(self, execute_step_args_packed, executable_dict):
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(executable_dict, "executable_dict")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        retry_mode = execute_step_args.retry_mode

        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.invariant(
            pipeline_run,
            "Could not load run {}".format(execute_step_args.pipeline_run_id))

        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        execution_plan = create_execution_plan(
            pipeline,
            pipeline_run.run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=execute_step_args.step_keys_to_execute,
            known_state=execute_step_args.known_state,
        )

        engine_event = instance.report_engine_event(
            "Executing steps {} in celery worker".format(step_keys_str),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "step_keys"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryExecutor,
            step_key=execution_plan.step_handle_for_single_step_plans().to_key(),
        )

        events = [engine_event]
        for step_event in execute_plan_iterator(
                execution_plan=execution_plan,
                pipeline=pipeline,
                pipeline_run=pipeline_run,
                instance=instance,
                retry_mode=retry_mode,
                run_config=pipeline_run.run_config,
        ):
            events.append(step_event)

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #9
def _build_execution_param_list_for_backfill(
    instance,
    partition_data_list,
    backfill_id,
    backfill_params,
    pipeline_selector,
    external_partition_set,
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.list_param(partition_data_list,
                     "partition_data_list",
                     of_type=ExternalPartitionExecutionParamData)
    check.str_param(backfill_id, "backfill_id")
    check.dict_param(backfill_params, "backfill_params")
    check.inst_param(pipeline_selector, "pipeline_selector", PipelineSelector)
    check.inst_param(external_partition_set, "external_partition_set",
                     ExternalPartitionSet)

    backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
    execution_tags = {
        t["key"]: t["value"]
        for t in backfill_params.get("tags", [])
    }
    execution_param_list = []
    for partition_data in partition_data_list:
        tags = merge_dicts(merge_dicts(partition_data.tags, backfill_tags),
                           execution_tags)
        if not backfill_params.get("fromFailure") and not backfill_params.get(
                "reexecutionSteps"):
            # full pipeline execution
            execution_param_list.append(
                ExecutionParams(
                    selector=pipeline_selector,
                    run_config=partition_data.run_config,
                    mode=external_partition_set.mode,
                    execution_metadata=ExecutionMetadata(run_id=None,
                                                         tags=tags),
                    step_keys=None,
                ))
            continue

        last_run = _fetch_last_run(instance, external_partition_set,
                                   partition_data.name)

        if backfill_params.get("fromFailure"):
            if not last_run or last_run.status != PipelineRunStatus.FAILURE:
                continue

            execution_param_list.append(
                ExecutionParams(
                    selector=pipeline_selector,
                    run_config=partition_data.run_config,
                    mode=external_partition_set.mode,
                    execution_metadata=ExecutionMetadata(
                        run_id=None,
                        tags=merge_dicts(tags, {RESUME_RETRY_TAG: "true"}),
                        root_run_id=last_run.root_run_id or last_run.run_id,
                        parent_run_id=last_run.run_id,
                    ),
                    step_keys=None,
                ))
            continue

        # partial reexecution from success
        if not last_run or last_run.status != PipelineRunStatus.SUCCESS:
            continue

        execution_param_list.append(
            ExecutionParams(
                selector=pipeline_selector,
                run_config=partition_data.run_config,
                mode=external_partition_set.mode,
                execution_metadata=ExecutionMetadata(
                    run_id=None,
                    tags=tags,
                    root_run_id=last_run.root_run_id or last_run.run_id,
                    parent_run_id=last_run.run_id,
                ),
                step_keys=backfill_params["reexecutionSteps"],
            ))
        continue

    return execution_param_list
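For reference, hedged illustrations of the ``backfill_params`` shapes the branches above handle (key names come from the code, the values are made up):

# Full pipeline execution per partition, with extra run tags.
full_backfill = {"tags": [{"key": "team", "value": "data"}]}
# Re-execute only partitions whose most recent run failed, resuming from failure.
retry_from_failure = {"fromFailure": True}
# Re-execute selected steps for partitions whose most recent run succeeded.
partial_reexecution = {"reexecutionSteps": ["my_solid.compute"]}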
Example #10
def build_resources(
    resources: Dict[str, Any],
    instance: Optional[DagsterInstance] = None,
    resource_config: Optional[Dict[str, Any]] = None,
    pipeline_run: Optional[PipelineRun] = None,
    log_manager: Optional[DagsterLogManager] = None,
) -> Generator[Resources, None, None]:
    """Context manager that yields resources using provided resource definitions and run config.

    This API allows for using resources in an independent context. Resources will be initialized
    with the provided run config, and optionally, pipeline_run. The resulting resources are yielded
    as an object whose attributes are keyed identically to the provided ``resources`` dictionary.
    Upon exiting the context, resources will also be torn down safely.

    Args:
        resources (Dict[str, Any]): Resource instances or definitions to build. All
            required resource dependencies of a given resource must be contained within this
            dictionary, or the resource build will fail.
        instance (Optional[DagsterInstance]): The dagster instance configured to instantiate
            resources on.
        resource_config (Optional[Dict[str, Any]]): A dict representing the config to be
            provided to each resource during initialization and teardown.
        pipeline_run (Optional[PipelineRun]): The pipeline run to provide during resource
            initialization and teardown. If the provided resources require either the `pipeline_run`
            or `run_id` attributes of the provided context during resource initialization and/or
            teardown, this must be provided, or initialization will fail.
        log_manager (Optional[DagsterLogManager]): Log Manager to use during resource
            initialization. Defaults to system log manager.

    Examples:

    .. code-block:: python

        from dagster import resource, build_resources

        @resource
        def the_resource():
            return "foo"

        with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:
            assert resources.from_def == "foo"
            assert resources.from_val == "bar"

    """

    resources = check.dict_param(resources, "resource_defs", key_type=str)
    instance = check.opt_inst_param(instance, "instance", DagsterInstance)
    resource_config = check.opt_dict_param(resource_config,
                                           "resource_config",
                                           key_type=str)
    log_manager = check.opt_inst_param(log_manager, "log_manager",
                                       DagsterLogManager)
    resource_defs = wrap_resources_for_execution(resources)
    mapped_resource_config = _get_mapped_resource_config(
        resource_defs, resource_config)

    with ephemeral_instance_if_missing(instance) as dagster_instance:
        resources_manager = resource_initialization_manager(
            resource_defs=resource_defs,
            resource_configs=mapped_resource_config,
            log_manager=log_manager
            if log_manager else initialize_console_manager(pipeline_run),
            execution_plan=None,
            pipeline_run=pipeline_run,
            resource_keys_to_init=set(resource_defs.keys()),
            instance=dagster_instance,
            emit_persistent_events=False,
            pipeline_def_for_backwards_compat=None,
        )
        try:
            list(resources_manager.generate_setup_events())
            instantiated_resources = check.inst(resources_manager.get_object(),
                                                ScopedResourcesBuilder)
            yield instantiated_resources.build(
                set(instantiated_resources.resource_instance_dict.keys()))
        finally:
            list(resources_manager.generate_teardown_events())
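Building on the docstring example, a hedged sketch of a resource that depends on another resource supplied in the same dictionary (the resource names and values are illustrative):

from dagster import build_resources, resource

@resource(required_resource_keys={"credentials"})
def api_client(init_context):
    # Dependencies declared via required_resource_keys are available on
    # init_context.resources during initialization.
    return "client({})".format(init_context.resources.credentials)

with build_resources(resources={"credentials": "secret", "api_client": api_client}) as resources:
    assert resources.api_client == "client(secret)"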
Example #11
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle_kwargs=None,
        pipeline_run_dict=None,
        solid_subset=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        '''Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
        check.dict_param(handle_kwargs, 'handle_kwargs')
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
        check.dict_param(instance_ref_dict, 'instance_ref_dict')

        try:
            handle = load_handle.handle_for_pipeline_cli_args(
                handle_kwargs, use_default_repository_yaml=False
            )
        except (check.CheckError, load_handle.UsageError) as err:
            six.raise_from(
                DagstermillError(
                    'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                    'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                    'through dagster-graphql, or in-memory after loading it through an '
                    'ExecutionTargetHandle.'
                ),
                err,
            )

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            six.raise_from(
                DagstermillError(
                    'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
                ),
                err,
            )

        pipeline_def = check.inst_param(
            handle.build_pipeline_definition(),
            'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
            PipelineDefinition,
        ).build_sub_pipeline(solid_subset)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle)

        pipeline_run = unpack_value(pipeline_run_dict)

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        with scoped_pipeline_context(
            self.pipeline_def,
            environment_dict,
            pipeline_run,
            instance=instance,
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context)

        return self.context
Example #12
    def from_dict(config):
        check.dict_param(config, "config", key_type=str)

        return ResourceConfig(config=config.get("config"))
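Illustrative use of the converter above, assuming from_dict is exposed as a static or class method on ResourceConfig (the config payload is made up):

resource_config = ResourceConfig.from_dict({"config": {"bucket": "my-bucket"}})
assert resource_config.config == {"bucket": "my-bucket"}

assert ResourceConfig.from_dict({}).config is None  # no nested "config" key present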
Example #13
File: util.py  Project: punneng/dagster
def dagster_event_from_dict(event_dict, pipeline_name):
    check.dict_param(event_dict, 'event_dict', key_type=str)
    check.str_param(pipeline_name, 'pipeline_name')

    # Get event_type
    event_type = HANDLED_EVENTS.get(event_dict['__typename'])
    if not event_type:
        raise Exception('unhandled event type %s' % event_dict['__typename'])

    # Get event_specific_data
    event_specific_data = None
    if event_type == DagsterEventType.STEP_OUTPUT:
        event_specific_data = StepOutputData(
            step_output_handle=StepOutputHandle(event_dict['step']['key'],
                                                event_dict['outputName']),
            type_check_data=TypeCheckData(
                success=event_dict['typeCheck']['success'],
                label=event_dict['typeCheck']['label'],
                description=event_dict.get('description'),
                metadata_entries=list(
                    event_metadata_entries(event_dict.get('metadataEntries'))
                    or []),
            ),
        )

    elif event_type == DagsterEventType.STEP_INPUT:
        event_specific_data = StepInputData(
            input_name=event_dict['inputName'],
            type_check_data=TypeCheckData(
                success=event_dict['typeCheck']['success'],
                label=event_dict['typeCheck']['label'],
                description=event_dict.get('description'),
                metadata_entries=list(
                    event_metadata_entries(event_dict.get('metadataEntries'))
                    or []),
            ),
        )
    elif event_type == DagsterEventType.STEP_SUCCESS:
        event_specific_data = StepSuccessData(0.0)

    elif event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        event_specific_data = StepRetryData(
            error=error_from_data(event_dict['retryError']),
            seconds_to_wait=event_dict['secondsToWait'],
        )

    elif event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = event_dict['materialization']
        event_specific_data = StepMaterializationData(
            materialization=materialization_from_data(materialization))
    elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = expectation_result_from_data(
            event_dict['expectationResult'])
        event_specific_data = StepExpectationResultData(expectation_result)

    elif event_type == DagsterEventType.STEP_FAILURE:
        event_specific_data = StepFailureData(
            error_from_data(event_dict['error']),
            UserFailureData(
                label=event_dict['failureMetadata']['label'],
                description=event_dict['failureMetadata']['description'],
                metadata_entries=list(
                    event_metadata_entries(event_dict.get('metadataEntries'))
                    or []),
            ) if event_dict.get('failureMetadata') else None,
        )

    elif event_type == DagsterEventType.ENGINE_EVENT:
        event_specific_data = EngineEventData(
            metadata_entries=list(
                event_metadata_entries(event_dict.get('metadataEntries'))),
            marker_start=event_dict.get('markerStart'),
            marker_end=event_dict.get('markerEnd'),
            error=error_from_data(event_dict['engineError'])
            if event_dict.get('engineError') else None,
        )

    # We should update the GraphQL response so that clients don't need to do this handle parsing.
    # See: https://github.com/dagster-io/dagster/issues/1559
    handle = None
    step_key = None
    step_kind_value = None
    if 'step' in event_dict and event_dict['step']:
        step_key = event_dict['step']['key']
        step_kind_value = event_dict['step']['kind']
        keys = event_dict['step']['solidHandleID'].split('.')
        while keys:
            handle = SolidHandle(keys.pop(0), parent=handle)

    return DagsterEvent(
        event_type_value=event_type.value,
        pipeline_name=pipeline_name,
        step_key=step_key,
        solid_handle=handle,
        step_kind_value=step_kind_value,
        logging_tags=None,
        event_specific_data=event_specific_data,
    )
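A hedged sketch of feeding the parser a minimal GraphQL-shaped payload: the field names are taken from the parsing code above, while the ``__typename`` value is an assumption about the HANDLED_EVENTS mapping.

event_dict = {
    '__typename': 'ExecutionStepSuccessEvent',  # assumed HANDLED_EVENTS key for STEP_SUCCESS
    'step': {'key': 'my_solid.compute', 'kind': 'COMPUTE', 'solidHandleID': 'my_solid'},
}
event = dagster_event_from_dict(event_dict, pipeline_name='my_pipeline')
assert event.step_key == 'my_solid.compute'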
Example #14
    def __init__(self, run_id, tags, loggers):

        self.run_id = check.str_param(run_id, 'run_id')
        self.tags = check.dict_param(tags, 'tags')
        self.loggers = check.list_param(loggers, 'loggers', of_type=logging.Logger)
Example #15
def test_dict_param():
    assert check.dict_param({}, 'dict_param') == {}
    ddict = {'a': 2}
    assert check.dict_param(ddict, 'dict_param') == ddict

    with pytest.raises(ParameterCheckError):
        check.dict_param(None, 'dict_param')

    with pytest.raises(ParameterCheckError):
        check.dict_param(0, 'dict_param')

    with pytest.raises(ParameterCheckError):
        check.dict_param(1, 'dict_param')

    with pytest.raises(ParameterCheckError):
        check.dict_param('foo', 'dict_param')

    with pytest.raises(ParameterCheckError):
        check.dict_param(['foo'], 'dict_param')

    with pytest.raises(ParameterCheckError):
        check.dict_param([], 'dict_param')
Example #16
    def __new__(cls, run_id, tags):
        return super(ExecutionMetadata, cls).__new__(
            cls,
            check.opt_str_param(run_id, 'run_id'),
            check.dict_param(tags, 'tags', key_type=str, value_type=str),
        )
Example #17
    def __init__(self, *args, **kwargs):
        super(frozentags, self).__init__(*args, **kwargs)
        check.dict_param(self, 'self', key_type=str, value_type=str)
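Illustrative behavior of the class above: construction succeeds only when every key and value is a string.

tags = frozentags({'team': 'data'})  # passes the check
# frozentags({'retries': 3}) would raise, since 3 is not a str value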
Example #18
File: executor.py  Project: keyz/dagster
    def _execute_step_docker(
        self,
        execute_step_args_packed,
        docker_config,
    ):
        """Run step execution in a Docker container."""
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(docker_config, "docker_config")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        input_json = serialize_dagster_namedtuple(execute_step_args)

        command = "dagster api execute_step {}".format(json.dumps(input_json))

        docker_image = (
            docker_config["image"]
            if docker_config.get("image")
            else execute_step_args.pipeline_origin.repository_origin.container_image
        )

        if not docker_image:
            raise Exception(
                "No docker image specified by either the job or the repository"
            )

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "Step keys"),
                    EventMetadataEntry.text(docker_image, "Image"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                # pass through this worker's environment for things like AWS creds etc.
                environment=docker_env,
                network=docker_config.get("network", None),
            )

            res = docker_response.decode("utf-8")
        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=execute_step_args.step_keys_to_execute[0],
            )
            raise
        else:
            if res is None:
                raise Exception(
                    "No response from execute_step in CeleryDockerExecutor")

            serialized_events += [event for event in res.split("\n") if event]

        return serialized_events
Example #19
def test_dict_param(kwargs, should_succeed):
    if should_succeed:
        assert check.dict_param(**kwargs, param_name="name") == kwargs["obj"]
    else:
        with pytest.raises(CheckError):
            check.dict_param(**kwargs, param_name="name")
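The test above is clearly driven by a pytest parametrization that the excerpt omits; a hedged sketch of what such a parametrization could look like (the cases are illustrative, not the original test data):

import pytest

@pytest.mark.parametrize(
    "kwargs, should_succeed",
    [
        ({"obj": {"a": 1}, "key_type": str, "value_type": int}, True),
        ({"obj": ["not", "a", "dict"]}, False),
    ],
)
def test_dict_param(kwargs, should_succeed):
    if should_succeed:
        assert check.dict_param(**kwargs, param_name="name") == kwargs["obj"]
    else:
        with pytest.raises(CheckError):
            check.dict_param(**kwargs, param_name="name")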
Example #20
    def _input_schema(_context, value):
        check.dict_param(value, 'value')
        check.param_invariant(set(value.keys()) == field_names, 'value')
        return value
Example #21
def _expand_fields_dict(original_root, fields, stack):
    check.dict_param(fields, 'fields')
    return {
        name: _convert_potential_field(original_root, value, stack + [name])
        for name, value in fields.items()
    }
Example #22
def graph(
    name: Optional[Union[Callable[..., Any], str]] = None,
    description: Optional[str] = None,
    input_defs: Optional[List[InputDefinition]] = None,
    output_defs: Optional[List[OutputDefinition]] = None,
    ins: Optional[Dict[str, GraphIn]] = None,
    out: Optional[Union[GraphOut, Dict[str, GraphOut]]] = None,
    tags: Optional[Dict[str, Any]] = None,
    config: Optional[Union[ConfigMapping, Dict[str, Any]]] = None,
) -> Union[GraphDefinition, _Graph]:
    """Create a graph with the specified parameters from the decorated composition function.

    Using this decorator allows you to build up a dependency graph by writing a
    function that invokes ops (or other graphs) and passes the output to subsequent invocations.

    Args:
        name (Optional[str]):
            The name of the graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.
        description (Optional[str]):
            A human-readable description of the graph.
        input_defs (Optional[List[InputDefinition]]):
            Information about the inputs that this graph maps. Information provided here
            will be combined with what can be inferred from the function signature, with these
            explicit InputDefinitions taking precedence.

            Uses of inputs in the body of the decorated composition function will determine
            the :py:class:`InputMappings <InputMapping>` passed to the underlying
            :py:class:`GraphDefinition`.
        output_defs (Optional[List[OutputDefinition]]):
            Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.

            Uses of these outputs in the body of the decorated composition function, as well as the
            return value of the decorated function, will be used to infer the appropriate set of
            :py:class:`OutputMappings <OutputMapping>` for the underlying
            :py:class:`GraphDefinition`.

            To map multiple outputs, return a dictionary from the composition function.
        ins (Optional[Dict[str, GraphIn]]):
            Information about the inputs that this graph maps. Information provided here
            will be combined with what can be inferred from the function signature, with these
            explicit GraphIn taking precedence.
        out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):
            Information about the outputs that this graph maps. Information provided here will be
            combined with what can be inferred from the return type signature if the function does
            not use yield.

            To map multiple outputs, return a dictionary from the composition function.
        tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.
            Values that are not strings will be json encoded and must meet the criteria that
            `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag
            values provided at invocation time.
        config (Optional[Union[ConfigMapping, Dict[str, Any]]]):
            Config for the graph. A :py:class:`ConfigMapping` is used as-is, while a plain
            dictionary is wrapped in a config mapping that supplies that config to the graph at
            execution time.
    """
    if callable(name):
        check.invariant(description is None)
        return _Graph()(name)

    config_mapping = None
    # Case 1: a dictionary of config is provided, convert to config mapping.
    if config is not None and not isinstance(config, ConfigMapping):
        config = check.dict_param(config, "config", key_type=str)
        config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)
    # Case 2: actual config mapping is provided.
    else:
        config_mapping = config

    return _Graph(
        name=name,
        description=description,
        input_defs=input_defs,
        output_defs=output_defs,
        ins=ins,
        out=out,
        tags=tags,
        config_mapping=config_mapping,
    )
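A minimal usage sketch of the decorator (the op and graph names are illustrative):

from dagster import graph, op

@op
def emit_one():
    return 1

@op
def add_one(x):
    return x + 1

@graph
def one_plus_one():
    return add_one(emit_one())

# A graph becomes executable once bound into a job.
result = one_plus_one.to_job().execute_in_process()
assert result.output_for_node("add_one") == 2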
Example #23
def test_dict_param_with_type():
    str_to_int = {"str": 1}
    assert check.dict_param(str_to_int, "str_to_int", key_type=str, value_type=int)
    assert check.dict_param(str_to_int, "str_to_int", value_type=int)
    assert check.dict_param(str_to_int, "str_to_int", key_type=str)
    assert check.dict_param(str_to_int, "str_to_int")

    assert check.dict_param({}, "str_to_int", key_type=str, value_type=int) == {}
    assert check.dict_param({}, "str_to_int", value_type=int) == {}
    assert check.dict_param({}, "str_to_int", key_type=str) == {}
    assert check.dict_param({}, "str_to_int") == {}

    class Wrong(object):
        pass

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", key_type=Wrong, value_type=Wrong)

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", key_type=Wrong, value_type=int)

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", key_type=str, value_type=Wrong)

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", key_type=Wrong)

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", value_type=Wrong)
Example #24
def _validate_resource_dependencies(mode_definitions, node_defs,
                                    dagster_type_dict, solid_dict,
                                    pipeline_hook_defs):
    """This validation ensures that each pipeline context provides the resources that are required
    by each solid.
    """
    check.list_param(mode_definitions,
                     "mode_definitions",
                     of_type=ModeDefinition)
    check.list_param(node_defs, "node_defs", of_type=NodeDefinition)
    check.dict_param(dagster_type_dict, "dagster_type_dict")
    check.dict_param(solid_dict, "solid_dict")
    check.set_param(pipeline_hook_defs,
                    "pipeline_hook_defs",
                    of_type=HookDefinition)

    for mode_def in mode_definitions:
        mode_resources = set(mode_def.resource_defs.keys())
        for node_def in node_defs:
            for required_resource in node_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError((
                        'Resource "{resource}" is required by solid def {node_def_name}, but is not '
                        'provided by mode "{mode_name}".').format(
                            resource=required_resource,
                            node_def_name=node_def.name,
                            mode_name=mode_def.name,
                        ))

        _validate_type_resource_deps_for_mode(mode_def, mode_resources,
                                              dagster_type_dict)

        for system_storage_def in mode_def.system_storage_defs:
            for required_resource in system_storage_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError((
                        "Resource '{resource}' is required by system storage "
                        "'{storage_name}', but is not provided by mode '{mode_name}'."
                    ).format(
                        resource=required_resource,
                        storage_name=system_storage_def.name,
                        mode_name=mode_def.name,
                    ))
        for intermediate_storage in mode_def.intermediate_storage_defs or []:
            for required_resource in intermediate_storage.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError((
                        "Resource '{resource}' is required by intermediate storage "
                        "'{storage_name}', but is not provided by mode '{mode_name}'."
                    ).format(
                        resource=required_resource,
                        storage_name=intermediate_storage.name,
                        mode_name=mode_def.name,
                    ))
        for solid in solid_dict.values():
            for hook_def in solid.hook_defs:
                for required_resource in hook_def.required_resource_keys:
                    if required_resource not in mode_resources:
                        raise DagsterInvalidDefinitionError((
                            'Resource "{resource}" is required by hook "{hook_name}", but is not '
                            'provided by mode "{mode_name}".').format(
                                resource=required_resource,
                                hook_name=hook_def.name,
                                mode_name=mode_def.name,
                            ))

        for hook_def in pipeline_hook_defs:
            for required_resource in hook_def.required_resource_keys:
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError((
                        'Resource "{resource}" is required by hook "{hook_name}", but is not '
                        'provided by mode "{mode_name}".').format(
                            resource=required_resource,
                            hook_name=hook_def.name,
                            mode_name=mode_def.name,
                        ))
Example #25
File: manager.py  Project: sd2k/dagster
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)
        check.dict_param(pipeline_run_dict, "pipeline_run_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
        check.dict_param(instance_ref_dict, "instance_ref_dict")

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            six.raise_from(
                DagstermillError(
                    "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
                ),
                err,
            )

        pipeline_run = unpack_value(pipeline_run_dict)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        execution_plan = create_execution_plan(
            self.pipeline,
            run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
                execution_plan,
                run_config,
                pipeline_run,
                instance,
                scoped_resources_builder_cm=self._setup_resources,
                # Set this flag even though we're not in test for clearer error reporting
                raise_on_error=True,
        ) as pipeline_context:
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=run_config.get("solids",
                                            {}).get(solid_def.name,
                                                    {}).get("config"),
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_context.system_storage_def,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
Example #26
    def _execute_step_k8s_job(
        self,
        execute_step_args_packed,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        job_wait_timeout,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod."""
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            )
        )
        check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
        check.invariant(
            len(execute_step_args.step_keys_to_execute) == 1,
            "Celery K8s task executor can only execute 1 step at a time",
        )

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict
        )
        check.opt_inst_param(
            user_defined_k8s_config,
            "user_defined_k8s_config",
            UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)

        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_key = execute_step_args.step_keys_to_execute[0]

        celery_worker_name = self.request.hostname
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(step_key=step_key),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Celery worker name", value=celery_worker_name),
                    MetadataEntry("Celery worker Kubernetes Pod name", value=celery_pod_name),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because dagster run status is not STARTED",
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)

        retry_state = execute_step_args.known_state.get_retry_state()

        if retry_state.get_attempt_count(step_key):
            attempt_number = retry_state.get_attempt_count(step_key)
            job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-step-%s" % (k8s_name_key)
            pod_name = "dagster-step-%s" % (k8s_name_key)

        args = execute_step_args.get_command_args()

        job = construct_dagster_k8s_job(
            job_config,
            args,
            job_name,
            user_defined_k8s_config,
            pod_name,
            component="step_worker",
            labels={
                "dagster/job": execute_step_args.pipeline_origin.pipeline_name,
                "dagster/op": step_key,
                "dagster/run-id": execute_step_args.pipeline_run_id,
            },
        )

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step key", value=step_key),
                    MetadataEntry("Kubernetes Job name", value=job_name),
                    MetadataEntry("Job image", value=job_config.job_image),
                    MetadataEntry("Image pull policy", value=job_config.image_pull_policy),
                    MetadataEntry("Image pull secrets", value=str(job_config.image_pull_secrets)),
                    MetadataEntry(
                        "Service account name", value=str(job_config.service_account_name)
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use an ETH or
            # execution plan in this function (Celery K8s workers should not have access to user code)
            step_key=step_key,
        )
        events.append(engine_event)
        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so proceed and see if the existing job succeeded
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, proceeding with existing job.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            MetadataEntry("Step key", value=step_key),
                            MetadataEntry("Kubernetes Job name", value=job_name),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            MetadataEntry("Step key", value=step_key),
                        ],
                        error=serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
                return []

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=execute_step_args.pipeline_run_id,
                wait_timeout=job_wait_timeout,
            )
        except (DagsterK8sError, DagsterK8sTimeoutError) as err:
            step_failure_event = construct_step_failure_event_and_handle(
                pipeline_run, step_key, err, instance=instance
            )
            events.append(step_failure_event)
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because dagster run status is not STARTED",
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                        MetadataEntry("Kubernetes Job name", value=job_name),
                        MetadataEntry("Kubernetes Job namespace", value=job_namespace),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return []
        except (
            DagsterK8sUnrecoverableAPIError,
            DagsterK8sAPIRetryLimitExceeded,
            # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
            # a retry boundary. We still catch it here just in case we missed one so that we can
            # report it to the event log
            kubernetes.client.rest.ApiException,
        ) as err:
            instance.report_engine_event(
                "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        try:
            pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error retrieving Pods for Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry("Step key", value=step_key),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData([MetadataEntry("Pod names", value="\n".join(pod_names))]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            try:
                raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
                logs += raw_logs.split("\n")
            except kubernetes.client.rest.ApiException as e:
                instance.report_engine_event(
                    "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                    "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                        job_name, pod_name, step_key
                    ),
                    pipeline_run,
                    EngineEventData(
                        [
                            MetadataEntry("Step key", value=step_key),
                        ],
                        error=serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events
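
The try/except around create_namespaced_job above is what makes this Celery task safe to re-deliver: a 409 Conflict from the Kubernetes API means a job with the same deterministic name already exists, so the worker proceeds to wait on the existing job instead of failing. A minimal, self-contained sketch of that idempotent-create pattern (the helper name create_job_if_absent is hypothetical, not part of dagster):

import kubernetes


def create_job_if_absent(job, namespace):
    # Hypothetical helper: submit a Job and treat a name conflict as "already
    # created". Returns True if the job was created here, False if an
    # identically named job already existed; any other API error is re-raised.
    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=namespace)
        return True
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            return False
        raise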
Example #27
 def __init__(self, step_dict, deps):
     self.step_dict = check.dict_param(
         step_dict, 'step_dict', key_type=str, value_type=ExecutionStep
     )
     self.deps = check.dict_param(deps, 'deps', key_type=str, value_type=set)
     self.steps = list(step_dict.values())
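
As a reminder of the semantics relied on here: check.dict_param both validates and returns its argument, raising check.CheckError when a key or value has the wrong type. A minimal sketch, assuming check is importable from the dagster package (the import path is an assumption; these snippets omit their imports):

from dagster import check  # assumed import path; the examples above omit imports

deps = check.dict_param({'a': {'b'}}, 'deps', key_type=str, value_type=set)  # returns the dict unchanged

try:
    check.dict_param({'a': 1}, 'deps', key_type=str, value_type=set)  # value is not a set
except check.CheckError as err:
    print('rejected:', err)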
Example #28
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1,
            'Celery K8s task executor can only execute 1 step at a time')
        check.dict_param(run_config, 'run_config')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')

        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.dict_param(retries_dict, 'retries_dict')

        check.opt_dict_param(resources,
                             'resources',
                             key_type=str,
                             value_type=dict)
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running inside the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                'Not scheduling step because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = _get_k8s_name_key(run_id, step_keys)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
            pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        else:
            job_name = 'dagster-job-%s' % (k8s_name_key)
            pod_name = 'dagster-job-%s' % (k8s_name_key)

        variables = {
            'executionParams': {
                'runConfigData': run_config,
                'mode': mode,
                'selector': {
                    'repositoryLocationName': repo_location_name,
                    'repositoryName': repo_name,
                    'pipelineName': pipeline_run.pipeline_name,
                    'solidSelection': list(pipeline_run.solids_to_execute)
                    if pipeline_run.solids_to_execute else None,
                },
                'executionMetadata': {'runId': run_id},
                'stepKeys': step_keys,
            },
            'retries': retries.to_graphql_input(),
        }
        args = [
            '-p', 'executePlan', '-v',
            seven.json.dumps(variables), '--remap-sigterm'
        ]

        job = construct_dagster_graphql_k8s_job(job_config, args, job_name,
                                                resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            'Executing step {} in Kubernetes job {}'.format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            'Image pull policy'),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            'Image pull secrets'),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        'Service account name'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not have access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                'Terminating Kubernetes Job because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace,
                                            'Kubernetes Job namespace'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        res = parse_raw_log_lines(logs)
        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

        events += step_events

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
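
The attempt-aware naming above (dagster-job-<key>, with a -<attempt> suffix only when the step has been retried) can be read as a small pure function. A minimal sketch with a hypothetical helper name, mirroring that logic:

def job_and_pod_name(k8s_name_key, attempt_number=0):
    # Mirrors the naming above: append the retry attempt number only when the
    # step has already been attempted; the job and its pod share one name.
    base = 'dagster-job-%s' % k8s_name_key
    name = '%s-%d' % (base, attempt_number) if attempt_number else base
    return name, name

# job_and_pod_name('a1b2c3')    -> ('dagster-job-a1b2c3', 'dagster-job-a1b2c3')
# job_and_pod_name('a1b2c3', 2) -> ('dagster-job-a1b2c3-2', 'dagster-job-a1b2c3-2')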
Example #29
def create_execution_structure(solid_defs, dependencies_dict,
                               container_definition):
    '''This builder takes the dependencies dictionary specified during creation of the
    PipelineDefinition object and builds (1) the execution structure and (2) a solid dependency
    dictionary.

    For example, for the following dependencies:

    dep_dict = {
            SolidInvocation('giver'): {},
            SolidInvocation('sleeper', alias='sleeper_1'): {
                'units': DependencyDefinition('giver', 'out_1')
            },
            SolidInvocation('sleeper', alias='sleeper_2'): {
                'units': DependencyDefinition('giver', 'out_2')
            },
            SolidInvocation('sleeper', alias='sleeper_3'): {
                'units': DependencyDefinition('giver', 'out_3')
            },
            SolidInvocation('sleeper', alias='sleeper_4'): {
                'units': DependencyDefinition('giver', 'out_4')
            },
            SolidInvocation('total'): {
                'in_1': DependencyDefinition('sleeper_1', 'total'),
                'in_2': DependencyDefinition('sleeper_2', 'total'),
                'in_3': DependencyDefinition('sleeper_3', 'total'),
                'in_4': DependencyDefinition('sleeper_4', 'total'),
            },
        },

    This will create:

    pipeline_solid_dict = {
        'giver': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_1': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_2': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_3': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_4': <dagster.core.definitions.dependency.Solid object>,
        'total': <dagster.core.definitions.dependency.Solid object>
    }

    as well as a dagster.core.definitions.dependency.DependencyStructure object.
    '''
    from .solid import ISolidDefinition, CompositeSolidDefinition

    check.list_param(solid_defs, 'solid_defs', of_type=ISolidDefinition)
    check.dict_param(
        dependencies_dict,
        'dependencies_dict',
        key_type=six.string_types + (SolidInvocation, ),
        value_type=dict,
    )
    # container_definition is None in the context of a pipeline
    check.opt_inst_param(container_definition, 'container_definition',
                         CompositeSolidDefinition)

    # Same as dep_dict but with SolidInvocation replaced by alias string
    aliased_dependencies_dict = {}

    # Keep track of solid name -> all aliases used and alias -> name
    name_to_aliases = defaultdict(set)
    alias_to_solid_instance = {}
    alias_to_name = {}

    for solid_key, input_dep_dict in dependencies_dict.items():
        # We allow deps of the form dependencies={'foo': DependencyDefinition('bar')}
        # Here, we replace 'foo' with SolidInvocation('foo')
        if not isinstance(solid_key, SolidInvocation):
            solid_key = SolidInvocation(solid_key)

        alias = solid_key.alias or solid_key.name

        name_to_aliases[solid_key.name].add(alias)
        alias_to_solid_instance[alias] = solid_key
        alias_to_name[alias] = solid_key.name
        aliased_dependencies_dict[alias] = input_dep_dict

        for dependency in input_dep_dict.values():
            for dep in dependency.get_definitions():
                name_to_aliases[dep.solid].add(dep.solid)

    pipeline_solid_dict = _build_pipeline_solid_dict(solid_defs,
                                                     name_to_aliases,
                                                     alias_to_solid_instance,
                                                     container_definition)

    _validate_dependencies(aliased_dependencies_dict, pipeline_solid_dict,
                           alias_to_name)

    dependency_structure = DependencyStructure.from_definitions(
        pipeline_solid_dict, aliased_dependencies_dict)

    return dependency_structure, pipeline_solid_dict
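
The first loop in the body normalizes dependency keys: bare strings are wrapped in SolidInvocation, and each entry is re-keyed by its alias, falling back to the solid name when no alias was given. A minimal, self-contained sketch of just that pass, using a namedtuple stand-in rather than dagster's real SolidInvocation:

from collections import defaultdict, namedtuple

# Stand-in for dagster's SolidInvocation; only the two fields used here.
FakeSolidInvocation = namedtuple('FakeSolidInvocation', ['name', 'alias'])


def normalize_dependency_keys(dependencies_dict):
    name_to_aliases = defaultdict(set)
    aliased_dependencies_dict = {}
    for solid_key, input_dep_dict in dependencies_dict.items():
        if not isinstance(solid_key, FakeSolidInvocation):
            solid_key = FakeSolidInvocation(solid_key, None)
        alias = solid_key.alias or solid_key.name
        name_to_aliases[solid_key.name].add(alias)
        aliased_dependencies_dict[alias] = input_dep_dict
    return aliased_dependencies_dict, name_to_aliases

# normalize_dependency_keys({'giver': {}, FakeSolidInvocation('sleeper', 'sleeper_1'): {}})
# -> ({'giver': {}, 'sleeper_1': {}}, {'giver': {'giver'}, 'sleeper': {'sleeper_1'}})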
Example #30
 def __init__(self, ddict):
     check.dict_param(ddict, 'ddict', key_type=str, value_type=Field)
     super(FieldDefinitionDictionary, self).__init__(ddict)
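
FieldDefinitionDictionary illustrates the validate-on-construction idiom: the subclass refuses to be built from a dict whose keys or values have the wrong types, and is otherwise an ordinary dict. A minimal sketch of the same idiom without the check module (the class name is made up for illustration):

class TypedDict(dict):
    '''Dict that validates key and value types once, at construction time.'''

    def __init__(self, ddict, key_type, value_type):
        for key, value in ddict.items():
            if not isinstance(key, key_type):
                raise TypeError('key %r must be a %s' % (key, key_type.__name__))
            if not isinstance(value, value_type):
                raise TypeError('value for key %r must be a %s' % (key, value_type.__name__))
        super(TypedDict, self).__init__(ddict)

# TypedDict({'x': 1}, key_type=str, value_type=int)   -> behaves like {'x': 1}
# TypedDict({'x': '1'}, key_type=str, value_type=int) -> raises TypeError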