Example #1
    def create_schedule_definition(
        self,
        schedule_name,
        cron_schedule,
        partition_selector,
        should_execute=None,
        environment_vars=None,
        execution_timezone=None,
    ):
        """Create a ScheduleDefinition from a PartitionSetDefinition.

        Arguments:
            schedule_name (str): The name of the schedule.
            cron_schedule (str): A valid cron string for the schedule.
            partition_selector (Callable[[ScheduleExecutionContext, PartitionSetDefinition],
                Partition]): Function that determines the partition to use at a given execution
                time. For time-based partition sets, this will likely be either
                `identity_partition_selector` or a selector returned by
                `create_offset_partition_selector`.
            should_execute (Optional[function]): Function that runs at schedule execution time
                and determines whether the schedule should execute. Defaults to a function that
                always returns ``True``.
            environment_vars (Optional[dict]): The environment variables to set for the schedule.
            execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
                with DagsterDaemonScheduler, and must be set when using that scheduler.

        Returns:
            ScheduleDefinition: The generated ScheduleDefinition for the partition selector
        """

        check.str_param(schedule_name, "schedule_name")
        check.str_param(cron_schedule, "cron_schedule")
        check.opt_callable_param(should_execute, "should_execute")
        check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
        check.callable_param(partition_selector, "partition_selector")
        check.opt_str_param(execution_timezone, "execution_timezone")

        def _should_execute_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)

            if not selected_partition or selected_partition.name not in self.get_partition_names():
                return False
            elif not should_execute:
                return True
            else:
                return should_execute(context)

        def _run_config_fn_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition or selected_partition.name not in self.get_partition_names():
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, "__name__", repr(partition_selector)),
                        partition_set=self.name,
                    )
                )

            return self.run_config_for_partition(selected_partition)

        def _tags_fn_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, "__name__", repr(partition_selector)),
                        partition_set=self.name,
                    )
                )

            return self.tags_for_partition(selected_partition)

        return PartitionScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=self.pipeline_name,
            run_config_fn=_run_config_fn_wrapper,
            tags_fn=_tags_fn_wrapper,
            solid_selection=self.solid_selection,
            mode=self.mode,
            should_execute=_should_execute_wrapper,
            environment_vars=environment_vars,
            partition_set=self,
            execution_timezone=execution_timezone,
        )
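
A hedged usage sketch of the method above; `my_partition_set` is assumed to be a PartitionSetDefinition over daily datetime partitions, and `identity_partition_selector` is the selector named in the docstring:

    schedule_def = my_partition_set.create_schedule_definition(
        schedule_name="daily_10am_schedule",
        cron_schedule="0 10 * * *",
        partition_selector=identity_partition_selector,
        execution_timezone="US/Central",
    )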
Example #2
def _create_solid_compute_wrapper(fn, input_defs, output_defs):
    check.callable_param(fn, 'fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    check.list_param(output_defs, 'output_defs', of_type=OutputDefinition)

    input_names = [
        input_def.name for input_def in input_defs if not input_def.runtime_type.is_nothing
    ]

    @wraps(fn)
    def compute(context, input_defs):
        kwargs = {}
        for input_name in input_names:
            kwargs[input_name] = input_defs[input_name]

        result = fn(context, **kwargs)

        if inspect.isgenerator(result):
            for item in result:
                yield item
        else:
            if isinstance(result, (Materialization, ExpectationResult)):
                raise DagsterInvariantViolationError(
                    (
                        'Error in solid {solid_name}: If you are returning a Materialization '
                        'or an ExpectationResult from solid you must yield them to avoid '
                        'ambiguity with an implied result from returning a value.'.format(
                            solid_name=context.solid.name
                        )
                    )
                )

            if isinstance(result, Output):
                yield result
            elif len(output_defs) == 1:
                yield Output(value=result, output_name=output_defs[0].name)
            elif result is not None:
                if not output_defs:
                    raise DagsterInvariantViolationError(
                        (
                            'Error in solid {solid_name}: Unexpectedly returned output {result} '
                            'of type {type_}. Solid is explicitly defined to return no '
                            'results.'
                        ).format(solid_name=context.solid.name, result=result, type_=type(result))
                    )

                raise DagsterInvariantViolationError(
                    (
                        'Error in solid {solid_name}: Solid unexpectedly returned '
                        'output {result} of type {type_}. Should '
                        'be a generator, containing or yielding '
                        '{n_results} results: {{{expected_results}}}.'
                    ).format(
                        solid_name=context.solid.name,
                        result=result,
                        type_=type(result),
                        n_results=len(output_defs),
                        expected_results=', '.join(
                            [
                                '\'{result_name}\': {runtime_type}'.format(
                                    result_name=output_def.name,
                                    runtime_type=output_def.runtime_type,
                                )
                                for output_def in output_defs
                            ]
                        ),
                    )
                )

    return compute
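
The wrapper above fixes how a compute function's result is interpreted: generators are passed through, Materialization/ExpectationResult events must be yielded, a bare return value becomes the single declared Output, and anything else raises. A minimal sketch of a conforming compute body (names are illustrative):

    def my_compute_fn(context):
        yield Materialization(label="my_asset")        # events must be yielded, not returned
        yield Output(value=42, output_name="result")   # explicit Output avoids ambiguity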
Example #3
    def __call__(self, fn):
        check.callable_param(fn, 'fn')

        if not self.name:
            self.name = fn.__name__

        input_defs = (
            self.input_defs
            if self.input_defs is not None
            else infer_input_definitions_for_composite_solid(self.name, fn)
        )

        explicit_outputs = False
        if self.output_defs is not None:
            explicit_outputs = True
            output_defs = self.output_defs
        else:
            explicit_outputs = has_explicit_return_type(fn)
            output_defs = infer_output_definitions('@composite_solid', self.name, fn)

        positional_inputs = validate_solid_fn(
            '@composite_solid', self.name, fn, input_defs, exclude_nothing=False
        )

        kwargs = {input_def.name: InputMappingNode(input_def) for input_def in input_defs}

        output = None
        mapping = None
        enter_composition(self.name, '@composite_solid')
        try:
            output = fn(**kwargs)
            mapping = composite_mapping_from_output(output, output_defs, self.name)
        finally:
            context = exit_composition(mapping)

        check.invariant(
            context.name == self.name,
            'Composition context stack desync: received context for '
            '"{context.name}" expected "{self.name}"'.format(context=context, self=self),
        )

        # line up mappings in definition order
        input_mappings = []
        for defn in input_defs:
            mappings = [
                mapping
                for mapping in context.input_mappings
                if mapping.definition.name == defn.name
            ]

            if len(mappings) == 0:
                raise DagsterInvalidDefinitionError(
                    "@composite_solid '{solid_name}' has unmapped input '{input_name}'. "
                    "Remove it or pass it to the appropriate solid invocation.".format(
                        solid_name=self.name, input_name=defn.name
                    )
                )

            input_mappings += mappings

        output_mappings = []
        for defn in output_defs:
            mapping = context.output_mapping_dict.get(defn.name)
            if mapping is None:
                # if we inferred output_defs we will be flexible and either take a mapping or not
                if not explicit_outputs:
                    continue

                raise DagsterInvalidDefinitionError(
                    "@composite_solid '{solid_name}' has unmapped output '{output_name}'. "
                    "Remove it or return a value from the appropriate solid invocation.".format(
                        solid_name=self.name, output_name=defn.name
                    )
                )
            output_mappings.append(mapping)

        config_mapping = _get_validated_config_mapping(self.name, self.config, self.config_fn)

        return CompositeSolidDefinition(
            name=self.name,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            dependencies=context.dependencies,
            solid_defs=context.solid_defs,
            description=self.description,
            config_mapping=config_mapping,
            positional_inputs=positional_inputs,
        )
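
A hedged sketch of the decorator usage this __call__ implements; `add_one` and `times_two` are hypothetical solids whose outputs and inputs line up:

    @composite_solid
    def add_then_double(num):
        return times_two(add_one(num))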
Example #4
 def __init__(self, resource_fn, config_field=None, description=None):
     self._resource_fn = check.callable_param(resource_fn, 'resource_fn')
     self._config_field = check_user_facing_opt_field_param(
         config_field, 'config_field', 'of a ResourceDefinition or @resource'
     )
     self._description = check.opt_str_param(description, 'description')
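
A minimal construction sketch matching the signature above; the resource body is a stand-in:

    def _make_client(init_context):
        return object()  # hypothetical resource instance

    my_resource = ResourceDefinition(
        resource_fn=_make_client,
        description="A hypothetical client resource.",
    )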
Example #5
 def __init__(self, manager_fn, marks):
     self.manager_fn = check.callable_param(manager_fn, "manager_fn")
     self.marks = check.list_param(marks, "marks")
Example #6
def create_offset_partition_selector(execution_time_to_partition_fn):
    """ Utility function for supplying a partition selector when creating a schedule from a
    partition set made of `datetime`s that assumes a fixed time offset between the partition
    time and the time at which the schedule executes.

    It's important to keep the cron string that's supplied to
    `PartitionSetDefinition.create_schedule_definition` in sync with the offset that's
    supplied to this function. For example, a schedule created from a partition set with
    partitions for each day at midnight that fills in the partition for day N at day N+1 at
    10:00AM would create the partition selector as follows:

    .. code-block:: python

        partition_set = PartitionSetDefinition(
            name='hello_world_partition_set',
            pipeline_name='hello_world_pipeline',
            partition_fn=date_partition_range(
                start=datetime.datetime(2021, 1, 1),
                delta_range="days",
                timezone="US/Central",
            ),
            run_config_fn_for_partition=my_run_config_fn,
        )

        schedule_definition = partition_set.create_schedule_definition(
            "daily_10am_schedule",
            "0 10 * * *",
            partition_selector=create_offset_partition_selector(lambda d: d.subtract(hours=10, days=1)),
            execution_timezone="US/Central",
        )

    Args:
        execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]): A
        function that maps the execution time of the schedule to the partition time.
    """

    check.callable_param(execution_time_to_partition_fn,
                         "execution_time_to_partition_fn")

    def offset_partition_selector(context, partition_set_def):
        check.inst_param(context, "context", ScheduleExecutionContext)
        check.inst_param(partition_set_def, "partition_set_def",
                         PartitionSetDefinition)

        if not context.scheduled_execution_time:
            partitions = partition_set_def.get_partitions()
            if not partitions:
                return None
            return partitions[-1]

        partition_time = execution_time_to_partition_fn(
            context.scheduled_execution_time)

        for partition in reversed(
                partition_set_def.get_partitions(
                    context.scheduled_execution_time)):
            if partition.value.isoformat() == partition_time.isoformat():
                return partition

            if partition.value < partition_time:
                break

        return None

    return offset_partition_selector
Example #7
File: partition.py  Project: zkan/dagster
    def create_schedule_definition(
        self,
        schedule_name,
        cron_schedule,
        should_execute=None,
        partition_selector=last_partition,
        environment_vars=None,
    ):
        '''Create a ScheduleDefinition from a PartitionSetDefinition.

        Arguments:
            schedule_name (str): The name of the schedule.
            cron_schedule (str): A valid cron string for the schedule.
            should_execute (Optional[function]): Function that runs at schedule execution time
                and determines whether the schedule should execute. Defaults to a function that
                always returns ``True``.
            partition_selector (Callable[[ScheduleExecutionContext, PartitionSetDefinition],
                Partition]): A partition selector for the schedule.
            environment_vars (Optional[dict]): The environment variables to set for the schedule

        Returns:
            ScheduleDefinition -- The generated ScheduleDefinition for the partition selector
        '''

        check.str_param(schedule_name, 'schedule_name')
        check.str_param(cron_schedule, 'cron_schedule')
        check.opt_callable_param(should_execute, 'should_execute')
        check.opt_dict_param(environment_vars, 'environment_vars', key_type=str, value_type=str)
        check.callable_param(partition_selector, 'partition_selector')

        def _should_execute_wrapper(context):
            check.inst_param(context, 'context', ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                return False
            elif not should_execute:
                return True
            else:
                return should_execute(context)

        def _environment_dict_fn_wrapper(context):
            check.inst_param(context, 'context', ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, '__name__', repr(partition_selector)),
                        partition_set=self.name,
                    )
                )

            return self.environment_dict_for_partition(selected_partition)

        def _tags_fn_wrapper(context):
            check.inst_param(context, 'context', ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, '__name__', repr(partition_selector)),
                        partition_set=self.name,
                    )
                )

            return self.tags_for_partition(selected_partition)

        return PartitionScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=self.pipeline_name,
            environment_dict_fn=_environment_dict_fn_wrapper,
            tags_fn=_tags_fn_wrapper,
            solid_subset=self.solid_subset,
            mode=self.mode,
            should_execute=_should_execute_wrapper,
            environment_vars=environment_vars,
            partition_set=self,
        )
Example #8
    def __call__(self, fn: Callable[[], Any]) -> RepositoryDefinition:
        from dagster.core.asset_defs import AssetGroup

        check.callable_param(fn, "fn")

        if not self.name:
            self.name = fn.__name__

        repository_definitions = fn()

        if not (isinstance(repository_definitions, list)
                or isinstance(repository_definitions, dict)
                or isinstance(repository_definitions, RepositoryData)):
            raise DagsterInvalidDefinitionError(
                "Bad return value of type {type_} from repository construction function: must "
                "return list, dict, or RepositoryData. See the @repository decorator docstring for "
                "details and examples".format(
                    type_=type(repository_definitions)), )

        if isinstance(repository_definitions, list):
            bad_definitions = []
            for i, definition in enumerate(repository_definitions):
                if not (isinstance(definition, PipelineDefinition)
                        or isinstance(definition, PartitionSetDefinition)
                        or isinstance(definition, ScheduleDefinition)
                        or isinstance(definition, SensorDefinition)
                        or isinstance(definition, GraphDefinition)
                        or isinstance(definition, AssetGroup)):
                    bad_definitions.append((i, type(definition)))
            if bad_definitions:
                bad_definitions_str = ", ".join([
                    "value of type {type_} at index {i}".format(type_=type_,
                                                                i=i)
                    for i, type_ in bad_definitions
                ])
                raise DagsterInvalidDefinitionError(
                    "Bad return value from repository construction function: all elements of list "
                    "must be of type JobDefinition, GraphDefinition, PipelineDefinition, "
                    "PartitionSetDefinition, ScheduleDefinition, or SensorDefinition. "
                    f"Got {bad_definitions_str}.")
            repository_data = CachingRepositoryData.from_list(
                repository_definitions)

        elif isinstance(repository_definitions, dict):
            if not set(repository_definitions.keys()).issubset(
                    VALID_REPOSITORY_DATA_DICT_KEYS):
                raise DagsterInvalidDefinitionError(
                    "Bad return value from repository construction function: dict must not contain "
                    "keys other than {{'pipelines', 'partition_sets', 'schedules', 'jobs'}}: found "
                    "{bad_keys}".format(bad_keys=", ".join([
                        "'{key}'".format(key=key)
                        for key in repository_definitions.keys()
                        if key not in VALID_REPOSITORY_DATA_DICT_KEYS
                    ])))
            repository_data = CachingRepositoryData.from_dict(
                repository_definitions)
        elif isinstance(repository_definitions, RepositoryData):
            repository_data = repository_definitions

        repository_def = RepositoryDefinition(name=self.name,
                                              description=self.description,
                                              repository_data=repository_data)

        update_wrapper(repository_def, fn)
        return repository_def
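
A hedged sketch of the @repository usage this __call__ backs; `my_pipeline` and `my_schedule` are hypothetical definitions:

    @repository
    def my_repository():
        return [my_pipeline, my_schedule]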
Example #9
def split_function_parameters(fn, expected_positionals):
    check.callable_param(fn, 'fn')
    check.list_param(expected_positionals, 'expected_positionals', str)
    fn_params = list(funcsigs.signature(fn).parameters.values())
    return fn_params[0:len(expected_positionals)], fn_params[len(expected_positionals):]
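
A small illustration of the split, assuming a solid-style function:

    def my_solid_fn(context, a, b=1):
        ...

    positionals, input_args = split_function_parameters(my_solid_fn, ['context'])
    # positionals -> the single 'context' parameter
    # input_args  -> the 'a' and 'b' parameters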
Example #10
    def __init__(
        self,
        name: str,
        pipeline_run_status: PipelineRunStatus,
        run_status_sensor_fn: Callable[[RunStatusSensorContext],
                                       Union[SkipReason, PipelineRunReaction]],
        pipeline_selection: Optional[List[str]] = None,
        minimum_interval_seconds: Optional[int] = None,
        description: Optional[str] = None,
        job_selection: Optional[List[Union[PipelineDefinition,
                                           GraphDefinition]]] = None,
    ):

        from dagster.core.storage.event_log.base import RunShardedEventsCursor, EventRecordsFilter

        check.str_param(name, "name")
        check.inst_param(pipeline_run_status, "pipeline_run_status",
                         PipelineRunStatus)
        check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")
        check.opt_list_param(pipeline_selection, "pipeline_selection", str)
        check.opt_int_param(minimum_interval_seconds,
                            "minimum_interval_seconds")
        check.opt_str_param(description, "description")
        check.opt_list_param(job_selection, "job_selection",
                             (PipelineDefinition, GraphDefinition))

        def _wrapped_fn(context: SensorEvaluationContext):
            # initialize the cursor to (most recent event id, current timestamp) when:
            # * it's the first time starting the sensor
            # * or, the cursor isn't in a valid format (backcompat)
            if context.cursor is None or not RunStatusSensorCursor.is_valid(
                    context.cursor):
                most_recent_event_records = list(
                    context.instance.get_event_records(ascending=False,
                                                       limit=1))
                most_recent_event_id = (most_recent_event_records[0].storage_id
                                        if len(most_recent_event_records) == 1
                                        else -1)

                new_cursor = RunStatusSensorCursor(
                    update_timestamp=pendulum.now("UTC").isoformat(),
                    record_id=most_recent_event_id,
                )
                context.update_cursor(new_cursor.to_json())
                yield SkipReason(
                    f"Initiating {name}. Set cursor to {new_cursor}")
                return

            record_id, update_timestamp = RunStatusSensorCursor.from_json(
                context.cursor)

            # Fetch events after the cursor id
            # * we move the cursor forward to the latest visited event's id to avoid revisits
            # * when the daemon is down, because we persist the cursor info, we can pick up where
            #   we left off and backfill alerts for the qualified events (up to 5 at a time) during the downtime
            # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.
            event_records = context.instance.get_event_records(
                EventRecordsFilter(
                    after_cursor=RunShardedEventsCursor(
                        id=record_id,
                        run_updated_after=cast(
                            datetime, pendulum.parse(update_timestamp)),
                    ),
                    event_type=PIPELINE_RUN_STATUS_TO_EVENT_TYPE[
                        pipeline_run_status],
                ),
                ascending=True,
                limit=5,
            )

            for event_record in event_records:
                event_log_entry = event_record.event_log_entry
                storage_id = event_record.storage_id

                # get run info
                run_records = context.instance.get_run_records(
                    filters=PipelineRunsFilter(
                        run_ids=[event_log_entry.run_id]))

                # skip if we couldn't find the right run
                if len(run_records) != 1:
                    # because we couldn't find the run, we use the event timestamp as the
                    # approximate run update timestamp
                    approximate_update_timestamp = utc_datetime_from_timestamp(
                        event_log_entry.timestamp)
                    context.update_cursor(
                        RunStatusSensorCursor(
                            record_id=storage_id,
                            update_timestamp=approximate_update_timestamp.
                            isoformat(),
                        ).to_json())
                    continue

                pipeline_run = run_records[0].pipeline_run
                update_timestamp = run_records[0].update_timestamp

                # skip if any of the following happens:
                if (
                        # the pipeline does not have a repository (manually executed)
                        not pipeline_run.external_pipeline_origin or
                        # the pipeline does not belong to the current repository
                        pipeline_run.external_pipeline_origin.
                        external_repository_origin.repository_name !=
                        context.repository_name or
                        # if pipeline is not selected
                    (pipeline_selection
                     and pipeline_run.pipeline_name not in pipeline_selection)
                        or
                        # if job not selected
                    (job_selection and pipeline_run.pipeline_name not in map(
                        lambda x: x.name, job_selection))):
                    context.update_cursor(
                        RunStatusSensorCursor(
                            record_id=storage_id,
                            update_timestamp=update_timestamp.isoformat()).
                        to_json())
                    continue

                serializable_error = None

                try:
                    with user_code_error_boundary(
                            RunStatusSensorExecutionError,
                            lambda:
                            f'Error occurred during the execution of sensor "{name}".',
                    ):
                        # one user code invocation maps to one failure event
                        run_status_sensor_fn(
                            RunStatusSensorContext(
                                sensor_name=name,
                                dagster_run=pipeline_run,
                                dagster_event=event_log_entry.dagster_event,
                                instance=context.instance,
                            ))
                except RunStatusSensorExecutionError as run_status_sensor_execution_error:
                    # When the user code errors, we report the error to the sensor tick, not the original run.
                    serializable_error = serializable_error_info_from_exc_info(
                        run_status_sensor_execution_error.original_exc_info)

                context.update_cursor(
                    RunStatusSensorCursor(record_id=storage_id,
                                          update_timestamp=update_timestamp.
                                          isoformat()).to_json())

                # Yield PipelineRunReaction to indicate the execution success/failure.
                # The sensor machinery would
                # * report back to the original run if success
                # * update cursor and job state
                yield PipelineRunReaction(
                    pipeline_run=pipeline_run,
                    error=serializable_error,
                )

        super(RunStatusSensorDefinition, self).__init__(
            name=name,
            evaluation_fn=_wrapped_fn,
            minimum_interval_seconds=minimum_interval_seconds,
            description=description,
        )
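
A hedged construction sketch for the class above; the alerting body is a stand-in:

    def my_alert_fn(context):
        # context is a RunStatusSensorContext
        print(f"Run {context.dagster_run.run_id} hit status {context.dagster_event.event_type}")

    sensor_def = RunStatusSensorDefinition(
        name="failure_alert_sensor",
        pipeline_run_status=PipelineRunStatus.FAILURE,
        run_status_sensor_fn=my_alert_fn,
        minimum_interval_seconds=30,
    )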
Example #11
    def create_schedule_definition(
        self,
        schedule_name,
        cron_schedule,
        partition_selector,
        should_execute=None,
        environment_vars=None,
        execution_timezone=None,
        description=None,
    ):
        """Create a ScheduleDefinition from a PartitionSetDefinition.

        Arguments:
            schedule_name (str): The name of the schedule.
            cron_schedule (str): A valid cron string for the schedule.
            partition_selector (Callable[[ScheduleExecutionContext, PartitionSetDefinition],
                Partition]): Function that determines the partition to use at a given execution
                time. For time-based partition sets, this will likely be either
                `identity_partition_selector` or a selector returned by
                `create_offset_partition_selector`.
            should_execute (Optional[function]): Function that runs at schedule execution time
                and determines whether the schedule should execute. Defaults to a function that
                always returns ``True``.
            environment_vars (Optional[dict]): The environment variables to set for the schedule.
            execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
                with DagsterDaemonScheduler, and must be set when using that scheduler.
            description (Optional[str]): A human-readable description of the schedule.

        Returns:
            ScheduleDefinition: The generated ScheduleDefinition for the partition selector
        """

        check.str_param(schedule_name, "schedule_name")
        check.str_param(cron_schedule, "cron_schedule")
        check.opt_callable_param(should_execute, "should_execute")
        check.opt_dict_param(environment_vars,
                             "environment_vars",
                             key_type=str,
                             value_type=str)
        check.callable_param(partition_selector, "partition_selector")
        check.opt_str_param(execution_timezone, "execution_timezone")
        check.opt_str_param(description, "description")

        def _execution_fn(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda:
                    f"Error occurred during the execution of partition_selector for schedule {schedule_name}",
            ):
                selected_partition = partition_selector(context, self)

            if not selected_partition:
                yield SkipReason(
                    "Partition selector did not return a partition. Make sure that the timezone "
                    "on your partition set matches your execution timezone.")
                return

            if selected_partition.name not in self.get_partition_names(
                    context.scheduled_execution_time):
                yield SkipReason(
                    f"Partition selector returned a partition {selected_partition.name} not in the partition set."
                )
                return

            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda:
                    f"Error occurred during the execution of should_execute for schedule {schedule_name}",
            ):
                if should_execute and not should_execute(context):
                    yield SkipReason(
                        "should_execute function for {schedule_name} returned false."
                        .format(schedule_name=schedule_name))
                    return

            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda:
                    f"Error occurred during the execution of run_config_fn for schedule {schedule_name}",
            ):
                run_config = self.run_config_for_partition(selected_partition)

            with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda:
                    f"Error occurred during the execution of tags_fn for schedule {schedule_name}",
            ):
                tags = self.tags_for_partition(selected_partition)
            yield RunRequest(
                run_key=None,
                run_config=run_config,
                tags=tags,
            )

        return PartitionScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=self.pipeline_name,
            tags_fn=None,
            solid_selection=self.solid_selection,
            mode=self.mode,
            should_execute=None,
            environment_vars=environment_vars,
            partition_set=self,
            execution_timezone=execution_timezone,
            execution_fn=_execution_fn,
            description=description,
        )
Example #12
def validate_solid_fn(
    decorator_name: str,
    fn_name: str,
    compute_fn: Callable[..., Any],
    input_defs: List[InputDefinition],
    expected_positionals: Optional[List[str]] = None,
    exclude_nothing: Optional[bool] = True,
) -> List[str]:
    check.str_param(decorator_name, "decorator_name")
    check.str_param(fn_name, "fn_name")
    check.callable_param(compute_fn, "compute_fn")
    check.list_param(input_defs, "input_defs", of_type=InputDefinition)
    expected_positionals = check.opt_list_param(expected_positionals,
                                                "expected_positionals",
                                                of_type=str)
    if exclude_nothing:
        names = set(inp.name for inp in input_defs
                    if not inp.dagster_type.kind == DagsterTypeKind.NOTHING)
        nothing_names = set(
            inp.name for inp in input_defs
            if inp.dagster_type.kind == DagsterTypeKind.NOTHING)
    else:
        names = set(inp.name for inp in input_defs)
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    fn_positionals, input_args = split_function_parameters(
        compute_fn, expected_positionals)

    # Validate Positional Parameters
    missing_positional = validate_decorated_fn_positionals(
        fn_positionals, expected_positionals)
    if missing_positional:
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function does not have required positional "
            "parameter '{missing_param}'. Solid functions should only have keyword arguments "
            "that match input names and a first positional parameter named 'context'."
            .format(decorator_name=decorator_name,
                    solid_name=fn_name,
                    missing_param=missing_positional))

    # Validate non positional parameters
    invalid_function_info = validate_decorated_fn_input_args(names, input_args)
    if invalid_function_info:
        if invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES[
                "vararg"]:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "
                "'{param}'. Solid functions should only have keyword arguments that match "
                "input names and a first positional parameter named 'context'."
                .format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    param=invalid_function_info.param,
                ))
        elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES[
                "missing_name"]:
            if invalid_function_info.param in nothing_names:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is "
                    "one of the solid input_defs of type 'Nothing' which should not be included since "
                    "no data will be passed for it. ".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    ))
            else:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not "
                    "one of the solid input_defs. Solid functions should only have keyword arguments "
                    "that match input names and a first positional parameter named 'context'."
                    .format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    ))
        elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES[
                "extra"]:
            undeclared_inputs_printed = ", '".join(
                invalid_function_info.missing_names)
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "
                "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
                "should only have keyword arguments that match input names and a first positional "
                "parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    undeclared_inputs_printed=undeclared_inputs_printed,
                ))

    return positional_arg_name_list(input_args)
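
A sketch of the contract this validator enforces, under the assumption that 'context' is the expected positional: the function takes 'context' first, followed by parameters matching the declared (non-Nothing) input names.

    input_defs = [InputDefinition("a", int), InputDefinition("b", int)]

    def good_fn(context, a, b):   # passes: 'context' plus the declared inputs
        return a + b

    def bad_fn(context, a, c):    # raises: 'c' is not one of the input_defs
        return a

    positional_inputs = validate_solid_fn("@solid", "good_fn", good_fn, input_defs, ["context"])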
Example #13
def solid_execution_error_boundary(error_cls, msg_fn, step_context, **kwargs):
    """
    A specialization of user_code_error_boundary for the steps involved in executing a solid.
    This variant supports the control flow exceptions RetryRequested and Failure as well
    as respecting the RetryPolicy if present.
    """
    from dagster.core.execution.context.system import StepExecutionContext

    check.callable_param(msg_fn, "msg_fn")
    check.class_param(error_cls,
                      "error_cls",
                      superclass=DagsterUserCodeExecutionError)
    check.inst_param(step_context, "step_context", StepExecutionContext)

    with raise_execution_interrupts():

        step_context.log.begin_python_log_capture()
        retry_policy = step_context.solid_retry_policy

        try:
            yield
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de

        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack

            # A directly raised RetryRequested escalates before the retry policy is evaluated.
            if isinstance(e, RetryRequested):
                raise e

            if retry_policy:
                raise RetryRequested(
                    max_retries=retry_policy.max_retries,
                    seconds_to_wait=retry_policy.calculate_delay(
                        step_context.previous_attempt_count + 1),
                ) from e

            # Failure exceptions get re-thrown without wrapping
            if isinstance(e, Failure):
                raise e

            # Otherwise wrap the user exception with context
            raise error_cls(
                msg_fn(),
                user_exception=e,
                original_exc_info=sys.exc_info(),
                **kwargs,
            ) from e

        except (DagsterExecutionInterruptedError, KeyboardInterrupt) as ie:
            # respect retry policy when interrupts occur
            if retry_policy:
                raise RetryRequested(
                    max_retries=retry_policy.max_retries,
                    seconds_to_wait=retry_policy.calculate_delay(
                        step_context.previous_attempt_count + 1),
                ) from ie
            else:
                raise ie

        finally:
            step_context.log.end_python_log_capture()
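
A usage sketch, assuming the function is wrapped with contextlib.contextmanager in the surrounding module (the decorator sits outside this excerpt) so it can be entered with `with`; the error class and message here are illustrative:

    with solid_execution_error_boundary(
        DagsterExecutionStepExecutionError,
        lambda: f"Error occurred while executing step {step_context.step.key}",
        step_context,
    ):
        user_compute_fn()  # hypothetical user-code call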
Example #14
 def __init__(self, config_type, func, required_resource_keys):
     self._config_type = check.inst_param(config_type, 'config_type',
                                          ConfigType)
     self._func = check.callable_param(func, 'func')
     self._required_resource_keys = check.opt_set_param(
         required_resource_keys, 'required_resource_keys', of_type=str)
Example #15
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        loader=None,
        materializer=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None), "Must set key or name")

        if name is None:
            check.param_invariant(
                bool(key), "key", "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            check.param_invariant(
                bool(name), "name", "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(
            materializer, "materializer", DagsterTypeMaterializer
        )

        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            "serialization_strategy",
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys, "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins
            ),
            "auto_plugins",
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
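
A minimal sketch of constructing a DagsterType against the signature above; the type-check predicate is illustrative:

    def even_int_type_check(_context, value):
        return isinstance(value, int) and value % 2 == 0

    EvenDagsterType = DagsterType(
        type_check_fn=even_int_type_check,
        name="EvenDagsterType",
        description="An integer that must be even.",
    )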
Example #16
    def inner(
        fn: Callable[
            ...,
            Union[RunRequest, SkipReason, RunConfig, RunRequestGenerator],
        ]
    ) -> ScheduleDefinition:
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        # perform upfront validation of schedule tags
        _tags_fn: Optional[Callable[["ScheduleEvaluationContext"], Dict[str, str]]] = None
        if tags_fn and tags:
            raise DagsterInvalidDefinitionError(
                "Attempted to provide both tags_fn and tags as arguments"
                " to ScheduleDefinition. Must provide only one of the two."
            )
        elif tags:
            check_tags(tags, "tags")
            _tags_fn = cast(Callable[["ScheduleEvaluationContext"], Dict[str, str]], lambda _: tags)
        elif tags_fn:
            _tags_fn = cast(
                Callable[["ScheduleEvaluationContext"], Dict[str, str]],
                lambda context: tags_fn(context) or {},
            )

        def _wrapped_fn(context: "ScheduleEvaluationContext"):
            if should_execute:
                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of should_execute for schedule {schedule_name}",
                ):
                    if not should_execute(context):
                        yield SkipReason(
                            f"should_execute function for {schedule_name} returned false."
                        )
                        return

            with user_code_error_boundary(
                ScheduleExecutionError,
                lambda: f"Error occurred during the evaluation of schedule {schedule_name}",
            ):
                result = fn(context) if has_context_arg else fn()
                if isinstance(result, dict):
                    # this is the run-config based decorated function, wrap the evaluated run config
                    # and tags in a RunRequest
                    evaluated_run_config = copy.deepcopy(result)
                    evaluated_tags = _tags_fn(context) if _tags_fn else None
                    yield RunRequest(
                        run_key=None,
                        run_config=evaluated_run_config,
                        tags=evaluated_tags,
                    )
                else:
                    # this is a run-request based decorated function
                    yield from ensure_gen(result)

        has_context_arg = is_context_provided(get_function_params(fn))
        evaluation_fn = DecoratedScheduleFunction(
            decorated_fn=fn,
            wrapped_fn=_wrapped_fn,
            has_context_arg=has_context_arg,
        )

        schedule_def = ScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=pipeline_name,
            solid_selection=solid_selection,
            mode=mode,
            environment_vars=environment_vars,
            execution_timezone=execution_timezone,
            description=description,
            execution_fn=evaluation_fn,
            job=job,
            default_status=default_status,
        )

        update_wrapper(schedule_def, wrapped=fn)

        return schedule_def
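
A hedged sketch of the @schedule decorator whose inner() is shown above; the pipeline name and config shape are hypothetical:

    @schedule(
        cron_schedule="0 9 * * *",
        pipeline_name="my_pipeline",
        execution_timezone="US/Central",
    )
    def my_morning_schedule(context):
        date_str = context.scheduled_execution_time.strftime("%Y-%m-%d")
        return {"solids": {"process": {"config": {"date": date_str}}}}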
Example #17
def schedule_partition_range(
    start,
    end,
    cron_schedule,
    fmt,
    timezone,
    execution_time_to_partition_fn,
):
    check.inst_param(start, "start", datetime.datetime)
    check.opt_inst_param(end, "end", datetime.datetime)
    check.str_param(cron_schedule, "cron_schedule")
    check.str_param(fmt, "fmt")
    check.opt_str_param(timezone, "timezone")
    check.callable_param(execution_time_to_partition_fn,
                         "execution_time_to_partition_fn")

    if end and start > end:
        raise DagsterInvariantViolationError(
            'Selected date range start "{start}" is after date range end "{end}"'
            .format(
                start=start.strftime(fmt),
                end=end.strftime(fmt),
            ))

    def get_schedule_range_partitions(current_time=None):
        check.opt_inst_param(current_time, "current_time", datetime.datetime)
        tz = timezone if timezone else pendulum.now().timezone.name
        _start = (start.in_tz(tz) if isinstance(start, pendulum.Pendulum) else
                  pendulum.instance(start, tz=tz))

        if end:
            _end = end
        elif current_time:
            _end = current_time
        else:
            _end = pendulum.now(tz)

        # coerce to the definition timezone
        if isinstance(_end, pendulum.Pendulum):
            _end = _end.in_tz(tz)
        else:
            _end = pendulum.instance(_end, tz=tz)

        end_timestamp = _end.timestamp()

        partitions = []
        for next_time in schedule_execution_time_iterator(
                _start.timestamp(), cron_schedule, tz):

            partition_time = execution_time_to_partition_fn(next_time)

            if partition_time.timestamp() > end_timestamp:
                break

            if partition_time.timestamp() < _start.timestamp():
                continue

            partitions.append(
                Partition(value=partition_time,
                          name=partition_time.strftime(fmt)))

        return partitions[:-1]

    return get_schedule_range_partitions
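
A hedged call sketch for schedule_partition_range, using a one-day/ten-hour offset consistent with the examples above; the iterator yields pendulum datetimes, so `.subtract` is available on the argument:

    get_partitions = schedule_partition_range(
        start=datetime.datetime(2021, 1, 1),
        end=None,
        cron_schedule="0 10 * * *",
        fmt="%Y-%m-%d",
        timezone="US/Central",
        execution_time_to_partition_fn=lambda d: d.subtract(days=1, hours=10),
    )
    partitions = get_partitions()  # one Partition per cron tick, excluding the last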
Example #18
    def event_generator(
        self,
        execution_plan,
        run_config,
        pipeline_run,
        instance,
        scoped_resources_builder_cm,
        intermediate_storage=None,
        raise_on_error=False,
        resource_instances_to_override=None,
    ):
        execution_plan = check.inst_param(execution_plan, "execution_plan",
                                          ExecutionPlan)
        pipeline_def = execution_plan.pipeline.get_definition()

        run_config = check.dict_param(run_config, "run_config", key_type=str)
        pipeline_run = check.inst_param(pipeline_run, "pipeline_run",
                                        PipelineRun)
        instance = check.inst_param(instance, "instance", DagsterInstance)

        scoped_resources_builder_cm = check.callable_param(
            scoped_resources_builder_cm, "scoped_resources_builder_cm")
        intermediate_storage = check.opt_inst_param(
            intermediate_storage, "intermediate_storage_data",
            IntermediateStorage)
        raise_on_error = check.bool_param(raise_on_error, "raise_on_error")
        resource_instances_to_override = check.opt_dict_param(
            resource_instances_to_override, "resource_instances_to_override")

        execution_context = None
        resources_manager = None

        try:
            context_creation_data = create_context_creation_data(
                execution_plan,
                run_config,
                pipeline_run,
                instance,
            )

            log_manager = create_log_manager(context_creation_data)
            resources_manager = scoped_resources_builder_cm(
                execution_plan,
                context_creation_data.environment_config,
                context_creation_data.pipeline_run,
                log_manager,
                context_creation_data.resource_keys_to_init,
                instance,
                resource_instances_to_override,
            )
            yield from resources_manager.generate_setup_events()
            scoped_resources_builder = check.inst(
                resources_manager.get_object(), ScopedResourcesBuilder)

            intermediate_storage = create_intermediate_storage(
                context_creation_data,
                intermediate_storage,
                scoped_resources_builder,
            )

            execution_context = self.construct_context(
                context_creation_data=context_creation_data,
                scoped_resources_builder=scoped_resources_builder,
                log_manager=log_manager,
                intermediate_storage=intermediate_storage,
                raise_on_error=raise_on_error,
            )

            _validate_plan_with_context(execution_context, execution_plan)

            yield execution_context
            yield from resources_manager.generate_teardown_events()
        except DagsterError as dagster_error:
            if execution_context is None:
                user_facing_exc_info = (
                    # pylint does not know original_exc_info exists if is_user_code_error is true
                    # pylint: disable=no-member
                    dagster_error.original_exc_info
                    if dagster_error.is_user_code_error else sys.exc_info())
                error_info = serializable_error_info_from_exc_info(
                    user_facing_exc_info)

                yield DagsterEvent.pipeline_init_failure(
                    pipeline_name=pipeline_def.name,
                    failure_data=PipelineInitFailureData(error=error_info),
                    log_manager=_create_context_free_log_manager(
                        instance, pipeline_run, pipeline_def),
                )
                if resources_manager:
                    yield from resources_manager.generate_teardown_events()
            else:
                # pipeline teardown failure
                raise dagster_error

            if raise_on_error:
                raise dagster_error
Example #19
def _core_celery_execution_loop(pipeline_context, execution_plan,
                                step_execution_fn):
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context',
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config,
                   (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config),
    )

    celery_config = pipeline_context.executor_config

    storage = pipeline_context.environment_dict.get('storage')

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    priority_for_step = lambda step: (-1 * int(
        step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority)
    ) + -1 * _get_run_priority(pipeline_context))
    priority_for_key = lambda step_key: (priority_for_step(
        execution_plan.get_step_by_key(step_key)))
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries,
        sort_key_fn=priority_for_step)
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(),
                                       key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception as e:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here; maybe subclass Task.
                    # Certainly yield an engine or pipeline event.
                    step_events = []
                    step_errors[
                        step_key] = serializable_error_info_from_exc_info(
                            sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(
                pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG,
                                      task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'
                    .format(step_key=step.key, queue=queue),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )

                # Get the Celery priority for this step
                priority = _get_step_priority(pipeline_context, step)

                # Submit the Celery tasks
                step_results[step.key] = step_execution_fn(
                    app, pipeline_context, step, queue, priority)

            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(
                            sys.exc_info()), ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'
            .format(error_list='\n'.join([
                '[{step}]: {err}'.format(step=key, err=err.to_string())
                for key, err in step_errors.items()
            ])),
            subprocess_error_infos=list(step_errors.values()),
        )
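
The two priority lambdas above fold a per-step tag and a per-run priority into a single ascending sort key. Below is a minimal, self-contained sketch of that arithmetic; the tag key and default value are assumptions, and FakeStep is a stand-in for Dagster's ExecutionStep:

DAGSTER_CELERY_STEP_PRIORITY_TAG = "dagster-celery/priority"  # assumed tag key
task_default_priority = 5  # assumed default

class FakeStep:
    def __init__(self, tags):
        self.tags = tags

def priority_for_step(step, run_priority=0):
    # negate so that higher declared priorities sort first under ascending sort
    return -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG,
                                  task_default_priority)) + -1 * run_priority

assert priority_for_step(FakeStep({DAGSTER_CELERY_STEP_PRIORITY_TAG: "9"})) == -9
assert priority_for_step(FakeStep({})) == -5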
Example #20
def _create_solid_compute_wrapper(fn, input_defs, output_defs):
    check.callable_param(fn, "fn")
    check.list_param(input_defs, "input_defs", of_type=InputDefinition)
    check.list_param(output_defs, "output_defs", of_type=OutputDefinition)

    input_names = [
        input_def.name for input_def in input_defs
        if not input_def.dagster_type.kind == DagsterTypeKind.NOTHING
    ]

    @wraps(fn)
    def compute(context, input_values):
        # input_values maps each input name to the value provided at execution time
        kwargs = {}
        for input_name in input_names:
            kwargs[input_name] = input_values[input_name]

        result = fn(context, **kwargs)

        if inspect.isgenerator(result):
            for item in result:
                yield item
        else:
            if isinstance(
                    result,
                (AssetMaterialization, Materialization, ExpectationResult)):
                raise DagsterInvariantViolationError((
                    "Error in solid {solid_name}: If you are returning an AssetMaterialization "
                    "or an ExpectationResult from solid you must yield them to avoid "
                    "ambiguity with an implied result from returning a value.".
                    format(solid_name=context.solid.name)))

            if isinstance(result, Output):
                yield result
            elif len(output_defs) == 1:
                yield Output(value=result, output_name=output_defs[0].name)
            elif result is not None:
                if not output_defs:
                    raise DagsterInvariantViolationError((
                        "Error in solid {solid_name}: Unexpectedly returned output {result} "
                        "of type {type_}. Solid is explicitly defined to return no "
                        "results.").format(solid_name=context.solid.name,
                                           result=result,
                                           type_=type(result)))

                raise DagsterInvariantViolationError((
                    "Error in solid {solid_name}: Solid unexpectedly returned "
                    "output {result} of type {type_}. Should "
                    "be a generator, containing or yielding "
                    "{n_results} results: {{{expected_results}}}.").format(
                        solid_name=context.solid.name,
                        result=result,
                        type_=type(result),
                        n_results=len(output_defs),
                        expected_results=", ".join([
                            "'{result_name}': {dagster_type}".format(
                                result_name=output_def.name,
                                dagster_type=output_def.dagster_type,
                            ) for output_def in output_defs
                        ]),
                    ))

    return compute
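
The return-value handling in compute reduces to three rules: pass explicit Output events through, wrap a bare value when there is exactly one output definition, and reject anything else. A minimal sketch of just that normalization, using a namedtuple stand-in rather than Dagster's Output:

from collections import namedtuple

Output = namedtuple("Output", "value output_name")  # stand-in for dagster.Output

def normalize(result, output_names):
    if isinstance(result, Output):
        yield result  # explicit Output passes through unchanged
    elif len(output_names) == 1:
        yield Output(result, output_names[0])  # bare value becomes the sole output
    elif result is not None:
        raise ValueError("multiple outputs must be yielded, not returned")

assert list(normalize(42, ["result"])) == [Output(42, "result")]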
Example #21
def pipeline_initialization_event_generator(
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    system_storage_data=None,
    intermediate_storage=None,
    raise_on_error=False,
):
    execution_plan = check.inst_param(execution_plan, 'execution_plan',
                                      ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()

    run_config = check.dict_param(run_config, 'run_config', key_type=str)
    pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    instance = check.inst_param(instance, 'instance', DagsterInstance)

    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, 'scoped_resources_builder_cm')
    system_storage_data = check.opt_inst_param(system_storage_data,
                                               'system_storage_data',
                                               SystemStorageData)
    intermediate_storage = check.opt_inst_param(intermediate_storage,
                                                'intermediate_storage',
                                                IntermediateStorage)
    raise_on_error = check.bool_param(raise_on_error, 'raise_on_error')

    pipeline_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
        )

        executor = check.inst(create_executor(context_creation_data), Executor,
                              'Must return an Executor')

        log_manager = create_log_manager(context_creation_data)
        resources_manager = scoped_resources_builder_cm(
            execution_plan,
            context_creation_data.environment_config,
            context_creation_data.pipeline_run,
            log_manager,
            context_creation_data.resource_keys_to_init,
        )
        for event in resources_manager.generate_setup_events():
            yield event
        scoped_resources_builder = check.inst(resources_manager.get_object(),
                                              ScopedResourcesBuilder)
        system_storage_data = create_system_storage_data(
            context_creation_data, system_storage_data,
            scoped_resources_builder)
        if intermediate_storage or context_creation_data.intermediate_storage_def:
            intermediate_storage = create_intermediate_storage(
                context_creation_data,
                intermediate_storage,
                scoped_resources_builder,
            )
        else:
            # remove this as part of https://github.com/dagster-io/dagster/issues/2705
            intermediate_storage = system_storage_data.intermediates_manager
        pipeline_context = construct_pipeline_execution_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            system_storage_data=system_storage_data,
            intermediate_storage=intermediate_storage,
            log_manager=log_manager,
            executor=executor,
            raise_on_error=raise_on_error,
        )

        _validate_plan_with_context(pipeline_context, execution_plan)

        yield pipeline_context
        for event in resources_manager.generate_teardown_events():
            yield event
    except DagsterError as dagster_error:
        if pipeline_context is None:
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists if is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error else sys.exc_info())
            error_info = serializable_error_info_from_exc_info(
                user_facing_exc_info)

            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(
                    instance, pipeline_run, pipeline_def),
            )
            if resources_manager:
                for event in resources_manager.generate_teardown_events():
                    yield event
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
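
Note the generator protocol here: resource setup events stream out first, then the pipeline context itself is yielded mid-stream, then teardown events. A hedged sketch of how a caller might separate the context from the surrounding events; the is_context predicate is a stand-in for an isinstance check against the concrete context class:

def consume_initialization(init_event_gen, is_context):
    events, pipeline_context = [], None
    for item in init_event_gen:
        if is_context(item):
            pipeline_context = item  # the execution context yielded mid-stream
        else:
            events.append(item)  # setup and teardown events around it
    return pipeline_context, events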
Example #22
    def __new__(cls, config_fn, config_schema=None):
        return super(ConfigMapping, cls).__new__(
            cls,
            config_fn=check.callable_param(config_fn, 'config_fn'),
            config_schema=check_user_facing_opt_config_param(config_schema, 'config_schema'),
        )
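
A hedged usage sketch for the constructor above: config_fn translates a simplified outer config into the full inner config, and config_schema (its value here is purely illustrative) describes the outer shape:

def simplified_config(cfg):
    # expand one user-facing field into the nested inner config
    return {"resources": {"db": {"config": {"conn_string": cfg["conn"]}}}}

mapping = ConfigMapping(config_fn=simplified_config, config_schema={"conn": str})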
Example #23
    def __init__(
        self,
        type_check_fn: TypeCheckFn,
        key: t.Optional[str] = None,
        name: t.Optional[str] = None,
        is_builtin: bool = False,
        description: t.Optional[str] = None,
        loader: t.Optional[DagsterTypeLoader] = None,
        materializer: t.Optional[DagsterTypeMaterializer] = None,
        required_resource_keys: t.Optional[t.Set[str]] = None,
        kind: DagsterTypeKind = DagsterTypeKind.REGULAR,
        typing_type: t.Any = None,
        metadata_entries: t.Optional[t.List[MetadataEntry]] = None,
        metadata: t.Optional[t.Dict[str, RawMetadataValue]] = None,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None),
                        "Must set key or name")
        if name is None:
            key = check.not_none(
                key,
                "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            name = check.not_none(
                name,
                "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(materializer, "materializer",
                                                 DagsterTypeMaterializer)

        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".
            format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)

        self.typing_type = typing_type

        metadata_entries = check.opt_list_param(metadata_entries,
                                                "metadata_entries",
                                                of_type=MetadataEntry)
        metadata = check.opt_dict_param(metadata, "metadata", key_type=str)
        self._metadata_entries = normalize_metadata(metadata, metadata_entries)
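
Assuming this __init__ belongs to DagsterType, a hedged construction sketch: type_check_fn receives a context and a value, and at least one of key or name must be set:

even_int = DagsterType(
    type_check_fn=lambda _context, value: isinstance(value, int) and value % 2 == 0,
    name="EvenInt",
    description="An even integer.",
)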
Example #24
    def __init__(self, indent_level=2, printer=print):
        self.current_indent = 0
        self.indent_level = check.int_param(indent_level, 'indent_level')
        self.printer = check.callable_param(printer, 'printer')

        self._line_so_far = ''
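
Because printer is injected, output can be redirected to any callable. A hedged usage sketch (the class name IndentingPrinter is an assumption based on the attribute names):

captured = []
printer = IndentingPrinter(indent_level=4, printer=captured.append)  # collect lines for testing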
Example #25
def core_celery_execution_loop(pipeline_context, execution_plan,
                               step_execution_fn):

    check.inst_param(pipeline_context, "pipeline_context",
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.callable_param(step_execution_fn, "step_execution_fn")

    executor = pipeline_context.executor

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        execution_plan.artifacts_persisted,
        "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or "
        "similar system that allows files to be available to all nodes), S3, or GCS",
    )

    app = make_app(executor.app_args())

    priority_for_step = lambda step: (-1 * int(
        step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority)
    ) + -1 * _get_run_priority(pipeline_context))
    priority_for_key = lambda step_key: (priority_for_step(
        execution_plan.get_step_by_key(step_key)))
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[str, celery.AsyncResult], keyed by step key
    step_errors = {}

    with execution_plan.start(
            retries=pipeline_context.executor.retries,
            sort_key_fn=priority_for_step,
    ) as active_execution:

        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Celery executor: received termination signal - revoking active tasks from workers",
                    EngineEventData.interrupted(list(step_results.keys())),
                )
                stopping = True
                for key, result in step_results.items():
                    result.revoke()
                    active_execution.mark_interrupted(key)
            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except TaskRevokedError:
                        step_events = []
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'celery task for running step "{step_key}" was revoked.'
                            .format(step_key=step_key),
                            EngineEventData(marker_end=DELEGATE_MARKER),
                            step_key=step_key,
                        )
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here... maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context,
                                                     step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping or step_errors:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG,
                                          task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'
                        .format(step_key=step.key, queue=queue),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )

                    # Get the Celery priority for this step
                    priority = _get_step_priority(pipeline_context, step)

                    # Submit the Celery tasks
                    step_results[step.key] = step_execution_fn(
                        app, pipeline_context, step, queue, priority)

                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Encountered error during celery task submission.".
                        format(),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                "During celery execution errors occurred in workers:\n{error_list}"
                .format(error_list="\n".join([
                    "[{step}]: {err}".format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
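
A hedged sketch of the run-priority lookup this loop depends on, mirroring what _get_run_priority presumably does: read an integer priority tag off the run's tags, falling back to 0 when the tag is absent or malformed (the tag key is an assumption):

DAGSTER_CELERY_RUN_PRIORITY_TAG = "dagster-celery/run_priority"  # assumed tag key

def get_run_priority(run_tags):
    try:
        return int(run_tags.get(DAGSTER_CELERY_RUN_PRIORITY_TAG, 0))
    except ValueError:
        return 0  # ignore malformed values rather than failing the run

assert get_run_priority({DAGSTER_CELERY_RUN_PRIORITY_TAG: "3"}) == 3
assert get_run_priority({"other": "x"}) == 0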
Example #26
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        input_hydration_config=None,
        output_materialization_config=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
    ):
        check.opt_str_param(key, 'key')
        check.opt_str_param(name, 'name')

        check.invariant(not (name is None and key is None),
                        'Must set key or name')

        if name is None:
            check.param_invariant(
                bool(key),
                'key',
                'If name is not provided, must provide key.',
            )
            self.key, self.name = key, None
        elif key is None:
            check.param_invariant(
                bool(name),
                'name',
                'If key is not provided, must provide name.',
            )
            self.key, self.name = name, name
        else:
            check.invariant(key and name)
            self.key, self.name = key, name

        self.description = check.opt_str_param(description, 'description')
        self.input_hydration_config = check.opt_inst_param(
            input_hydration_config, 'input_hydration_config',
            InputHydrationConfig)
        self.output_materialization_config = check.opt_inst_param(
            output_materialization_config,
            'output_materialization_config',
            OutputMaterializationConfig,
        )
        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            'serialization_strategy',
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            'required_resource_keys',
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   'type_check_fn')
        _validate_type_check_fn(self._type_check_fn, self.name)

        auto_plugins = check.opt_list_param(auto_plugins,
                                            'auto_plugins',
                                            of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin)
                for auto_plugin_type in auto_plugins),
            'auto_plugins',
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, 'is_builtin')
        check.invariant(
            self.display_name is not None,
            'All types must have a valid display name, got None for key {}'.
            format(key),
        )

        self.kind = check.inst_param(kind, 'kind', DagsterTypeKind)
Example #27
def validate_solid_fn(
    decorator_name, fn_name, compute_fn, input_defs, expected_positionals=None, exclude_nothing=True
):
    check.str_param(decorator_name, 'decorator_name')
    check.str_param(fn_name, 'fn_name')
    check.callable_param(compute_fn, 'compute_fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    expected_positionals = check.opt_list_param(
        expected_positionals, 'expected_positionals', of_type=str
    )
    if exclude_nothing:
        names = set(inp.name for inp in input_defs if not inp.runtime_type.is_nothing)
        nothing_names = set(inp.name for inp in input_defs if inp.runtime_type.is_nothing)
    else:
        names = set(inp.name for inp in input_defs)
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    fn_positionals, input_args = split_function_parameters(compute_fn, expected_positionals)

    # Validate Positional Parameters
    missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)
    if missing_positional:
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function does not have required positional "
            "parameter '{missing_param}'. Solid functions should only have keyword arguments "
            "that match input names and a first positional parameter named 'context'.".format(
                decorator_name=decorator_name, solid_name=fn_name, missing_param=missing_positional
            )
        )

    # Validate non positional parameters
    invalid_function_info = validate_decorated_fn_input_args(names, input_args)
    if invalid_function_info:
        if invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['vararg']:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "
                "'{param}'. Solid functions should only have keyword arguments that match "
                "input names and a first positional parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    param=invalid_function_info.param,
                )
            )
        elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['missing_name']:
            if invalid_function_info.param in nothing_names:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is "
                    "one of the solid input_defs of type 'Nothing' which should not be included since "
                    "no data will be passed for it. ".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    )
                )
            else:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not "
                    "one of the solid input_defs. Solid functions should only have keyword arguments "
                    "that match input names and a first positional parameter named 'context'.".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    )
                )
        elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['extra']:
            undeclared_inputs_printed = ", '".join(invalid_function_info.missing_names)
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "
                "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
                "should only have keyword arguments that match input names and a first positional "
                "parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    undeclared_inputs_printed=undeclared_inputs_printed,
                )
            )

    return positional_arg_name_list(input_args)
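
For reference, the function shape this validation accepts, as spelled out in the error messages above: a first positional parameter named 'context', followed by keyword-usable parameters matching the declared input names exactly, with no varargs and no extras:

def add_values(context, a, b):  # passes: 'context' first, then inputs 'a' and 'b' by name
    return a + b

def bad_values(context, *args):  # fails: positional vararg parameter
    return sum(args)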
Example #28
    def __init__(self, config_type, func, required_resource_keys):
        self._config_type = check.inst_param(config_type, "config_type",
                                             ConfigType)
        self._func = check.callable_param(func, "func")
        self._required_resource_keys = check.opt_set_param(
            required_resource_keys, "required_resource_keys", of_type=str)
Example #29
def _validate_solid_fn(solid_name,
                       compute_fn,
                       input_defs,
                       expected_positionals=None,
                       exclude_nothing=True):
    check.str_param(solid_name, 'solid_name')
    check.callable_param(compute_fn, 'compute_fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    expected_positionals = check.opt_list_param(expected_positionals,
                                                'expected_positionals',
                                                of_type=(str, tuple))
    if exclude_nothing:
        names = set(inp.name for inp in input_defs
                    if not inp.runtime_type.is_nothing)
        nothing_names = set(inp.name for inp in input_defs
                            if inp.runtime_type.is_nothing)
    else:
        names = set(inp.name for inp in input_defs)
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    try:
        _validate_decorated_fn(compute_fn, names, expected_positionals)
    except FunctionValidationError as e:
        if e.error_type == FunctionValidationError.TYPES['vararg']:
            raise DagsterInvalidDefinitionError(
                "solid '{solid_name}' decorated function has positional vararg parameter "
                "'{e.param}'. Solid functions should only have keyword arguments that match "
                "input names and a first positional parameter named 'context'."
                .format(solid_name=solid_name, e=e))
        elif e.error_type == FunctionValidationError.TYPES['missing_name']:
            if e.param in nothing_names:
                raise DagsterInvalidDefinitionError(
                    "solid '{solid_name}' decorated function has parameter '{e.param}' that is "
                    "one of the solid input_defs of type 'Nothing' which should not be included since "
                    "no data will be passed for it. ".format(
                        solid_name=solid_name, e=e))
            else:
                raise DagsterInvalidDefinitionError(
                    "solid '{solid_name}' decorated function has parameter '{e.param}' that is not "
                    "one of the solid input_defs. Solid functions should only have keyword arguments "
                    "that match input names and a first positional parameter named 'context'."
                    .format(solid_name=solid_name, e=e))
        elif e.error_type == FunctionValidationError.TYPES[
                'missing_positional']:
            raise DagsterInvalidDefinitionError(
                "solid '{solid_name}' decorated function does not have required positional "
                "parameter '{e.param}'. Solid functions should only have keyword arguments "
                "that match input names and a first positional parameter named 'context'."
                .format(solid_name=solid_name, e=e))
        elif e.error_type == FunctionValidationError.TYPES['extra']:
            undeclared_inputs_printed = ", '".join(e.missing_names)
            raise DagsterInvalidDefinitionError(
                "solid '{solid_name}' decorated function does not have parameter(s) "
                "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
                "should only have keyword arguments that match input names and a first positional "
                "parameter named 'context'.".format(
                    solid_name=solid_name,
                    undeclared_inputs_printed=undeclared_inputs_printed))
        else:
            raise e
Example #30
    def __init__(self, resource_fn, config_field=None, description=None):
        self.resource_fn = check.callable_param(resource_fn, 'resource_fn')
        self.config_field = check_opt_field_param(config_field, 'config_field')
        self.description = check.opt_str_param(description, 'description')
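
A hedged usage sketch: resource_fn is any callable that builds and returns the resource object; the init-context argument passed to it is an assumption about the caller:

def make_db_connection(_init_context):
    return {"conn_string": "postgres://localhost/dev"}  # stand-in resource object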