def evaluate_composite_config(context):
    '''Evaluate a composite config value against its composite config type.

    Walks every defined field of ``context.config_type``, validating provided values,
    applying defaults for optional fields, and (for permissive composites) passing
    unrecognized keys straight through. Returns an ``EvaluateValueResult`` holding either
    the processed config dict or the accumulated list of evaluation errors.

    Args:
        context (TraversalContext): The traversal context; ``context.config_type`` must be
            a composite type and ``context.config_value`` is the user-supplied value.

    Returns:
        EvaluateValueResult: ``for_value(...)`` on success, ``for_errors(...)`` otherwise.
    '''
    check.inst_param(context, 'context', TraversalContext)
    check.param_invariant(context.config_type.is_composite, 'composite_type')

    fields = context.config_type.fields

    # A non-dict (but truthy) incoming value can never match a composite type.
    if context.config_value and not isinstance(context.config_value, dict):
        return EvaluateValueResult.for_error(create_composite_type_mismatch_error(context))

    # Composite-solid handling gets first crack; if it produced errors or a value,
    # that result is final.
    evaluate_value_result = _evaluate_composite_solid_config(context)
    if evaluate_value_result.errors or evaluate_value_result.value:
        return evaluate_value_result

    # ASK: this can crash on user error
    config_value = check.opt_dict_param(context.config_value, 'incoming_value', key_type=str)

    defined_fields = set(fields.keys())
    incoming_fields = set(config_value.keys())
    extra_fields = list(incoming_fields - defined_fields)

    # We'll build up a dict of processed config values below
    errors = []
    output_config_value = {}

    # Here, we support permissive composites. In cases where we know the set of permissible keys a
    # priori, we validate against the config:
    if not context.config_type.is_permissive_composite:
        if extra_fields:
            if len(extra_fields) == 1:
                errors.append(create_field_not_defined_error(context, extra_fields[0]))
            else:
                errors.append(create_fields_not_defined_error(context, extra_fields))

    # And for permissive fields, we just pass along to the output without further validation
    else:
        for field_name in extra_fields:
            output_config_value[field_name] = config_value[field_name]

    # ...However, for any fields the user *has* told us about, we validate against their config
    # specifications
    missing_fields = []
    for key, field_def in fields.items():
        if key in incoming_fields:
            # Field was provided: recursively evaluate its value.
            evaluate_value_result = _evaluate_config(
                context.for_field(field_def, key, context.config_value.get(key, {})))
            if evaluate_value_result.errors:
                errors += evaluate_value_result.errors
            else:
                output_config_value[key] = evaluate_value_result.value
        elif is_solid_dict(field_def.config_type) and context.config_value is not None:
            # Solid-dict fields get a speculative evaluation even when absent; a failure
            # here is reported as a missing field rather than as the nested errors.
            evaluate_value_result = _evaluate_config(
                context.for_field(field_def, key, context.config_value.get(key, {})))
            if evaluate_value_result.errors:
                missing_fields.append(key)
            else:
                output_config_value[key] = evaluate_value_result.value
        elif field_def.is_optional:
            # Try to see if this is a composite solid
            speculative_composite_solid_result = _evaluate_composite_solid_config(
                context.for_field(
                    field_def,
                    key,
                    field_def.default_value if field_def.default_provided else {}))
            if speculative_composite_solid_result.value is not None:
                output_config_value[key] = speculative_composite_solid_result.value
            else:
                # Optional field with no speculative value: fall back to its default, if any.
                if field_def.default_provided:
                    output_config_value[key] = field_def.default_value
        else:
            # Required field (required fields never carry defaults) that was not supplied.
            check.invariant(not field_def.default_provided)
            missing_fields.append(key)

    if missing_fields:
        if len(missing_fields) == 1:
            errors.append(create_missing_required_field_error(context, missing_fields[0]))
        else:
            errors.append(create_missing_required_fields_error(context, missing_fields))

    if errors:
        return EvaluateValueResult.for_errors(errors)
    else:
        return EvaluateValueResult.for_value(output_config_value)
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    '''Create a schedule that runs monthly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
    '''
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_inst_param(end_date, 'end_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_selection, 'solid_selection', of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars, 'environment_vars', key_type=str, value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    check.int_param(execution_day_of_month, 'execution_day')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    # Days of the month are 1-indexed; reject anything outside [1, 31].
    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            '`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be '
            'between 1 and 31'.format(execution_day_of_month))

    # Cron fields: minute hour day-of-month month day-of-week.
    cron_schedule = '{minute} {hour} {day} * *'.format(
        minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_month)

    partition_fn = date_partition_range(
        start_date, end=end_date, delta=relativedelta(months=1), fmt="%Y-%m")

    def inner(fn):
        # Decorator body: wrap the user's run-config function in a partition set and
        # return the schedule definition built from it.
        check.callable_param(fn, 'fn')
        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            # Adapt the date-based tags fn to the partition-based interface.
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_partitions'.format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
def weekly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_day_of_week: int = 0,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"], bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_weeks_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:
    """Create a partitioned schedule that runs weekly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on.

    The decorated function should return a run configuration dictionary, which will be used as
    configuration for the scheduled run.

    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_week (int): The day of the week on which to run the schedule. Must be
            between 0 (Sunday) and 6 (Saturday).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_weeks_offset (Optional[int]): How many weeks back to go when choosing the
            partition for a given schedule execution. For example, when partition_weeks_offset=1,
            the schedule that executes during week N will fill in the partition for week N-1.
            (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_week, "execution_day_of_week")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_weeks_offset, "partition_weeks_offset")
    check.opt_str_param(description, "description")

    # Warn (but don't fail) when start_date carries a time-of-day component.
    if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of a day for a weekly schedule. "
            "Use `execution_time` to execute the schedule at a specific time of day. For example, "
            "to run the schedule at 3AM each Tuesday starting on 10/20/2020, your schedule "
            "definition would look like:"
            """
@weekly_schedule(
    start_date=datetime.datetime(2020, 10, 20),
    execution_day_of_week=1,
    execution_time=datetime.time(3, 0)
):
def my_schedule_definition(_):
    ...
"""
        )

    if execution_day_of_week < 0 or execution_day_of_week >= 7:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_week={}` is not valid for weekly schedule. Execution day must be "
            "between 0 [Sunday] and 6 [Saturday]".format(execution_day_of_week)
        )

    # Cron fields: minute hour day-of-month month day-of-week.
    cron_schedule = "{minute} {hour} * * {day}".format(
        minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_week
    )

    fmt = DEFAULT_DATE_FORMAT

    # datetime.weekday() is Monday=0; the +1 appears to translate to the Sunday=0 cron
    # convention used by execution_day_of_week — TODO(review) confirm against callers.
    day_difference = (execution_day_of_week - (start_date.weekday() + 1)) % 7

    # Map an execution datetime back to the partition it fills: truncate to midnight and
    # rewind by the configured number of weeks plus the day offset.
    execution_time_to_partition_fn = (
        lambda d: pendulum.instance(d)
        .replace(hour=0, minute=0)
        .subtract(weeks=partition_weeks_offset, days=day_difference)
    )

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        # With a zero offset the execution's own week is a valid partition.
        inclusive=(partition_weeks_offset == 0),
    )

    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:
        # Decorator body: build the partition set around the user's run-config function
        # and derive the schedule definition from it.
        check.callable_param(fn, "fn")
        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[
            ["Partition"], Optional[Dict[str, str]]
        ] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date
            )
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn,
            ),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
def __new__(
    cls,
    pipeline_name=None,
    run_id=None,
    environment_dict=None,
    mode=None,
    solid_selection=None,
    solids_to_execute=None,
    step_keys_to_execute=None,
    status=None,
    tags=None,
    root_run_id=None,
    parent_run_id=None,
    pipeline_snapshot_id=None,
    execution_plan_snapshot_id=None,
    ## GRAVEYARD BELOW
    # see https://github.com/dagster-io/dagster/issues/2372 for explanation
    previous_run_id=None,
    selector=None,
    solid_subset=None,
):
    '''Validate arguments, upgrade legacy fields (previous_run_id, selector, solid_subset)
    to their modern equivalents, and construct the PipelineRun record.'''
    # a frozenset which contains the names of the solids to execute
    check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
    # a list of solid queries provided by the user
    # possible to be None when only solids_to_execute is set by the user directly
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    # root_run_id / parent_run_id must be set together — a run either belongs to a run
    # group (both set) or stands alone (both None).
    check.opt_str_param(root_run_id, 'root_run_id')
    check.opt_str_param(parent_run_id, 'parent_run_id')
    check.invariant(
        (root_run_id is not None and parent_run_id is not None)
        or (root_run_id is None and parent_run_id is None),
        (
            'Must set both root_run_id and parent_run_id when creating a PipelineRun that '
            'belongs to a run group'
        ),
    )

    # Compatibility
    # ----------------------------------------------------------------------------------------
    # Historical runs may have previous_run_id set, in which case
    # that previous ID becomes both the root and the parent
    if previous_run_id:
        if not (parent_run_id and root_run_id):
            parent_run_id = previous_run_id
            root_run_id = previous_run_id

    check.opt_inst_param(selector, 'selector', ExecutionSelector)
    if selector:
        # A legacy selector, when present, must agree with any explicitly-passed
        # pipeline_name / solids_to_execute, and backfills them when absent.
        check.invariant(
            pipeline_name is None or selector.name == pipeline_name,
            (
                'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                'selector was passed with pipeline {selector_pipeline}'.format(
                    pipeline_name=pipeline_name, selector_pipeline=selector.name
                )
            ),
        )
        if pipeline_name is None:
            pipeline_name = selector.name

        check.invariant(
            solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,
            (
                'Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
                'selector was passed with subset {selector_subset}'.format(
                    solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset
                )
            ),
        )
        # for old runs that only have selector but no solids_to_execute
        if solids_to_execute is None:
            solids_to_execute = (
                frozenset(selector.solid_subset) if selector.solid_subset else None
            )

    # for old runs that specified list-type solid_subset
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    if solid_subset:
        solids_to_execute = frozenset(solid_subset)
    # ----------------------------------------------------------------------------------------

    return super(PipelineRun, cls).__new__(
        cls,
        pipeline_name=check.opt_str_param(pipeline_name, 'pipeline_name'),
        run_id=check.opt_str_param(run_id, 'run_id', default=make_new_run_id()),
        environment_dict=check.opt_dict_param(
            environment_dict, 'environment_dict', key_type=str
        ),
        mode=check.opt_str_param(mode, 'mode'),
        solid_selection=solid_selection,
        solids_to_execute=solids_to_execute,
        step_keys_to_execute=step_keys_to_execute,
        status=check.opt_inst_param(
            status, 'status', PipelineRunStatus, PipelineRunStatus.NOT_STARTED
        ),
        tags=check.opt_dict_param(tags, 'tags', key_type=str),
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, 'pipeline_snapshot_id'),
        execution_plan_snapshot_id=check.opt_str_param(
            execution_plan_snapshot_id, 'execution_plan_snapshot_id'
        ),
    )
def __new__(cls, data: Optional[Dict[str, Any]]):
    """Build the entry data, normalizing ``data`` to a str-keyed dict (``{}`` when None)."""
    validated_data = check.opt_dict_param(data, "data", key_type=str)
    return super(JsonMetadataEntryData, cls).__new__(cls, validated_data)
def test_opt_dict_param():
    # None and {} both normalize to an empty dict; a real dict passes through unchanged.
    assert check.opt_dict_param(None, 'opt_dict_param') == {}
    assert check.opt_dict_param({}, 'opt_dict_param') == {}

    ddict = {'a': 2}
    assert check.opt_dict_param(ddict, 'opt_dict_param') == ddict

    # Any non-dict value is rejected.
    for bad_value in (0, 1, 'foo', ['foo'], []):
        with pytest.raises(ParameterCheckError):
            check.opt_dict_param(bad_value, 'opt_dict_param')
def test_opt_dict_param_with_type():
    str_to_int = {"str": 1}

    # Every combination of key_type/value_type accepts a matching dict, and maps
    # both {} and None to an empty dict.
    type_kwargs_combos = (
        {"key_type": str, "value_type": int},
        {"value_type": int},
        {"key_type": str},
        {},
    )
    for kwargs in type_kwargs_combos:
        assert check.opt_dict_param(str_to_int, "str_to_int", **kwargs)
        assert check.opt_dict_param({}, "str_to_int", **kwargs) == {}
        assert check.opt_dict_param(None, "str_to_int", **kwargs) == {}

    # Tuples of types are accepted for both keys and values.
    assert check.opt_dict_param(
        {"str": 1, "str2": "str", 1: "str", 2: "str"},
        "multi_type_dict",
        key_type=(str, int),
        value_type=(str, int),
    )

    class Wrong:
        pass

    # Any mismatched key or value type raises.
    failing_kwargs_combos = (
        {"key_type": Wrong, "value_type": Wrong},
        {"key_type": Wrong, "value_type": int},
        {"key_type": str, "value_type": Wrong},
        {"key_type": Wrong},
        {"value_type": Wrong},
    )
    for kwargs in failing_kwargs_combos:
        with pytest.raises(CheckError):
            check.opt_dict_param(str_to_int, "str_to_int", **kwargs)

    class AlsoWrong:
        pass

    # Tuple type specs fail the same way when none of the members match.
    with pytest.raises(CheckError):
        check.dict_param(str_to_int, "str_to_int", key_type=(Wrong, AlsoWrong))
    with pytest.raises(CheckError):
        check.dict_param(str_to_int, "str_to_int", value_type=(Wrong, AlsoWrong))
def create_schedule_definition(
    self,
    schedule_name,
    cron_schedule,
    should_execute=None,
    partition_selector=last_partition,
    environment_vars=None,
):
    '''Create a ScheduleDefinition from a PartitionSetDefinition.

    Arguments:
        schedule_name (str): The name of the schedule.
        cron_schedule (str): A valid cron string for the schedule
        should_execute (Optional[function]): Function that runs at schedule execution time that
            determines whether a schedule should execute. Defaults to a function that always
            returns ``True``.
        partition_selector (Callable[PartitionSet], Partition): A partition selector for the
            schedule.
        environment_vars (Optional[dict]): The environment variables to set for the schedule.

    Returns:
        ScheduleDefinition: The generated ScheduleDefinition for the partition selector
    '''
    check.str_param(schedule_name, 'schedule_name')
    check.str_param(cron_schedule, 'cron_schedule')
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars, 'environment_vars', key_type=str, value_type=str)
    check.callable_param(partition_selector, 'partition_selector')

    def _select_partition_or_raise(context):
        # Shared by the run-config and tags wrappers: both require that the selector
        # actually produced a partition, and raise the same error when it did not.
        selected_partition = partition_selector(context, self)
        if not selected_partition:
            raise DagsterInvariantViolationError(
                "The partition selection function `{selector}` did not return "
                "a partition from PartitionSet {partition_set}".format(
                    selector=getattr(partition_selector, '__name__', repr(partition_selector)),
                    partition_set=self.name,
                ))
        return selected_partition

    def _should_execute_wrapper(context):
        # Unlike the other wrappers, a missing partition here means "skip" — return
        # False silently instead of raising.
        check.inst_param(context, 'context', ScheduleExecutionContext)
        selected_partition = partition_selector(context, self)

        if not selected_partition:
            return False
        elif not should_execute:
            return True
        else:
            return should_execute(context)

    def _run_config_fn_wrapper(context):
        # Resolve the selected partition to its run config.
        check.inst_param(context, 'context', ScheduleExecutionContext)
        return self.run_config_for_partition(_select_partition_or_raise(context))

    def _tags_fn_wrapper(context):
        # Resolve the selected partition to its tags.
        check.inst_param(context, 'context', ScheduleExecutionContext)
        return self.tags_for_partition(_select_partition_or_raise(context))

    return PartitionScheduleDefinition(
        name=schedule_name,
        cron_schedule=cron_schedule,
        pipeline_name=self.pipeline_name,
        run_config_fn=_run_config_fn_wrapper,
        tags_fn=_tags_fn_wrapper,
        solid_selection=self.solid_selection,
        mode=self.mode,
        should_execute=_should_execute_wrapper,
        environment_vars=environment_vars,
        partition_set=self,
    )
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs monthly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_month, "execution_day")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")

    # Warn (but don't fail) when start_date is not midnight on the first of a month.
    if (start_date.day != 1 or start_date.hour != 0 or start_date.minute != 0
            or start_date.second != 0):
        warnings.warn(
            "`start_date` must be at the beginning of the first day of the month for a monthly "
            "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "
            "at a specific time within the month. For example, to run the schedule at 3AM on the "
            "23rd of each month starting in October, your schedule definition would look like:"
            """
@monthly_schedule(
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0)
):
def my_schedule_definition(_):
    ...
""")

    # Days of the month are 1-indexed; reject anything outside [1, 31].
    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "
            "between 1 and 31".format(execution_day_of_month))

    # Cron fields: minute hour day-of-month month day-of-week.
    cron_schedule = "{minute} {hour} {day} * *".format(
        minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_month)

    fmt = DEFAULT_MONTHLY_FORMAT

    # Map an execution datetime back to the partition it fills: truncate to midnight,
    # then rewind one month and back to the first of that month
    # (subtracting execution_day_of_month - 1 days lands on day 1).
    execution_time_to_partition_fn = (lambda d: pendulum.instance(d).replace(
        hour=0, minute=0).subtract(months=1, days=execution_day_of_month - 1))

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
    )

    def inner(fn):
        # Decorator body: wrap the user's run-config function in a partition set and
        # return the schedule definition built from it.
        check.callable_param(fn, "fn")
        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            # Adapt the date-based tags fn to the partition-based interface.
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn),
            execution_timezone=execution_timezone,
        )

    return inner
def to_job(
    self,
    name: Optional[str] = None,
    description: Optional[str] = None,
    resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
    config: Optional[Union[ConfigMapping, Dict[str, Any], "PartitionedConfig"]] = None,
    tags: Optional[Dict[str, Any]] = None,
    logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
    executor_def: Optional["ExecutorDefinition"] = None,
    hooks: Optional[AbstractSet[HookDefinition]] = None,
    op_retry_policy: Optional[RetryPolicy] = None,
    version_strategy: Optional[VersionStrategy] = None,
    op_selection: Optional[List[str]] = None,
    partitions_def: Optional["PartitionsDefinition"] = None,
) -> "JobDefinition":
    """
    Make this graph in to an executable Job by providing remaining components required for
    execution.

    Args:
        name (Optional[str]): The name for the Job. Defaults to the name of the this graph.
        description (Optional[str]): A description for the Job. Defaults to the description of
            this graph.
        resource_defs (Optional[Dict[str, ResourceDefinition]]): Resources that are required by
            this graph for execution. If not defined, `io_manager` will default to filesystem.
        config: Describes how the job is parameterized at runtime. If no value is provided, then
            the schema for the job's run config is a standard format based on its solids and
            resources. If a dictionary is provided, then it must conform to the standard config
            schema, and it will be used as the job's run config for the job whenever the job is
            executed. The values provided will be viewable and editable in the Dagit playground,
            so be careful with secrets. If a :py:class:`ConfigMapping` object is provided, then
            the schema for the job's run config is determined by the config mapping, and the
            ConfigMapping, which should return configuration in the standard format to configure
            the job. If a :py:class:`PartitionedConfig` object is provided, then it defines a
            discrete set of config values that can parameterize the job, as well as a function
            for mapping those values to the base config. The values provided will be viewable and
            editable in the Dagit playground, so be careful with secrets.
        tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the Job. Values
            that are not strings will be json encoded and must meet the criteria that
            `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag
            values provided at invocation time.
        logger_defs (Optional[Dict[str, LoggerDefinition]]): A dictionary of string logger
            identifiers to their implementations.
        executor_def (Optional[ExecutorDefinition]): How this Job will be executed. Defaults to
            :py:class:`multi_or_in_process_executor`, which can be switched between multi-process
            and in-process modes of execution. The default mode of execution is multi-process.
        hooks (Optional[AbstractSet[HookDefinition]]): Hook definitions to attach to the job.
        op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this
            job. Only used if retry policy is not defined on the op definition or op invocation.
        version_strategy (Optional[VersionStrategy]): Defines how each solid (and optionally,
            resource) in the job can be versioned. If provided, memoizaton will be enabled for
            this job.
        op_selection (Optional[List[str]]): A list of op selection queries (including single op
            names) to restrict the job to a subselection of this graph's ops.
        partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys
            that can parameterize the job. If this argument is supplied, the config argument
            can't also be supplied.

    Returns:
        JobDefinition
    """
    # Imported locally to avoid circular imports at module load time.
    from .job_definition import JobDefinition
    from .partition import PartitionedConfig, PartitionsDefinition
    from .executor_definition import ExecutorDefinition, multi_or_in_process_executor

    job_name = check_valid_name(name or self.name)

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    executor_def = check.opt_inst_param(
        executor_def, "executor_def", ExecutorDefinition, default=multi_or_in_process_executor)

    # Ensure an io_manager resource is always present, without clobbering a user-supplied one.
    if resource_defs and "io_manager" in resource_defs:
        resource_defs_with_defaults = resource_defs
    else:
        resource_defs_with_defaults = merge_dicts(
            {"io_manager": default_job_io_manager}, resource_defs or {})

    hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)
    op_retry_policy = check.opt_inst_param(op_retry_policy, "op_retry_policy", RetryPolicy)
    op_selection = check.opt_list_param(op_selection, "op_selection", of_type=str)
    presets = []
    config_mapping = None
    partitioned_config = None

    # partitions_def and config are mutually exclusive; a bare partitions_def gets a
    # trivial (empty) config function.
    if partitions_def:
        check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)
        check.invariant(
            config is None, "Can't supply both the 'config' and 'partitions_def' arguments"
        )
        partitioned_config = PartitionedConfig(partitions_def, lambda _: {})

    if isinstance(config, ConfigMapping):
        config_mapping = config
    elif isinstance(config, PartitionedConfig):
        partitioned_config = config
    elif isinstance(config, dict):
        presets = [PresetDefinition(name="default", run_config=config)]
        # Using config mapping here is a trick to make it so that the preset will be used even
        # when no config is supplied for the job.
        config_mapping = _config_mapping_with_default_value(
            self._get_config_schema(resource_defs_with_defaults, executor_def, logger_defs),
            config,
            job_name,
            self.name,
        )
    elif config is not None:
        check.failed(
            f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "
            f"is an object of type {type(config)}")

    return JobDefinition(
        name=job_name,
        description=description or self.description,
        graph_def=self,
        mode_def=ModeDefinition(
            resource_defs=resource_defs_with_defaults,
            logger_defs=logger_defs,
            executor_defs=[executor_def],
            _config_mapping=config_mapping,
            _partitioned_config=partitioned_config,
        ),
        preset_defs=presets,
        tags=tags,
        hook_defs=hooks,
        version_strategy=version_strategy,
        op_retry_policy=op_retry_policy,
    ).get_job_def_for_op_selection(op_selection)
def execute_in_process(
    self,
    run_config: Any = None,
    instance: Optional["DagsterInstance"] = None,
    resources: Optional[Dict[str, Any]] = None,
    raise_on_error: bool = True,
    op_selection: Optional[List[str]] = None,
) -> "ExecuteInProcessResult":
    """
    Execute this graph in-process, collecting results in-memory.

    Args:
        run_config (Optional[Dict[str, Any]]):
            Run config to provide to execution. The configuration for the underlying graph
            should exist under the "ops" key.
        instance (Optional[DagsterInstance]):
            The instance to execute against, an ephemeral one will be used if none provided.
        resources (Optional[Dict[str, Any]]):
            The resources needed if any are required. Can provide resource instances directly,
            or resource definitions.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``.
        op_selection (Optional[List[str]]): A list of op selection queries (including single op
            names) to execute. For example:
            * ``['some_op']``: selects ``some_op`` itself.
            * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).
            * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants
              (downstream dependencies) within 3 levels down.
            * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its
              ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.

    Returns:
        :py:class:`~dagster.ExecuteInProcessResult`
    """
    # Imports are local to avoid module-level import cycles between definitions
    # and the execution machinery.
    from dagster.core.execution.build_resources import wrap_resources_for_execution
    from dagster.core.execution.execute_in_process import core_execute_in_process
    from dagster.core.instance import DagsterInstance

    from .job_definition import JobDefinition
    from .executor_definition import execute_in_process_executor

    instance = check.opt_inst_param(instance, "instance", DagsterInstance)
    resources = check.opt_dict_param(resources, "resources", key_type=str)

    # Bare resource objects are wrapped into ResourceDefinitions so they can be
    # attached to a mode.
    resource_defs = wrap_resources_for_execution(resources)

    # Build a throwaway mode/job pair that runs with the in-process executor,
    # then narrow it to the requested op selection.
    in_proc_mode = ModeDefinition(
        executor_defs=[execute_in_process_executor], resource_defs=resource_defs)
    ephemeral_job = JobDefinition(
        name=self._name, graph_def=self,
        mode_def=in_proc_mode).get_job_def_for_op_selection(op_selection)

    run_config = run_config if run_config is not None else {}
    op_selection = check.opt_list_param(op_selection, "op_selection", str)

    return core_execute_in_process(
        node=self,
        ephemeral_pipeline=ephemeral_job,
        run_config=run_config,
        instance=instance,
        output_capturing_enabled=True,
        raise_on_error=raise_on_error,
    )
def _create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    metadata=None,
    description=None,
):
    """Build a LakehouseTableDefinition wrapping ``lakehouse_fn``.

    Args:
        name (str): Name of the table (also used as the generated dagster type name).
        lakehouse_fn (callable): User business logic; receives the context plus hydrated
            table values and other inputs as keyword arguments.
        input_tables (Optional[List[LakehouseTableInputDefinition]]): Upstream tables.
        other_input_defs (Optional[List[InputDefinition]]): Non-table inputs.
        required_resource_keys (Optional[Set[str]]): Resource keys required by the solid;
            'lakehouse' is always added.
        metadata (Optional[dict]): Metadata passed through to hydrate/materialize.
        description (Optional[str]): Description for the generated dagster type.
    """
    metadata = check.opt_dict_param(metadata, 'metadata')
    # BUG FIX: the second argument of opt_list_param is the parameter *name* used in
    # error messages; previously the list variable itself was passed instead of the
    # string, producing garbled (or crashing) check errors.
    input_tables = check.opt_list_param(
        input_tables, 'input_tables', of_type=LakehouseTableInputDefinition)
    other_input_defs = check.opt_list_param(
        other_input_defs, 'other_input_defs', of_type=InputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str)

    table_type = define_python_dagster_type(
        python_type=ITableHandle, name=name, description=description)

    table_type_inst = table_type.inst()

    # Index table inputs by name so we can distinguish them from other inputs below.
    table_input_dict = {
        input_table.name: input_table
        for input_table in input_tables
    }
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs, [('context', )])

    def _compute(context, inputs):
        '''
        Workhouse function of lakehouse. The inputs are something that inherits from
        ITableHandle. This compute_fn:

        (1) Iterates over input tables and ask the lakehouse resource to hydrate their contents
            or a representation of their contents (e.g a pyspark dataframe) into memory for
            computation
        (2) Pass those into the lakehouse table function. Do the actual thing.
        (3) Pass the output of the lakehouse function to the lakehouse materialize function.
        (4) Yield a materialization if the lakehouse function returned that.

        There's an argument that the hydrate and materialize functions should return
        a stream of events but that started to feel like I was implementing what should
        be a framework feature.
        '''
        check.inst_param(context.resources.lakehouse,
                         'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            # NOTE(review): this logs "About to hydrate table ..." for *every* input,
            # including non-table inputs — preserved as-is to keep log output unchanged.
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name))
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[
                    input_name] = context.resources.lakehouse.hydrate(
                        context,
                        input_type,
                        table_def_of_type(context.pipeline_def,
                                          input_type.name).metadata,
                        table_handle,
                    )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables, **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type_inst, metadata, computed_output)

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(
            output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        metadata=metadata,
        description=description,
    )
def execute_solid(
    solid_def,
    mode_def=None,
    input_values=None,
    environment_dict=None,
    run_config=None,
    raise_on_error=True,
):
    '''Execute a single solid in an ephemeral pipeline.

    Intended to support unit tests. Input values may be passed directly, and no pipeline need
    be specified -- an ephemeral pipeline will be constructed.

    Args:
        solid_def (SolidDefinition): The solid to execute.
        mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this
            if, e.g., custom resources, loggers, or executors are desired.
        input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to
            pass inputs to the solid directly. You may also use the ``environment_dict`` to
            configure any inputs that are configurable.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this
            execution, as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in test.

    Returns:
        Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the
        solid.
    '''
    check.inst_param(solid_def, 'solid_def', ISolidDefinition)
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    input_values = check.opt_dict_param(input_values, 'input_values', key_type=str)

    solid_defs = [solid_def]

    # Each direct input value is wrapped in a trivial solid that simply returns it,
    # so it can be wired into the ephemeral pipeline as an upstream dependency.
    def create_value_solid(input_name, input_value):
        @lambda_solid(name=input_name)
        def input_solid():
            return input_value

        return input_solid

    dependencies = defaultdict(dict)

    for input_name, input_value in input_values.items():
        dependencies[solid_def.name][input_name] = DependencyDefinition(
            input_name)
        solid_defs.append(create_value_solid(input_name, input_value))

    result = execute_pipeline(
        PipelineDefinition(
            name='ephemeral_{}_solid_pipeline'.format(solid_def.name),
            solid_defs=solid_defs,
            dependencies=dependencies,
            mode_defs=[mode_def] if mode_def else None,
        ),
        environment_dict=environment_dict,
        run_config=run_config,
        raise_on_error=raise_on_error,
    )
    # Return only the result for the solid under test, not the wrapper solids.
    return result.result_for_handle(solid_def.name)
def complete(self, output): return CompleteCompositionContext( self.name, self._invocations, check.opt_dict_param(output, 'output') )
def __new__(cls, name: str, tags: Optional[Dict[object, object]] = None): return super(ExternalPartitionTagsData, cls).__new__( cls, name=check.str_param(name, "name"), tags=check.opt_dict_param(tags, "tags"), )
def __init__( self, name, cron_schedule, pipeline_name, environment_dict=None, environment_dict_fn=None, tags=None, tags_fn=None, solid_selection=None, mode="default", should_execute=None, environment_vars=None, ): check.str_param(name, 'name') check.str_param(cron_schedule, 'cron_schedule') check.str_param(pipeline_name, 'pipeline_name') check.opt_dict_param(environment_dict, 'environment_dict') check.opt_callable_param(environment_dict_fn, 'environment_dict_fn') check.opt_dict_param(tags, 'tags', key_type=str, value_type=str) check.opt_callable_param(tags_fn, 'tags_fn') check.opt_nullable_list_param(solid_selection, 'solid_selection', of_type=str) mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME) check.opt_callable_param(should_execute, 'should_execute') check.opt_dict_param(environment_vars, 'environment_vars', key_type=str, value_type=str) if environment_dict_fn and environment_dict: raise DagsterInvalidDefinitionError( 'Attempted to provide both environment_dict_fn and environment_dict as arguments' ' to ScheduleDefinition. Must provide only one of the two.') if tags_fn and tags: raise DagsterInvalidDefinitionError( 'Attempted to provide both tags_fn and tags as arguments' ' to ScheduleDefinition. Must provide only one of the two.') if not environment_dict and not environment_dict_fn: environment_dict_fn = lambda _context: {} if not tags and not tags_fn: tags_fn = lambda _context: {} if not should_execute: should_execute = lambda _context: True self._schedule_definition_data = ScheduleDefinitionData( name=check.str_param(name, 'name'), cron_schedule=check.str_param(cron_schedule, 'cron_schedule'), environment_vars=check.opt_dict_param(environment_vars, 'environment_vars'), ) self._environment_dict = environment_dict self._environment_dict_fn = environment_dict_fn self._tags = tags self._tags_fn = tags_fn self._should_execute = should_execute self._mode = mode self._pipeline_name = pipeline_name self._solid_selection = solid_selection
def test_opt_dict_param_with_type(): str_to_int = {'str': 1} assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=str, value_type=int) assert check.opt_dict_param(str_to_int, 'str_to_int', value_type=int) assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=str) assert check.opt_dict_param(str_to_int, 'str_to_int') assert check.opt_dict_param({}, 'str_to_int', key_type=str, value_type=int) == {} assert check.opt_dict_param({}, 'str_to_int', value_type=int) == {} assert check.opt_dict_param({}, 'str_to_int', key_type=str) == {} assert check.opt_dict_param({}, 'str_to_int') == {} assert check.opt_dict_param(None, 'str_to_int', key_type=str, value_type=int) == {} assert check.opt_dict_param(None, 'str_to_int', value_type=int) == {} assert check.opt_dict_param(None, 'str_to_int', key_type=str) == {} assert check.opt_dict_param(None, 'str_to_int') == {} class Wrong(object): pass with pytest.raises(CheckError): assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=Wrong, value_type=Wrong) with pytest.raises(CheckError): assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=Wrong, value_type=int) with pytest.raises(CheckError): assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=str, value_type=Wrong) with pytest.raises(CheckError): assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=Wrong) with pytest.raises(CheckError): assert check.opt_dict_param(str_to_int, 'str_to_int', value_type=Wrong)
def execute_script_file(shell_script_path, output_logging, log, cwd=None, env=None):
    '''Execute a shell script file specified by the argument ``shell_script_path``.

    The script will be invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.

    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined
    stdout/stderr output is retrieved.

    Args:
        shell_script_path (str): The shell script to execute.
        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.
        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()
        cwd (str, optional): Working directory for the shell command to use. Defaults to the
            directory containing the shell script.
        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.
            Unused by default.

    Raises:
        Exception: When an invalid output_logging is selected. Unreachable from solid-based
            invocation since the config system will check output_logging against the config
            enum.

    Returns:
        Tuple[str, int]: The combined stdout/stderr output of running the shell script, and
            the script's return code.
    '''
    check.str_param(shell_script_path, 'shell_script_path')
    check.str_param(output_logging, 'output_logging')
    # BUG FIX: the checked/defaulted value was previously discarded, so when `cwd`
    # was not provided it remained None and the documented default (the script's
    # directory) was never applied.
    cwd = check.opt_str_param(cwd, 'cwd', default=os.path.dirname(shell_script_path))
    env = check.opt_dict_param(env, 'env')

    def pre_exec():
        # Restore default signal disposition and invoke setsid
        for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
            if hasattr(signal, sig):
                signal.signal(getattr(signal, sig), signal.SIG_DFL)
        os.setsid()

    with open(shell_script_path, 'rb') as f:
        shell_command = six.ensure_str(f.read())

    log.info('Running command:\n{command}'.format(command=shell_command))

    # pylint: disable=subprocess-popen-preexec-fn
    sub_process = Popen(
        ['bash', shell_script_path],
        stdout=PIPE,
        stderr=STDOUT,
        cwd=cwd,
        env=env,
        preexec_fn=pre_exec,
    )

    # Will return the string result of reading stdout of the shell command
    output = ''

    if output_logging not in ['STREAM', 'BUFFER', 'NONE']:
        raise Exception('Unrecognized output_logging %s' % output_logging)

    # Stream back logs as they are emitted
    if output_logging == 'STREAM':
        for raw_line in iter(sub_process.stdout.readline, b''):
            line = six.ensure_str(raw_line)
            log.info(line.rstrip())
            output += line

    # Collect and buffer all logs, then emit
    elif output_logging == 'BUFFER':
        output = ''.join(
            [six.ensure_str(raw_line) for raw_line in iter(sub_process.stdout.readline, b'')]
        )
        log.info(output)

    # no logging in this case
    elif output_logging == 'NONE':
        pass

    # BUG FIX: wait for the process in *every* mode (previously only STREAM waited),
    # so that `returncode` is populated rather than None for BUFFER/NONE.
    sub_process.wait()

    log.info('Command exited with return code {retcode}'.format(retcode=sub_process.returncode))

    return output, sub_process.returncode
def execute_pipeline(pipeline,
                     environment_dict=None,
                     run_config=None,
                     instance=None,
                     raise_on_error=True):
    '''Execute a pipeline synchronously.

    Users will typically call this API when testing pipeline execution, or running standalone
    scripts.

    Parameters:
        pipeline (PipelineDefinition): The pipeline to execute.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this
            run, as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is
            ``None``, an ephemeral instance will be used, and no artifacts will be persisted
            from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in test.

    Returns:
        :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`execute_pipeline_iterator`.

    This is the entrypoint for dagster CLI execution. For the dagster-graphql entrypoint, see
    ``dagster.core.execution.api.execute_plan()``.
    '''
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)
    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline, environment_dict, run_config)
    pipeline_run = _create_run(instance, pipeline, run_config, environment_dict)

    # The initialization manager yields engine events for setup/teardown and owns
    # the pipeline context's lifecycle; setup must run before get_object().
    initialization_manager = pipeline_initialization_manager(
        pipeline,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan,
        raise_on_error=raise_on_error,
    )
    event_list = list(initialization_manager.generate_setup_events())
    pipeline_context = initialization_manager.get_object()

    # pipeline_context is falsy when initialization failed (and raise_on_error is
    # False); in that case only setup/teardown events are collected.
    if pipeline_context:
        event_list.extend(
            _pipeline_execution_iterator(pipeline_context, execution_plan, pipeline_run))
    event_list.extend(initialization_manager.generate_teardown_events())

    # NOTE(review): the reexecution-context lambda below dereferences
    # pipeline_context unconditionally; if initialization failed it will raise
    # AttributeError when invoked — confirm callers only invoke it on success.
    return PipelineExecutionResult(
        pipeline,
        run_config.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            pipeline,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan,
            system_storage_data=SystemStorageData(
                intermediates_manager=pipeline_context.intermediates_manager,
                file_manager=pipeline_context.file_manager,
            ),
        ),
    )
def __init__(
    self,
    service_account_name,
    instance_config_map,
    postgres_password_secret=None,
    dagster_home=None,
    job_image=None,
    image_pull_policy=None,
    image_pull_secrets=None,
    load_incluster_config=True,
    kubeconfig_file=None,
    inst_data=None,
    job_namespace="default",
    env_config_maps=None,
    env_secrets=None,
    env_vars=None,
    k8s_client_batch_api=None,
    volume_mounts=None,
    volumes=None,
    labels=None,
    fail_pod_on_run_failure=None,
):
    """Configure a Kubernetes-backed run launcher.

    Side effect: loads kubernetes client configuration at construction time —
    in-cluster config when ``load_incluster_config`` is True, otherwise from
    ``kubeconfig_file``.
    """
    self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
    self.job_namespace = check.str_param(job_namespace, "job_namespace")

    self.load_incluster_config = load_incluster_config
    self.kubeconfig_file = kubeconfig_file
    if load_incluster_config:
        # kubeconfig_file is meaningless in-cluster; treat its presence as an error.
        check.invariant(
            kubeconfig_file is None,
            "`kubeconfig_file` is set but `load_incluster_config` is True.",
        )
        kubernetes.config.load_incluster_config()
    else:
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")
        kubernetes.config.load_kube_config(kubeconfig_file)

    # If provided, overrides the lazily-created batch API client (useful in tests).
    self._fixed_batch_api = k8s_client_batch_api

    self._job_config = None
    self._job_image = check.opt_str_param(job_image, "job_image")
    # NOTE(review): dagster_home defaults to None but str_param requires a str, so
    # omitting it raises at construction — confirm that is the intended contract.
    self.dagster_home = check.str_param(dagster_home, "dagster_home")
    self._image_pull_policy = check.opt_str_param(
        image_pull_policy, "image_pull_policy", "IfNotPresent"
    )
    self._image_pull_secrets = check.opt_list_param(
        image_pull_secrets, "image_pull_secrets", of_type=dict
    )
    self._service_account_name = check.str_param(service_account_name, "service_account_name")
    self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")
    self.postgres_password_secret = check.opt_str_param(
        postgres_password_secret, "postgres_password_secret"
    )
    self._env_config_maps = check.opt_list_param(
        env_config_maps, "env_config_maps", of_type=str
    )
    self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)
    self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)
    self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")
    self._volumes = check.opt_list_param(volumes, "volumes")
    self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)
    self._fail_pod_on_run_failure = check.opt_bool_param(
        fail_pod_on_run_failure, "fail_pod_on_run_failure"
    )

    super().__init__()
def test_opt_dict_param(): assert check.opt_dict_param(None, "opt_dict_param") == {} assert check.opt_dict_param({}, "opt_dict_param") == {} assert check.opt_dict_param(frozendict(), "opt_dict_param") == {} ddict = {"a": 2} assert check.opt_dict_param(ddict, "opt_dict_param") == ddict with pytest.raises(ParameterCheckError): check.opt_dict_param(0, "opt_dict_param") with pytest.raises(ParameterCheckError): check.opt_dict_param(1, "opt_dict_param") with pytest.raises(ParameterCheckError): check.opt_dict_param("foo", "opt_dict_param") with pytest.raises(ParameterCheckError): check.opt_dict_param(["foo"], "opt_dict_param") with pytest.raises(ParameterCheckError): check.opt_dict_param([], "opt_dict_param")
def __init__(self, cluster_type, cluster_configuration): self.cluster_type = check.opt_str_param(cluster_type, 'cluster_type', default='local') self.cluster_configuration = check.opt_dict_param( cluster_configuration, 'cluster_configuration')
def minute_schedule(
    pipeline_name,
    start_date,
    cron_schedule="* * * * *",
    name=None,
    tags_fn_for_date=None,
    solid_selection=None,
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs every minute.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    `ScheduleDefinition` and should take a `ScheduleExecutionContext` as its only argument,
    returning the environment dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        cron_schedule (str): The cron expression for the schedule; defaults to every minute.
        name (Optional[str]): The name of the schedule to create. By default, this will be the
            name of the decorated function.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs
            at schedule execution tie to determine whether a schedule should execute or skip. Takes
            a :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when
            executing the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    # Mode is fixed for minute schedules; there is no mode parameter.
    mode = "default"
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.opt_str_param(execution_timezone, "execution_timezone")

    if start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of the minute for a per minute schedule. "
        )

    fmt = (DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE if execution_timezone
           else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE)

    # Each scheduled tick maps to the partition for the *previous* minute.
    execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(
        minutes=1)

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
    )

    def inner(fn):
        # Decorator body: wrap fn as the run-config function of a partition set,
        # then derive the schedule definition from that partition set.
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_default_partition_selector_fn(
                delta_fn=execution_time_to_partition_fn,
                fmt=fmt,
            ),
            execution_timezone=execution_timezone,
        )

    return inner
def create_schedule_definition(
    self,
    schedule_name,
    cron_schedule,
    should_execute=None,
    partition_selector=last_partition,
    environment_vars=None,
    execution_timezone=None,
):
    """Create a ScheduleDefinition from a PartitionSetDefinition.

    Arguments:
        schedule_name (str): The name of the schedule.
        cron_schedule (str): A valid cron string for the schedule
        should_execute (Optional[function]): Function that runs at schedule execution time that
            determines whether a schedule should execute. Defaults to a function that always returns
            ``True``.
        partition_selector (Callable[ScheduleExecutionContext, PartitionSetDefinition], Partition):
            A partition selector for the schedule.
        environment_vars (Optional[dict]): The environment variables to set for the schedule.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.

    Returns:
        ScheduleDefinition: The generated ScheduleDefinition for the partition selector
    """

    check.str_param(schedule_name, "schedule_name")
    check.str_param(cron_schedule, "cron_schedule")
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.callable_param(partition_selector, "partition_selector")
    check.opt_str_param(execution_timezone, "execution_timezone")

    def _should_execute_wrapper(context):
        # Skip the tick entirely when the selector returns nothing or a partition
        # that is no longer part of this set; otherwise defer to should_execute.
        check.inst_param(context, "context", ScheduleExecutionContext)
        selected_partition = partition_selector(context, self)

        if not selected_partition or not selected_partition.name in self.get_partition_names(
        ):
            return False
        elif not should_execute:
            return True
        else:
            return should_execute(context)

    def _run_config_fn_wrapper(context):
        # Unlike _should_execute_wrapper, a missing/unknown partition here is an
        # invariant violation: run config cannot be produced without one.
        check.inst_param(context, "context", ScheduleExecutionContext)
        selected_partition = partition_selector(context, self)
        if not selected_partition or not selected_partition.name in self.get_partition_names(
        ):
            raise DagsterInvariantViolationError(
                "The partition selection function `{selector}` did not return "
                "a partition from PartitionSet {partition_set}".format(
                    selector=getattr(partition_selector, "__name__",
                                     repr(partition_selector)),
                    partition_set=self.name,
                ))

        return self.run_config_for_partition(selected_partition)

    def _tags_fn_wrapper(context):
        # NOTE(review): this wrapper only checks for a missing partition, not for
        # membership in get_partition_names() like the two wrappers above — confirm
        # the asymmetry is intentional.
        check.inst_param(context, "context", ScheduleExecutionContext)
        selected_partition = partition_selector(context, self)
        if not selected_partition:
            raise DagsterInvariantViolationError(
                "The partition selection function `{selector}` did not return "
                "a partition from PartitionSet {partition_set}".format(
                    selector=getattr(partition_selector, "__name__",
                                     repr(partition_selector)),
                    partition_set=self.name,
                ))

        return self.tags_for_partition(selected_partition)

    return PartitionScheduleDefinition(
        name=schedule_name,
        cron_schedule=cron_schedule,
        pipeline_name=self.pipeline_name,
        run_config_fn=_run_config_fn_wrapper,
        tags_fn=_tags_fn_wrapper,
        solid_selection=self.solid_selection,
        mode=self.mode,
        should_execute=_should_execute_wrapper,
        environment_vars=environment_vars,
        partition_set=self,
        execution_timezone=execution_timezone,
    )
def hourly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    '''Create a schedule that runs hourly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the
            name of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the
            minutes component will be respected -- the hour should be 0, and will be ignored if
            it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs
            at schedule execution tie to determine whether a schedule should execute or skip. Takes
            a :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when
            executing the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
    '''
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_inst_param(end_date, 'end_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_selection, 'solid_selection', of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars, 'environment_vars', key_type=str, value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    if execution_time.hour != 0:
        # BUG FIX: the message previously had no .format(...) call, so the warning
        # was emitted with literal "{schedule_name}"/"{hour}"/"{minute}" placeholders.
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."
            "Since this is a hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.".format(
                schedule_name=name,
                hour=execution_time.hour,
                minute=execution_time.minute,
            ))

    cron_schedule = '{minute} * * * *'.format(minute=execution_time.minute)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=datetime.timedelta(hours=1),
                                        fmt="%Y-%m-%d-%H:%M")

    def inner(fn):
        # Decorator body: wrap fn as the run-config function of a partition set,
        # then derive the schedule definition from that partition set.
        check.callable_param(fn, 'fn')

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_partitions'.format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            solid_selection=solid_selection,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
def create_execution_plan(pipeline, environment_dict=None): check.inst_param(pipeline, 'pipeline', PipelineDefinition) environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str) environment_config = create_environment_config(pipeline, environment_dict) return create_execution_plan_core(pipeline, environment_config)
def make_dagster_pipeline_from_airflow_dag(
    dag, tags=None, use_airflow_template_context=False, unique_id=None
):
    """Construct a Dagster pipeline corresponding to a given Airflow DAG.

    Tasks in the resulting pipeline will execute the ``execute()`` method on the corresponding
    Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module
    containing your DAG definition must be available in the Python environment within which your
    Dagster solids execute.

    To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,
    either:

    1. (Best for ad hoc runs) Run Pipeline with 'default' preset, which sets execution_date to
       the time (in UTC) of pipeline invocation:

       .. code-block:: python

           execute_pipeline(
               pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),
               preset='default')

    2. Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineDefinition tags. This
       will override behavior from (1).

       .. code-block:: python

            execute_pipeline(
                make_dagster_pipeline_from_airflow_dag(
                    dag=dag,
                    tags={'airflow_execution_date': utc_execution_date_str}
                )
            )

    3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineRun tags,
       such as in the Dagit UI. This will override behavior from (1) and (2).

    We apply normalized_name() to the dag id and task ids when generating pipeline name and solid
    names to ensure that names conform to Dagster's naming conventions.

    Args:
        dag (DAG): The Airflow DAG to compile into a Dagster pipeline
        tags (Dict[str, Field]): Pipeline tags. Optionally include
            `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used
            within execution of Airflow Operators.
        use_airflow_template_context (bool): If True, will call get_template_context() on the
            Airflow TaskInstance model which requires and modifies the DagRun table. (default:
            False)
        unique_id (int): If not None, this id will be postpended to generated solid names. Used by
            framework authors to enforce unique solid names within a repo.

    Returns:
        pipeline_def (PipelineDefinition): The generated Dagster pipeline
    """
    check.inst_param(dag, "dag", DAG)
    tags = check.opt_dict_param(tags, "tags")
    check.bool_param(use_airflow_template_context, "use_airflow_template_context")
    unique_id = check.opt_int_param(unique_id, "unique_id")

    # Mark this pipeline as Airflow-ingest unless the caller already set the tag.
    tags.setdefault(IS_AIRFLOW_INGEST_PIPELINE_STR, "true")
    tags = validate_tags(tags)

    dependencies, solid_defs = _get_pipeline_definition_args(
        dag, use_airflow_template_context, unique_id
    )
    return PipelineDefinition(
        name=normalized_name(dag.dag_id, None),
        solid_defs=solid_defs,
        dependencies=dependencies,
        tags=tags,
    )
def __init__(
    self,
    task_id,
    environment_dict=None,
    pipeline_name=None,
    mode=None,
    step_keys=None,
    dag=None,
    instance_ref=None,
    *args,
    **kwargs
):
    # Validate and normalize operator arguments before delegating to the base pod operator.
    check.str_param(pipeline_name, 'pipeline_name')
    step_keys = check.opt_list_param(step_keys, 'step_keys', of_type=str)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.opt_inst_param(instance_ref, 'instance_ref', InstanceRef)

    # Pod names must be valid DNS labels, so underscores are replaced with dashes.
    kwargs['name'] = 'dagster.{pipeline_name}.{task_id}'.format(
        pipeline_name=pipeline_name, task_id=task_id
    ).replace('_', '-')

    if 'storage' not in environment_dict:
        raise AirflowException(
            'No storage config found -- must configure either filesystem or s3 storage for '
            'the DagsterKubernetesPodOperator. Ex.: \n'
            'storage:\n'
            ' filesystem:\n'
            ' base_dir: \'/some/shared/volume/mount/special_place\''
            '\n\n --or--\n\n'
            'storage:\n'
            ' s3:\n'
            ' s3_bucket: \'my-s3-bucket\'\n'
        )

    check.invariant(
        'in_memory' not in environment_dict.get('storage', {}),
        'Cannot use in-memory storage with Airflow, must use S3',
    )

    self.environment_dict = environment_dict
    self.pipeline_name = pipeline_name
    self.mode = mode
    self.step_keys = step_keys
    self._run_id = None

    # self.instance might be None in, for instance, a unit test setting where the operator
    # was being directly instantiated without passing through make_airflow_dag
    self.instance = DagsterInstance.from_ref(instance_ref) if instance_ref else None

    # Store Airflow DAG run timestamp so that we can pass along via execution metadata
    self.airflow_ts = kwargs.get('ts')

    # Merge AWS credentials into the pod environment without clobbering caller-supplied values.
    self.env_vars = kwargs.get('env_vars', {})
    for env_key, env_value in get_aws_environment().items():
        self.env_vars.setdefault(env_key, env_value)

    # Standard Kubernetes labels; caller-supplied labels always win.
    kwargs.setdefault('labels', {})
    for label_key, label_default in (
        ('dagster_pipeline', self.pipeline_name),
        ('app.kubernetes.io/name', 'dagster'),
        ('app.kubernetes.io/instance', self.pipeline_name),
        ('app.kubernetes.io/version', dagster_version),
        ('app.kubernetes.io/component', 'pipeline-execution'),
        ('app.kubernetes.io/part-of', 'dagster-airflow'),
        ('app.kubernetes.io/managed-by', 'dagster-airflow'),
    ):
        kwargs['labels'].setdefault(label_key, label_default)

    # The xcom mechanism for the pod operator is very unlike that of the Docker operator, so
    # we disable it
    if 'xcom_push' in kwargs:
        self.log.warning(
            'xcom_push cannot be enabled with the DagsterKubernetesPodOperator, disabling'
        )
    kwargs['xcom_push'] = False

    super(DagsterKubernetesPodOperator, self).__init__(
        task_id=task_id, dag=dag, *args, **kwargs
    )
def hourly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"], bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    # BUGFIX: was annotated Optional[str]; the runtime check and docs require a datetime.
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_hours_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:
    """Create a partitioned schedule that runs hourly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on.

    The decorated function should return a run configuration dictionary, which will be used as
    configuration for the scheduled run.

    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_hours_offset (Optional[int]): How many hours back to go when choosing the
            partition for a given schedule execution. For example, when partition_hours_offset=1,
            the schedule that executes during hour N will fill in the partition for hour N-1.
            (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_hours_offset, "partition_hours_offset")
    check.opt_str_param(description, "description")

    if start_date.minute != 0 or start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of the hour for an hourly schedule. "
            "Use `execution_time` to execute the schedule at a specific time within the hour. For "
            "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "
            "on 10/20/2020, your schedule definition would look like:"
            """
@hourly_schedule(
    start_date=datetime.datetime(2020, 10, 20, 3),
    execution_time=datetime.time(0, 15)
):
def my_schedule_definition(_):
    ...
"""
        )

    if execution_time.hour != 0:
        # BUGFIX: the placeholders below were previously never substituted, so users saw
        # literal "{hour}"/"{minute}" in the warning. Format in the actual values.
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."
            "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.".format(
                schedule_name=name, hour=execution_time.hour, minute=execution_time.minute
            )
        )

    cron_schedule = "{minute} * * * *".format(minute=execution_time.minute)

    fmt = (
        DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE
        if execution_timezone
        else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE
    )

    # Map an execution time back to the partition it fills in (offset by partition_hours_offset,
    # aligned to the minute offset between start_date and execution_time).
    execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(
        hours=partition_hours_offset, minutes=(execution_time.minute - start_date.minute) % 60
    )

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        inclusive=(partition_hours_offset == 0),
    )

    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[
            ["Partition"], Optional[Dict[str, str]]
        ] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date
            )
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn,
            ),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
def daily_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    """Create a schedule that runs daily.

    The decorated function is used as the ``run_config_fn_for_partition`` of the underlying
    partition set: it is called with the date of the partition (a ``datetime.datetime``) as its
    only argument and should return the environment dict for that scheduled execution.
    (DOCFIX: previously this docstring incorrectly stated the function receives a
    ``ScheduleExecutionContext``.)

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)

    cron_schedule = "{minute} {hour} * * *".format(
        minute=execution_time.minute, hour=execution_time.hour
    )

    # One partition per day from start_date to end_date (or now, when end_date is None).
    partition_fn = date_partition_range(start_date, end=end_date)

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner