def test_multirun_partition_schedule_definition():
    """Exercise custom partition selectors on a partition-set-backed schedule.

    Two scenarios are checked against the same daily 2020 partition set:

    * a selector returning the last five partitions yields five run requests
      whose run keys are the corresponding dates;
    * a selector returning a partition that is not in the set yields a skip
      message that names the offending partition.
    """

    def _empty_run_config(_partition):
        return {}

    daily_range = date_partition_range(
        start=datetime.datetime(2020, 1, 5),
        end=datetime.datetime(2020, 12, 31),
        delta_range="days",
        inclusive=True,
    )
    time_partitions = PartitionSetDefinition(
        name="test_time",
        pipeline_name="test_pipeline",
        partition_fn=daily_range,
        run_config_fn_for_partition=_empty_run_config,
    )

    # --- Scenario 1: selector returns several valid partitions -------------
    def _last_five_partitions(_context, partition_set_def):
        return partition_set_def.get_partitions()[-5:]

    five_run_schedule = time_partitions.create_schedule_definition(
        "test_schedule", "* * * * *", _last_five_partitions
    )
    expected_run_keys = [
        "2020-12-27",
        "2020-12-28",
        "2020-12-29",
        "2020-12-30",
        "2020-12-31",
    ]

    with instance_for_test() as instance, ScheduleEvaluationContext(
        instance.get_ref(), pendulum.now("UTC")
    ) as tick_context:
        tick_result = five_run_schedule.evaluate_tick(tick_context)
        assert isinstance(tick_result, ScheduleExecutionData)
        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 5
        assert [req.run_key for req in tick_result.run_requests] == expected_run_keys

    # --- Scenario 2: selector returns a partition outside the set ----------
    def _made_up_partition(_context, _partition_set_def):
        bogus_time = create_pendulum_time(year=2019, month=1, day=27, hour=1, minute=25)
        return [Partition(value=bogus_time, name="made_up")]

    bad_schedule = time_partitions.create_schedule_definition(
        "test_schedule", "* * * * *", _made_up_partition
    )

    with instance_for_test() as instance, ScheduleEvaluationContext(
        instance.get_ref(), pendulum.now("UTC")
    ) as tick_context:
        tick_result = bad_schedule.evaluate_tick(tick_context)
        assert isinstance(tick_result, ScheduleExecutionData)
        assert tick_result.skip_message
        assert (
            "Partition selector returned partition not in the partition set: made_up."
            in tick_result.skip_message
        )
def backfill_test_schedule():
    """Build the "backfill_unreliable_weekly" schedule for "unreliable_pipeline".

    The schedule is created from a partition set with a weekly delta range
    starting 2020-01-05. Partition selection is delegated to
    ``backfilling_partition_selector`` and execution gating to
    ``backfill_should_execute``.

    Returns:
        The schedule definition produced by the partition set.
    """

    def _run_config(_partition):
        return {"intermediate_storage": {"filesystem": {}}}

    # Weekly range; 2020-01-05 is the first Sunday of the year.
    weekly_range = date_partition_range(
        start=datetime.datetime(2020, 1, 5),
        delta_range="weeks",
    )
    weekly_partitions = PartitionSetDefinition(
        name="unreliable_weekly",
        pipeline_name="unreliable_pipeline",
        partition_fn=weekly_range,
        run_config_fn_for_partition=_run_config,
    )

    return weekly_partitions.create_schedule_definition(
        schedule_name="backfill_unreliable_weekly",
        cron_schedule="* * * * *",  # tick every minute
        partition_selector=backfilling_partition_selector,
        should_execute=lambda context: backfill_should_execute(context, weekly_partitions),
        execution_timezone=_toys_tz_info(),
    )
def backfill_test_schedule():
    """Build the 'backfill_unreliable_weekly' schedule for 'unreliable_pipeline'.

    Partitions are one week apart starting 2020-01-05. The schedule name is
    forwarded to ``backfill_should_execute`` together with the partition set.

    Returns:
        The schedule definition produced by the partition set.
    """
    name = 'backfill_unreliable_weekly'

    def _environment_dict(_partition):
        return {'storage': {'filesystem': {}}}

    # One-week step; 2020-01-05 is the first Sunday of the year.
    weekly_range = date_partition_range(
        start=datetime.datetime(2020, 1, 5),
        delta=datetime.timedelta(weeks=1),
    )
    weekly_partitions = PartitionSetDefinition(
        name='unreliable_weekly',
        pipeline_name='unreliable_pipeline',
        partition_fn=weekly_range,
        environment_dict_fn_for_partition=_environment_dict,
    )

    return weekly_partitions.create_schedule_definition(
        schedule_name=name,
        cron_schedule="* * * * *",  # tick every minute
        partition_selector=backfilling_partition_selector,
        should_execute=lambda context: backfill_should_execute(context, weekly_partitions, name),
    )
def longitudinal_schedule():
    """Build the "longitudinal_demo" schedule for "longitudinal_pipeline".

    Both the execution gate and the partition selector wrap their backfill
    counterparts with ``retry_failed=True``.

    Returns:
        The schedule definition produced by the partition set.
    """
    # Imported lazily, inside the function, as in the rest of this module.
    from .longitudinal import longitudinal_config

    ingest_partitions = PartitionSetDefinition(
        name="ingest_and_train",
        pipeline_name="longitudinal_pipeline",
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
        run_config_fn_for_partition=longitudinal_config,
    )

    def _gate(context):
        # Closes over the partition set built above.
        return backfill_should_execute(context, ingest_partitions, retry_failed=True)

    def _select(context, partition_set):
        # Uses whichever partition set the caller hands in.
        return backfilling_partition_selector(context, partition_set, retry_failed=True)

    return ingest_partitions.create_schedule_definition(
        schedule_name="longitudinal_demo",
        cron_schedule="*/5 * * * *",  # tick every 5 minutes
        partition_selector=_select,
        should_execute=_gate,
        execution_timezone=_toys_tz_info(),
    )
def define_bar_schedules():
    """Return a name -> schedule mapping with one plain and one partitioned schedule.

    Returns:
        dict: ``"foo_schedule"`` maps to a plain ``ScheduleDefinition`` and
        ``"partitioned_schedule"`` to a schedule created from a partition set
        whose partition function returns ``string.digits``.
    """

    def _digit_partitions():
        return string.digits

    digit_partition_set = PartitionSetDefinition(
        name="scheduled_partitions",
        pipeline_name="partitioned_scheduled_pipeline",
        partition_fn=_digit_partitions,
    )

    schedules = {}
    schedules["foo_schedule"] = ScheduleDefinition(
        "foo_schedule",
        cron_schedule="* * * * *",
        pipeline_name="test_pipeline",
        run_config={},
    )
    schedules["partitioned_schedule"] = digit_partition_set.create_schedule_definition(
        schedule_name="partitioned_schedule",
        cron_schedule="* * * * *",
    )
    return schedules
def materialization_schedule():
    """Build the 'many_events_partitioned' schedule for the 'many_events' pipeline.

    Partition selection and execution gating are delegated to the backfill
    helpers.

    NOTE(review): the partition set is named 'many_events_minutely' and an
    earlier comment here called it "weekly", yet no delta is passed to
    ``date_partition_range`` -- confirm the intended partition granularity.

    Returns:
        The schedule definition produced by the partition set.
    """

    def _run_config(_partition):
        return {'storage': {'filesystem': {}}}

    events_partitions = PartitionSetDefinition(
        name='many_events_minutely',
        pipeline_name='many_events',
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
        run_config_fn_for_partition=_run_config,
    )

    return events_partitions.create_schedule_definition(
        schedule_name='many_events_partitioned',
        cron_schedule="* * * * *",  # tick every minute
        partition_selector=backfilling_partition_selector,
        should_execute=lambda context: backfill_should_execute(context, events_partitions),
    )
def materialization_schedule():
    """Build the "many_events_partitioned" schedule for the "many_events" pipeline.

    Partition selection and execution gating are delegated to the backfill
    helpers; the execution timezone comes from ``_toys_tz_info()``.

    NOTE(review): the partition set is named "many_events_minutely" and an
    earlier comment here called it "weekly", yet no delta is passed to
    ``date_partition_range`` -- confirm the intended partition granularity.

    Returns:
        The schedule definition produced by the partition set.
    """
    events_range = date_partition_range(start=datetime.datetime(2020, 1, 1))
    events_partitions = PartitionSetDefinition(
        name="many_events_minutely",
        pipeline_name="many_events",
        partition_fn=events_range,
    )

    return events_partitions.create_schedule_definition(
        schedule_name="many_events_partitioned",
        cron_schedule="* * * * *",  # tick every minute
        partition_selector=backfilling_partition_selector,
        should_execute=lambda context: backfill_should_execute(context, events_partitions),
        execution_timezone=_toys_tz_info(),
    )
def longitudinal_schedule():
    """Build the 'longitudinal_demo' schedule for 'longitudinal_pipeline'.

    The schedule name is forwarded to ``backfill_should_execute`` together
    with the partition set; partitions are picked by
    ``backfilling_partition_selector``.

    Returns:
        The schedule definition produced by the partition set.
    """
    # Imported lazily, inside the function.
    from .toys.longitudinal import longitudinal_config

    name = 'longitudinal_demo'
    ingest_partitions = PartitionSetDefinition(
        name='ingest_and_train',
        pipeline_name='longitudinal_pipeline',
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
        environment_dict_fn_for_partition=longitudinal_config,
    )

    return ingest_partitions.create_schedule_definition(
        schedule_name=name,
        cron_schedule="* * * * *",  # tick every minute
        partition_selector=backfilling_partition_selector,
        should_execute=lambda context: backfill_should_execute(context, ingest_partitions, name),
    )
def define_schedules():
    """Build the list of schedule definitions used as fixtures.

    The list mixes plain ``ScheduleDefinition`` objects, one schedule created
    from an integer partition set, and decorator-based hourly / daily /
    weekly / monthly schedules. Three of the decorated schedules reference the
    undefined name ``asdf`` on purpose so that the user-code error boundary
    can be exercised.

    Returns:
        list: the schedule definitions, in a fixed order.
    """
    midnight_cron = "0 0 * * *"

    def _midnight_days_ago(days):
        # Recomputed on every call so each schedule gets its own fresh value.
        return today_at_midnight().subtract(days=days)

    def _time_hours_from_now(hours):
        # Wall-clock time-of-day `hours` from the moment of the call.
        return (datetime.datetime.now() + datetime.timedelta(hours=hours)).time()

    def _integer_partitions():
        return [Partition(number) for number in range(1, 10)]

    def _partition_tags(_partition):
        return {"test": "1234"}

    integer_partition_set = PartitionSetDefinition(
        name="scheduled_integer_partitions",
        pipeline_name="no_config_pipeline",
        partition_fn=_integer_partitions,
        tags_fn_for_partition=_partition_tags,
    )

    # ---- Plain schedule definitions on "no_config_pipeline" ----------------
    no_config_pipeline_hourly_schedule = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
    )
    no_config_pipeline_hourly_schedule_with_config_fn = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule_with_config_fn",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
    )
    no_config_should_execute = ScheduleDefinition(
        name="no_config_should_execute",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
        should_execute=lambda _context: False,
    )
    dynamic_config = ScheduleDefinition(
        name="dynamic_config",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
    )

    # ---- Partition-set-backed schedule ---------------------------------------
    partition_based = integer_partition_set.create_schedule_definition(
        schedule_name="partition_based",
        cron_schedule=midnight_cron,
        partition_selector=last_empty_partition,
    )

    # ---- Decorator-based schedules -------------------------------------------
    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=_midnight_days_ago(1),
        execution_time=_time_hours_from_now(2),
    )
    def partition_based_decorator(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=_midnight_days_ago(1),
        execution_time=_time_hours_from_now(2),
        default_status=DefaultScheduleStatus.RUNNING,
    )
    def running_in_code_schedule(_date):
        return {}

    @daily_schedule(
        pipeline_name="multi_mode_with_loggers",
        start_date=_midnight_days_ago(1),
        execution_time=_time_hours_from_now(2),
        mode="foo_mode",
    )
    def partition_based_multi_mode_decorator(_date):
        return {}

    @hourly_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=_midnight_days_ago(1),
        execution_time=_time_hours_from_now(2),
        solid_selection=["return_foo"],
    )
    def solid_selection_hourly_decorator(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=_midnight_days_ago(2),
        execution_time=_time_hours_from_now(3),
        solid_selection=["return_foo"],
    )
    def solid_selection_daily_decorator(_date):
        return {}

    @monthly_schedule(
        pipeline_name="no_config_chain_pipeline",
        # Snapped to the first day of the month.
        start_date=_midnight_days_ago(100).replace(day=1),
        execution_time=_time_hours_from_now(4),
        solid_selection=["return_foo"],
    )
    def solid_selection_monthly_decorator(_date):
        return {}

    @weekly_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=_midnight_days_ago(50),
        execution_time=_time_hours_from_now(5),
        solid_selection=["return_foo"],
    )
    def solid_selection_weekly_decorator(_date):
        return {}

    # ---- Schedules for testing the user error boundary -----------------------
    # `asdf` is deliberately undefined so the callbacks raise when invoked.
    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=_midnight_days_ago(1),
        should_execute=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def should_execute_error_schedule(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=_midnight_days_ago(1),
        tags_fn_for_date=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def tags_error_schedule(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=_midnight_days_ago(1),
    )
    def run_config_error_schedule(_date):
        return asdf  # pylint: disable=undefined-variable

    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight("US/Central") - datetime.timedelta(days=1),
        execution_timezone="US/Central",
    )
    def timezone_schedule(_date):
        return {}

    # ---- Schedules on other pipelines ----------------------------------------
    tagged_pipeline_schedule = ScheduleDefinition(
        name="tagged_pipeline_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="tagged_pipeline",
    )
    tagged_pipeline_override_schedule = ScheduleDefinition(
        name="tagged_pipeline_override_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="tagged_pipeline",
        tags={"foo": "notbar"},
    )
    invalid_config_schedule = ScheduleDefinition(
        name="invalid_config_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="pipeline_with_enum_config",
        run_config={"solids": {"takes_an_enum": {"config": "invalid"}}},
    )

    all_schedules = [
        run_config_error_schedule,
        no_config_pipeline_hourly_schedule,
        no_config_pipeline_hourly_schedule_with_config_fn,
        no_config_should_execute,
        dynamic_config,
        partition_based,
        partition_based_decorator,
        partition_based_multi_mode_decorator,
        solid_selection_hourly_decorator,
        solid_selection_daily_decorator,
        solid_selection_monthly_decorator,
        solid_selection_weekly_decorator,
        should_execute_error_schedule,
        tagged_pipeline_schedule,
        tagged_pipeline_override_schedule,
        tags_error_schedule,
        timezone_schedule,
        invalid_config_schedule,
        running_in_code_schedule,
    ]
    return all_schedules
def define_schedules():
    """Build the list of schedule definitions used as fixtures.

    The list mixes plain ``ScheduleDefinition`` objects, two schedules created
    from an integer partition set (with and without a custom selector), and
    decorator-based hourly / daily / weekly / monthly schedules. Three of the
    decorated schedules reference the undefined name ``asdf`` on purpose so
    that the user-code error boundary can be exercised.

    Returns:
        list: the schedule definitions, in a fixed order.
    """
    midnight_cron = "0 0 * * *"

    def _filesystem_storage():
        # A new dict per call so no two schedules share a config object.
        return {"storage": {"filesystem": {}}}

    def _days_ago(days):
        # Recomputed on every call so each schedule gets its own fresh value.
        return datetime.datetime.now() - datetime.timedelta(days=days)

    def _time_hours_from_now(hours):
        # Wall-clock time-of-day `hours` from the moment of the call.
        return (datetime.datetime.now() + datetime.timedelta(hours=hours)).time()

    def _integer_partitions():
        return [Partition(number) for number in range(1, 10)]

    integer_partition_set = PartitionSetDefinition(
        name='scheduled_integer_partitions',
        pipeline_name='no_config_pipeline',
        partition_fn=_integer_partitions,
        environment_dict_fn_for_partition=lambda _partition: _filesystem_storage(),
        tags_fn_for_partition=lambda _partition: {"test": "1234"},
    )

    # ---- Plain schedule definitions on "no_config_pipeline" ----------------
    no_config_pipeline_hourly_schedule = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
        environment_dict=_filesystem_storage(),
    )
    no_config_pipeline_hourly_schedule_with_config_fn = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule_with_config_fn",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
        environment_dict_fn=lambda _context: _filesystem_storage(),
    )
    no_config_should_execute = ScheduleDefinition(
        name="no_config_should_execute",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
        environment_dict=_filesystem_storage(),
        should_execute=lambda _context: False,
    )
    dynamic_config = ScheduleDefinition(
        name="dynamic_config",
        cron_schedule=midnight_cron,
        pipeline_name="no_config_pipeline",
        environment_dict_fn=lambda _context: _filesystem_storage(),
    )

    # ---- Partition-set-backed schedules --------------------------------------
    partition_based = integer_partition_set.create_schedule_definition(
        schedule_name="partition_based",
        cron_schedule=midnight_cron,
    )
    partition_based_custom_selector = integer_partition_set.create_schedule_definition(
        schedule_name="partition_based_custom_selector",
        cron_schedule=midnight_cron,
        partition_selector=last_empty_partition,
    )

    # ---- Decorator-based schedules -------------------------------------------
    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=_days_ago(1),
        execution_time=_time_hours_from_now(2),
    )
    def partition_based_decorator(_date):
        return _filesystem_storage()

    @daily_schedule(
        pipeline_name='multi_mode_with_loggers',
        start_date=_days_ago(1),
        execution_time=_time_hours_from_now(2),
        mode='foo_mode',
    )
    def partition_based_multi_mode_decorator(_date):
        return _filesystem_storage()

    @hourly_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=_days_ago(1),
        execution_time=_time_hours_from_now(2),
        solid_selection=['return_foo'],
    )
    def solid_selection_hourly_decorator(_date):
        return _filesystem_storage()

    @daily_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=_days_ago(2),
        execution_time=_time_hours_from_now(3),
        solid_selection=['return_foo'],
    )
    def solid_selection_daily_decorator(_date):
        return _filesystem_storage()

    @monthly_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=_days_ago(100),
        execution_time=_time_hours_from_now(4),
        solid_selection=['return_foo'],
    )
    def solid_selection_monthly_decorator(_date):
        return _filesystem_storage()

    @weekly_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=_days_ago(50),
        execution_time=_time_hours_from_now(5),
        solid_selection=['return_foo'],
    )
    def solid_selection_weekly_decorator(_date):
        return _filesystem_storage()

    # ---- Schedules for testing the user error boundary -----------------------
    # `asdf` is deliberately undefined so the callbacks raise when invoked.
    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=_days_ago(1),
        should_execute=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def should_execute_error_schedule(_date):
        return _filesystem_storage()

    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=_days_ago(1),
        tags_fn_for_date=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def tags_error_schedule(_date):
        return _filesystem_storage()

    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=_days_ago(1),
    )
    def environment_dict_error_schedule(_date):
        return asdf  # pylint: disable=undefined-variable

    # ---- Schedules on other pipelines ----------------------------------------
    tagged_pipeline_schedule = ScheduleDefinition(
        name="tagged_pipeline_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="tagged_pipeline",
        environment_dict=_filesystem_storage(),
    )
    tagged_pipeline_override_schedule = ScheduleDefinition(
        name="tagged_pipeline_override_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="tagged_pipeline",
        environment_dict=_filesystem_storage(),
        tags={'foo': 'notbar'},
    )
    invalid_config_schedule = ScheduleDefinition(
        name="invalid_config_schedule",
        cron_schedule=midnight_cron,
        pipeline_name="pipeline_with_enum_config",
        environment_dict={"solids": {"takes_an_enum": {'config': "invalid"}}},
    )

    all_schedules = [
        environment_dict_error_schedule,
        no_config_pipeline_hourly_schedule,
        no_config_pipeline_hourly_schedule_with_config_fn,
        no_config_should_execute,
        dynamic_config,
        partition_based,
        partition_based_custom_selector,
        partition_based_decorator,
        partition_based_multi_mode_decorator,
        solid_selection_hourly_decorator,
        solid_selection_daily_decorator,
        solid_selection_monthly_decorator,
        solid_selection_weekly_decorator,
        should_execute_error_schedule,
        tagged_pipeline_schedule,
        tagged_pipeline_override_schedule,
        tags_error_schedule,
        invalid_config_schedule,
    ]
    return all_schedules
def weekday_partition_selector(
    ctx: ScheduleExecutionContext, partition_set: PartitionSetDefinition
) -> Union[Partition, List[Partition]]:
    """Pick the partition whose index equals the weekday of the scheduled tick.

    The partitions are fetched for the context's scheduled execution time and
    indexed by that time's ``weekday()``. When the context carries no
    scheduled execution time, index 0 is used.
    """
    scheduled_time = ctx.scheduled_execution_time
    available = partition_set.get_partitions(scheduled_time)
    if scheduled_time:
        index = scheduled_time.weekday()
    else:
        index = 0
    return available[index]


# Cron "5 0 * * *", evaluated in the US/Eastern timezone.
my_schedule = weekday_partition_set.create_schedule_definition(
    schedule_name="my_schedule",
    cron_schedule="5 0 * * *",
    partition_selector=weekday_partition_selector,
    execution_timezone="US/Eastern",
)


@repository
def my_repository_with_partitioned_schedule():
    """Return the pipeline, its partition set and the schedule built from them."""
    definitions = [my_data_pipeline, weekday_partition_set, my_schedule]
    return definitions


# end_manual_partition_schedule
def run_config_for_date_partition(partition):
    """Build the run config for one date partition.

    Loads the base config from ``realized_trips.yaml`` (located next to this
    module) and injects the partition's value as the ``date`` config of the
    ``download_brt_raw_realized_trips`` solid.

    Args:
        partition: the partition being executed; its ``value`` is used as the
            date.

    Returns:
        dict: the loaded config with the date filled in.
    """
    date = partition.value
    config_path = Path(__file__).parent / "realized_trips.yaml"
    # Use a context manager so the file handle is always closed, and
    # `safe_load` because `yaml.load` without an explicit Loader is deprecated
    # (and rejected outright by PyYAML >= 6).
    with open(config_path, "r") as config_file:
        config = yaml.safe_load(config_file)
    config["solids"]["download_brt_raw_realized_trips"]["config"]["date"] = date
    return config


# Partition set for the realized-trips pipeline in "dev" mode; partitions come
# from `get_date_partitions`.
daily_partition_set = PartitionSetDefinition(
    name="daily_partitions",
    pipeline_name="br_rj_riodejaneiro_gtfs_realized_trips",
    partition_fn=get_date_partitions,
    run_config_fn_for_partition=run_config_for_date_partition,
    mode="dev",
)


def daily_partition_selector(context, partition_set):
    """Select the second-to-last partition available at the scheduled time."""
    partitions = partition_set.get_partitions(context.scheduled_execution_time)
    return partitions[-2]  # one day before run day (today)


# Cron "0 1 * * *", evaluated in the America/Sao_Paulo timezone.
daily_schedule = daily_partition_set.create_schedule_definition(
    "br_rj_riodejaneiro_gtfs_realized_trips",
    "0 1 * * *",
    partition_selector=daily_partition_selector,
    execution_timezone="America/Sao_Paulo",
)