예제 #1
0
def test_multirun_partition_schedule_definition():
    """Exercise partition-set schedules whose selector returns a list of partitions.

    First a selector returning the last five partitions of the set is evaluated
    and must yield five run requests whose run keys are the matching dates.
    Then a selector returning a partition that is not part of the set is
    evaluated and must yield a skip message naming the offending partition.
    """
    daily_range = date_partition_range(
        start=datetime.datetime(2020, 1, 5),
        end=datetime.datetime(2020, 12, 31),
        delta_range="days",
        inclusive=True,
    )
    partition_set = PartitionSetDefinition(
        name="test_time",
        pipeline_name="test_pipeline",
        partition_fn=daily_range,
        run_config_fn_for_partition=lambda _: {},
    )

    def _last_five_partitions(_context, partition_set_def):
        # Tail slice of whatever the partition set currently reports.
        all_partitions = partition_set_def.get_partitions()
        return all_partitions[-5:]

    multi_run_schedule = partition_set.create_schedule_definition(
        "test_schedule",
        "* * * * *",
        _last_five_partitions,
    )

    expected_run_keys = [
        "2020-12-27",
        "2020-12-28",
        "2020-12-29",
        "2020-12-30",
        "2020-12-31",
    ]

    with instance_for_test() as instance:
        with ScheduleEvaluationContext(
            instance.get_ref(), pendulum.now("UTC")
        ) as schedule_context:
            tick_result = multi_run_schedule.evaluate_tick(schedule_context)
            assert isinstance(tick_result, ScheduleExecutionData)
            assert tick_result.run_requests
            assert len(tick_result.run_requests) == 5
            actual_run_keys = [req.run_key for req in tick_result.run_requests]
            assert actual_run_keys == expected_run_keys

    def _made_up_partition_selector(_context, _partition_set_def):
        # A partition dated 2019-01-27 01:25 and named "made_up".
        bogus_time = create_pendulum_time(
            year=2019, month=1, day=27, hour=1, minute=25
        )
        return [Partition(value=bogus_time, name="made_up")]

    invalid_schedule = partition_set.create_schedule_definition(
        "test_schedule",
        "* * * * *",
        _made_up_partition_selector,
    )

    with instance_for_test() as instance:
        with ScheduleEvaluationContext(
            instance.get_ref(), pendulum.now("UTC")
        ) as schedule_context:
            tick_result = invalid_schedule.evaluate_tick(schedule_context)
            assert isinstance(tick_result, ScheduleExecutionData)
            assert tick_result.skip_message
            expected_fragment = (
                "Partition selector returned partition not in the partition set: made_up."
            )
            assert expected_fragment in tick_result.skip_message
예제 #2
0
def backfill_test_schedule():
    """Return the "backfill_unreliable_weekly" schedule for "unreliable_pipeline".

    The schedule is created from a partition set with one partition per week,
    ticks every minute, picks its partition with
    ``backfilling_partition_selector`` and only executes when
    ``backfill_should_execute`` says so.
    """

    def _run_config_for_partition(_partition):
        # Same run config for every partition.
        return {"intermediate_storage": {"filesystem": {}}}

    weekly_partitions = PartitionSetDefinition(
        name="unreliable_weekly",
        pipeline_name="unreliable_pipeline",
        partition_fn=date_partition_range(
            # 2020-01-05 is the first Sunday of 2020
            start=datetime.datetime(2020, 1, 5),
            delta_range="weeks",
        ),
        run_config_fn_for_partition=_run_config_for_partition,
    )

    def _should_execute(context):
        return backfill_should_execute(context, weekly_partitions)

    return weekly_partitions.create_schedule_definition(
        schedule_name="backfill_unreliable_weekly",
        cron_schedule="* * * * *",  # one tick per minute
        partition_selector=backfilling_partition_selector,
        should_execute=_should_execute,
        execution_timezone=_toys_tz_info(),
    )
예제 #3
0
def backfill_test_schedule():
    """Return the 'backfill_unreliable_weekly' schedule for 'unreliable_pipeline'.

    The schedule is created from a partition set with one partition per week,
    ticks every minute, picks its partition with
    ``backfilling_partition_selector`` and only executes when
    ``backfill_should_execute`` says so for this schedule name.
    """
    schedule_name = 'backfill_unreliable_weekly'

    def _environment_dict_for_partition(_partition):
        # Same environment dict for every partition.
        return {'storage': {'filesystem': {}}}

    weekly_partitions = PartitionSetDefinition(
        name='unreliable_weekly',
        pipeline_name='unreliable_pipeline',
        partition_fn=date_partition_range(
            # 2020-01-05 is the first Sunday of 2020
            start=datetime.datetime(2020, 1, 5),
            delta=datetime.timedelta(weeks=1),
        ),
        environment_dict_fn_for_partition=_environment_dict_for_partition,
    )

    def _should_execute(context):
        return backfill_should_execute(context, weekly_partitions, schedule_name)

    return weekly_partitions.create_schedule_definition(
        schedule_name=schedule_name,
        cron_schedule="* * * * *",  # one tick per minute
        partition_selector=backfilling_partition_selector,
        should_execute=_should_execute,
    )
예제 #4
0
def longitudinal_schedule():
    """Return the "longitudinal_demo" schedule for "longitudinal_pipeline".

    The schedule ticks every five minutes. Both the partition selector and the
    should-execute check delegate to the backfill helpers with
    ``retry_failed=True``.
    """
    from .longitudinal import longitudinal_config

    ingest_partitions = PartitionSetDefinition(
        name="ingest_and_train",
        pipeline_name="longitudinal_pipeline",
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
        run_config_fn_for_partition=longitudinal_config,
    )

    def _should_execute(context):
        # Checks against the partition set built above.
        return backfill_should_execute(
            context, ingest_partitions, retry_failed=True
        )

    def _partition_selector(context, partition_set):
        # Uses the partition set handed in by the caller of the selector.
        return backfilling_partition_selector(
            context, partition_set, retry_failed=True
        )

    return ingest_partitions.create_schedule_definition(
        schedule_name="longitudinal_demo",
        cron_schedule="*/5 * * * *",  # one tick every 5 minutes
        partition_selector=_partition_selector,
        should_execute=_should_execute,
        execution_timezone=_toys_tz_info(),
    )
예제 #5
0
def define_bar_schedules():
    """Return a dict of two every-minute schedules keyed by schedule name.

    "foo_schedule" is a plain schedule for "test_pipeline" with an empty run
    config; "partitioned_schedule" is created from a partition set whose
    partition function returns ``string.digits``.
    """
    digit_partitions = PartitionSetDefinition(
        name="scheduled_partitions",
        pipeline_name="partitioned_scheduled_pipeline",
        partition_fn=lambda: string.digits,
    )

    foo_schedule = ScheduleDefinition(
        "foo_schedule",
        cron_schedule="* * * * *",
        pipeline_name="test_pipeline",
        run_config={},
    )
    partitioned_schedule = digit_partitions.create_schedule_definition(
        schedule_name="partitioned_schedule",
        cron_schedule="* * * * *",
    )

    return {
        "foo_schedule": foo_schedule,
        "partitioned_schedule": partitioned_schedule,
    }
예제 #6
0
def materialization_schedule():
    """Return the 'many_events_partitioned' schedule for the 'many_events' pipeline.

    The partition set covers dates from 2020-01-01 onward; no delta is passed,
    so the default step of ``date_partition_range`` applies. The schedule ticks
    every minute and uses the backfill helpers to select the partition and to
    decide whether to execute.
    """

    def _environment_for_partition(_partition):
        # Same run config for every partition.
        return {'storage': {'filesystem': {}}}

    event_partitions = PartitionSetDefinition(
        name='many_events_minutely',
        pipeline_name='many_events',
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
        run_config_fn_for_partition=_environment_for_partition,
    )

    def _should_execute(context):
        return backfill_should_execute(context, event_partitions)

    return event_partitions.create_schedule_definition(
        schedule_name='many_events_partitioned',
        cron_schedule="* * * * *",  # one tick per minute
        partition_selector=backfilling_partition_selector,
        should_execute=_should_execute,
    )
예제 #7
0
def materialization_schedule():
    """Return the "many_events_partitioned" schedule for the "many_events" pipeline.

    The partition set covers dates from 2020-01-01 onward; no delta is passed,
    so the default step of ``date_partition_range`` applies. The schedule ticks
    every minute in the timezone given by ``_toys_tz_info()`` and uses the
    backfill helpers to select the partition and to decide whether to execute.
    """
    event_partitions = PartitionSetDefinition(
        name="many_events_minutely",
        pipeline_name="many_events",
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
    )

    def _should_execute(context):
        return backfill_should_execute(context, event_partitions)

    return event_partitions.create_schedule_definition(
        schedule_name="many_events_partitioned",
        cron_schedule="* * * * *",  # one tick per minute
        partition_selector=backfilling_partition_selector,
        should_execute=_should_execute,
        execution_timezone=_toys_tz_info(),
    )
예제 #8
0
파일: schedules.py 프로젝트: cy56/dagster
def longitudinal_schedule():
    """Return the 'longitudinal_demo' schedule for 'longitudinal_pipeline'.

    The schedule ticks every minute, selects its partition with
    ``backfilling_partition_selector`` and executes only when
    ``backfill_should_execute`` approves for this schedule name.
    """
    from .toys.longitudinal import longitudinal_config

    schedule_name = 'longitudinal_demo'

    ingest_partitions = PartitionSetDefinition(
        name='ingest_and_train',
        pipeline_name='longitudinal_pipeline',
        partition_fn=date_partition_range(start=datetime.datetime(2020, 1, 1)),
        environment_dict_fn_for_partition=longitudinal_config,
    )

    def _should_execute(context):
        return backfill_should_execute(context, ingest_partitions, schedule_name)

    return ingest_partitions.create_schedule_definition(
        schedule_name=schedule_name,
        cron_schedule="* * * * *",  # one tick per minute
        partition_selector=backfilling_partition_selector,
        should_execute=_should_execute,
    )
예제 #9
0
파일: setup.py 프로젝트: helloworld/dagster
def define_schedules():
    """Build and return the list of schedule definitions declared in this function.

    The list mixes plain ``ScheduleDefinition`` objects, one schedule created
    from an integer partition set, and schedules produced by the
    ``@hourly_schedule`` / ``@daily_schedule`` / ``@weekly_schedule`` /
    ``@monthly_schedule`` decorators, including several that reference an
    undefined name on purpose to exercise the user error boundary.

    Returns:
        list: the schedule definitions, in the order given by the final
        ``return`` statement.
    """
    # Integer partitions 1..9; every partition gets the tag {"test": "1234"}.
    integer_partition_set = PartitionSetDefinition(
        name="scheduled_integer_partitions",
        pipeline_name="no_config_pipeline",
        partition_fn=lambda: [Partition(x) for x in range(1, 10)],
        tags_fn_for_partition=lambda _partition: {"test": "1234"},
    )

    # NOTE(review): the name says "hourly" but the cron expression "0 0 * * *"
    # fires once a day at midnight -- confirm this is intended.
    no_config_pipeline_hourly_schedule = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
    )

    # NOTE(review): despite the "_with_config_fn" suffix, no config function is
    # passed here.
    no_config_pipeline_hourly_schedule_with_config_fn = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule_with_config_fn",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
    )

    # should_execute always returns False.
    no_config_should_execute = ScheduleDefinition(
        name="no_config_should_execute",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
        should_execute=lambda _context: False,
    )

    dynamic_config = ScheduleDefinition(
        name="dynamic_config",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
    )

    # Schedule derived from the integer partition set, using the
    # last_empty_partition selector.
    partition_based = integer_partition_set.create_schedule_definition(
        schedule_name="partition_based",
        cron_schedule="0 0 * * *",
        partition_selector=last_empty_partition,
    )

    # In the decorated schedules below, start_date and execution_time are
    # computed from the current date/time when this function runs.
    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight().subtract(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
    )
    def partition_based_decorator(_date):
        return {}

    # Same as above, but declared with a RUNNING default status.
    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight().subtract(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
        default_status=DefaultScheduleStatus.RUNNING,
    )
    def running_in_code_schedule(_date):
        return {}

    # Daily schedule that sets an explicit mode.
    @daily_schedule(
        pipeline_name="multi_mode_with_loggers",
        start_date=today_at_midnight().subtract(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
        mode="foo_mode",
    )
    def partition_based_multi_mode_decorator(_date):
        return {}

    # The next four schedules restrict execution to the "return_foo" solid,
    # one per schedule frequency (hourly, daily, monthly, weekly).
    @hourly_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=today_at_midnight().subtract(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
        solid_selection=["return_foo"],
    )
    def solid_selection_hourly_decorator(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=today_at_midnight().subtract(days=2),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=3)).time(),
        solid_selection=["return_foo"],
    )
    def solid_selection_daily_decorator(_date):
        return {}

    # start_date is moved to the first day of its month via replace(day=1).
    @monthly_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=(today_at_midnight().subtract(days=100)).replace(day=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=4)).time(),
        solid_selection=["return_foo"],
    )
    def solid_selection_monthly_decorator(_date):
        return {}

    @weekly_schedule(
        pipeline_name="no_config_chain_pipeline",
        start_date=today_at_midnight().subtract(days=50),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=5)).time(),
        solid_selection=["return_foo"],
    )
    def solid_selection_weekly_decorator(_date):
        return {}

    # Schedules for testing the user error boundary.
    # `asdf` is deliberately undefined, so each callback below raises
    # NameError when it is invoked.
    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight().subtract(days=1),
        should_execute=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def should_execute_error_schedule(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight().subtract(days=1),
        tags_fn_for_date=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def tags_error_schedule(_date):
        return {}

    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight().subtract(days=1),
    )
    def run_config_error_schedule(_date):
        return asdf  # pylint: disable=undefined-variable

    # Daily schedule with an explicit execution timezone; the start date is
    # computed from midnight in that same timezone.
    @daily_schedule(
        pipeline_name="no_config_pipeline",
        start_date=today_at_midnight("US/Central") -
        datetime.timedelta(days=1),
        execution_timezone="US/Central",
    )
    def timezone_schedule(_date):
        return {}

    tagged_pipeline_schedule = ScheduleDefinition(
        name="tagged_pipeline_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="tagged_pipeline",
    )

    # Same pipeline as above, but the schedule supplies its own "foo" tag.
    tagged_pipeline_override_schedule = ScheduleDefinition(
        name="tagged_pipeline_override_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="tagged_pipeline",
        tags={"foo": "notbar"},
    )

    # presumably "invalid" is not an accepted value for the enum config of
    # "takes_an_enum" -- verify against the pipeline definition.
    invalid_config_schedule = ScheduleDefinition(
        name="invalid_config_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="pipeline_with_enum_config",
        run_config={"solids": {
            "takes_an_enum": {
                "config": "invalid"
            }
        }},
    )

    return [
        run_config_error_schedule,
        no_config_pipeline_hourly_schedule,
        no_config_pipeline_hourly_schedule_with_config_fn,
        no_config_should_execute,
        dynamic_config,
        partition_based,
        partition_based_decorator,
        partition_based_multi_mode_decorator,
        solid_selection_hourly_decorator,
        solid_selection_daily_decorator,
        solid_selection_monthly_decorator,
        solid_selection_weekly_decorator,
        should_execute_error_schedule,
        tagged_pipeline_schedule,
        tagged_pipeline_override_schedule,
        tags_error_schedule,
        timezone_schedule,
        invalid_config_schedule,
        running_in_code_schedule,
    ]
예제 #10
0
def define_schedules():
    """Build and return the list of schedule definitions declared in this function.

    The list mixes plain ``ScheduleDefinition`` objects, two schedules created
    from an integer partition set (with and without a custom selector), and
    schedules produced by the ``@hourly_schedule`` / ``@daily_schedule`` /
    ``@weekly_schedule`` / ``@monthly_schedule`` decorators, including several
    that reference an undefined name on purpose to exercise the user error
    boundary. Most of them use filesystem storage in their environment dict.

    Returns:
        list: the schedule definitions, in the order given by the final
        ``return`` statement.
    """
    # Integer partitions 1..9; every partition gets the same environment dict
    # and the tag {"test": "1234"}.
    integer_partition_set = PartitionSetDefinition(
        name='scheduled_integer_partitions',
        pipeline_name='no_config_pipeline',
        partition_fn=lambda: [Partition(x) for x in range(1, 10)],
        environment_dict_fn_for_partition=lambda _partition:
        {"storage": {
            "filesystem": {}
        }},
        tags_fn_for_partition=lambda _partition: {"test": "1234"},
    )

    # NOTE(review): the name says "hourly" but the cron expression "0 0 * * *"
    # fires once a day at midnight -- confirm this is intended.
    no_config_pipeline_hourly_schedule = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
        environment_dict={"storage": {
            "filesystem": {}
        }},
    )

    # Same schedule, but the environment dict comes from a function of the
    # context instead of a static dict.
    no_config_pipeline_hourly_schedule_with_config_fn = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule_with_config_fn",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
        environment_dict_fn=lambda _context: {"storage": {
            "filesystem": {}
        }},
    )

    # should_execute always returns False.
    no_config_should_execute = ScheduleDefinition(
        name="no_config_should_execute",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
        environment_dict={"storage": {
            "filesystem": {}
        }},
        should_execute=lambda _context: False,
    )

    dynamic_config = ScheduleDefinition(
        name="dynamic_config",
        cron_schedule="0 0 * * *",
        pipeline_name="no_config_pipeline",
        environment_dict_fn=lambda _context: {"storage": {
            "filesystem": {}
        }},
    )

    # Schedule derived from the integer partition set with no explicit
    # partition selector.
    partition_based = integer_partition_set.create_schedule_definition(
        schedule_name="partition_based",
        cron_schedule="0 0 * * *",
    )

    # Same partition set, using the last_empty_partition selector.
    partition_based_custom_selector = integer_partition_set.create_schedule_definition(
        schedule_name="partition_based_custom_selector",
        cron_schedule="0 0 * * *",
        partition_selector=last_empty_partition,
    )

    # In the decorated schedules below, start_date and execution_time are
    # computed from datetime.datetime.now() when this function runs.
    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
    )
    def partition_based_decorator(_date):
        return {"storage": {"filesystem": {}}}

    # Daily schedule that sets an explicit mode.
    @daily_schedule(
        pipeline_name='multi_mode_with_loggers',
        start_date=datetime.datetime.now() - datetime.timedelta(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
        mode='foo_mode',
    )
    def partition_based_multi_mode_decorator(_date):
        return {"storage": {"filesystem": {}}}

    # The next four schedules restrict execution to the 'return_foo' solid,
    # one per schedule frequency (hourly, daily, monthly, weekly).
    @hourly_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=1),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=2)).time(),
        solid_selection=['return_foo'],
    )
    def solid_selection_hourly_decorator(_date):
        return {"storage": {"filesystem": {}}}

    @daily_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=2),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=3)).time(),
        solid_selection=['return_foo'],
    )
    def solid_selection_daily_decorator(_date):
        return {"storage": {"filesystem": {}}}

    @monthly_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=100),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=4)).time(),
        solid_selection=['return_foo'],
    )
    def solid_selection_monthly_decorator(_date):
        return {"storage": {"filesystem": {}}}

    @weekly_schedule(
        pipeline_name='no_config_chain_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=50),
        execution_time=(datetime.datetime.now() +
                        datetime.timedelta(hours=5)).time(),
        solid_selection=['return_foo'],
    )
    def solid_selection_weekly_decorator(_date):
        return {"storage": {"filesystem": {}}}

    # Schedules for testing the user error boundary.
    # `asdf` is deliberately undefined, so each callback below raises
    # NameError when it is invoked.
    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=1),
        should_execute=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def should_execute_error_schedule(_date):
        return {"storage": {"filesystem": {}}}

    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=1),
        tags_fn_for_date=lambda _: asdf,  # pylint: disable=undefined-variable
    )
    def tags_error_schedule(_date):
        return {"storage": {"filesystem": {}}}

    @daily_schedule(
        pipeline_name='no_config_pipeline',
        start_date=datetime.datetime.now() - datetime.timedelta(days=1),
    )
    def environment_dict_error_schedule(_date):
        return asdf  # pylint: disable=undefined-variable

    tagged_pipeline_schedule = ScheduleDefinition(
        name="tagged_pipeline_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="tagged_pipeline",
        environment_dict={"storage": {
            "filesystem": {}
        }},
    )

    # Same pipeline as above, but the schedule supplies its own 'foo' tag.
    tagged_pipeline_override_schedule = ScheduleDefinition(
        name="tagged_pipeline_override_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="tagged_pipeline",
        environment_dict={"storage": {
            "filesystem": {}
        }},
        tags={'foo': 'notbar'},
    )

    # presumably "invalid" is not an accepted value for the enum config of
    # "takes_an_enum" -- verify against the pipeline definition.
    invalid_config_schedule = ScheduleDefinition(
        name="invalid_config_schedule",
        cron_schedule="0 0 * * *",
        pipeline_name="pipeline_with_enum_config",
        environment_dict={"solids": {
            "takes_an_enum": {
                'config': "invalid"
            }
        }},
    )

    return [
        environment_dict_error_schedule,
        no_config_pipeline_hourly_schedule,
        no_config_pipeline_hourly_schedule_with_config_fn,
        no_config_should_execute,
        dynamic_config,
        partition_based,
        partition_based_custom_selector,
        partition_based_decorator,
        partition_based_multi_mode_decorator,
        solid_selection_hourly_decorator,
        solid_selection_daily_decorator,
        solid_selection_monthly_decorator,
        solid_selection_weekly_decorator,
        should_execute_error_schedule,
        tagged_pipeline_schedule,
        tagged_pipeline_override_schedule,
        tags_error_schedule,
        invalid_config_schedule,
    ]
예제 #11
0

def weekday_partition_selector(
    ctx: ScheduleExecutionContext, partition_set: PartitionSetDefinition
) -> Union[Partition, List[Partition]]:
    """Pick the partition whose index matches the weekday of the scheduled run.

    The partitions are fetched for the scheduled execution time and indexed by
    ``weekday()`` of that time (Monday is 0). When the context carries no
    scheduled execution time, the first partition is returned.
    """
    scheduled_time = ctx.scheduled_execution_time
    candidates = partition_set.get_partitions(scheduled_time)
    if scheduled_time:
        index = scheduled_time.weekday()
    else:
        index = 0
    return candidates[index]


# Schedule built from the weekday partition set: the cron expression
# "5 0 * * *" ticks at 00:05 every day, with "US/Eastern" passed as the
# execution timezone, and weekday_partition_selector chooses the partition.
my_schedule = weekday_partition_set.create_schedule_definition(
    schedule_name="my_schedule",
    cron_schedule="5 0 * * *",
    partition_selector=weekday_partition_selector,
    execution_timezone="US/Eastern",
)


@repository
def my_repository_with_partitioned_schedule():
    """Return the pipeline, its weekday partition set and the schedule built on it."""
    definitions = [my_data_pipeline, weekday_partition_set, my_schedule]
    return definitions


# end_manual_partition_schedule
예제 #12
0

def run_config_for_date_partition(partition):
    """Build the run config for one date partition.

    Loads ``realized_trips.yaml`` from the directory of this module and sets the
    ``date`` config entry of the ``download_brt_raw_realized_trips`` solid to
    the partition's value.

    Args:
        partition: object whose ``value`` attribute is written into the config
            as the date.

    Returns:
        The parsed YAML config with the date entry filled in.

    Raises:
        OSError: if the YAML file cannot be opened.
        KeyError: if the YAML file lacks the expected
            ``solids -> download_brt_raw_realized_trips -> config`` nesting.
    """
    config_path = Path(__file__).parent / "realized_trips.yaml"
    # The context manager closes the file handle, which the previous bare
    # open() call never did. safe_load is used because calling yaml.load
    # without a Loader is deprecated since PyYAML 5.1 and raises TypeError in
    # PyYAML 6; a run-config file only needs plain YAML types.
    with open(config_path, "r") as config_file:
        config = yaml.safe_load(config_file)
    config["solids"]["download_brt_raw_realized_trips"]["config"]["date"] = partition.value
    return config


# Partition set for the "br_rj_riodejaneiro_gtfs_realized_trips" pipeline in
# "dev" mode: partitions come from get_date_partitions and each partition's
# run config is produced by run_config_for_date_partition.
daily_partition_set = PartitionSetDefinition(
    name="daily_partitions",
    pipeline_name="br_rj_riodejaneiro_gtfs_realized_trips",
    partition_fn=get_date_partitions,
    run_config_fn_for_partition=run_config_for_date_partition,
    mode="dev",
)


def daily_partition_selector(context, partition_set):
    """Return the second-to-last partition available at the scheduled execution time.

    The original author's intent is that this is the partition for the day
    before the run day.
    """
    scheduled_time = context.scheduled_execution_time
    available = partition_set.get_partitions(scheduled_time)
    # Index -2: the entry just before the newest one.
    return available[-2]


# Schedule built from the daily partition set: the cron expression
# "0 1 * * *" ticks at 01:00 every day, with "America/Sao_Paulo" passed as the
# execution timezone, and daily_partition_selector chooses the partition.
daily_schedule = daily_partition_set.create_schedule_definition(
    schedule_name="br_rj_riodejaneiro_gtfs_realized_trips",
    cron_schedule="0 1 * * *",
    partition_selector=daily_partition_selector,
    execution_timezone="America/Sao_Paulo",
)