Example #1
def pipeline_backfill_command(**kwargs):
    with DagsterInstance.get() as instance:
        execute_backfill_command(kwargs, click.echo, instance)
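The body above is a click command callback; a minimal sketch of registering and invoking it with click's test runner, assuming the callback is importable (the '--pipeline-name' option is a hypothetical stand-in, not the real dagster CLI option set):

import click
from click.testing import CliRunner

@click.command(name='backfill')
@click.option('--pipeline-name', 'pipeline_name', default=None)
def backfill(**kwargs):
    # Forward the parsed options straight into the callback shown above.
    pipeline_backfill_command(**kwargs)

result = CliRunner().invoke(backfill, ['--pipeline-name', 'my_pipeline'])
print(result.output)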
Example #2
def debug_heartbeat_dump_command():
    with DagsterInstance.get() as instance:
        for daemon_type in DaemonType:
            click.echo(get_daemon_status(instance, daemon_type))
Example #3
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode')
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, 'instance', DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, 'operator', BaseOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(
        pipeline, environment_dict, run_config=RunConfig(mode=mode)
    )

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():

        step_keys = [step.key for step in solid_steps]

        # We separately construct the Airflow operators here with the appropriate args, because if
        # Airflow gets extraneous args/kwargs it emits a warning every time it parses the DAG (and
        # future Airflow versions will mark this a failure).
        # see https://github.com/ambv/black/issues/768
        # fmt: off
        if operator == DagsterPythonOperator:
            task = operator(
                handle=handle,
                pipeline_name=pipeline_name,
                environment_dict=environment_dict,
                mode=mode,
                task_id=solid_handle,
                step_keys=step_keys,
                dag=dag,
                instance_ref=instance.get_ref(),
                **op_kwargs
            )
        else:
            task = operator(
                pipeline_name=pipeline_name,
                environment_dict=environment_dict,
                mode=mode,
                task_id=solid_handle,
                step_keys=step_keys,
                dag=dag,
                instance_ref=instance.get_ref(),
                **op_kwargs
            )
        # fmt: on

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
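The helper returns a (dag, tasks) pair; a brief sketch of how a DAG definition module might consume that return value so Airflow can discover the DAG (my_handle and 'my_pipeline' are placeholders, not values defined in this snippet):

# Hypothetical DAG module; my_handle is a placeholder ExecutionTargetHandle.
dag, tasks = _make_airflow_dag(handle=my_handle, pipeline_name='my_pipeline')

# Airflow only collects DAG objects bound at module scope, so bind it there.
globals()[dag.dag_id] = dag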
Example #4
def pipeline_backfill_command(mode, *args, **kwargs):
    pipeline_name = kwargs.pop('pipeline_name')
    repo_args = {k: v for k, v in kwargs.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()

    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repository.pipeline_names))
        )

    pipeline = handle.with_pipeline_name(pipeline_name).build_pipeline_definition()

    partition_handle = handle.build_partitions_handle()

    if not pipeline:
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    if not partition_handle:
        raise click.UsageError('No partition handle found')

    if kwargs.get('partition_set'):
        partition_set = partition_handle.get_partition_set(kwargs['partition_set'])
    else:
        all_partition_sets = [
            x for x in partition_handle.get_all_partition_sets() if x.pipeline_name == pipeline.name
        ]
        if not all_partition_sets:
            raise click.UsageError(
                'Pipeline `{}` does not have partition sets defined'.format(pipeline.name)
            )
        if len(all_partition_sets) == 1:
            partition_set = all_partition_sets[0]
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in all_partition_sets)
                )
            )
            partition_set = next(
                (x for x in all_partition_sets if x.name == partition_set_name), None
            )

    if not partition_set:
        raise click.UsageError('No partition set found')

    partitions = gen_partition_slice_from_args(partition_set, kwargs)

    click.echo('\n     Pipeline: {}'.format(pipeline.name))
    click.echo('Partition set: {}'.format(partition_set.name))
    click.echo('   Partitions: {}\n'.format(', '.join([x.name for x in partitions])))

    if click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        click.echo('Launching')
        from dagster_graphql.launcher import RemoteDagitRunLauncher

        run_launcher = RemoteDagitRunLauncher('http://localhost:3333')
        for partition in partitions:
            run = PipelineRun(
                pipeline_name=pipeline.name,
                run_id=make_new_run_id(),
                selector=ExecutionSelector(pipeline.name),
                environment_dict=partition_set.environment_dict_for_partition(partition),
                mode=mode or 'default',
                tags=partition_set.tags_for_partition(partition),
                status=PipelineRunStatus.NOT_STARTED,
            )
            run = run_launcher.launch_run(run)
    else:
        click.echo(' Aborted!')
Example #5
def wipe_command():
    with DagsterInstance.get() as instance:
        instance.wipe_daemon_heartbeats()
        click.echo("Daemon heartbeats wiped")
Example #6
def pipeline_list_command(**kwargs):
    return execute_list_command(kwargs, click.echo, DagsterInstance.get())
Example #7
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://dagster.readthedocs.io/en/latest/sections/deploying/instance.html for more '
            'information.')

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(
                repository.pipeline_names)))
    repository = handle.build_repository_definition()
    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError(
            'No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)))
    partition_set = next(
        (x for x in pipeline_partition_sets if x.name == partition_set_name),
        None)
    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partitions))):

        backfill_tag = ''.join(
            random.choice(string.ascii_lowercase)
            for x in range(BACKFILL_TAG_LENGTH))
        print_fn('Launching runs... ')

        for partition in partitions:
            run = PipelineRun(
                pipeline_name=pipeline.name,
                run_id=make_new_run_id(),
                selector=ExecutionSelector(pipeline.name),
                environment_dict=partition_set.environment_dict_for_partition(
                    partition),
                mode=cli_args.get('mode') or 'default',
                tags=merge_dicts({'dagster/backfill': backfill_tag},
                                 partition_set.tags_for_partition(partition)),
                status=PipelineRunStatus.NOT_STARTED,
            )
            instance.launch_run(run)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_tag))
    else:
        print_fn(' Aborted!')
Example #8
from dagster_examples.experimental.repo import daily_rollup_schedule

from dagster import execute_partition_set
from dagster.core.instance import DagsterInstance


def select_weekday_partitions(candidate_partitions):
    SATURDAY = 5
    SUNDAY = 6
    return [
        partition for partition in candidate_partitions
        if partition.value.weekday() != SATURDAY
        and partition.value.weekday() != SUNDAY
    ]


partition_set = daily_rollup_schedule.get_partition_set()
execute_partition_set(partition_set,
                      select_weekday_partitions,
                      instance=DagsterInstance.get())
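A hedged variant of the selector above, assuming the same interface (the selector receives the full list of candidate partitions and returns the subset to launch) and that the list is ordered oldest to newest:

def select_last_week_partitions(candidate_partitions, days=7):
    # Keep only the trailing `days` partitions; relies on the assumed ordering.
    return candidate_partitions[-days:]

# It can be passed to execute_partition_set in place of select_weekday_partitions.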
Example #9
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    mode = check.opt_str_param(mode, 'mode')
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (check.inst_param(instance, 'instance', DagsterInstance)
                if instance else DagsterInstance.get(
                    fallback_storage=seven.get_system_temp_directory()))

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id',
                                 _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(dag_description, 'dag_description',
                                          _make_dag_description(pipeline_name))
    check.subclass_param(operator, 'operator', BaseOperator)

    dag_kwargs = dict({'default_args': DEFAULT_ARGS},
                      **check.opt_dict_param(dag_kwargs,
                                             'dag_kwargs',
                                             key_type=str))

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)
    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline,
                                           environment_dict,
                                           mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            handle=handle,
            pipeline_name=pipeline_name,
            environment_dict=environment_dict,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan,
                pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(
                        key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag,
            [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example #10
def run_list_command(limit):
    with DagsterInstance.get() as instance:
        for run in instance.get_runs(limit=limit):
            click.echo("Run: {}".format(run.run_id))
            click.echo("     Pipeline: {}".format(run.pipeline_name))
Example #11
def execute_preview_command(cli_args, print_fn):
    with DagsterInstance.get() as instance:
        with get_external_repository_from_kwargs(cli_args) as external_repo:
            check_repo_and_scheduler(external_repo, instance)

            print_changes(external_repo, instance, print_fn, preview=True)
Example #12
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    external_pipeline = get_external_pipeline_from_kwargs(cli_args)
    external_repository = get_external_repository_from_kwargs(cli_args)

    # We should move this to use external repository
    # https://github.com/dagster-io/dagster/issues/2556
    recon_repo = recon_repo_from_external_repo(external_repository)
    repo_def = recon_repo.get_definition()

    noprompt = cli_args.get('noprompt')

    pipeline_def = repo_def.get_pipeline(external_pipeline.name)

    # Resolve partition set
    all_partition_sets = repo_def.partition_set_defs + [
        schedule_def.get_partition_set()
        for schedule_def in repo_def.schedule_defs
        if isinstance(schedule_def, PartitionScheduleDefinition)
    ]

    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline_def.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline_def.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline_def.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id), get_tags_from_args(cli_args),
        )

        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline_def,
                mode=partition_set.mode,
                solids_to_execute=frozenset(partition_set.solid_selection)
                if partition_set and partition_set.solid_selection
                else None,
                run_config=partition_set.run_config_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
Example #13
def _logged_pipeline_launch_command(config, preset_name, mode, instance, kwargs):
    check.inst_param(instance, 'instance', DagsterInstance)
    env = (
        canonicalize_backcompat_args(
            (config if config else None),
            '--config',
            (kwargs.get('env') if kwargs.get('env') else None),
            '--env',
            '0.9.0',
            stacklevel=2,  # this stacklevel can point the warning to this line
        )
        or tuple()  # back to default empty tuple
    )

    env = list(check.opt_tuple_param(env, 'env', default=(), of_type=str))

    external_pipeline = get_external_pipeline_from_kwargs(kwargs)
    # We should move this to use external pipeline
    # https://github.com/dagster-io/dagster/issues/2556
    pipeline = recon_pipeline_from_origin(external_pipeline.get_origin())

    instance = DagsterInstance.get()
    log_repo_stats(instance=instance, pipeline=pipeline, source='pipeline_launch_command')

    if preset_name:
        if env:
            raise click.UsageError('Cannot use --preset with --config.')

        if mode:
            raise click.UsageError('Cannot use --preset with --mode.')

        preset = pipeline.get_definition().get_preset(preset_name)
    else:
        preset = None

    run_tags = get_tags_from_args(kwargs)

    solid_selection = get_solid_selection_from_args(kwargs)

    if preset and preset.solid_selection is not None:
        check.invariant(
            solid_selection is None or solid_selection == preset.solid_selection,
            'The solid_selection set in preset \'{preset}\', {preset_subset}, does not agree with '
            'the `solid_selection` argument: {solid_selection}'.format(
                preset=preset,
                preset_subset=preset.solid_selection,
                solid_selection=solid_selection,
            ),
        )
        solid_selection = preset.solid_selection

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    # FIXME need to check the env against run_config
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(),
        solid_selection=solid_selection,
        solids_to_execute=pipeline.solids_to_execute,
        run_config=preset.run_config if preset else load_yaml_from_glob_list(env),
        mode=(preset.mode if preset else mode) or 'default',
        tags=run_tags,
    )

    recon_repo = pipeline.get_reconstructable_repository()

    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name
    ).get_full_external_pipeline(pipeline.get_definition().name)

    return instance.launch_run(pipeline_run.run_id, external_pipeline)
Example #14
def run_wipe_command():
    instance = DagsterInstance.get()
    instance.wipe()
    click.echo('Deleted all run history')
Example #15
def launch_scheduled_execution(output_file, schedule_name,
                               override_system_timezone, **kwargs):
    with (mock_system_timezone(override_system_timezone)
          if override_system_timezone else nullcontext()):
        with ipc_write_stream(output_file) as stream:
            with DagsterInstance.get() as instance:
                repository_origin = get_repository_origin_from_kwargs(kwargs)
                job_origin = repository_origin.get_job_origin(schedule_name)

                # open the tick scope before we load any external artifacts so that
                # load errors are stored in DB
                with _schedule_tick_context(
                        instance,
                        stream,
                        JobTickData(
                            job_origin_id=job_origin.get_id(),
                            job_name=schedule_name,
                            job_type=JobType.SCHEDULE,
                            status=JobTickStatus.STARTED,
                            timestamp=time.time(),
                        ),
                ) as tick_context:
                    with get_repository_location_from_kwargs(
                            kwargs) as repo_location:
                        repo_dict = repo_location.get_repositories()
                        check.invariant(
                            repo_dict and len(repo_dict) == 1,
                            "Passed in arguments should reference exactly one repository, instead there are {num_repos}"
                            .format(num_repos=len(repo_dict)),
                        )
                        external_repo = next(iter(repo_dict.values()))
                        if schedule_name not in [
                                schedule.name for schedule in
                                external_repo.get_external_schedules()
                        ]:
                            raise DagsterInvariantViolationError(
                                "Could not find schedule named {schedule_name}"
                                .format(schedule_name=schedule_name), )

                        external_schedule = external_repo.get_external_schedule(
                            schedule_name)

                        # Validate that either the schedule has no timezone or it matches
                        # the system timezone
                        schedule_timezone = external_schedule.execution_timezone
                        if schedule_timezone:
                            system_timezone = pendulum.now().timezone.name

                            if system_timezone != external_schedule.execution_timezone:
                                raise DagsterInvariantViolationError(
                                    "Schedule {schedule_name} is set to execute in {schedule_timezone}, "
                                    "but this scheduler can only run in the system timezone, "
                                    "{system_timezone}. Use DagsterDaemonScheduler if you want to be able "
                                    "to execute schedules in arbitrary timezones."
                                    .format(
                                        schedule_name=external_schedule.name,
                                        schedule_timezone=schedule_timezone,
                                        system_timezone=system_timezone,
                                    ), )

                        _launch_scheduled_executions(instance, repo_location,
                                                     external_repo,
                                                     external_schedule,
                                                     tick_context)
Example #16
def run_list_command():
    instance = DagsterInstance.get()
    for run in instance.get_runs():
        click.echo("Run: {}".format(run.run_id))
        click.echo("     Pipeline: {}".format(run.pipeline_name))
Example #17
def pipeline_launch_command(config, preset, mode, **kwargs):
    return _logged_pipeline_launch_command(config, preset, mode,
                                           DagsterInstance.get(), kwargs)
Example #18
def execute_preview_command(sensor_name,
                            since,
                            last_run_key,
                            cursor,
                            cli_args,
                            print_fn,
                            instance=None):
    with DagsterInstance.get() as instance:
        with get_repository_location_from_kwargs(
                instance,
                version=dagster_version,
                kwargs=cli_args,
        ) as repo_location:
            try:
                external_repo = get_external_repository_from_repo_location(
                    repo_location, cli_args.get("repository"))
                check_repo_and_scheduler(external_repo, instance)
                external_sensor = external_repo.get_external_sensor(
                    sensor_name)
                try:
                    sensor_runtime_data = repo_location.get_external_sensor_execution_data(
                        instance,
                        external_repo.handle,
                        external_sensor.name,
                        since,
                        last_run_key,
                        cursor,
                    )
                except Exception:
                    error_info = serializable_error_info_from_exc_info(
                        sys.exc_info())
                    print_fn(
                        "Failed to resolve sensor for {sensor_name} : {error_info}"
                        .format(
                            sensor_name=external_sensor.name,
                            error_info=error_info.to_string(),
                        ))
                    return

                if not sensor_runtime_data.run_requests:
                    if sensor_runtime_data.skip_message:
                        print_fn(
                            "Sensor returned false for {sensor_name}, skipping: {skip_message}"
                            .format(
                                sensor_name=external_sensor.name,
                                skip_message=sensor_runtime_data.skip_message,
                            ))
                    else:
                        print_fn(
                            "Sensor returned false for {sensor_name}, skipping"
                            .format(sensor_name=external_sensor.name))
                else:
                    print_fn(
                        "Sensor returning run requests for {num} run(s):\n\n{run_requests}"
                        .format(
                            num=len(sensor_runtime_data.run_requests),
                            run_requests="\n".join(
                                yaml.safe_dump(run_request.run_config,
                                               default_flow_style=False)
                                for run_request in
                                sensor_runtime_data.run_requests),
                        ))

            except DagsterInvariantViolationError as ex:
                raise click.UsageError(ex)
Example #19
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    repo_location = get_repository_location_from_kwargs(cli_args, instance)
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get('repository'))

    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get('pipeline'),
    )

    noprompt = cli_args.get('noprompt')

    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in
        external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(
                external_pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            partition_set_name = next(iter(
                pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x for x in pipeline_partition_set_names.keys())))

    partition_set = pipeline_partition_set_names.get(partition_set_name)

    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )

    if isinstance(partition_names_or_error,
                  ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            'Failure fetching partition names for {partition_set_name}: {error_message}'
            .format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(external_pipeline.name))
    print_fn('Partition set: {}'.format(partition_set_name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partition_names))):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition_name in partition_names:
            run_config_or_error = repo_location.get_external_partition_config(
                repo_handle, partition_set_name, partition_name)
            if isinstance(run_config_or_error,
                          ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching run config for partition {partition_name} in {partition_set_name}: {error_message}'
                    .format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=run_config_or_error.error.message,
                    ),
                    serialized_error_info=run_config_or_error.error,
                )

            tags_or_error = repo_location.get_external_partition_tags(
                repo_handle, partition_set_name, partition_name)
            if isinstance(tags_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching tags for partition {partition_name} in {partition_set_name}: {error_message}'
                    .format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=tags_or_error.error.message,
                    ),
                    serialized_error_info=tags_or_error.error,
                )
            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=run_config_or_error.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(tags_or_error.tags, run_tags),
                solid_selection=frozenset(solid_selection)
                if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn('Aborted!')
Example #20
def pipeline_list_versions_command(**kwargs):
    with DagsterInstance.get() as instance:
        execute_list_versions_command(instance, kwargs)
Example #21
def _make_airflow_dag(
    recon_repo,
    job_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    check.str_param(job_name, "job_name")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode, "mode")
    # Default to use the (persistent) system temp directory rather than a TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.

    if instance is None:
        if is_dagster_home_set():
            instance = DagsterInstance.get()
        else:
            instance = DagsterInstance.local_temp(
                tempdir=seven.get_system_temp_directory())

    check.inst_param(instance, "instance", DagsterInstance)

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, "dag_id",
                                 _rename_for_airflow(job_name))

    dag_description = check.opt_str_param(dag_description, "dag_description",
                                          _make_dag_description(job_name))
    check.subclass_param(operator, "operator", BaseOperator)

    dag_kwargs = dict(
        {"default_args": DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str),
    )

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)
    pipeline = recon_repo.get_definition().get_pipeline(job_name)

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            recon_repo=recon_repo,
            pipeline_name=job_name,
            run_config=run_config,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan,
                pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(
                        key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag,
            [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example #22
def pipeline_execute_command(**kwargs):
    with capture_interrupts():
        with DagsterInstance.get() as instance:
            execute_execute_command(instance, kwargs)
Example #23
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = recon_repo_for_cli_args(repo_args)
    repository = handle.get_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://docs.dagster.io/docs/deploying/instance/ for more '
            'information.')

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(
                repository.pipeline_names)))
    repository = handle.get_definition()
    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError(
            'No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)))
    partition_set = next(
        (x for x in pipeline_partition_sets if x.name == partition_set_name),
        None)
    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partitions))):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline,
                mode=partition_set.mode,
                solid_subset=partition_set.solid_subset,
                environment_dict=partition_set.environment_dict_for_partition(
                    partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition),
                                 run_tags),
            )
            instance.launch_run(run.run_id)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
Example #24
def pipeline_launch_command(**kwargs):
    with DagsterInstance.get() as instance:
        return execute_launch_command(instance, kwargs)
Example #25
def debug_heartbeat_command():
    with DagsterInstance.get() as instance:
        debug_daemon_heartbeats(instance)
Example #26
def pipeline_list_command(**kwargs):
    with DagsterInstance.get() as instance:
        return execute_list_command(kwargs, click.echo, instance)
Example #27
def pipeline_print_command(verbose, **cli_args):
    with DagsterInstance.get() as instance:
        return execute_print_command(verbose, cli_args, click.echo, instance)
Example #28
def run_command(**kwargs):
    with capture_interrupts():
        with DagsterInstance.get() as instance:
            _daemon_run_command(instance, kwargs)