Example #1
    def __init__(
        self,
        server_termination_event,
        loadable_target_origin=None,
        heartbeat=False,
        heartbeat_timeout=30,
        lazy_load_user_code=False,
        fixed_server_id=None,
    ):
        super(DagsterApiServer, self).__init__()

        check.bool_param(heartbeat, "heartbeat")
        check.int_param(heartbeat_timeout, "heartbeat_timeout")
        check.invariant(heartbeat_timeout > 0,
                        "heartbeat_timeout must be greater than 0")

        self._server_termination_event = check.inst_param(
            server_termination_event, "server_termination_event",
            seven.ThreadingEventType)
        self._loadable_target_origin = check.opt_inst_param(
            loadable_target_origin, "loadable_target_origin",
            LoadableTargetOrigin)

        # Each server is initialized with a unique UUID. This UUID is used by clients to track when
        # servers are replaced and is used for cache invalidation and reloading.
        self._server_id = check.opt_str_param(fixed_server_id,
                                              "fixed_server_id",
                                              str(uuid.uuid4()))

        # Client tells the server to shut down by calling ShutdownServer (or by failing to send a
        # heartbeat), at which point this event is set. The cleanup thread will then set the server
        # termination event once all current executions have finished, which will stop the server.
        self._shutdown_once_executions_finish_event = threading.Event()

        # Dict[str, (multiprocessing.Process, DagsterInstance)]
        self._executions = {}
        # Dict[str, multiprocessing.Event]
        self._termination_events = {}
        self._termination_times = {}
        self._execution_lock = threading.Lock()

        self._repository_symbols_and_code_pointers = LazyRepositorySymbolsAndCodePointers(
            loadable_target_origin)
        if not lazy_load_user_code:
            self._repository_symbols_and_code_pointers.load()

        self.__last_heartbeat_time = time.time()
        if heartbeat:
            self.__heartbeat_thread = threading.Thread(
                target=self._heartbeat_thread,
                args=(heartbeat_timeout, ),
            )
            self.__heartbeat_thread.daemon = True
            self.__heartbeat_thread.start()
        else:
            self.__heartbeat_thread = None

        self.__cleanup_thread = threading.Thread(
            target=self._cleanup_thread,
            args=(),
        )
        self.__cleanup_thread.daemon = True

        self.__cleanup_thread.start()
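The heartbeat thread started above follows a common watchdog pattern: a daemon thread remembers when the last client heartbeat arrived and sets a shutdown event once the timeout elapses. The sketch below shows that pattern in isolation; the Watchdog class and its method names are illustrative and are not part of the Dagster API.

import threading
import time


class Watchdog:
    def __init__(self, timeout, shutdown_event):
        self._timeout = timeout
        self._shutdown_event = shutdown_event
        self._last_beat = time.time()
        watcher = threading.Thread(target=self._watch, daemon=True)
        watcher.start()

    def beat(self):
        # Call this whenever a client heartbeat is received.
        self._last_beat = time.time()

    def _watch(self):
        # Poll until the client has been silent longer than the timeout, then
        # signal the rest of the process to begin shutting down.
        while not self._shutdown_event.is_set():
            if time.time() - self._last_beat > self._timeout:
                self._shutdown_event.set()
            time.sleep(1)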
Example #2
 def __init__(self, instance, scheduled_execution_time):
     super(ScheduleExecutionContext, self).__init__(
         check.inst_param(instance, "instance", DagsterInstance))
     self._scheduled_execution_time = check.opt_inst_param(
         scheduled_execution_time, "scheduled_execution_time", datetime)
Example #3
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        loader=None,
        materializer=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
        # Graveyard is below
        input_hydration_config=None,
        output_materialization_config=None,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None), "Must set key or name")

        if name is None:
            check.param_invariant(
                bool(key), "key", "If name is not provided, must provide key.",
            )
            self.key, self.name = key, None
        elif key is None:
            check.param_invariant(
                bool(name), "name", "If key is not provided, must provide name.",
            )
            self.key, self.name = name, name
        else:
            check.invariant(key and name)
            self.key, self.name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = canonicalize_backcompat_args(
            check.opt_inst_param(loader, "loader", DagsterTypeLoader),
            "loader",
            check.opt_inst_param(
                input_hydration_config, "input_hydration_config", DagsterTypeLoader
            ),
            "input_hydration_config",
            "0.10.0",
        )
        self.materializer = canonicalize_backcompat_args(
            check.opt_inst_param(materializer, "materializer", DagsterTypeMaterializer),
            "materializer",
            check.opt_inst_param(
                output_materialization_config,
                "output_materialization_config",
                DagsterTypeMaterializer,
            ),
            "output_materialization_config",
            "0.10.0",
        )
        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            "serialization_strategy",
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys, "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self.name)

        auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins
            ),
            "auto_plugins",
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
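For context, a DagsterType like the one constructed above is usually created with a type_check_fn that receives a context object and the runtime value. A minimal usage sketch, assuming the public DagsterType constructor exported from the dagster package; the EvenInt type is purely illustrative.

from dagster import DagsterType

EvenInt = DagsterType(
    name="EvenInt",
    # The type check receives (context, value) and returns a boolean.
    type_check_fn=lambda _context, value: isinstance(value, int) and value % 2 == 0,
    description="An integer that must be divisible by two.",
)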
Example #4
def from_dagster_event_record(graphene_info, event_record, dauphin_pipeline,
                              execution_plan):
    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, 'event_record', EventRecord)
    check.param_invariant(event_record.is_dagster_event, 'event_record')
    check.opt_inst_param(dauphin_pipeline, 'dauphin_pipeline',
                         graphene_info.schema.type_named('Pipeline'))
    check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(graphene_info, event_record,
                                          execution_plan)
    if dagster_event.event_type == DagsterEventType.STEP_START:
        return graphene_info.schema.type_named('ExecutionStepStartEvent')(
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return graphene_info.schema.type_named('ExecutionStepSkippedEvent')(
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return graphene_info.schema.type_named('ExecutionStepSuccessEvent')(
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return graphene_info.schema.type_named('ExecutionStepInputEvent')(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return graphene_info.schema.type_named('ExecutionStepOutputEvent')(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            # parens make black not put trailing commas, which in turn break py27
            # fmt: off
            **(basic_params)
            # fmt: on
        )
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return graphene_info.schema.type_named('StepMaterializationEvent')(
            materialization=materialization, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return graphene_info.schema.type_named('StepExpectationResultEvent')(
            expectation_result=expectation_result, **(basic_params))
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return graphene_info.schema.type_named('ExecutionStepFailureEvent')(
            error=graphene_info.schema.type_named('PythonError')(
                dagster_event.step_failure_data.error),
            # parens make black not put trailing commas, which in turn break py27
            # fmt: off
            **(basic_params)
            # fmt: on
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return graphene_info.schema.type_named('PipelineStartEvent')(
            pipeline=dauphin_pipeline, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return graphene_info.schema.type_named('PipelineSuccessEvent')(
            pipeline=dauphin_pipeline, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return graphene_info.schema.type_named('PipelineFailureEvent')(
            pipeline=dauphin_pipeline, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_PROCESS_START:
        process_data = dagster_event.pipeline_process_start_data
        return graphene_info.schema.type_named('PipelineProcessStartEvent')(
            pipeline=dauphin_pipeline,
            pipeline_name=process_data.pipeline_name,
            run_id=process_data.run_id,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_PROCESS_STARTED:
        process_data = dagster_event.pipeline_process_started_data
        return graphene_info.schema.type_named('PipelineProcessStartedEvent')(
            pipeline=dauphin_pipeline,
            process_id=process_data.process_id,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_PROCESS_EXITED:
        process_data = dagster_event.pipeline_process_exited_data
        return graphene_info.schema.type_named('PipelineProcessExitedEvent')(
            pipeline=dauphin_pipeline,
            process_id=process_data.process_id,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return graphene_info.schema.type_named('PipelineInitFailureEvent')(
            pipeline=dauphin_pipeline,
            error=graphene_info.schema.type_named('PythonError')(
                dagster_event.pipeline_init_failure_data.error),
            # parens make black not put trailing commas, which in turn break py27
            # fmt: off
            **(basic_params)
            # fmt: on
        )
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return graphene_info.schema.type_named('ObjectStoreOperationEvent')(
            operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return graphene_info.schema.type_named('EngineEvent')(
            metadataEntries=_to_dauphin_metadata_entries(
                dagster_event.event_specific_data.metadata_entries),
            **basic_params)
    else:
        raise Exception(
            'Unknown DAGSTER_EVENT type {inner_type} found in logs'.format(
                inner_type=dagster_event.event_type))
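The elif chain above is a plain dispatch on dagster_event.event_type. For the event types whose Dauphin constructors take only **basic_params, the same dispatch could be expressed with a lookup table; the sketch below is a hypothetical refactor covering just those simple cases, not the dagster-graphql implementation.

_SIMPLE_EVENT_TYPE_NAMES = {
    DagsterEventType.STEP_START: 'ExecutionStepStartEvent',
    DagsterEventType.STEP_SKIPPED: 'ExecutionStepSkippedEvent',
    DagsterEventType.STEP_SUCCESS: 'ExecutionStepSuccessEvent',
}


def from_simple_event(graphene_info, dagster_event, basic_params):
    # Look up the Dauphin type name for the event and instantiate it with the
    # shared params; fall back to an error for anything not in the table.
    type_name = _SIMPLE_EVENT_TYPE_NAMES.get(dagster_event.event_type)
    if type_name is None:
        raise Exception('Unhandled event type {}'.format(dagster_event.event_type))
    return graphene_info.schema.type_named(type_name)(**basic_params)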
Example #5
def create_execution_structure(solid_defs, dependencies_dict,
                               container_definition):
    '''This builder takes the dependencies dictionary specified during creation of the
    PipelineDefinition object and builds (1) the execution structure and (2) a solid dependency
    dictionary.

    For example, for the following dependencies:

    dep_dict = {
            SolidInvocation('giver'): {},
            SolidInvocation('sleeper', alias='sleeper_1'): {
                'units': DependencyDefinition('giver', 'out_1')
            },
            SolidInvocation('sleeper', alias='sleeper_2'): {
                'units': DependencyDefinition('giver', 'out_2')
            },
            SolidInvocation('sleeper', alias='sleeper_3'): {
                'units': DependencyDefinition('giver', 'out_3')
            },
            SolidInvocation('sleeper', alias='sleeper_4'): {
                'units': DependencyDefinition('giver', 'out_4')
            },
            SolidInvocation('total'): {
                'in_1': DependencyDefinition('sleeper_1', 'total'),
                'in_2': DependencyDefinition('sleeper_2', 'total'),
                'in_3': DependencyDefinition('sleeper_3', 'total'),
                'in_4': DependencyDefinition('sleeper_4', 'total'),
            },
        },

    This will create:

    pipeline_solid_dict = {
        'giver': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_1': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_2': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_3': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_4': <dagster.core.definitions.dependency.Solid object>,
        'total': <dagster.core.definitions.dependency.Solid object>
    }

    as well as a dagster.core.definitions.dependency.DependencyStructure object.
    '''
    from .solid import ISolidDefinition, CompositeSolidDefinition

    check.list_param(solid_defs, 'solid_defs', of_type=ISolidDefinition)
    check.dict_param(
        dependencies_dict,
        'dependencies_dict',
        key_type=six.string_types + (SolidInvocation, ),
        value_type=dict,
    )
    # container_definition is none in the context of a pipeline
    check.opt_inst_param(container_definition, 'container_definition',
                         CompositeSolidDefinition)

    # Same as dep_dict but with SolidInvocation replaced by alias string
    aliased_dependencies_dict = {}

    # Keep track of solid name -> all aliases used and alias -> name
    name_to_aliases = defaultdict(set)
    alias_to_solid_instance = {}
    alias_to_name = {}

    for solid_key, input_dep_dict in dependencies_dict.items():
        # We allow deps of the form dependencies={'foo': DependencyDefinition('bar')}
        # Here, we replace 'foo' with SolidInvocation('foo')
        if not isinstance(solid_key, SolidInvocation):
            solid_key = SolidInvocation(solid_key)

        alias = solid_key.alias or solid_key.name

        name_to_aliases[solid_key.name].add(alias)
        alias_to_solid_instance[alias] = solid_key
        alias_to_name[alias] = solid_key.name
        aliased_dependencies_dict[alias] = input_dep_dict

        for dependency in input_dep_dict.values():
            for dep in dependency.get_definitions():
                name_to_aliases[dep.solid].add(dep.solid)

    pipeline_solid_dict = _build_pipeline_solid_dict(solid_defs,
                                                     name_to_aliases,
                                                     alias_to_solid_instance,
                                                     container_definition)

    _validate_dependencies(aliased_dependencies_dict, pipeline_solid_dict,
                           alias_to_name)

    dependency_structure = DependencyStructure.from_definitions(
        pipeline_solid_dict, aliased_dependencies_dict)

    return dependency_structure, pipeline_solid_dict
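The loop above first normalizes bare string keys to SolidInvocation and then derives each invocation's alias (falling back to the solid name). The standalone sketch below demonstrates that normalization with a namedtuple stand-in; FakeSolidInvocation is illustrative only and is not the real SolidInvocation class.

from collections import namedtuple

FakeSolidInvocation = namedtuple('FakeSolidInvocation', 'name alias')


def resolve_alias(solid_key):
    # Bare strings are treated as un-aliased invocations of the named solid.
    if not isinstance(solid_key, FakeSolidInvocation):
        solid_key = FakeSolidInvocation(name=solid_key, alias=None)
    return solid_key.alias or solid_key.name


assert resolve_alias('giver') == 'giver'
assert resolve_alias(FakeSolidInvocation('sleeper', 'sleeper_1')) == 'sleeper_1'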
Example #6
def construct_dagster_k8s_job(
    job_config,
    args,
    job_name,
    user_defined_k8s_config=None,
    pod_name=None,
    component=None,
    env_vars=None,
):
    """Constructs a Kubernetes Job object for a dagster-graphql invocation.

    Args:
        job_config (DagsterK8sJobConfig): Job configuration to use for constructing the Kubernetes
            Job object.
        args (List[str]): CLI arguments to use with dagster-graphql in this Job.
        job_name (str): The name of the Job. Note that this name must be <= 63 characters in length.
        user_defined_k8s_config (Optional[UserDefinedDagsterK8sConfig]): Additional user-defined
            Kubernetes config merged into the constructed Job's container, pod, and job specs.
        pod_name (str, optional): The name of the Pod. Note that this name must be <= 63 characters
            in length. Defaults to "<job_name>-pod".
        component (str, optional): The name of the component, used to provide the Job label
            app.kubernetes.io/component. Defaults to None.
        env_vars (Dict[str, str], optional): Additional environment variables to add to the K8s Container.

    Returns:
        kubernetes.client.V1Job: A Kubernetes Job object.
    """
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.list_param(args, "args", of_type=str)
    check.str_param(job_name, "job_name")
    user_defined_k8s_config = check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
        UserDefinedDagsterK8sConfig(),
    )

    pod_name = check.opt_str_param(pod_name,
                                   "pod_name",
                                   default=job_name + "-pod")
    check.opt_str_param(component, "component")
    check.opt_dict_param(env_vars, "env_vars", key_type=str, value_type=str)

    check.invariant(
        len(job_name) <= MAX_K8S_NAME_LEN,
        "job_name is %d in length; Kubernetes Jobs cannot be longer than %d characters."
        % (len(job_name), MAX_K8S_NAME_LEN),
    )

    check.invariant(
        len(pod_name) <= MAX_K8S_NAME_LEN,
        "job_name is %d in length; Kubernetes Pods cannot be longer than %d characters."
        % (len(pod_name), MAX_K8S_NAME_LEN),
    )

    # See: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/
    dagster_labels = {
        "app.kubernetes.io/name": "dagster",
        "app.kubernetes.io/instance": "dagster",
        "app.kubernetes.io/version": dagster_version,
        "app.kubernetes.io/part-of": "dagster",
    }

    if component:
        dagster_labels["app.kubernetes.io/component"] = component

    env = [
        kubernetes.client.V1EnvVar(name="DAGSTER_HOME",
                                   value=job_config.dagster_home)
    ]
    if job_config.postgres_password_secret:
        env.append(
            kubernetes.client.V1EnvVar(
                name=DAGSTER_PG_PASSWORD_ENV_VAR,
                value_from=kubernetes.client.V1EnvVarSource(
                    secret_key_ref=kubernetes.client.V1SecretKeySelector(
                        name=job_config.postgres_password_secret,
                        key=DAGSTER_PG_PASSWORD_SECRET_KEY)),
            ))

    additional_k8s_env_vars = []
    if env_vars:
        for key, value in env_vars.items():
            additional_k8s_env_vars.append(
                kubernetes.client.V1EnvVar(name=key, value=value))

    job_container = kubernetes.client.V1Container(
        name=job_name,
        image=job_config.job_image,
        args=args,
        image_pull_policy=job_config.image_pull_policy,
        env=env + additional_k8s_env_vars,
        env_from=job_config.env_from_sources,
        volume_mounts=[
            kubernetes.client.V1VolumeMount(
                name="dagster-instance",
                mount_path="{dagster_home}/dagster.yaml".format(
                    dagster_home=job_config.dagster_home),
                sub_path="dagster.yaml",
            )
        ] + [
            kubernetes.client.V1VolumeMount(
                name=mount["name"],
                mount_path=mount["path"],
                sub_path=mount["sub_path"],
            ) for mount in job_config.volume_mounts
        ],
        **user_defined_k8s_config.container_config,
    )

    volumes = [
        kubernetes.client.V1Volume(
            name="dagster-instance",
            config_map=kubernetes.client.V1ConfigMapVolumeSource(
                name=job_config.instance_config_map),
        )
    ] + [
        kubernetes.client.V1Volume(
            name=mount["name"],
            config_map=kubernetes.client.V1ConfigMapVolumeSource(
                name=mount["configmap"]),
        ) if mount.get("configmap") else kubernetes.client.V1Volume(
            name=mount["name"],
            secret=kubernetes.client.V1SecretVolumeSource(
                secret_name=mount["secret"]),
        ) for mount in job_config.volume_mounts
    ]

    # If the user has defined custom labels, remove them from the pod_template_spec_metadata
    # key and merge them with the dagster labels
    user_defined_pod_template_labels = user_defined_k8s_config.pod_template_spec_metadata.pop(
        "labels", {})

    template = kubernetes.client.V1PodTemplateSpec(
        metadata=kubernetes.client.V1ObjectMeta(
            name=pod_name,
            labels=merge_dicts(dagster_labels,
                               user_defined_pod_template_labels),
            **user_defined_k8s_config.pod_template_spec_metadata,
        ),
        spec=kubernetes.client.V1PodSpec(
            image_pull_secrets=[
                kubernetes.client.V1LocalObjectReference(name=x["name"])
                for x in job_config.image_pull_secrets
            ],
            service_account_name=job_config.service_account_name,
            restart_policy="Never",
            containers=[job_container],
            volumes=volumes,
            **user_defined_k8s_config.pod_spec_config,
        ),
    )

    job = kubernetes.client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=kubernetes.client.V1ObjectMeta(
            name=job_name,
            labels=dagster_labels,
            **user_defined_k8s_config.job_metadata),
        spec=kubernetes.client.V1JobSpec(
            template=template,
            backoff_limit=K8S_JOB_BACKOFF_LIMIT,
            ttl_seconds_after_finished=K8S_JOB_TTL_SECONDS_AFTER_FINISHED,
            **user_defined_k8s_config.job_spec_config,
        ),
        **user_defined_k8s_config.job_config,
    )
    return job
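Once construct_dagster_k8s_job returns a kubernetes.client.V1Job, it can be submitted with the standard Kubernetes Python client. A hedged usage sketch follows; the namespace, job name, and CLI args are hypothetical, and job_config stands for a DagsterK8sJobConfig built elsewhere.

import kubernetes

# Load credentials from the local kubeconfig; inside a cluster you would call
# kubernetes.config.load_incluster_config() instead.
kubernetes.config.load_kube_config()

job = construct_dagster_k8s_job(
    job_config=job_config,
    args=["api", "execute_run"],  # hypothetical CLI arguments
    job_name="dagster-run-1234",  # hypothetical job name, <= 63 characters
)

# Submit the Job object to a (hypothetical) "dagster" namespace.
kubernetes.client.BatchV1Api().create_namespaced_job(namespace="dagster", body=job)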
Example #7
 def add_pending_invocation(self, solid):
     solid = check.opt_inst_param(solid, "solid", CallableSolidNode)
     solid_name = solid.given_alias if solid.given_alias else solid.solid_def.name
     self._pending_invocations[solid_name] = solid
Example #8
def get_logging_tags(run_config, pipeline):
    check.opt_inst_param(run_config, 'run_config', RunConfig)
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)

    return merge_dicts({'pipeline': pipeline.name},
                       run_config.tags if run_config else {})
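merge_dicts here presumably performs a right-biased merge, so any 'pipeline' tag in run_config.tags would override the default. A minimal sketch of that behaviour; the helper below is illustrative and is not the dagster.utils implementation.

def merge_dicts(left, right):
    # Right-biased merge: keys present in `right` win on conflict.
    result = dict(left)
    result.update(right)
    return result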
Example #9
def hourly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"], bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_hours_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:
    """Create a partitioned schedule that runs hourly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on.

    The decorated function should return a run configuration dictionary, which will be used as
    configuration for the scheduled run.

    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_hours_offset (Optional[int]): How many hours back to go when choosing the partition
            for a given schedule execution. For example, when partition_hours_offset=1, the schedule
            that executes during hour N will fill in the partition for hour N-1.
            (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_hours_offset, "partition_hours_offset")
    check.opt_str_param(description, "description")

    if start_date.minute != 0 or start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of the hour for an hourly schedule. "
            "Use `execution_time` to execute the schedule at a specific time within the hour. For "
            "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "
            "on 10/20/2020, your schedule definition would look like:"
            """
@hourly_schedule(
    start_date=datetime.datetime(2020, 10, 20, 3),
    execution_time=datetime.time(0, 15)
):
def my_schedule_definition(_):
    ...
"""
        )

    if execution_time.hour != 0:
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."
            "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning."
        )

    cron_schedule = "{minute} * * * *".format(minute=execution_time.minute)

    fmt = (
        DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE
        if execution_timezone
        else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE
    )

    execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(
        hours=partition_hours_offset, minutes=(execution_time.minute - start_date.minute) % 60
    )

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        inclusive=(partition_hours_offset == 0),
    )

    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[
            ["Partition"], Optional[Dict[str, str]]
        ] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date
            )
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn,
            ),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
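A hedged usage sketch of the decorator above: the decorated function receives the partition datetime and returns run config for that partition. The pipeline name and config keys are hypothetical.

import datetime

from dagster import hourly_schedule


@hourly_schedule(
    pipeline_name="my_pipeline",  # hypothetical pipeline
    start_date=datetime.datetime(2020, 10, 20, 0),
    execution_time=datetime.time(0, 15),  # run at a quarter past every hour
)
def my_hourly_schedule(date):
    # Return the run config for the hour partition represented by `date`.
    return {"solids": {"process_data": {"config": {"date": date.strftime("%Y-%m-%d %H:%M")}}}}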
Example #10
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    '''Create a schedule that runs monthly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
    '''
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_inst_param(end_date, 'end_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_selection,
                                  'solid_selection',
                                  of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars,
                         'environment_vars',
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    check.int_param(execution_day_of_month, 'execution_day')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            '`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be '
            'between 1 and 31'.format(execution_day_of_month))

    cron_schedule = '{minute} {hour} {day} * *'.format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=relativedelta(months=1),
                                        fmt="%Y-%m")

    def inner(fn):
        check.callable_param(fn, 'fn')

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_partitions'.format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
Example #11
def scoped_pipeline_context(
    pipeline_def,
    environment_dict,
    run_config,
    instance,
    system_storage_data=None,
    scoped_resources_builder_cm=create_resource_builder,
):
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    check.dict_param(environment_dict, 'environment_dict', key_type=str)
    check.inst_param(run_config, 'run_config', RunConfig)
    check.inst_param(instance, 'instance', DagsterInstance)
    check.opt_inst_param(system_storage_data, 'system_storage_data',
                         SystemStorageData)

    context_creation_data = create_context_creation_data(
        pipeline_def, environment_dict, run_config, instance)

    executor_config = create_executor_config(context_creation_data)

    # After this try block, a Dagster exception thrown will result in a pipeline init failure event.
    try:
        executor_config.check_requirements(
            instance, context_creation_data.system_storage_def)

        log_manager = create_log_manager(context_creation_data)

        with scoped_resources_builder_cm(
                context_creation_data.pipeline_def,
                context_creation_data.environment_config,
                context_creation_data.run_config,
                log_manager,
        ) as scoped_resources_builder:

            system_storage_data = create_system_storage_data(
                context_creation_data, system_storage_data,
                scoped_resources_builder)

            yield construct_pipeline_execution_context(
                context_creation_data=context_creation_data,
                scoped_resources_builder=scoped_resources_builder,
                system_storage_data=system_storage_data,
                log_manager=log_manager,
                executor_config=executor_config,
            )

    except DagsterError as dagster_error:
        user_facing_exc_info = (
            # pylint does not know original_exc_info exists if is_user_code_error is true
            # pylint: disable=no-member
            dagster_error.original_exc_info
            if dagster_error.is_user_code_error else sys.exc_info())

        if executor_config.raise_on_error:
            raise dagster_error

        error_info = serializable_error_info_from_exc_info(
            user_facing_exc_info)
        yield DagsterEvent.pipeline_init_failure(
            pipeline_name=pipeline_def.name,
            failure_data=PipelineInitFailureData(error=error_info),
            log_manager=_create_context_free_log_manager(
                instance, run_config, pipeline_def),
        )
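Because the generator above yields either a pipeline execution context or a DagsterEvent describing an init failure, callers have to branch on what they receive. A consumption sketch, assuming the generator is wrapped with contextlib.contextmanager at its definition (the decorator is not shown in the snippet); handle_event and run_steps are hypothetical helpers.

with scoped_pipeline_context(pipeline_def, environment_dict, run_config, instance) as ctx:
    if isinstance(ctx, DagsterEvent):
        # Initialization failed; the yielded value is a pipeline_init_failure event.
        handle_event(ctx)
    else:
        run_steps(ctx)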
Example #12
def hourly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    '''Create a schedule that runs hourly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
    '''
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_inst_param(end_date, 'end_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_selection,
                                  'solid_selection',
                                  of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars,
                         'environment_vars',
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    if execution_time.hour != 0:
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...)."
            "Since this is a hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.")

    cron_schedule = '{minute} * * * *'.format(minute=execution_time.minute)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=datetime.timedelta(hours=1),
                                        fmt="%Y-%m-%d-%H:%M")

    def inner(fn):
        check.callable_param(fn, 'fn')

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_partitions'.format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
Example #13
 def __init__(self, base_dir, inst_data=None):
     self._base_dir = base_dir
     self._subscription_manager = LocalComputeLogSubscriptionManager(self)
     self._inst_data = check.opt_inst_param(inst_data, 'inst_data',
                                            ConfigurableClassData)
Example #14
    def __init__(
        self,
        host="localhost",
        port=None,
        socket=None,
        max_workers=1,
        loadable_target_origin=None,
        heartbeat=False,
        heartbeat_timeout=30,
        lazy_load_user_code=False,
        ipc_output_file=None,
        fixed_server_id=None,
    ):
        check.opt_str_param(host, "host")
        check.opt_int_param(port, "port")
        check.opt_str_param(socket, "socket")
        check.int_param(max_workers, "max_workers")
        check.opt_inst_param(loadable_target_origin, "loadable_target_origin",
                             LoadableTargetOrigin)
        check.invariant(
            port is not None if seven.IS_WINDOWS else True,
            "You must pass a valid `port` on Windows: `socket` not supported.",
        )
        check.invariant(
            (port or socket) and not (port and socket),
            "You must pass one and only one of `port` or `socket`.",
        )
        check.invariant(
            host is not None if port else True,
            "Must provide a host when serving on a port",
        )
        check.bool_param(heartbeat, "heartbeat")
        check.int_param(heartbeat_timeout, "heartbeat_timeout")
        self._ipc_output_file = check.opt_str_param(ipc_output_file,
                                                    "ipc_output_file")
        check.opt_str_param(fixed_server_id, "fixed_server_id")

        check.invariant(heartbeat_timeout > 0,
                        "heartbeat_timeout must be greater than 0")
        check.invariant(
            max_workers > 1 if heartbeat else True,
            "max_workers must be greater than 1 if heartbeat is True",
        )

        self.server = grpc.server(ThreadPoolExecutor(max_workers=max_workers))
        self._server_termination_event = threading.Event()

        self._api_servicer = DagsterApiServer(
            server_termination_event=self._server_termination_event,
            loadable_target_origin=loadable_target_origin,
            heartbeat=heartbeat,
            heartbeat_timeout=heartbeat_timeout,
            lazy_load_user_code=lazy_load_user_code,
            fixed_server_id=fixed_server_id,
        )

        # Create a health check servicer
        self._health_servicer = health.HealthServicer()
        health_pb2_grpc.add_HealthServicer_to_server(self._health_servicer,
                                                     self.server)

        add_DagsterApiServicer_to_server(self._api_servicer, self.server)

        if port:
            server_address = host + ":" + str(port)
        else:
            server_address = "unix:" + os.path.abspath(socket)

        # grpc.Server.add_insecure_port returns:
        # - 0 on failure
        # - port number when a port is successfully bound
        # - 1 when a UDS is successfully bound
        res = self.server.add_insecure_port(server_address)
        if socket and res != 1:
            if self._ipc_output_file:
                with ipc_write_stream(self._ipc_output_file) as ipc_stream:
                    ipc_stream.send(GrpcServerFailedToBindEvent())
            raise CouldNotBindGrpcServerToAddress(socket)
        if port and res != port:
            if self._ipc_output_file:
                with ipc_write_stream(self._ipc_output_file) as ipc_stream:
                    ipc_stream.send(GrpcServerFailedToBindEvent())
            raise CouldNotBindGrpcServerToAddress(port)
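After add_insecure_port succeeds, a grpcio server is typically started and marked healthy as in the sketch below. This is a generic grpcio serving pattern, assuming the health servicer and server built in the constructor above; the serve helper itself is illustrative and not part of Dagster.

from grpc_health.v1 import health_pb2


def serve(server, health_servicer):
    # Mark the default health-check service as SERVING so standard gRPC health
    # probes succeed, start the server, and block until it terminates.
    health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
    server.start()
    server.wait_for_termination()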
Example #15
 def __init__(self, handle, solid, parent):
     self.handleID = check.inst_param(handle, 'handle', SolidHandle)
     self.solid = check.inst_param(solid, 'solid', DauphinSolid)
     self.parent = check.opt_inst_param(parent, 'parent',
                                        DauphinSolidHandle)
Example #16
def monthly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_day_of_month: int = 1,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"], bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_months_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:
    """Create a partitioned schedule that runs monthly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on.

    The decorated function should return a run configuration dictionary, which will be used as
    configuration for the scheduled run.

    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_months_offset (Optional[int]): How many months back to go when choosing the partition
            for a given schedule execution. For example, when partition_months_offset=1, the schedule
            that executes during month N will fill in the partition for month N-1.
            (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_month, "execution_day")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_months_offset, "partition_months_offset")
    check.opt_str_param(description, "description")

    if (
        start_date.day != 1
        or start_date.hour != 0
        or start_date.minute != 0
        or start_date.second != 0
    ):
        warnings.warn(
            "`start_date` must be at the beginning of the first day of the month for a monthly "
            "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "
            "at a specific time within the month. For example, to run the schedule at 3AM on the "
            "23rd of each month starting in October, your schedule definition would look like:"
            """
@monthly_schedule(
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0)
):
def my_schedule_definition(_):
    ...
"""
        )

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "
            "between 1 and 31".format(execution_day_of_month)
        )

    cron_schedule = "{minute} {hour} {day} * *".format(
        minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_month
    )

    fmt = DEFAULT_MONTHLY_FORMAT

    execution_time_to_partition_fn = (
        lambda d: pendulum.instance(d)
        .replace(hour=0, minute=0)
        .subtract(months=partition_months_offset, days=execution_day_of_month - 1)
    )

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        inclusive=(partition_months_offset == 0),
    )

    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[
            ["Partition"], Optional[Dict[str, str]]
        ] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date
            )
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn
            ),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
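As with the hourly variant, a hedged usage sketch of the decorator above; the pipeline name and config keys are hypothetical.

import datetime

from dagster import monthly_schedule


@monthly_schedule(
    pipeline_name="my_pipeline",  # hypothetical pipeline
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0),
)
def my_monthly_schedule(date):
    # Return the run config for the month partition represented by `date`.
    return {"solids": {"process_month": {"config": {"month": date.strftime("%Y-%m")}}}}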
Example #17
    def __init__(
        self,
        origin: RepositoryLocationOrigin,
        host: Optional[str] = None,
        port: Optional[int] = None,
        socket: Optional[str] = None,
        server_id: Optional[str] = None,
        heartbeat: Optional[bool] = False,
        watch_server: Optional[bool] = True,
        grpc_server_registry: Optional[GrpcServerRegistry] = None,
    ):
        from dagster.grpc.client import DagsterGrpcClient, client_heartbeat_thread

        self._origin = check.inst_param(origin, "origin",
                                        RepositoryLocationOrigin)

        self.grpc_server_registry = check.opt_inst_param(
            grpc_server_registry, "grpc_server_registry", GrpcServerRegistry)

        if isinstance(self.origin, GrpcServerRepositoryLocationOrigin):
            self._port = self.origin.port
            self._socket = self.origin.socket
            self._host = self.origin.host
            self._use_ssl = bool(self.origin.use_ssl)
        else:
            self._port = check.opt_int_param(port, "port")
            self._socket = check.opt_str_param(socket, "socket")
            self._host = check.str_param(host, "host")
            self._use_ssl = False

        self._watch_thread_shutdown_event = None
        self._watch_thread = None

        self._heartbeat_shutdown_event = None
        self._heartbeat_thread = None

        self._heartbeat = check.bool_param(heartbeat, "heartbeat")
        self._watch_server = check.bool_param(watch_server, "watch_server")

        self.server_id = None
        self._external_repositories_data = None

        self._executable_path = None
        self._container_image = None
        self._container_context = None
        self._repository_code_pointer_dict = None
        self._entry_point = None

        try:
            self.client = DagsterGrpcClient(
                port=self._port,
                socket=self._socket,
                host=self._host,
                use_ssl=self._use_ssl,
            )
            list_repositories_response = sync_list_repositories_grpc(
                self.client)

            self.server_id = server_id if server_id else sync_get_server_id(
                self.client)
            self.repository_names = set(
                symbol.repository_name
                for symbol in list_repositories_response.repository_symbols)

            if self._heartbeat:
                self._heartbeat_shutdown_event = threading.Event()

                self._heartbeat_thread = threading.Thread(
                    target=client_heartbeat_thread,
                    args=(
                        self.client,
                        self._heartbeat_shutdown_event,
                    ),
                    name="grpc-client-heartbeat",
                )
                self._heartbeat_thread.daemon = True
                self._heartbeat_thread.start()

            self._executable_path = list_repositories_response.executable_path
            self._repository_code_pointer_dict = (
                list_repositories_response.repository_code_pointer_dict)
            self._entry_point = list_repositories_response.entry_point

            # Back-compat for older gRPC servers that did not include container_image
            # in ListRepositoriesResponse
            self._container_image = (
                list_repositories_response.container_image
                or self._reload_current_image()
            )

            self._container_context = list_repositories_response.container_context

            self._external_repositories_data = sync_get_streaming_external_repositories_data_grpc(
                self.client,
                self,
            )

            self.external_repositories = {
                repo_name: ExternalRepository(
                    repo_data,
                    RepositoryHandle(
                        repository_name=repo_name,
                        repository_location=self,
                    ),
                )
                for repo_name, repo_data in
                self._external_repositories_data.items()
            }
        except:
            self.cleanup()
            raise
Example #18
def check_opt_field_param(obj, param_name):
    return check.opt_inst_param(obj, param_name, FieldImpl)
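
A minimal sketch of how this optional check behaves: `None` passes through unchanged, while any value that is not a `FieldImpl` instance raises a `CheckError` from `dagster.check`.

# None is allowed and returned as-is by check.opt_inst_param.
assert check_opt_field_param(None, "field") is None

# A value of the wrong type raises dagster.check.CheckError.
try:
    check_opt_field_param("not_a_field", "field")
except check.CheckError:
    pass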
Example #19
    def __init__(self, solid_def, given_alias=None, tags=None):
        self.solid_def = solid_def
        self.given_alias = check.opt_str_param(given_alias, 'given_alias')
        self.tags = check.opt_inst_param(tags, 'tags', frozentags)
Example #20
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
        check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    'Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} '
                    'that conflicts with solids_to_execute arg {solids_to_execute}'.format(
                        pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute
                )

        if execution_plan is None:
            execution_plan = create_execution_plan(
                pipeline_def,
                run_config=run_config,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
            )

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(mode, 'mode', default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_def.get_pipeline_snapshot_id()
            ),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
        )
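
`create_run_for_pipeline` is a method on `DagsterInstance`. A hedged usage sketch with a throwaway pipeline follows; the solid and pipeline names are illustrative assumptions.

from dagster import DagsterInstance, pipeline, solid


@solid
def say_hello(context):
    context.log.info("hello")


@pipeline
def hello_pipeline():
    say_hello()


# Creates the PipelineRun record (including pipeline and execution plan snapshots)
# without actually launching execution.
instance = DagsterInstance.ephemeral()
run = instance.create_run_for_pipeline(pipeline_def=hello_pipeline, run_config={})
print(run.run_id, run.pipeline_name)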
Example #21
File: inputs.py Project: keyz/dagster
    def __new__(cls, input_name: str, type_check_data: TypeCheckData):
        return super(StepInputData, cls).__new__(
            cls,
            input_name=check.str_param(input_name, "input_name"),
            type_check_data=check.opt_inst_param(type_check_data, "type_check_data", TypeCheckData),
        )
Example #22
@contextmanager  # consumed as `with scoped_pipeline_context(...) as pipeline_context:`
def scoped_pipeline_context(
    pipeline_def,
    environment_dict,
    pipeline_run,
    instance,
    execution_plan,
    system_storage_data=None,
    scoped_resources_builder_cm=create_resource_builder,
    raise_on_error=False,
):
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    check.dict_param(environment_dict, 'environment_dict', key_type=str)
    check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    check.inst_param(instance, 'instance', DagsterInstance)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_inst_param(system_storage_data, 'system_storage_data',
                         SystemStorageData)

    context_creation_data = create_context_creation_data(
        pipeline_def,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan,
    )

    # A DagsterError raised inside this try block before the context is yielded will result in a
    # pipeline init failure event.
    pipeline_context = None
    try:
        executor_config = create_executor_config(context_creation_data)

        log_manager = create_log_manager(context_creation_data)

        with scoped_resources_builder_cm(
                context_creation_data.pipeline_def,
                context_creation_data.environment_config,
                context_creation_data.pipeline_run,
                log_manager,
                context_creation_data.resource_keys_to_init,
        ) as scoped_resources_builder:

            system_storage_data = create_system_storage_data(
                context_creation_data, system_storage_data,
                scoped_resources_builder)

            pipeline_context = construct_pipeline_execution_context(
                context_creation_data=context_creation_data,
                scoped_resources_builder=scoped_resources_builder,
                system_storage_data=system_storage_data,
                log_manager=log_manager,
                executor_config=executor_config,
                raise_on_error=raise_on_error,
            )
            yield pipeline_context

    except DagsterError as dagster_error:
        # only yield an init failure event if we haven't already yielded context
        if pipeline_context is None:
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists if is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error else sys.exc_info())

            error_info = serializable_error_info_from_exc_info(
                user_facing_exc_info)
            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(
                    instance, pipeline_run, pipeline_def),
            )

            if raise_on_error:
                raise dagster_error

        # if we've caught an error after context init we're in a problematic state and should just raise
        else:
            raise dagster_error
Example #23
def execute_solid(
    solid_def,
    mode_def=None,
    input_values=None,
    tags=None,
    run_config=None,
    raise_on_error=True,
    environment_dict=None,
):
    '''Execute a single solid in an ephemeral pipeline.

    Intended to support unit tests. Input values may be passed directly, and no pipeline need be
    specified -- an ephemeral pipeline will be constructed.

    Args:
        solid_def (SolidDefinition): The solid to execute.
        mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this
            if, e.g., custom resources, loggers, or executors are desired.
        input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to
            pass inputs to the solid directly. You may also use the ``run_config`` to
            configure any inputs that are configurable.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        run_config (Optional[dict]): The environment configuration that parameterizes this
            execution, as a dict.
        environment_dict (Optional[dict]): Deprecated alias for ``run_config``, retained for
            backwards compatibility.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
        Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the
        solid.
    '''
    check.inst_param(solid_def, 'solid_def', ISolidDefinition)
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    input_values = check.opt_dict_param(input_values,
                                        'input_values',
                                        key_type=str)
    # backcompat: accept the deprecated environment_dict argument in place of run_config
    run_config = canonicalize_run_config(run_config, environment_dict)

    solid_defs = [solid_def]

    def create_value_solid(input_name, input_value):
        @lambda_solid(name=input_name)
        def input_solid():
            return input_value

        return input_solid

    dependencies = defaultdict(dict)

    for input_name, input_value in input_values.items():
        dependencies[solid_def.name][input_name] = DependencyDefinition(
            input_name)
        solid_defs.append(create_value_solid(input_name, input_value))

    result = execute_pipeline(
        PipelineDefinition(
            name='ephemeral_{}_solid_pipeline'.format(solid_def.name),
            solid_defs=solid_defs,
            dependencies=dependencies,
            mode_defs=[mode_def] if mode_def else None,
        ),
        run_config=run_config,
        mode=mode_def.name if mode_def else None,
        tags=tags,
        raise_on_error=raise_on_error,
    )
    return result.result_for_handle(solid_def.name)
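
The docstring above already describes this test helper; the following is a minimal usage sketch in which the `add_one` solid and its input value are illustrative assumptions.

from dagster import execute_solid, solid


@solid
def add_one(_, num: int) -> int:
    return num + 1


# Inputs are passed directly; an ephemeral single-solid pipeline is built behind the scenes.
result = execute_solid(add_one, input_values={"num": 5})
assert result.success
assert result.output_value() == 6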
Example #24
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
        check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute,
                            "solids_to_execute",
                            of_type=str)
        check.opt_list_param(solid_selection, "solid_selection", of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by IPipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "
                    "that conflicts with solids_to_execute arg {solids_to_execute}"
                    .format(
                        pipeline_solids_to_execute=str_format_list(
                            pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute)

        full_execution_plan = execution_plan or create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        check.invariant(
            len(full_execution_plan.step_keys_to_execute) == len(
                full_execution_plan.steps))

        if _is_memoized_run(tags):
            if step_keys_to_execute:
                raise DagsterInvariantViolationError(
                    "step_keys_to_execute parameter cannot be used in conjunction with memoized "
                    "pipeline runs.")

            step_keys_to_execute = self.resolve_unmemoized_steps(
                full_execution_plan,
                run_config=run_config,
                mode=mode,
            )  # TODO: tighter integration with existing step_keys_to_execute functionality

        subsetted_execution_plan = (
            full_execution_plan.build_subset_plan(step_keys_to_execute)
            if step_keys_to_execute else full_execution_plan)

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(
                mode, "mode", default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                subsetted_execution_plan,
                pipeline_def.get_pipeline_snapshot_id()),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
        )
Example #25
def launch_scheduled_runs_for_schedule(
    instance,
    logger,
    external_schedule: ExternalSchedule,
    schedule_state: InstigatorState,
    workspace,
    end_datetime_utc: datetime.datetime,
    max_catchup_runs,
    max_tick_retries,
    debug_crash_flags=None,
    log_verbose_checks=True,
):
    instance = check.inst_param(instance, "instance", DagsterInstance)
    schedule_state = check.opt_inst_param(schedule_state, "schedule_state", InstigatorState)
    end_datetime_utc = check.inst_param(end_datetime_utc, "end_datetime_utc", datetime.datetime)

    instigator_origin_id = external_schedule.get_external_origin_id()
    ticks = instance.get_ticks(instigator_origin_id, limit=1)
    latest_tick = ticks[0] if ticks else None

    instigator_data = cast(ScheduleInstigatorData, schedule_state.instigator_data)
    start_timestamp_utc = instigator_data.start_timestamp if schedule_state else None

    if latest_tick:
        if latest_tick.status == TickStatus.STARTED or (
            latest_tick.status == TickStatus.FAILURE
            and latest_tick.failure_count <= max_tick_retries
        ):
            # Scheduler was interrupted while performing this tick, re-do it
            start_timestamp_utc = (
                max(start_timestamp_utc, latest_tick.timestamp)
                if start_timestamp_utc
                else latest_tick.timestamp
            )
        else:
            start_timestamp_utc = (
                max(start_timestamp_utc, latest_tick.timestamp + 1)
                if start_timestamp_utc
                else latest_tick.timestamp + 1
            )
    else:
        start_timestamp_utc = instigator_data.start_timestamp

    schedule_name = external_schedule.name

    timezone_str = external_schedule.execution_timezone
    if not timezone_str:
        timezone_str = "UTC"
        if log_verbose_checks:
            logger.warn(
                f"Using UTC as the timezone for {external_schedule.name} as it did not specify "
                "an execution_timezone in its definition."
            )

    tick_times = []
    for next_time in external_schedule.execution_time_iterator(start_timestamp_utc):
        if next_time.timestamp() > end_datetime_utc.timestamp():
            break

        tick_times.append(next_time)

    if not tick_times:
        if log_verbose_checks:
            logger.info(f"No new runs for {schedule_name}")
        return

    if not external_schedule.partition_set_name and len(tick_times) > 1:
        logger.warning(f"{schedule_name} has no partition set, so not trying to catch up")
        tick_times = tick_times[-1:]
    elif len(tick_times) > max_catchup_runs:
        logger.warning(f"{schedule_name} has fallen behind, only launching {max_catchup_runs} runs")
        tick_times = tick_times[-max_catchup_runs:]

    if len(tick_times) == 1:
        tick_time = tick_times[0].strftime(default_date_format_string())
        logger.info(f"Evaluating schedule `{schedule_name}` at {tick_time}")
    else:
        times = ", ".join([time.strftime(default_date_format_string()) for time in tick_times])
        logger.info(f"Evaluating schedule `{schedule_name}` at the following times: {times}")

    for schedule_time in tick_times:
        schedule_timestamp = schedule_time.timestamp()
        schedule_time_str = schedule_time.strftime(default_date_format_string())
        if latest_tick and latest_tick.timestamp == schedule_timestamp:
            tick = latest_tick
            if latest_tick.status == TickStatus.FAILURE:
                logger.info(f"Retrying previously failed schedule execution at {schedule_time_str}")
            else:
                logger.info(
                    f"Resuming previously interrupted schedule execution at {schedule_time_str}"
                )
        else:
            tick = instance.create_tick(
                TickData(
                    instigator_origin_id=instigator_origin_id,
                    instigator_name=schedule_name,
                    instigator_type=InstigatorType.SCHEDULE,
                    status=TickStatus.STARTED,
                    timestamp=schedule_timestamp,
                )
            )

            _check_for_debug_crash(debug_crash_flags, "TICK_CREATED")

        with _ScheduleLaunchContext(tick, instance, logger) as tick_context:
            try:
                _check_for_debug_crash(debug_crash_flags, "TICK_HELD")

                yield from _schedule_runs_at_time(
                    instance,
                    logger,
                    workspace,
                    external_schedule,
                    schedule_time,
                    tick_context,
                    debug_crash_flags,
                )
            except Exception as e:
                if isinstance(e, DagsterUserCodeUnreachableError):
                    try:
                        raise DagsterSchedulerError(
                            f"Unable to reach the user code server for schedule {schedule_name}. Schedule will resume execution once the server is available."
                        ) from e
                    except:
                        error_data = serializable_error_info_from_exc_info(sys.exc_info())

                        tick_context.update_state(
                            TickStatus.FAILURE,
                            error=error_data,
                            # don't increment the failure count - retry forever until the server comes back up
                            # or the schedule is turned off
                            failure_count=tick_context.failure_count,
                        )
                        raise  # Raise the wrapped DagsterSchedulerError exception

                else:
                    error_data = serializable_error_info_from_exc_info(sys.exc_info())
                    tick_context.update_state(
                        TickStatus.FAILURE,
                        error=error_data,
                        failure_count=tick_context.failure_count + 1,
                    )
                    raise
Example #26
    def __init__(self, conn_string, inst_data=None):
        check.str_param(conn_string, 'conn_string')
        self.engine = create_engine(conn_string)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)
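
A hedged sketch of the constructor pattern above: a stand-in class with the same shape, instantiated against an in-memory SQLite database. The class name and the table-free query are illustrative; only the `conn_string`/`inst_data` arguments come from the snippet.

import sqlalchemy as db


class ExampleSqlStorage:
    # Illustrative stand-in mirroring the constructor in the snippet above.
    def __init__(self, conn_string, inst_data=None):
        self.engine = db.create_engine(conn_string)
        self._inst_data = inst_data


storage = ExampleSqlStorage("sqlite:///:memory:")
with storage.engine.connect() as conn:
    assert conn.execute(db.text("SELECT 1")).scalar() == 1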
Example #27
def _check_execute_pipeline_args(pipeline,
                                 run_config,
                                 mode,
                                 preset,
                                 tags,
                                 instance,
                                 solid_selection=None):
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)

    run_config = check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(preset, "preset")
    check.invariant(
        not (mode is not None and preset is not None),
        "You may set only one of `mode` (got {mode}) or `preset` (got {preset})."
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                "The environment set in preset '{preset}' does not agree with the environment "
                "passed in the `run_config` argument.".format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None
                or solid_selection == pipeline_preset.solid_selection,
                "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "
                "the `solid_selection` argument: {solid_selection}".format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            "Mode {mode} does not agree with the mode set in preset '{preset}': "
            "('{preset_mode}')".format(preset=preset,
                                       preset_mode=pipeline_preset.mode,
                                       mode=mode),
        )

        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                "You have attempted to execute pipeline {name} with mode {mode}. "
                "Available modes: {modes}").format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "
                "attempted to execute it without specifying a mode. Set "
                "mode property on the PipelineRun object.").format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, "instance", DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        instance,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
Example #28
File: pipeline.py Project: xaniasd/dagster
    def __init__(
            self,
            solid_defs,
            name=None,
            description=None,
            dependencies=None,
            mode_defs=None,
            preset_defs=None,
            tags=None,
            hook_defs=None,
            input_mappings=None,
            output_mappings=None,
            config_mapping=None,
            positional_inputs=None,
            _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        if not name:
            warnings.warn(
                "Pipeline must have a name. Names will be required starting in 0.10.0 or later."
            )
            name = _anonymous_pipeline_name()

        # For these warnings they check truthiness because they get changed to [] higher
        # in the stack for the decorator case

        if input_mappings:
            experimental_arg_warning("input_mappings", "PipelineDefinition")

        if output_mappings:
            experimental_arg_warning("output_mappings", "PipelineDefinition")

        if config_mapping is not None:
            experimental_arg_warning("config_mapping", "PipelineDefinition")

        if positional_inputs:
            experimental_arg_warning("positional_inputs", "PipelineDefinition")

        super(PipelineDefinition, self).__init__(
            name=name,
            description=description,
            dependencies=dependencies,
            node_defs=solid_defs,
            tags=check.opt_dict_param(tags, "tags", key_type=str),
            positional_inputs=positional_inputs,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            config_mapping=config_mapping,
        )

        self._current_level_node_defs = solid_defs
        self._tags = validate_tags(tags)

        mode_definitions = check.opt_list_param(mode_defs,
                                                "mode_defs",
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names.").format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dagster_type_dict = construct_dagster_type_dictionary(
            self._current_level_node_defs)

        self._hook_defs = check.opt_set_param(hook_defs,
                                              "hook_defs",
                                              of_type=HookDefinition)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs",
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names.").format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(
            self._mode_definitions,
            self._current_level_node_defs,
            self._dagster_type_dict,
            self._solid_dict,
            self._hook_defs,
        )

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        # Recursively explore all nodes in this pipeline
        self._all_node_defs = _build_all_node_defs(
            self._current_level_node_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition)
        self._cached_run_config_schemas = {}
        self._cached_external_pipeline = None
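
A hedged construction sketch matching the constructor above, using explicit `PipelineDefinition` construction rather than the `@pipeline` decorator; the solid, pipeline, and mode names are illustrative assumptions.

from dagster import ModeDefinition, PipelineDefinition, solid


@solid
def emit_number(_):
    return 1


pipeline_def = PipelineDefinition(
    solid_defs=[emit_number],
    name="numbers_pipeline",
    mode_defs=[ModeDefinition(name="default")],
    tags={"team": "data"},
)
assert pipeline_def.has_mode_definition("default")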
Example #29
    def get_context(self, solid_config=None, mode_def=None, environment_dict=None):
        '''Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            environment_dict(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        '''
        check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
            self.context, DagstermillRuntimeExecutionContext
        ):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(logger_defs={'dagstermill': colored_console_logger})
            environment_dict['loggers'] = {'dagstermill': {}}

        solid_def = SolidDefinition(
            name='this_solid',
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description='Ephemeral solid constructed by dagstermill.get_context()',
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def], mode_defs=[mode_def], name='ephemeral_dagstermill_pipeline'
        )

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        execution_plan = create_execution_plan(self.pipeline, environment_dict, mode=mode_def.name)
        with scoped_pipeline_context(
            execution_plan,
            environment_dict,
            pipeline_run,
            DagsterInstance.ephemeral(),
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan, pipeline_context.system_storage_def
                ),
            )

        return self.context
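
The docstring above describes interactive use from a notebook; below is a hedged sketch of calling it through the `dagstermill` module, where the solid config value is an illustrative assumption.

import dagstermill

# In a notebook cell: build an ephemeral context for exploration and development.
context = dagstermill.get_context(solid_config={"date": "2020-01-01"})
context.log.info("exploring with config: {}".format(context.solid_config))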
Example #30
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        pipeline_origin_packed,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod.
        """

        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.invariant(
            len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
        )
        check.dict_param(run_config, "run_config")
        check.str_param(mode, "mode")
        check.str_param(repo_name, "repo_name")
        check.str_param(repo_location_name, "repo_location_name")
        check.str_param(run_id, "run_id")

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")
        check.dict_param(retries_dict, "retries_dict")

        pipeline_origin = unpack_value(
            check.dict_param(
                pipeline_origin_packed, "pipeline_origin_packed"
            )  # TODO: make part of args
        )
        check.inst(pipeline_origin, PipelineOrigin)

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict
        )
        check.opt_inst_param(
            user_defined_k8s_config, "user_defined_k8s_config", UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running inside the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)

        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([EventMetadataEntry.text(step_key, "Step keys"),]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(run_id, step_key)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=None,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            )
        )
        command = ["dagster"]
        args = ["api", "execute_step_with_structured_logs", input_json]

        job = construct_dagster_k8s_job(
            job_config, command, args, job_name, user_defined_k8s_config, pod_name
        )

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), "Image pull secrets"
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), "Service account name"
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not have access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so do not proceed.
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step keys"),
                            EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                            EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step keys"),
                            EventMetadataEntry.text(str(e), "Error"),
                        ]
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            return

        try:
            wait_for_job_success(
                job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events