Example #1
    def __init__(
        self,
        retries: RetryMode,
        max_concurrent: int,
        start_method: Optional[str] = None,
        explicit_forkserver_preload: Optional[List[str]] = None,
    ):
        self._retries = check.inst_param(retries, "retries", RetryMode)
        max_concurrent = max_concurrent if max_concurrent else multiprocessing.cpu_count()
        self._max_concurrent = check.int_param(max_concurrent, "max_concurrent")
        start_method = check.opt_str_param(start_method, "start_method")
        valid_starts = multiprocessing.get_all_start_methods()

        if start_method is None:
            start_method = "spawn"

        if start_method not in valid_starts:
            raise DagsterUnmetExecutorRequirementsError(
                f"The selected start_method '{start_method}' is not available. "
                f"Only {valid_starts} are valid options on {sys.platform} python {sys.version}.",
            )
        self._start_method = start_method
        self._explicit_forkserver_preload = explicit_forkserver_preload
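
A minimal standalone sketch of the same start-method validation, with hypothetical names and a plain ValueError standing in for DagsterUnmetExecutorRequirementsError, runnable without Dagster installed:

import multiprocessing
import sys


def resolve_start_method(start_method=None):
    # Mirror the validation above: default to "spawn", then reject anything
    # the current platform does not support.
    valid_starts = multiprocessing.get_all_start_methods()
    start_method = start_method if start_method is not None else "spawn"
    if start_method not in valid_starts:
        raise ValueError(
            f"The selected start_method '{start_method}' is not available. "
            f"Only {valid_starts} are valid options on {sys.platform} python {sys.version}."
        )
    return start_method


print(resolve_start_method())  # 'spawn' is available on every platform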
Example #2
    def materialize(
        self,
        selection: Optional[Union[str, List[str]]] = None
    ) -> ExecuteInProcessResult:
        """
        Executes an in-process run that materializes all assets in the group.

        The execution proceeds serially, in a single thread. Only supported by AssetGroups that have
        no executor_def or that use the in-process executor.

        Args:
            selection (Union[str, List[str]]): A single selection query or list of selection queries
                for assets in the group. For example:

                    - ``['some_asset_key']`` selects ``some_asset_key`` itself.
                    - ``['*some_asset_key']`` selects ``some_asset_key`` and all its ancestors (upstream dependencies).
                    - ``['*some_asset_key+++']`` selects ``some_asset_key``, all its ancestors, and all its descendants (downstream dependencies) within 3 levels down.
                    - ``['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+']`` selects ``some_asset_key`` and all its ancestors, ``other_asset_key_a`` itself, and ``other_asset_key_b`` and its direct child asset keys. When subselecting into a multi-asset, all of the asset keys in that multi-asset must be selected.

        Returns:
            ExecuteInProcessResult: The result of the execution.
        """
        if self.executor_def and self.executor_def is not in_process_executor:
            raise DagsterUnmetExecutorRequirementsError(
                "'materialize' can only be invoked on AssetGroups which have no executor or have "
                "the in_process_executor, but the AssetGroup had executor "
                f"'{self.executor_def.name}'")

        return self.build_job(name="in_process_materialization_job",
                              selection=selection).execute_in_process()
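
A hedged usage sketch of the selection syntax documented above; the asset names are hypothetical, and it assumes an AssetGroup-era Dagster install where dagster.AssetGroup is importable:

from dagster import AssetGroup, asset


@asset
def upstream():
    return 1


@asset
def downstream(upstream):
    return upstream + 1


group = AssetGroup([upstream, downstream])

# '*downstream' selects downstream and all of its ancestors, so both assets run.
result = group.materialize(selection="*downstream")
assert result.success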
Example #3
def check_non_ephemeral_instance(instance):
    if instance.is_ephemeral:
        raise DagsterUnmetExecutorRequirementsError(
            'You have attempted to use a multi-process executor with an ephemeral DagsterInstance. '
            'A non-ephemeral instance is needed to coordinate execution between multiple processes. '
            'You can configure your default instance via $DAGSTER_HOME or ensure a valid one is '
            'passed when invoking the python APIs.')
Example #4
def _check_non_ephemeral_instance(instance):
    if instance.is_ephemeral:
        raise DagsterUnmetExecutorRequirementsError(
            "You have attempted to use an executor that uses multiple processes with an "
            "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "
            "execution between multiple processes. You can configure your default instance "
            "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs."
        )
Example #5
def check_persistent_storage_requirement(system_storage_def):
    if not system_storage_def.is_persistent:
        raise DagsterUnmetExecutorRequirementsError((
            'You have attempted to use a multi-process executor while using system '
            'storage {storage_name} which does not persist intermediates. '
            'This means there would be no way to move data between different '
            'processes. Please configure your pipeline in the storage config '
            'section to use persistent system storage such as the filesystem.'
        ).format(storage_name=system_storage_def.name))
Example #6
def _check_intra_process_pipeline(pipeline):
    if not isinstance(pipeline, ReconstructablePipeline):
        raise DagsterUnmetExecutorRequirementsError(
            'You have attempted to use an executor that uses multiple processes with the pipeline "{name}" '
            'that is not reconstructable. Pipelines must be loaded in a way that allows dagster to reconstruct '
            'them in a new process. This means: \n'
            '  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n'
            '  * loading the pipeline through the reconstructable() function\n'
            .format(name=pipeline.get_definition().name))
Example #7
def _check_non_ephemeral_instance(instance):
    if instance.is_ephemeral:
        raise DagsterUnmetExecutorRequirementsError(
            "You have attempted to use an executor that uses multiple processes with an "
            "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "
            "execution between multiple processes. You can configure your default instance "
            "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs. "
            "You can learn more about setting up a persistent DagsterInstance from the "
            "DagsterInstance docs here: https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"
        )
Example #8
def _check_pipeline_has_target_handle(pipeline_def):
    from dagster.core.definitions.handle import ExecutionTargetHandle

    handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    if not handle:
        raise DagsterUnmetExecutorRequirementsError(
            'You have attempted to use an executor that uses multiple processes with the pipeline "{name}" '
            'that cannot be re-hydrated. Pipelines must be loaded in a way that allows dagster to reconstruct '
            'them in a new process. This means: \n'
            '  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n'
            '  * constructing an ExecutionTargetHandle directly\n'.format(
                name=pipeline_def.name))
Example #9
def _check_persistent_storage_requirement(intermediate_storage_def, system_storage_def):
    if intermediate_storage_def:
        if not intermediate_storage_def.is_persistent:
            raise DagsterUnmetExecutorRequirementsError(
                (
                    "You have attempted to use an executor that uses multiple processes while using "
                    "intermediate storage {storage_name} which does not persist intermediates. "
                    "This means there would be no way to move data between different "
                    "processes. Please configure your pipeline in the storage config "
                    "section to use persistent system storage such as the filesystem."
                ).format(storage_name=intermediate_storage_def.name)
            )
    elif not system_storage_def.is_persistent:
        raise DagsterUnmetExecutorRequirementsError(
            (
                "You have attempted to use an executor that uses multiple processes while using system "
                "storage {storage_name} which does not persist intermediates. "
                "This means there would be no way to move data between different "
                "processes. Please configure your pipeline in the storage config "
                "section to use persistent system storage such as the filesystem."
            ).format(storage_name=system_storage_def.name)
        )
Example #10
def _check_intra_process_pipeline(pipeline):
    from dagster.core.definitions import JobDefinition

    if not isinstance(pipeline, ReconstructablePipeline):
        target = "job" if isinstance(pipeline.get_definition(),
                                     JobDefinition) else "pipeline"
        raise DagsterUnmetExecutorRequirementsError(
            'You have attempted to use an executor that uses multiple processes with the {target} "{name}" '
            "that is not reconstructable. {target_cap} must be loaded in a way that allows dagster to reconstruct "
            "them in a new process. This means: \n"
            "  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n"
            "  * loading the {target} through the reconstructable() function\n"
            .format(target=target,
                    name=pipeline.get_definition().name,
                    target_cap=target.capitalize()))
Example #11
def _check_persistent_storage_requirement(pipeline_def, mode_def,
                                          intermediate_storage_def):
    """We prefer to store outputs with IO managers, but will fall back to intermediate storage
    if an IO manager isn't set.
    """
    if not (_all_outputs_non_mem_io_managers(pipeline_def, mode_def) or
            (intermediate_storage_def
             and intermediate_storage_def.is_persistent)):
        raise DagsterUnmetExecutorRequirementsError(
            "You have attempted to use an executor that uses multiple processes, but your pipeline "
            "includes solid outputs that will not be stored somewhere where other processes can"
            "retrieve them. "
            "Please make sure that your pipeline definition includes a ModeDefinition whose "
            'resource_keys assign the "io_manager" key to an IOManager resource '
            "that stores outputs outside of the process, such as the fs_io_manager."
        )
Example #12
def _check_persistent_storage_requirement(
    pipeline: IPipeline,
    mode_def: ModeDefinition,
    environment_config: EnvironmentConfig,
) -> None:
    from dagster.core.execution.context_creation_pipeline import executor_def_from_config

    pipeline_def = pipeline.get_definition()
    executor_def = executor_def_from_config(mode_def, environment_config)
    if ExecutorRequirement.PERSISTENT_OUTPUTS not in executor_def.requirements:
        return

    intermediate_storage_def = environment_config.intermediate_storage_def_for_mode(
        mode_def)

    if not (can_isolate_steps(pipeline_def, mode_def) or
            (intermediate_storage_def
             and intermediate_storage_def.is_persistent)):
        raise DagsterUnmetExecutorRequirementsError(
            "You have attempted to use an executor that uses multiple processes, but your pipeline "
            "includes solid outputs that will not be stored somewhere where other processes can "
            "retrieve them. Please use a persistent IO manager for these outputs. E.g. with\n"
            '    @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])'
        )
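
A hedged sketch expanding the fix suggested in the error message above: assigning a persistent IO manager so outputs survive across processes. The solid and pipeline names are hypothetical.

from dagster import ModeDefinition, fs_io_manager, pipeline, solid


@solid
def emit(context):
    return 1


@pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])
def my_pipeline():
    # fs_io_manager writes each output to the local filesystem, so other
    # processes can read it back, which is the persistence the check requires.
    emit()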
Example #13
def celery_k8s_job_executor(init_context):
    """Celery-based executor which launches tasks as Kubernetes Jobs.

    The Celery executor exposes config settings for the underlying Celery app under
    the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced
    in Celery version 4.0 and the object constructed from config will be passed to the
    :py:class:`celery.Celery` constructor as its ``config_source`` argument.
    (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)

    The executor also exposes the ``broker``, ``backend``, and ``include`` arguments to the
    :py:class:`celery.Celery` constructor.

    In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use
    Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently
    modified, but that when solid executions are especially fast or slow, or when there are
    different requirements around idempotence or retry, it may make sense to execute pipelines
    with variations on these settings.

    If you'd like to configure a Celery Kubernetes Job executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py
       :language: python

    Then you can configure the executor as follows:

    .. code-block:: YAML

        execution:
          celery-k8s:
            config:
              job_image: 'my_repo.com/image_name:latest'
              job_namespace: 'some-namespace'
              broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker
              backend: 'rpc://' # Optional[str]: The URL of the Celery results backend
              include: ['my_module'] # Optional[List[str]]: Modules every worker should import
              config_source: # Dict[str, Any]: Any additional parameters to pass to the
                  #...       # Celery workers. This dict will be passed as the `config_source`
                  #...       # argument of celery.Celery().

    Note that the YAML you provide here must align with the configuration with which the Celery
    workers on which you hope to run were started. If, for example, you point the executor at a
    different broker than the one your workers are listening to, the workers will never be able to
    pick up tasks for execution.

    In deployments where the celery_k8s_job_executor is used, all appropriate celery and dagster_celery
    commands must be invoked with the `-A dagster_celery_k8s.app` argument.
    """

    run_launcher = init_context.instance.run_launcher
    exc_cfg = init_context.executor_config

    if not isinstance(run_launcher, CeleryK8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a CeleryK8sRunLauncher; configure the "
            "CeleryK8sRunLauncher on your instance to use it."
        )

    job_config = DagsterK8sJobConfig(
        dagster_home=run_launcher.dagster_home,
        instance_config_map=run_launcher.instance_config_map,
        postgres_password_secret=run_launcher.postgres_password_secret,
        job_image=exc_cfg.get("job_image")
        or os.getenv("DAGSTER_CURRENT_IMAGE"),
        image_pull_policy=exc_cfg.get("image_pull_policy"),
        image_pull_secrets=exc_cfg.get("image_pull_secrets"),
        service_account_name=exc_cfg.get("service_account_name"),
        env_config_maps=exc_cfg.get("env_config_maps"),
        env_secrets=exc_cfg.get("env_secrets"),
    )

    # Set on the instance but overridable here
    broker = run_launcher.broker or exc_cfg.get("broker")
    backend = run_launcher.backend or exc_cfg.get("backend")
    config_source = run_launcher.config_source or exc_cfg.get("config_source")
    include = run_launcher.include or exc_cfg.get("include")
    retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))

    return CeleryK8sJobExecutor(
        broker=broker,
        backend=backend,
        config_source=config_source,
        include=include,
        retries=retries,
        job_config=job_config,
        job_namespace=exc_cfg.get("job_namespace"),
        load_incluster_config=exc_cfg.get("load_incluster_config"),
        kubeconfig_file=exc_cfg.get("kubeconfig_file"),
        repo_location_name=exc_cfg.get("repo_location_name"),
    )
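
A hedged sketch of the same executor config expressed as a Python run_config dict, swapping the default RabbitMQ broker for Redis as the docstring suggests; the image, namespace, and Redis URLs are hypothetical:

run_config = {
    "execution": {
        "celery-k8s": {
            "config": {
                "job_image": "my_repo.com/image_name:latest",
                "job_namespace": "some-namespace",
                # Redis in place of RabbitMQ for both broker and results backend.
                "broker": "redis://redis.example.com:6379/0",
                "backend": "redis://redis.example.com:6379/1",
            }
        }
    }
}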
Example #14
def k8s_job_executor(init_context: InitExecutorContext) -> Executor:
    """
    Executor which launches steps as Kubernetes Jobs.

    To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py
       :start-after: start_marker
       :end-before: end_marker
       :language: python

    Then you can configure the executor with run config as follows:

    .. code-block:: YAML

        execution:
          config:
            job_namespace: 'some-namespace'
            image_pull_policy: ...
            image_pull_secrets: ...
            service_account_name: ...
            env_config_maps: ...
            env_secrets: ...
            job_image: ... # leave out if using userDeployments
    """

    run_launcher = init_context.instance.run_launcher
    if not isinstance(run_launcher, K8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a K8sRunLauncher; configure the "
            "K8sRunLauncher on your instance to use it."
        )

    exc_cfg = init_context.executor_config
    job_config = DagsterK8sJobConfig(
        dagster_home=run_launcher.dagster_home,
        instance_config_map=run_launcher.instance_config_map,
        postgres_password_secret=run_launcher.postgres_password_secret,
        job_image=exc_cfg.get("job_image"),
        image_pull_policy=(exc_cfg.get("image_pull_policy")
                           if exc_cfg.get("image_pull_policy") is not None else
                           run_launcher.image_pull_policy),
        image_pull_secrets=run_launcher.image_pull_secrets +
        (exc_cfg.get("image_pull_secrets") or []),
        service_account_name=(exc_cfg.get("service_account_name")
                              if exc_cfg.get("service_account_name") is not None
                              else run_launcher.service_account_name),
        env_config_maps=run_launcher.env_config_maps +
        (exc_cfg.get("env_config_maps") or []),
        env_secrets=run_launcher.env_secrets +
        (exc_cfg.get("env_secrets") or []),
        volume_mounts=run_launcher.volume_mounts +
        (exc_cfg.get("volume_mounts") or []),
        volumes=run_launcher.volumes + (exc_cfg.get("volumes") or []),
    )

    return StepDelegatingExecutor(
        K8sStepHandler(
            job_config=job_config,
            job_namespace=(exc_cfg.get("job_namespace")
                           if exc_cfg.get("job_namespace") is not None else
                           run_launcher.job_namespace),
            load_incluster_config=run_launcher.load_incluster_config,
            kubeconfig_file=run_launcher.kubeconfig_file,
        ),
        retries=RetryMode.from_config(init_context.executor_config["retries"]),
        should_verify_step=True,
    )
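
A small illustrative sketch (plain Python, not Dagster API) of the two combination strategies visible in this executor: list-valued settings are concatenated onto the run launcher's values, while scalar settings fall back to the launcher only when the executor config leaves them unset.

def merge_list(launcher_value, executor_value):
    # e.g. env_config_maps, env_secrets, volume_mounts, volumes above
    return launcher_value + (executor_value or [])


def override_scalar(launcher_value, executor_value):
    # e.g. image_pull_policy, service_account_name, job_namespace above
    return executor_value if executor_value is not None else launcher_value


assert merge_list(["launcher-cm"], ["executor-cm"]) == ["launcher-cm", "executor-cm"]
assert override_scalar("Always", None) == "Always"
assert override_scalar("Always", "IfNotPresent") == "IfNotPresent"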
Example #15
def k8s_job_executor(init_context: InitExecutorContext) -> Executor:
    """
    Executor which launches steps as Kubernetes Jobs.

    To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py
       :start-after: start_marker
       :end-before: end_marker
       :language: python

    Then you can configure the executor with run config as follows:

    .. code-block:: YAML

        execution:
          config:
            job_namespace: 'some-namespace'
            image_pull_policy: ...
            image_pull_secrets: ...
            service_account_name: ...
            env_config_maps: ...
            env_secrets: ...
            env_vars: ...
            job_image: ... # leave out if using userDeployments

    Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be
    set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.
    """

    run_launcher = init_context.instance.run_launcher
    if not isinstance(run_launcher, K8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a K8sRunLauncher; configure the "
            "K8sRunLauncher on your instance to use it.",
        )

    exc_cfg = init_context.executor_config

    k8s_container_context = K8sContainerContext(
        image_pull_policy=exc_cfg.get("image_pull_policy"),
        image_pull_secrets=exc_cfg.get("image_pull_secrets"),
        service_account_name=exc_cfg.get("service_account_name"),
        env_config_maps=exc_cfg.get("env_config_maps"),
        env_secrets=exc_cfg.get("env_secrets"),
        env_vars=exc_cfg.get("env_vars"),
        volume_mounts=exc_cfg.get("volume_mounts"),
        volumes=exc_cfg.get("volumes"),
        labels=exc_cfg.get("labels"),
        namespace=exc_cfg.get("job_namespace"),
    )

    return StepDelegatingExecutor(
        K8sStepHandler(
            image=exc_cfg.get("job_image"),
            container_context=k8s_container_context,
            load_incluster_config=run_launcher.load_incluster_config,
            kubeconfig_file=run_launcher.kubeconfig_file,
        ),
        retries=RetryMode.from_config(init_context.executor_config["retries"]),
        should_verify_step=True,
    )
Example #16
def k8s_job_executor(init_context: InitExecutorContext) -> Executor:
    """
    Executor which launches steps as Kubernetes Jobs. This executor is experimental.

    To add the Kubernetes Job executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py
       :start-after: start_marker
       :end-before: end_marker
       :language: python

    Then you can configure the executor with run config (either via a :py:class:`~dagster.PresetDefinition` or the Dagit playground) as follows:

    .. code-block:: YAML

        execution:
          k8s:
            config:
              job_namespace: 'some-namespace'
              image_pull_policy: ...
              image_pull_secrets: ...
              service_account_name: ...
              env_config_maps: ...
              env_secrets: ...
              job_image: ... # leave out if using userDeployments
    """

    run_launcher = init_context.instance.run_launcher
    if not isinstance(run_launcher, K8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a K8sRunLauncher; configure the "
            "K8sRunLauncher on your instance to use it."
        )

    exc_cfg = init_context.executor_config
    job_config = DagsterK8sJobConfig(
        dagster_home=run_launcher.dagster_home,
        instance_config_map=run_launcher.instance_config_map,
        postgres_password_secret=run_launcher.postgres_password_secret,
        job_image=exc_cfg.get("job_image"),
        image_pull_policy=(exc_cfg.get("image_pull_policy")
                           if exc_cfg.get("image_pull_policy") is not None else
                           run_launcher.image_pull_policy),
        image_pull_secrets=(exc_cfg.get("image_pull_secrets")
                            if exc_cfg.get("image_pull_secrets") is not None else
                            run_launcher.image_pull_secrets),
        service_account_name=(exc_cfg.get("service_account_name")
                              if exc_cfg.get("service_account_name") is not None
                              else run_launcher.service_account_name),
        env_config_maps=(exc_cfg.get("env_config_maps")
                         if exc_cfg.get("env_config_maps") is not None else
                         run_launcher.env_config_maps),
        env_secrets=(exc_cfg.get("env_secrets")
                     if exc_cfg.get("env_secrets") is not None else
                     run_launcher.env_secrets),
    )

    return StepDelegatingExecutor(
        K8sStepHandler(
            job_config=job_config,
            job_namespace=(exc_cfg.get("job_namespace")
                           if exc_cfg.get("job_namespace") is not None else
                           run_launcher.job_namespace),
            load_incluster_config=run_launcher.load_incluster_config,
            kubeconfig_file=run_launcher.kubeconfig_file,
        ))