def __init__(
    self,
    retries: RetryMode,
    max_concurrent: int,
    start_method: Optional[str] = None,
    explicit_forkserver_preload: Optional[List[str]] = None,
):
    """Validate and store the settings of a multi-process executor.

    Args:
        retries: Retry mode, passed through ``check.inst_param`` against ``RetryMode``.
        max_concurrent: Concurrency limit. A falsy value (e.g. ``0`` or ``None``)
            is replaced by ``multiprocessing.cpu_count()``.
        start_method: Name of the multiprocessing start method; ``"spawn"`` is
            used when it is ``None``.
        explicit_forkserver_preload: Stored on the instance exactly as given.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the start method is not listed
            by ``multiprocessing.get_all_start_methods()``.
    """
    self._retries = check.inst_param(retries, "retries", RetryMode)

    # Falsy concurrency falls back to the number of CPUs.
    max_concurrent = max_concurrent or multiprocessing.cpu_count()
    self._max_concurrent = check.int_param(max_concurrent, "max_concurrent")

    start_method = check.opt_str_param(start_method, "start_method")
    if start_method is None:
        start_method = "spawn"

    valid_starts = multiprocessing.get_all_start_methods()
    if start_method not in valid_starts:
        raise DagsterUnmetExecutorRequirementsError(
            f"The selected start_method '{start_method}' is not available. "
            f"Only {valid_starts} are valid options on {sys.platform} python {sys.version}.",
        )

    self._start_method = start_method
    self._explicit_forkserver_preload = explicit_forkserver_preload
def materialize(
    self, selection: Optional[Union[str, List[str]]] = None
) -> ExecuteInProcessResult:
    """
    Executes an in-process run that materializes all assets in the group.

    The execution proceeds serially, in a single thread. Only supported by AssetGroups that
    have no executor_def or that use the in-process executor.

    Args:
        selection (Union[str, List[str]]): A single selection query or list of selection
            queries for assets in the group. For example:

            - ``['some_asset_key']`` select ``some_asset_key`` itself.
            - ``['*some_asset_key']`` select ``some_asset_key`` and all its ancestors
              (upstream dependencies).
            - ``['*some_asset_key+++']`` select ``some_asset_key``, all its ancestors,
              and its descendants (downstream dependencies) within 3 levels down.
            - ``['*some_asset_key', 'other_asset_key_a', 'other_asset_key_b+']`` select
              ``some_asset_key`` and all its ancestors, ``other_asset_key_a`` itself, and
              ``other_asset_key_b`` and its direct child asset keys. When subselecting
              into a multi-asset, all of the asset keys in that multi-asset must be
              selected.

    Returns:
        ExecuteInProcessResult: The result of the execution.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the group has an executor other than
            ``in_process_executor``.
    """
    has_executor = bool(self.executor_def)
    if has_executor and self.executor_def is not in_process_executor:
        raise DagsterUnmetExecutorRequirementsError(
            "'materialize' can only be invoked on AssetGroups which have no executor or have "
            "the in_process_executor, but the AssetGroup had executor "
            f"'{self.executor_def.name}'"
        )

    materialization_job = self.build_job(
        name="in_process_materialization_job",
        selection=selection,
    )
    return materialization_job.execute_in_process()
def check_non_ephemeral_instance(instance):
    """Reject ephemeral instances for multi-process execution.

    Args:
        instance: Object exposing an ``is_ephemeral`` attribute (a DagsterInstance,
            according to the error message).

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``instance.is_ephemeral`` is truthy.
    """
    if not instance.is_ephemeral:
        return

    # Message previously misspelled "non-ephemeral" as "non-ephermal".
    raise DagsterUnmetExecutorRequirementsError(
        'You have attempted to use a multi process executor with an ephemeral DagsterInstance. '
        'A non-ephemeral instance is needed to coordinate execution between multiple processes. '
        'You can configure your default instance via $DAGSTER_HOME or ensure a valid one is '
        'passed when invoking the python APIs.'
    )
def _check_non_ephemeral_instance(instance):
    """Raise when a multi-process executor is paired with an ephemeral instance.

    Args:
        instance: Object exposing an ``is_ephemeral`` attribute.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``instance.is_ephemeral`` is truthy.
    """
    if not instance.is_ephemeral:
        return

    message = (
        "You have attempted to use an executor that uses multiple processes with an "
        "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "
        "execution between multiple processes. You can configure your default instance "
        "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs."
    )
    raise DagsterUnmetExecutorRequirementsError(message)
def check_persistent_storage_requirement(system_storage_def):
    """Require persistent system storage for multi-process execution.

    Args:
        system_storage_def: Object exposing ``is_persistent`` and ``name`` attributes.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``system_storage_def.is_persistent``
            is falsy.
    """
    if system_storage_def.is_persistent:
        return

    # Message previously read "attempted use"; the missing "to" is restored.
    raise DagsterUnmetExecutorRequirementsError(
        (
            'You have attempted to use a multi process executor while using system '
            'storage {storage_name} which does not persist intermediates. '
            'This means there would be no way to move data between different '
            'processes. Please configure your pipeline in the storage config '
            'section to use persistent system storage such as the filesystem.'
        ).format(storage_name=system_storage_def.name)
    )
def _check_intra_process_pipeline(pipeline):
    """Require that ``pipeline`` is a ``ReconstructablePipeline``.

    Args:
        pipeline: The pipeline object to inspect; ``get_definition().name`` is
            read only to build the error message.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``pipeline`` is not an instance
            of ``ReconstructablePipeline``.
    """
    if isinstance(pipeline, ReconstructablePipeline):
        return

    message_template = (
        'You have attempted to use an executor that uses multiple processes with the pipeline "{name}" '
        'that is not reconstructable. Pipelines must be loaded in a way that allows dagster to reconstruct '
        'them in a new process. This means: \n'
        '  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n'
        '  * loading the pipeline through the reconstructable() function\n'
    )
    raise DagsterUnmetExecutorRequirementsError(
        message_template.format(name=pipeline.get_definition().name)
    )
def _check_non_ephemeral_instance(instance):
    """Raise when a multi-process executor is paired with an ephemeral instance.

    The error message points the user at the DagsterInstance documentation.

    Args:
        instance: Object exposing an ``is_ephemeral`` attribute.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``instance.is_ephemeral`` is truthy.
    """
    if not instance.is_ephemeral:
        return

    message = (
        "You have attempted to use an executor that uses multiple processes with an "
        "ephemeral DagsterInstance. A non-ephemeral instance is needed to coordinate "
        "execution between multiple processes. You can configure your default instance "
        "via $DAGSTER_HOME or ensure a valid one is passed when invoking the python APIs. "
        "You can learn more about setting up a persistent DagsterInstance from the "
        "DagsterInstance docs here: https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"
    )
    raise DagsterUnmetExecutorRequirementsError(message)
def _check_pipeline_has_target_handle(pipeline_def):
    """Require that an ``ExecutionTargetHandle`` can be found for ``pipeline_def``.

    Args:
        pipeline_def: Pipeline definition; its ``name`` is used in the error message.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``ExecutionTargetHandle.get_handle``
            yields a falsy handle for ``pipeline_def``.
    """
    # Imported inside the function, as in the original code.
    from dagster.core.definitions.handle import ExecutionTargetHandle

    target_handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    if target_handle:
        return

    message_template = (
        'You have attempted to use an executor that uses multiple processes with the pipeline "{name}" '
        'that can not be re-hydrated. Pipelines must be loaded in a way that allows dagster to reconstruct '
        'them in a new process. This means: \n'
        '  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n'
        '  * constructing an ExecutionTargetHandle directly\n'
    )
    raise DagsterUnmetExecutorRequirementsError(
        message_template.format(name=pipeline_def.name)
    )
def _check_persistent_storage_requirement(intermediate_storage_def, system_storage_def):
    """Require persistent storage for multi-process execution.

    When ``intermediate_storage_def`` is truthy it alone is checked; otherwise
    ``system_storage_def`` is checked.

    Args:
        intermediate_storage_def: Optional object exposing ``is_persistent`` and ``name``.
        system_storage_def: Object exposing ``is_persistent`` and ``name``; consulted
            only when no intermediate storage definition is given.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the storage definition that was
            checked is not persistent.
    """
    if intermediate_storage_def:
        if intermediate_storage_def.is_persistent:
            return
        raise DagsterUnmetExecutorRequirementsError(
            (
                "You have attempted to use an executor that uses multiple processes while using "
                "intermediate storage {storage_name} which does not persist intermediates. "
                "This means there would be no way to move data between different "
                "processes. Please configure your pipeline in the storage config "
                "section to use persistent system storage such as the filesystem."
            ).format(storage_name=intermediate_storage_def.name)
        )

    # No intermediate storage configured: fall back to the system storage check.
    if system_storage_def.is_persistent:
        return
    raise DagsterUnmetExecutorRequirementsError(
        (
            "You have attempted to use an executor that uses multiple processes while using system "
            "storage {storage_name} which does not persist intermediates. "
            "This means there would be no way to move data between different "
            "processes. Please configure your pipeline in the storage config "
            "section to use persistent system storage such as the filesystem."
        ).format(storage_name=system_storage_def.name)
    )
def _check_intra_process_pipeline(pipeline):
    """Require that ``pipeline`` is a ``ReconstructablePipeline``.

    The error message says "job" or "pipeline" depending on whether the
    underlying definition is a ``JobDefinition``.

    Args:
        pipeline: The pipeline object to inspect.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``pipeline`` is not an instance
            of ``ReconstructablePipeline``.
    """
    # Imported inside the function, as in the original code.
    from dagster.core.definitions import JobDefinition

    if isinstance(pipeline, ReconstructablePipeline):
        return

    definition = pipeline.get_definition()
    target = "job" if isinstance(definition, JobDefinition) else "pipeline"
    # The sentence is plural ("... must be loaded ... reconstruct them"), so the
    # capitalized noun is pluralized: "Jobs" / "Pipelines".
    target_cap = target.capitalize() + "s"
    raise DagsterUnmetExecutorRequirementsError(
        'You have attempted to use an executor that uses multiple processes with the {target} "{name}" '
        "that is not reconstructable. {target_cap} must be loaded in a way that allows dagster to reconstruct "
        "them in a new process. This means: \n"
        "  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n"
        "  * loading the {target} through the reconstructable() function\n".format(
            target=target, name=definition.name, target_cap=target_cap
        )
    )
def _check_persistent_storage_requirement(pipeline_def, mode_def, intermediate_storage_def):
    """Require that outputs are stored where other processes can read them.

    We prefer to store outputs with IO managers, but will fall back to intermediate
    storage if an IO manager isn't set.

    Args:
        pipeline_def: Pipeline definition passed to ``_all_outputs_non_mem_io_managers``.
        mode_def: Mode definition passed to ``_all_outputs_non_mem_io_managers``.
        intermediate_storage_def: Optional object exposing ``is_persistent``.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``_all_outputs_non_mem_io_managers``
            returns a falsy value and there is no persistent intermediate storage.
    """
    if _all_outputs_non_mem_io_managers(pipeline_def, mode_def):
        return
    if intermediate_storage_def and intermediate_storage_def.is_persistent:
        return

    # A space was missing after "can", which rendered as "canretrieve".
    raise DagsterUnmetExecutorRequirementsError(
        "You have attempted to use an executor that uses multiple processes, but your pipeline "
        "includes solid outputs that will not be stored somewhere where other processes can "
        "retrieve them. "
        "Please make sure that your pipeline definition includes a ModeDefinition whose "
        'resource_keys assign the "io_manager" key to an IOManager resource '
        "that stores outputs outside of the process, such as the fs_io_manager."
    )
def _check_persistent_storage_requirement(
    pipeline: IPipeline,
    mode_def: ModeDefinition,
    environment_config: EnvironmentConfig,
) -> None:
    """Enforce persistent outputs when the configured executor requires them.

    Returns without checking anything when ``ExecutorRequirement.PERSISTENT_OUTPUTS``
    is not among the requirements of the executor selected by the config.

    Args:
        pipeline: Pipeline whose definition is checked.
        mode_def: Mode used to resolve the executor and intermediate storage.
        environment_config: Config used to resolve the executor and intermediate storage.

    Raises:
        DagsterUnmetExecutorRequirementsError: If ``can_isolate_steps`` returns a falsy
            value and there is no persistent intermediate storage.
    """
    # Imported inside the function, as in the original code.
    from dagster.core.execution.context_creation_pipeline import executor_def_from_config

    pipeline_def = pipeline.get_definition()
    executor_def = executor_def_from_config(mode_def, environment_config)
    needs_persistent_outputs = (
        ExecutorRequirement.PERSISTENT_OUTPUTS in executor_def.requirements
    )
    if not needs_persistent_outputs:
        return

    intermediate_storage_def = environment_config.intermediate_storage_def_for_mode(mode_def)

    if can_isolate_steps(pipeline_def, mode_def):
        return
    if intermediate_storage_def and intermediate_storage_def.is_persistent:
        return

    raise DagsterUnmetExecutorRequirementsError(
        "You have attempted to use an executor that uses multiple processes, but your pipeline "
        "includes solid outputs that will not be stored somewhere where other processes can "
        "retrieve them. Please use a persistent IO manager for these outputs. E.g. with\n"
        '    @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})])'
    )
def celery_k8s_job_executor(init_context):
    """Celery-based executor which launches tasks as Kubernetes Jobs.

    The Celery executor exposes config settings for the underlying Celery app under
    the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced
    in Celery version 4.0 and the object constructed from config will be passed to the
    :py:class:`celery.Celery` constructor as its ``config_source`` argument.
    (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)

    The executor also exposes the ``broker``, ``backend``, and ``include`` arguments to the
    :py:class:`celery.Celery` constructor.

    In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use
    Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently
    modified, but that when solid executions are especially fast or slow, or when there are
    different requirements around idempotence or retry, it may make sense to execute pipelines
    with variations on these settings.

    If you'd like to configure a Celery Kubernetes Job executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py
       :language: python

    Then you can configure the executor as follows:

    .. code-block:: YAML

        execution:
          celery-k8s:
            config:
              job_image: 'my_repo.com/image_name:latest'
              job_namespace: 'some-namespace'
              broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker
              backend: 'rpc://' # Optional[str]: The URL of the Celery results backend
              include: ['my_module'] # Optional[List[str]]: Modules every worker should import
              config_source: # Dict[str, Any]: Any additional parameters to pass to the
                  #...       # Celery workers. This dict will be passed as the `config_source`
                  #...       # argument of celery.Celery().

    Note that the YAML you provide here must align with the configuration with which the Celery
    workers on which you hope to run were started. If, for example, you point the executor at a
    different broker than the one your workers are listening to, the workers will never be able to
    pick up tasks for execution.

    In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery
    commands must be invoked with the `-A dagster_celery_k8s.app` argument.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the instance's run launcher is not a
            ``CeleryK8sRunLauncher``.
    """
    launcher = init_context.instance.run_launcher
    executor_config = init_context.executor_config

    if not isinstance(launcher, CeleryK8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a CeleryK8sRunLauncher; configure the "
            "CeleryK8sRunLauncher on your instance to use it.",
        )

    job_config = DagsterK8sJobConfig(
        dagster_home=launcher.dagster_home,
        instance_config_map=launcher.instance_config_map,
        postgres_password_secret=launcher.postgres_password_secret,
        job_image=executor_config.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"),
        image_pull_policy=executor_config.get("image_pull_policy"),
        image_pull_secrets=executor_config.get("image_pull_secrets"),
        service_account_name=executor_config.get("service_account_name"),
        env_config_maps=executor_config.get("env_config_maps"),
        env_secrets=executor_config.get("env_secrets"),
    )

    def _launcher_or_config(launcher_value, config_key):
        # A truthy value on the run launcher wins; executor config is only the fallback.
        return launcher_value or executor_config.get(config_key)

    return CeleryK8sJobExecutor(
        broker=_launcher_or_config(launcher.broker, "broker"),
        backend=_launcher_or_config(launcher.backend, "backend"),
        config_source=_launcher_or_config(launcher.config_source, "config_source"),
        include=_launcher_or_config(launcher.include, "include"),
        retries=launcher.retries or RetryMode.from_config(executor_config.get("retries")),
        job_config=job_config,
        job_namespace=executor_config.get("job_namespace"),
        load_incluster_config=executor_config.get("load_incluster_config"),
        kubeconfig_file=executor_config.get("kubeconfig_file"),
        repo_location_name=executor_config.get("repo_location_name"),
    )
def k8s_job_executor(init_context: InitExecutorContext) -> Executor:
    """
    Executor which launches steps as Kubernetes Jobs.

    To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py
       :start-after: start_marker
       :end-before: end_marker
       :language: python

    Then you can configure the executor with run config as follows:

    .. code-block:: YAML

        execution:
          config:
            job_namespace: 'some-namespace'
            image_pull_policy: ...
            image_pull_secrets: ...
            service_account_name: ...
            env_config_maps: ...
            env_secrets: ...
            job_image: ... # leave out if using userDeployments

    Scalar settings (``image_pull_policy``, ``service_account_name``, ``job_namespace``) come
    from the executor config when set there, otherwise from the run launcher. List settings
    (``image_pull_secrets``, ``env_config_maps``, ``env_secrets``, ``volume_mounts``,
    ``volumes``) are the run launcher's entries followed by the executor config's entries.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the instance's run launcher is not a
            ``K8sRunLauncher``.
    """
    run_launcher = init_context.instance.run_launcher
    if not isinstance(run_launcher, K8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a K8sRunLauncher; configure the "
            "K8sRunLauncher on your instance to use it.",
        )

    exc_cfg = init_context.executor_config

    def _config_or_launcher(name):
        # Executor config wins whenever the key is set to anything other than None;
        # the config key and the launcher attribute share the same name.
        configured = exc_cfg.get(name)
        return configured if configured is not None else getattr(run_launcher, name)

    def _launcher_plus_config(name):
        # Launcher entries first, then any entries from the executor config.
        return getattr(run_launcher, name) + (exc_cfg.get(name) or [])

    job_config = DagsterK8sJobConfig(
        dagster_home=run_launcher.dagster_home,
        instance_config_map=run_launcher.instance_config_map,
        postgres_password_secret=run_launcher.postgres_password_secret,
        job_image=exc_cfg.get("job_image"),
        image_pull_policy=_config_or_launcher("image_pull_policy"),
        image_pull_secrets=_launcher_plus_config("image_pull_secrets"),
        service_account_name=_config_or_launcher("service_account_name"),
        env_config_maps=_launcher_plus_config("env_config_maps"),
        env_secrets=_launcher_plus_config("env_secrets"),
        volume_mounts=_launcher_plus_config("volume_mounts"),
        volumes=_launcher_plus_config("volumes"),
    )

    return StepDelegatingExecutor(
        K8sStepHandler(
            job_config=job_config,
            job_namespace=_config_or_launcher("job_namespace"),
            load_incluster_config=run_launcher.load_incluster_config,
            kubeconfig_file=run_launcher.kubeconfig_file,
        ),
        retries=RetryMode.from_config(exc_cfg["retries"]),
        should_verify_step=True,
    )
def k8s_job_executor(init_context: InitExecutorContext) -> Executor:
    """
    Executor which launches steps as Kubernetes Jobs.

    To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py
       :start-after: start_marker
       :end-before: end_marker
       :language: python

    Then you can configure the executor with run config as follows:

    .. code-block:: YAML

        execution:
          config:
            job_namespace: 'some-namespace'
            image_pull_policy: ...
            image_pull_secrets: ...
            service_account_name: ...
            env_config_maps: ...
            env_secrets: ...
            env_vars: ...
            job_image: ... # leave out if using userDeployments

    Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be
    set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the instance's run launcher is not a
            ``K8sRunLauncher``.
    """
    launcher = init_context.instance.run_launcher
    if not isinstance(launcher, K8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a K8sRunLauncher; configure the "
            "K8sRunLauncher on your instance to use it.",
        )

    executor_config = init_context.executor_config

    # Keys copied from the executor config under the same keyword name.
    passthrough_keys = (
        "image_pull_policy",
        "image_pull_secrets",
        "service_account_name",
        "env_config_maps",
        "env_secrets",
        "env_vars",
        "volume_mounts",
        "volumes",
        "labels",
    )
    container_context_kwargs = {key: executor_config.get(key) for key in passthrough_keys}
    # The config key "job_namespace" maps onto the ``namespace`` keyword.
    container_context = K8sContainerContext(
        namespace=executor_config.get("job_namespace"),
        **container_context_kwargs,
    )

    step_handler = K8sStepHandler(
        image=executor_config.get("job_image"),
        container_context=container_context,
        load_incluster_config=launcher.load_incluster_config,
        kubeconfig_file=launcher.kubeconfig_file,
    )
    retry_mode = RetryMode.from_config(init_context.executor_config["retries"])

    return StepDelegatingExecutor(
        step_handler,
        retries=retry_mode,
        should_verify_step=True,
    )
def k8s_job_executor(init_context: InitExecutorContext) -> Executor:
    """
    Executor which launches steps as Kubernetes Jobs. This executor is experimental.

    To add the Kubernetes Job executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py
       :start-after: start_marker
       :end-before: end_marker
       :language: python

    Then you can configure the executor with run config (either via a
    :py:class:`~dagster.PresetDefinition` or the Dagit playground) as follows:

    .. code-block:: YAML

        execution:
          k8s:
            config:
              job_namespace: 'some-namespace'
              image_pull_policy: ...
              image_pull_secrets: ...
              service_account_name: ...
              env_config_maps: ...
              env_secrets: ...
              job_image: ... # leave out if using userDeployments

    Each of ``image_pull_policy``, ``image_pull_secrets``, ``service_account_name``,
    ``env_config_maps``, ``env_secrets`` and ``job_namespace`` comes from the executor config
    when set there (not ``None``), otherwise from the run launcher.

    Raises:
        DagsterUnmetExecutorRequirementsError: If the instance's run launcher is not a
            ``K8sRunLauncher``.
    """
    run_launcher = init_context.instance.run_launcher
    if not isinstance(run_launcher, K8sRunLauncher):
        raise DagsterUnmetExecutorRequirementsError(
            "This engine is only compatible with a K8sRunLauncher; configure the "
            "K8sRunLauncher on your instance to use it.",
        )

    exc_cfg = init_context.executor_config

    def _config_or_launcher(name):
        # Executor config wins whenever the key is set to anything other than None;
        # the config key and the launcher attribute share the same name.
        configured = exc_cfg.get(name)
        return configured if configured is not None else getattr(run_launcher, name)

    job_config = DagsterK8sJobConfig(
        dagster_home=run_launcher.dagster_home,
        instance_config_map=run_launcher.instance_config_map,
        postgres_password_secret=run_launcher.postgres_password_secret,
        job_image=exc_cfg.get("job_image"),
        image_pull_policy=_config_or_launcher("image_pull_policy"),
        image_pull_secrets=_config_or_launcher("image_pull_secrets"),
        service_account_name=_config_or_launcher("service_account_name"),
        env_config_maps=_config_or_launcher("env_config_maps"),
        env_secrets=_config_or_launcher("env_secrets"),
    )

    return StepDelegatingExecutor(
        K8sStepHandler(
            job_config=job_config,
            job_namespace=_config_or_launcher("job_namespace"),
            load_incluster_config=run_launcher.load_incluster_config,
            kubeconfig_file=run_launcher.kubeconfig_file,
        )
    )