示例#1
0
文件: types.py 项目: stonebig/dagster
 def value_from_storage_dict(storage_dict, klass):
     """Rebuild a ``DaemonHeartbeat`` from its stored dict form.

     Two older layouts are tolerated: ``daemon_type`` held as a ``DaemonType``
     enum member instead of its value, and a single ``error`` entry instead of
     the newer ``errors`` entry.

     Args:
         storage_dict: Mapping holding the persisted heartbeat fields.
         klass: Not used by this function.

     Returns:
         The reconstructed ``DaemonHeartbeat``.
     """
     # daemon_type used to be an enum (e.g. DaemonType.SCHEDULER); unwrap it.
     stored_type = storage_dict.get("daemon_type")
     if isinstance(stored_type, DaemonType):
         stored_type = stored_type.value

     # "error" was replaced with "errors": wrap a legacy single error in a list.
     legacy_error = storage_dict.get("error")
     if legacy_error:
         unpacked_errors = [unpack_value(legacy_error)]
     else:
         unpacked_errors = unpack_value(storage_dict.get("errors"))

     return DaemonHeartbeat(
         timestamp=storage_dict.get("timestamp"),
         daemon_type=stored_type,
         daemon_id=storage_dict.get("daemon_id"),
         errors=unpacked_errors,
     )
示例#2
0
    def _execute_plan(self, execute_step_args_packed, executable_dict):
        """Execute the requested steps in this worker and return serialized events.

        Args:
            execute_step_args_packed: Packed dict that must unpack to an
                ``ExecuteStepArgs``.
            executable_dict: Dict handed to ``ReconstructablePipeline.from_dict``.

        Returns:
            A list holding the serialized engine event followed by the
            serialized form of every event yielded by ``execute_plan_iterator``.
        """
        packed_args = check.dict_param(
            execute_step_args_packed, "execute_step_args_packed")
        step_args = unpack_value(packed_args)
        check.inst_param(step_args, "execute_step_args", ExecuteStepArgs)
        check.dict_param(executable_dict, "executable_dict")

        instance = DagsterInstance.from_ref(step_args.instance_ref)
        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        retries = Retries.from_config(step_args.retries_dict)

        run_id = step_args.pipeline_run_id
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_keys = step_args.step_keys_to_execute
        joined_step_keys = ", ".join(step_keys)

        execution_plan = create_execution_plan(
            pipeline,
            pipeline_run.run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=step_keys,
        )

        # Record which worker picked the steps up before running anything.
        worker_metadata = [
            EventMetadataEntry.text(joined_step_keys, "step_keys"),
            EventMetadataEntry.text(self.request.hostname, "Celery worker"),
        ]
        engine_event = instance.report_engine_event(
            "Executing steps {} in celery worker".format(joined_step_keys),
            pipeline_run,
            EngineEventData(worker_metadata, marker_end=DELEGATE_MARKER),
            CeleryExecutor,
            step_key=execution_plan.step_key_for_single_step_plans(),
        )

        # Drain the plan iterator fully before serializing anything.
        events = [engine_event]
        events.extend(
            execute_plan_iterator(
                execution_plan,
                pipeline_run=pipeline_run,
                run_config=pipeline_run.run_config,
                instance=instance,
                retries=retries,
            ))

        return [serialize_dagster_namedtuple(event) for event in events]
示例#3
0
    def from_dict(val):
        """Unpack ``val`` and return the result, requiring a ``ReconstructablePipeline``.

        Args:
            val: Dict passed to ``unpack_value``.

        Returns:
            The unpacked object. The invariant check fails when that object is
            not a ``ReconstructablePipeline``.
        """
        check.dict_param(val, "val")

        unpacked = unpack_value(val)
        mismatch_message = (
            "Deserialized object is not instance of ReconstructablePipeline, got {type}"
        ).format(type=type(unpacked))
        check.invariant(isinstance(unpacked, ReconstructablePipeline),
                        mismatch_message)
        return unpacked
示例#4
0
    def from_dict(val):
        '''Unpack ``val`` and return the result, requiring an ``InterProcessExecutablePipeline``.

        Args:
            val: Dict passed to ``unpack_value``.

        Returns:
            The unpacked object. The invariant check fails when that object is
            not an ``InterProcessExecutablePipeline``.
        '''
        check.dict_param(val, 'val')

        unpacked = unpack_value(val)
        mismatch_message = (
            'Deserialized object is not instance of InterProcessExecutablePipeline, got {type}'
        ).format(type=type(unpacked))
        check.invariant(isinstance(unpacked, InterProcessExecutablePipeline),
                        mismatch_message)
        return unpacked
示例#5
0
    def _execute_step_k8s_job(
        self,
        execute_step_args_packed,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod.

        Builds a Kubernetes Job for the single requested step, creates it,
        waits on it, then gathers Dagster events from the logs of the job's
        pods. Progress and errors are reported as engine events on the run.

        Args:
            execute_step_args_packed: Packed dict that must unpack to an
                ``ExecuteStepArgs`` carrying exactly one step key.
            job_config_dict: Dict handed to ``DagsterK8sJobConfig.from_dict``.
            job_namespace: Namespace (str) used for job creation, waiting, pod
                lookup, log retrieval and deletion.
            load_incluster_config: When true, load the in-cluster Kubernetes
                config; otherwise load kube config from ``kubeconfig_file``.
            user_defined_k8s_config_dict: Handed to
                ``UserDefinedDagsterK8sConfig.from_dict``.
            kubeconfig_file: Optional str passed to ``load_kube_config``.

        Returns:
            A list of serialized events. An empty list is returned when the
            run is not in STARTED status, when job creation fails with a
            non-"Conflict" API error, when waiting raises one of the handled
            status/API errors, or when looking up the job's pods fails.
        """
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)
        check.invariant(
            len(execute_step_args.step_keys_to_execute) == 1,
            "Celery K8s task executor can only execute 1 step at a time",
        )

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict)
        check.opt_inst_param(
            user_defined_k8s_config,
            "user_defined_k8s_config",
            UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        # Resolve the instance and the run this step belongs to.
        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)

        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        # Exactly one step key is present (validated above).
        step_key = execute_step_args.step_keys_to_execute[0]

        celery_worker_name = self.request.hostname
        # None when the HOSTNAME environment variable is not set.
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(
                step_key=step_key),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(celery_worker_name,
                                        "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name,
                                        "Celery worker Kubernetes Pod name"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        # Do not launch anything unless the loaded run is in STARTED status.
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id,
                                        step_key)

        retry_state = execute_step_args.known_state.get_retry_state()

        # A nonzero attempt count is appended as a suffix to both names.
        if retry_state.get_attempt_count(step_key):
            attempt_number = retry_state.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

        # The serialized step args become the last CLI argument of the job.
        input_json = serialize_dagster_namedtuple(execute_step_args)
        args = ["dagster", "api", "execute_step", input_json]

        job = construct_dagster_k8s_job(job_config, args, job_name,
                                        user_defined_k8s_config, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        # From here on, use the name recorded on the constructed job object.
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            "Image pull policy"),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            "Image pull secrets"),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        "Service account name"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)
        # Create the job. A "Conflict" is tolerated and falls through to the
        # wait below; any other API error is reported and ends the task.
        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(
                body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so proceed and see if the existing job succeeded
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, proceeding with existing job.".format(
                        job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                            EventMetadataEntry.text(job_name,
                                                    "Kubernetes Job name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
                return []

        # Wait on the job; each handler below maps one failure mode to an outcome.
        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=execute_step_args.pipeline_run_id,
            )
        except (DagsterK8sError, DagsterK8sTimeoutError) as err:
            # Recorded as a step failure event; execution continues on to log
            # retrieval below rather than returning.
            step_failure_event = construct_step_failure_event_and_handle(
                pipeline_run, step_key, err, instance=instance)
            events.append(step_failure_event)
        except DagsterK8sPipelineStatusException:
            # Report, delete the job, and return no events.
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace,
                                            "Kubernetes Job namespace"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return []
        except (
                DagsterK8sUnrecoverableAPIError,
                DagsterK8sAPIRetryLimitExceeded,
                # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
                # a retry boundary. We still catch it here just in case we missed one so that we can
                # report it to the event log
                kubernetes.client.rest.ApiException,
        ) as err:
            instance.report_engine_event(
                "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Look up the job's pods; an API error here is reported and ends the task.
        try:
            pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        # Collect log lines pod by pod; a failure on one pod is reported and
        # the loop moves on to the next pod.
        logs = []
        for pod_name in pod_names:
            try:
                raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
                logs += raw_logs.split("\n")
            except kubernetes.client.rest.ApiException as e:
                instance.report_engine_event(
                    "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                    "Pod name {} for step {}. Will attempt to continue with other pods."
                    .format(job_name, pod_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )

        # Add the events filtered out of the collected log lines, then serialize all.
        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
示例#6
0
文件: executor.py 项目: sd2k/dagster
    def _execute_step_k8s_job(
        self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        pipeline_origin_packed,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod.

        Builds a Kubernetes Job for the single requested step, creates it,
        waits on it, then gathers Dagster events from the logs of the job's
        pods. Progress and errors are reported as engine events on the run.

        Args:
            instance_ref_dict: Dict handed to ``InstanceRef.from_dict``.
            step_keys: List of str holding exactly one step key.
            run_config: Run config dict forwarded in the step arguments.
            mode: Mode name (str) forwarded in the step arguments.
            repo_name: Type-checked as str; not otherwise used here.
            repo_location_name: Type-checked as str; not otherwise used here.
            run_id: ID (str) of the pipeline run to load.
            job_config_dict: Dict handed to ``DagsterK8sJobConfig.from_dict``.
            job_namespace: Namespace (str) used for job creation, waiting, pod
                lookup, log retrieval and deletion.
            load_incluster_config: When true, load the in-cluster Kubernetes
                config; otherwise load kube config from ``kubeconfig_file``.
            retries_dict: Dict handed to ``Retries.from_config`` and forwarded
                in the step arguments.
            pipeline_origin_packed: Packed dict that must unpack to a
                ``PipelineOrigin``.
            user_defined_k8s_config_dict: Handed to
                ``UserDefinedDagsterK8sConfig.from_dict``.
            kubeconfig_file: Optional str passed to ``load_kube_config``.

        Returns:
            A list of serialized events on the normal path. ``None`` when the
            run is not in STARTED status, when job creation raises an
            ``ApiException`` (including "Conflict"), or when waiting raises
            ``DagsterK8sPipelineStatusException``.
        """

        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.invariant(
            len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
        )
        check.dict_param(run_config, "run_config")
        check.str_param(mode, "mode")
        check.str_param(repo_name, "repo_name")
        check.str_param(repo_location_name, "repo_location_name")
        check.str_param(run_id, "run_id")

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")
        check.dict_param(retries_dict, "retries_dict")

        pipeline_origin = unpack_value(
            check.dict_param(
                pipeline_origin_packed, "pipeline_origin_packed"
            )  # TODO: make part of args
        )
        check.inst(pipeline_origin, PipelineOrigin)

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict
        )
        check.opt_inst_param(
            user_defined_k8s_config, "user_defined_k8s_config", UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        # Resolve the instance and the run this step belongs to.
        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)

        check.invariant(pipeline_run, "Could not load run {}".format(run_id))
        # Exactly one step key is present (validated above).
        step_key = step_keys[0]

        celery_worker_name = self.request.hostname
        # None when the HOSTNAME environment variable is not set.
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(step_key=step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                    EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        # Do not launch anything unless the loaded run is in STARTED status.
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([EventMetadataEntry.text(step_key, "Step keys"),]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(run_id, step_key)

        retries = Retries.from_config(retries_dict)

        # A nonzero attempt count is appended as a suffix to both names.
        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

        # The serialized step args become the last CLI argument of the job.
        # instance_ref is deliberately passed as None here.
        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=None,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            )
        )
        command = ["dagster"]
        args = ["api", "execute_step_with_structured_logs", input_json]

        job = construct_dagster_k8s_job(
            job_config, command, args, job_name, user_defined_k8s_config, pod_name
        )

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        # From here on, use the name recorded on the constructed job object.
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                    EventMetadataEntry.text(
                        str(job_config.image_pull_secrets), "Image pull secrets"
                    ),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name), "Service account name"
                    ),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        # Create the job. Any ApiException is reported as an engine event and
        # ends the task with no events returned.
        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so do not proceed.
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step keys"),
                            EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                            EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step keys"),
                            # Text metadata entries take a string, so stringify
                            # the ApiException instead of passing the exception
                            # object itself (which would make this error
                            # report fail and hide the original error).
                            EventMetadataEntry.text(str(e), "Error"),
                        ]
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            return

        # Wait on the job. If the status exception is raised, report it,
        # delete the job, and return no events.
        try:
            wait_for_job_success(
                job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        # Collect log lines from every pod of the job.
        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")

        # Add the events filtered out of the collected log lines, then serialize all.
        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [serialize_dagster_namedtuple(event) for event in events]
        return serialized_events
示例#7
0
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Rebuild the execution context used when dagstermill runs a notebook.

        A call to this function appears in the ``injected parameters`` cell of
        a dagstermill output notebook. It is not meant to be called
        interactively, other than when debugging such output notebooks.

        For interactive exploration and development, define a context with
        :func:`dagstermill.get_context` in the notebook's ``parameters`` cell;
        dagstermill swaps that call for one to
        :func:`dagstermill.reconstitute_pipeline_context` when it executes the
        notebook.

        Side effects: sets ``marshal_dir``, ``in_pipeline``, ``solid_def``,
        ``pipeline`` and ``context`` on ``self``.

        Returns:
            The ``DagstermillRuntimeExecutionContext`` stored on ``self.context``.

        Raises:
            DagstermillError: If the ``DagsterInstance`` cannot be resolved from
                ``instance_ref_dict``.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
        # The remaining serialized inputs must all be dicts.
        for required_dict, param_name in (
            (pipeline_run_dict, "pipeline_run_dict"),
            (executable_dict, "executable_dict"),
            (solid_handle_kwargs, "solid_handle_kwargs"),
            (instance_ref_dict, "instance_ref_dict"),
        ):
            check.dict_param(required_dict, param_name)

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
        except Exception as exc:  # pylint: disable=broad-except
            raise DagstermillError(
                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
            ) from exc

        pipeline_run = unpack_value(pipeline_run_dict)

        handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        environment_config = EnvironmentConfig.build(
            pipeline_def, run_config, mode=pipeline_run.mode)

        execution_plan = ExecutionPlan.build(
            self.pipeline,
            environment_config,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
                execution_plan,
                pipeline,
                run_config,
                pipeline_run,
                instance,
                scoped_resources_builder_cm=self._setup_resources,
                # Set this flag even though we're not in test for clearer error reporting
                raise_on_error=True,
        ) as pipeline_context:
            # Config for this solid, if the run config provides any.
            solids_config = run_config.get("solids", {})
            solid_config = solids_config.get(solid_def.name, {}).get("config")
            required_resource_keys = get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                environment_config,
                pipeline_context.intermediate_storage_def,
            )
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=solid_config,
                resource_keys_to_init=required_resource_keys,
                solid_name=solid_def.name,
            )

        return self.context
示例#8
0
    def _execute_step_k8s_job(
        _self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        pipeline_origin_packed,
        resources=None,
        kubeconfig_file=None,
    ):
        '''Run step execution in a K8s job pod.
        '''

        check.dict_param(instance_ref_dict, 'instance_ref_dict')
        check.list_param(step_keys, 'step_keys', of_type=str)
        check.invariant(
            len(step_keys) == 1,
            'Celery K8s task executor can only execute 1 step at a time')
        check.dict_param(run_config, 'run_config')
        check.str_param(mode, 'mode')
        check.str_param(repo_name, 'repo_name')
        check.str_param(repo_location_name, 'repo_location_name')
        check.str_param(run_id, 'run_id')

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
        check.str_param(job_namespace, 'job_namespace')

        check.bool_param(load_incluster_config, 'load_incluster_config')
        check.dict_param(retries_dict, 'retries_dict')

        pipeline_origin = unpack_value(
            check.dict_param(
                pipeline_origin_packed,
                'pipeline_origin_packed')  # TODO: make part of args
        )
        check.inst(pipeline_origin, PipelineOrigin)

        check.opt_dict_param(resources,
                             'resources',
                             key_type=str,
                             value_type=dict)
        check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)

        check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

        step_key = step_keys[0]
        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                'Not scheduling step because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(run_id, step_key)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
            pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        else:
            job_name = 'dagster-job-%s' % (k8s_name_key)
            pod_name = 'dagster-job-%s' % (k8s_name_key)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=None,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            ))
        command = ['dagster']
        args = ['api', 'execute_step_with_structured_logs', input_json]

        job = construct_dagster_k8s_job(job_config, command, args, job_name,
                                        resources, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            'Executing step {} in Kubernetes job {}'.format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    EventMetadataEntry.text(job_config.job_image, 'Job image'),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            'Image pull policy'),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            'Image pull secrets'),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        'Service account name'),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not have access to user code)
            step_key=step_key,
        )
        events.append(engine_event)

        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(
                body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == 'Conflict':
                # There is an existing job with the same name so do not proceed.
                instance.report_engine_event(
                    'Did not create Kubernetes job {} for step {} since job name already '
                    'exists, exiting.'.format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, 'Step keys'),
                            EventMetadataEntry.text(job_name,
                                                    'Kubernetes Job name'),
                            EventMetadataEntry.text(pod_name,
                                                    'Kubernetes Pod name'),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                    'exiting.'.format(job_name, step_key),
                    pipeline_run,
                    EngineEventData([
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(e, 'Error'),
                    ]),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            return

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                'Terminating Kubernetes Job because pipeline run status is not STARTED',
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace,
                                            'Kubernetes Job namespace'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return

        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split('\n')

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
示例#9
0
    def _execute_step_docker(
        self,
        execute_step_args_packed,
        docker_config,
    ):
        """Run step execution in a Docker container.

        Args:
            execute_step_args_packed: Packed dict form of an ``ExecuteStepArgs``;
                it is unpacked with ``unpack_value`` and type-checked.
            docker_config: Dict of Docker settings. Keys read here are ``image``,
                ``registry`` (with ``url``, ``username``, ``password``),
                ``env_vars`` and ``network``.

        Returns:
            A list of serialized events: the engine event reported before the
            container is started, followed by every non-empty line of the
            container output.

        Raises:
            Exception: If neither ``docker_config`` nor the repository origin
                names an image, or if the container run yields no output.
            docker.errors.ContainerError: Re-raised after an engine event
                describing the failure has been reported.
        """
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(docker_config, "docker_config")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        input_json = serialize_dagster_namedtuple(execute_step_args)

        # json.dumps wraps the serialized args in a quoted JSON string literal
        # (presumably so they reach the CLI as a single argument -- confirm).
        command = "dagster api execute_step {}".format(json.dumps(input_json))

        # An explicitly configured image wins; otherwise fall back to the image
        # recorded on the repository origin.
        docker_image = (docker_config["image"]
                        if docker_config.get("image") else execute_step_args.
                        pipeline_origin.repository_origin.container_image)

        if not docker_image:
            raise Exception(
                "No docker image specified by either the job or the repository"
            )

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry.text(step_keys_str, "Step keys"),
                    MetadataEntry.text(docker_image, "Image"),
                    MetadataEntry.text(self.request.hostname, "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        # Forward only the variables named in docker_config["env_vars"], taking
        # their values from this worker's environment (things like AWS creds).
        # os.getenv yields None for names that are unset on the worker.
        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                environment=docker_env,
                network=docker_config.get("network", None),
            )
        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    MetadataEntry.text(docker_image, "Job image"),
                    MetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=execute_step_args.step_keys_to_execute[0],
            )
            raise

        # The missing-output check has to happen before decoding: None has no
        # .decode(), so checking the decoded value could never detect it.
        # NOTE(review): docker-py appears to return None when container logs
        # cannot be read back -- confirm against the docker SDK docs.
        if docker_response is None:
            raise Exception(
                "No response from execute_step in CeleryDockerExecutor")

        res = docker_response.decode("utf-8")
        serialized_events += [event for event in res.split("\n") if event]

        return serialized_events
示例#10
0
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle_kwargs=None,
        pipeline_run_dict=None,
        solid_subset=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        '''Rebuild the pipeline context for a notebook that dagstermill is executing.

        A call to this function is what appears in the ``injected parameters`` cell of a
        dagstermill output notebook. It is not meant to be called interactively, other than
        when debugging an output notebook.

        For interactive exploration and development, put :func:`dagstermill.get_context` in the
        notebook's ``parameters`` cell instead; dagstermill swaps that call for one to
        :func:`dagstermill.reconstitute_pipeline_context` when it executes the notebook.

        Args:
            output_log_path: Optional string; only type-checked here.
            marshal_dir: Optional string, stored on ``self.marshal_dir``.
            environment_dict: Optional dict with string keys, passed to execution plan and
                pipeline context creation.
            handle_kwargs: Dict used to load the handle for the pipeline.
            pipeline_run_dict: Packed pipeline run, unpacked with ``unpack_value``.
            solid_subset: Optional list of strings used to build the sub-pipeline.
            solid_handle_kwargs: Dict from which the ``SolidHandle`` is built.
            instance_ref_dict: Packed instance ref used to resolve the ``DagsterInstance``.

        Returns:
            The ``DagstermillRuntimeExecutionContext`` stored on ``self.context``.

        Raises:
            DagstermillError: If the handle cannot be loaded or the instance cannot be
                resolved from ``instance_ref_dict``.
        '''
        # Argument validation, in the same order as the signature is consumed below.
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        env_config = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
        check.dict_param(handle_kwargs, 'handle_kwargs')
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
        check.dict_param(instance_ref_dict, 'instance_ref_dict')

        try:
            target_handle = load_handle.handle_for_pipeline_cli_args(
                handle_kwargs, use_default_repository_yaml=False
            )
        except (check.CheckError, load_handle.UsageError) as exc:
            handle_error = DagstermillError(
                'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                'through dagster-graphql, or in-memory after loading it through an '
                'ExecutionTargetHandle.'
            )
            six.raise_from(handle_error, exc)

        try:
            instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
        except Exception as exc:  # pylint: disable=broad-except
            instance_error = DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            )
            six.raise_from(instance_error, exc)

        # Build the full definition first, then the label used in the type-check message.
        full_pipeline_def = target_handle.build_pipeline_definition()
        check_label = 'pipeline_def (from handle {handle_dict})'.format(
            handle_dict=target_handle.data._asdict()
        )
        sub_pipeline_def = check.inst_param(
            full_pipeline_def, check_label, PipelineDefinition
        ).build_sub_pipeline(solid_subset)

        resolved_solid_def = sub_pipeline_def.get_solid(
            SolidHandle.from_dict(solid_handle_kwargs)
        ).definition

        run = unpack_value(pipeline_run_dict)

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = resolved_solid_def
        self.pipeline_def = sub_pipeline_def

        plan = create_execution_plan(self.pipeline_def, env_config, run)

        with scoped_pipeline_context(
            self.pipeline_def,
            env_config,
            run,
            instance,
            plan,
            scoped_resources_builder_cm=self._setup_resources,
        ) as scoped_context:
            required_keys = get_required_resource_keys_to_init(
                plan, scoped_context.system_storage_def
            )
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=scoped_context,
                solid_config=None,
                resource_keys_to_init=required_keys,
            )

        return self.context
示例#11
0
    def _execute_step_docker(
        self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        run_id,
        docker_config,
        pipeline_origin_packed,
        retries_dict,
    ):
        """Run step execution in a Docker container.

        Args:
            instance_ref_dict: Dict from which the ``InstanceRef`` is rebuilt.
            step_keys: List of step key strings to execute; the first key is
                used as the ``step_key`` of the reported engine events.
            run_config: Run config dict forwarded in the ``ExecuteStepArgs``.
            mode: Mode name forwarded in the ``ExecuteStepArgs``.
            repo_name: Repository name; only type-checked here.
            run_id: Id of the pipeline run to load from the instance.
            docker_config: Dict of Docker settings. Keys read here are ``image``
                (required), ``registry`` (with ``url``, ``username``,
                ``password``), ``env_vars`` and ``network``.
            pipeline_origin_packed: Packed pipeline origin, unpacked with
                ``unpack_value``.
            retries_dict: Retries config dict forwarded in the
                ``ExecuteStepArgs``.

        Returns:
            A list of serialized events: the engine event reported before the
            container is started, followed by every non-empty line of the
            container output.

        Raises:
            Exception: If the container run yields no output.
            docker.errors.ContainerError: Re-raised after an engine event
                describing the failure has been reported.
        """
        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.dict_param(run_config, "run_config")
        check.str_param(mode, "mode")
        check.str_param(repo_name, "repo_name")
        check.str_param(run_id, "run_id")
        check.dict_param(docker_config, "docker_config")
        pipeline_origin = unpack_value(
            check.dict_param(pipeline_origin_packed, "pipeline_origin_packed"))
        check.dict_param(retries_dict, "retries_dict")

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)
        check.invariant(pipeline_run, "Could not load run {}".format(run_id))

        step_keys_str = ", ".join(step_keys)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=instance_ref,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            ))

        # json.dumps wraps the serialized args in a quoted JSON string literal
        # (presumably so they reach the CLI as a single argument -- confirm).
        command = "dagster api execute_step_with_structured_logs {}".format(
            json.dumps(input_json))

        docker_image = docker_config["image"]

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "Step keys"),
                    EventMetadataEntry.text(docker_image, "Image"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        # Forward only the variables named in docker_config["env_vars"], taking
        # their values from this worker's environment (things like AWS creds).
        # os.getenv yields None for names that are unset on the worker.
        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                detach=False,
                auto_remove=True,
                environment=docker_env,
                network=docker_config.get("network", None),
            )
        except docker.errors.ContainerError as err:
            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ], ),
                CeleryDockerExecutor,
                step_key=step_keys[0],
            )
            raise

        # The missing-output check has to happen before decoding: None has no
        # .decode(), so checking the decoded value could never detect it.
        # NOTE(review): docker-py appears to return None when container logs
        # cannot be read back -- confirm against the docker SDK docs.
        if docker_response is None:
            raise Exception(
                "No response from execute_step_with_structured_logs in CeleryDockerExecutor"
            )

        res = docker_response.decode("utf-8")
        serialized_events += [event for event in res.split("\n") if event]

        return serialized_events
示例#12
0
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        '''Rebuild the pipeline context for a notebook that dagstermill is executing.

        A call to this function is what appears in the ``injected parameters`` cell of a
        dagstermill output notebook. It is not meant to be called interactively, other than
        when debugging an output notebook.

        For interactive exploration and development, put :func:`dagstermill.get_context` in the
        notebook's ``parameters`` cell instead; dagstermill swaps that call for one to
        :func:`dagstermill.reconstitute_pipeline_context` when it executes the notebook.

        Args:
            output_log_path: Optional string; only type-checked here.
            marshal_dir: Optional string, stored on ``self.marshal_dir``.
            environment_dict: Optional dict with string keys, passed to execution plan and
                pipeline context creation.
            executable_dict: Dict from which the executable pipeline is rebuilt.
            pipeline_run_dict: Packed pipeline run, unpacked with ``unpack_value``.
            solid_handle_kwargs: Dict from which the ``SolidHandle`` is built.
            instance_ref_dict: Packed instance ref used to resolve the ``DagsterInstance``.

        Returns:
            The ``DagstermillRuntimeExecutionContext`` stored on ``self.context``.

        Raises:
            DagstermillError: If the instance cannot be resolved from ``instance_ref_dict``.
        '''
        # Argument validation.
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        env_config = check.opt_dict_param(
            environment_dict, 'environment_dict', key_type=str)
        check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
        check.dict_param(executable_dict, 'executable_dict')
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
        check.dict_param(instance_ref_dict, 'instance_ref_dict')

        executable = InterProcessExecutablePipeline.from_dict(executable_dict)
        definition = executable.get_definition()

        try:
            instance = DagsterInstance.from_ref(unpack_value(instance_ref_dict))
        except Exception as exc:  # pylint: disable=broad-except
            instance_error = DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            )
            six.raise_from(instance_error, exc)

        run = unpack_value(pipeline_run_dict)

        resolved_solid_def = definition.get_solid(
            SolidHandle.from_dict(solid_handle_kwargs)).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = resolved_solid_def
        self.pipeline = executable

        plan = create_execution_plan(
            self.pipeline,
            env_config,
            mode=run.mode,
            step_keys_to_execute=run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
                plan,
                env_config,
                run,
                instance,
                scoped_resources_builder_cm=self._setup_resources,
                # Not a test run, but raising gives clearer error reporting
                raise_on_error=True,
        ) as scoped_context:
            required_keys = get_required_resource_keys_to_init(
                plan, scoped_context.system_storage_def)
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=scoped_context,
                solid_config=None,
                resource_keys_to_init=required_keys,
            )

        return self.context
示例#13
0
    def _execute_step_docker(
        self,
        execute_step_args_packed,
        docker_config,
    ):
        """Run step execution in a Docker container.

        Args:
            execute_step_args_packed: Packed dict form of an ``ExecuteStepArgs``;
                it is unpacked with ``unpack_value`` and type-checked.
            docker_config: Dict of Docker settings. Keys read here are ``image``,
                ``registry`` (with ``url``, ``username``, ``password``),
                ``env_vars``, ``network`` and ``container_kwargs``. The dict
                and its ``container_kwargs`` entry are not modified.

        Returns:
            A list of serialized events: the engine event reported before the
            container is started, followed by every non-empty line of the
            container output.

        Raises:
            Exception: If neither ``docker_config`` nor the repository origin
                names an image, or if the container run yields no output.
            docker.errors.ContainerError: Re-raised after an engine event
                describing the failure has been reported.
        """
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(docker_config, "docker_config")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.inst(
            pipeline_run,
            PipelineRun,
            "Could not load run {}".format(execute_step_args.pipeline_run_id),
        )
        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        input_json = serialize_dagster_namedtuple(execute_step_args)

        # json.dumps wraps the serialized args in a quoted JSON string literal
        # (presumably so they reach the CLI as a single argument -- confirm).
        command = "dagster api execute_step {}".format(json.dumps(input_json))

        # An explicitly configured image wins; otherwise fall back to the image
        # recorded on the repository origin.
        docker_image = (docker_config["image"]
                        if docker_config.get("image") else execute_step_args.
                        pipeline_origin.repository_origin.container_image)

        if not docker_image:
            raise Exception(
                "No docker image specified by either the job or the repository"
            )

        client = docker.client.from_env()

        if docker_config.get("registry"):
            client.login(
                registry=docker_config["registry"]["url"],
                username=docker_config["registry"]["username"],
                password=docker_config["registry"]["password"],
            )

        # Post event for starting execution
        engine_event = instance.report_engine_event(
            "Executing steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry("Step keys", value=step_keys_str),
                    MetadataEntry("Image", value=docker_image),
                    MetadataEntry("Celery worker",
                                  value=self.request.hostname),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )

        serialized_events = [serialize_dagster_namedtuple(engine_event)]

        # Forward only the variables named in docker_config["env_vars"], taking
        # their values from this worker's environment (things like AWS creds).
        # os.getenv yields None for names that are unset on the worker.
        docker_env = {}
        if docker_config.get("env_vars"):
            docker_env = {
                env_name: os.getenv(env_name)
                for env_name in docker_config["env_vars"]
            }

        # Work on a shallow copy so the defaults and the "environment" removal
        # below never leak back into the caller's docker_config.
        container_kwargs = dict(
            check.opt_dict_param(docker_config.get("container_kwargs"),
                                 "container_kwargs",
                                 key_type=str))

        # set defaults for detach and auto_remove
        container_kwargs.setdefault("detach", False)
        container_kwargs.setdefault("auto_remove", True)

        # If environment variables are provided via container_kwargs, merge them
        # with env_vars. The key is always popped: "environment" is passed
        # explicitly to containers.run() below, so leaving it in the kwargs
        # (even with a None value) would be a duplicate keyword argument.
        e_vars = container_kwargs.pop("environment", None)
        if e_vars is not None:
            if isinstance(e_vars, dict):
                docker_env.update(e_vars)
            else:
                for v in e_vars:
                    # Split on the first "=" only so values that themselves
                    # contain "=" are kept intact.
                    key, val = v.split("=", 1)
                    docker_env[key] = val

        try:
            docker_response = client.containers.run(
                docker_image,
                command=command,
                environment=docker_env,
                network=docker_config.get("network", None),
                **container_kwargs,
            )
        except docker.errors.ContainerError as err:
            entries = [MetadataEntry("Job image", value=docker_image)]
            if err.stderr is not None:
                entries.append(MetadataEntry("Docker stderr",
                                             value=err.stderr))

            instance.report_engine_event(
                "Failed to run steps {} in Docker container {}".format(
                    step_keys_str, docker_image),
                pipeline_run,
                EngineEventData(entries),
                CeleryDockerExecutor,
                step_key=execute_step_args.step_keys_to_execute[0],
            )
            raise

        # The missing-output check has to happen before decoding: None has no
        # .decode(), so checking the decoded value could never detect it.
        # NOTE(review): docker-py appears to return None when container logs
        # cannot be read back -- confirm against the docker SDK docs.
        if docker_response is None:
            raise Exception(
                "No response from execute_step in CeleryDockerExecutor")

        res = docker_response.decode("utf-8")
        serialized_events += [event for event in res.split("\n") if event]

        return serialized_events