def value_from_storage_dict(storage_dict, klass):
    # Handle case where daemon_type used to be an enum (e.g. DaemonType.SCHEDULER)
    return DaemonHeartbeat(
        timestamp=storage_dict.get("timestamp"),
        daemon_type=(
            storage_dict.get("daemon_type").value
            if isinstance(storage_dict.get("daemon_type"), DaemonType)
            else storage_dict.get("daemon_type")
        ),
        daemon_id=storage_dict.get("daemon_id"),
        errors=(
            [unpack_value(storage_dict.get("error"))]  # error was replaced with errors
            if storage_dict.get("error")
            else unpack_value(storage_dict.get("errors"))
        ),
    )
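# Usage sketch for the back-compat path above (hypothetical values; assumes the
# legacy "error" key holds a packed value, just like entries of "errors"):
#
#   legacy_dict = {
#       "timestamp": 1609459200.0,
#       "daemon_type": DaemonType.SCHEDULER,  # legacy enum form -> stored via .value
#       "daemon_id": "abc123",
#       "error": packed_error,  # legacy singular key, wrapped into errors=[...]
#   }
#   heartbeat = value_from_storage_dict(legacy_dict, DaemonHeartbeat)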
def _execute_plan(self, execute_step_args_packed, executable_dict):
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)

    check.dict_param(executable_dict, "executable_dict")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retries = Retries.from_config(execute_step_args.retries_dict)

    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.invariant(
        pipeline_run,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=execute_step_args.step_keys_to_execute,
    )

    engine_event = instance.report_engine_event(
        "Executing steps {} in celery worker".format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "step_keys"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryExecutor,
        step_key=execution_plan.step_key_for_single_step_plans(),
    )

    events = [engine_event]
    for step_event in execute_plan_iterator(
        execution_plan,
        pipeline_run=pipeline_run,
        run_config=pipeline_run.run_config,
        instance=instance,
        retries=retries,
    ):
        events.append(step_event)

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
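# Hypothetical wiring sketch (not the library's exact task factory): in
# dagster-celery, functions like _execute_plan are registered as bound Celery
# tasks, which is why `self.request.hostname` above resolves to the worker name.
from celery import Celery

app = Celery("dagster")


@app.task(bind=True, name="execute_plan")
def execute_plan_task(self, execute_step_args_packed, executable_dict):
    # `bind=True` passes the task instance as the first argument, matching the
    # `self` parameter expected by _execute_plan.
    return _execute_plan(self, execute_step_args_packed, executable_dict)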
def from_dict(val):
    check.dict_param(val, "val")

    inst = unpack_value(val)
    check.invariant(
        isinstance(inst, ReconstructablePipeline),
        "Deserialized object is not instance of ReconstructablePipeline, got {type}".format(
            type=type(inst)
        ),
    )
    return inst
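# Round-trip sketch (hypothetical usage): from_dict is the inverse of packing the
# pipeline with dagster's serdes pack_value, e.g. before shipping it to a worker:
#
#   packed = pack_value(reconstructable_pipeline)  # plain dict, safe to serialize
#   pipeline = ReconstructablePipeline.from_dict(packed)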
def from_dict(val):
    check.dict_param(val, 'val')

    inst = unpack_value(val)
    check.invariant(
        isinstance(inst, InterProcessExecutablePipeline),
        'Deserialized object is not instance of InterProcessExecutablePipeline, got {type}'.format(
            type=type(inst)
        ),
    )
    return inst
def _execute_step_k8s_job(
    self,
    execute_step_args_packed,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod."""
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.invariant(
        len(execute_step_args.step_keys_to_execute) == 1,
        "Celery K8s task executor can only execute 1 step at a time",
    )

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")

    check.bool_param(load_incluster_config, "load_incluster_config")

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)

    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_key = execute_step_args.step_keys_to_execute[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)

    retry_state = execute_step_args.known_state.get_retry_state()

    if retry_state.get_attempt_count(step_key):
        attempt_number = retry_state.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    args = ["dagster", "api", "execute_step", input_json]

    job = construct_dagster_k8s_job(job_config, args, job_name, user_defined_k8s_config, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so proceed and see if the existing
            # job succeeded
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, proceeding with existing job.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=execute_step_args.pipeline_run_id,
        )
    except (DagsterK8sError, DagsterK8sTimeoutError) as err:
        step_failure_event = construct_step_failure_event_and_handle(
            pipeline_run, step_key, err, instance=instance
        )
        events.append(step_failure_event)
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except (
        DagsterK8sUnrecoverableAPIError,
        DagsterK8sAPIRetryLimitExceeded,
        # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
        # a retry boundary. We still catch it here just in case we missed one so that we can
        # report it to the event log
        kubernetes.client.rest.ApiException,
    ) as err:
        instance.report_engine_event(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        instance.report_engine_event(
            "Encountered unexpected error retrieving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        try:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                    job_name, pod_name, step_key
                ),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
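# Naming sketch for the retry path above (hypothetical key): get_k8s_job_name
# hashes the run id and step key to stay below Kubernetes name length limits,
# and retried attempts get a distinct suffix so they never collide with the
# original Job/Pod:
#
#   first attempt -> "dagster-job-<k8s_name_key>"
#   retry #2      -> "dagster-job-<k8s_name_key>-2"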
def _execute_step_k8s_job(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod."""
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.invariant(
        len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
    )
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(repo_location_name, "repo_location_name")
    check.str_param(run_id, "run_id")

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")
    check.bool_param(load_incluster_config, "load_incluster_config")
    check.dict_param(retries_dict, "retries_dict")

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, "pipeline_origin_packed"
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))
    step_key = step_keys[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, "Step keys")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )

    command = ["dagster"]
    args = ["api", "execute_step_with_structured_logs", input_json]

    job = construct_dagster_k8s_job(
        job_config, command, args, job_name, user_defined_k8s_config, pod_name
    )

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step keys"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so do not proceed.
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(e, "Error"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split("\n")

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    run_config=None,
    executable_dict=None,
    pipeline_run_dict=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
):
    """Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the
    ``injected parameters`` cell of a dagstermill output notebook. Users should not call this
    function interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    """
    check.opt_str_param(output_log_path, "output_log_path")
    check.opt_str_param(marshal_dir, "marshal_dir")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    check.dict_param(pipeline_run_dict, "pipeline_run_dict")
    check.dict_param(executable_dict, "executable_dict")
    check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
    check.dict_param(instance_ref_dict, "instance_ref_dict")

    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    pipeline_def = pipeline.get_definition()

    try:
        instance_ref = unpack_value(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
    except Exception as err:  # pylint: disable=broad-except
        raise DagstermillError(
            "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
        ) from err

    pipeline_run = unpack_value(pipeline_run_dict)

    solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
    solid_def = pipeline_def.get_solid(solid_handle).definition

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline = pipeline

    environment_config = EnvironmentConfig.build(pipeline_def, run_config, mode=pipeline_run.mode)

    execution_plan = ExecutionPlan.build(
        self.pipeline,
        environment_config,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    with scoped_pipeline_context(
        execution_plan,
        pipeline,
        run_config,
        pipeline_run,
        instance,
        scoped_resources_builder_cm=self._setup_resources,
        # Set this flag even though we're not in test for clearer error reporting
        raise_on_error=True,
    ) as pipeline_context:
        self.context = DagstermillRuntimeExecutionContext(
            pipeline_context=pipeline_context,
            pipeline_def=pipeline_def,
            solid_config=run_config.get("solids", {}).get(solid_def.name, {}).get("config"),
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                environment_config,
                pipeline_context.intermediate_storage_def,
            ),
            solid_name=solid_def.name,
        )

    return self.context
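# Sketch of the "injected parameters" cell that dagstermill writes into an output
# notebook (payloads are placeholders; the real cell passes serialized dicts):
#
#   import dagstermill
#
#   context = dagstermill.reconstitute_pipeline_context(
#       output_log_path="...",
#       marshal_dir="...",
#       run_config={...},
#       executable_dict={...},
#       pipeline_run_dict={...},
#       solid_handle_kwargs={...},
#       instance_ref_dict={...},
#   )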
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.'''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, 'pipeline_origin_packed'
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_key = step_keys[0]
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        job_name = 'dagster-job-%s' % (k8s_name_key)
        pod_name = 'dagster-job-%s' % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )

    command = ['dagster']
    args = ['api', 'execute_step_with_structured_logs', input_json]

    job = construct_dagster_k8s_job(job_config, command, args, job_name, resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == 'Conflict':
            # There is an existing job with the same name so do not proceed.
            instance.report_engine_event(
                'Did not create Kubernetes job {} for step {} since job name already '
                'exists, exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                'exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(e, 'Error'),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_docker(
    self,
    execute_step_args_packed,
    docker_config,
):
    """Run step execution in a Docker container."""
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(docker_config, "docker_config")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    input_json = serialize_dagster_namedtuple(execute_step_args)

    command = "dagster api execute_step {}".format(json.dumps(input_json))

    docker_image = (
        docker_config["image"]
        if docker_config.get("image")
        else execute_step_args.pipeline_origin.repository_origin.container_image
    )
    if not docker_image:
        raise Exception("No docker image specified by either the job or the repository")

    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry.text(step_keys_str, "Step keys"),
                MetadataEntry.text(docker_image, "Image"),
                MetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=execute_step_args.step_keys_to_execute[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )
        res = docker_response.decode("utf-8")
    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry.text(docker_image, "Job image"),
                    MetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )
        raise
    else:
        if res is None:
            raise Exception("No response from execute_step in CeleryDockerExecutor")

        serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    environment_dict=None,
    handle_kwargs=None,
    pipeline_run_dict=None,
    solid_subset=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
):
    '''Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the
    ``injected parameters`` cell of a dagstermill output notebook. Users should not call this
    function interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    '''
    check.opt_str_param(output_log_path, 'output_log_path')
    check.opt_str_param(marshal_dir, 'marshal_dir')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
    check.dict_param(handle_kwargs, 'handle_kwargs')
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
    check.dict_param(instance_ref_dict, 'instance_ref_dict')

    try:
        handle = load_handle.handle_for_pipeline_cli_args(
            handle_kwargs, use_default_repository_yaml=False
        )
    except (check.CheckError, load_handle.UsageError) as err:
        six.raise_from(
            DagstermillError(
                'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                'through dagster-graphql, or in-memory after loading it through an '
                'ExecutionTargetHandle.'
            ),
            err,
        )

    try:
        instance_ref = unpack_value(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
    except Exception as err:  # pylint: disable=broad-except
        six.raise_from(
            DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            ),
            err,
        )

    pipeline_def = check.inst_param(
        handle.build_pipeline_definition(),
        'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
        PipelineDefinition,
    ).build_sub_pipeline(solid_subset)

    solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
    solid_def = pipeline_def.get_solid(solid_handle).definition

    pipeline_run = unpack_value(pipeline_run_dict)

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    execution_plan = create_execution_plan(self.pipeline_def, environment_dict, pipeline_run)

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan,
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillRuntimeExecutionContext(
            pipeline_context=pipeline_context,
            solid_config=None,
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan, pipeline_context.system_storage_def
            ),
        )

    return self.context
def _execute_step_docker(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    run_id,
    docker_config,
    pipeline_origin_packed,
    retries_dict,
):
    """Run step execution in a Docker container."""
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(run_id, "run_id")
    check.dict_param(docker_config, "docker_config")
    pipeline_origin = unpack_value(
        check.dict_param(pipeline_origin_packed, "pipeline_origin_packed")
    )
    check.dict_param(retries_dict, "retries_dict")

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    step_keys_str = ", ".join(step_keys)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=instance_ref,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )

    command = "dagster api execute_step_with_structured_logs {}".format(json.dumps(input_json))

    docker_image = docker_config["image"]

    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=step_keys[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )
        res = docker_response.decode("utf-8")
    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise
    else:
        if res is None:
            raise Exception(
                "No response from execute_step_with_structured_logs in CeleryDockerExecutor"
            )

        serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    environment_dict=None,
    executable_dict=None,
    pipeline_run_dict=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
):
    '''Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the
    ``injected parameters`` cell of a dagstermill output notebook. Users should not call this
    function interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    '''
    check.opt_str_param(output_log_path, 'output_log_path')
    check.opt_str_param(marshal_dir, 'marshal_dir')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
    check.dict_param(executable_dict, 'executable_dict')
    check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
    check.dict_param(instance_ref_dict, 'instance_ref_dict')

    pipeline = InterProcessExecutablePipeline.from_dict(executable_dict)
    pipeline_def = pipeline.get_definition()

    try:
        instance_ref = unpack_value(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
    except Exception as err:  # pylint: disable=broad-except
        six.raise_from(
            DagstermillError(
                'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
            ),
            err,
        )

    pipeline_run = unpack_value(pipeline_run_dict)

    solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
    solid_def = pipeline_def.get_solid(solid_handle).definition

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline = pipeline

    execution_plan = create_execution_plan(
        self.pipeline,
        environment_dict,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    with scoped_pipeline_context(
        execution_plan,
        environment_dict,
        pipeline_run,
        instance,
        scoped_resources_builder_cm=self._setup_resources,
        # Set this flag even though we're not in test for clearer error reporting
        raise_on_error=True,
    ) as pipeline_context:
        self.context = DagstermillRuntimeExecutionContext(
            pipeline_context=pipeline_context,
            solid_config=None,
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan, pipeline_context.system_storage_def
            ),
        )

    return self.context
def _execute_step_docker(
    self,
    execute_step_args_packed,
    docker_config,
):
    """Run step execution in a Docker container."""
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(docker_config, "docker_config")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    input_json = serialize_dagster_namedtuple(execute_step_args)

    command = "dagster api execute_step {}".format(json.dumps(input_json))

    docker_image = (
        docker_config["image"]
        if docker_config.get("image")
        else execute_step_args.pipeline_origin.repository_origin.container_image
    )
    if not docker_image:
        raise Exception("No docker image specified by either the job or the repository")

    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry("Step keys", value=step_keys_str),
                MetadataEntry("Image", value=docker_image),
                MetadataEntry("Celery worker", value=self.request.hostname),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=execute_step_args.step_keys_to_execute[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}

    container_kwargs = check.opt_dict_param(
        docker_config.get("container_kwargs"), "container_kwargs", key_type=str
    )

    # set defaults for detach and auto_remove
    container_kwargs["detach"] = container_kwargs.get("detach", False)
    container_kwargs["auto_remove"] = container_kwargs.get("auto_remove", True)

    # if environment variables are provided via container_kwargs, merge with env_vars
    if container_kwargs.get("environment") is not None:
        e_vars = container_kwargs.get("environment")
        if isinstance(e_vars, dict):
            docker_env.update(e_vars)
        else:
            for v in e_vars:
                # split on the first "=" only, so values may themselves contain "="
                key, val = v.split("=", 1)
                docker_env[key] = val
        del container_kwargs["environment"]

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
            **container_kwargs,
        )
        res = docker_response.decode("utf-8")
    except docker.errors.ContainerError as err:
        entries = [MetadataEntry("Job image", value=docker_image)]
        if err.stderr is not None:
            entries.append(MetadataEntry("Docker stderr", value=err.stderr))
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(entries),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )
        raise
    else:
        if res is None:
            raise Exception("No response from execute_step in CeleryDockerExecutor")

        serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
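# Example docker_config accepted by _execute_step_docker above; the key shape is
# taken from the function body, the values are illustrative placeholders:
EXAMPLE_DOCKER_CONFIG = {
    # optional; falls back to the repository origin's container_image when absent
    "image": "my-registry.example.com/my-repo:latest",
    "registry": {
        "url": "my-registry.example.com",
        "username": "my-user",
        "password": "my-password",
    },
    # names of variables to pass through from the Celery worker's environment
    "env_vars": ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
    "network": "host",
    # forwarded to docker-py's containers.run(); detach/auto_remove get defaults
    "container_kwargs": {"volumes": ["/tmp:/tmp"]},
}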