def _execute_plan(self, execute_step_args_packed, executable_dict):
    """Execute a set of steps inside a Celery worker and return the serialized events.

    Args:
        execute_step_args_packed: Packed dict that unpacks to an ``ExecuteStepArgs``.
        executable_dict: Dict form of a ``ReconstructablePipeline``.

    Returns:
        A list of serialized events: the initial engine event followed by every
        event yielded while executing the plan.
    """
    packed_args = check.dict_param(
        execute_step_args_packed,
        "execute_step_args_packed",
    )
    execute_step_args = unpack_value(packed_args)
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(executable_dict, "executable_dict")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retries = Retries.from_config(execute_step_args.retries_dict)

    run_id = execute_step_args.pipeline_run_id
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    requested_step_keys = execute_step_args.step_keys_to_execute
    step_keys_str = ", ".join(requested_step_keys)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=requested_step_keys,
    )

    # Record which worker picked up the steps before any of them run.
    worker_metadata = [
        EventMetadataEntry.text(step_keys_str, "step_keys"),
        EventMetadataEntry.text(self.request.hostname, "Celery worker"),
    ]
    engine_event = instance.report_engine_event(
        "Executing steps {} in celery worker".format(step_keys_str),
        pipeline_run,
        EngineEventData(worker_metadata, marker_end=DELEGATE_MARKER),
        CeleryExecutor,
        step_key=execution_plan.step_key_for_single_step_plans(),
    )

    events = [engine_event]
    events.extend(
        execute_plan_iterator(
            execution_plan,
            pipeline_run=pipeline_run,
            run_config=pipeline_run.run_config,
            instance=instance,
            retries=retries,
        )
    )

    return [serialize_dagster_namedtuple(event) for event in events]
def test_input_manager_with_failure():
    """A ``Failure`` raised by a root input manager shows up in the solid's failure data."""

    @root_input_manager
    def should_fail(_):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label", text="text", description="description")
            ],
        )

    @solid(input_defs=[InputDefinition("_fail_input", root_manager_key="should_fail")])
    def fail_on_input(_, _fail_input):
        assert False, "should not be called"

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"should_fail": should_fail})])
    def simple():
        fail_on_input()

    with tempfile.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(temp_dir))
        run_result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        assert not run_result.success

        failure_data = run_result.result_for_solid("fail_on_input").failure_data
        assert failure_data.error.cls_name == "Failure"

        # The user-supplied description and metadata must round-trip unchanged.
        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[0].description == "description"
def run_daemon_loop( self, instance_ref, daemon_uuid, daemon_shutdown_event, gen_workspace, heartbeat_interval_seconds, error_interval_seconds, until=None, ): from dagster.core.telemetry_upload import uploading_logging_thread # Each loop runs in its own thread with its own instance and IWorkspace with DagsterInstance.from_ref(instance_ref) as instance: with uploading_logging_thread(): with gen_workspace(instance) as workspace: check.inst_param(workspace, "workspace", IWorkspace) daemon_generator = self.core_loop(instance, workspace) try: while (not daemon_shutdown_event.is_set()) and ( not until or pendulum.now("UTC") < until ): try: result = check.opt_inst( next(daemon_generator), SerializableErrorInfo ) if result: self._errors.appendleft((result, pendulum.now("UTC"))) except StopIteration: self._logger.error( "Daemon loop finished without raising an error - daemon loops should run forever until they are interrupted." ) break except Exception: error_info = serializable_error_info_from_exc_info(sys.exc_info()) self._logger.error( "Caught error, daemon loop will restart:\n{}".format(error_info) ) self._errors.appendleft((error_info, pendulum.now("UTC"))) daemon_generator.close() daemon_generator = self.core_loop(instance, workspace) finally: try: self._check_add_heartbeat( instance, daemon_uuid, heartbeat_interval_seconds, error_interval_seconds, ) except Exception: self._logger.error( "Failed to add heartbeat: \n{}".format( serializable_error_info_from_exc_info(sys.exc_info()) ) ) finally: # cleanup the generator if it was stopped part-way through daemon_generator.close()
def test_valid_log_level_instance_yaml():
    """The log-level docs snippet loads into an instance whose python log level is INFO."""
    snippets_dir = file_relative_path(__file__, "../../../docs_snippets/concepts/logging")
    instance_ref = InstanceRef.from_dir(
        base_dir=snippets_dir,
        config_filename="python_logging_python_log_level_config.yaml",
    )

    loaded_instance = DagsterInstance.from_ref(instance_ref)

    assert loaded_instance.python_log_level == "INFO"
def test_valid_managed_loggers_instance_yaml():
    """The managed-loggers docs snippet loads into an instance managing only ``root``."""
    snippets_dir = file_relative_path(__file__, "../../../docs_snippets/concepts/logging")
    instance_ref = InstanceRef.from_dir(
        base_dir=snippets_dir,
        config_filename="python_logging_managed_loggers_config.yaml",
    )

    loaded_instance = DagsterInstance.from_ref(instance_ref)

    assert loaded_instance.managed_python_loggers == ["root"]
def _start_pipeline_execution(self, job_args):
    """Resolve the pipeline and instance named in ``job_args`` and hand them to the delegate.

    Args:
        job_args: Mapping with the keys ``'handle'``, ``'pipeline_run'`` and
            ``'instance_ref'``.
    """
    target_handle = job_args['handle']
    run = job_args['pipeline_run']

    repository = target_handle.build_repository_definition()
    pipeline_def = repository.get_pipeline(run.pipeline_name)

    instance = DagsterInstance.from_ref(job_args['instance_ref'])

    self._delegate.execute_pipeline(target_handle, pipeline_def, run, instance)
def test_valid_handler_instance_yaml():
    """The handler docs snippet loads into an instance that reports two handlers."""
    snippets_dir = file_relative_path(__file__, "../../../docs_snippets/concepts/logging")
    instance_ref = InstanceRef.from_dir(
        base_dir=snippets_dir,
        config_filename="python_logging_handler_config.yaml",
    )

    loaded_instance = DagsterInstance.from_ref(instance_ref)
    handlers = loaded_instance.get_handlers()

    assert len(handlers) == 2
def test_output_manager_with_failure():
    """A ``Failure`` raised by an output manager fails the step and stops downstream work.

    Checks that the failure data on ``emit_str`` carries the user-supplied
    description and metadata, and that neither the downstream input manager
    nor the downstream solid body was ever invoked.
    """
    _called_input_manager = False
    _called_solid = False

    @output_manager
    def should_fail(_, _obj):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label", text="text", description="description")
            ],
        )

    @input_manager
    def should_not_enter(_):
        # Without ``nonlocal`` this assignment would only bind a new local and
        # the flag checked at the end of the test could never become True.
        nonlocal _called_input_manager
        _called_input_manager = True

    @solid(output_defs=[OutputDefinition(manager_key="should_fail")])
    def emit_str(_):
        return "emit"

    @solid(
        input_defs=[
            InputDefinition(name="_input_str", dagster_type=str, manager_key="should_not_enter")
        ]
    )
    def should_not_call(_, _input_str):
        # Same reasoning as above: rebind the enclosing flag, not a local.
        nonlocal _called_solid
        _called_solid = True

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"should_fail": should_fail, "should_not_enter": should_not_enter}
            )
        ]
    )
    def simple():
        should_not_call(emit_str())

    with tempfile.TemporaryDirectory() as tmpdir_path:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        assert not result.success

        failure_data = result.result_for_solid("emit_str").failure_data

        assert failure_data.error.cls_name == "Failure"

        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[0].description == "description"

        # Downstream work must not have run after the output manager failed.
        assert not _called_input_manager and not _called_solid
def _execute_plan(_self, instance_ref_dict, handle_dict, run_id, step_keys, retries_dict):
    """Execute ``step_keys`` of a run inside a Celery worker and return the serialized events.

    Args:
        instance_ref_dict: Dict form of an ``InstanceRef``.
        handle_dict: Dict form of an ``ExecutionTargetHandle``.
        run_id: Id of the run to load from the instance.
        step_keys: List of step key strings to execute.
        retries_dict: Config dict passed to ``Retries.from_config``.

    Returns:
        A list of serialized events: the initial engine event followed by every
        event yielded while executing the subset plan.
    """
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.dict_param(handle_dict, 'handle_dict')
    check.str_param(run_id, 'run_id')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.dict_param(retries_dict, 'retries_dict')

    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    handle = ExecutionTargetHandle.from_dict(handle_dict)
    retries = Retries.from_config(retries_dict)

    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    # Narrow the full pipeline to the solids selected for this run.
    full_pipeline_def = handle.build_pipeline_definition()
    pipeline_def = full_pipeline_def.build_sub_pipeline(pipeline_run.selector.solid_subset)

    step_keys_str = ", ".join(step_keys)

    # Plan the whole run first, then restrict it to the steps assigned to this task.
    run_plan = create_execution_plan(
        pipeline_def,
        pipeline_run.environment_dict,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )
    execution_plan = run_plan.build_subset_plan(step_keys)

    engine_event = instance.report_engine_event(
        'Executing steps {} in celery worker'.format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [EventMetadataEntry.text(step_keys_str, 'step_keys')],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryEngine,
        step_key=execution_plan.step_key_for_single_step_plans(),
    )

    events = [engine_event]
    events.extend(
        execute_plan_iterator(
            execution_plan,
            pipeline_run=pipeline_run,
            environment_dict=pipeline_run.environment_dict,
            instance=instance,
            retries=retries,
        )
    )

    return [serialize_dagster_namedtuple(event) for event in events]
def _execute_plan(_self, instance_ref_dict, executable_dict, run_id, step_keys, retries_dict):
    """Execute ``step_keys`` of a run inside a Celery worker and return the serialized events.

    Args:
        instance_ref_dict: Dict form of an ``InstanceRef``.
        executable_dict: Dict form of a ``ReconstructablePipeline``.
        run_id: Id of the run to load from the instance.
        step_keys: List of step key strings to execute.
        retries_dict: Config dict passed to ``Retries.from_config``.

    Returns:
        A list of serialized events: the initial engine event followed by every
        event yielded while executing the subset plan.
    """
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.dict_param(executable_dict, "executable_dict")
    check.str_param(run_id, "run_id")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.dict_param(retries_dict, "retries_dict")

    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retries = Retries.from_config(retries_dict)

    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    step_keys_str = ", ".join(step_keys)

    # Plan the whole run first, then restrict it to the steps assigned to this task.
    run_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )
    execution_plan = run_plan.build_subset_plan(step_keys)

    engine_event = instance.report_engine_event(
        "Executing steps {} in celery worker".format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [EventMetadataEntry.text(step_keys_str, "step_keys")],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryExecutor,
        step_key=execution_plan.step_key_for_single_step_plans(),
    )

    events = [engine_event]
    events.extend(
        execute_plan_iterator(
            execution_plan,
            pipeline_run=pipeline_run,
            run_config=pipeline_run.run_config,
            instance=instance,
            retries=retries,
        )
    )

    return [serialize_dagster_namedtuple(event) for event in events]
def run_loop(
    self,
    instance_ref,
    daemon_uuid,
    daemon_shutdown_event,
    gen_workspace,
    heartbeat_interval_seconds,
    error_interval_seconds,
    until=None,
):
    """Poll on a half-second cadence, running an iteration whenever the interval has elapsed.

    Args:
        instance_ref: Reference used to construct the ``DagsterInstance``.
        daemon_uuid: Identifier forwarded to ``_run_iteration`` and
            ``_check_add_heartbeat``.
        daemon_shutdown_event: Event that stops the loop once set; it is also
            used to sleep between polls.
        gen_workspace: Callable taking the instance and returning a context
            manager that yields an ``IWorkspace``.
        heartbeat_interval_seconds: Forwarded to the iteration and heartbeat calls.
        error_interval_seconds: Forwarded to the iteration and heartbeat calls.
        until: Optional UTC time after which the loop stops.
    """
    # Each loop runs in its own thread with its own instance and IWorkspace
    with DagsterInstance.from_ref(instance_ref) as instance:
        with gen_workspace(instance) as workspace:
            check.inst_param(workspace, "workspace", IWorkspace)

            # Keep going until shutdown is signalled or the optional deadline passes.
            while not daemon_shutdown_event.is_set() and (
                    not until or pendulum.now("UTC") < until):
                curr_time = pendulum.now("UTC")
                # Run an iteration on the first pass, or once interval_seconds
                # have elapsed since the last one started.
                if (not self._last_iteration_time or
                    (curr_time - self._last_iteration_time
                     ).total_seconds() >= self.interval_seconds):
                    self._last_iteration_time = curr_time
                    self._run_iteration(
                        instance,
                        daemon_uuid,
                        daemon_shutdown_event,
                        workspace,
                        heartbeat_interval_seconds,
                        error_interval_seconds,
                        until,
                    )

                # NOTE(review): the heartbeat check is laid out here as running on
                # every poll rather than only after an iteration - confirm against
                # the original file's indentation.
                try:
                    self._check_add_heartbeat(
                        instance,
                        daemon_uuid,
                        heartbeat_interval_seconds,
                        error_interval_seconds,
                    )
                except Exception:  # pylint: disable=broad-except
                    # A heartbeat failure is logged but does not stop the loop.
                    self._logger.error(
                        "Failed to add heartbeat: \n{}".format(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())))
                # Sleep up to half a second, waking early if shutdown is signalled.
                daemon_shutdown_event.wait(0.5)
def in_mp_process(cls, handle, pipeline_run, instance_ref, term_event):
    """
    Execute pipeline using message queue as a transport

    Reports engine events when the process starts and exits, and when loading
    or executing the pipeline raises. Returns a ``PipelineExecutionResult`` on
    success and ``None`` otherwise.
    """
    run_id = pipeline_run.run_id
    pipeline_name = pipeline_run.pipeline_name

    instance = DagsterInstance.from_ref(instance_ref)
    pid = os.getpid()

    def _report_framework_error(exc_info):
        # Shared by both failure paths below, which report the same message.
        instance.report_engine_event(
            'An exception was thrown during execution that is likely a framework error, '
            'rather than an error in user code.',
            pipeline_run,
            EngineEventData.engine_error(serializable_error_info_from_exc_info(exc_info)),
            cls,
        )

    instance.report_engine_event(
        'Started process for pipeline (pid: {pid}).'.format(pid=pid),
        pipeline_run,
        EngineEventData.in_process(pid, marker_end='dagit_subprocess_init'),
        cls,
    )

    start_termination_thread(term_event)

    try:
        handle.build_repository_definition()
        named_handle = handle.with_pipeline_name(pipeline_name)
        pipeline_def = named_handle.build_pipeline_definition()
    except Exception:  # pylint: disable=broad-except
        load_error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            'Failed attempting to load pipeline "{}"'.format(pipeline_name),
            pipeline_run,
            EngineEventData.engine_error(load_error),
            cls,
        )
        return

    try:
        sub_pipeline = pipeline_def.build_sub_pipeline(pipeline_run.selector.solid_subset)
        event_list = list(execute_run_iterator(sub_pipeline, pipeline_run, instance))
        return PipelineExecutionResult(pipeline_def, run_id, event_list, lambda: None)

    # Add a DagsterEvent for unexpected exceptions
    # Explicitly ignore KeyboardInterrupts since they are used for termination
    except DagsterSubprocessError as err:
        has_real_error = any(
            err_info.cls_name != 'KeyboardInterrupt'
            for err_info in err.subprocess_error_infos
        )
        if has_real_error:
            _report_framework_error(sys.exc_info())
    except Exception:  # pylint: disable=broad-except
        _report_framework_error(sys.exc_info())
    finally:
        instance.report_engine_event(
            'Process for pipeline exited (pid: {pid}).'.format(pid=pid),
            pipeline_run,
            cls=cls,
        )
def _execute_step_k8s_job(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod.

    Validates the arguments, creates a Kubernetes job that runs
    ``dagster api execute_step_with_structured_logs`` for the single step,
    waits for the job, and collects the events found in the pod logs.

    Args:
        instance_ref_dict: Dict form of an ``InstanceRef``.
        step_keys: List containing exactly one step key.
        run_config: Run config dict forwarded to the step.
        mode: Pipeline mode name.
        repo_name: Repository name (validated only).
        repo_location_name: Repository location name (validated only).
        run_id: Id of the run to load from the instance.
        job_config_dict: Dict form of a ``DagsterK8sJobConfig``.
        job_namespace: Namespace in which the job is created.
        load_incluster_config: Load the in-cluster Kubernetes config when True,
            otherwise load ``kubeconfig_file``.
        retries_dict: Config dict passed to ``Retries.from_config``.
        pipeline_origin_packed: Packed dict that unpacks to a ``PipelineOrigin``.
        user_defined_k8s_config_dict: Dict form of a ``UserDefinedDagsterK8sConfig``.
        kubeconfig_file: Optional kubeconfig path.

    Returns:
        A list of serialized events, or ``None`` when the step is not
        scheduled, the job could not be created, or the run left the STARTED
        state while the job was being awaited.
    """
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.invariant(
        len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
    )
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(repo_location_name, "repo_location_name")
    check.str_param(run_id, "run_id")

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")
    check.bool_param(load_incluster_config, "load_incluster_config")
    check.dict_param(retries_dict, "retries_dict")

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, "pipeline_origin_packed"
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)

    check.invariant(pipeline_run, "Could not load run {}".format(run_id))
    step_key = step_keys[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, "Step keys")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    # Retried attempts get the attempt number appended to the job and pod names.
    attempt_number = retries.get_attempt_count(step_key)
    if attempt_number:
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )
    command = ["dagster"]
    args = ["api", "execute_step_with_structured_logs", input_json]

    job = construct_dagster_k8s_job(
        job_config, command, args, job_name, user_defined_k8s_config, pod_name
    )

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step keys"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), "Image pull secrets"
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so do not proceed.
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        # Other non-string values in this function are wrapped in
                        # str(...) before becoming text entries; do the same for
                        # the exception instead of passing the object itself.
                        EventMetadataEntry.text(str(e), "Error"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    # Gather the log lines of every pod; the step's events are filtered out of them.
    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split("\n")

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    environment_dict,
    mode,
    pipeline_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.

    Creates a Kubernetes job that runs the ``executePlan`` GraphQL operation
    for the single step, waits for it to succeed, and turns the pod logs back
    into events.

    Returns:
        A list of serialized events: two engine events reported here plus the
        step events parsed from the pod logs.
    '''
    from dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
    from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

    import kubernetes

    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(environment_dict, 'environment_dict')
    check.str_param(mode, 'mode')
    check.str_param(pipeline_name, 'pipeline_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_keys_str = ", ".join(step_keys)
    # validated above that step_keys is length 1, and it is not possible to use ETH or
    # execution plan in this function (Celery K8s workers should not access to user code)
    only_step_key = step_keys[0]

    # Ensure we stay below k8s name length limits
    k8s_name_key = _get_k8s_name_key(run_id, step_keys)
    job_name = 'dagster-stepjob-%s' % k8s_name_key
    pod_name = 'dagster-stepjob-%s' % k8s_name_key

    variables = construct_variables(mode, environment_dict, pipeline_name, run_id, step_keys)
    args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]

    job = construct_dagster_graphql_k8s_job(job_config, args, job_name, pod_name)
    created_job_name = job.metadata.name

    # Post event for starting execution
    start_metadata = [
        EventMetadataEntry.text(step_keys_str, 'Step keys'),
        EventMetadataEntry.text(created_job_name, 'Kubernetes Job name'),
        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
        EventMetadataEntry.text(job_config.job_image, 'Job image'),
        EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
        EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
        EventMetadataEntry.text(str(job_config.service_account_name), 'Service account name'),
    ]
    start_event = instance.report_engine_event(
        'Executing steps {} in Kubernetes job {}'.format(step_keys_str, created_job_name),
        pipeline_run,
        EngineEventData(start_metadata, marker_end=DELEGATE_MARKER),
        CeleryK8sJobEngine,
        step_key=only_step_key,
    )

    # Running list of events generated from this task execution
    events = [start_event]

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

    wait_for_job_success(created_job_name, namespace=job_namespace)
    pod_names = get_pod_names_in_job(created_job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    events.append(
        instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=only_step_key,
        )
    )

    # Concatenate the log lines of every pod in the job before parsing them.
    logs = []
    for current_pod in pod_names:
        logs.extend(retrieve_pod_logs(current_pod, namespace=job_namespace).split('\n'))

    res = parse_raw_log_lines(logs)
    handle_execution_errors(res, 'executePlan')
    events.extend(handle_execute_plan_result(res))

    return [serialize_dagster_namedtuple(event) for event in events]
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.

    Validates the arguments, creates a Kubernetes job that runs
    ``dagster api execute_step_with_structured_logs`` for the single step,
    waits for the job, and collects the events found in the pod logs.

    Args:
        instance_ref_dict: Dict form of an ``InstanceRef``.
        step_keys: List containing exactly one step key.
        run_config: Run config dict forwarded to the step.
        mode: Pipeline mode name.
        repo_name: Repository name (validated only).
        repo_location_name: Repository location name (validated only).
        run_id: Id of the run to load from the instance.
        job_config_dict: Dict form of a ``DagsterK8sJobConfig``.
        job_namespace: Namespace in which the job is created.
        load_incluster_config: Load the in-cluster Kubernetes config when True,
            otherwise load ``kubeconfig_file``.
        retries_dict: Config dict passed to ``Retries.from_config``.
        pipeline_origin_packed: Packed dict that unpacks to a ``PipelineOrigin``.
        resources: Optional dict of str to dict, forwarded to
            ``construct_dagster_k8s_job``.
        kubeconfig_file: Optional kubeconfig path.

    Returns:
        A list of serialized events, or ``None`` when the step is not
        scheduled, the job could not be created, or the run left the STARTED
        state while the job was being awaited.
    '''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1,
        'Celery K8s task executor can only execute 1 step at a time')
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed,
            'pipeline_origin_packed')  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)

    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))
    step_key = step_keys[0]

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, 'Step keys'),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    # Retried attempts get the attempt number appended to the job and pod names.
    attempt_number = retries.get_attempt_count(step_key)
    if attempt_number:
        job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        job_name = 'dagster-job-%s' % (k8s_name_key)
        pod_name = 'dagster-job-%s' % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        ))
    command = ['dagster']
    args = ['api', 'execute_step_with_structured_logs', input_json]

    job = construct_dagster_k8s_job(job_config, command, args, job_name,
                                    resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(
            step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy,
                                        'Image pull policy'),
                EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                        'Image pull secrets'),
                EventMetadataEntry.text(
                    str(job_config.service_account_name),
                    'Service account name'),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == 'Conflict':
            # There is an existing job with the same name so do not proceed.
            instance.report_engine_event(
                'Did not create Kubernetes job {} for step {} since job name already '
                'exists, exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                'exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    # Other non-string values in this function are wrapped in
                    # str(...) before becoming text entries; do the same for
                    # the exception instead of passing the object itself.
                    EventMetadataEntry.text(str(e), 'Error'),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(job_namespace,
                                        'Kubernetes Job namespace'),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData(
            [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    # Gather the log lines of every pod; the step's events are filtered out of them.
    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [
        serialize_dagster_namedtuple(event) for event in events
    ]
    return serialized_events
def _execute_step_docker(
    self,
    execute_step_args_packed,
    docker_config,
):
    """Run step execution in a Docker container.

    Args:
        self: Task object; only ``self.request.hostname`` is read, to label the
            start event with the Celery worker.
        execute_step_args_packed (dict): Packed ``ExecuteStepArgs``; unpacked
            with ``unpack_value`` and type-checked before use.
        docker_config (dict): Container settings. Keys read here are ``image``,
            ``registry`` (with ``url``, ``username``, ``password``),
            ``env_vars`` and ``network``.

    Returns:
        list: The serialized start-of-execution engine event followed by each
        non-empty line of the container's decoded output.

    Raises:
        Exception: If neither ``docker_config`` nor the repository origin names
            an image, or if the container run returned no output.
        docker.errors.ContainerError: Re-raised after a failure engine event
            has been reported.
    """
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(docker_config, "docker_config")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    command = "dagster api execute_step {}".format(json.dumps(input_json))

    # An image set in docker_config wins; otherwise fall back to the image
    # recorded on the pipeline's repository origin.
    docker_image = (
        docker_config["image"]
        if docker_config.get("image")
        else execute_step_args.pipeline_origin.repository_origin.container_image
    )
    if not docker_image:
        raise Exception(
            "No docker image specified by either the job or the repository"
        )

    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(
            step_keys_str, docker_image
        ),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry.text(step_keys_str, "Step keys"),
                MetadataEntry.text(docker_image, "Image"),
                MetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=execute_step_args.step_keys_to_execute[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    # Only the variables named in docker_config["env_vars"] are forwarded, with
    # the values they have in this worker's environment.
    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {
            env_name: os.getenv(env_name)
            for env_name in docker_config["env_vars"]
        }

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )
    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(
                [
                    MetadataEntry.text(docker_image, "Job image"),
                    MetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )
        raise

    # The missing-response check has to happen before decoding: decode() never
    # returns None, so testing its result could not detect an absent response
    # and a None response would surface as an AttributeError instead.
    if docker_response is None:
        raise Exception("No response from execute_step in CeleryDockerExecutor")

    res = docker_response.decode("utf-8")
    serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    resources=None,
    kubeconfig_file=None,
):
    '''Execute one step by launching a Kubernetes job and collecting its events.

    The job is constructed with arguments selecting the ``executePlan``
    operation; after the job completes, the logs of its pods are parsed back
    into step events.

    Args:
        _self: Unused.
        instance_ref_dict (dict): Dict form of the instance ref.
        step_keys (List[str]): Must contain exactly one step key.
        run_config (dict): Passed through as ``runConfigData``.
        mode (str): Passed through as ``mode``.
        repo_name (str): Repository name for the pipeline selector.
        repo_location_name (str): Repository location name for the selector.
        run_id (str): Id of the run to load from the instance.
        job_config_dict (dict): Dict form of a ``DagsterK8sJobConfig``.
        job_namespace (str): Namespace the job is created in.
        load_incluster_config (bool): Selects in-cluster configuration over a
            kubeconfig file.
        retries_dict (dict): Config used to build the ``Retries`` object.
        resources (Optional[Dict[str, dict]]): Forwarded to job construction.
        kubeconfig_file (Optional[str]): Used when ``load_incluster_config`` is
            false.

    Returns:
        A list of serialized events (the engine events posted here followed by
        the step events recovered from the pod logs), or ``None`` when the run
        is not STARTED or ``wait_for_job_success`` raises
        ``DagsterK8sPipelineStatusException``.
    '''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1,
        'Celery K8s task executor can only execute 1 step at a time',
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Rebuild the job config object from the dict form it was passed in as.
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')
    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_key = step_keys[0]

    # Guard clause: nothing is scheduled unless the run is in the STARTED state.
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    name_key = _get_k8s_name_key(run_id, step_keys)

    retries = Retries.from_config(retries_dict)
    if retries.get_attempt_count(step_key):
        # A non-zero attempt count is appended to the resource name.
        resource_name = 'dagster-job-%s-%d' % (
            name_key,
            retries.get_attempt_count(step_key),
        )
    else:
        resource_name = 'dagster-job-%s' % (name_key)
    # The job and its pod share the same name.
    job_name = pod_name = resource_name

    selector = {
        'repositoryLocationName': repo_location_name,
        'repositoryName': repo_name,
        'pipelineName': pipeline_run.pipeline_name,
        'solidSelection': list(pipeline_run.solids_to_execute)
        if pipeline_run.solids_to_execute
        else None,
    }
    graphql_variables = {
        'executionParams': {
            'runConfigData': run_config,
            'mode': mode,
            'selector': selector,
            'executionMetadata': {'runId': run_id},
            'stepKeys': step_keys,
        },
        'retries': retries.to_graphql_input(),
    }
    job_args = [
        '-p',
        'executePlan',
        '-v',
        seven.json.dumps(graphql_variables),
        '--remap-sigterm',
    ]

    job = construct_dagster_graphql_k8s_job(
        job_config, job_args, job_name, resources, pod_name
    )

    # From here on, use the name carried by the constructed job's metadata.
    job_name = job.metadata.name

    # Post event for starting execution
    start_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(
                    job_config.image_pull_policy, 'Image pull policy'
                ),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), 'Image pull secrets'
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )

    # Running list of events generated from this task execution
    task_events = [start_event]

    kubernetes.client.BatchV1Api().create_namespaced_job(
        body=job, namespace=job_namespace
    )

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(
                        job_namespace, 'Kubernetes Job namespace'
                    ),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    task_events.append(
        instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
    )

    # Gather the log lines of every pod in the job into one flat list.
    log_lines = []
    for name in pod_names:
        log_lines.extend(
            retrieve_pod_logs(name, namespace=job_namespace).split('\n')
        )

    res = parse_raw_log_lines(log_lines)
    handle_execution_errors(res, 'executePlan')
    task_events.extend(handle_execute_plan_result(res))

    return [serialize_dagster_namedtuple(event) for event in task_events]
def test_output_manager_with_retries():
    """Check step stats when object managers request retries in handle_output.

    ``source_solid`` writes through ``should_succeed``, whose ``handle_output``
    requests a retry on its first two calls and then returns normally.
    ``take_input`` writes through ``should_retry``, whose ``handle_output``
    always requests a retry. The test expects ``source_solid`` to succeed after
    3 attempts, ``take_input`` to fail after 4 attempts, and the downstream
    ``should_not_execute`` solid never to run.
    """
    _called = False
    _count = {"total": 0}

    @object_manager
    def should_succeed(_):
        class FakeObjectManager(ObjectManager):
            def load_input(self, _context):
                return "foo"

            def handle_output(self, _context, _obj):
                if _count["total"] < 2:
                    _count["total"] += 1
                    raise RetryRequested(max_retries=3)

        return FakeObjectManager()

    @object_manager
    def should_retry(_):
        class FakeObjectManager(ObjectManager):
            def load_input(self, _context):
                return "foo"

            def handle_output(self, _context, _obj):
                raise RetryRequested(max_retries=3)

        return FakeObjectManager()

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "should_succeed": should_succeed,
                    "should_retry": should_retry,
                }
            )
        ]
    )
    def simple():
        @solid(
            output_defs=[OutputDefinition(manager_key="should_succeed")],
        )
        def source_solid(_):
            return "foo"

        @solid(
            input_defs=[InputDefinition("solid_input")],
            output_defs=[OutputDefinition(manager_key="should_retry")],
        )
        def take_input(_, solid_input):
            return solid_input

        @solid(input_defs=[InputDefinition("_solid_input")])
        def should_not_execute(_, _solid_input):
            # Without ``nonlocal`` this assignment would create a new local and
            # the final assertion on ``_called`` could never fail.
            nonlocal _called
            _called = True

        should_not_execute(take_input(source_solid()))

    with tempfile.TemporaryDirectory() as tmpdir_path:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 2

        step_stats_1 = instance.get_run_step_stats(
            result.run_id, step_keys=["source_solid"]
        )
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(
            result.run_id, step_keys=["take_input"]
        )
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(
            result.run_id, step_keys=["should_not_execute"]
        )
        assert len(step_stats_3) == 0
        assert not _called
def test_input_manager_with_retries():
    """Check step stats when root input managers request retries.

    ``take_input_1`` loads through a manager that requests a retry twice and
    then yields ``"foo"``; ``take_input_2`` loads through a manager that always
    requests a retry. Expected outcome: ``take_input_1`` succeeds after 3
    attempts, ``take_input_2`` fails after 4, and ``take_input_3`` records no
    step stats.
    """
    retry_counter = {"total": 0}

    @root_input_manager
    def should_succeed_after_retries(_):
        # Once two retries have been requested, hand back the value.
        if retry_counter["total"] >= 2:
            return "foo"
        retry_counter["total"] += 1
        raise RetryRequested(max_retries=3)

    @root_input_manager
    def should_retry(_):
        raise RetryRequested(max_retries=3)

    @solid(
        input_defs=[
            InputDefinition(
                "solid_input", root_manager_key="should_succeed_after_retries"
            )
        ]
    )
    def take_input_1(_, solid_input):
        return solid_input

    @solid(
        input_defs=[InputDefinition("solid_input", root_manager_key="should_retry")]
    )
    def take_input_2(_, solid_input):
        return solid_input

    @solid
    def take_input_3(_, _input1, _input2):
        assert False, "should not be called"

    test_mode = ModeDefinition(
        resource_defs={
            "should_succeed_after_retries": should_succeed_after_retries,
            "should_retry": should_retry,
        }
    )

    @pipeline(mode_defs=[test_mode])
    def simple():
        take_input_3(take_input_2(), take_input_1())

    with tempfile.TemporaryDirectory() as scratch_dir:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(scratch_dir))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        assert len(instance.get_run_step_stats(result.run_id)) == 2

        # (step key, expected final status, expected attempt count)
        expectations = (
            ("take_input_1", "SUCCESS", 3),
            ("take_input_2", "FAILURE", 4),
        )
        for key, expected_status, expected_attempts in expectations:
            stats = instance.get_run_step_stats(result.run_id, step_keys=[key])
            assert len(stats) == 1
            only_stat = stats[0]
            assert only_stat.status.value == expected_status
            assert only_stat.attempts == expected_attempts

        downstream_stats = instance.get_run_step_stats(
            result.run_id, step_keys=["take_input_3"]
        )
        assert len(downstream_stats) == 0
def _start_pipeline_execution(self, job_args):
    """Rebuild the pipeline and instance from ``job_args`` and run via the delegate.

    Args:
        job_args: Mapping providing the keys ``'pipeline_dict'``,
            ``'pipeline_run'`` and ``'instance_ref'``.
    """
    serialized_pipeline, run = job_args['pipeline_dict'], job_args['pipeline_run']
    executable = InterProcessExecutablePipeline.from_dict(serialized_pipeline)
    dagster_instance = DagsterInstance.from_ref(job_args['instance_ref'])
    # The delegate's result is not returned to the caller.
    self._delegate.execute_pipeline(executable, run, dagster_instance)
def _execute_step_docker(
    self,
    execute_step_args_packed,
    docker_config,
):
    """Run step execution in a Docker container.

    Args:
        self: Task object; only ``self.request.hostname`` is read, to label the
            start event with the Celery worker.
        execute_step_args_packed (dict): Packed ``ExecuteStepArgs``; unpacked
            with ``unpack_value`` and type-checked before use.
        docker_config (dict): Container settings. Keys read here are ``image``,
            ``registry`` (with ``url``, ``username``, ``password``),
            ``env_vars``, ``network`` and ``container_kwargs`` (extra keyword
            arguments forwarded to ``client.containers.run``).

    Returns:
        list: The serialized start-of-execution engine event followed by each
        non-empty line of the container's decoded output.

    Raises:
        Exception: If neither ``docker_config`` nor the repository origin names
            an image, or if the container run returned no output.
        docker.errors.ContainerError: Re-raised after a failure engine event
            has been reported.
    """
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(docker_config, "docker_config")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    command = "dagster api execute_step {}".format(json.dumps(input_json))

    # An image set in docker_config wins; otherwise fall back to the image
    # recorded on the pipeline's repository origin.
    docker_image = (
        docker_config["image"]
        if docker_config.get("image")
        else execute_step_args.pipeline_origin.repository_origin.container_image
    )
    if not docker_image:
        raise Exception(
            "No docker image specified by either the job or the repository"
        )

    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(
            step_keys_str, docker_image
        ),
        pipeline_run,
        EngineEventData(
            [
                MetadataEntry("Step keys", value=step_keys_str),
                MetadataEntry("Image", value=docker_image),
                MetadataEntry("Celery worker", value=self.request.hostname),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=execute_step_args.step_keys_to_execute[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    # Only the variables named in docker_config["env_vars"] are forwarded, with
    # the values they have in this worker's environment.
    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {
            env_name: os.getenv(env_name)
            for env_name in docker_config["env_vars"]
        }

    container_kwargs = check.opt_dict_param(
        docker_config.get("container_kwargs"), "container_kwargs", key_type=str
    )

    # set defaults for detach and auto_remove
    container_kwargs.setdefault("detach", False)
    container_kwargs.setdefault("auto_remove", True)

    # if environment variables are provided via container_kwargs, merge with env_vars
    if container_kwargs.get("environment") is not None:
        e_vars = container_kwargs.get("environment")
        if isinstance(e_vars, dict):
            docker_env.update(e_vars)
        else:
            for v in e_vars:
                # Split on the first "=" only: a value may itself contain "=",
                # and an unbounded split would then fail to unpack.
                key, val = v.split("=", 1)
                docker_env[key] = val
        # "environment" is passed explicitly below, so it must not also be
        # present in the forwarded keyword arguments.
        del container_kwargs["environment"]

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
            **container_kwargs,
        )
    except docker.errors.ContainerError as err:
        entries = [MetadataEntry("Job image", value=docker_image)]
        if err.stderr is not None:
            entries.append(MetadataEntry("Docker stderr", value=err.stderr))

        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(entries),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )
        raise

    # The missing-response check has to happen before decoding: decode() never
    # returns None, so testing its result could not detect an absent response
    # and a None response would surface as an AttributeError instead.
    if docker_response is None:
        raise Exception("No response from execute_step in CeleryDockerExecutor")

    res = docker_response.decode("utf-8")
    serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
def _execute_step_docker(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    docker_config,
):
    """Execute steps in a Docker container through ``dagster-graphql``.

    The container runs the ``executePlan`` operation; its output is parsed as
    JSON and converted into step events.

    Args:
        _self: Unused.
        instance_ref_dict: Dict form of the instance ref.
        step_keys: Step keys to execute; the first one tags the engine events.
        run_config: Passed through as ``runConfigData``.
        mode: Passed through as ``mode``.
        repo_name: Repository name for the pipeline selector.
        repo_location_name: Repository location name for the selector.
        run_id: Id of the run to load from the instance.
        docker_config: Container settings. Keys read here are ``image``,
            ``registry`` (with ``url``, ``username``, ``password``) and
            ``env_vars``.

    Returns:
        list: Serialized events, the start engine event first and then the step
        events from the container's response.

    Raises:
        docker.errors.ContainerError: Re-raised after a failure engine event
            has been reported.
        JSONDecodeError: Re-raised after an engine event carrying the raw
            response has been reported.
    """
    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    step_keys_str = ", ".join(step_keys)
    # All engine events posted here are attributed to the first step key.
    first_step_key = step_keys[0]

    selector = {
        "repositoryLocationName": repo_location_name,
        "repositoryName": repo_name,
        "pipelineName": pipeline_run.pipeline_name,
        "solidSelection": list(pipeline_run.solids_to_execute)
        if pipeline_run.solids_to_execute
        else None,
    }
    variables = {
        "executionParams": {
            "runConfigData": run_config,
            "mode": mode,
            "selector": selector,
            "executionMetadata": {"runId": run_id},
            "stepKeys": step_keys,
        }
    }

    command = "dagster-graphql -v '{variables}' -p executePlan".format(
        variables=seven.json.dumps(variables)
    )
    docker_image = docker_config["image"]
    client = docker.client.from_env()

    registry = docker_config.get("registry")
    if registry:
        client.login(
            registry=registry["url"],
            username=registry["username"],
            password=registry["password"],
        )

    # Post event for starting execution
    start_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(
            step_keys_str, docker_image
        ),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=first_step_key,
    )
    collected_events = [start_event]

    # Forward only the variables named in env_vars, with this worker's values.
    docker_env = {
        name: os.getenv(name) for name in docker_config.get("env_vars") or ()
    }

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
        )
        res = seven.json.loads(docker_response)
    except docker.errors.ContainerError as err:
        failure_metadata = [
            EventMetadataEntry.text(docker_image, "Job image"),
            EventMetadataEntry.text(err.stderr, "Docker stderr"),
        ]
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(failure_metadata),
            CeleryDockerExecutor,
            step_key=first_step_key,
        )
        raise
    except JSONDecodeError:
        parse_metadata = [
            EventMetadataEntry.text(docker_image, "Job image"),
            EventMetadataEntry.text(docker_response, "Docker Response"),
        ]
        instance.report_engine_event(
            "Failed to parse response for steps {} from Docker container {}".format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(parse_metadata),
            CeleryDockerExecutor,
            step_key=first_step_key,
        )
        raise
    else:
        handle_execution_errors(res, "executePlan")
        collected_events.extend(handle_execute_plan_result(res))

    return [serialize_dagster_namedtuple(event) for event in collected_events]
def _execute_step_docker(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    docker_config,
):
    '''Execute steps in a Docker container through ``dagster-graphql``.

    The container runs the ``executePlan`` operation; its output is parsed as
    JSON and converted into step events.

    Args:
        _self: Unused.
        instance_ref_dict: Dict form of the instance ref.
        step_keys: Step keys to execute; the first one tags the engine events.
        run_config: Passed through as ``runConfigData``.
        mode: Passed through as ``mode``.
        repo_name: Repository name for the pipeline selector.
        repo_location_name: Repository location name for the selector.
        run_id: Id of the run to load from the instance.
        docker_config: Container settings. Keys read here are ``image``,
            ``registry`` (with ``url``, ``username``, ``password``) and
            ``env_vars``.

    Returns:
        list: Serialized events, the start engine event first and then the step
        events from the container's response.

    Raises:
        docker.errors.ContainerError: Re-raised after a failure engine event
            has been reported.
        JSONDecodeError: Re-raised after an engine event carrying the raw
            response has been reported.
    '''
    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_keys_str = ", ".join(step_keys)
    # All engine events posted here are attributed to the first step key.
    first_step_key = step_keys[0]

    execution_params = {
        'runConfigData': run_config,
        'mode': mode,
        'selector': {
            'repositoryLocationName': repo_location_name,
            'repositoryName': repo_name,
            'pipelineName': pipeline_run.pipeline_name,
            'solidSelection': pipeline_run.solid_selection,
        },
        'executionMetadata': {'runId': run_id},
        'stepKeys': step_keys,
    }
    variables = {'executionParams': execution_params}

    command = 'dagster-graphql -v \'{variables}\' -p executePlan'.format(
        variables=seven.json.dumps(variables)
    )
    docker_image = docker_config['image']
    client = docker.client.from_env()

    registry = docker_config.get('registry')
    if registry:
        client.login(
            registry=registry['url'],
            username=registry['username'],
            password=registry['password'],
        )

    # Post event for starting execution
    start_event = instance.report_engine_event(
        'Executing steps {} in Docker container {}'.format(
            step_keys_str, docker_image
        ),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, 'Step keys'),
                EventMetadataEntry.text(docker_image, 'Image'),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=first_step_key,
    )
    collected_events = [start_event]

    # Forward only the variables named in env_vars, with this worker's values.
    docker_env = {
        name: os.getenv(name) for name in docker_config.get('env_vars') or ()
    }

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
        )
        res = seven.json.loads(docker_response)
    except docker.errors.ContainerError as err:
        failure_metadata = [
            EventMetadataEntry.text(docker_image, 'Job image'),
            EventMetadataEntry.text(err.stderr, 'Docker stderr'),
        ]
        instance.report_engine_event(
            'Failed to run steps {} in Docker container {}'.format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(failure_metadata),
            CeleryDockerExecutor,
            step_key=first_step_key,
        )
        raise
    except JSONDecodeError:
        parse_metadata = [
            EventMetadataEntry.text(docker_image, 'Job image'),
            EventMetadataEntry.text(docker_response, 'Docker Response'),
        ]
        instance.report_engine_event(
            'Failed to parse response for steps {} from Docker container {}'.format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(parse_metadata),
            CeleryDockerExecutor,
            step_key=first_step_key,
        )
        raise
    else:
        handle_execution_errors(res, 'executePlan')
        collected_events.extend(handle_execute_plan_result(res))

    return [serialize_dagster_namedtuple(event) for event in collected_events]
def _execute_step_k8s_job(
    self,
    execute_step_args_packed,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Execute one step in a Kubernetes job and return its serialized events.

    Args:
        self: Task object; only ``self.request.hostname`` is read, to record the
            Celery worker name.
        execute_step_args_packed (dict): Packed ``ExecuteStepArgs``; must name
            exactly one step key.
        job_config_dict: Dict form of a ``DagsterK8sJobConfig``.
        job_namespace (str): Namespace used for the job and its pods.
        load_incluster_config (bool): Selects in-cluster configuration over a
            kubeconfig file.
        user_defined_k8s_config_dict: Dict form of a
            ``UserDefinedDagsterK8sConfig``.
        kubeconfig_file (Optional[str]): Used when ``load_incluster_config`` is
            false.

    Returns:
        list: Serialized events (engine events posted here, a step failure
        event if waiting on the job failed with a ``DagsterK8sError`` or
        ``DagsterK8sTimeoutError``, and the events filtered from the pod logs).
        An empty list is returned when the run is not STARTED or when an
        error handled below aborts the task.
    """
    execute_step_args = unpack_value(
        check.dict_param(execute_step_args_packed, "execute_step_args_packed")
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.invariant(
        len(execute_step_args.step_keys_to_execute) == 1,
        "Celery K8s task executor can only execute 1 step at a time",
    )

    # Rebuild the config objects from the dict forms they were passed in as.
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")
    check.bool_param(load_incluster_config, "load_incluster_config")

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    run_id = execute_step_args.pipeline_run_id
    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(run_id),
    )
    step_key = execute_step_args.step_keys_to_execute[0]

    def report_unexpected_error(message):
        # Only call this from inside an ``except`` block: the exception being
        # handled is read through sys.exc_info() and attached to the event.
        instance.report_engine_event(
            message,
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text(step_key, "Step key")],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(
                    celery_pod_name, "Celery worker Kubernetes Pod name"
                ),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    # Guard clause: nothing is scheduled unless the run is in the STARTED state.
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, "Step key")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    name_key = get_k8s_job_name(run_id, step_key)

    retry_state = execute_step_args.known_state.get_retry_state()
    if retry_state.get_attempt_count(step_key):
        # A non-zero attempt count is appended to the resource name.
        resource_name = "dagster-job-%s-%d" % (
            name_key,
            retry_state.get_attempt_count(step_key),
        )
    else:
        resource_name = "dagster-job-%s" % (name_key)
    # The job and its pod share the same name.
    job_name = pod_name = resource_name

    job_args = [
        "dagster",
        "api",
        "execute_step",
        serialize_dagster_namedtuple(execute_step_args),
    ]
    job = construct_dagster_k8s_job(
        job_config, job_args, job_name, user_defined_k8s_config, pod_name
    )

    # From here on, use the name carried by the constructed job's metadata.
    job_name = job.metadata.name

    # Post event for starting execution
    start_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(
                    job_config.image_pull_policy, "Image pull policy"
                ),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), "Image pull secrets"
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )

    # Running list of events generated from this task execution
    task_events = [start_event]

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace
        )
    except kubernetes.client.rest.ApiException as api_error:
        if api_error.reason != "Conflict":
            report_unexpected_error(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key)
            )
            return []

        # There is an existing job with the same name so proceed and see if the existing job succeeded
        instance.report_engine_event(
            "Did not create Kubernetes job {} for step {} since job name already "
            "exists, proceeding with existing job.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except (DagsterK8sError, DagsterK8sTimeoutError) as err:
        # Record a step failure but keep going so the pod logs are still read.
        task_events.append(
            construct_step_failure_event_and_handle(
                pipeline_run, step_key, err, instance=instance
            )
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(
                        job_namespace, "Kubernetes Job namespace"
                    ),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except (
        DagsterK8sUnrecoverableAPIError,
        DagsterK8sAPIRetryLimitExceeded,
        # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
        # a retry boundary. We still catch it here just in case we missed one so that we can
        # report it to the event log
        kubernetes.client.rest.ApiException,
    ):
        report_unexpected_error(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key)
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException:
        report_unexpected_error(
            "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key)
        )
        return []

    # Post engine event for log retrieval
    task_events.append(
        instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
    )

    # Gather log lines pod by pod; a pod whose logs cannot be fetched is
    # reported and skipped so the remaining pods are still read.
    log_lines = []
    for name in pod_names:
        try:
            log_lines.extend(
                retrieve_pod_logs(name, namespace=job_namespace).split("\n")
            )
        except kubernetes.client.rest.ApiException:
            report_unexpected_error(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                    job_name, name, step_key
                )
            )

    task_events.extend(filter_dagster_events_from_pod_logs(log_lines))

    return [serialize_dagster_namedtuple(event) for event in task_events]
def _execute_step_docker(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    run_id,
    docker_config,
    pipeline_origin_packed,
    retries_dict,
):
    """Run step execution in a Docker container.

    Args:
        self: Task object; only ``self.request.hostname`` is read, to label the
            start event with the Celery worker.
        instance_ref_dict (dict): Dict form of the instance ref.
        step_keys (List[str]): Step keys to execute; the first one tags the
            engine events.
        run_config (dict): Run config forwarded in the ``ExecuteStepArgs``.
        mode (str): Mode forwarded in the ``ExecuteStepArgs``.
        repo_name (str): Type-checked only; not otherwise used here.
        run_id (str): Id of the run to load from the instance.
        docker_config (dict): Container settings. Keys read here are ``image``,
            ``registry`` (with ``url``, ``username``, ``password``),
            ``env_vars`` and ``network``.
        pipeline_origin_packed (dict): Packed pipeline origin; unpacked with
            ``unpack_value``.
        retries_dict (dict): Retries config forwarded in the
            ``ExecuteStepArgs``.

    Returns:
        list: The serialized start-of-execution engine event followed by each
        non-empty line of the container's decoded output.

    Raises:
        Exception: If the container run returned no output.
        docker.errors.ContainerError: Re-raised after a failure engine event
            has been reported.
    """
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(run_id, "run_id")
    check.dict_param(docker_config, "docker_config")
    pipeline_origin = unpack_value(
        check.dict_param(pipeline_origin_packed, "pipeline_origin_packed")
    )
    check.dict_param(retries_dict, "retries_dict")

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    step_keys_str = ", ".join(step_keys)

    # The step arguments are serialized and handed to the container on its
    # command line.
    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=instance_ref,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )

    command = "dagster api execute_step_with_structured_logs {}".format(
        json.dumps(input_json)
    )

    docker_image = docker_config["image"]
    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(
            step_keys_str, docker_image
        ),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=step_keys[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    # Only the variables named in docker_config["env_vars"] are forwarded, with
    # the values they have in this worker's environment.
    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {
            env_name: os.getenv(env_name)
            for env_name in docker_config["env_vars"]
        }

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )
    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise

    # The missing-response check has to happen before decoding: decode() never
    # returns None, so testing its result could not detect an absent response
    # and a None response would surface as an AttributeError instead.
    if docker_response is None:
        raise Exception(
            "No response from execute_step_with_structured_logs in CeleryDockerExecutor"
        )

    res = docker_response.decode("utf-8")
    serialized_events += [event for event in res.split("\n") if event]

    return serialized_events