def write_unary_input(input_file, obj):
    check.str_param(input_file, 'input_file')
    check.not_none_param(obj, 'obj')
    with open(os.path.abspath(input_file), 'w') as fp:
        fp.write(serialize_dagster_namedtuple(obj))
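# Hedged companion sketch (not from the source): the inverse of write_unary_input,
# reading back the single serialized tuple. `read_unary_input` is a hypothetical
# name for illustration; it assumes the same serdes helpers are importable.
def read_unary_input(input_file):
    check.str_param(input_file, 'input_file')
    with open(os.path.abspath(input_file), 'r') as fp:
        # Round-trips whatever write_unary_input produced.
        return deserialize_json_to_dagster_namedtuple(fp.read())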
def _send(file_path, obj):
    with open(os.path.abspath(file_path), 'a+') as fp:
        fp.write(serialize_dagster_namedtuple(obj) + '\n')
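# Hedged sketch of the matching read side: _send appends one serialized tuple
# per line, so a reader can deserialize line by line. `_read_messages` is a
# hypothetical helper shown only for illustration.
def _read_messages(file_path):
    with open(os.path.abspath(file_path), 'r') as fp:
        for line in fp:
            if line.strip():
                yield deserialize_json_to_dagster_namedtuple(line.strip())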
def _execute_plan(self, execute_step_args_packed, executable_dict):
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)

    check.dict_param(executable_dict, "executable_dict")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)

    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retry_mode = execute_step_args.retry_mode

    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)
    check.invariant(
        pipeline_run,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )

    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=execute_step_args.step_keys_to_execute,
        known_state=execute_step_args.known_state,
    )

    engine_event = instance.report_engine_event(
        "Executing steps {} in celery worker".format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "step_keys"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryExecutor,
        step_key=execution_plan.step_handle_for_single_step_plans().to_key(),
    )

    events = [engine_event]
    for step_event in execute_plan_iterator(
        execution_plan,
        pipeline=pipeline,
        pipeline_run=pipeline_run,
        run_config=pipeline_run.run_config,
        instance=instance,
        retry_mode=retry_mode,
    ):
        events.append(step_event)

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def serialize_rt(value):
    return deserialize_json_to_dagster_namedtuple(serialize_dagster_namedtuple(value))
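# Usage sketch for serialize_rt: any serdes-registered namedtuple should
# survive the JSON round trip unchanged. `some_serdes_value` below is a
# hypothetical stand-in for such a value, e.g.:
#
#     value = some_serdes_value()
#     assert serialize_rt(value) == value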
def GetCurrentImage(self, request, _context):
    return api_pb2.GetCurrentImageReply(
        serialized_current_image=serialize_dagster_namedtuple(
            GetCurrentImageResult(
                current_image=_get_current_image(), serializable_error_info=None
            )
        )
    )
def _execute_step_k8s_job(
    self,
    execute_step_args_packed,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod."""
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.invariant(
        len(execute_step_args.step_keys_to_execute) == 1,
        "Celery K8s task executor can only execute 1 step at a time",
    )

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")

    check.bool_param(load_incluster_config, "load_incluster_config")

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)

    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_key = execute_step_args.step_keys_to_execute[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)

    retries = Retries.from_config(execute_step_args.retries_dict)

    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    args = ["dagster", "api", "execute_step", input_json]

    job = construct_dagster_k8s_job(job_config, args, job_name, user_defined_k8s_config, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so proceed and see if the existing
            # job succeeded
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, proceeding with existing job.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=execute_step_args.pipeline_run_id,
        )
    except (DagsterK8sError, DagsterK8sTimeoutError) as err:
        step_failure_event = construct_step_failure_event_and_handle(
            pipeline_run, step_key, err, instance=instance
        )
        events.append(step_failure_event)
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except (
        DagsterK8sUnrecoverableAPIError,
        DagsterK8sAPIRetryLimitExceeded,
        # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
        # a retry boundary. We still catch it here just in case we missed one so that we can
        # report it to the event log
        kubernetes.client.rest.ApiException,
    ):
        instance.report_engine_event(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException:
        instance.report_engine_event(
            "Encountered unexpected error retrieving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        try:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")
        except kubernetes.client.rest.ApiException:
            instance.report_engine_event(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                    job_name, pod_name, step_key
                ),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def launch_run(self, context: LaunchRunContext) -> None:
    run = context.pipeline_run

    job_name = get_job_name_from_run_id(run.run_id)
    pod_name = job_name
    exc_config = _get_validated_celery_k8s_executor_config(run.run_config)
    env_vars = None

    job_image_from_executor_config = exc_config.get("job_image")

    pipeline_origin = context.pipeline_code_origin
    repository_origin = pipeline_origin.repository_origin

    job_image = repository_origin.container_image

    if job_image:
        if job_image_from_executor_config:
            # Report before overwriting job_image so the message shows both values
            self._instance.report_engine_event(
                f"You have specified a job_image {job_image_from_executor_config} in your executor "
                f"configuration, but also {job_image} in your user-code deployment. Using the job "
                f"image {job_image_from_executor_config} from executor configuration as it takes "
                "precedence.",
                run,
                cls=self.__class__,
            )
            job_image = job_image_from_executor_config
    else:
        if not job_image_from_executor_config:
            raise DagsterInvariantViolationError(
                "You have not specified a job_image in your executor configuration. "
                "To resolve this error, specify the job_image configuration in the executor "
                "config section in your run config. \n"
                "Note: You may also be seeing this error because you are using the configured API. "
                "Using configured with the celery-k8s executor is not supported at this time, "
                "and the job_image must be configured at the top-level executor config without "
                "using configured."
            )

        job_image = job_image_from_executor_config

    job_config = DagsterK8sJobConfig(
        dagster_home=self.dagster_home,
        instance_config_map=self.instance_config_map,
        postgres_password_secret=self.postgres_password_secret,
        job_image=check.str_param(job_image, "job_image"),
        image_pull_policy=exc_config.get("image_pull_policy"),
        image_pull_secrets=exc_config.get("image_pull_secrets"),
        service_account_name=exc_config.get("service_account_name"),
        env_config_maps=exc_config.get("env_config_maps"),
        env_secrets=exc_config.get("env_secrets"),
    )

    self._instance.add_run_tags(
        run.run_id,
        {DOCKER_IMAGE_TAG: job_config.job_image},
    )

    user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))

    from dagster.cli.api import ExecuteRunArgs

    input_json = serialize_dagster_namedtuple(
        # depends on DagsterInstance.get() returning the same instance
        # https://github.com/dagster-io/dagster/issues/2757
        ExecuteRunArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run.run_id,
            instance_ref=None,
        )
    )

    job = construct_dagster_k8s_job(
        job_config,
        args=["dagster", "api", "execute_run", input_json],
        job_name=job_name,
        pod_name=pod_name,
        component="run_coordinator",
        user_defined_k8s_config=user_defined_k8s_config,
        env_vars=env_vars,
    )

    job_namespace = exc_config.get("job_namespace")

    self._instance.report_engine_event(
        "Creating Kubernetes run worker job",
        run,
        EngineEventData(
            [
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),
                EventMetadataEntry.text(run.run_id, "Run ID"),
            ]
        ),
        cls=self.__class__,
    )

    self._batch_api.create_namespaced_job(body=job, namespace=job_namespace)
    self._instance.report_engine_event(
        "Kubernetes run worker job created",
        run,
        EngineEventData(
            [
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),
                EventMetadataEntry.text(run.run_id, "Run ID"),
            ]
        ),
        cls=self.__class__,
    )
def test_execute_step_verify_step():
    with get_foo_pipeline_handle() as pipeline_handle:
        runner = CliRunner()

        with instance_for_test(
            overrides={
                "compute_logs": {
                    "module": "dagster.core.storage.noop_compute_log_manager",
                    "class": "NoOpComputeLogManager",
                }
            }
        ) as instance:
            run = create_run_for_test(
                instance,
                pipeline_name="foo",
                run_id="new_run",
                run_config={"storage": {"filesystem": {}}},
            )

            input_json = serialize_dagster_namedtuple(
                ExecuteStepArgs(
                    pipeline_origin=pipeline_handle.get_python_origin(),
                    pipeline_run_id=run.run_id,
                    step_keys_to_execute=None,
                    instance_ref=instance.get_ref(),
                )
            )

            # Check that verify succeeds for a step that hasn't been run (case 3)
            retries = Retries.from_config({"enabled": {}})
            assert verify_step(instance, run, retries, step_keys_to_execute=["do_something"])

            # Check that verify fails when trying to retry with no original attempt (case 3)
            retries = Retries.from_config({"enabled": {}})
            retries.mark_attempt("do_something")
            assert not verify_step(instance, run, retries, step_keys_to_execute=["do_something"])

            # Test that trying to re-run a retry fails verify_step (case 2)
            with mock.patch("dagster.cli.api.get_step_stats_by_key") as _step_stats_by_key:
                _step_stats_by_key.return_value = {
                    "do_something": RunStepKeyStatsSnapshot(
                        run_id=run.run_id, step_key="do_something", attempts=2
                    )
                }

                retries = Retries.from_config({"enabled": {}})
                retries.mark_attempt("do_something")
                assert not verify_step(
                    instance, run, retries, step_keys_to_execute=["do_something"]
                )

            runner_execute_step(
                runner,
                [input_json],
            )

            # Check that verify fails for a step that has already run (case 1)
            retries = Retries.from_config({"enabled": {}})
            assert not verify_step(instance, run, retries, step_keys_to_execute=["do_something"])
def send_to_buffer(event):
    buffer.append(serialize_dagster_namedtuple(event))
def mock_external_repository_data():
    external_repo_data = external_repository_data_from_def(noop_repo())
    return serialize_dagster_namedtuple(external_repo_data)
def launch_run(self, instance, run, external_pipeline):
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(run, "run", PipelineRun)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)

    job_name = get_job_name_from_run_id(run.run_id)
    pod_name = job_name
    exc_config = _get_validated_celery_k8s_executor_config(run.run_config)

    job_image = None
    pipeline_origin = None
    env_vars = None
    if isinstance(external_pipeline.get_origin(), PipelineGrpcServerOrigin):
        if exc_config.get("job_image"):
            raise DagsterInvariantViolationError(
                "Cannot specify job_image in executor config when loading pipeline "
                "from GRPC server."
            )

        repository_location_handle = (
            external_pipeline.repository_handle.repository_location_handle
        )

        if not isinstance(repository_location_handle, GrpcServerRepositoryLocationHandle):
            raise DagsterInvariantViolationError(
                "Expected RepositoryLocationHandle to be of type "
                "GrpcServerRepositoryLocationHandle but found type {}".format(
                    type(repository_location_handle)
                )
            )

        job_image = repository_location_handle.get_current_image()
        env_vars = {"DAGSTER_CURRENT_IMAGE": job_image}

        repository_name = external_pipeline.repository_handle.repository_name
        pipeline_origin = PipelinePythonOrigin(
            pipeline_name=external_pipeline.name,
            repository_origin=repository_location_handle.get_repository_python_origin(
                repository_name
            ),
        )
    else:
        job_image = exc_config.get("job_image")
        if not job_image:
            raise DagsterInvariantViolationError(
                "Cannot find job_image in celery-k8s executor config."
            )
        pipeline_origin = external_pipeline.get_origin()

    job_config = DagsterK8sJobConfig(
        dagster_home=self.dagster_home,
        instance_config_map=self.instance_config_map,
        postgres_password_secret=self.postgres_password_secret,
        job_image=check.str_param(job_image, "job_image"),
        image_pull_policy=exc_config.get("image_pull_policy"),
        image_pull_secrets=exc_config.get("image_pull_secrets"),
        service_account_name=exc_config.get("service_account_name"),
        env_config_maps=exc_config.get("env_config_maps"),
        env_secrets=exc_config.get("env_secrets"),
    )

    user_defined_k8s_config = get_user_defined_k8s_config(frozentags(external_pipeline.tags))

    from dagster.cli.api import ExecuteRunArgs

    input_json = serialize_dagster_namedtuple(
        # depends on DagsterInstance.get() returning the same instance
        # https://github.com/dagster-io/dagster/issues/2757
        ExecuteRunArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run.run_id,
            instance_ref=None,
        )
    )

    job = construct_dagster_k8s_job(
        job_config,
        command=["dagster"],
        args=["api", "execute_run_with_structured_logs", input_json],
        job_name=job_name,
        pod_name=pod_name,
        component="run_coordinator",
        user_defined_k8s_config=user_defined_k8s_config,
        env_vars=env_vars,
    )

    job_namespace = exc_config.get("job_namespace")

    api = kubernetes.client.BatchV1Api()
    api.create_namespaced_job(body=job, namespace=job_namespace)

    self._instance.report_engine_event(
        "Kubernetes run_coordinator job launched",
        run,
        EngineEventData(
            [
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                EventMetadataEntry.text(job_namespace, "Kubernetes Namespace"),
                EventMetadataEntry.text(run.run_id, "Run ID"),
            ]
        ),
        cls=self.__class__,
    )
    return run
def test_solid_definition_kitchen_sink():
    @solid(
        input_defs=[
            InputDefinition('arg_one', str, description='desc1'),
            InputDefinition('arg_two', int),
        ],
        output_defs=[
            OutputDefinition(name='output_one', dagster_type=str),
            OutputDefinition(
                name='output_two', dagster_type=int, description='desc2', is_required=False
            ),
        ],
        config={'foo': int},
        description='a description',
        tags={'a_tag': 'yup'},
        required_resource_keys={'b_resource', 'a_resource'},
    )
    def kitchen_sink_solid(_, arg_two, arg_one):  # out of order to test positional_inputs
        assert arg_one
        assert arg_two
        raise Exception('should not execute')

    kitchen_sink_solid_snap = build_core_solid_def_snap(kitchen_sink_solid)

    assert kitchen_sink_solid_snap
    assert kitchen_sink_solid_snap.name == 'kitchen_sink_solid'
    assert len(kitchen_sink_solid_snap.input_def_snaps) == 2
    assert [inp.name for inp in kitchen_sink_solid_snap.input_def_snaps] == ['arg_one', 'arg_two']
    assert [inp.dagster_type_key for inp in kitchen_sink_solid_snap.input_def_snaps] == [
        'String',
        'Int',
    ]
    assert kitchen_sink_solid_snap.get_input_snap('arg_one').description == 'desc1'
    assert [out.name for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'output_one',
        'output_two',
    ]
    assert [out.dagster_type_key for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'String',
        'Int',
    ]
    assert kitchen_sink_solid_snap.get_output_snap('output_two').description == 'desc2'
    assert kitchen_sink_solid_snap.get_output_snap('output_two').is_required is False
    assert (
        kitchen_sink_solid_snap.config_field_snap.type_key
        == kitchen_sink_solid.config_field.config_type.key
    )
    assert kitchen_sink_solid_snap.required_resource_keys == ['a_resource', 'b_resource']
    assert kitchen_sink_solid_snap.tags == {'a_tag': 'yup'}
    assert kitchen_sink_solid.positional_inputs == ['arg_two', 'arg_one']
    assert (
        deserialize_json_to_dagster_namedtuple(
            serialize_dagster_namedtuple(kitchen_sink_solid_snap)
        )
        == kitchen_sink_solid_snap
    )
def launch_run(self, instance, run, external_pipeline):
    check.inst_param(run, 'run', PipelineRun)
    check.inst_param(external_pipeline, 'external_pipeline', ExternalPipeline)

    job_name = 'dagster-run-{}'.format(run.run_id)
    pod_name = job_name

    resources = get_k8s_resource_requirements(frozentags(external_pipeline.tags))

    pipeline_origin = None
    job_config = None
    if isinstance(external_pipeline.get_origin(), PipelineGrpcServerOrigin):
        if self._job_image:
            raise DagsterInvariantViolationError(
                'Cannot specify job_image in run launcher config when loading pipeline '
                'from GRPC server.'
            )

        repository_location_handle = (
            external_pipeline.repository_handle.repository_location_handle
        )

        if not isinstance(repository_location_handle, GrpcServerRepositoryLocationHandle):
            raise DagsterInvariantViolationError(
                'Expected RepositoryLocationHandle to be of type '
                'GrpcServerRepositoryLocationHandle but found type {}'.format(
                    type(repository_location_handle)
                )
            )

        job_image = repository_location_handle.get_current_image()
        job_config = self._get_grpc_job_config(job_image)

        repository_name = external_pipeline.repository_handle.repository_name
        pipeline_origin = PipelinePythonOrigin(
            pipeline_name=external_pipeline.name,
            repository_origin=repository_location_handle.get_repository_python_origin(
                repository_name
            ),
        )
    else:
        pipeline_origin = external_pipeline.get_origin()
        job_config = self._get_static_job_config()

    input_json = serialize_dagster_namedtuple(
        ExecuteRunArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run.run_id,
            instance_ref=None,
        )
    )

    job = construct_dagster_k8s_job(
        job_config=job_config,
        command=['dagster'],
        args=['api', 'execute_run_with_structured_logs', input_json],
        job_name=job_name,
        pod_name=pod_name,
        component='run_coordinator',
        resources=resources,
    )

    self._batch_api.create_namespaced_job(body=job, namespace=self.job_namespace)
    self._instance.report_engine_event(
        'Kubernetes run_coordinator job launched',
        run,
        EngineEventData(
            [
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(self.job_namespace, 'Kubernetes Namespace'),
                EventMetadataEntry.text(run.run_id, 'Run ID'),
            ]
        ),
        cls=K8sRunLauncher,
    )
    return run
def to_json(self) -> str:
    return serialize_dagster_namedtuple(cast(NamedTuple, self))
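# Hedged companion sketch (not from the source): a plausible inverse of to_json
# via the serdes deserializer, with an instance check on the result. `from_json`
# is a hypothetical classmethod shown for illustration only.
@classmethod
def from_json(cls, json_str: str):
    return check.inst(deserialize_json_to_dagster_namedtuple(json_str), cls)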
def ExternalPartitionSetExecutionParams(self, request, _context):
    partition_set_execution_param_args = deserialize_json_to_dagster_namedtuple(
        request.serialized_partition_set_execution_param_args
    )

    check.inst_param(
        partition_set_execution_param_args,
        "partition_set_execution_param_args",
        PartitionSetExecutionParamArgs,
    )

    recon_repo = self._recon_repository_from_origin(
        partition_set_execution_param_args.repository_origin
    )
    definition = recon_repo.get_definition()
    partition_set_def = definition.get_partition_set_def(
        partition_set_execution_param_args.partition_set_name
    )

    try:
        with user_code_error_boundary(
            PartitionExecutionError,
            lambda: "Error occurred during the partition generation for partition set "
            "{partition_set_name}".format(partition_set_name=partition_set_def.name),
        ):
            all_partitions = partition_set_def.get_partitions()
            partitions = [
                partition
                for partition in all_partitions
                if partition.name in partition_set_execution_param_args.partition_names
            ]

        partition_data = []
        for partition in partitions:

            def _error_message_fn(partition_set_name, partition_name):
                return lambda: (
                    "Error occurred during the partition config and tag generation for "
                    "partition set {partition_set_name}::{partition_name}".format(
                        partition_set_name=partition_set_name, partition_name=partition_name
                    )
                )

            with user_code_error_boundary(
                PartitionExecutionError,
                _error_message_fn(partition_set_def.name, partition.name),
            ):
                run_config = partition_set_def.run_config_for_partition(partition)
                tags = partition_set_def.tags_for_partition(partition)
                partition_data.append(
                    ExternalPartitionExecutionParamData(
                        name=partition.name,
                        tags=tags,
                        run_config=run_config,
                    )
                )

        return api_pb2.ExternalPartitionSetExecutionParamsReply(
            serialized_external_partition_set_execution_param_data_or_external_partition_execution_error=serialize_dagster_namedtuple(
                ExternalPartitionSetExecutionParamData(partition_data=partition_data)
            )
        )

    except PartitionExecutionError:
        # Serialize the error payload into the same reply field as the success case
        return api_pb2.ExternalPartitionSetExecutionParamsReply(
            serialized_external_partition_set_execution_param_data_or_external_partition_execution_error=serialize_dagster_namedtuple(
                ExternalPartitionExecutionErrorData(
                    serializable_error_info_from_exc_info(sys.exc_info())
                )
            )
        )
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    environment_dict,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.'''
    from dagster_k8s import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
    from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

    import kubernetes

    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(environment_dict, 'environment_dict')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    resources = check.opt_inst_param(
        resources, 'resources', kubernetes.client.V1ResourceRequirements
    )
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_keys_str = ", ".join(step_keys)

    # Ensure we stay below k8s name length limits
    k8s_name_key = _get_k8s_name_key(run_id, step_keys)
    job_name = 'dagster-stepjob-%s' % k8s_name_key
    pod_name = 'dagster-stepjob-%s' % k8s_name_key

    variables = {
        'executionParams': {
            'runConfigData': environment_dict,
            'mode': mode,
            'selector': {
                'repositoryLocationName': repo_location_name,
                'repositoryName': repo_name,
                'pipelineName': pipeline_run.pipeline_name,
            },
            'executionMetadata': {'runId': run_id},
            'stepKeys': step_keys,
        }
    }

    args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]
    job = construct_dagster_graphql_k8s_job(job_config, args, job_name, resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        'Executing steps {} in Kubernetes job {}'.format(step_keys_str, job.metadata.name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, 'Step keys'),
                EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobEngine,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_keys[0],
    )
    events.append(engine_event)

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

    wait_for_job_success(job.metadata.name, namespace=job_namespace)
    pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobEngine,
        step_key=step_keys[0],
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    res = parse_raw_log_lines(logs)
    handle_execution_errors(res, 'executePlan')
    step_events = handle_execute_plan_result(res)

    events += step_events

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def ExecuteRun(self, request, _context):
    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_execute_run_args
        )
        check.inst_param(execute_run_args, "execute_run_args", ExecuteRunArgs)

        run_id = execute_run_args.pipeline_run_id

        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)

    except:  # pylint: disable=bare-except
        yield api_pb2.ExecuteRunEvent(
            serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                IPCErrorMessage(
                    serializable_error_info=serializable_error_info_from_exc_info(
                        sys.exc_info()
                    ),
                    message="Error during RPC setup for ExecuteRun",
                )
            )
        )
        return

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=execute_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )
    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (
            execution_process,
            execute_run_args.instance_ref,
        )
        self._termination_events[run_id] = termination_event

    done = False
    while not done:
        try:
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                        IPCErrorMessage(
                            serializable_error_info=serializable_error_info_from_exc_info(
                                sys.exc_info()
                            ),
                            message=(
                                "GRPC server: Subprocess for {run_id} terminated unexpectedly"
                            ).format(run_id=run_id),
                        )
                    )
                )
                done = True
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        else:
            if isinstance(dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete):
                done = True
            elif isinstance(
                dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
            ):
                continue
            else:
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                        dagster_event_or_ipc_error_message_or_done
                    )
                )

    with self._execution_lock:
        if run_id in self._executions:
            del self._executions[run_id]
        if run_id in self._termination_events:
            del self._termination_events[run_id]
def StartRun(self, request, _context):
    if self._shutdown_once_executions_finish_event.is_set():
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message="Tried to start a run on a server after telling it to shut down",
                    serializable_error_info=None,
                )
            )
        )

    try:
        execute_run_args = check.inst(
            deserialize_json_to_dagster_namedtuple(request.serialized_execute_run_args),
            ExecuteExternalPipelineArgs,
        )
        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)

    except:  # pylint: disable=bare-except
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message=None,
                    serializable_error_info=serializable_error_info_from_exc_info(
                        sys.exc_info()
                    ),
                )
            )
        )

    event_queue = self._mp_ctx.Queue()
    termination_event = self._mp_ctx.Event()
    execution_process = self._mp_ctx.Process(
        target=start_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )

    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (
            execution_process,
            execute_run_args.instance_ref,
        )
        self._termination_events[run_id] = termination_event

    success = None
    message = None
    serializable_error_info = None

    while success is None:
        sleep(EVENT_QUEUE_POLL_INTERVAL)
        # We use `get_nowait()` instead of `get()` so that we can handle the case where the
        # execution process has died unexpectedly -- `get()` would hang forever in that case
        try:
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                success = False
                message = (
                    "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                    "exit code {exit_code}".format(
                        run_id=run_id,
                        exit_code=execution_process.exitcode,
                    )
                )
                serializable_error_info = serializable_error_info_from_exc_info(sys.exc_info())
        else:
            if isinstance(
                dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
            ):
                success = True
            elif isinstance(
                dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete
            ):
                continue
            if isinstance(dagster_event_or_ipc_error_message_or_done, IPCErrorMessage):
                success = False
                message = dagster_event_or_ipc_error_message_or_done.message
                serializable_error_info = (
                    dagster_event_or_ipc_error_message_or_done.serializable_error_info
                )

    # Ensure that if the run failed, we remove it from the executions map before
    # returning so that CanCancel will never return True
    if not success:
        with self._execution_lock:
            self._clear_run(run_id)

    return api_pb2.StartRunReply(
        serialized_start_run_result=serialize_dagster_namedtuple(
            StartRunResult(
                success=success,
                message=message,
                serializable_error_info=serializable_error_info,
            )
        )
    )
def execute_run(self, execute_run_args):
    check.inst_param(execute_run_args, "execute_run_args", ExecuteExternalPipelineArgs)

    with DagsterInstance.from_ref(execute_run_args.instance_ref) as instance:
        # Fetch the run before entering the error boundary so that `pipeline_run` is
        # always bound when an engine event needs to be reported below
        pipeline_run = instance.get_run_by_id(execute_run_args.pipeline_run_id)
        try:
            event_iterator = self._streaming_query(
                "ExecuteRun",
                api_pb2.ExecuteRunRequest,
                serialized_execute_run_args=serialize_dagster_namedtuple(execute_run_args),
            )
        except Exception as exc:  # pylint: disable=broad-except
            yield instance.report_engine_event(
                message="Unexpected error in IPC client",
                pipeline_run=pipeline_run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            raise exc

        try:
            for event in event_iterator:
                yield deserialize_json_to_dagster_namedtuple(
                    event.serialized_dagster_event_or_ipc_error_message
                )
        except KeyboardInterrupt:
            self.cancel_execution(
                CancelExecutionRequest(run_id=execute_run_args.pipeline_run_id)
            )
            raise
        except grpc.RpcError as rpc_error:
            if (
                # posix
                "Socket closed" in rpc_error.debug_error_string()  # pylint: disable=no-member
                # windows
                or "Stream removed" in rpc_error.debug_error_string()  # pylint: disable=no-member
            ):
                yield instance.report_engine_event(
                    message="User process: GRPC server for {run_id} terminated unexpectedly".format(
                        run_id=pipeline_run.run_id
                    ),
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                )
                yield instance.report_run_failed(pipeline_run)
            else:
                yield instance.report_engine_event(
                    message="Unexpected error in IPC client",
                    pipeline_run=pipeline_run,
                    engine_event_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                )
            raise rpc_error
        except Exception as exc:  # pylint: disable=broad-except
            yield instance.report_engine_event(
                message="Unexpected error in IPC client",
                pipeline_run=pipeline_run,
                engine_event_data=EngineEventData.engine_error(
                    serializable_error_info_from_exc_info(sys.exc_info())
                ),
            )
            raise exc
def launch_run(self, instance, run, external_pipeline):
    check.inst_param(run, "run", PipelineRun)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)

    job_name = "dagster-run-{}".format(run.run_id)
    pod_name = job_name

    user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))

    pipeline_origin = None
    job_config = None
    if isinstance(
        external_pipeline.get_external_origin().external_repository_origin.repository_location_origin,
        GrpcServerRepositoryLocationOrigin,
    ):
        if self._job_image:
            raise DagsterInvariantViolationError(
                "Cannot specify job_image in run launcher config when loading pipeline "
                "from GRPC server."
            )

        repository_location_handle = (
            external_pipeline.repository_handle.repository_location_handle
        )

        if not isinstance(repository_location_handle, GrpcServerRepositoryLocationHandle):
            raise DagsterInvariantViolationError(
                "Expected RepositoryLocationHandle to be of type "
                "GrpcServerRepositoryLocationHandle but found type {}".format(
                    type(repository_location_handle)
                )
            )

        repository_name = external_pipeline.repository_handle.repository_name
        repository_origin = repository_location_handle.reload_repository_python_origin(
            repository_name
        )

        job_image = repository_origin.container_image
        pipeline_origin = PipelinePythonOrigin(
            pipeline_name=external_pipeline.name, repository_origin=repository_origin
        )

        job_config = self._get_grpc_job_config(job_image)
    else:
        pipeline_origin = external_pipeline.get_python_origin()
        job_config = self._get_static_job_config()

    input_json = serialize_dagster_namedtuple(
        ExecuteRunArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run.run_id,
            instance_ref=None,
        )
    )

    job = construct_dagster_k8s_job(
        job_config=job_config,
        args=["dagster", "api", "execute_run", input_json],
        job_name=job_name,
        pod_name=pod_name,
        component="run_coordinator",
        user_defined_k8s_config=user_defined_k8s_config,
    )

    self._batch_api.create_namespaced_job(body=job, namespace=self.job_namespace)
    self._instance.report_engine_event(
        "Kubernetes run_coordinator job launched",
        run,
        EngineEventData(
            [
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(self.job_namespace, "Kubernetes Namespace"),
                EventMetadataEntry.text(run.run_id, "Run ID"),
            ]
        ),
        cls=self.__class__,
    )
    return run
def launch_run(self, run, external_pipeline):
    check.inst_param(run, "run", PipelineRun)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)

    docker_image = external_pipeline.get_python_origin().repository_origin.container_image

    if not docker_image:
        docker_image = self._image

    if not docker_image:
        raise Exception("No docker image specified by the instance config or repository")

    try:
        # validate that the docker image name is valid
        reference.Reference.parse(docker_image)
    except Exception as e:
        raise Exception(
            "Docker image name {docker_image} is not correctly formatted".format(
                docker_image=docker_image
            )
        ) from e

    input_json = serialize_dagster_namedtuple(
        ExecuteRunArgs(
            pipeline_origin=external_pipeline.get_python_origin(),
            pipeline_run_id=run.run_id,
            instance_ref=self._instance.get_ref(),
        )
    )

    command = "dagster api execute_run_with_structured_logs {}".format(json.dumps(input_json))

    docker_env = (
        {env_name: os.getenv(env_name) for env_name in self._env_vars} if self._env_vars else {}
    )

    client = self._get_client()

    try:
        container = client.containers.create(
            image=docker_image,
            command=command,
            detach=True,
            environment=docker_env,
            network=self._network,
        )
    except docker.errors.ImageNotFound:
        client.images.pull(docker_image)
        container = client.containers.create(
            image=docker_image,
            command=command,
            detach=True,
            environment=docker_env,
            network=self._network,
        )

    self._instance.report_engine_event(
        message="Launching run in a new container {container_id} with image {docker_image}".format(
            container_id=container.id,
            docker_image=docker_image,
        ),
        pipeline_run=run,
        cls=self.__class__,
    )

    self._instance.add_run_tags(
        run.run_id,
        {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},
    )

    container.start()
    return run
def repository_snapshot_command(**kwargs):
    recon_repo = recon_repo_for_cli_args(kwargs)
    definition = recon_repo.get_definition()

    active_data = external_repository_data_from_def(definition)
    click.echo(serialize_dagster_namedtuple(active_data))
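# Hedged usage sketch (hypothetical, not from the source): the payload echoed by
# repository_snapshot_command can be recovered on the consuming side with the
# serdes deserializer, e.g.:
#
#     output = subprocess.check_output([...])  # invoke the snapshot CLI command
#     external_repo_data = deserialize_json_to_dagster_namedtuple(output.decode())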
def to_json(self):
    return serialize_dagster_namedtuple(self)