def test_construct_dagster_k8s_job_with_sidecar_container():
    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="test",
    )

    job = construct_dagster_k8s_job(cfg, [], "job123").to_dict()
    assert job["spec"]["ttl_seconds_after_finished"] == DEFAULT_K8S_JOB_TTL_SECONDS_AFTER_FINISHED

    user_defined_cfg = UserDefinedDagsterK8sConfig(
        pod_spec_config={
            "containers": [{"command": ["echo", "HI"], "image": "sidecar:bar", "name": "sidecar"}]
        },
    )
    job = construct_dagster_k8s_job(
        cfg, [], "job123", user_defined_k8s_config=user_defined_cfg
    ).to_dict()

    containers = job["spec"]["template"]["spec"]["containers"]
    assert len(containers) == 2
    assert containers[0]["image"] == "test/foo:latest"
    assert containers[1]["image"] == "sidecar:bar"
    assert containers[1]["command"] == ["echo", "HI"]
    assert containers[1]["name"] == "sidecar"

def test_construct_dagster_k8s_job_with_job_op_labels():
    common_labels = {
        "app.kubernetes.io/name": "dagster",
        "app.kubernetes.io/instance": "dagster",
        "app.kubernetes.io/version": dagster_version,
        "app.kubernetes.io/part-of": "dagster",
    }

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="test",
    )
    job1 = construct_dagster_k8s_job(
        cfg,
        [],
        "job123",
        labels={
            "dagster/job": "some_job",
            "dagster/op": "some_op",
        },
    ).to_dict()
    expected_labels1 = dict(
        **common_labels,
        **{
            "dagster/job": "some_job",
            "dagster/op": "some_op",
        },
    )
    assert job1["metadata"]["labels"] == expected_labels1
    assert job1["spec"]["template"]["metadata"]["labels"] == expected_labels1

    job2 = construct_dagster_k8s_job(
        cfg,
        [],
        "job456",
        labels={
            "dagster/job": "long_job_name_64____01234567890123456789012345678901234567890123",
            "dagster/op": "long_op_name_64_____01234567890123456789012345678901234567890123",
        },
    ).to_dict()
    expected_labels2 = dict(
        **common_labels,
        **{
            # The last character should be truncated.
            "dagster/job": "long_job_name_64____0123456789012345678901234567890123456789012",
            "dagster/op": "long_op_name_64_____0123456789012345678901234567890123456789012",
        },
    )
    assert job2["metadata"]["labels"] == expected_labels2
    assert job2["spec"]["template"]["metadata"]["labels"] == expected_labels2

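# The 64-character label values above lose their final character because
# Kubernetes caps label values at 63 characters. A minimal sketch of that
# truncation step (illustrative only; `K8S_LABEL_MAX_LEN` and
# `truncate_label_value` are hypothetical names, and the real sanitization in
# dagster_k8s also strips invalid characters):
K8S_LABEL_MAX_LEN = 63


def truncate_label_value(value: str) -> str:
    # Keep at most the first 63 characters of an over-long label value.
    return value[:K8S_LABEL_MAX_LEN]


assert len(truncate_label_value("x" * 64)) == 63
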
def test_construct_dagster_k8s_job_with_ttl():
    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="test",
    )

    job = construct_dagster_k8s_job(cfg, [], "job123").to_dict()
    assert job["spec"]["ttl_seconds_after_finished"] == DEFAULT_K8S_JOB_TTL_SECONDS_AFTER_FINISHED

    user_defined_cfg = UserDefinedDagsterK8sConfig(
        job_spec_config={"ttl_seconds_after_finished": 0},
    )
    job = construct_dagster_k8s_job(
        cfg, [], "job123", user_defined_k8s_config=user_defined_cfg
    ).to_dict()
    assert job["spec"]["ttl_seconds_after_finished"] == 0

def test_construct_dagster_k8s_job_with_user_defined_env_from():
    @graph
    def user_defined_k8s_env_from_tags_graph():
        pass

    # These fields still work even when using underscore keys
    user_defined_k8s_config = get_user_defined_k8s_config(
        user_defined_k8s_env_from_tags_graph.to_job(
            tags={
                USER_DEFINED_K8S_CONFIG_KEY: {
                    "container_config": {
                        "envFrom": [
                            {
                                "configMapRef": {
                                    "name": "user_config_map_ref",
                                    "optional": "True",
                                }
                            },
                            {"secretRef": {"name": "user_secret_ref_one", "optional": "True"}},
                            {
                                "secretRef": {
                                    "name": "user_secret_ref_two",
                                    "optional": "False",
                                },
                                "prefix": "with_prefix",
                            },
                        ]
                    }
                }
            }
        ).tags
    )

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="some-instance-configmap",
        env_config_maps=["config_map"],
        env_secrets=["secret"],
    )
    job = construct_dagster_k8s_job(
        cfg, ["foo", "bar"], "job", user_defined_k8s_config=user_defined_k8s_config
    ).to_dict()

    env_from = job["spec"]["template"]["spec"]["containers"][0]["env_from"]
    env_from_mapping = {
        (env_var.get("config_map_ref") or env_var.get("secret_ref")).get("name"): env_var
        for env_var in env_from
    }
    assert len(env_from_mapping) == 5
    assert env_from_mapping["config_map"]
    assert env_from_mapping["user_config_map_ref"]
    assert env_from_mapping["secret"]
    assert env_from_mapping["user_secret_ref_one"]
    assert env_from_mapping["user_secret_ref_two"]

def test_valid_job_format(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(os.path.join(test_project_environments_path(), 'env.yaml'))
    pipeline_name = 'demo_pipeline'
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    job_name = 'dagster-run-%s' % run.run_id
    pod_name = 'dagster-run-%s' % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=['dagster-graphql'],
        args=[
            '-p',
            'executeRunInProcess',
            '-v',
            seven.json.dumps({'runId': run.run_id}),
        ],
        job_name=job_name,
        pod_name=pod_name,
        component='runmaster',
    )

    assert (
        yaml.dump(remove_none_recursively(job.to_dict()), default_flow_style=False).strip()
        == EXPECTED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources='',
        ).strip()
    )

def test_construct_dagster_k8s_job_with_user_defined_service_account_name():
    @graph
    def user_defined_k8s_service_account_name_tags_graph():
        pass

    user_defined_k8s_config = get_user_defined_k8s_config(
        user_defined_k8s_service_account_name_tags_graph.to_job(
            tags={
                USER_DEFINED_K8S_CONFIG_KEY: {
                    "pod_spec_config": {
                        "service_account_name": "this-should-take-precedence",
                    },
                },
            },
        ).tags
    )

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="some-instance-configmap",
        service_account_name="this-should-be-overridden",
    )
    job = construct_dagster_k8s_job(
        cfg, ["foo", "bar"], "job", user_defined_k8s_config=user_defined_k8s_config
    ).to_dict()

    service_account_name = job["spec"]["template"]["spec"]["service_account_name"]
    assert service_account_name == "this-should-take-precedence"

def test_valid_job_format(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(os.path.join(test_project_environments_path(), "env.yaml"))
    pipeline_name = "demo_pipeline"
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    job_name = "dagster-run-%s" % run.run_id
    pod_name = "dagster-run-%s" % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=["dagster"],
        args=["api", "execute_run_with_structured_logs"],
        job_name=job_name,
        pod_name=pod_name,
        component="run_coordinator",
    )

    assert (
        yaml.dump(remove_none_recursively(job.to_dict()), default_flow_style=False).strip()
        == EXPECTED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources="",
        ).strip()
    )

def test_construct_dagster_k8s_job_with_mounts():
    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        image_pull_policy="Always",
        image_pull_secrets=[{"name": "my_secret"}],
        service_account_name=None,
        instance_config_map="some-instance-configmap",
        postgres_password_secret=None,
        env_config_maps=None,
        env_secrets=None,
        volume_mounts=[
            {"name": "foo", "path": "biz/buz", "sub_path": "file.txt", "configmap": "settings-cm"}
        ],
    )
    job = construct_dagster_k8s_job(cfg, ["foo", "bar"], "job123").to_dict()

    assert len(job["spec"]["template"]["spec"]["volumes"]) == 2
    foo_volumes = [
        volume for volume in job["spec"]["template"]["spec"]["volumes"] if volume["name"] == "foo"
    ]
    assert len(foo_volumes) == 1

    assert len(job["spec"]["template"]["spec"]["containers"][0]["volume_mounts"]) == 2
    foo_volumes_mounts = [
        volume
        for volume in job["spec"]["template"]["spec"]["containers"][0]["volume_mounts"]
        if volume["name"] == "foo"
    ]
    assert len(foo_volumes_mounts) == 1

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        image_pull_policy="Always",
        image_pull_secrets=[{"name": "my_secret"}],
        service_account_name=None,
        instance_config_map="some-instance-configmap",
        postgres_password_secret=None,
        env_config_maps=None,
        env_secrets=None,
        volume_mounts=[
            {"name": "foo", "path": "biz/buz", "sub_path": "file.txt", "secret": "settings-secret"}
        ],
    )
    construct_dagster_k8s_job(cfg, ["foo", "bar"], "job123").to_dict()

def test_construct_dagster_k8s_job_with_user_defined_env():
    @graph
    def user_defined_k8s_env_tags_graph():
        pass

    user_defined_k8s_config = get_user_defined_k8s_config(
        user_defined_k8s_env_tags_graph.to_job(
            tags={
                USER_DEFINED_K8S_CONFIG_KEY: {
                    "container_config": {
                        "env": [
                            {"name": "ENV_VAR_1", "value": "one"},
                            {"name": "ENV_VAR_2", "value": "two"},
                            {
                                "name": "DD_AGENT_HOST",
                                "valueFrom": {"fieldRef": {"fieldPath": "status.hostIP"}},
                            },
                        ]
                    }
                }
            }
        ).tags
    )

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="some-instance-configmap",
    )
    job = construct_dagster_k8s_job(
        cfg, ["foo", "bar"], "job", user_defined_k8s_config=user_defined_k8s_config
    ).to_dict()

    env = job["spec"]["template"]["spec"]["containers"][0]["env"]
    env_mapping = remove_none_recursively({env_var["name"]: env_var for env_var in env})

    # Has DAGSTER_HOME and three additional env vars
    assert len(env_mapping) == 4
    assert env_mapping["ENV_VAR_1"]["value"] == "one"
    assert env_mapping["ENV_VAR_2"]["value"] == "two"
    assert env_mapping["DD_AGENT_HOST"]["value_from"] == {
        "field_ref": {"field_path": "status.hostIP"}
    }

def test_valid_job_format_with_backcompat_resources(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(os.path.join(test_project_environments_path(), "env.yaml"))
    pipeline_name = "demo_pipeline"
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags(
        {
            K8S_RESOURCE_REQUIREMENTS_KEY: {
                "requests": {"cpu": "250m", "memory": "64Mi"},
                "limits": {"cpu": "500m", "memory": "2560Mi"},
            }
        }
    )
    user_defined_k8s_config = get_user_defined_k8s_config(tags)

    job_name = "dagster-run-%s" % run.run_id
    pod_name = "dagster-run-%s" % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=["dagster-graphql"],
        args=[
            "-p",
            "executeRunInProcess",
            "-v",
            seven.json.dumps({"runId": run.run_id}),
        ],
        job_name=job_name,
        user_defined_k8s_config=user_defined_k8s_config,
        pod_name=pod_name,
        component="run_coordinator",
    )

    assert (
        yaml.dump(remove_none_recursively(job.to_dict()), default_flow_style=False).strip()
        == EXPECTED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources="""
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi""",
        ).strip()
    )

def test_construct_dagster_k8s_job_with_user_defined_volume_mounts_snake_case():
    @graph
    def user_defined_k8s_volume_mounts_tags_graph():
        pass

    # volume_mounts still work even when using underscore keys
    user_defined_k8s_config = get_user_defined_k8s_config(
        user_defined_k8s_volume_mounts_tags_graph.to_job(
            tags={
                USER_DEFINED_K8S_CONFIG_KEY: {
                    "container_config": {
                        "volume_mounts": [
                            {
                                "mountPath": "mount_path",
                                "mountPropagation": "mount_propagation",
                                "name": "a_volume_mount_one",
                                "readOnly": "False",
                                "subPath": "path/",
                            },
                            {
                                "mountPath": "mount_path",
                                "mountPropagation": "mount_propagation",
                                "name": "a_volume_mount_two",
                                "readOnly": "False",
                                "subPathExpr": "path/",
                            },
                        ]
                    }
                }
            }
        ).tags
    )

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="some-instance-configmap",
    )
    job = construct_dagster_k8s_job(
        cfg, ["foo", "bar"], "job", user_defined_k8s_config=user_defined_k8s_config
    ).to_dict()

    volume_mounts = job["spec"]["template"]["spec"]["containers"][0]["volume_mounts"]
    volume_mounts_mapping = {volume_mount["name"]: volume_mount for volume_mount in volume_mounts}

    assert len(volume_mounts_mapping) == 3
    assert volume_mounts_mapping["dagster-instance"]
    assert volume_mounts_mapping["a_volume_mount_one"]
    assert volume_mounts_mapping["a_volume_mount_two"]

def test_valid_job_format_with_resources(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(os.path.join(test_project_environments_path(), 'env.yaml'))
    pipeline_name = 'demo_pipeline'
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags(
        {
            K8S_RESOURCE_REQUIREMENTS_KEY: {
                'requests': {'cpu': '250m', 'memory': '64Mi'},
                'limits': {'cpu': '500m', 'memory': '2560Mi'},
            }
        }
    )
    resources = get_k8s_resource_requirements(tags)

    job_name = 'dagster-run-%s' % run.run_id
    pod_name = 'dagster-run-%s' % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=['dagster-graphql'],
        args=[
            '-p',
            'executeRunInProcess',
            '-v',
            seven.json.dumps({'runId': run.run_id}),
        ],
        job_name=job_name,
        resources=resources,
        pod_name=pod_name,
        component='runmaster',
    )

    assert (
        yaml.dump(remove_none_recursively(job.to_dict()), default_flow_style=False).strip()
        == EXPECTED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources='''
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi''',
        ).strip()
    )

def test_construct_dagster_k8s_job_with_env():
    with environ({"ENV_VAR_1": "one", "ENV_VAR_2": "two"}):
        cfg = DagsterK8sJobConfig(
            job_image="test/foo:latest",
            dagster_home="/opt/dagster/dagster_home",
            instance_config_map="some-instance-configmap",
            env_vars=["ENV_VAR_1", "ENV_VAR_2"],
        )

        job = construct_dagster_k8s_job(cfg, ["foo", "bar"], "job").to_dict()

        env = job["spec"]["template"]["spec"]["containers"][0]["env"]
        env_mapping = {env_var["name"]: env_var for env_var in env}

        # Has DAGSTER_HOME and two additional env vars
        assert len(env_mapping) == 3
        assert env_mapping["ENV_VAR_1"]["value"] == "one"
        assert env_mapping["ENV_VAR_2"]["value"] == "two"

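# `environ` above is a dagster test utility that temporarily sets environment
# variables for the duration of the `with` block. A minimal self-contained
# sketch of the same idea (illustrative only; `temp_environ` is a hypothetical
# name, not dagster's implementation):
import os
from contextlib import contextmanager


@contextmanager
def temp_environ(env):
    # Apply the given variables, then restore the prior values on exit.
    previous = {key: os.environ.get(key) for key in env}
    os.environ.update(env)
    try:
        yield
    finally:
        for key, value in previous.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
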
def test_construct_dagster_k8s_job_no_postgres():
    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        image_pull_policy="Always",
        image_pull_secrets=[{"name": "my_secret"}],
        service_account_name=None,
        instance_config_map="some-instance-configmap",
        postgres_password_secret=None,
        env_config_maps=None,
        env_secrets=None,
    )
    job = construct_dagster_k8s_job(cfg, ["foo", "bar"], "job123").to_dict()

    assert job["kind"] == "Job"
    assert job["metadata"]["name"] == "job123"
    assert job["spec"]["template"]["spec"]["containers"][0]["image"] == "test/foo:latest"
    assert DAGSTER_PG_PASSWORD_ENV_VAR not in [
        env["name"] for env in job["spec"]["template"]["spec"]["containers"][0]["env"]
    ]

def test_sanitize_labels():
    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="test",
    )

    job = construct_dagster_k8s_job(
        cfg,
        [],
        "job456",
        labels={
            "dagster/op": "-get_f\o.o[bar-0]-",  # pylint: disable=anomalous-backslash-in-string
            "my_label": "_WhatsUP",
        },
    ).to_dict()

    assert job["metadata"]["labels"]["dagster/op"] == "get_f-o.o-bar-0"
    assert job["metadata"]["labels"]["my_label"] == "WhatsUP"

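# A minimal sketch of the sanitization this test exercises (illustrative only;
# `sanitize_label_value` is a hypothetical name and the real logic lives in
# dagster_k8s): characters that are not valid in Kubernetes label values are
# replaced, then the value is trimmed so it begins and ends with an
# alphanumeric character, per Kubernetes label rules.
import re


def sanitize_label_value(value: str) -> str:
    value = re.sub(r"[^a-zA-Z0-9\-_.]", "-", value)  # replace invalid characters
    return value.strip("-_.")  # labels must start and end alphanumeric


assert sanitize_label_value("-get_f\\o.o[bar-0]-") == "get_f-o.o-bar-0"
assert sanitize_label_value("_WhatsUP") == "WhatsUP"
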
def test_k8s_tag_op():
    assert my_op
    user_defined_cfg = get_user_defined_k8s_config(my_op.tags)

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        instance_config_map="test",
    )
    job = construct_dagster_k8s_job(cfg, [], "job123", user_defined_k8s_config=user_defined_cfg)

    assert job.to_dict()["spec"]["template"]["spec"]["containers"][0]["resources"] == {
        "requests": {"cpu": "200m", "memory": "32Mi"},
        "limits": None,
    }

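# `my_op` is defined elsewhere in this test module. Judging from the resource
# assertions above, it presumably carries a dagster-k8s/config tag along these
# lines (a sketch of a plausible definition, not the actual one):
@op(
    tags={
        USER_DEFINED_K8S_CONFIG_KEY: {
            "container_config": {
                "resources": {"requests": {"cpu": "200m", "memory": "32Mi"}}
            }
        }
    }
)
def my_op_sketch():
    pass
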
def _execute_step_k8s_job(
    self,
    execute_step_args_packed,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod."""
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        )
    )
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.invariant(
        len(execute_step_args.step_keys_to_execute) == 1,
        "Celery K8s task executor can only execute 1 step at a time",
    )

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")

    check.bool_param(load_incluster_config, "load_incluster_config")

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(user_defined_k8s_config_dict)
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)

    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_key = execute_step_args.step_keys_to_execute[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)

    retry_state = execute_step_args.known_state.get_retry_state()

    if retry_state.get_attempt_count(step_key):
        attempt_number = retry_state.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    args = ["dagster", "api", "execute_step", input_json]

    job = construct_dagster_k8s_job(job_config, args, job_name, user_defined_k8s_config, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name, so proceed and see if the existing
            # job succeeded
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, proceeding with existing job.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=execute_step_args.pipeline_run_id,
        )
    except (DagsterK8sError, DagsterK8sTimeoutError) as err:
        step_failure_event = construct_step_failure_event_and_handle(
            pipeline_run, step_key, err, instance=instance
        )
        events.append(step_failure_event)
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except (
        DagsterK8sUnrecoverableAPIError,
        DagsterK8sAPIRetryLimitExceeded,
        # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
        # a retry boundary. We still catch it here just in case we missed one so that we can
        # report it to the event log
        kubernetes.client.rest.ApiException,
    ) as err:
        instance.report_engine_event(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        instance.report_engine_event(
            "Encountered unexpected error retrieving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        try:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                    job_name, pod_name, step_key
                ),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events

def test_valid_job_format_with_user_defined_k8s_config(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(os.path.join(test_project_environments_path(), "env.yaml"))
    pipeline_name = "demo_pipeline"
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags(
        {
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {"cpu": "250m", "memory": "64Mi"},
                        "limits": {"cpu": "500m", "memory": "2560Mi"},
                    }
                },
                "pod_template_spec_metadata": {
                    "annotations": {"cluster-autoscaler.kubernetes.io/safe-to-evict": "true"},
                    "labels": {"spotinst.io/restrict-scale-down": "true"},
                },
                "pod_spec_config": {
                    "affinity": {
                        "nodeAffinity": {
                            "requiredDuringSchedulingIgnoredDuringExecution": {
                                "nodeSelectorTerms": [
                                    {
                                        "matchExpressions": [
                                            {
                                                "key": "kubernetes.io/e2e-az-name",
                                                "operator": "In",
                                                "values": ["e2e-az1", "e2e-az2"],
                                            }
                                        ]
                                    }
                                ]
                            }
                        }
                    }
                },
            }
        }
    )
    user_defined_k8s_config = get_user_defined_k8s_config(tags)

    job_name = "dagster-run-%s" % run.run_id
    pod_name = "dagster-run-%s" % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=["dagster"],
        args=["api", "execute_run_with_structured_logs"],
        job_name=job_name,
        user_defined_k8s_config=user_defined_k8s_config,
        pod_name=pod_name,
        component="run_coordinator",
    )

    assert (
        yaml.dump(remove_none_recursively(job.to_dict()), default_flow_style=False).strip()
        == EXPECTED_CONFIGURED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            labels="spotinst.io/restrict-scale-down: 'true'",
            resources="""
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi""",
            annotations="""annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: 'true'""",
            affinity="""affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/e2e-az-name
                operator: In
                values:
                - e2e-az1
                - e2e-az2""",
        ).strip()
    )

def test_construct_dagster_k8s_job_with_mounts():
    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        image_pull_policy="Always",
        image_pull_secrets=[{"name": "my_secret"}],
        service_account_name=None,
        instance_config_map="some-instance-configmap",
        postgres_password_secret=None,
        env_config_maps=None,
        env_secrets=None,
        volume_mounts=[{"name": "foo", "mountPath": "biz/buz", "subPath": "file.txt"}],
        volumes=[
            {"name": "foo", "configMap": {"name": "settings-cm"}},
        ],
    )
    job = construct_dagster_k8s_job(cfg, ["foo", "bar"], "job123").to_dict()

    assert len(job["spec"]["template"]["spec"]["volumes"]) == 2
    foo_volumes = [
        volume for volume in job["spec"]["template"]["spec"]["volumes"] if volume["name"] == "foo"
    ]
    assert len(foo_volumes) == 1
    assert foo_volumes[0]["config_map"]["name"] == "settings-cm"

    assert len(job["spec"]["template"]["spec"]["containers"][0]["volume_mounts"]) == 2
    foo_volumes_mounts = [
        volume
        for volume in job["spec"]["template"]["spec"]["containers"][0]["volume_mounts"]
        if volume["name"] == "foo"
    ]
    assert len(foo_volumes_mounts) == 1

    cfg = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        image_pull_policy="Always",
        image_pull_secrets=[{"name": "my_secret"}],
        service_account_name=None,
        instance_config_map="some-instance-configmap",
        postgres_password_secret=None,
        env_config_maps=None,
        env_secrets=None,
        volume_mounts=[{"name": "foo", "mountPath": "biz/buz", "subPath": "file.txt"}],
        volumes=[
            {"name": "foo", "secret": {"secretName": "settings-secret"}},
        ],
    )
    job = construct_dagster_k8s_job(cfg, ["foo", "bar"], "job123").to_dict()

    assert len(job["spec"]["template"]["spec"]["volumes"]) == 2
    foo_volumes = [
        volume for volume in job["spec"]["template"]["spec"]["volumes"] if volume["name"] == "foo"
    ]
    assert len(foo_volumes) == 1
    assert foo_volumes[0]["secret"]["secret_name"] == "settings-secret"

    cfg_with_invalid_volume_key = DagsterK8sJobConfig(
        job_image="test/foo:latest",
        dagster_home="/opt/dagster/dagster_home",
        image_pull_policy="Always",
        image_pull_secrets=[{"name": "my_secret"}],
        service_account_name=None,
        instance_config_map="some-instance-configmap",
        postgres_password_secret=None,
        env_config_maps=None,
        env_secrets=None,
        volume_mounts=[{"name": "foo", "mountPath": "biz/buz", "subPath": "file.txt"}],
        volumes=[
            {"name": "foo", "invalidKey": "settings-secret"},
        ],
    )
    with pytest.raises(
        Exception, match="Unexpected keys in model class V1Volume: {'invalidKey'}"
    ):
        construct_dagster_k8s_job(cfg_with_invalid_volume_key, ["foo", "bar"], "job123").to_dict()

def _execute_step_k8s_job(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod."""
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.invariant(
        len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
    )
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(repo_location_name, "repo_location_name")
    check.str_param(run_id, "run_id")

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")
    check.bool_param(load_incluster_config, "load_incluster_config")
    check.dict_param(retries_dict, "retries_dict")

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, "pipeline_origin_packed"
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(user_defined_k8s_config_dict)
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))
    step_key = step_keys[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, "Step keys")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )
    command = ["dagster"]
    args = ["api", "execute_step_with_structured_logs", input_json]

    job = construct_dagster_k8s_job(
        job_config, command, args, job_name, user_defined_k8s_config, pod_name
    )

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step keys"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name, so do not proceed.
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(e, "Error"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split("\n")

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events

def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.'''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, 'pipeline_origin_packed'
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_key = step_keys[0]
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        job_name = 'dagster-job-%s' % (k8s_name_key)
        pod_name = 'dagster-job-%s' % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )
    command = ['dagster']
    args = ['api', 'execute_step_with_structured_logs', input_json]

    job = construct_dagster_k8s_job(job_config, command, args, job_name, resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not have access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == 'Conflict':
            # There is an existing job with the same name, so do not proceed.
            instance.report_engine_event(
                'Did not create Kubernetes job {} for step {} since job name already '
                'exists, exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                'exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(e, 'Error'),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events

def test_valid_job_format_with_user_defined_k8s_config(run_launcher):
    docker_image = test_project_docker_image()

    run_config = load_yaml_from_path(os.path.join(test_project_environments_path(), 'env.yaml'))
    pipeline_name = 'demo_pipeline'
    run = PipelineRun(pipeline_name=pipeline_name, run_config=run_config)

    tags = validate_tags(
        {
            USER_DEFINED_K8S_CONFIG_KEY: {
                'container_config': {
                    'resources': {
                        'requests': {'cpu': '250m', 'memory': '64Mi'},
                        'limits': {'cpu': '500m', 'memory': '2560Mi'},
                    }
                },
                'pod_template_spec_metadata': {
                    'annotations': {"cluster-autoscaler.kubernetes.io/safe-to-evict": "true"}
                },
                'pod_spec_config': {
                    'affinity': {
                        'nodeAffinity': {
                            'requiredDuringSchedulingIgnoredDuringExecution': {
                                'nodeSelectorTerms': [
                                    {
                                        'matchExpressions': [
                                            {
                                                'key': 'kubernetes.io/e2e-az-name',
                                                'operator': 'In',
                                                'values': ['e2e-az1', 'e2e-az2'],
                                            }
                                        ]
                                    }
                                ]
                            }
                        }
                    }
                },
            }
        }
    )
    user_defined_k8s_config = get_user_defined_k8s_config(tags)

    job_name = 'dagster-run-%s' % run.run_id
    pod_name = 'dagster-run-%s' % run.run_id
    job = construct_dagster_k8s_job(
        job_config=run_launcher.job_config,
        command=['dagster-graphql'],
        args=[
            '-p',
            'executeRunInProcess',
            '-v',
            seven.json.dumps({'runId': run.run_id}),
        ],
        job_name=job_name,
        user_defined_k8s_config=user_defined_k8s_config,
        pod_name=pod_name,
        component='run_coordinator',
    )

    assert (
        yaml.dump(remove_none_recursively(job.to_dict()), default_flow_style=False).strip()
        == EXPECTED_CONFIGURED_JOB_SPEC.format(
            run_id=run.run_id,
            job_image=docker_image,
            image_pull_policy=image_pull_policy(),
            dagster_version=dagster_version,
            resources='''
        resources:
          limits:
            cpu: 500m
            memory: 2560Mi
          requests:
            cpu: 250m
            memory: 64Mi''',
            annotations='''annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: \'true\'''',
            affinity='''affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/e2e-az-name
                operator: In
                values:
                - e2e-az1
                - e2e-az2''',
        ).strip()
    )