def execute(self, context):
    """Execute the container for this task and return the Dagster events it emitted.

    The run id is taken from ``self.params["run_id"]`` when present, otherwise from the
    Airflow ``dag_run`` found in ``context``. When ``self.instance`` is set, a managed run
    is registered on it and every event parsed from the container output is forwarded to it.

    Args:
        context: The Airflow task context (read for ``dag_run`` and ``ts``).

    Returns:
        The list of ``dagster_event`` objects extracted from the execute-plan result.

    Raises:
        DagsterGraphQLClientError: Re-raised from ``handle_execution_errors`` after being
            reported to the instance (when one is configured).
    """
    # An explicit run_id param wins over the id carried by the Airflow DAG run.
    if "run_id" in self.params:
        self._run_id = self.params["run_id"]
    elif "dag_run" in context and context["dag_run"] is not None:
        self._run_id = context["dag_run"].run_id

    try:
        if self.instance:
            if "ts" in context:
                run_tags = {AIRFLOW_EXECUTION_DATE_STR: context.get("ts")}
            else:
                run_tags = {}

            managed_run = self.instance.register_managed_run(
                pipeline_name=self.pipeline_name,
                run_id=self.run_id,
                run_config=self.run_config,
                mode=self.mode,
                solids_to_execute=None,
                step_keys_to_execute=None,
                tags=run_tags,
                root_run_id=None,
                parent_run_id=None,
                pipeline_snapshot=self.pipeline_snapshot,
                execution_plan_snapshot=self.execution_plan_snapshot,
                parent_pipeline_snapshot=self.parent_pipeline_snapshot,
            )

        container_output = self.execute_raw(context)
        self.log.info("Finished executing container.")

        parsed_result = parse_raw_log_lines(container_output)

        try:
            handle_execution_errors(parsed_result, "executePlan")
        except DagsterGraphQLClientError as client_error:
            # Record the failure against the managed run before propagating it.
            if self.instance:
                self.instance.report_engine_event(
                    str(client_error),
                    managed_run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                    self.__class__,
                )
            raise

        raw_events = handle_execute_plan_result_raw(parsed_result)

        if self.instance:
            for raw_event in raw_events:
                self.instance.handle_new_event(raw_event)

        dagster_events = [raw_event.dagster_event for raw_event in raw_events]
        check_events_for_failures(dagster_events)
        check_events_for_skips(dagster_events)

        return dagster_events
    finally:
        # Always clear the per-execution run id, whether we returned or raised.
        self._run_id = None
def test_parse_raw_log_lines():
    '''Check that parse_raw_log_lines yields the JSON payload that follows extraneous lines.'''
    expected = {'data': {'executePlan': {'__typename': 'ExecutePlanSuccess'}}}

    # Two non-JSON lines precede the single JSON line carrying the result.
    lines = [
        'Some extraneous log line',
        'Another extraneous log line, JSON data is on the next line',
        '{"data": {"executePlan": {"__typename": "ExecutePlanSuccess"}}}',
    ]

    parsed = parse_raw_log_lines(lines)
    assert parsed == expected
def test_parse_raw_log_lines():
    """Verify parse_raw_log_lines returns the decoded JSON found after extraneous log lines."""
    noise = [
        "Some extraneous log line",
        "Another extraneous log line, JSON data is on the next line",
    ]
    payload_line = '{"data": {"executePlan": {"__typename": "ExecutePlanSuccess"}}}'
    expected_payload = {"data": {"executePlan": {"__typename": "ExecutePlanSuccess"}}}

    # The JSON line comes last, after the noise lines.
    assert parse_raw_log_lines(noise + [payload_line]) == expected_payload
def wait_for_job_and_get_logs(job_name, namespace):
    '''Block until the named job succeeds, then return the parsed logs of its single pod.

    Args:
        job_name (str): Name of the Kubernetes job to wait on.
        namespace (str): Namespace the job runs in.

    Returns:
        Whatever parse_raw_log_lines produces from the pod log lines.

    Raises:
        AssertionError: If the job did not launch exactly one pod.
    '''
    check.str_param(job_name, 'job_name')
    check.str_param(namespace, 'namespace')

    wait_for_job_success(job_name, namespace=namespace)

    launched_pods = get_pod_names_in_job(job_name, namespace)
    # Exactly one pod is expected per job.
    assert len(launched_pods) == 1

    log_text = retrieve_pod_logs(launched_pods[0], namespace=namespace)
    log_lines = log_text.split('\n')
    return parse_raw_log_lines(log_lines)
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    environment_dict,
    mode,
    pipeline_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    kubeconfig_file=None,
):
    '''Execute a single step inside a Kubernetes job and return its serialized events.

    Builds an executePlan GraphQL job for the one step in ``step_keys``, creates it in
    ``job_namespace``, waits for it to succeed, then parses the logs of its pods into
    Dagster events.

    Returns:
        A list of serialized events: the two engine events reported here followed by the
        step events parsed from the pod logs.
    '''
    from dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
    from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

    import kubernetes

    # --- argument validation -------------------------------------------------------------
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(environment_dict, 'environment_dict')
    check.str_param(mode, 'mode')
    check.str_param(pipeline_name, 'pipeline_name')
    check.str_param(run_id, 'run_id')

    # The job config arrives as a plain dict; rebuild the typed config object from it.
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # --- kubernetes client configuration -------------------------------------------------
    # For when launched via DinD or running the cluster
    if not load_incluster_config:
        kubernetes.config.load_kube_config(kubeconfig_file)
    else:
        kubernetes.config.load_incluster_config()

    # --- load the run --------------------------------------------------------------------
    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    joined_step_keys = ", ".join(step_keys)
    # validated above that step_keys is length 1, and it is not possible to use ETH or
    # execution plan in this function (Celery K8s workers should not access to user code)
    only_step_key = step_keys[0]

    # --- build the job -------------------------------------------------------------------
    # Ensure we stay below k8s name length limits
    name_suffix = _get_k8s_name_key(run_id, step_keys)
    requested_job_name = 'dagster-stepjob-%s' % name_suffix
    requested_pod_name = 'dagster-stepjob-%s' % name_suffix

    variables = construct_variables(mode, environment_dict, pipeline_name, run_id, step_keys)
    args = ['-p', 'executePlan', '-v', seven.json.dumps(variables)]

    job = construct_dagster_graphql_k8s_job(
        job_config, args, requested_job_name, requested_pod_name
    )

    # --- announce, launch, and wait ------------------------------------------------------
    start_metadata = [
        EventMetadataEntry.text(joined_step_keys, 'Step keys'),
        EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
        EventMetadataEntry.text(requested_pod_name, 'Kubernetes Pod name'),
        EventMetadataEntry.text(job_config.job_image, 'Job image'),
        EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
        EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
        EventMetadataEntry.text(str(job_config.service_account_name), 'Service account name'),
    ]
    start_event = instance.report_engine_event(
        'Executing steps {} in Kubernetes job {}'.format(joined_step_keys, job.metadata.name),
        pipeline_run,
        EngineEventData(start_metadata, marker_end=DELEGATE_MARKER),
        CeleryK8sJobEngine,
        step_key=only_step_key,
    )
    # Running list of events generated from this task execution
    events = [start_event]

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

    wait_for_job_success(job.metadata.name, namespace=job_namespace)
    pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

    # --- collect results -----------------------------------------------------------------
    retrieval_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobEngine,
        step_key=only_step_key,
    )
    events.append(retrieval_event)

    # Flatten the log lines of every pod in the job into one list.
    log_lines = [
        line
        for name in pod_names
        for line in retrieve_pod_logs(name, namespace=job_namespace).split('\n')
    ]

    result = parse_raw_log_lines(log_lines)
    handle_execution_errors(result, 'executePlan')
    events.extend(handle_execute_plan_result(result))

    return [serialize_dagster_namedtuple(event) for event in events]
def execute(self, context):
    '''Run the Docker container for this task and return the Dagster events it emitted.

    The run id comes from ``self.params['run_id']`` when present, otherwise from the
    Airflow ``dag_run`` in ``context``. When ``self.instance`` is set, a managed run is
    registered on it and every parsed event is forwarded to it.

    Args:
        context: The Airflow task context.

    Returns:
        The list of ``dagster_event`` objects extracted from the execute-plan result.

    Raises:
        AirflowException: If ``dagster_graphql.client.mutations`` cannot be imported.
        DagsterGraphQLClientError: Re-raised from ``handle_execution_errors`` after being
            reported to the instance (when one is configured).
    '''
    try:
        from dagster_graphql.client.mutations import (
            DagsterGraphQLClientError,
            handle_execution_errors,
            handle_execute_plan_result_raw,
        )

    except ImportError:
        raise AirflowException(
            'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
            'in your Airflow environment.'
        )

    # An explicit run_id param wins over the id carried by the Airflow DAG run.
    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    try:
        if self.instance:
            managed_run = self.instance.register_managed_run(
                pipeline_name=self.pipeline_name,
                run_id=self.run_id,
                environment_dict=self.environment_dict,
                mode=self.mode,
                solids_to_execute=None,
                step_keys_to_execute=None,
                tags=None,
                root_run_id=None,
                parent_run_id=None,
                pipeline_snapshot=self.pipeline_snapshot,
                execution_plan_snapshot=self.execution_plan_snapshot,
                parent_pipeline_snapshot=self.parent_pipeline_snapshot,
            )

        # Delegate the actual container run to the parent operator.
        container_output = super(DagsterDockerOperator, self).execute(context)
        self.log.info('Finished executing container.')

        parsed_result = parse_raw_log_lines(container_output)

        try:
            handle_execution_errors(parsed_result, 'executePlan')
        except DagsterGraphQLClientError as client_error:
            # Record the failure against the managed run before propagating it.
            if self.instance:
                self.instance.report_engine_event(
                    str(client_error),
                    managed_run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())
                    ),
                    self.__class__,
                )
            raise

        raw_events = handle_execute_plan_result_raw(parsed_result)

        if self.instance:
            for raw_event in raw_events:
                self.instance.handle_new_event(raw_event)

        dagster_events = [raw_event.dagster_event for raw_event in raw_events]
        check_events_for_failures(dagster_events)
        check_events_for_skips(dagster_events)

        return dagster_events
    finally:
        # Always clear the per-execution run id, whether we returned or raised.
        self._run_id = None
def execute(self, context):
    '''Launch the step pod on Kubernetes and return the Dagster events it emitted.

    The run id comes from ``self.params['run_id']`` when present, otherwise from the
    Airflow ``dag_run`` in ``context``. A pod is generated from the operator settings and
    run through a ``PodLauncher``; its logs are then read back and parsed into events.
    When ``self.instance`` is set, a managed run is registered on it and every parsed event
    is forwarded to it.

    Args:
        context: The Airflow task context (read for ``dag_run`` and ``ts``).

    Returns:
        The list of ``dagster_event`` objects extracted from the execute-plan result.

    Raises:
        AirflowException: If ``dagster_graphql.client.mutations`` cannot be imported, or
            wrapping any ``AirflowException`` raised while launching the pod.
        DagsterGraphQLClientError: Re-raised from ``handle_execution_errors`` after being
            reported to the instance (when one is configured).
    '''
    try:
        from dagster_graphql.client.mutations import (
            DagsterGraphQLClientError,
            handle_execution_errors,
            handle_execute_plan_result_raw,
        )

    except ImportError:
        raise AirflowException(
            'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
            ' installed in your Airflow environment.'
        )

    # An explicit run_id param wins over the id carried by the Airflow DAG run.
    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    # return to original execute code:
    try:
        client = kube_client.get_kube_client(
            in_cluster=self.in_cluster,
            cluster_context=self.cluster_context,
            config_file=self.config_file,
        )
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.query(context.get('ts')),
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context

        launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.xcom_push)
        try:
            if self.instance:
                tags = (
                    {AIRFLOW_EXECUTION_DATE_STR: context.get('ts')} if 'ts' in context else {}
                )

                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    run_config=self.run_config,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=tags,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            # we won't use the "result", which is the pod's xcom json file
            (final_state, _) = launcher.run_pod(
                pod, startup_timeout=self.startup_timeout_seconds, get_logs=self.get_logs
            )

            # fetch the last line independently of whether logs were read
            # unbelievably, if you set tail_lines=1, the returned json has its double quotes
            # turned into unparseable single quotes
            res = None
            num_attempts = 0
            while not res and num_attempts < LOG_RETRIEVAL_MAX_ATTEMPTS:
                raw_res = client.read_namespaced_pod_log(
                    name=pod.name, namespace=pod.namespace, container='base'
                )
                res = parse_raw_log_lines(raw_res.split('\n'))
                num_attempts += 1
                # Only wait when another attempt may be needed; a successful parse should
                # not delay the task.
                if not res:
                    time.sleep(LOG_RETRIEVAL_WAITS_BETWEEN_ATTEMPTS_SEC)

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError as err:
                # Without an instance there is no registered run to report against; guard
                # so the original error is what propagates.
                if self.instance:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        self.__class__,
                    )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)
            return events

        finally:
            # Always clear the per-execution run id, whether we returned or raised.
            self._run_id = None

            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        # NOTE(review): the inner try block above always returns or raises, so this check
        # looks unreachable as written -- confirm whether it was meant to run before the
        # logs are parsed.
        if final_state != State.SUCCESS:
            raise AirflowException('Pod returned a failure: {state}'.format(state=final_state))
        # note the lack of returning the default xcom
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    resources=None,
    kubeconfig_file=None,
):
    '''Execute a single step inside a Kubernetes job and return its serialized events.

    Builds an executePlan GraphQL job for the one step in ``step_keys``, creates it in
    ``job_namespace``, waits for it to succeed, then parses the logs of its pods into
    Dagster events.

    Returns:
        A list of serialized events (the engine events reported here followed by the step
        events parsed from the pod logs), or ``None`` when the step is not scheduled or the
        job is terminated because the pipeline run is not in the STARTED state.
    '''
    # --- argument validation -------------------------------------------------------------
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # The job config arrives as a plain dict; rebuild the typed config object from it.
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')
    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # --- kubernetes client configuration -------------------------------------------------
    # For when launched via DinD or running the cluster
    if not load_incluster_config:
        kubernetes.config.load_kube_config(kubeconfig_file)
    else:
        kubernetes.config.load_incluster_config()

    # --- load the run --------------------------------------------------------------------
    instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    # validated above that step_keys is length 1, and it is not possible to use ETH or
    # execution plan in this function (Celery K8s workers should not access to user code)
    step_key = step_keys[0]

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # --- build the job -------------------------------------------------------------------
    # Ensure we stay below k8s name length limits
    k8s_name_key = _get_k8s_name_key(run_id, step_keys)

    retries = Retries.from_config(retries_dict)

    # A non-zero attempt count is appended to the names.
    attempt_number = retries.get_attempt_count(step_key)
    if attempt_number:
        requested_job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        requested_job_name = 'dagster-job-%s' % (k8s_name_key)
        pod_name = 'dagster-job-%s' % (k8s_name_key)

    if pipeline_run.solids_to_execute:
        solid_selection = list(pipeline_run.solids_to_execute)
    else:
        solid_selection = None

    selector = {
        'repositoryLocationName': repo_location_name,
        'repositoryName': repo_name,
        'pipelineName': pipeline_run.pipeline_name,
        'solidSelection': solid_selection,
    }
    execution_params = {
        'runConfigData': run_config,
        'mode': mode,
        'selector': selector,
        'executionMetadata': {'runId': run_id},
        'stepKeys': step_keys,
    }
    variables = {
        'executionParams': execution_params,
        'retries': retries.to_graphql_input(),
    }
    args = ['-p', 'executePlan', '-v', seven.json.dumps(variables), '--remap-sigterm']

    job = construct_dagster_graphql_k8s_job(
        job_config, args, requested_job_name, resources, pod_name
    )

    # --- announce and launch -------------------------------------------------------------
    # From here on, use the name carried by the constructed job object.
    job_name = job.metadata.name
    start_metadata = [
        EventMetadataEntry.text(step_key, 'Step keys'),
        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
        EventMetadataEntry.text(job_config.job_image, 'Job image'),
        EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
        EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
        EventMetadataEntry.text(str(job_config.service_account_name), 'Service account name'),
    ]
    start_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(start_metadata, marker_end=DELEGATE_MARKER),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    # Running list of events generated from this task execution
    events = [start_event]

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

    # --- wait for completion -------------------------------------------------------------
    try:
        wait_for_job_success(
            job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        termination_metadata = [
            EventMetadataEntry.text(step_key, 'Step keys'),
            EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
            EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
        ]
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(termination_metadata),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    # --- collect results -----------------------------------------------------------------
    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    retrieval_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(retrieval_event)

    # Flatten the log lines of every pod in the job into one list.
    log_lines = [
        line
        for name in pod_names
        for line in retrieve_pod_logs(name, namespace=job_namespace).split('\n')
    ]

    result = parse_raw_log_lines(log_lines)
    handle_execution_errors(result, 'executePlan')
    events.extend(handle_execute_plan_result(result))

    return [serialize_dagster_namedtuple(event) for event in events]