def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context,
                                             config_file=self.config_file)
        gen = pod_generator.PodGenerator()

        for port in self.ports:
            gen.add_port(port)

        for mount in self.volume_mounts:
            gen.add_mount(mount)

        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context
        pod.pod_runtime_info_envs = self.pod_runtime_info_envs
        pod.dnspolicy = self.dnspolicy

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        try:
            (final_state, result) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)
        finally:
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        if self.xcom_push:
            return result
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
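The execute() above is the body of Airflow's contrib KubernetesPodOperator. A minimal usage sketch, assuming the Airflow 1.10 contrib import path and a `dag` object defined elsewhere; every keyword corresponds to an attribute read by the method above:

from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

run_in_pod = KubernetesPodOperator(
    task_id='print-hello',
    name='print-hello',
    namespace='default',
    image='python:3.7-slim',
    cmds=['python', '-c'],
    arguments=['print("hello from the pod")'],
    labels={'app': 'example'},
    get_logs=True,                # stream pod logs into the task log
    in_cluster=True,              # use the worker pod's service-account config
    is_delete_operator_pod=True,  # clean the pod up in the finally block above
    xcom_push=False,              # skip the xcom sidecar container
    dag=dag,
)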
def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster)
        gen = pod_generator.PodGenerator()
        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels)
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        launcher = pod_launcher.PodLauncher(client)
        final_state = launcher.run_pod(
            pod,
            startup_timeout=self.startup_timeout_seconds,
            get_logs=self.get_logs)
        if final_state != State.SUCCESS:
            raise AirflowException('Pod returned a failure')
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
def infrastructure(self):
    kube_client = get_kube_client()
    namespace = configuration.get("kubernetes", "namespace")
    headers = {
        "Authorization": kube_client.api_client.configuration.get_api_key_with_prefix(
            'authorization')
    }
    url = "{0}/oapi/v1/namespaces/{1}/deploymentconfigs".format(
        kube_client.api_client.configuration.host, namespace)
    response = requests.get(
        url,
        headers=headers,
        params={"labelSelector": self.AIRFLOW_LABEL},
        verify=kube_client.api_client.configuration.ssl_ca_cert)
    if response.status_code == 200:
        deployment_configs = response.json()
    else:
        return abort(response.status_code)
    try:
        pods = kube_client.list_namespaced_pod(
            namespace=namespace, label_selector=self.AIRFLOW_LABEL)
        pods = pods.to_dict()
    except ApiException:
        return abort(401)
    return jsonify(
        self.to_graph(namespace=namespace,
                      oc_deployment_configs=deployment_configs,
                      oc_pods=pods))
def pod(self, pod):
    task_instances = self.get_task_instances(pod=pod)
    if not task_instances:
        abort(404)
    kube_client = get_kube_client()
    namespace = configuration.get("kubernetes", "namespace")
    client = elasticsearch.Elasticsearch([ELASTICSEARCH_HOST])
    count = Search(using=client) \
        .query('match', **{"beat.hostname": pod}) \
        .sort('offset') \
        .count()
    task_instance = task_instances
    try:
        pod = kube_client.read_namespaced_pod(name=pod, namespace=namespace)
        pod = self.pod_info(oc_pod=pod.to_dict(),
                            task_instance=task_instance,
                            namespace=namespace,
                            host=self.HOST)
    except ApiException:
        pod = self.pod_info(
            oc_pod=self.get_pod_from_log_info(task_instance=task_instance),
            task_instance=task_instance,
            namespace=namespace,
            host=self.HOST)
    pod['log'] = {"count": count}
    return jsonify(pod)
def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster)
        gen = pod_generator.PodGenerator()
        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels)
        pod.secrets = self.secrets
        launcher = pod_launcher.PodLauncher(client)
        final_state = launcher.run_pod(
            pod,
            startup_timeout=self.startup_timeout_seconds,
            get_logs=self.get_logs)
        if final_state != State.SUCCESS:
            raise AirflowException('Pod returned a failure')
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
def on_kill(self):
    self.log.debug("Kill Command is being called")

    if self._should_track_driver_status:
        if self._driver_id:
            self.log.info("Killing driver {} on cluster".format(self._driver_id))
            kill_cmd = self._build_spark_driver_kill_command()
            driver_kill = subprocess.Popen(kill_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            self.log.info("Spark driver {} killed with return code: {}".format(
                self._driver_id, driver_kill.wait()))

    if self._submit_sp and self._submit_sp.poll() is None:
        self.log.info("Sending kill signal to %s", self._connection["spark_binary"])
        self._submit_sp.kill()

        if self._yarn_application_id:
            self.log.info("Killing application {} on YARN".format(
                self._yarn_application_id))
            kill_cmd = "yarn application -kill {}".format(
                self._yarn_application_id).split()
            yarn_kill = subprocess.Popen(kill_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
            self.log.info("YARN killed with return code: %s", yarn_kill.wait())

        if self._kubernetes_driver_pod:
            self.log.info("Killing pod %s on Kubernetes",
                          self._kubernetes_driver_pod)
            # Currently only instantiate Kubernetes client for killing a spark pod.
            try:
                import kubernetes
                client = kube_client.get_kube_client()
                api_response = client.delete_namespaced_pod(
                    self._kubernetes_driver_pod,
                    self._connection["namespace"],
                    body=kubernetes.client.V1DeleteOptions(),
                    pretty=True)
                self.log.info("Spark on K8s killed with response: %s", api_response)
            except kube_client.ApiException as e:
                self.log.info("Exception when attempting to kill Spark on K8s:")
                self.log.exception(e)
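For reference, `_build_spark_driver_kill_command` (not shown in this collection) assembles the argument list passed to subprocess.Popen above. A sketch under the assumption that the hook's `_connection` dict carries the spark binary and master URL, mirroring the layout used elsewhere in this on_kill:

def _build_spark_driver_kill_command(self):
    # e.g. ['spark-submit', '--master', 'spark://host:6066',
    #       '--kill', 'driver-20200101000000-0000']
    connection_cmd = [self._connection['spark_binary']]
    connection_cmd += ['--master', self._connection['master']]
    connection_cmd += ['--kill', self._driver_id]
    self.log.debug("Spark-Kill cmd: %s", connection_cmd)
    return connection_cmd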
def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context,
                                             config_file=self.config_file)
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)

        for volume in self.volumes:
            gen.add_volume(volume)

        if self.in_cluster:
            worker_pod_name = os.environ.get('WORKER_POD_NAME')
            worker_pod_uid = os.environ.get('WORKER_POD_UID')
            gen.add_ownerreference(worker_pod_name, "v1", "Pod", worker_pod_uid)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.image_pull_secrets = self.image_pull_secrets
        pod.hostnetwork = self.hostnetwork

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        (final_state, result) = launcher.run_pod(
            pod,
            startup_timeout=self.startup_timeout_seconds,
            get_logs=self.get_logs)
        # if self.is_delete_operator_pod:
        #     launcher.delete_pod(pod)
        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        if self.xcom_push:
            return result
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
def cluster_view(self):
    kube_client = get_kube_client()
    namespace = configuration.get("kubernetes", "namespace")
    pods = kube_client.list_namespaced_pod(
        namespace=namespace, label_selector=self.AIRFLOW_LABEL)
    return self.render("openshift_cluster_view.html",
                       namespace=namespace,
                       pods=pods)
def log_pod_creation(self, pod: Pod, resp, session=None):
    from openshift_plugin.executor.airflow_openshift_scheduler import AirflowOpenShiftScheduler
    execution_date = AirflowOpenShiftScheduler.label_safe_datestring_to_datetime(
        pod.labels['execution_date'])
    task_instance = session.query(TaskInstance) \
        .filter(TaskInstance.dag_id == pod.labels['dag_id']) \
        .filter(TaskInstance.task_id == pod.labels['task_id']) \
        .filter(TaskInstance.execution_date == execution_date).first()

    if not task_instance:
        self.log.error(
            "Could not find task instance based on the pod labels"
            " ({dag_id} {task_id} {execution_date} {try_number})".format(
                **pod.labels))
        self.log.error("Log information will be incomplete. This is a BUG please report!!!")

    def default(o):
        if isinstance(o, (datetime.date, datetime.datetime)):
            return o.isoformat()

    kube_client = get_kube_client()
    headers = {
        "Authorization": kube_client.api_client.configuration.get_api_key_with_prefix(
            'authorization')
    }
    url = "{0}/apis/image.openshift.io/v1/namespaces/{1}/imagestreamtags/{2}".format(
        kube_client.api_client.configuration.host,
        pod.image.split("/")[-2],
        quote_plus(pod.image.split("/")[-1]))
    response = requests.get(url,
                            headers=headers,
                            verify=kube_client.api_client.configuration.ssl_ca_cert)
    resp = resp.to_dict()
    if response.status_code == 200:
        image_reference = response.json()
        resp['spec']['containers'][0]['image'] = image_reference["tag"]["from"]["name"]
    else:
        image_reference = None

    log = Log(
        event=OpenShiftPodLauncer.EVENT_POD_CREATION,
        dag_id=task_instance.dag_id,
        task_instance=None,
        task_id=task_instance.task_id,
        execution_date=task_instance.execution_date,
        extra=json.dumps(
            {
                "request": self.kube_req_factory.create(pod),
                "response": resp,
                "image": image_reference
            },
            default=default)
    )
    session.add(log)
    session.commit()
def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context,
                                             config_file=self.config_file)
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)

        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.do_xcom_push)
        try:
            (final_state, result) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)
        finally:
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        return result
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
def run(self):
    kube_client = get_kube_client()
    while True:
        try:
            self.resource_version = self._run(kube_client,
                                              self.resource_version,
                                              self.worker_uuid)
        except Exception:
            self.log.exception('Unknown error in KubernetesJobWatcher. Failing')
            raise
        else:
            self.log.warn('Watch died gracefully, starting back up with: '
                          'last resource_version: %s', self.resource_version)
def get_image_url(self, host, orig_namespace, image_reference: str):
    if "kibana" in image_reference:
        return "https://www.docker.elastic.co"
    if "postgresql" in image_reference:
        return None
    kube_client = get_kube_client()
    image_reference_split = image_reference.split("/")
    image_hash = image_reference_split[-1]
    if len(image_reference_split) >= 2:
        namespace = image_reference_split[-2]
    else:
        namespace = orig_namespace
    if "@" in image_hash:
        image_stream, image_sha256 = image_hash.split("@")
        headers = {
            "Authorization": kube_client.api_client.configuration.get_api_key_with_prefix(
                'authorization')
        }
        url = "{0}/oapi/v1/namespaces/{1}/imagestreams/{2}".format(
            kube_client.api_client.configuration.host, namespace, image_stream)
        response = requests.get(
            url,
            headers=headers,
            params={"labelSelector": self.AIRFLOW_LABEL},
            verify=kube_client.api_client.configuration.ssl_ca_cert)
        if response.status_code == 200:
            image = response.json()
        else:
            return None
        image_tag_name = None
        for image_tag in image['spec']['tags']:
            if image_tag['from']['name'] == image_hash:
                image_tag_name = image_tag['name']
                break
    else:
        image_stream, image_tag_name = image_hash.split(":")
    if image_tag_name:
        return "{0}/console/project/{1}/browse/images/{2}/{3}?tab=body".format(
            host, namespace, image_stream, image_tag_name)
    else:
        return None
def get_image_dag_info(self):
    client = self.kube_client or get_kube_client()
    launcher = PodLauncher(kube_client=client)
    pod = self.create_sync_pod()
    status, result = launcher.run_pod(pod, get_logs=False)
    logs = client.read_namespaced_pod_log(name=pod.name,
                                          namespace=pod.namespace,
                                          container='base',
                                          follow=True,
                                          _preload_content=False)
    launcher.delete_pod(pod)
    return status, logs.data, pod
def extract_env_and_secrets(pod, req):
    KubernetesRequestFactory.extract_env_and_secrets(pod, req)
    env = req['spec']['containers'][0]['env'].copy()
    env = [i for i in env if not i["name"].startswith("AIRFLOW")]
    kube_client = get_kube_client()
    if configuration.conf.getboolean("kubernetes", "in_cluster"):
        pod_config = kube_client.read_namespaced_pod(
            name=os.getenv("HOSTNAME"),
            namespace=configuration.conf.get("kubernetes", "namespace"))
    else:
        pods = kube_client.list_namespaced_pod(
            namespace=configuration.conf.get("kubernetes", "namespace"),
            label_selector="component=airflow-scheduler")
        pod_config = pods.items[0]
    self_env = [
        convert_dict_key_case(e.to_dict())
        for e in pod_config.spec.containers[0].env
        if e.name not in ["AIRFLOW_COMMAND", "AIRFLOW_EXECUTOR"]
    ]
    self_env.append({"name": "AIRFLOW_EXECUTOR", "value": "LocalExecutor"})
    if configuration.conf.has_option("core", "worker_logging_level"):
        self_env.append({
            "name": "AIRFLOW__CORE__LOGGING_LEVEL",
            "value": configuration.conf.get("core", "worker_logging_level")
        })
        self_env.append({
            "name": "AIRFLOW__CORE__FAB_LOGGING_LEVEL",
            "value": configuration.conf.get("core", "worker_logging_level")
        })
    req['spec']['containers'][0]['env'] = self_env + env
    self_env_from = [
        convert_dict_key_case(e.to_dict())
        for e in pod_config.spec.containers[0].env_from
    ]
    req['spec']['containers'][0]['envFrom'] = self_env_from
def authorize(oauth_app, authorized_response, user_info):
    with open('/run/secrets/kubernetes.io/serviceaccount/namespace', 'r') as file:
        namespace = file.read()
    kube_client = get_kube_client()
    url = "{0}/apis/rbac.authorization.k8s.io/v1beta1/namespaces/{1}/rolebindings".format(
        kube_client.api_client.configuration.host, namespace)
    response = requests.get(
        url,
        headers={
            "Authorization": "Bearer {0}".format(oauth_app.consumer_secret)
        },
        verify=kube_client.api_client.configuration.ssl_ca_cert
        if kube_client.api_client.configuration.ssl_ca_cert else False)
    if response.status_code != 200:
        LoggingMixin().log.error(
            "The service account providing OAuth is not allowed to list rolebindings. Denying "
            "access to everyone!!!")
        return False, False
    role_binding_list = response.json()
    allowed_roles = []
    for role in role_binding_list['items']:
        def predicate(subject):
            if subject['kind'] in ['ServiceAccount', 'User']:
                return subject['name'] == user_info['metadata']['name']
            elif subject['kind'] == 'Group':
                return subject['name'] in user_info['groups']
        name = role['roleRef']['name']
        if next((x for x in role['subjects'] if predicate(x)), None):
            allowed_roles.append(name)
    allowed_roles = set(allowed_roles)
    access_roles = set(
        configuration.conf.get('openshift_plugin', 'access_roles').split(','))
    superuser_roles = set(
        configuration.conf.get('openshift_plugin', 'superuser_roles').split(','))
    return bool(allowed_roles & access_roles), \
        bool(allowed_roles & superuser_roles)
def start(self):
    self.log.info('Start Kubernetes executor')
    self.worker_uuid = KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid()
    self.log.debug('Start with worker_uuid: %s', self.worker_uuid)
    # always need to reset resource version since we don't know
    # when we last started, note for behavior below
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs
    # /CoreV1Api.md#list_namespaced_pod
    KubeResourceVersion.reset_resource_version()
    self.task_queue = Queue()
    self.result_queue = Queue()
    self.kube_client = get_kube_client()
    self.kube_scheduler = AirflowKubernetesScheduler(
        self.kube_config, self.task_queue, self.result_queue,
        self.kube_client, self.worker_uuid)
    self._inject_secrets()
    self.clear_not_launched_queued_tasks()
def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context,
                                             config_file=self.config_file)
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)

        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        (final_state, result) = launcher.run_pod(
            pod,
            startup_timeout=self.startup_timeout_seconds,
            get_logs=self.get_logs)
        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        if self.xcom_push:
            return result
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
def post_sync(self):
    dag_tar, pod = self.get_dag_tag()
    kube_client = get_kube_client()
    headers = {
        "Authorization": kube_client.api_client.configuration.get_api_key_with_prefix(
            'authorization')
    }
    url = "{0}/apis/image.openshift.io/v1/namespaces/{1}/imagestreamtags/{2}".format(
        kube_client.api_client.configuration.host,
        pod.image.split("/")[-2],
        quote_plus(pod.image.split("/")[-1]))
    response = requests.get(
        url,
        headers=headers,
        verify=kube_client.api_client.configuration.ssl_ca_cert)
    if response.status_code == 200:
        image = response.json()
    else:
        image = None
    echo = subprocess.Popen(("echo", dag_tar.decode("utf-8")),
                            stdout=subprocess.PIPE)
    base64 = subprocess.Popen(("base64", "-d"),
                              stdin=echo.stdout,
                              stdout=subprocess.PIPE)
    output = subprocess.check_output(("tar", "-tzv"), stdin=base64.stdout)
    echo.wait()
    return self.render(
        "openshift_worker_image_sync_view.html",
        files=self.parse_tar_list(output),
        image=image,
        image_url="{0}/console/project/dsi-test/browse/images/{1}/{2}?tab=body".format(
            OpenshiftClusterView.HOST,
            image['metadata']['name'].split(":")[0],
            image['tag']['name']))
def execute(self, context):
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context)
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)

        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity

        launcher = pod_launcher.PodLauncher(kube_client=client)
        final_state = launcher.run_pod(
            pod,
            startup_timeout=self.startup_timeout_seconds,
            get_logs=self.get_logs)
        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
def run(self): """Performs watching""" kube_client = get_kube_client() while True: try: self.resource_version = self._run(kube_client, self.resource_version, self.worker_uuid, self.kube_config) except ReadTimeoutError: self.log.warning( "There was a timeout error accessing the Kube API. " "Retrying request.", exc_info=True) time.sleep(1) except Exception: self.log.exception( 'Unknown error in KubernetesJobWatcher. Failing') raise else: self.log.warning( 'Watch died gracefully, starting back up with: ' 'last resource_version: %s', self.resource_version)
def on_kill(self):
    self.log.info("Kill Command is being called")
    if self._is_ssh:
        if self._dataeng_spark:
            SSHOperator(task_id='_kill_task',
                        command=f'kill -TERM $(cat {self.pidfile})',
                        ssh_conn_id=self._ssh_conn_id).execute(context=None)
            self.log.info("on_kill is finished")
        elif self._is_yarn:
            self.log.info('Killing application {} on YARN'.format(
                self._yarn_application_id))
            kill_cmd = "yarn application -kill {}".format(self._yarn_application_id)
            self.log.info('Killing via ssh command: {}'.format(kill_cmd))
            SSHOperator(task_id='_kill_spark',
                        ssh_conn_id=self._ssh_conn_id,
                        command=kill_cmd).execute(None)
            self.log.info("YARN killed")

    if self._should_track_driver_status:
        if self._driver_id:
            self.log.info('Killing driver {} on cluster'.format(self._driver_id))
            kill_cmd = self._build_spark_driver_kill_command()
            if self._is_ssh:
                ssh_kill_command = " ".join(kill_cmd)
                self.log.info('Killing via ssh command: {}'.format(ssh_kill_command))
                SSHOperator(task_id='_kill_spark',
                            ssh_conn_id=self._ssh_conn_id,
                            command=ssh_kill_command).execute(None)
                self.log.info("Spark driver {} killed".format(self._driver_id))
            else:
                driver_kill = subprocess.Popen(kill_cmd,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                self.log.info("Spark driver {} killed with return code: {}".format(
                    self._driver_id, driver_kill.wait()))

    if self._submit_sp and self._submit_sp.poll() is None:
        self.log.info('Sending kill signal to %s', self._connection['spark_binary'])
        self._submit_sp.kill()

        if self._yarn_application_id:
            self.log.info('Killing application {} on YARN'.format(
                self._yarn_application_id))
            kill_cmd = "yarn application -kill {}".format(
                self._yarn_application_id).split()
            self.log.info('Killing via ssh command: {}'.format(kill_cmd))
            yarn_kill = subprocess.Popen(kill_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
            self.log.info("YARN killed with return code: %s", yarn_kill.wait())

        if self._kubernetes_driver_pod:
            self.log.info('Killing pod %s on Kubernetes',
                          self._kubernetes_driver_pod)
            # Currently only instantiate Kubernetes client for killing a spark pod.
            try:
                import kubernetes
                client = kube_client.get_kube_client()
                # the delete options come from the kubernetes module, not the
                # CoreV1Api client returned by get_kube_client()
                api_response = client.delete_namespaced_pod(
                    self._kubernetes_driver_pod,
                    self._connection['namespace'],
                    body=kubernetes.client.V1DeleteOptions(),
                    pretty=True)
                self.log.info("Spark on K8s killed with response: %s", api_response)
            except kube_client.ApiException as e:
                self.log.info("Exception when attempting to kill Spark on K8s:")
                self.log.exception(e)
def execute(self, context):
    (pod_template, deployment) = get_pod_template_from_deployment_labels_and_namespace(
        namespace=self.deployment_namespace or self.namespace,
        config_file=self.config_file,
        cluster_context=self.cluster_context,
        in_cluster=self.in_cluster,
        fields=self.deployment_fields,
        labels=self.deployment_labels,
    )
    pod_spec: V1PodSpec = pod_template.spec
    container: V1Container = pod_spec.containers[0]
    metadata: V1ObjectMeta = pod_template.metadata
    (plain_env_vars,
     container_secrets,
     container_config_maps,
     runtime_info_envs) = handle_container_environment_variables(container.env)

    self.image = self.image or container.image
    self.cmds = self.cmds or container.command
    self.arguments = self.arguments or container.args or []
    self.labels = self.labels or metadata.labels or {}
    self.name = self._set_name(self.name or deployment.metadata.name)
    self.env_vars = self.env_vars or plain_env_vars
    self.ports = self.ports or convert_ports(container)
    self.volume_mounts = self.volume_mounts or convert_volume_mounts(container)
    self.volumes = self.volumes or convert_volumes(pod_spec)
    self.secrets = self.secrets or container_secrets
    self.image_pull_policy = (self.image_pull_policy
                              or container.image_pull_policy
                              or "IfNotPresent")
    self.node_selectors = self.node_selectors or pod_spec.node_selector or {}
    self.annotations = self.annotations or metadata.annotations or {}
    self.affinity = self.affinity or convert_affinity(pod_spec)
    self.resources = (self.resources
                      if (self.resources.has_limits() or self.resources.has_requests())
                      else convert_resources(container))
    self.image_pull_secrets = self.image_pull_secrets or convert_image_pull_secrets(pod_spec)
    self.service_account_name = (self.service_account_name
                                 or pod_spec.service_account_name
                                 or pod_spec.service_account
                                 or "default")
    self.hostnetwork = (pod_spec.host_network or False
                        if self.hostnetwork is None else self.hostnetwork)
    self.tolerations = self.tolerations or convert_tolerations(pod_spec)
    self.configmaps = self.configmaps or container_config_maps
    self.security_context = self.security_context or convert_security_context(pod_spec)
    self.pod_runtime_info_envs = self.pod_runtime_info_envs or runtime_info_envs
    self.dnspolicy = self.dnspolicy or pod_spec.dns_policy
    self.log.info("volumes %s", self.volumes)

    try:
        if self.in_cluster is not None:
            client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                                 cluster_context=self.cluster_context,
                                                 config_file=self.config_file)
        else:
            client = kube_client.get_kube_client(cluster_context=self.cluster_context,
                                                 config_file=self.config_file)

        # Add Airflow Version to the label
        # And a label to identify that pod is launched by KubernetesPodOperator
        self.labels.update({
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_pod_operator': 'True',
        })

        gen = pod_generator.PodGenerator()

        for port in self.ports:
            gen.add_port(port)
        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context
        pod.pod_runtime_info_envs = self.pod_runtime_info_envs
        pod.dnspolicy = self.dnspolicy

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.do_xcom_push)
        # monkey patch to avoid https://github.com/apache/airflow/issues/8275
        launcher.kube_req_factory.extract_env_and_secrets = extract_env_and_secrets
        try:
            (final_state, result) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)
        finally:
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        if self.do_xcom_push:
            return result
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
def execute(self, context):
    try:
        from dagster_graphql.client.mutations import (
            handle_start_pipeline_execution_errors,
            handle_start_pipeline_execution_result,
        )
    except ImportError:
        raise AirflowException(
            'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
            ' installed in your Airflow environment.')

    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    # return to original execute code:
    try:
        client = kube_client.get_kube_client(
            in_cluster=self.in_cluster,
            cluster_context=self.cluster_context,
            config_file=self.config_file,
        )
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.query,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        try:
            # we won't use the "result", which is the pod's xcom json file
            (final_state, _) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)

            # fetch the last line independently of whether logs were read
            # unbelievably, if you set tail_lines=1, the returned json has its double quotes
            # turned into unparseable single quotes
            # TODO: add retries - k8s log servers are _extremely_ flaky
            raw_res = client.read_namespaced_pod_log(name=pod.name,
                                                     namespace=pod.namespace,
                                                     container='base',
                                                     tail_lines=5)

            # find the relevant line
            # TODO: raise sensible exception on garbage API string responses
            res = parse_raw_res(raw_res)

            handle_start_pipeline_execution_errors(res)
            events = handle_start_pipeline_execution_result(res)

            check_events_for_skips(events)

            return events
        finally:
            self._run_id = None
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        # note the lack of returning the default xcom
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
def execute(self, context):
    try:
        conf = context['dag_run'].conf
        self.image = conf.get('docker_image_name')
        if conf.get("commands"):
            self.cmds = conf.get("commands")
        if conf.get("arguments"):
            self.arguments = conf.get("arguments")
        if conf.get("env_vars"):
            self.env_vars = conf.get("env_vars")
    except Exception as e:
        raise XKubernetesPodOperatorException(
            "Could not start off with Dag Run Configuration", e)
    try:
        client = kube_client.get_kube_client(in_cluster=self.in_cluster,
                                             cluster_context=self.cluster_context,
                                             config_file=self.config_file)
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.arguments,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        (final_state, result) = launcher.run_pod(
            pod,
            startup_timeout=self.startup_timeout_seconds,
            get_logs=self.get_logs)
        if self.is_delete_operator_pod:
            launcher.delete_pod(pod)
        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        if self.xcom_push:
            return result
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
def execute(self, context):
    try:
        from dagster_graphql.implementation.pipeline_execution_manager import (
            build_synthetic_pipeline_error_record,
        )
        from dagster_graphql.client.mutations import (
            DagsterGraphQLClientError,
            handle_execution_errors,
            handle_execute_plan_result_raw,
        )
    except ImportError:
        raise AirflowException(
            'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
            ' installed in your Airflow environment.')

    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    # return to original execute code:
    try:
        client = kube_client.get_kube_client(
            in_cluster=self.in_cluster,
            cluster_context=self.cluster_context,
            config_file=self.config_file,
        )
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.query,
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        try:
            if self.instance:
                self.instance.get_or_create_run(
                    PipelineRun(
                        pipeline_name=self.pipeline_name,
                        run_id=self.run_id,
                        environment_dict=self.environment_dict,
                        mode=self.mode,
                        selector=ExecutionSelector(self.pipeline_name),
                        reexecution_config=None,
                        step_keys_to_execute=None,
                        tags=None,
                        status=PipelineRunStatus.MANAGED,
                    ))

            # we won't use the "result", which is the pod's xcom json file
            (final_state, _) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)

            # fetch the last line independently of whether logs were read
            # unbelievably, if you set tail_lines=1, the returned json has its double quotes
            # turned into unparseable single quotes
            # TODO: add retries - k8s log servers are _extremely_ flaky
            raw_res = client.read_namespaced_pod_log(name=pod.name,
                                                     namespace=pod.namespace,
                                                     container='base',
                                                     tail_lines=5)
            res = parse_raw_res(raw_res.split('\n'))

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError:
                event = build_synthetic_pipeline_error_record(
                    self.run_id,
                    serializable_error_info_from_exc_info(sys.exc_info()),
                    self.pipeline_name,
                )
                if self.instance:
                    self.instance.handle_new_event(event)
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            check_raw_events_for_skips(events)

            return events
        finally:
            self._run_id = None
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        # note the lack of returning the default xcom
    except AirflowException as ex:
        raise AirflowException(
            'Pod Launching failed: {error}'.format(error=ex))
def execute(self, context):
    try:
        from dagster_graphql.client.mutations import (
            DagsterGraphQLClientError,
            handle_execution_errors,
            handle_execute_plan_result_raw,
        )
    except ImportError:
        raise AirflowException(
            'To use the DagsterKubernetesPodOperator, dagster and dagster_graphql must be'
            ' installed in your Airflow environment.')

    if 'run_id' in self.params:
        self._run_id = self.params['run_id']
    elif 'dag_run' in context and context['dag_run'] is not None:
        self._run_id = context['dag_run'].run_id

    # return to original execute code:
    try:
        client = kube_client.get_kube_client(
            in_cluster=self.in_cluster,
            cluster_context=self.cluster_context,
            config_file=self.config_file,
        )
        gen = pod_generator.PodGenerator()

        for mount in self.volume_mounts:
            gen.add_mount(mount)
        for volume in self.volumes:
            gen.add_volume(volume)

        pod = gen.make_pod(
            namespace=self.namespace,
            image=self.image,
            pod_id=self.name,
            cmds=self.cmds,
            arguments=self.query(context.get('ts')),
            labels=self.labels,
        )

        pod.service_account_name = self.service_account_name
        pod.secrets = self.secrets
        pod.envs = self.env_vars
        pod.image_pull_policy = self.image_pull_policy
        pod.image_pull_secrets = self.image_pull_secrets
        pod.annotations = self.annotations
        pod.resources = self.resources
        pod.affinity = self.affinity
        pod.node_selectors = self.node_selectors
        pod.hostnetwork = self.hostnetwork
        pod.tolerations = self.tolerations
        pod.configmaps = self.configmaps
        pod.security_context = self.security_context

        launcher = pod_launcher.PodLauncher(kube_client=client,
                                            extract_xcom=self.xcom_push)
        try:
            if self.instance:
                tags = ({AIRFLOW_EXECUTION_DATE_STR: context.get('ts')}
                        if 'ts' in context else {})
                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    run_config=self.run_config,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=tags,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            # we won't use the "result", which is the pod's xcom json file
            (final_state, _) = launcher.run_pod(
                pod,
                startup_timeout=self.startup_timeout_seconds,
                get_logs=self.get_logs)

            # fetch the last line independently of whether logs were read
            # unbelievably, if you set tail_lines=1, the returned json has its double quotes
            # turned into unparseable single quotes
            res = None
            num_attempts = 0
            while not res and num_attempts < LOG_RETRIEVAL_MAX_ATTEMPTS:
                raw_res = client.read_namespaced_pod_log(name=pod.name,
                                                         namespace=pod.namespace,
                                                         container='base')
                res = parse_raw_log_lines(raw_res.split('\n'))
                time.sleep(LOG_RETRIEVAL_WAITS_BETWEEN_ATTEMPTS_SEC)
                num_attempts += 1

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError as err:
                self.instance.report_engine_event(
                    str(err),
                    run,
                    EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info())),
                    self.__class__,
                )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)

            return events
        finally:
            self._run_id = None
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)

        if final_state != State.SUCCESS:
            raise AirflowException(
                'Pod returned a failure: {state}'.format(state=final_state))
        # note the lack of returning the default xcom
    except AirflowException as ex:
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))