def clean_up(self, job_name):
    """Delete the Kubernetes job; deleting the job removes all related pods.

    :param job_name: name of the job to delete
    """
    logging.info("KILLING job {}".format(str(job_name)))
    delete_args = namespaced_kubectl() + [
        'delete',
        '--ignore-not-found=true',  # in case we hit an edge case on retry
        'job',
        job_name,
    ]
    output = retryable_check_output(args=delete_args)
    logging.info(output)
def get_pods(self, job_name):
    """ Return pods for a given job_name
    :param job_name: a unique job name
    :type job_name: string
    :return: result form kubectl command that contains job pod and container information
    :type: dict
    """
    # NOTE(review): the original signature omitted `self`, so call sites such
    # as `self.get_pods(job_name)` would have bound the instance to `job_name`
    # and built a garbage label selector. A later duplicate definition that
    # does take `self` shadowed this one; the signatures are unified here.
    return json.loads(
        retryable_check_output(
            args=namespaced_kubectl() +
            ['get', 'pods', '-o', 'json', '-l', 'job-name==%s' % job_name]))
def log_container_logs(self, job_name, pod_output=None):
    """Read the logs from each container in each pod of the job and re-log them.

    :param job_name: job that owns the pods with the containers we want to log
    :param pod_output: result of a get_pods(job_name) call; fetched on demand
        when None, as a convenience so callers can share/reuse a previous
        get_pods() result
    :return: None
    """
    pods = pod_output or self.get_pods(job_name)
    for pod in pods['items']:
        name_of_pod = pod['metadata']['name']
        for container_spec in pod['spec']['containers']:
            name_of_container = container_spec['name']
            log_extra = dict(pod=name_of_pod, container=name_of_container)
            logging.info(
                'LOGGING OUTPUT FROM JOB [%s/%s]:' % (name_of_pod, name_of_container),
                extra=log_extra)
            raw_logs = retryable_check_output(
                args=namespaced_kubectl() + ['logs', name_of_pod, name_of_container])
            for log_line in raw_logs.splitlines():
                logging.info(log_line, extra=log_extra)
def poll_job_completion(self, job_name, dependent_containers=None):
    """ Polls for completion of the created job. Sleeps for
    sleep_seconds_between_polling between polling. Any failed pods will raise
    an error and fail the KubernetesJobOperator task.

    :param job_name: a unique job name
    :param dependent_containers: names of sidecar containers (default:
        {'cloudsql-proxy'}) that do not count as "live" work — they are
        expected to keep running after the real workload terminates
    :return: the last get_pods(job_name) result, for reuse by the caller
    :raises Exception: on failed pods, a pod stuck in Pending, an unknown
        pod phase, or an inconsistent job status block
    """
    # Avoid a mutable default argument; the original used `={'cloudsql-proxy'}`.
    if dependent_containers is None:
        dependent_containers = frozenset({'cloudsql-proxy'})
    logging.info('Polling for completion of job: %s' % job_name)
    pod_output = None  # keeping this out here so we can reuse it in the "finally" clause
    has_live_existed = False
    while True:
        time.sleep(self.sleep_seconds_between_polling)
        pod_output = self.get_pods(job_name)
        job_description = json.loads(
            retryable_check_output(
                namespaced_kubectl() + ['get', 'job', "-o", "json", job_name]))
        status_block = job_description['status']
        if 'succeeded' in status_block and 'failed' in status_block:
            # BUG FIX: the original passed the format string and argument as
            # separate Exception args, so %s was never interpolated.
            raise Exception(
                "Invalid status block containing both succeeded and failed: %s"
                % json.dumps(status_block))
        if 'active' in status_block:
            status = 'running'
        elif 'failed' in status_block:
            status = "failed"
        elif 'succeeded' in status_block:
            status = 'complete'
        else:
            status = "pending"
        logging.info('Current status is: %s' % status)
        if "failed" == status:
            raise Exception('%s has failed pods, failing task.' % job_name)
        if "complete" == status:
            return pod_output
        # "pending" falls through and keeps polling, same as "running".
        # Determine if we have any containers left running in each pod of the job.
        # Dependent containers don't count.
        # If there are no pods left running anything, we are done here. Cleaning up
        # dependent containers will be left to the top-level `finally` block down below.
        has_live = False
        for pod in pod_output['items']:
            phase = pod['status']['phase']
            if 'Unknown' == phase:
                # we haven't run yet
                has_live = True
                break
            elif 'Pending' == phase:
                has_live = True
                start_time_s = pod['status'].get('startTime')
                if not start_time_s:
                    logging.info('Pod not yet started')
                    break
                start_time = datetime.strptime(start_time_s, "%Y-%m-%dT%H:%M:%SZ")
                start_duration_secs = (datetime.utcnow() - start_time).total_seconds()
                # Fail fast if a pod has been stuck in Pending for > 5 minutes.
                if start_duration_secs > 300:
                    raise Exception(
                        '%s has failed to start after %0.2f seconds' % (
                            job_name,
                            start_duration_secs,
                        ))
            elif 'Running' == phase:
                # get all of the independent containers that are still alive (running or waiting)
                live_cnt = 0
                for cs in pod['status']['containerStatuses']:
                    if cs['name'] in dependent_containers:
                        continue
                    if 'terminated' in cs['state']:
                        has_live_existed = True
                        exit_code = int(cs['state']['terminated'].get('exitCode', 0))
                        if exit_code > 0:
                            raise Exception(
                                '%s has failed pods, failing task.' % job_name)
                    else:
                        live_cnt += 1
                if live_cnt > 0:
                    has_live = True
                    break
            elif 'Succeeded' == phase:
                # For us to end up in this block, the job has to be Running and the pod has to be Succeeded.
                # This happens when (on a previous attempt) we successfully finished execution, killed dependent
                # containers, and failed to delete the job.
                # In this scenario, we want to immediately stop polling, and retry job deletion.
                has_live_existed = True
                has_live = False
            elif 'Failed' == phase:
                raise Exception("Containers failed!")
            else:
                raise Exception(
                    "Encountered pod state {state} - no behavior has been prepared for pods in this state!"
                    .format(state=pod["status"]["phase"]))
        total_pods = len(pod_output['items'])
        logging.info(
            "total pods: {total_pods}".format(total_pods=total_pods))
        has_live_existed = has_live_existed or has_live
        # if we get to this point but for some reason there are no pods, log it and retry
        if not has_live_existed:
            logging.info('No pods have run. Retrying.')
        # we have no live pods, but live pods have existed.
        elif not has_live:
            logging.info('No live, independent pods left.')
            return pod_output
def get_pods(self, job_name):
    """Return parsed `kubectl get pods -o json` output for the pods whose
    `job-name` label matches *job_name*.

    :param job_name: a unique job name
    :return: dict with the job's pod and container information
    """
    kubectl_args = namespaced_kubectl() + [
        'get', 'pods', '-o', 'json', '-l', 'job-name==%s' % job_name,
    ]
    return json.loads(retryable_check_output(args=kubectl_args))