def get_job_log(jobname):
    """
    get log belong to job
    ---
    responses:
      200:
        description: get logs
        schema:
          type: object
        examples:
          application/json:
            data: "haha"
    """
    job = Job.get_by_name(name=jobname)
    if not job:
        abort(404, "job {} not found".format(jobname))

    try:
        api = KubeApi.instance()
        pods = api.get_job_pods(jobname, namespace=DEFAULT_JOB_NS)
        # no pod yet: the job may still be scheduling, ask the client to retry
        if not pods.items:
            return {'data': "no log, please retry"}
        # only the first pod's log is exposed
        first_podname = pods.items[0].metadata.name
        log_text = api.get_pod_log(podname=first_podname, namespace=DEFAULT_JOB_NS)
        return {'data': log_text}
    except ApiException as e:
        # propagate the kubernetes status code to the HTTP response
        abort(e.status, "Error when get job log: {}".format(str(e)))
    except Exception as e:
        abort(500, "Error when get job log: {}".format(str(e)))
def delete_job(jobname):
    """
    Delete a single job
    ---
    parameters:
      - name: jobname
        in: path
        type: string
        required: true
    responses:
      200:
        description: error message
        schema:
          $ref: '#/definitions/Error'
        examples:
          application/json:
            error: null
    """
    job = Job.get_by_name(jobname)
    if not job:
        abort(404, "job {} not found".format(jobname))
    # remove the kubernetes object first; ignore_404 makes this idempotent
    with handle_k8s_err("Error when delete job"):
        KubeApi.instance().delete_job(jobname,
                                      ignore_404=True,
                                      namespace=DEFAULT_JOB_NS)
    # only drop the database record once kubernetes cleanup succeeded
    job.delete()
    return DEFAULT_RETURN_VALUE
def watch_app_job_pods(self, cluster):
    """
    Long-running watch loop over pods labelled as KAE apps or jobs.

    App pod events are published as JSON to a per-app redis channel;
    job pod events (except deletions) are handed off to the async
    ``handle_job_pod_event`` task.  Runs forever: any error tears down
    the watch, is logged, and the watch is re-established.
    """
    # resume token for the kubernetes watch; None means "start from scratch"
    last_seen_version = None
    label_selector = "kae-type in (app, job)"
    while True:
        try:
            if last_seen_version is not None:
                watcher = KubeApi.instance().watch_pods(cluster_name=cluster,
                                                        label_selector=label_selector,
                                                        resource_version=last_seen_version)
            else:
                watcher = KubeApi.instance().watch_pods(cluster_name=cluster,
                                                        label_selector=label_selector)
            for event in watcher:
                obj = event['object']
                labels = obj.metadata.labels or {}
                # remember our position so a reconnect can resume here.
                # NOTE(review): if this version expires the API server answers
                # 410 Gone and this loop never resets the token — confirm
                # watch_pods copes with that case.
                last_seen_version = obj.metadata.resource_version
                if 'kae-app-name' in labels:
                    appname = labels['kae-app-name']
                    channel = make_app_watcher_channel_name(cluster, appname)
                    data = {
                        'object': obj.to_dict(),
                        'action': event['type'],
                    }
                    rds.publish(message=json.dumps(data, cls=VersatileEncoder), channel=channel)
                elif 'kae-job-name' in labels:
                    # job pod deletions carry no useful log state: skip them
                    if event['type'] == 'DELETED':
                        continue
                    jobname = labels['kae-job-name']
                    handle_job_pod_event.delay(jobname, event['raw_object'])
        except ProtocolError:
            logger.warn('skip this error... because kubernetes disconnect client after default 10m...')
        except Exception as e:
            # logger.error("---------watch error ------------------")
            logger.exception("watch pods workers error")
def save_pod_log(jobname, podname, version=0):
    """
    Fetch the log of ``podname`` and persist it for job ``jobname``.

    Best effort: a vanished pod (404) is silently ignored, and any error
    while persisting is logged but not propagated.  Other kubernetes API
    errors are re-raised to the caller.
    """
    try:
        resp = KubeApi.instance().get_pod_log(podname=podname)
    except ApiException as e:
        # pod is already gone: nothing to save
        if e.status == 404:
            return
        # bare `raise` keeps the original traceback (was `raise e`)
        raise
    try:
        save_job_log(job_name=jobname, resp=resp, version=version)
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; message fixed — the failing step is *saving*
        logger.exception("Error when saving pod log")
def get_job_log_events(socket, jobname):
    """
    SSE endpoint for job log
    ---
    responses:
      200:
        description: event stream
        schema:
          type: object
    """
    ns = DEFAULT_JOB_NS
    job = Job.get_by_name(jobname)
    if not job:
        socket.send(json.dumps({"error": "job {} not found".format(jobname)}))
        return
    # drop the db session while streaming: this connection can stay open long
    with session_removed():
        try:
            pods = KubeApi.instance().get_job_pods(jobname, namespace=ns)
        except ApiException as e:
            socket.send(
                json.dumps(
                    {"error": "Error when get job pods: {}".format(str(e))}))
            return
        # no pod yet — tell the client to retry instead of blocking
        if not pods.items:
            socket.send(json.dumps({"error": "no log, please retry"}))
            return
        first_podname = pods.items[0].metadata.name
        try:
            # stream the first pod's log line by line to the websocket
            for log_line in KubeApi.instance().follow_pod_log(podname=first_podname, namespace=ns):
                socket.send(json.dumps({'data': log_line}))
        except ApiException as e:
            socket.send(
                json.dumps({
                    "error":
                    "Error when follow job log, please retry: {}".format(
                        str(e))
                }))
            return
def list_cluster():
    """
    List all the available clusters
    ---
    responses:
      200:
        description: available cluster list
        schema:
          type: array
          items:
            type: string
        examples:
          application/json: [
            "cluster1",
            "cluster2",
          ]
    """
    # the KubeApi singleton already knows every configured cluster
    api = KubeApi.instance()
    return api.cluster_names
def restart_job(jobname):
    """
    Restart a single job
    ---
    parameters:
      - name: jobname
        in: path
        type: string
        required: true
    responses:
      200:
        description: error message
        schema:
          $ref: '#/definitions/Error'
        examples:
          application/json:
            error: null
    """
    job = Job.get_by_name(jobname)
    if not job:
        abort(404, "job {} not found".format(jobname))
    job.inc_version()
    with handle_k8s_err("Error when delete job"):
        KubeApi.instance().delete_job(jobname, ignore_404=True,
                                      namespace=DEFAULT_JOB_NS)
    specs = job.specs
    # FIXME: need to wait the old job to be deleted
    # poll until kubernetes reports the old job gone (404), then recreate it
    while True:
        try:
            KubeApi.instance().get_job(jobname, namespace=DEFAULT_JOB_NS)
        except ApiException as e:
            if e.status == 404:
                break
            logger.exception("kubernetes error")
            abort(500, "kubernetes error")
        except Exception:
            # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # and the abort() above are not swallowed here
            logger.exception("kubernetes error")
            abort(500, "kubernetes error")
        time.sleep(2)
    with handle_k8s_err("Error when create job"):
        KubeApi.instance().create_job(specs, namespace=DEFAULT_JOB_NS)
    return DEFAULT_RETURN_VALUE
def create_job(args):
    """
    create a new job
    ---
    responses:
      200:
        description: Error message
        schema:
          type: object
          $ref: '#/definitions/Error'
        examples:
          application/json:
            error: null
    """
    specs_text = args.get('specs_text', None)
    cluster = args.get('cluster', KubeApi.DEFAULT_CLUSTER)
    if specs_text:
        try:
            # SECURITY: specs_text comes from the request body — use
            # safe_load instead of yaml.load, which can construct arbitrary
            # Python objects from untrusted input.
            yaml_dict = yaml.safe_load(specs_text)
        except yaml.YAMLError as e:
            return abort(403, 'specs text is invalid yaml {}'.format(str(e)))
    else:
        # construct specs dict from args
        command = shlex.split(args['command'])
        if args['shell'] or args.get('gpu', 0) > 0:
            # NOTE(review): the `len(command) > 2` guard means short commands
            # (one or two tokens) never get the `sh -c` wrapper even when
            # shell mode is requested — confirm this is intended.
            if len(command) > 2 and (command[0] != 'sh' or command[1] != '-c'):
                command = ['sh', '-c'] + command
        yaml_dict = {
            'containers': [{
                'name': args['jobname'],
                'image': args['image'],
                'command': command,
            }]
        }
        # optional top-level fields copied through verbatim
        copy_list = ('jobname', 'git', 'branch', 'commit', 'autoRestart', 'comment')
        for field in copy_list:
            if field in args:
                yaml_dict[field] = args[field]
        if 'gpu' in args:
            yaml_dict['containers'][0]['gpu'] = args['gpu']
    try:
        specs = load_job_specs(yaml_dict)
    except ValidationError as e:
        return abort(400, 'specs text is invalid {}'.format(str(e)))
    try:
        job = Job.create(name=specs.jobname,
                         git=specs.get('git'),
                         branch=specs.get('branch'),
                         commit=specs.get('commit'),
                         comment=specs.get('comment'),
                         status="Pending",
                         specs_text=yaml.dump(specs.to_dict()))
    except IntegrityError:
        # unique name constraint hit: a job with this name already exists
        return abort(400, 'job is duplicate')
    except ValueError as e:
        return abort(400, str(e))

    def clean_func():
        """Remove the half-created job record when kubernetes creation fails."""
        job.delete()

    job_dir = os.path.join(JOBS_ROOT_DIR, specs.jobname)
    code_dir = os.path.join(job_dir, "code")
    if specs.git:
        try:
            cloner = Cloner(repo=specs.git,
                            dst_directory=code_dir,
                            branch=specs.branch,
                            commit_id=specs.commit)
            cloner.clone_and_copy()
        except Exception as e:
            # roll back the db record before reporting the clone failure
            job.delete()
            logger.exception("clone error")
            abort(500, "clone and copy code error: {}".format(str(e)))
    with handle_k8s_err("Error when create job", clean_func=clean_func):
        KubeApi.instance().create_job(specs, namespace=DEFAULT_JOB_NS,
                                      cluster_name=cluster)
    try:
        job.grant_user(g.user)
    except IntegrityError:
        # user already granted — fine
        pass
    return DEFAULT_RETURN_VALUE
def validate_cluster_name(cluster):
    """Raise ValidationError when *cluster* is not a known cluster."""
    exists = KubeApi.instance().cluster_exist(cluster)
    # deliberate `is False` comparison: only an explicit False is invalid
    if exists is False:
        raise ValidationError("cluster {} not exists".format(cluster))
def enter_pod(socket, appname):
    """
    Interactive shell into a pod over a websocket.

    Reads messages until one parses as a valid pod-entry payload, checks the
    app exists and the current user is granted to it, opens an exec shell in
    the pod, then pumps: client messages -> shell stdin, while two background
    greenlets stream shell stdout/stderr back and send heartbeats.
    """
    payload = None
    # loop until the client sends a payload that validates against the schema
    while True:
        message = socket.receive()
        if message is None:
            return
        try:
            payload = pod_entry_schema.loads(message)
            break
        except ValidationError as e:
            socket.send(json.dumps(e.messages))
        except JSONDecodeError as e:
            socket.send(json.dumps({'error': str(e)}))
    app = App.get_by_name(appname)
    if not app:
        socket.send(
            make_errmsg('app {} not found'.format(appname), jsonize=True))
        return
    if not g.user.granted_to_app(app):
        socket.send(
            make_errmsg(
                'You\'re not granted to this app, ask administrators for permission',
                jsonize=True))
        return
    args = payload.data
    podname = args['podname']
    cluster = args['cluster']
    namespace = args['namespace']
    container = args.get('container', None)
    sh = KubeApi.instance().exec_shell(podname, namespace=namespace,
                                       cluster_name=cluster,
                                       container=container)
    # shared exit flag between this greenlet and the two helpers below
    need_exit = False

    def heartbeat_sender():
        """Ping the client periodically so idle sessions stay alive."""
        nonlocal need_exit
        interval = WS_HEARTBEAT_TIMEOUT - 3
        if interval <= 0:
            interval = WS_HEARTBEAT_TIMEOUT
        try:
            while need_exit is False:
                time.sleep(interval)
                try:
                    # send a null character to client
                    logger.debug("send PING")
                    send_ping(socket)
                except WebSocketError as e:
                    need_exit = True
                    return
        finally:
            logger.debug("pod entry heartbeat greenlet exit")

    def resp_sender():
        """Forward the shell's stdout/stderr to the client until it closes."""
        nonlocal need_exit
        try:
            while sh.is_open() and need_exit is False:
                sh.update(timeout=1)
                if sh.peek_stdout():
                    msg = sh.read_stdout()
                    logger.debug("STDOUT: %s" % msg)
                    socket.send(msg)
                if sh.peek_stderr():
                    msg = sh.read_stderr()
                    logger.debug("STDERR: %s" % msg)
                    socket.send(msg)
        except ProtocolError:
            logger.warn('kubernetes disconnect client after default 10m...')
        except WebSocketError as e:
            logger.warn('client socket is closed')
        except Exception as e:
            logger.warn("unknown exception: {}".format(str(e)))
        finally:
            # whichever way we leave, make every loop in this function stop
            need_exit = True
            logger.debug("exec output sender greenlet exit")

    gevent.spawn(resp_sender)
    gevent.spawn(heartbeat_sender)
    # to avoid lost mysql connection exception
    db.session.remove()
    try:
        while need_exit is False:
            # get command from client
            message = socket.receive()
            if message is None:
                logger.info("client socket closed")
                break
            sh.write_stdin(message)
            continue
    finally:
        need_exit = True
        logger.debug("pod entry greenlet exit")
def get_app_pods_events(socket, appname):
    """
    Stream pod events of an app (or its canary) to a websocket client.

    After validating the payload and the user's grant, sends the current pod
    list as synthetic "ADDED" events, then relays messages published on the
    app's redis watcher channel.  Two helper greenlets watch for client
    disconnect and send heartbeats while the relay loop runs.
    """
    payload = None
    # timestamp of the last successful send; drives the heartbeat schedule
    socket_active_ts = time.time()
    # loop until the client sends a payload that validates against the schema
    while True:
        message = socket.receive()
        if message is None:
            return
        try:
            payload = cluster_canary_schema.loads(message)
            break
        except ValidationError as e:
            socket.send(json.dumps(e.messages))
        except JSONDecodeError as e:
            socket.send(json.dumps({'error': str(e)}))
    args = payload.data
    cluster = args['cluster']
    canary = args['canary']
    name = "{}-canary".format(appname) if canary else appname
    channel = make_app_watcher_channel_name(cluster, name)
    ns = DEFAULT_APP_NS
    app = App.get_by_name(appname)
    if not app:
        socket.send(
            make_errmsg('app {} not found'.format(appname), jsonize=True))
        return
    if not g.user.granted_to_app(app):
        socket.send(
            make_errmsg(
                'You\'re not granted to this app, ask administrators for permission',
                jsonize=True))
        return
    # since this request may pend long time, so we remove the db session
    # otherwise we may get error like `sqlalchemy.exc.TimeoutError: QueuePool limit of size 50 overflow 10 reached, connection timed out`
    with session_removed():
        pod_list = KubeApi.instance().get_app_pods(name, cluster_name=cluster, namespace=ns)
        pods = pod_list.to_dict()
        # replay the current state as ADDED events before live streaming
        for item in pods['items']:
            data = {
                'object': item,
                'action': "ADDED",
            }
            socket.send(json.dumps(data, cls=VersatileEncoder))
        pubsub = rds.pubsub()
        pubsub.subscribe(channel)
        # shared exit flag between this greenlet and the two helpers below
        need_exit = False

        def check_client_socket():
            """Detect client disconnect: receive() returning None means closed."""
            nonlocal need_exit
            while need_exit is False:
                if socket.receive() is None:
                    need_exit = True
                    break

        def heartbeat_sender():
            """Ping the client whenever the socket has been idle too long."""
            nonlocal need_exit, socket_active_ts
            interval = WS_HEARTBEAT_TIMEOUT - 3
            if interval <= 0:
                interval = WS_HEARTBEAT_TIMEOUT
            while need_exit is False:
                now = time.time()
                if now - socket_active_ts <= (interval - 1):
                    # recently active: sleep only the remaining idle budget
                    time.sleep(interval - (now - socket_active_ts))
                else:
                    try:
                        send_ping(socket)
                        socket_active_ts = time.time()
                    except WebSocketError as e:
                        need_exit = True
                        return

        gevent.spawn(check_client_socket)
        gevent.spawn(heartbeat_sender)
        try:
            while need_exit is False:
                resp = pubsub.get_message(timeout=30)
                if resp is None:
                    continue
                if resp['type'] == 'message':
                    raw_content = resp['data']
                    # omit the initial message where resp['data'] is 1L
                    if not isinstance(raw_content, (bytes, str)):
                        continue
                    content = raw_content
                    if isinstance(content, bytes):
                        content = content.decode('utf-8')
                    socket.send(content)
                    socket_active_ts = time.time()
        finally:
            # need close the connection created by PUB/SUB,
            # otherwise it will cause too many redis connections
            pubsub.unsubscribe()
            pubsub.close()
            need_exit = True
            logger.info("ws connection closed")
def start(self):
    """Spawn one pod-watcher greenlet per known cluster and track it."""
    for cluster_name in KubeApi.instance().cluster_names:
        logger.info("create watcher thread for cluster {}".format(cluster_name))
        self.thread_map[cluster_name] = spawn(self.watch_app_job_pods, cluster_name)