def createService(self, uid, runInfo):
    """Create the k8s Service for *uid* and return its reachable address.

    Returns "<cluster_ip>:<http_port>" on success; re-raises ApiException
    so the caller can surface the failure. (runInfo is unused here but kept
    for interface parity with the multi-replica variant.)
    """
    auth_file = ApiConfig().get("k8s", "auth_file")
    # Fall back to the default kubeconfig location when no auth file is configured.
    config.load_kube_config(auth_file if auth_file else None)
    configuration = kubernetes.client.Configuration()
    core_api = kubernetes.client.CoreV1Api(
        kubernetes.client.ApiClient(configuration))
    namespace = 'default'
    body = self.genV1Service(uid)
    print(body)
    logging.info("create service body: " + str(body))
    try:
        print('=' * 10)
        api_response = core_api.create_namespaced_service(namespace, body)
        print(api_response)
        logging.info("service response: " + str(api_response))
        return api_response.spec.cluster_ip + ":" + ApiConfig().get(
            "k8s", "http_port")
    except ApiException as e:
        print(
            "Exception when calling CoreV1Api->create_namespaced_service: %s\n"
            % e)
        logging.info(
            "Exception when calling CoreV1Api->create_namespaced_service: %s\n"
            % e)
        raise
def genV1Job(self, uid, workType, seq, count, info, ps, workers):
    """Build a batch/v1 Job that runs the user's script with `python`.

    The Job/pod name is "tf-<uid>-<workType>-<seq>-<count>" and the pod is
    labeled {"tf": <name>} so the matching headless Service can select it.
    ps/workers are accepted for interface parity but unused in this variant.

    Fixes vs. original: removed large blocks of dead commented-out code
    (including a comment line ending in a garbled fullwidth comma) and the
    hdfsUrl/hdfsNN locals that were only referenced from that dead code.
    """
    tfId = "-".join(["tf", str(uid), workType, str(seq), str(count)])

    body = kubernetes.client.V1Job()
    body.api_version = "batch/v1"
    body.kind = "Job"

    metaBody = kubernetes.client.V1ObjectMeta()
    metaBody.name = tfId
    body.metadata = metaBody

    tempSpec = kubernetes.client.V1PodTemplateSpec()
    tempMetaBody = kubernetes.client.V1ObjectMeta()
    tempMetaBody.name = tfId
    tempMetaBody.labels = {"tf": tfId}
    tempSpec.metadata = tempMetaBody

    containerBody = kubernetes.client.V1Container(name=tfId)
    tempInnerSpec = kubernetes.client.V1PodSpec(containers=[containerBody])
    # Jobs must not restart their pods; completion is tracked by k8s.
    tempInnerSpec.restart_policy = "Never"

    containerBody.image = ApiConfig().get("image", "tensorflow")
    containerBody.command = ["python", info.get("file", "")]
    portBody = kubernetes.client.V1ContainerPort(
        container_port=ApiConfig().getint("k8s", "headless_port"))
    containerBody.ports = [portBody]

    tempSpec.spec = tempInnerSpec
    specBody = kubernetes.client.V1JobSpec(template=tempSpec)
    body.spec = specBody
    return body
def loadHandlers(self):
    """Instantiate every event-handler class named in the [event] handlers option.

    Each comma-separated entry must be both a module under ``eventHandlers``
    and the class name inside that module.
    """
    configured = ApiConfig().get("event", "handlers")
    for entry in configured.split(","):
        print("name: " + entry)
        cls_name = entry.strip()
        module = importlib.import_module('eventHandlers.' + cls_name)
        handler_cls = getattr(module, cls_name)
        self.eventHandlers.append(handler_cls())
def addEvent(self, objName, eStatus):
    """Record a newly added ps/worker job in redis.

    hsetnx keeps the first replica count observed for a given tf id, so
    duplicate ADDED events are harmless.
    """
    print('*************** CleanJobHandler: ' + str(objName))
    redis_conn = RedisHelper().getRedis()
    tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.psPt, objName)
    if tfId:
        redis_conn.hsetnx(ApiConfig().get("event", "ps_key"), tfId, cnt)
        return
    tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.workerPt, objName)
    if tfId:
        redis_conn.hsetnx(ApiConfig().get("event", "worker_key"), tfId, cnt)
def genV1Rs(self, uid, modelParentPath, modelName):
    """Build an apps/v1 ReplicaSet (1 replica) serving *modelName*.

    The pod runs the configured serving image, exposes rpc/http ports, and
    mounts the model from a GlusterFS volume rooted at the user's directory.
    """
    servingId = "tf-serving-" + uid
    label = {"tf": servingId}

    # Serving container: rpc + http ports, model mounted under /models.
    rpc_port = kubernetes.client.V1ContainerPort(
        container_port=ApiConfig().getint("k8s", "rpc_port"))
    rpc_port.name = "rpc"
    http_port = kubernetes.client.V1ContainerPort(
        container_port=ApiConfig().getint("k8s", "http_port"))
    http_port.name = "http"

    container = kubernetes.client.V1Container(name=servingId)
    container.image = ApiConfig().get("image", "serving")
    container.ports = [rpc_port, http_port]
    container.volume_mounts = [
        kubernetes.client.V1VolumeMount(mount_path="/models/" + modelName,
                                        name="glusterfsvol-" + uid)
    ]
    container.env = [
        kubernetes.client.V1EnvVar(name="MODEL_NAME", value=modelName)
    ]

    # GlusterFS volume pointing at the user's model directory.
    parentPath = modelParentPath + "/" if modelParentPath else ""
    modelPath = "/" + parentPath + modelName
    volume = kubernetes.client.V1Volume(name="glusterfsvol-" + uid)
    volume.glusterfs = kubernetes.client.V1GlusterfsVolumeSource(
        endpoints="glusterfs-cluster",
        path="gv1/good/" + self.basicUsername + modelPath)

    pod_meta = kubernetes.client.V1ObjectMeta()
    pod_meta.name = servingId
    pod_meta.labels = label
    pod_template = kubernetes.client.V1PodTemplateSpec()
    pod_template.metadata = pod_meta
    pod_template.spec = kubernetes.client.V1PodSpec(containers=[container],
                                                    volumes=[volume])

    selector = kubernetes.client.V1LabelSelector()
    selector.match_labels = label
    rs_spec = kubernetes.client.V1ReplicaSetSpec(selector=selector)
    rs_spec.replicas = 1
    rs_spec.template = pod_template

    body = kubernetes.client.V1ReplicaSet()
    body.api_version = "apps/v1"
    body.kind = "ReplicaSet"
    body.metadata = kubernetes.client.V1ObjectMeta()
    body.metadata.name = servingId
    body.spec = rs_spec

    print(body)
    logging.info("rs body: " + str(body))
    return body
def genV1Job(self, uid, workType, seq, count, info, ps, workers):
    """Build a batch/v1 Job running /notebooks/entry.sh with cluster wiring.

    The Job/pod name is "tf-<uid>-<workType>-<seq>-<count>"; the pod mounts
    the user's GlusterFS directory at /mnt and receives the ps/worker host
    lists plus HDFS endpoints as command arguments.

    Returns the V1Job body, or None if construction fails (the error is
    printed — this preserves the original best-effort behavior, but the
    caller will then fail when submitting a None body).

    Fix vs. original: the bare ``except:`` also swallowed KeyboardInterrupt
    and SystemExit; narrowed to ``except Exception``.
    """
    try:
        print('gen v1 job ......')
        tfId = "-".join(["tf", str(uid), workType, str(seq), str(count)])

        body = kubernetes.client.V1Job()
        body.api_version = "batch/v1"
        body.kind = "Job"

        metaBody = kubernetes.client.V1ObjectMeta()
        metaBody.name = tfId
        body.metadata = metaBody

        tempSpec = kubernetes.client.V1PodTemplateSpec()
        tempMetaBody = kubernetes.client.V1ObjectMeta()
        tempMetaBody.name = tfId
        tempMetaBody.labels = {"tf": tfId}
        tempSpec.metadata = tempMetaBody

        containerBody = kubernetes.client.V1Container(name=tfId)

        # Shared GlusterFS volume rooted at the user's directory.
        volBody = kubernetes.client.V1Volume(name="glusterfsvol")
        volBody.glusterfs = kubernetes.client.V1GlusterfsVolumeSource(
            endpoints="glusterfs-cluster",
            path="gv1/good/" + self.basicUsername)

        tempInnerSpec = kubernetes.client.V1PodSpec(
            containers=[containerBody], volumes=[volBody])
        # Jobs must not restart their pods; completion is tracked by k8s.
        tempInnerSpec.restart_policy = "Never"

        containerBody.image = ApiConfig().get("image", "tensorflow")
        hdfsUrl = ApiConfig().get("hdfs", "web")
        hdfsNN = ApiConfig().get("hdfs", "namenode")
        containerBody.command = [
            "/notebooks/entry.sh", workType, str(seq), ps, workers,
            info.get("file", ""), info.get("data", "/notebooks"),
            info.get("export", "/tmp"), hdfsUrl, hdfsNN,
            info.get("main", "")
        ]
        portBody = kubernetes.client.V1ContainerPort(ApiConfig().getint(
            "k8s", "headless_port"))
        containerBody.ports = [portBody]
        containerBody.volume_mounts = [
            kubernetes.client.V1VolumeMount(mount_path="/mnt",
                                            name="glusterfsvol")
        ]

        tempSpec.spec = tempInnerSpec
        specBody = kubernetes.client.V1JobSpec(template=tempSpec)
        body.spec = specBody
        print('gen v1 job ok ......')
        return body
    except Exception:
        # Best-effort: log and fall through to an implicit None return.
        print('get exc ...')
        traceback.print_exc()
def delEvent(self, objName, eStatus):
    """On deletion of a ps job, decrement its counter; when it hits zero,
    drop the tf id from both the ps and worker tracking hashes."""
    print('************* CleanJobHandler delete event: ' + str(objName))
    redis_conn = RedisHelper().getRedis()
    tfId, seq, cnt = self.searchPattern(CleanJobHandler.psPt, objName)
    if not tfId:
        return
    print('delete event ps tfId: ' + tfId)
    remaining = redis_conn.hincrby(ApiConfig().get("event", "ps_key"),
                                   tfId, -1)
    if int(remaining) == 0:
        # TODO record successful tfId
        print('tfId successfully done')
        redis_conn.hdel(ApiConfig().get("event", "ps_key"), tfId)
        redis_conn.hdel(ApiConfig().get("event", "worker_key"), tfId)
def createService(self, uid, runInfo):
    """Create one headless Service per ps/worker replica listed in runInfo.

    runInfo maps role name ("ps"/"worker") to replica count; re-raises
    ApiException on any failure.
    """
    auth_file = ApiConfig().get("k8s", "auth_file")
    # Fall back to the default kubeconfig location when no auth file is configured.
    config.load_kube_config(auth_file if auth_file else None)
    configuration = kubernetes.client.Configuration()
    core_api = kubernetes.client.CoreV1Api(
        kubernetes.client.ApiClient(configuration))
    namespace = 'default'
    for workType in runInfo:
        workCount = runInfo.get(workType, 1)
        for seq in xrange(workCount):
            body = self.genV1Service(uid, workType, seq, workCount)
            print(body)
            logging.info("create service body: " + str(body))
            try:
                print('=' * 10)
                api_response = core_api.create_namespaced_service(
                    namespace, body)
                print(api_response)
                logging.info("service response: " + str(api_response))
            except ApiException as e:
                print(
                    "Exception when calling CoreV1Api->create_namespaced_service: %s\n"
                    % e)
                logging.info(
                    "Exception when calling CoreV1Api->create_namespaced_service: %s\n"
                    % e)
                raise
def storeInfo(self, uid, ps_hosts, worker_hosts):
    """Persist the cluster layout for *uid* and mark it as running.

    Implements the original TODO: both writes (running-set membership and
    the JSON detail blob) go through one redis pipeline so they are sent
    together in a single round trip.
    """
    info = {"ps": ps_hosts, "worker": worker_hosts, "status": "running"}
    js_info = json.dumps(info)
    rc = RedisHelper().getRedis()
    pipe = rc.pipeline()
    pipe.sadd(ApiConfig().get("redis", "running_set"), uid)
    pipe.set(uid, js_info)
    pipe.execute()
def addEvent(self, objName, eStatus):
    """Record a newly added ps/worker job keyed by its uid.

    The job name encodes "tf-<uuid>-<role>-<seq>-<count>"; group(1) is the
    uuid and group(3) the replica count. hsetnx keeps the first count seen.
    """
    print('*************** MarkJobHandler: ' + str(objName))
    redis_conn = RedisHelper().getRedis()
    psPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-ps-([0-9].*)-([0-9].*)"
    matched = re.match(psPt, objName)
    if matched:
        redis_conn.hsetnx(ApiConfig().get("event", "ps_key"),
                          matched.group(1), matched.group(3))
        return
    workerPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-worker-([0-9].*)-([0-9].*)"
    matched = re.match(workerPt, objName)
    if not matched:
        return
    redis_conn.hsetnx(ApiConfig().get("event", "worker_key"),
                      matched.group(1), matched.group(3))
def createJob(self, uid, info):
    """Create one k8s Job per ps/worker replica and return the host lists.

    Host names follow the "tf-<uid>-<role>-<seq>-<count>:<port>" convention
    that genV1Service uses, so jobs can reach each other via headless
    Services. Re-raises ApiException on any submission failure.
    """
    configuration = kubernetes.client.Configuration()
    batch_api = kubernetes.client.BatchV1Api(
        kubernetes.client.ApiClient(configuration))
    runInfo = info.get("detail", None)
    ps_count = runInfo.get("ps", 0)
    worker_count = runInfo.get("worker", 0)
    svcPort = ApiConfig().get("k8s", "headless_port")

    def hosts_for(role, total):
        # One addressable host per replica of the given role.
        return [
            "-".join(["tf", str(uid), role, str(i), str(total)]) + ":" + svcPort
            for i in xrange(total)
        ]

    ps_hosts = hosts_for("ps", ps_count)
    worker_hosts = hosts_for("worker", worker_count)
    print("ps: " + str(ps_hosts))
    logging.info("ps: " + str(ps_hosts))
    print("worker: " + str(worker_hosts))
    logging.info("worker: " + str(worker_hosts))

    for workType in runInfo:
        count = runInfo.get(workType, 1)
        for seq in xrange(count):
            try:
                body = self.genV1Job(uid, workType, seq, count, info,
                                     ",".join(ps_hosts),
                                     ",".join(worker_hosts))
                print(body)
                namespace = ApiConfig().get("namespace",
                                            info.get("type", "tensorflow"))
                api_response = batch_api.create_namespaced_job(namespace, body)
                print(api_response)
                logging.info("create job: " + str(api_response))
            except ApiException as e:
                print(
                    "Exception when calling BatchV1Api->create_namespaced_job: %s\n"
                    % e)
                logging.info(
                    "Exception when calling BatchV1Api->create_namespaced_job: %s\n"
                    % e)
                raise
    return ps_hosts, worker_hosts
def genV1Service(self, uid, workType, seq, count):
    """Build a headless Service for one ps/worker replica.

    cluster_ip is the literal string "None" (headless); the selector targets
    the pod labeled with the same "tf-<uid>-<workType>-<seq>-<count>" id.
    """
    tfId = "-".join(["tf", uid, workType, str(seq), str(count)])
    headless_port = ApiConfig().getint("k8s", "headless_port")

    port = kubernetes.client.V1ServicePort(port=headless_port)
    port.target_port = headless_port

    spec = kubernetes.client.V1ServiceSpec()
    spec.cluster_ip = "None"
    spec.selector = {"tf": tfId}
    spec.ports = [port]

    body = kubernetes.client.V1Service()
    body.api_version = "v1"
    body.kind = "Service"
    body.metadata = kubernetes.client.V1ObjectMeta(name=tfId)
    body.spec = spec
    return body
def removeWorker(self, workerList):
    """Delete the Service of every finished worker, best-effort per entry.

    Failures on one worker are logged and do not stop the rest of the list.

    Fix vs. original: the bare ``except:`` also swallowed KeyboardInterrupt
    and SystemExit; narrowed to ``except Exception``.
    """
    print('deleting worker list: ' + str(workerList))
    # TODO del k8s-ps, del keys
    authFile = ApiConfig().get("k8s", "auth_file")
    config.load_kube_config(authFile if authFile else None)
    configuration = kubernetes.client.Configuration()
    delSvcInstance = kubernetes.client.CoreV1Api(
        kubernetes.client.ApiClient(configuration))
    body = kubernetes.client.V1DeleteOptions()
    # Foreground: wait for dependents to be removed before the owner.
    body.propagation_policy = 'Foreground'
    namespace = ApiConfig().get("namespace", "tensorflow")
    for worker in workerList:
        try:
            svcRes = delSvcInstance.delete_namespaced_service(
                worker, namespace, body)
            print('----------------- worker svcRes: ' + str(svcRes))
        except Exception:
            traceback.print_exc()
class DisDeepService(object):
    """Tornado HTTP service exposing the /v1/train endpoint."""

    def __init__(self):
        # Shared configuration accessor used for the listen port.
        self.config = ApiConfig()

    def start(self):
        """Bind the configured service port and run the IO loop forever."""
        routes = [(r"/v1/train", TrainHandler)]
        app = tornado.web.Application(routes)
        app.listen(self.config.getint("service", "port"))
        logging.info("service start ...")
        tornado.ioloop.IOLoop.current().start()
def post(self, path):
    """Relay an uploaded request body into HDFS via WebHDFS CREATE.

    The raw body is PUT to "<hdfs_url>/<path>?op=CREATE&user.name=...".
    Fix vs. original: removed dead commented-out duplicate of the URL
    construction.

    NOTE(review): the fetch is fired asynchronously and self.finish() is
    called immediately, so the client response does not reflect the HDFS
    outcome — presumably intentional fire-and-forget; confirm.
    """
    print("POST")
    print("path: " + path)
    print("file: " + str(self.request.body))
    suffix = "?op=CREATE&user.name={0}&data=true".format(ApiConfig().get(
        "request", "hdfs_user"))
    fullUrl = ApiConfig().get("request", "hdfs_url") + "/" + path + suffix
    print('url: ' + fullUrl)
    header = {"Content-Type": "application/octet-stream"}
    request = HTTPRequest(url=fullUrl,
                          method="PUT",
                          headers=header,
                          body=self.request.body,
                          request_timeout=ApiConfig().getint(
                              "request", "timeout"))
    client = AsyncHTTPClient()
    client.fetch(request, self.on_response)
    self.finish()
def modifEvent(self, objName, eStatus):
    """React to a modified ps/worker job.

    ps modifications are ignored (ps may be shut down via a signal from a
    worker). When a worker Job reports succeeded == 1, its uid's worker
    counter is decremented; once it reaches zero the full ps/worker name
    lists are pushed onto the delete queue for teardown.
    """
    print('*************** UpdateJobHandler modify event: ' + str(objName))
    redis_conn = RedisHelper().getRedis()
    psPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-ps-([0-9].*)-([0-9].*)"
    matched = re.match(psPt, objName)
    if matched:
        # ps may be shut down itself through a signal from a worker.
        print('ps modified')
        return
    workerPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-worker-([0-9].*)-([0-9].*)"
    matched = re.match(workerPt, objName)
    if not matched:
        return
    if not (eStatus.succeeded and eStatus.succeeded == 1):
        # TODO mark failed
        return
    workerKey = matched.group(1)
    curCount = redis_conn.hincrby(ApiConfig().get("event", "worker_key"),
                                  workerKey, -1)
    if int(curCount) != 0:
        print('one tf worker done successfully ......')
        return
    print('prepare delete ps ++++++++++++++++++++++++++++++')
    psCnt = redis_conn.hget(ApiConfig().get("event", "ps_key"), workerKey)
    allPs = [
        'tf-' + workerKey + '-ps-' + str(i) + '-' + psCnt
        for i in xrange(int(psCnt))
    ]
    allWorker = [
        'tf-' + workerKey + '-worker-' + str(i) + '-' + matched.group(3)
        for i in xrange(int(matched.group(3)))
    ]
    print('all ps: ' + str(allPs))
    print('all worker: ' + str(allWorker))
    tfInfo = {'ps': allPs, 'worker': allWorker}
    redis_conn.rpush(ApiConfig().get("event", "delete_queue"),
                     json.dumps(tfInfo))
def modifEvent(self, objName, eStatus):
    """React to a modified ps/worker job (pattern-helper variant).

    ps modifications are ignored (ps may be shut down via a signal from a
    worker). When a worker Job reports succeeded == 1, the uid's worker
    counter is decremented; once it reaches zero the full ps/worker name
    lists are pushed onto the delete queue for teardown.

    Fix vs. original: the bare ``except:`` around the rpush also swallowed
    KeyboardInterrupt/SystemExit; narrowed to ``except Exception``.
    """
    print('*************** CleanJobHandler modify event: ' + str(objName))
    rc = RedisHelper().getRedis()
    tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.psPt, objName)
    if tfId:
        # ps may be shut down itself through a signal from a worker.
        print('ps modified')
        return
    tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.workerPt, objName)
    if not tfId:
        return
    if eStatus.succeeded and eStatus.succeeded == 1:
        curCount = rc.hincrby(ApiConfig().get("event", "worker_key"),
                              tfId, -1)
        if (int(curCount) == 0):
            print('all worker done, clean ...')
            psCnt = rc.hget(ApiConfig().get("event", "ps_key"), tfId)
            print('psCnt: ' + psCnt)
            allPs = [
                'tf-' + tfId + '-ps-' + str(i) + '-' + psCnt
                for i in xrange(int(psCnt))
            ]
            allWorker = [
                'tf-' + tfId + '-worker-' + str(i) + '-' + cnt
                for i in xrange(int(cnt))
            ]
            tfInfo = {'ps': allPs, 'worker': allWorker}
            print('tfInfo: ' + str(tfInfo))
            try:
                pushRes = rc.rpush(
                    ApiConfig().get("event", "delete_queue"),
                    json.dumps(tfInfo))
                print('pushRes: ' + str(pushRes))
            except Exception:
                # Best-effort: teardown request is lost but the loop survives.
                traceback.print_exc()
        else:
            print('one tf worker done successfully ......')
    else:
        # TODO failed
        pass
def genV1Service(self, uid):
    """Build the Service fronting the serving ReplicaSet for *uid*.

    Exposes named rpc and http ports whose target_port values reference the
    identically named container ports on the serving pod.
    """
    servingId = "tf-serving-" + uid

    rpc_port = kubernetes.client.V1ServicePort(
        port=ApiConfig().getint("k8s", "rpc_port"))
    rpc_port.name = "rpc"
    rpc_port.target_port = "rpc"

    http_port = kubernetes.client.V1ServicePort(
        port=ApiConfig().getint("k8s", "http_port"))
    http_port.name = "http"
    http_port.target_port = "http"

    spec = kubernetes.client.V1ServiceSpec()
    spec.selector = {"tf": servingId}
    spec.ports = [rpc_port, http_port]

    body = kubernetes.client.V1Service()
    body.api_version = "v1"
    body.kind = "Service"
    body.metadata = kubernetes.client.V1ObjectMeta(name=servingId)
    body.spec = spec
    return body
def delEvent(self, objName, eStatus):
    """On deletion of a ps job, decrement its counter; when it hits zero,
    drop the uid from both the ps and worker tracking hashes.

    Bug fix vs. original: psCurCount was referenced after the except
    handler, so a failed hincrby raised NameError on the next line. The
    follow-up logic now runs only when the decrement succeeded.
    """
    print('************* UpdateJobHandler delete event: ' + str(objName))
    rc = RedisHelper().getRedis()
    psPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-ps-([0-9].*)-([0-9].*)"
    res = re.match(psPt, objName)
    if not res:
        print('del event not matched')
        return
    print('delete event matched')
    psKey = res.group(1)
    print('delete event ps_key: ' + psKey)
    try:
        psCurCount = rc.hincrby(ApiConfig().get("event", "ps_key"),
                                psKey, -1)
    except Exception:
        print('got error')
        traceback.print_exc()
        return
    print('after hincrby ......')
    print('delete event ps cur count: ' + str(psCurCount))
    if int(psCurCount) == 0:
        rc.hdel(ApiConfig().get("event", "ps_key"), psKey)
        rc.hdel(ApiConfig().get("event", "worker_key"), psKey)
def removePs(self, psList):
    """Delete the Job and Service of every finished ps, best-effort per entry.

    Failures on one ps are logged and do not stop the rest of the list.

    Fix vs. original: the bare ``except:`` also swallowed KeyboardInterrupt
    and SystemExit; narrowed to ``except Exception``.
    """
    print('deleting ps list: ' + str(psList))
    # TODO del k8s-ps, del keys
    authFile = ApiConfig().get("k8s", "auth_file")
    config.load_kube_config(authFile if authFile else None)
    configuration = kubernetes.client.Configuration()
    delJobInstance = kubernetes.client.BatchV1Api(
        kubernetes.client.ApiClient(configuration))
    delSvcInstance = kubernetes.client.CoreV1Api(
        kubernetes.client.ApiClient(configuration))
    body = kubernetes.client.V1DeleteOptions()
    # Foreground: wait for dependents to be removed before the owner.
    body.propagation_policy = 'Foreground'
    namespace = ApiConfig().get("namespace", "tensorflow")
    for ps in psList:
        try:
            jobRes = delJobInstance.delete_namespaced_job(
                ps, namespace, body)
            print('----------------- ps jobRes: ' + str(jobRes))
            svcRes = delSvcInstance.delete_namespaced_service(
                ps, namespace, body)
            print('----------------- ps svcRes: ' + str(svcRes))
        except Exception:
            traceback.print_exc()
def checkJobs(self, jobInfo):
    """Classify running uids by Job status and return (successUids, failedUids).

    First pass: for every k8s Job in *jobInfo*, record its name under the
    owning uid's "success" or "failed" list in redis. Second pass: a uid is
    successful when every worker succeeded, failed when at least one Job
    failed.

    Bug fixes vs. original: (1) uidJs was concatenated into a log string
    before the None check, raising TypeError for uids with no redis entry;
    (2) the failed branch mutated uidDetail without writing it back to
    redis, so failures were lost and failedUids stayed empty.
    """
    successUids = []
    failedUids = []
    rc = RedisHelper().getRedis()
    runningSets = rc.smembers(ApiConfig().get("redis", "running_set"))
    for info in jobInfo.items:
        # Job names look like tf-<uuid>-<role>-<seq>-<count>; strip the
        # leading "tf" and the trailing role/seq/count to recover the uid.
        uid = "-".join(info.metadata.name.split('-')[1:-3])
        print("make uid: " + str(uid))
        if uid not in runningSets:
            continue
        failedCount = info.status.failed
        succeedCount = info.status.succeeded
        uidJs = rc.get(uid)
        print('failedCount: ' + str(failedCount))
        print('succeedCount: ' + str(succeedCount))
        print('type: ' + str(type(succeedCount)))
        if not uidJs:
            continue
        print('uidjs: ' + uidJs)
        uidDetail = json.loads(uidJs)
        print('detail: ' + str(uidDetail))
        if succeedCount and succeedCount == 1:
            print('success done')
            doneJobs = uidDetail.setdefault("success", [])
            if info.metadata.name not in doneJobs:
                doneJobs.append(info.metadata.name)
            rc.set(uid, json.dumps(uidDetail))
        else:
            print('failed done')
            if failedCount and failedCount >= 1:
                badJobs = uidDetail.setdefault("failed", [])
                if info.metadata.name not in badJobs:
                    badJobs.append(info.metadata.name)
                # Persist so the classification pass below can see failures.
                rc.set(uid, json.dumps(uidDetail))
    for uid in runningSets:
        uidJs = rc.get(uid)
        if not uidJs:
            continue
        uidDetail = json.loads(uidJs)
        runningJobs = uidDetail.get("worker")
        successJobs = uidDetail.get("success", [])
        failedJobs = uidDetail.get("failed", [])
        if len(runningJobs) == len(successJobs):
            successUids.append(uid)
        elif len(failedJobs) >= 1:
            failedUids.append(uid)
    return successUids, failedUids
def run(self):
    """Consume the delete queue forever, tearing down finished ps/worker jobs.

    blpop with timeout 0 blocks until an entry arrives; each entry is a JSON
    blob {"ps": [...], "worker": [...]} of k8s object names to delete.

    Fix vs. original: the trailing bare ``except:`` also swallowed
    SystemExit; narrowed to ``except Exception`` (KeyboardInterrupt is still
    handled separately for clean shutdown).
    """
    try:
        rc = RedisHelper().getRedis()
        while True:
            res = rc.blpop(ApiConfig().get("event", "delete_queue"), 0)
            print('------------------ get res: ' + str(res))
            # blpop returns (queue_name, payload).
            jsInfo = res[1]
            print('-------------- get info: ' + str(jsInfo))
            infoMap = json.loads(jsInfo)
            self.removePs(infoMap.get('ps', []))
            self.removeWorker(infoMap.get('worker', []))
    except KeyboardInterrupt:
        pass
    except Exception:
        traceback.print_exc()
class DisDeepService(object):
    """Tornado HTTP service exposing the train/serving/test/user/upload API."""

    def __init__(self):
        # Shared configuration accessor used for the listen port.
        self.config = ApiConfig()

    def start(self):
        """Register all routes, bind the configured port, and run the IO loop."""
        routes = [
            (r"/v1/train", TrainHandler),
            (r"/v1/serving", ServingHandler),
            (r"/v1/test", TestHandler),
            (r"/v1/user/([a-zA-Z0-9_-]+)", UserHandler),
            (r"/v1/upload/(.*)", UploadHandler),
        ]
        settings = {
            "cookie_secret": "bZJc2sWbQLKos6GkHn/VB9oXwQt8S0R0kRvJ5/xJ89E="
        }
        app = tornado.web.Application(routes, **settings)
        app.listen(self.config.getint("service", "port"))
        logging.info("service start ...")
        tornado.ioloop.IOLoop.current().start()
def removePs(self, psList):
    """Delete the Job and Service of every finished ps, best-effort per entry.

    NOTE(review): unlike the other removePs variant, this one calls
    load_kube_config() without the configured k8s auth_file — presumably it
    runs where the default kubeconfig is valid; confirm.

    Fix vs. original: the bare ``except:`` also swallowed KeyboardInterrupt
    and SystemExit; narrowed to ``except Exception``.
    """
    print('deleting ps list: ' + str(psList))
    config.load_kube_config()
    configuration = kubernetes.client.Configuration()
    delJobInstance = kubernetes.client.BatchV1Api(
        kubernetes.client.ApiClient(configuration))
    delSvcInstance = kubernetes.client.CoreV1Api(
        kubernetes.client.ApiClient(configuration))
    body = kubernetes.client.V1DeleteOptions()
    # Foreground: wait for dependents to be removed before the owner.
    body.propagation_policy = 'Foreground'
    namespace = ApiConfig().get("namespace", "tensorflow")
    for ps in psList:
        try:
            delJobInstance.delete_namespaced_job(ps, namespace, body)
            delSvcInstance.delete_namespaced_service(ps, namespace, body)
        except Exception:
            traceback.print_exc()
def createRs(self, uid, info):
    """Create the serving ReplicaSet for *uid* in the type-specific namespace.

    Re-raises ApiException so the caller can surface the failure.
    """
    configuration = kubernetes.client.Configuration()
    apps_api = kubernetes.client.AppsV1Api(
        kubernetes.client.ApiClient(configuration))
    namespace = ApiConfig().get("namespace",
                                info.get("type", "tensorflow"))
    model_name = info.get("name", "test")
    model_parent_path = info.get("path", "/path")
    try:
        rs_body = self.genV1Rs(uid, model_parent_path, model_name)
        apps_api.create_namespaced_replica_set(namespace, rs_body)
    except ApiException as e:
        print(
            "Exception when calling AppsV1Api->create_namespaced_replica_set: %s\n"
            % e)
        logging.error(
            "Exception when calling AppsV1Api->create_namespaced_replica_set: %s\n"
            % e)
        raise
def __new__(cls, *args, **kwargs):
    """Singleton constructor: the first call builds the shared Redis client.

    Subsequent calls return the same instance, and all instances share one
    class-level ``redis`` connection.
    """
    if hasattr(cls, 'instance'):
        return cls.instance
    conf = ApiConfig()
    cls.instance = super(RedisHelper, cls).__new__(cls)
    # One client shared by every RedisHelper.
    cls.redis = redis.Redis(host=conf.get("redis", "host"),
                            port=conf.getint("redis", "port"))
    return cls.instance
def moveUid(self, uid):
    """Move *uid* from the running set to the success set atomically (SMOVE)."""
    conf = ApiConfig()
    redis_conn = RedisHelper().getRedis()
    redis_conn.smove(conf.get("redis", "running_set"),
                     conf.get("redis", "success_set"), uid)
def __init__(self):
    """Cache the shared ApiConfig accessor for later option lookups."""
    self.config = ApiConfig()