예제 #1
0
 def createService(self, uid, runInfo):
     """Create the headless Service for *uid* and return its "ip:port" endpoint.

     runInfo is accepted for interface compatibility but unused here.
     Raises: ApiException (re-raised) when the create call fails.
     """
     # auth_file may be empty; load_kube_config(None) uses the default path
     authFile = ApiConfig().get("k8s", "auth_file")
     config.load_kube_config(authFile if authFile else None)
     configuration = kubernetes.client.Configuration()
     api_instance = kubernetes.client.CoreV1Api(
         kubernetes.client.ApiClient(configuration))
     namespace = 'default'
     body = self.genV1Service(uid)
     print(body)
     logging.info("create service body: %s", body)
     try:
         print('=' * 10)
         api_response = api_instance.create_namespaced_service(
             namespace, body)
         print(api_response)
         logging.info("service response: %s", api_response)
         # endpoint callers connect to: cluster ip + configured http port
         return api_response.spec.cluster_ip + ":" + ApiConfig().get(
             "k8s", "http_port")
     except ApiException as e:
         # logging.exception records the traceback as well as the message
         logging.exception(
             "Exception when calling CoreV1Api->create_namespaced_service: %s"
             % e)
         raise
예제 #2
0
    def genV1Job(self, uid, workType, seq, count, info, ps, workers):
        """Build a batch/v1 Job body that runs ``python <file>`` for one TF task.

        The job/pod name encodes uid, role, task index and task count.
        `ps` and `workers` are accepted for interface compatibility but are
        unused by the simple ``python <file>`` command built here.
        """
        tfId = "-".join(["tf", str(uid), workType, str(seq), str(count)])
        body = kubernetes.client.V1Job()
        body.api_version = "batch/v1"
        body.kind = "Job"
        metaBody = kubernetes.client.V1ObjectMeta()
        metaBody.name = tfId
        body.metadata = metaBody

        tempSpec = kubernetes.client.V1PodTemplateSpec()
        tempMetaBody = kubernetes.client.V1ObjectMeta()
        tempMetaBody.name = tfId
        # the "tf" label is what the matching headless Service selects on
        tempMetaBody.labels = {"tf": tfId}
        tempSpec.metadata = tempMetaBody
        containerBody = kubernetes.client.V1Container(name=tfId)
        tempInnerSpec = kubernetes.client.V1PodSpec(containers=[containerBody])
        # batch jobs must not restart finished pods
        tempInnerSpec.restart_policy = "Never"
        containerBody.image = ApiConfig().get("image", "tensorflow")
        containerBody.command = ["python", info.get("file", "")]
        # keyword argument for clarity, consistent with the other
        # V1ContainerPort constructions in this project
        portBody = kubernetes.client.V1ContainerPort(
            container_port=ApiConfig().getint("k8s", "headless_port"))
        containerBody.ports = [portBody]
        tempSpec.spec = tempInnerSpec
        specBody = kubernetes.client.V1JobSpec(template=tempSpec)
        body.spec = specBody
        return body
예제 #3
0
 def loadHandlers(self):
     """Instantiate every event handler listed in config [event] handlers.

     Each comma-separated entry is both the module name under
     ``eventHandlers`` and the class name inside that module.
     """
     handlerStrs = ApiConfig().get("event", "handlers")
     for rawName in handlerStrs.split(","):
         name = rawName.strip()  # tolerate spaces around the commas
         print("name: " + name)
         m = importlib.import_module('eventHandlers.' + name)
         cls = getattr(m, name)
         self.eventHandlers.append(cls())
 def addEvent(self, objName, eStatus):
     """Record a new ps/worker job in redis: one count per tfId, set once."""
     print('*************** CleanJobHandler: ' + str(objName))
     rc = RedisHelper().getRedis()
     # try the ps name pattern first, then fall back to the worker pattern
     tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.psPt, objName)
     if tfId:
         # hsetnx: only the first event for a tfId stores its task count
         rc.hsetnx(ApiConfig().get("event", "ps_key"), tfId, cnt)
     else:
         tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.workerPt,
                                                objName)
         if tfId:
             rc.hsetnx(ApiConfig().get("event", "worker_key"), tfId, cnt)
예제 #5
0
    def genV1Rs(self, uid, modelParentPath, modelName):
        """Build an apps/v1 ReplicaSet body that runs one TF-Serving pod.

        The pod mounts the model directory from a glusterfs volume at
        /models/<modelName> and exposes the configured rpc and http ports.
        """
        servingId = "tf-serving-" + uid
        body = kubernetes.client.V1ReplicaSet()
        body.api_version = "apps/v1"
        body.kind = "ReplicaSet"
        body.metadata = kubernetes.client.V1ObjectMeta()
        body.metadata.name = servingId
        # the selector must match the pod template labels below
        labelSelector = kubernetes.client.V1LabelSelector()
        labelSelector.match_labels = {"tf": servingId}
        specBody = kubernetes.client.V1ReplicaSetSpec(selector=labelSelector)
        specBody.replicas = 1

        tempSpec = kubernetes.client.V1PodTemplateSpec()
        tempMetaBody = kubernetes.client.V1ObjectMeta()
        tempMetaBody.name = servingId
        tempMetaBody.labels = {"tf": servingId}
        tempSpec.metadata = tempMetaBody

        containerBody = kubernetes.client.V1Container(name=servingId)
        containerBody.image = ApiConfig().get("image", "serving")
        rpcPortBody = kubernetes.client.V1ContainerPort(
            container_port=ApiConfig().getint("k8s", "rpc_port"))
        rpcPortBody.name = "rpc"
        httpPortBody = kubernetes.client.V1ContainerPort(
            container_port=ApiConfig().getint("k8s", "http_port"))
        httpPortBody.name = "http"
        containerBody.ports = [rpcPortBody, httpPortBody]
        volMount = kubernetes.client.V1VolumeMount(mount_path="/models/" +
                                                   modelName,
                                                   name="glusterfsvol-" + uid)
        containerBody.volume_mounts = [volMount]
        # MODEL_NAME tells the serving container which model dir to load
        envBody = kubernetes.client.V1EnvVar(name="MODEL_NAME",
                                             value=modelName)
        containerBody.env = [envBody]

        volBody = kubernetes.client.V1Volume(name="glusterfsvol-" + uid)
        # TODO mount model path
        parentPath = modelParentPath + "/" if modelParentPath else ""
        modelPath = "/" + parentPath + modelName
        gfsVol = kubernetes.client.V1GlusterfsVolumeSource(
            endpoints="glusterfs-cluster",
            path="gv1/good/" + self.basicUsername + modelPath)
        volBody.glusterfs = gfsVol

        tempInnerSpec = kubernetes.client.V1PodSpec(containers=[containerBody],
                                                    volumes=[volBody])
        tempSpec.spec = tempInnerSpec
        specBody.template = tempSpec
        body.spec = specBody
        print(body)
        logging.info("rs body: %s", body)
        return body
예제 #6
0
    def genV1Job(self, uid, workType, seq, count, info, ps, workers):
        """Build a batch/v1 Job that runs /notebooks/entry.sh for one TF task.

        The pod mounts the user's glusterfs area at /mnt and passes the
        ps/worker host lists plus the HDFS endpoints to the entry script.
        NOTE(review): on any failure this logs the traceback and returns
        None (original behavior); callers must tolerate a None body.
        """
        try:
            print('gen v1 job ......')
            tfId = "-".join(["tf", str(uid), workType, str(seq), str(count)])
            body = kubernetes.client.V1Job()
            body.api_version = "batch/v1"
            body.kind = "Job"
            metaBody = kubernetes.client.V1ObjectMeta()
            metaBody.name = tfId
            body.metadata = metaBody

            tempSpec = kubernetes.client.V1PodTemplateSpec()
            tempMetaBody = kubernetes.client.V1ObjectMeta()
            tempMetaBody.name = tfId
            # the "tf" label lets the matching headless Service select the pod
            tempMetaBody.labels = {"tf": tfId}
            tempSpec.metadata = tempMetaBody
            containerBody = kubernetes.client.V1Container(name=tfId)
            volBody = kubernetes.client.V1Volume(name="glusterfsvol")
            gfsVol = kubernetes.client.V1GlusterfsVolumeSource(
                endpoints="glusterfs-cluster",
                path="gv1/good/" + self.basicUsername)
            volBody.glusterfs = gfsVol
            tempInnerSpec = kubernetes.client.V1PodSpec(
                containers=[containerBody], volumes=[volBody])
            # batch jobs must not restart finished pods
            tempInnerSpec.restart_policy = "Never"
            containerBody.image = ApiConfig().get("image", "tensorflow")
            hdfsUrl = ApiConfig().get("hdfs", "web")
            hdfsNN = ApiConfig().get("hdfs", "namenode")
            containerBody.command = [
                "/notebooks/entry.sh", workType,
                str(seq), ps, workers,
                info.get("file", ""),
                info.get("data", "/notebooks"),
                info.get("export", "/tmp"), hdfsUrl, hdfsNN,
                info.get("main", "")
            ]
            portBody = kubernetes.client.V1ContainerPort(
                container_port=ApiConfig().getint("k8s", "headless_port"))
            containerBody.ports = [portBody]
            volMount = kubernetes.client.V1VolumeMount(mount_path="/mnt",
                                                       name="glusterfsvol")
            containerBody.volume_mounts = [volMount]
            tempSpec.spec = tempInnerSpec
            specBody = kubernetes.client.V1JobSpec(template=tempSpec)
            body.spec = specBody
            print('gen v1 job ok ......')
            return body
        except Exception:
            # deliberately broad (was a bare except): keep the original
            # best-effort behavior but no longer swallow SystemExit etc.
            print('get exc ...')
            traceback.print_exc()
 def delEvent(self, objName, eStatus):
     """On ps-job deletion, decrement the tfId's ps counter; when it hits
     zero the run is finished and both hash entries are removed."""
     print('************* CleanJobHandler delete event: ' + str(objName))
     rc = RedisHelper().getRedis()
     tfId, seq, cnt = self.searchPattern(CleanJobHandler.psPt, objName)
     if tfId:
         print('delete event ps tfId: ' + tfId)
         psCurCount = rc.hincrby(ApiConfig().get("event", "ps_key"), tfId,
                                 -1)
         if (int(psCurCount) == 0):
             # TODO record successful tfId
             print('tfId successfully done')
             rc.hdel(ApiConfig().get("event", "ps_key"), tfId)
             rc.hdel(ApiConfig().get("event", "worker_key"), tfId)
예제 #8
0
 def createService(self, uid, runInfo):
     """Create one headless Service per ps/worker task listed in runInfo.

     runInfo maps a work type ("ps"/"worker") to its task count.
     Raises: ApiException (re-raised) when any create call fails.
     """
     authFile = ApiConfig().get("k8s", "auth_file")
     config.load_kube_config(authFile if authFile else None)
     configuration = kubernetes.client.Configuration()
     api_instance = kubernetes.client.CoreV1Api(
         kubernetes.client.ApiClient(configuration))
     namespace = 'default'
     for workType in runInfo:
         workCount = runInfo.get(workType, 1)
         for i in range(workCount):  # range works on both py2 and py3
             body = self.genV1Service(uid, workType, i, workCount)
             print(body)
             logging.info("create service body: %s", body)
             try:
                 print('=' * 10)
                 api_response = api_instance.create_namespaced_service(
                     namespace, body)
                 print(api_response)
                 logging.info("service response: %s", api_response)
             except ApiException as e:
                 logging.exception(
                     "Exception when calling CoreV1Api->create_namespaced_service: %s"
                     % e)
                 raise
예제 #9
0
 def storeInfo(self, uid, ps_hosts, worker_hosts):
     """Persist the running-job record for *uid* in redis.

     Adds the uid to the running set and stores its host lists as JSON.
     """
     redis_client = RedisHelper().getRedis()
     payload = json.dumps({
         "ps": ps_hosts,
         "worker": worker_hosts,
         "status": "running",
     })
     # TODO pipeline
     redis_client.sadd(ApiConfig().get("redis", "running_set"), uid)
     redis_client.set(uid, payload)
예제 #10
0
 def addEvent(self, objName, eStatus):
     """On job-add events, record the ps/worker task count for the job's uid.

     Job names look like tf-<uuid>-<role>-<seq>-<count>;
     group(1) = uuid, group(2) = task index, group(3) = task count.
     """
     print('*************** MarkJobHandler: ' + str(objName))
     rc = RedisHelper().getRedis()
     # shared uuid fragment, previously duplicated in both patterns
     uuidPt = "([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})"
     res = re.match("tf-" + uuidPt + "-ps-([0-9].*)-([0-9].*)", objName)
     if res:
         # hsetnx: only the first ps event for a uid stores the count
         rc.hsetnx(ApiConfig().get("event", "ps_key"), res.group(1),
                   res.group(3))
     else:
         res = re.match("tf-" + uuidPt + "-worker-([0-9].*)-([0-9].*)",
                        objName)
         if not res:
             return
         rc.hsetnx(ApiConfig().get("event", "worker_key"), res.group(1),
                   res.group(3))
예제 #11
0
 def createJob(self, uid, info):
     """Create the ps/worker Jobs for one training run.

     Returns (ps_hosts, worker_hosts): "name:port" endpoints of all tasks.
     Raises: ApiException (re-raised) when a create call fails.
     """
     configuration = kubernetes.client.Configuration()
     api_instance = kubernetes.client.BatchV1Api(
         kubernetes.client.ApiClient(configuration))
     # guard against a missing "detail" section instead of crashing on None
     runInfo = info.get("detail") or {}
     ps_count = runInfo.get("ps", 0)
     worker_count = runInfo.get("worker", 0)
     svcPort = ApiConfig().get("k8s", "headless_port")
     ps_hosts = [
         "-".join(["tf", str(uid), "ps",
                   str(i), str(ps_count)]) + ":" + svcPort
         for i in range(ps_count)
     ]
     worker_hosts = [
         "-".join(
             ["tf", str(uid), "worker",
              str(i), str(worker_count)]) + ":" + svcPort
         for i in range(worker_count)
     ]
     print("ps: " + str(ps_hosts))
     logging.info("ps: %s", ps_hosts)
     print("worker: " + str(worker_hosts))
     logging.info("worker: %s", worker_hosts)
     for workType in runInfo:
         count = runInfo.get(workType, 1)
         for i in range(count):
             try:
                 body = self.genV1Job(uid, workType, i, count, info,
                                      ",".join(ps_hosts),
                                      ",".join(worker_hosts))
                 print(body)
                 namespace = ApiConfig().get("namespace",
                                             info.get("type", "tensorflow"))
                 api_response = api_instance.create_namespaced_job(
                     namespace, body)
                 print(api_response)
                 logging.info("create job: %s", api_response)
             except ApiException as e:
                 logging.exception(
                     "Exception when calling BatchV1Api->create_namespaced_job: %s"
                     % e)
                 raise
     return ps_hosts, worker_hosts
예제 #12
0
 def genV1Service(self, uid, workType, seq, count):
     """Build a headless v1 Service that selects one TF task's pod.

     cluster_ip "None" makes the Service headless so the task is
     reachable by its DNS name for inter-task communication.
     """
     # str(uid) for consistency with genV1Job, which also stringifies uid
     tfId = "-".join(["tf", str(uid), workType, str(seq), str(count)])
     body = kubernetes.client.V1Service()
     body.api_version = "v1"
     body.kind = "Service"
     metaBody = kubernetes.client.V1ObjectMeta()
     metaBody.name = tfId
     body.metadata = metaBody
     specBody = kubernetes.client.V1ServiceSpec()
     specBody.cluster_ip = "None"
     specBody.selector = {"tf": tfId}
     portBody = kubernetes.client.V1ServicePort(
         port=ApiConfig().getint("k8s", "headless_port"))
     portBody.target_port = ApiConfig().getint("k8s", "headless_port")
     specBody.ports = [portBody]
     body.spec = specBody
     return body
예제 #13
0
 def removeWorker(self, workerList):
     """Best-effort deletion of the Service for each worker name given."""
     print('deleting worker list: ' + str(workerList))
     # TODO del k8s-ps, del keys
     authFile = ApiConfig().get("k8s", "auth_file")
     config.load_kube_config(authFile if authFile else None)
     configuration = kubernetes.client.Configuration()
     delSvcInstance = kubernetes.client.CoreV1Api(
         kubernetes.client.ApiClient(configuration))
     body = kubernetes.client.V1DeleteOptions()
     # Foreground: dependents are removed before the owning object
     body.propagation_policy = 'Foreground'
     namespace = ApiConfig().get("namespace", "tensorflow")
     for worker in workerList:
         try:
             svcRes = delSvcInstance.delete_namespaced_service(
                 worker, namespace, body)
             print('----------------- worker svcRes: ' + str(svcRes))
         except Exception:
             # best-effort: log and continue with the remaining workers
             traceback.print_exc()
예제 #14
0
class DisDeepService(object):
    """Tornado HTTP service exposing the /v1/train endpoint."""

    def __init__(self):
        # shared configuration accessor
        self.config = ApiConfig()

    def start(self):
        """Bind the configured port and run the IO loop forever."""
        routes = [(r"/v1/train", TrainHandler)]
        app = tornado.web.Application(routes)
        port = self.config.getint("service", "port")
        app.listen(port)
        logging.info("service start ...")
        tornado.ioloop.IOLoop.current().start()
예제 #15
0
 def post(self, path):
     """Proxy the uploaded request body to HDFS (webhdfs CREATE) via PUT.

     The upstream fetch is asynchronous: the handler finishes right away
     and self.on_response handles the HDFS reply.
     """
     print("POST")
     print("path: " + path)
     print("file: " + str(self.request.body))
     suffix = "?op=CREATE&user.name={0}&data=true".format(ApiConfig().get(
         "request", "hdfs_user"))
     fullUrl = ApiConfig().get("request", "hdfs_url") + "/" + path + suffix
     print('url: ' + fullUrl)
     header = {"Content-Type": "application/octet-stream"}
     request = HTTPRequest(url=fullUrl,
                           method="PUT",
                           headers=header,
                           body=self.request.body,
                           request_timeout=ApiConfig().getint(
                               "request", "timeout"))
     client = AsyncHTTPClient()
     client.fetch(request, self.on_response)
     self.finish()
예제 #16
0
 def modifEvent(self, objName, eStatus):
     """Track worker completion; when the last worker of a uid succeeds,
     push the uid's ps/worker job names onto the redis delete queue."""
     print('*************** UpdateJobHandler modify event: ' + str(objName))
     rc = RedisHelper().getRedis()
     psPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-ps-([0-9].*)-([0-9].*)"
     res = re.match(psPt, objName)
     if res:
         # ps may be shut down by itself through a signal from a worker;
         # nothing to do for ps status changes
         print('ps modified')
     else:
         workerPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-worker-([0-9].*)-([0-9].*)"
         res = re.match(workerPt, objName)
         if not res:
             return
         if eStatus.succeeded and eStatus.succeeded == 1:
             workerKey = res.group(1)
             curCount = rc.hincrby(ApiConfig().get("event", "worker_key"),
                                   workerKey, -1)
             if (int(curCount) == 0):
                 print('prepare delete ps ++++++++++++++++++++++++++++++')
                 # NOTE(review): hget may return None if ps_key was never
                 # recorded for this uid; int(psCnt) would raise -- confirm
                 psCnt = rc.hget(ApiConfig().get("event", "ps_key"),
                                 res.group(1))
                 allPs = [
                     'tf-' + res.group(1) + '-ps-' + str(i) + '-' + psCnt
                     for i in range(int(psCnt))
                 ]
                 allWorker = [
                     'tf-' + res.group(1) + '-worker-' + str(i) + '-' +
                     res.group(3) for i in range(int(res.group(3)))
                 ]
                 print('all ps: ' + str(allPs))
                 print('all worker: ' + str(allWorker))
                 tfInfo = {'ps': allPs, 'worker': allWorker}
                 rc.rpush(ApiConfig().get("event", "delete_queue"),
                          json.dumps(tfInfo))
             else:
                 print('one tf worker done successfully ......')
         else:
             # TODO mark failed
             pass
 def modifEvent(self, objName, eStatus):
     """Count down succeeded workers for a tfId; when the last one finishes,
     queue the tfId's ps/worker job names for deletion."""
     print('*************** CleanJobHandler modify event: ' + str(objName))
     rc = RedisHelper().getRedis()
     tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.psPt, objName)
     if tfId:
         # ps may be shut down by itself through a signal from a worker
         print('ps modified')
     else:
         tfId, curSeq, cnt = self.searchPattern(CleanJobHandler.workerPt,
                                                objName)
         if not tfId:
             return
         if eStatus.succeeded and eStatus.succeeded == 1:
             curCount = rc.hincrby(ApiConfig().get("event", "worker_key"),
                                   tfId, -1)
             if (int(curCount) == 0):
                 print('all worker done, clean ...')
                 # NOTE(review): hget may return None if ps_key was never
                 # recorded; int(psCnt) would then raise -- confirm
                 psCnt = rc.hget(ApiConfig().get("event", "ps_key"), tfId)
                 print('psCnt: ' + psCnt)
                 allPs = [
                     'tf-' + tfId + '-ps-' + str(i) + '-' + psCnt
                     for i in range(int(psCnt))
                 ]
                 allWorker = [
                     'tf-' + tfId + '-worker-' + str(i) + '-' + cnt
                     for i in range(int(cnt))
                 ]
                 tfInfo = {'ps': allPs, 'worker': allWorker}
                 print('tfInfo: ' + str(tfInfo))
                 try:
                     pushRes = rc.rpush(
                         ApiConfig().get("event", "delete_queue"),
                         json.dumps(tfInfo))
                     print('pushRes: ' + str(pushRes))
                 except Exception:
                     # best-effort: log the failed push and carry on
                     traceback.print_exc()
             else:
                 print('one tf worker done successfully ......')
         else:
             # TODO failed
             pass
예제 #18
0
 def genV1Service(self, uid):
     """Build a v1 Service exposing the rpc and http ports of the
     TF-Serving pod for *uid* (selected via the "tf" label)."""
     servingId = "tf-serving-" + uid
     svc = kubernetes.client.V1Service()
     svc.api_version = "v1"
     svc.kind = "Service"
     meta = kubernetes.client.V1ObjectMeta()
     meta.name = servingId
     svc.metadata = meta
     spec = kubernetes.client.V1ServiceSpec()
     spec.selector = {"tf": servingId}
     ports = []
     # each entry: (port name == container target port name, config key)
     for portName, cfgKey in (("rpc", "rpc_port"), ("http", "http_port")):
         portSpec = kubernetes.client.V1ServicePort(
             port=ApiConfig().getint("k8s", cfgKey))
         portSpec.name = portName
         portSpec.target_port = portName
         ports.append(portSpec)
     spec.ports = ports
     svc.spec = spec
     return svc
예제 #19
0
 def delEvent(self, objName, eStatus):
     """On ps-job deletion, decrement the uid's ps counter; when it reaches
     zero remove both the ps and worker hash entries for that uid."""
     print('************* UpdateJobHandler delete event: ' + str(objName))
     rc = RedisHelper().getRedis()
     psPt = "tf-([0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12})-ps-([0-9].*)-([0-9].*)"
     res = re.match(psPt, objName)
     if res:
         print('delete event matched')
         psKey = res.group(1)
         print('delete event ps_key: ' + psKey)
         try:
             psCurCount = rc.hincrby(ApiConfig().get("event", "ps_key"),
                                     psKey, -1)
         except Exception:
             # bug fix: previously execution fell through and raised a
             # NameError on psCurCount below; bail out once the
             # decrement failed
             print('got error')
             traceback.print_exc()
             return
         print('after hincrby ......')
         print('delete event ps cur count: ' + str(psCurCount))
         if (int(psCurCount) == 0):
             print('')
             rc.hdel(ApiConfig().get("event", "ps_key"), psKey)
             rc.hdel(ApiConfig().get("event", "worker_key"), psKey)
     else:
         print('del event not matched')
예제 #20
0
 def removePs(self, psList):
     """Best-effort deletion of the Job and Service for each ps name."""
     print('deleting ps list: ' + str(psList))
     # TODO del k8s-ps, del keys
     authFile = ApiConfig().get("k8s", "auth_file")
     config.load_kube_config(authFile if authFile else None)
     configuration = kubernetes.client.Configuration()
     delJobInstance = kubernetes.client.BatchV1Api(
         kubernetes.client.ApiClient(configuration))
     delSvcInstance = kubernetes.client.CoreV1Api(
         kubernetes.client.ApiClient(configuration))
     body = kubernetes.client.V1DeleteOptions()
     # Foreground: dependents (pods) are removed before the owning job
     body.propagation_policy = 'Foreground'
     namespace = ApiConfig().get("namespace", "tensorflow")
     for ps in psList:
         try:
             jobRes = delJobInstance.delete_namespaced_job(
                 ps, namespace, body)
             print('----------------- ps jobRes: ' + str(jobRes))
             svcRes = delSvcInstance.delete_namespaced_service(
                 ps, namespace, body)
             print('----------------- ps svcRes: ' + str(svcRes))
         except Exception:
             # best-effort: continue deleting the remaining ps entries
             traceback.print_exc()
예제 #21
0
    def checkJobs(self, jobInfo):
        """Classify running uids into fully-successful and failed ones.

        jobInfo is a k8s job list; job names end with
        ...-<type>-<seq>-<count>, so the uid is everything between the
        leading "tf" and the last three dash-separated fields.
        Returns (successUids, failedUids).
        """
        successUids = []
        failedUids = []
        rc = RedisHelper().getRedis()
        runningSets = rc.smembers(ApiConfig().get("redis", "running_set"))
        for info in jobInfo.items:
            uid = "-".join(info.metadata.name.split('-')[1:-3])
            print("make uid: " + str(uid))
            if uid not in runningSets:
                continue
            failedCount = info.status.failed
            succeedCount = info.status.succeeded
            uidJs = rc.get(uid)
            print('failedCount: ' + str(failedCount))
            print('succeedCount: ' + str(succeedCount))
            print('type: ' + str(type(succeedCount)))
            # bug fix: uidJs may be None here; plain concatenation raised
            # TypeError before the None check below could run
            print('uidjs: ' + str(uidJs))
            if not uidJs:
                continue
            uidDetail = json.loads(uidJs)
            print('detail: ' + str(uidDetail))
            if succeedCount and succeedCount == 1:
                print('success done')
                if "success" not in uidDetail:
                    uidDetail["success"] = []
                if info.metadata.name not in uidDetail["success"]:
                    uidDetail["success"].append(info.metadata.name)
                rc.set(uid, json.dumps(uidDetail))
            else:
                print('failed done')
                if failedCount and failedCount >= 1:
                    if "failed" not in uidDetail:
                        uidDetail["failed"] = []
                    if info.metadata.name not in uidDetail["failed"]:
                        uidDetail["failed"].append(info.metadata.name)
                    # NOTE(review): unlike the success branch, the failed
                    # list is never written back to redis -- confirm intent

        for uid in runningSets:
            uidJs = rc.get(uid)
            if not uidJs:
                continue
            uidDetail = json.loads(uidJs)
            runningJobs = uidDetail.get("worker")
            successJobs = uidDetail.get("success", [])
            failedJobs = uidDetail.get("failed", [])
            # a uid succeeds when every worker job reported success
            if len(runningJobs) == len(successJobs):
                successUids.append(uid)
            elif len(failedJobs) >= 1:
                failedUids.append(uid)
        return successUids, failedUids
예제 #22
0
 def run(self):
     """Consume the redis delete queue forever, removing the listed ps
     jobs/services and worker services for each queued entry."""
     try:
         rc = RedisHelper().getRedis()
         while True:
             # blocking pop (timeout 0 = wait forever); res is (key, value)
             res = rc.blpop(ApiConfig().get("event", "delete_queue"), 0)
             print('------------------ get res: ' + str(res))
             jsInfo = res[1]
             print('--------------  get info: ' + str(jsInfo))
             infoMap = json.loads(jsInfo)
             self.removePs(infoMap.get('ps', []))
             self.removeWorker(infoMap.get('worker', []))
     except KeyboardInterrupt:
         pass
     except Exception:
         # was a bare except; log the failure that ended the loop
         traceback.print_exc()
예제 #23
0
파일: dl.py 프로젝트: goodshark/kube-deep
class DisDeepService(object):
    """Tornado web service wiring up the /v1 HTTP endpoints."""

    def __init__(self):
        # shared configuration accessor
        self.config = ApiConfig()

    def start(self):
        """Build the application, bind the configured port, run forever."""
        # NOTE(review): cookie secret is hard-coded in source; consider
        # loading it from configuration instead
        settings = {
            "cookie_secret": "bZJc2sWbQLKos6GkHn/VB9oXwQt8S0R0kRvJ5/xJ89E="
        }
        routes = [
            (r"/v1/train", TrainHandler),
            (r"/v1/serving", ServingHandler),
            (r"/v1/test", TestHandler),
            (r"/v1/user/([a-zA-Z0-9_-]+)", UserHandler),
            (r"/v1/upload/(.*)", UploadHandler),
        ]
        app = tornado.web.Application(routes, **settings)
        app.listen(self.config.getint("service", "port"))
        logging.info("service start ...")
        tornado.ioloop.IOLoop.current().start()
예제 #24
0
 def removePs(self, psList):
     """Best-effort deletion of the Job and Service for each ps name."""
     print('deleting ps list: ' + str(psList))
     config.load_kube_config()
     configuration = kubernetes.client.Configuration()
     delJobInstance = kubernetes.client.BatchV1Api(
         kubernetes.client.ApiClient(configuration))
     delSvcInstance = kubernetes.client.CoreV1Api(
         kubernetes.client.ApiClient(configuration))
     body = kubernetes.client.V1DeleteOptions()
     # Foreground: dependents are removed before the owning object
     body.propagation_policy = 'Foreground'
     namespace = ApiConfig().get("namespace", "tensorflow")
     for ps in psList:
         try:
             delJobInstance.delete_namespaced_job(ps, namespace, body)
             delSvcInstance.delete_namespaced_service(ps, namespace, body)
         except Exception:
             # best-effort: continue with the remaining entries
             traceback.print_exc()
예제 #25
0
 def createRs(self, uid, info):
     """Create the TF-Serving ReplicaSet for *uid*.

     Returns the API response of the create call.
     Raises: ApiException (re-raised) when the create call fails.
     """
     configuration = kubernetes.client.Configuration()
     api_instance = kubernetes.client.AppsV1Api(
         kubernetes.client.ApiClient(configuration))
     namespace = ApiConfig().get("namespace",
                                 info.get("type", "tensorflow"))
     model = info.get("name", "test")
     modelParentPath = info.get("path", "/path")
     try:
         body = self.genV1Rs(uid, modelParentPath, model)
         api_response = api_instance.create_namespaced_replica_set(
             namespace, body)
         # previously the response was assigned but dropped; return it so
         # callers can inspect the created object (backward compatible)
         return api_response
     except ApiException as e:
         print(
             "Exception when calling AppsV1Api->create_namespaced_replica_set: %s\n"
             % e)
         logging.error(
             "Exception when calling AppsV1Api->create_namespaced_replica_set: %s\n"
             % e)
         raise
예제 #26
0
 def __new__(cls, *args, **kwargs):
     """Singleton constructor: the first call builds the shared instance
     and its redis connection; every later call returns the same object."""
     if not hasattr(cls, 'instance'):
         cls.instance = super(RedisHelper, cls).__new__(cls)
         settings = ApiConfig()
         # one class-level connection shared by every "instance"
         cls.redis = redis.Redis(host=settings.get("redis", "host"),
                                 port=settings.getint("redis", "port"))
     return cls.instance
예제 #27
0
 def moveUid(self, uid):
     """Relocate *uid* from the redis running set to the success set."""
     redis_client = RedisHelper().getRedis()
     source_set = ApiConfig().get("redis", "running_set")
     target_set = ApiConfig().get("redis", "success_set")
     redis_client.smove(source_set, target_set, uid)
예제 #28
0
 def __init__(self):
     # Cache the shared ApiConfig accessor for later configuration lookups.
     self.config = ApiConfig()