예제 #1
0
    def stop_vnode(self, request, context):
        """Stop a batch vnode and release the resources attached to it.

        The container name is built from ``request.username``,
        ``request.taskid`` and ``request.vnodeid``. The steps run in order:
        stop the container, unmount its oss mounts, delete its filesystem,
        delete the bridge named in the request, and release its GPU devices.
        Failures to stop or delete are logged and do not abort the sequence.

        Returns:
            An ``rpc_pb2.Reply`` with status ACCEPTED and an empty message.
        """
        logger.info('stop vnode with config: ' + str(request))
        name_parts = (request.username, request.taskid, str(request.vnodeid))
        bridge = request.vnode.network.brname
        mounts = request.vnode.mount
        lxcname = '%s-batch-%s-%s' % name_parts

        logger.info("Stop the task with lxc:"+lxcname)
        stopped = lxc.Container(lxcname).stop()
        if not stopped:
            logger.error("stop container %s failed" % lxcname)
        else:
            logger.info("stop container %s success" % lxcname)

        # Unmount oss before the container filesystem is removed.
        self.umount_oss("/var/lib/lxc/%s/oss" % (lxcname), mounts)

        logger.info("deleting container:%s" % lxcname)
        deleted = self.imgmgr.deleteFS(lxcname)
        if not deleted:
            logger.error("delete container %s failed" % lxcname)
        else:
            logger.info("delete container %s success" % lxcname)

        # Delete the ovs bridge named in the request, if one was given.
        if bridge is not None:
            netcontrol.del_bridge(bridge)

        # Release the GPU devices held by this container.
        self.release_gpu_device(lxcname)

        return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
예제 #2
0
 def stop_task(self, request, context):
     """Kill the LXC container that runs a batch task.

     The container name is built from ``request.username``,
     ``request.taskid`` and ``request.vnodeid``, and the container is
     killed with ``lxc-stop -k``.

     Returns:
         An ``rpc_pb2.Reply`` with status ACCEPTED and an empty message,
         whether or not ``lxc-stop`` succeeded (a failure is only logged).
     """
     logger.info('stop task with config: ' + str(request))
     taskid = request.taskid
     username = request.username
     vnodeid = request.vnodeid
     lxcname = '%s-batch-%s-%s' % (username,taskid,str(vnodeid))
     logger.info("Stop the task with lxc:"+lxcname)
     # Pass an argument list (no shell) so that request-derived fields in
     # the container name can never be interpreted as shell syntax.
     ret = subprocess.run(["lxc-stop", "-k", "-n", lxcname],
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT)
     if ret.returncode != 0:
         logger.error("lxc-stop for %s exited with %s: %s" % (
             lxcname, ret.returncode,
             ret.stdout.decode(errors="replace").strip()))
     return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
예제 #3
0
 def stop_tasks(self, request, context):
     """Kill the LXC container of every task listed in the request.

     For each message in ``request.taskmsgs`` the container name is built
     from its ``username``, ``taskid``, ``instanceid`` and ``token``, and
     the container is killed with ``lxc-stop -k``.

     Returns:
         An ``rpc_pb2.Reply`` with status ACCEPTED and an empty message,
         whether or not the individual ``lxc-stop`` calls succeeded
         (failures are only logged).
     """
     for msg in request.taskmsgs:
         lxcname = '%s-batch-%s-%s-%s' % (msg.username, msg.taskid,
                                          str(msg.instanceid), msg.token)
         logger.info("Stop the task with lxc:" + lxcname)
         # Pass an argument list (no shell) so that request-derived fields
         # in the container name can never be interpreted as shell syntax.
         ret = subprocess.run(["lxc-stop", "-k", "-n", lxcname],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
         if ret.returncode != 0:
             logger.error("lxc-stop for %s exited with %s: %s" % (
                 lxcname, ret.returncode,
                 ret.stdout.decode(errors="replace").strip()))
     return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
예제 #4
0
    def start_task(self, request, context):
        """Launch a batch task in a background thread and reply immediately.

        Reads the command line, package path, environment variables,
        timeout, token and stdout/stderr redirect paths from ``request``,
        adds ``taskid`` and ``vnodeid`` to the environment variables, and
        hands everything to ``self.execute_task`` on a daemon thread.

        Returns:
            An ``rpc_pb2.Reply`` with status ACCEPTED and an empty message;
            the reply is sent without waiting for the task to run.
        """
        logger.info('start task with config: ' + str(request))
        taskid = request.taskid
        username = request.username
        vnodeid = request.vnodeid
        # get config from request
        command = request.parameters.command.commandLine
        pkgpath = request.parameters.command.packagePath
        envs = request.parameters.command.envVars
        envs['taskid'] = str(taskid)
        envs['vnodeid'] = str(vnodeid)
        timeout = request.timeout
        token = request.token
        outpath = [request.parameters.stdoutRedirectPath,request.parameters.stderrRedirectPath]
        lxcname = '%s-batch-%s-%s' % (username,taskid,str(vnodeid))

        # daemon=True replaces the deprecated Thread.setDaemon(True).
        thread = threading.Thread(
            target=self.execute_task,
            args=(username,taskid,vnodeid,envs,lxcname,pkgpath,command,timeout,outpath,token),
            daemon=True)
        thread.start()

        return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")
예제 #5
0
    def process_task(self, request, context):
        """Create and start a container for a batch task, then run the task.

        Steps, in order:
          1. Read the command, environment, image, mount and resource
             settings from ``request``.
          2. Create the container via ``self.create_container``.
          3. Mount oss and append one bind-mount entry per requested mount
             to the container's lxc config file.
          4. Start the container with ``lxc-start``.
          5. Attach the requested number of GPU devices.
          6. Run ``self.execute_task`` on a daemon thread.

        Returns:
            An ``rpc_pb2.Reply``: REFUSED with a message when container
            creation, container start or GPU attachment fails (the latter
            two after releasing the ip and deleting the container
            filesystem); otherwise ACCEPTED with an empty message, sent
            without waiting for the task to run.
        """
        logger.info('excute task with parameter: ' + str(request))
        taskid = request.id
        instanceid = request.instanceid

        # get config from request
        command = request.parameters.command.commandLine
        pkgpath = request.parameters.command.packagePath
        envs = request.parameters.command.envVars
        envs['taskid'] = str(taskid)
        envs['instanceid'] = str(instanceid)
        image = {}
        image['name'] = request.cluster.image.name
        if request.cluster.image.type == rpc_pb2.Image.PRIVATE:
            image['type'] = 'private'
        elif request.cluster.image.type == rpc_pb2.Image.PUBLIC:
            image['type'] = 'public'
        else:
            image['type'] = 'base'
        image['owner'] = request.cluster.image.owner
        username = request.username
        token = request.token
        lxcname = '%s-batch-%s-%s-%s' % (username, taskid, str(instanceid),
                                         token)
        instance_type = request.cluster.instance
        mount_list = request.cluster.mount
        outpath = [
            request.parameters.stdoutRedirectPath,
            request.parameters.stderrRedirectPath
        ]
        timeout = request.timeout
        gpu_need = int(request.cluster.instance.gpu)

        #create container
        # On failure the second element carries the error message.
        [success, ip] = self.create_container(instanceid, username, image,
                                              lxcname, instance_type)
        if not success:
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=ip)

        #mount oss
        self.mount_oss("%s/global/users/%s/oss" % (self.fspath, username),
                       mount_list)
        # rootfs was referenced below without ever being defined (NameError
        # as soon as mount_list was non-empty); it is the rootfs directory
        # under the container's lxc directory.
        rootfs = "/var/lib/lxc/%s/rootfs" % lxcname
        mount_str = "lxc.mount.entry = %s/global/users/%s/oss/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
        # The context manager closes the config file even if a write fails.
        with open("/var/lib/lxc/%s/config" % lxcname, 'a+') as conffile:
            for mount in mount_list:
                conffile.write("\n" + mount_str %
                               (self.fspath, username, mount.remotePath,
                                rootfs, mount.remotePath))

        logger.info("Start container %s..." % lxcname)
        # Argument lists (no shell) keep request-derived fields in the
        # container name from being interpreted as shell syntax.
        ret = subprocess.run(['lxc-start', '-n', lxcname],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        if ret.returncode != 0:
            logger.error('start container %s failed' % lxcname)
            self.release_ip(ip)
            self.imgmgr.deleteFS(lxcname)
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,
                                 message="Can't start the container")

        logger.info('start container %s success' % lxcname)

        #add GPU
        [success, msg] = self.add_gpu_device(lxcname, gpu_need)
        if not success:
            logger.error("Fail to add gpu device. " + msg)
            # The original called container.stop() on a name that was never
            # bound in this method; kill the running container through
            # lxc-stop instead so the rollback actually executes.
            subprocess.run(['lxc-stop', '-k', '-n', lxcname],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT)
            self.release_ip(ip)
            self.imgmgr.deleteFS(lxcname)
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,
                                 message="Fail to add gpu device. " + msg)

        # daemon=True replaces the deprecated Thread.setDaemon(True).
        thread = threading.Thread(target=self.execute_task,
                                  args=(username, taskid, instanceid, envs,
                                        lxcname, pkgpath, command, timeout,
                                        outpath, ip, token, mount_list),
                                  daemon=True)
        thread.start()

        return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED, message="")
예제 #6
0
    def start_vnode(self, request, context):
        """Create, configure and start the container for a batch vnode.

        Steps, in order:
          1. Read the image, resource, mount and network settings from
             ``request``.
          2. Create the container via ``self.create_container``.
          3. Mount oss under the container's lxc directory and append one
             bind-mount entry per requested mount to its config file.
          4. Start the container with ``lxc-start``.
          5. Set up a GRE tunnel on the bridge when the master ip differs
             from ``self.worker_ip``.
          6. Attach the requested number of GPU devices.
          7. Start the ssh service inside the container.

        Returns:
            An ``rpc_pb2.Reply``: REFUSED with a message when any of
            container creation, container start, GPU attachment or ssh
            start fails (the latter three after unmounting oss and deleting
            the container filesystem); otherwise ACCEPTED with an empty
            message.
        """
        logger.info('start vnode with config: ' + str(request))
        taskid = request.taskid
        vnodeid = request.vnodeid

        image = {}
        image['name'] = request.vnode.image.name
        if request.vnode.image.type == rpc_pb2.Image.PRIVATE:
            image['type'] = 'private'
        elif request.vnode.image.type == rpc_pb2.Image.PUBLIC:
            image['type'] = 'public'
        else:
            image['type'] = 'base'
        image['owner'] = request.vnode.image.owner
        username = request.username
        lxcname = '%s-batch-%s-%s' % (username,taskid,str(vnodeid))
        instance_type =  request.vnode.instance
        mount_list = request.vnode.mount
        gpu_need = int(request.vnode.instance.gpu)
        ipaddr = request.vnode.network.ipaddr
        gateway = request.vnode.network.gateway
        brname = request.vnode.network.brname
        masterip = request.vnode.network.masterip
        hostname = request.vnode.hostname

        #create container
        [success, msg] = self.create_container(taskid, vnodeid, username, image, lxcname, instance_type, ipaddr, gateway, brname, hostname)
        if not success:
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED, message=msg)

        #mount oss
        lxcpath = "/var/lib/lxc/%s" % lxcname
        rootfs = lxcpath + "/rootfs"
        osspath = lxcpath + "/oss"
        self.mount_oss(osspath, mount_list)
        mount_str = "lxc.mount.entry = "+ lxcpath +"/oss/%s/%s %s/root/oss/%s none bind,rw,create=dir 0 0"
        # The context manager closes the config file even if a write fails.
        with open(lxcpath + "/config", 'a+') as conffile:
            for mount in mount_list:
                conffile.write("\n"+ mount_str % (mount.provider, mount.remotePath, rootfs, mount.remotePath))

        logger.info("Start container %s..." % lxcname)
        container = lxc.Container(lxcname)
        # Argument lists (no shell) keep request-derived fields in the
        # container name from being interpreted as shell syntax.
        ret = subprocess.run(['lxc-start', '-n', lxcname],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        if ret.returncode != 0:
            logger.error('start container %s failed' % lxcname)
            self.umount_oss(osspath, mount_list)
            self.imgmgr.deleteFS(lxcname)
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Can't start the container(%s)"%lxcname)

        logger.info('start container %s success' % lxcname)

        if masterip != self.worker_ip:
            netcontrol.setup_gre(brname, masterip)

        #add GPU
        [success, msg] = self.add_gpu_device(lxcname,gpu_need)
        if not success:
            logger.error("Fail to add gpu device. " + msg)
            container.stop()
            self.umount_oss(osspath, mount_list)
            self.imgmgr.deleteFS(lxcname)
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to add gpu device. " + msg)

        #start ssh service
        ret = subprocess.run(['lxc-attach', '-n', lxcname, '--',
                              'service', 'ssh', 'start'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        if ret.returncode != 0:
            logger.error('Fail to start ssh service of container %s' % lxcname)
            container.stop()
            self.umount_oss(osspath, mount_list)
            self.imgmgr.deleteFS(lxcname)
            return rpc_pb2.Reply(status=rpc_pb2.Reply.REFUSED,message="Fail to start ssh service. lxc(%s)"%lxcname)

        return rpc_pb2.Reply(status=rpc_pb2.Reply.ACCEPTED,message="")