Exemplo n.º 1
0
 def get_scheduled_history(self,job_id):
     """
     list the scheduled tasks (history) of a job

     Sends a ListTask request for job_id to the master and converts every
     entry of response.scheduled_tasks into a BaseEntity.

     args:
         job_id: id of the job, copied into ListTaskRequest.job_id
     return:
         (True, [BaseEntity, ...]) on success;
         (False, []) when the rpc yields a falsy response or when an
         error is raised while calling the master or converting a task
     """
     req = master_pb2.ListTaskRequest()
     req.job_id = job_id
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(3.5)
     try:
         response = master.ListTask(controller,req)
         if not response:
             LOG.error('fail to list task %s'%job_id)
             return False,[]
         ret = []
         for task in response.scheduled_tasks:
             base = BaseEntity()
             base.id = task.info.task_id
             # a status missing from STATE_MAP raises KeyError, which is
             # handled below and fails the whole listing
             base.status = STATE_MAP[task.status]
             base.name = task.info.task_name
             base.agent_addr = task.agent_addr
             base.job_id = task.job_id
             base.offset = task.offset
             base.mem_limit = task.info.required_mem
             base.cpu_limit = task.info.required_cpu
             base.mem_used = task.memory_usage
             base.cpu_used = task.cpu_usage
             base.start_time = task.start_time
             base.gc_path = task.root_path
             # end_time is rendered as a local-time "month-day H:M:S" string
             end_time = datetime.datetime.fromtimestamp(task.end_time)
             base.end_time = end_time.strftime("%m-%d %H:%M:%S")
             ret.append(base)
         return True,ret
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception('fail to list task history')
     return False,[]
Exemplo n.º 2
0
 def list_all_node(self):
     """
     list all node of galaxy master

     Every entry of response.nodes is copied into a BaseEntity carrying
     id, node_id, addr, task_num, cpu/mem share, cpu/mem allocated and
     cpu/mem used.

     return:
           a list of BaseEntity; an empty list (not None) is returned
           when the rpc yields a falsy response or an error is raised
     """
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(1.5)
     request = master_pb2.ListNodeRequest()
     try:
         response = master.ListNode(controller,request)
         if not response:
             LOG.error('fail to call list node')
             return []
         ret = []
         for node in response.nodes:
             base = BaseEntity()
             # id and node_id intentionally carry the same value
             base.id = node.node_id
             base.node_id = node.node_id
             base.addr = node.addr
             base.task_num = node.task_num
             base.cpu_share = node.cpu_share
             base.mem_share = node.mem_share
             base.cpu_allocated = node.cpu_allocated
             base.mem_allocated = node.mem_allocated
             base.mem_used = node.mem_used
             base.cpu_used = node.cpu_used
             ret.append(base)
         return ret
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception("fail to call list node")
     return []
Exemplo n.º 3
0
 def get_real_time_status(self):
     """
     ask the master for its status

     return:
         the response object of Master.GetStatus, unmodified; errors
         raised by the rpc are not caught here
     """
     rpc_ctrl = client.Controller()
     rpc_ctrl.SetTimeout(5)
     stub = master_pb2.Master_Stub(self.channel)
     status_req = master_pb2.GetMasterStatusRequest()
     return stub.GetStatus(rpc_ctrl, status_req)
Exemplo n.º 4
0
 def list_task_by_job_id(self,job_id):
     """
     list the tasks of a job

     Sends a ListTask request for job_id to the master and converts every
     entry of response.tasks into a BaseEntity.

     args:
         job_id: id of the job, copied into ListTaskRequest.job_id
     return:
         (True, [BaseEntity, ...]) on success;
         (False, []) when the rpc yields a falsy response or when an
         error is raised while calling the master or converting a task
     """
     req = master_pb2.ListTaskRequest()
     req.job_id = job_id
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(3.5)
     try:
         response = master.ListTask(controller,req)
         if not response:
             LOG.error('fail to list task %s'%job_id)
             return False,[]
         ret = []
         for task in response.tasks:
             base = BaseEntity()
             base.id = task.info.task_id
             # a status missing from STATE_MAP raises KeyError, which is
             # handled below and fails the whole listing
             base.status = STATE_MAP[task.status]
             base.name = task.info.task_name
             base.agent_addr = task.agent_addr
             base.job_id = task.job_id
             base.offset = task.offset
             base.mem_limit = task.info.required_mem
             base.cpu_limit = task.info.required_cpu
             base.mem_used = task.memory_usage
             base.cpu_used = task.cpu_usage
             base.start_time = task.start_time
             ret.append(base)
         return True,ret
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception('fail to list task')
     return False,[]
Exemplo n.º 5
0
    def list_all_job(self):
        """
        list all jobs known to the master together with their trace

        Every entry of response.jobs becomes a BaseEntity with job_id,
        job_name, running_task_num and replica_num; its `trace` attribute
        is a second BaseEntity holding the counters and deploy times of
        job.trace, with trace.state translated through SCHEDULE_STATE_MAP.

        return:
            (True, [BaseEntity, ...]) on success;
            (False, []) when the rpc yields a falsy response or an error
            is raised while calling the master or converting a job
        """
        request = master_pb2.ListJobRequest()
        master = master_pb2.Master_Stub(self.channel)
        controller = client.Controller()
        controller.SetTimeout(1.5)
        try:
            response =  master.ListJob(controller,request)
            if not response:
                return False,[]
            ret  = []
            for job in response.jobs:
                base = BaseEntity()
                base.job_id = job.job_id
                base.job_name = job.job_name
                base.running_task_num = job.running_task_num
                base.replica_num = job.replica_num
                trace = BaseEntity()
                trace.killed_count = job.trace.killed_count
                trace.overflow_killed_count = job.trace.overflow_killed_count
                trace.start_count = job.trace.start_count
                trace.deploy_failed_count = job.trace.deploy_failed_count
                trace.reschedule_count = job.trace.reschedule_count
                trace.deploy_start_time = job.trace.deploy_start_time
                trace.deploy_end_time = job.trace.deploy_end_time
                # a state missing from SCHEDULE_STATE_MAP raises KeyError,
                # which is handled below and fails the whole listing
                trace.state = SCHEDULE_STATE_MAP[job.trace.state]
                base.trace = trace
                ret.append(base)
            return True,ret
        except Exception:
            # Exception rather than a bare except: KeyboardInterrupt and
            # SystemExit must not be swallowed here
            LOG.exception('fail to list jobs')
        return False,[]
Exemplo n.º 6
0
    def make_job(self,name,pkg_type,
                      pkg_src,boot_cmd,
                      replicate_num = 1,
                      mem_limit = 1024,
                      cpu_limit = 2,
                      deploy_step_size=-1):
        """
        send a new job command to galaxy master

        The request is built by self._build_new_job_req; pkg_src is
        converted with str() before it is passed on.

        args:
            name, pkg_type, pkg_src, boot_cmd: required, must be truthy
            replicate_num, mem_limit, cpu_limit, deploy_step_size:
                forwarded unchanged to self._build_new_job_req
        return:
            (True, job_id)  when response.status == 0
            (False, job_id) when the master answers with another status
            (False, None)   when the rpc yields a falsy response or an
                            error is raised by the rpc
        raise:
            AssertionError when a required argument is falsy; errors from
            self._build_new_job_req are not caught either
        """
        # kept as asserts so callers keep seeing AssertionError; note that
        # these checks disappear when python runs with -O
        assert name
        assert pkg_type
        assert pkg_src
        assert boot_cmd
        req = self._build_new_job_req(name,pkg_type,str(pkg_src),
                                      boot_cmd,
                                      replicate_num = replicate_num,
                                      mem_limit = mem_limit,
                                      cpu_limit = cpu_limit,
                                      deploy_step_size = deploy_step_size)
        master = master_pb2.Master_Stub(self.channel)
        controller = client.Controller()
        controller.SetTimeout(1.5)
        try:
            response = master.NewJob(controller,req)
            if not response:
                LOG.error("fail to create job")
                return False,None
            return response.status == 0,response.job_id
        except Exception:
            # Exception rather than a bare except: KeyboardInterrupt and
            # SystemExit must not be swallowed here
            LOG.exception("fail to create  job")
        return False,None
Exemplo n.º 7
0
 def get_all_job(self):
     """
     fetch the jobs known to the master

     return:
         (jobs, True) where jobs is the `jobs` field of the ListJobs
         response; errors raised by the rpc are not caught here
     """
     rpc_ctrl = client.Controller()
     rpc_ctrl.SetTimeout(5)
     stub = master_pb2.Master_Stub(self.channel)
     list_req = master_pb2.ListJobsRequest()
     reply = stub.ListJobs(rpc_ctrl, list_req)
     return reply.jobs, True
Exemplo n.º 8
0
 def kill_job(self,job_id):
     """
     ask the master to kill a job

     args:
         job_id: id of the job, copied into KillJobRequest.job_id
     return:
         always None; the response of the master is not inspected and
         a failing rpc is only logged
     """
     req = master_pb2.KillJobRequest()
     req.job_id = job_id
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(1.5)
     try:
         master.KillJob(controller,req)
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception('fail to kill job')
Exemplo n.º 9
0
 def tag_agent(self, tag, agent_set):
     """
     attach a tag to a set of agents

     args:
         tag: passed as TagEntity.tag
         agent_set: passed as TagEntity.agents
     return:
         True when the master answers with status 0, False for any other
         status or when an error is raised by the rpc
     """
     entity = master_pb2.TagEntity(tag = tag,
                                   agents = agent_set)
     request = master_pb2.TagAgentRequest(tag_entity = entity)
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(1.5)
     try:
         response = master.TagAgent(controller, request)
         return response.status == 0
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception("fail to tag agent")
         return False
Exemplo n.º 10
0
 def update_job(self,id,replicate_num):
     """
     change the replica number of a job

     args:
         id: job id, converted with int() (a value int() rejects raises
             here, outside the try block)
         replicate_num: new replica number, converted with int()
     return:
         True when the master answers with status 0; False when the
         response is falsy, carries another status, or the rpc raises
     """
     req = master_pb2.UpdateJobRequest()
     req.job_id = int(id)
     req.replica_num = int(replicate_num)
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(1.5)
     try:
         response = master.UpdateJob(controller,req)
         if not response or response.status != 0 :
             return False
         return True
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception('fail to update job')
     return False
Exemplo n.º 11
0
 def list_tag(self):
     """
     list the tags known to the master

     return:
         a list with one dict per entry of response.tags; each dict is
         the __dict__ of a BaseEntity on which `tag` and `agents` (a
         plain list) were set. [] is returned when anything raises.
     """
     list_req = master_pb2.ListTagRequest()
     stub = master_pb2.Master_Stub(self.channel)
     rpc_ctrl = client.Controller()
     rpc_ctrl.SetTimeout(1.5)
     try:
         reply = stub.ListTag(rpc_ctrl, list_req)
         tag_dicts = []
         for item in reply.tags:
             entity = BaseEntity()
             entity.tag = item.tag
             # copy the repeated field into an ordinary list
             entity.agents = list(item.agents)
             tag_dicts.append(entity.__dict__)
         return tag_dicts
     except Exception as err:
         LOG.exception("fail to list tag %s"%str(err))
         return []
Exemplo n.º 12
0
 def get_pods(self, jobid):
     """
     list the pods of a job

     args:
         jobid: id of the job, copied into ShowPodRequest.jobid
     return:
         (pods, True) on success, where every pod is the dict produced by
         util.pb2dict with "stage" and "state" replaced by their enum
         names; ([], False) when response.status is not galaxy_pb2.kOk.
         Errors raised by the rpc are not caught here.
     """
     controller = client.Controller()
     controller.SetTimeout(5)
     master = master_pb2.Master_Stub(self.channel)
     request = master_pb2.ShowPodRequest()
     request.jobid = jobid
     response = master.ShowPod(controller, request)
     if response.status != galaxy_pb2.kOk:
         LOG.error("fail get pods")
         return [], False
     # the result list has to be created here; nothing else in this
     # method binds `pods`, so without it the success path raises NameError
     pods = []
     for pod in response.pods:
         new_pod = util.pb2dict(pod)
         # expose the enum values by name instead of by number
         new_pod["stage"] = galaxy_pb2.PodStage.Name(pod.stage)
         new_pod["state"] = galaxy_pb2.PodState.Name(pod.state)
         pods.append(new_pod)
     return pods, True
Exemplo n.º 13
0
 def update_job(self,id,replicate_num, deploy_step_size = None):
     """
     change the replica number (and optionally the deploy step size) of a job

     args:
         id: job id, converted with int() (a value int() rejects raises
             here, outside the try block)
         replicate_num: new replica number, converted with int()
         deploy_step_size: only written to the request when it is not None
     return:
         True when the master answers with status 0; False when the
         response is falsy, carries another status, the rpc times out or
         any other error is raised by the rpc
     """
     req = master_pb2.UpdateJobRequest()
     req.job_id = int(id)
     req.replica_num = int(replicate_num)
     # identity test: None means "leave the field unset", 0 is a real value
     if deploy_step_size is not None:
         req.deploy_step_size = deploy_step_size
     master = master_pb2.Master_Stub(self.channel)
     controller = client.Controller()
     controller.SetTimeout(1.5)
     try:
         response = master.UpdateJob(controller,req)
         if not response or response.status != 0 :
             return False
         return True
     except client.TimeoutError:
         LOG.exception('rpc timeout')
     except Exception:
         # Exception rather than a bare except: KeyboardInterrupt and
         # SystemExit must not be swallowed here
         LOG.exception('fail to update job')
     return False
Exemplo n.º 14
0
    def list_all_job(self):
        """
        list all jobs known to the master

        Every entry of response.jobs becomes a BaseEntity with job_id,
        job_name, running_task_num and replica_num.

        return:
            (True, [BaseEntity, ...]) on success;
            (False, []) when the rpc yields a falsy response or an error
            is raised while calling the master or converting a job
        """
        request = master_pb2.ListJobRequest()
        master = master_pb2.Master_Stub(self.channel)
        controller = client.Controller()
        controller.SetTimeout(1.5)
        try:
            response =  master.ListJob(controller,request)
            if not response:
                return False,[]
            ret  = []
            for job in response.jobs:
                base = BaseEntity()
                base.job_id = job.job_id
                base.job_name = job.job_name
                base.running_task_num = job.running_task_num
                base.replica_num = job.replica_num
                ret.append(base)
            return True,ret
        except Exception:
            # Exception rather than a bare except: KeyboardInterrupt and
            # SystemExit must not be swallowed here
            LOG.exception('fail to list jobs')
        return False,[]
Exemplo n.º 15
0
 def HeartBeat(self):
     """
     send heart beats to the master in an endless loop

     Each round builds a HeartBeatRequest from self._cpu, self._mem,
     self._version, self._my_addr and the values of self._task_status,
     sends it, and stores agent_id and version of the response in
     self._agent_id and self._version. Building the request, the rpc and
     the update of self happen while self._mutex is held.

     The method never returns; an error raised by the rpc is not caught
     and ends the loop by propagating to the caller.
     """
     while True:
         with self._mutex:
             master = master_pb2.Master_Stub(self._channel)
             controller = client.Controller()
             controller.SetTimeout(100)
             req = master_pb2.HeartBeatRequest()
             req.cpu_share = self._cpu
             req.mem_share = self._mem
             req.version = self._version
             req.agent_addr = self._my_addr
             status_list = []
             for key in self._task_status:
                 # single parenthesised argument: prints the same text
                 # whether print is a statement or a function
                 print("running task %s " % key)
                 status_list.append(self._task_status[key])
             req.task_status.extend(status_list)
             response = master.HeartBeat(controller, req)
             self._agent_id = response.agent_id
             self._version = response.version
             print("heart beat version %s agent %s" % (self._version,
                                                       self._agent_id))
         # pause between rounds only after the mutex is released, so that
         # other users of self._mutex are not blocked during the sleep
         time.sleep(1)
Exemplo n.º 16
0
Arquivo: sdk.py Projeto: sguzwf/lumia
 def get_agents(self):
     """
     """
     master = master_pb2.Master_Stub(self.channel)