def get_scheduled_history(self, job_id):
    """
    list the scheduled task history of a job

    Sends a ListTaskRequest carrying ``job_id`` to the master and converts
    every entry of ``response.scheduled_tasks`` into a BaseEntity.

    return:
        (True, [BaseEntity, ...]) on success,
        (False, []) when the master returns no response or the call raises
    """
    req = master_pb2.ListTaskRequest()
    req.job_id = job_id
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(3.5)
    try:
        response = master.ListTask(controller, req)
        if not response:
            LOG.error('fail to list task %s' % job_id)
            return False, []
        history = []
        for task in response.scheduled_tasks:
            entity = BaseEntity()
            entity.id = task.info.task_id
            # An unknown status raises KeyError, which is reported by the
            # handler below as a failed call.
            entity.status = STATE_MAP[task.status]
            entity.name = task.info.task_name
            entity.agent_addr = task.agent_addr
            entity.job_id = task.job_id
            entity.offset = task.offset
            entity.mem_limit = task.info.required_mem
            entity.cpu_limit = task.info.required_cpu
            entity.mem_used = task.memory_usage
            entity.cpu_used = task.cpu_usage
            entity.start_time = task.start_time
            entity.gc_path = task.root_path
            # end_time is rendered as a local-time "month-day H:M:S" string.
            finished_at = datetime.datetime.fromtimestamp(task.end_time)
            entity.end_time = finished_at.strftime("%m-%d %H:%M:%S")
            history.append(entity)
        return True, history
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to list task history')
        return False, []
def list_all_node(self):
    """
    list all node of galaxy master

    return:
        a list of BaseEntity, one per entry of ``response.nodes``;
        an empty list if the master returns no response or the call raises
    """
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    request = master_pb2.ListNodeRequest()
    try:
        response = master.ListNode(controller, request)
        if not response:
            LOG.error('fail to call list node')
            return []
        nodes = []
        for node in response.nodes:
            entity = BaseEntity()
            # The node id is exposed under both ``id`` and ``node_id``.
            entity.id = node.node_id
            entity.node_id = node.node_id
            entity.addr = node.addr
            entity.task_num = node.task_num
            entity.cpu_share = node.cpu_share
            entity.mem_share = node.mem_share
            entity.cpu_allocated = node.cpu_allocated
            entity.mem_allocated = node.mem_allocated
            entity.mem_used = node.mem_used
            entity.cpu_used = node.cpu_used
            nodes.append(entity)
        return nodes
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception("fail to call list node")
        return []
def get_real_time_status(self):
    """
    fetch the master status

    Issues a GetMasterStatusRequest through ``master.GetStatus`` and hands
    the raw response back. No exception is caught here, so a failing call
    propagates to the caller.
    """
    rpc_controller = client.Controller()
    rpc_controller.SetTimeout(5)
    stub = master_pb2.Master_Stub(self.channel)
    status_request = master_pb2.GetMasterStatusRequest()
    return stub.GetStatus(rpc_controller, status_request)
def list_task_by_job_id(self, job_id):
    """
    list the tasks of a job

    Sends a ListTaskRequest carrying ``job_id`` to the master and converts
    every entry of ``response.tasks`` into a BaseEntity.

    return:
        (True, [BaseEntity, ...]) on success,
        (False, []) when the master returns no response or the call raises
    """
    req = master_pb2.ListTaskRequest()
    req.job_id = job_id
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(3.5)
    try:
        response = master.ListTask(controller, req)
        if not response:
            LOG.error('fail to list task %s' % job_id)
            return False, []
        tasks = []
        for task in response.tasks:
            entity = BaseEntity()
            entity.id = task.info.task_id
            # An unknown status raises KeyError, which is reported by the
            # handler below as a failed call.
            entity.status = STATE_MAP[task.status]
            entity.name = task.info.task_name
            entity.agent_addr = task.agent_addr
            entity.job_id = task.job_id
            entity.offset = task.offset
            entity.mem_limit = task.info.required_mem
            entity.cpu_limit = task.info.required_cpu
            entity.mem_used = task.memory_usage
            entity.cpu_used = task.cpu_usage
            entity.start_time = task.start_time
            tasks.append(entity)
        return True, tasks
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to list task')
        return False, []
def list_all_job(self):
    """
    list all jobs known to the master, including their schedule trace

    Every entry of ``response.jobs`` becomes a BaseEntity whose ``trace``
    attribute is a second BaseEntity holding the job's trace counters,
    deploy timestamps and mapped schedule state.

    return:
        (True, [BaseEntity, ...]) on success,
        (False, []) when the master returns no response or the call raises
    """
    request = master_pb2.ListJobRequest()
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        response = master.ListJob(controller, request)
        if not response:
            return False, []
        jobs = []
        for job in response.jobs:
            entity = BaseEntity()
            entity.job_id = job.job_id
            entity.job_name = job.job_name
            entity.running_task_num = job.running_task_num
            entity.replica_num = job.replica_num

            trace = BaseEntity()
            trace.killed_count = job.trace.killed_count
            trace.overflow_killed_count = job.trace.overflow_killed_count
            trace.start_count = job.trace.start_count
            trace.deploy_failed_count = job.trace.deploy_failed_count
            trace.reschedule_count = job.trace.reschedule_count
            trace.deploy_start_time = job.trace.deploy_start_time
            trace.deploy_end_time = job.trace.deploy_end_time
            # An unknown state raises KeyError, which is reported by the
            # handler below as a failed call.
            trace.state = SCHEDULE_STATE_MAP[job.trace.state]
            entity.trace = trace

            jobs.append(entity)
        return True, jobs
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to list jobs')
        return False, []
def make_job(self, name, pkg_type, pkg_src, boot_cmd,
             replicate_num=1,
             mem_limit=1024,
             cpu_limit=2,
             deploy_step_size=-1):
    """
    send a new job command to galaxy master

    ``name``, ``pkg_type``, ``pkg_src`` and ``boot_cmd`` must all be truthy;
    an AssertionError is raised otherwise. ``pkg_src`` is converted with
    ``str()`` before the request is built by ``self._build_new_job_req``.

    return:
        (True, job_id)  when the master answers with status 0,
        (False, job_id) when the master answers with any other status,
        (False, None)   when the master returns no response or the call raises
    """
    assert name
    assert pkg_type
    assert pkg_src
    assert boot_cmd
    req = self._build_new_job_req(name, pkg_type, str(pkg_src),
                                  boot_cmd,
                                  replicate_num=replicate_num,
                                  mem_limit=mem_limit,
                                  cpu_limit=cpu_limit,
                                  deploy_step_size=deploy_step_size)
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        response = master.NewJob(controller, req)
        if not response:
            LOG.error("fail to create job")
            return False, None
        # The job id is reported back regardless of the status.
        return response.status == 0, response.job_id
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception("fail to create job")
        return False, None
def get_all_job(self):
    """
    list jobs through the master's ListJobs call

    No exception is caught here, so a failing call propagates to the caller.

    return:
        (response.jobs, True)
    """
    rpc_controller = client.Controller()
    rpc_controller.SetTimeout(5)
    stub = master_pb2.Master_Stub(self.channel)
    list_request = master_pb2.ListJobsRequest()
    reply = stub.ListJobs(rpc_controller, list_request)
    jobs = reply.jobs
    return jobs, True
def kill_job(self, job_id):
    """
    ask the master to kill a job

    The response of the KillJob call is not inspected. A failing call is
    logged and not re-raised.

    return:
        None in every case
    """
    req = master_pb2.KillJobRequest()
    req.job_id = job_id
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        master.KillJob(controller, req)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to kill job')
def tag_agent(self, tag, agent_set):
    """
    attach a tag to a set of agents

    Wraps ``tag`` and ``agent_set`` in a TagEntity and sends it to the
    master in a TagAgentRequest.

    return:
        True when the master answers with status 0, False for any other
        status or when the call raises
    """
    entity = master_pb2.TagEntity(tag=tag, agents=agent_set)
    request = master_pb2.TagAgentRequest(tag_entity=entity)
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        response = master.TagAgent(controller, request)
        return response.status == 0
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception("fail to tag agent")
        return False
def update_job(self, id, replicate_num):
    """
    change the replica number of a job

    ``id`` and ``replicate_num`` are converted with ``int()`` before the
    RPC is issued; a conversion error is not caught and propagates to the
    caller.

    return:
        True when the master answers with status 0, False when there is no
        response, the status is non-zero, or the call raises
    """
    req = master_pb2.UpdateJobRequest()
    req.job_id = int(id)
    req.replica_num = int(replicate_num)
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        response = master.UpdateJob(controller, req)
        if not response or response.status != 0:
            return False
        return True
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to update job')
        return False
def list_tag(self):
    """
    list every tag together with the agents it is attached to

    return:
        a list with one dict per entry of ``response.tags`` (the
        ``__dict__`` of a BaseEntity carrying ``tag`` and ``agents``);
        an empty list when the call raises
    """
    list_request = master_pb2.ListTagRequest()
    stub = master_pb2.Master_Stub(self.channel)
    rpc_controller = client.Controller()
    rpc_controller.SetTimeout(1.5)
    try:
        reply = stub.ListTag(rpc_controller, list_request)
        tags = []
        for item in reply.tags:
            entity = BaseEntity()
            entity.tag = item.tag
            # Copy the agents into a plain list.
            entity.agents = list(item.agents)
            tags.append(entity.__dict__)
        return tags
    except Exception as e:
        LOG.exception("fail to list tag %s" % str(e))
        return []
def get_pods(self, jobid):
    """
    list the pods of a job

    Sends a ShowPodRequest for ``jobid`` to the master. Each returned pod
    is converted to a dict with ``util.pb2dict`` and its ``stage`` and
    ``state`` values are replaced by their enum names. No exception is
    caught here, so a failing call propagates to the caller.

    return:
        (pods, True) on success, where pods is a list of dicts,
        ([], False) when the response status is not ``galaxy_pb2.kOk``
    """
    controller = client.Controller()
    controller.SetTimeout(5)
    master = master_pb2.Master_Stub(self.channel)
    request = master_pb2.ShowPodRequest()
    request.jobid = jobid
    response = master.ShowPod(controller, request)
    if response.status != galaxy_pb2.kOk:
        LOG.error("fail get pods")
        return [], False
    # The accumulator must exist before the loop; without it the success
    # path fails with NameError.
    pods = []
    for pod in response.pods:
        new_pod = util.pb2dict(pod)
        new_pod["stage"] = galaxy_pb2.PodStage.Name(pod.stage)
        new_pod["state"] = galaxy_pb2.PodState.Name(pod.state)
        pods.append(new_pod)
    return pods, True
def update_job(self, id, replicate_num, deploy_step_size=None):
    """
    change the replica number (and optionally the deploy step size) of a job

    ``id`` and ``replicate_num`` are converted with ``int()`` before the
    RPC is issued; a conversion error is not caught and propagates to the
    caller. ``deploy_step_size`` is only set on the request when it is not
    None.

    return:
        True when the master answers with status 0, False when there is no
        response, the status is non-zero, the call times out, or the call
        raises
    """
    req = master_pb2.UpdateJobRequest()
    req.job_id = int(id)
    req.replica_num = int(replicate_num)
    if deploy_step_size is not None:
        req.deploy_step_size = deploy_step_size
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        response = master.UpdateJob(controller, req)
        if not response or response.status != 0:
            return False
        return True
    except client.TimeoutError:
        LOG.exception('rpc timeout')
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to update job')
    # Reached only after one of the handlers above has logged the failure.
    return False
def list_all_job(self):
    """
    list all jobs known to the master

    Every entry of ``response.jobs`` becomes a BaseEntity carrying the
    job's id, name, running task number and replica number.

    return:
        (True, [BaseEntity, ...]) on success,
        (False, []) when the master returns no response or the call raises
    """
    request = master_pb2.ListJobRequest()
    master = master_pb2.Master_Stub(self.channel)
    controller = client.Controller()
    controller.SetTimeout(1.5)
    try:
        response = master.ListJob(controller, request)
        if not response:
            return False, []
        jobs = []
        for job in response.jobs:
            entity = BaseEntity()
            entity.job_id = job.job_id
            entity.job_name = job.job_name
            entity.running_task_num = job.running_task_num
            entity.replica_num = job.replica_num
            jobs.append(entity)
        return True, jobs
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        LOG.exception('fail to list jobs')
        return False, []
def HeartBeat(self):
    """
    report this agent's state to the master in an endless loop

    Each round, while holding ``self._mutex``, builds a HeartBeatRequest
    from ``self._cpu``, ``self._mem``, ``self._version``, ``self._my_addr``
    and the values of ``self._task_status``, sends it, and stores the
    ``agent_id`` and ``version`` of the response back on ``self``. It then
    sleeps one second before the next round.

    The method never returns. No exception is caught here, so a failing
    call ends the loop by propagating to the caller.
    """
    while True:
        with self._mutex:
            master = master_pb2.Master_Stub(self._channel)
            controller = client.Controller()
            controller.SetTimeout(100)
            req = master_pb2.HeartBeatRequest()
            req.cpu_share = self._cpu
            req.mem_share = self._mem
            req.version = self._version
            req.agent_addr = self._my_addr
            status_list = []
            for key in self._task_status:
                # Parenthesised single-argument print gives the same output
                # as a statement on Python 2 and as a function on Python 3.
                print("running task %s " % key)
                status_list.append(self._task_status[key])
            req.task_status.extend(status_list)
            response = master.HeartBeat(controller, req)
            self._agent_id = response.agent_id
            self._version = response.version
            print("heart beat version %s agent %s" % (self._version, self._agent_id))
        # Sleep after releasing the mutex so other holders are not blocked
        # for the idle second between rounds.
        time.sleep(1)
def get_agents(self): """ """ master = master_pb2.Master_Stub(self.channel)