def get_nodeinfolist(self):
    """Return the nodes known to CLUES for a Mesos cluster.

    Every line of ``/etc/clues2/mesos_vnodes.info`` is registered as a node
    in state ``NodeInfo.OFF`` with illustrative resources.  Entries whose
    name equals the ``hostname`` of a slave returned by
    ``self._obtain_mesos_nodes()`` are then replaced with the figures
    reported for that slave.

    Returns:
        dict: node name -> ``NodeInfo``.
    """
    nodeinfolist = {}

    infile = open_file('/etc/clues2/mesos_vnodes.info')
    if infile:
        # try/finally so the handle is released even when a line cannot be
        # turned into a NodeInfo.
        try:
            for line in infile:
                name = line.rstrip('\n')
                # Illustrative values for Clues, since the node is not
                # running, we cannot know the real values
                slots_count = 1
                slots_free = 1
                memory_total = 1572864000
                memory_free = 1572864000
                # Create a fake queue
                keywords = {
                    'hostname': TypedClass.auto(name),
                    'queues': TypedList([TypedClass.auto("default")]),
                }
                nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                              memory_total, memory_free,
                                              keywords)
                nodeinfolist[name].state = NodeInfo.OFF
        finally:
            infile.close()

    mesos_slaves = self._obtain_mesos_nodes()
    if mesos_slaves:
        used_nodes = self._obtain_mesos_used_nodes()
        for mesos_slave in mesos_slaves['slaves']:
            name = mesos_slave['hostname']
            # Only slaves that are listed in the vnodes file are reported.
            for node in nodeinfolist:
                if name != nodeinfolist[node].name:
                    continue
                state = infer_clues_node_state(mesos_slave["id"],
                                               mesos_slave["active"],
                                               used_nodes)
                slots_count = float(mesos_slave['resources']['cpus'])
                memory_total = calculate_memory_bytes(
                    mesos_slave['resources']['mem'])
                used_cpu, used_mem = self._obtain_cpu_mem_used_in_mesos_node(
                    mesos_slave["id"])
                slots_free = slots_count - used_cpu
                memory_free = memory_total - used_mem
                # Create a fake queue
                keywords = {
                    'hostname': TypedClass.auto(name),
                    'queues': TypedList([TypedClass.auto("default")]),
                }
                # The key already exists, so the dict size does not change
                # while it is being iterated.
                nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                              memory_total, memory_free,
                                              keywords)
                nodeinfolist[name].state = state

    return nodeinfolist
def _get_NodeInfo(self, info_node, default_info_node):
    """Build the ``NodeInfo`` of a Nomad client.

    Args:
        info_node: dict describing the client.  The keys read here are
            ``name``, ``node_class``, ``status``, ``any_job_is_running``
            (only when the client is on) and ``resources``.
        default_info_node: dict with the defaults for the node; the keys
            read here are ``keywords``, ``cpus`` and ``memory``.

    Returns:
        A ``NodeInfo`` whose ``state`` is IDLE, USED, OFF or UNKNOWN
        depending on ``info_node['status']``.
    """
    # Work on a copy: the default entry belongs to the caller and must not
    # be altered by the queue override below.
    keywords = dict(default_info_node['keywords'])

    # Check queues
    q = info_node['node_class']
    if q in self._queues:
        keywords['queues'] = TypedList([TypedClass.auto(q)])
    elif q != '':
        _LOGGER.error(
            " '%s' (node_class of Nomad Client) is not a valid queue, queue is set to queue of file %s.",
            q, self._nodes_info_file)

    # Illustrative values for Clues, since the node is not running, we
    # cannot know the real values
    slots_count = default_info_node['cpus']
    slots_free = default_info_node['cpus']
    memory_total = default_info_node['memory']
    memory_free = default_info_node['memory']

    # Information obtained from queries
    resources = info_node['resources']
    if 'slots_count' in resources:
        slots_count = resources['slots_count']
    if 'memory_total' in resources:
        memory_total = resources['memory_total']
    if 'slots_used' in resources:
        slots_free = float(slots_count) - float(resources['slots_used'])
    if 'memory_used' in resources:
        memory_free = float(memory_total) - float(resources['memory_used'])

    # Check state
    status = info_node['status']
    if status == self._state_on:
        if info_node['any_job_is_running']:
            state = NodeInfo.USED
        else:
            state = NodeInfo.IDLE
    elif status == self._state_off:
        state = NodeInfo.OFF
    else:
        state = NodeInfo.UNKNOWN

    node = NodeInfo(info_node['name'], slots_count, slots_free, memory_total,
                    memory_free, keywords)
    node.state = state
    return node
def get_nodeinfolist(self):
    """Return the SLURM nodes as an ordered mapping name -> ``NodeInfo``.

    The node list comes from ``parse_scontrol(run_command(self._nodes))``.
    A node whose record cannot be processed is logged and skipped.

    Returns:
        ``collections.OrderedDict`` of ``NodeInfo`` objects, or ``None``
        when the command or its parsing fails.
    """
    nodeinfolist = collections.OrderedDict()

    # Example output of 'scontrol show nodes':
    #   NodeName=wn0 Arch=x86_64 CoresPerSocket=1 CPUAlloc=0 CPUErr=0
    #   CPUTot=1 CPULoad=0.02 Features=(null) Gres=(null) NodeAddr=wn0
    #   NodeHostName=wn0 Version=14.11 OS=Linux RealMemory=1 AllocMem=0
    #   Sockets=1 Boards=1 State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1
    #   BootTime=2015-04-28T13:12:21 SlurmdStartTime=2015-04-28T13:16:32
    #   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0 ExtSensorsJoules=n/s
    #   ExtSensorsWatts=0 ExtSensorsTemp=n/s
    nodes_data = " "
    try:
        nodes_data = parse_scontrol(run_command(self._nodes))
    except Exception:
        # 'except Exception' lets KeyboardInterrupt/SystemExit propagate.
        _LOGGER.error(
            "could not obtain information about SLURM nodes %s (%s)" %
            (self._server_ip, nodes_data))
        return None

    if nodes_data:
        for key in nodes_data:
            try:
                name = str(key["NodeName"])
                slots_count = int(key["CPUTot"])
                slots_free = slots_count - int(key["CPUAlloc"])
                # NOTE: memory is in GB
                memory_total = _translate_mem_value(key["RealMemory"] + ".GB")
                memory_free = memory_total - _translate_mem_value(
                    key["AllocMem"] + ".GB")
                state = infer_clues_node_state(self, str(key["State"]))
                queues = get_partition(self, name)
                keywords = {'hostname': TypedClass.auto(name)}
                if queues:
                    keywords['queues'] = TypedList(
                        [TypedClass.auto(q) for q in queues])
                nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                              memory_total, memory_free,
                                              keywords)
                nodeinfolist[name].state = state
            except Exception:
                _LOGGER.error("Error adding node: %s." % key)

    return nodeinfolist
def to_nodeinfo(self):
    """Convert this host into a ``NodeInfo``.

    The properties string ``self.keywords`` is evaluated, together with the
    host name and slot count, into the keyword dictionary handed to
    ``NodeInfo``.  If the evaluation fails the error is logged and an empty
    keyword dictionary is used instead.

    Returns:
        A ``NodeInfo`` whose state is ``self._infer_clues_state()``.
    """
    try:
        # TODO: it seems that creating a lexer or a yacc is very slow, so we
        # should get rid of this piece of code. We could simply recognise
        # the pairs key=value
        kw = cpyutils.evaluate.vars_from_string(
            "hostname=\"%s\";ppn=%d;%s" %
            (self.name, self.slots_count, self.keywords))
    except Exception:
        # 'except Exception' lets KeyboardInterrupt/SystemExit propagate.
        _LOGGER.error(
            "an error happened evaluating host information: 'hostname=\"%s\";%s'"
            % (self.name, self.keywords))
        kw = {}

    ni = NodeInfo(self.name, self.slots_count, self.slots_free,
                  self.memory_total, self.memory_free, kw)
    ni.state = self._infer_clues_state()
    return ni
def _host_to_nodeinfo(h):
    """Translate a host record into a ``NodeInfo``.

    For a host that is not 'free', a zero ``total_slots`` or ``memory_total``
    is rewritten to -1 on ``h`` itself before the ``NodeInfo`` is built.
    The state is set for 'free', 'busy', 'down' and 'error' hosts; any other
    host state leaves the ``NodeInfo`` state as constructed.
    """
    host_state = h.state

    if host_state != 'free':
        if h.total_slots == 0:
            h.total_slots = -1
        if h.memory_total == 0:
            h.memory_total = -1

    node_info = NodeInfo(h.NAME, h.total_slots, h.free_slots, h.memory_total,
                         h.memory_free, h.keywords)

    if host_state == 'free':
        # A free host is idle only when none of its slots is taken.
        all_slots_free = h.free_slots == h.total_slots
        node_info.state = Node.IDLE if all_slots_free else Node.USED
    elif host_state == 'busy':
        node_info.state = Node.USED
    elif host_state in ('down', 'error'):
        node_info.state = Node.OFF

    return node_info
def get_nodeinfolist(self):
    """Return the SLURM nodes as a dict name -> ``NodeInfo``.

    The node list comes from ``parse_scontrol(run_command(self._nodes))``.

    Returns:
        dict of ``NodeInfo`` objects, or ``None`` when the command or its
        parsing fails.
    """
    nodeinfolist = {}

    # Example output of 'scontrol show nodes':
    #   NodeName=wn0 Arch=x86_64 CoresPerSocket=1 CPUAlloc=0 CPUErr=0
    #   CPUTot=1 CPULoad=0.02 Features=(null) Gres=(null) NodeAddr=wn0
    #   NodeHostName=wn0 Version=14.11 OS=Linux RealMemory=1 AllocMem=0
    #   Sockets=1 Boards=1 State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1
    #   BootTime=2015-04-28T13:12:21 SlurmdStartTime=2015-04-28T13:16:32
    #   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0 ExtSensorsJoules=n/s
    #   ExtSensorsWatts=0 ExtSensorsTemp=n/s
    nodes_data = " "
    try:
        nodes_data = parse_scontrol(run_command(self._nodes))
    except Exception:
        # 'except Exception' lets KeyboardInterrupt/SystemExit propagate.
        _LOGGER.error(
            "could not obtain information about SLURM nodes %s (%s)" %
            (self._server_ip, nodes_data))
        return None

    if nodes_data:
        for key in nodes_data:
            name = str(key["NodeName"])
            slots_count = int(key["CPUTot"])
            slots_free = slots_count - int(key["CPUAlloc"])
            # NOTE: memory is in GB
            memory_total = _translate_mem_value(key["RealMemory"] + ".GB")
            memory_free = memory_total - _translate_mem_value(
                key["AllocMem"] + ".GB")
            state = infer_clues_node_state(self, str(key["State"]))
            queues = get_partition(self, name)
            keywords = {'hostname': TypedClass.auto(name)}
            # Truthiness also covers a None result, where len() would raise.
            if queues:
                keywords['queues'] = TypedList(
                    [TypedClass.auto(q) for q in queues])
            nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                          memory_total, memory_free, keywords)
            nodeinfolist[name].state = state

    return nodeinfolist
def get_nodeinfolist(self):
    """Return a dict name -> ``NodeInfo`` for every node in ``self.nodepool``.

    Nodes that are OFF, in error, powering on or powering off are reported
    as ``NodeInfo.OFF``.  A node that is ON is reported as IDLE when all its
    cores and memory are available, USED otherwise.  Any other node state
    leaves the ``NodeInfo`` state as constructed.
    """
    from clueslib.node import NodeInfo

    result = {}
    for pool_node in self.nodepool:
        info = NodeInfo(pool_node.name, pool_node.total_cores,
                        pool_node.cores, pool_node.total_memory,
                        pool_node.memory, {})
        result[info.name] = info

        current = pool_node.state
        if current in (Node.OFF, Node.ERR):
            info.state = NodeInfo.OFF
        elif current == Node.ON:
            untouched = (pool_node.cores == pool_node.total_cores
                         and pool_node.memory == pool_node.total_memory)
            info.state = NodeInfo.IDLE if untouched else NodeInfo.USED
        elif current in (Node.POW_ON, Node.POW_OFF):
            info.state = NodeInfo.OFF

    return result
def get_nodeinfolist(self):
    """Return the nodes known to CLUES for a Mesos cluster.

    Every line of ``/etc/clues2/mesos_vnodes.info`` is registered as a node
    in state ``NodeInfo.OFF`` sized with ``self._node_slots`` and
    ``self._node_memory``.  A slave returned by
    ``self._obtain_mesos_nodes()`` replaces the entry whose name, or whose
    resolved IP address, equals the slave ``hostname``.

    Returns:
        ``collections.OrderedDict``: node name -> ``NodeInfo``.
    """
    nodeinfolist = collections.OrderedDict()

    infile = open_file('/etc/clues2/mesos_vnodes.info')
    if infile:
        # try/finally so the handle is released even when a line cannot be
        # turned into a NodeInfo.
        try:
            for line in infile:
                name = line.rstrip('\n')
                # Illustrative values for Clues, since the node is not
                # running, we cannot know the real values
                slots_count = self._node_slots
                slots_free = self._node_slots
                memory_total = self._node_memory
                memory_free = self._node_memory
                # Create a fake queue
                keywords = {
                    'hostname': TypedClass.auto(name),
                    'queues': TypedList([TypedClass.auto("default")]),
                }
                nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                              memory_total, memory_free,
                                              keywords)
                nodeinfolist[name].state = NodeInfo.OFF
        finally:
            infile.close()

    mesos_slaves = self._obtain_mesos_nodes()
    if mesos_slaves:
        used_nodes = self._obtain_mesos_used_nodes()
        # node name -> IP (None when unresolved).  Resolving once per node
        # avoids one DNS query per (slave, node) pair.
        resolved_ips = {}
        for mesos_slave in mesos_slaves['slaves']:
            name = mesos_slave['hostname']
            for node in nodeinfolist:
                node_name = nodeinfolist[node].name
                if node_name not in resolved_ips:
                    try:
                        resolved_ips[node_name] = socket.gethostbyname(
                            node_name)
                    except Exception:
                        _LOGGER.warning("Error resolving node ip %s" %
                                        node_name)
                        resolved_ips[node_name] = None

                if name == node_name or name == resolved_ips[node_name]:
                    # Report the node under the name used in the vnodes file.
                    name = node_name
                    state = infer_clues_node_state(mesos_slave["id"],
                                                   mesos_slave["active"],
                                                   used_nodes)
                    slots_count = float(mesos_slave['resources']['cpus'])
                    memory_total = calculate_memory_bytes(
                        mesos_slave['resources']['mem'])
                    used_cpu, used_mem = \
                        self._obtain_cpu_mem_used_in_mesos_node(
                            mesos_slave["id"])
                    slots_free = slots_count - used_cpu
                    memory_free = memory_total - used_mem
                    # Create a fake queue
                    keywords = {
                        'hostname': TypedClass.auto(name),
                        'queues': TypedList([TypedClass.auto("default")]),
                    }
                    nodeinfolist[name] = NodeInfo(name, slots_count,
                                                  slots_free, memory_total,
                                                  memory_free, keywords)
                    nodeinfolist[name].state = state

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the HTCondor worker nodes as name -> ``NodeInfo``.

    When ``get_worker_nodes_list_from_Startd()`` reports workers, each one
    is added as IDLE (activity "Idle") or USED (any other activity); for a
    USED worker the CPUs and memory of the jobs placed on it are subtracted
    from its free resources.  When no workers are reported, the names in
    ``/etc/clues2/condor_vnodes.info`` are returned as OFF nodes with
    illustrative resources.

    Returns:
        ``collections.OrderedDict`` of ``NodeInfo`` objects, or ``None``
        when a worker has no "Activity" attribute or the vnodes file cannot
        be processed.
    """

    def _attr(ad, key, default):
        # Read one attribute, falling back to `default` when it is missing.
        try:
            return ad[key]
        except Exception:
            return default

    nodeinfolist = collections.OrderedDict()
    worker_nodes = get_worker_nodes_list_from_Startd()

    if worker_nodes:
        for worker_node in worker_nodes:
            activity = _attr(worker_node, "Activity", "undefined")
            if activity == "undefined":
                _LOGGER.warning("could not obtain information about nodes.")
                return None

            name = _attr(worker_node, "Name", "")
            slots = _attr(worker_node, "TotalSlots", 0)
            slots_free = slots
            memory = _attr(worker_node, "Memory", 0)
            memory_free = memory
            keywords = {'hostname': TypedClass.auto(name)}

            if activity == "Idle":
                state = NodeInfo.IDLE
            else:
                state = NodeInfo.USED
                # Discount the resources of every job running on this node.
                for scheduler in get_schedulers_list_from_Schedd():
                    for job in htcondor.Schedd(scheduler).query():
                        try:
                            nodes = job["AllRemoteHosts"].split(",")
                        except Exception:
                            try:
                                nodes = [job["RemoteHost"]]
                            except Exception:
                                nodes = []
                        if name not in nodes:
                            continue
                        slots_free -= _attr(job, "RequestCpus", 0)
                        try:
                            mem = (job["ImageSize"] + 1023) / 1024
                        except Exception:
                            mem = 0
                        memory_free -= mem
                if slots_free < 0:
                    slots_free = 0
                if memory_free < 0:
                    memory_free = 0

            keywords['queues'] = TypedList([TypedClass.auto("default")])
            nodeinfolist[name] = NodeInfo(name, slots, slots_free, memory,
                                          memory_free, keywords)
            nodeinfolist[name].state = state
    else:
        try:
            # 'with' closes the file even when building a node fails.
            with open('/etc/clues2/condor_vnodes.info', 'r') as infile:
                for line in infile:
                    name = line.rstrip('\n')
                    # Illustrative values for Clues, since the node is not
                    # running, we cannot know the real values
                    slots_count = 1
                    slots_free = 1
                    memory_total = 1572864000
                    memory_free = 1572864000
                    # Create a fake queue
                    keywords = {
                        'hostname': TypedClass.auto(name),
                        'queues': TypedList([TypedClass.auto("default")]),
                    }
                    nodeinfolist[name] = NodeInfo(name, slots_count,
                                                  slots_free, memory_total,
                                                  memory_free, keywords)
                    nodeinfolist[name].state = NodeInfo.OFF
        except Exception:
            _LOGGER.warning("could not obtain information about nodes.")
            return None

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the nodes known to CLUES for a Mesos cluster.

    The output of ``run_command(self._nodes)`` is parsed as JSON and the
    names in ``/etc/clues2/mesos_vnodes.info`` are registered as OFF nodes
    with illustrative resources.  Each element of the JSON whose
    ``hostname`` equals a registered name replaces that entry with the
    reported figures.

    Returns:
        dict of ``NodeInfo`` objects, or ``None`` when the command fails,
        its output is not valid JSON, or the vnodes file cannot be read.
    """
    nodeinfolist = {}

    # Example output of
    # /usr/bin/curl -L -X GET http://mesosserverpublic:5050/master/slaves
    # {
    #   "slaves": [
    #     {
    #       "active": true,
    #       "attributes": {},
    #       "hostname": "10.0.0.84",
    #       "id": "20150716-115932-1063856798-5050-14165-S0",
    #       "pid": "slave(1)@10.0.0.84:5051",
    #       "registered_time": 1437487335.75923,
    #       "reregistered_time": 1437487335.75927,
    #       "resources": {
    #         "cpus": 1, "disk": 13438, "mem": 623, "ports": "[31000-32000]"
    #       }
    #     }
    #   ]
    # }
    output = " "
    try:
        output = run_command(self._nodes)
        json_data = json.loads(output)
        # 'with' guarantees the vnodes file is closed.
        with open('/etc/clues2/mesos_vnodes.info', 'r') as infile:
            vnode_names = [line.rstrip('\n') for line in infile]
    except Exception:
        _LOGGER.error(
            "could not obtain information about MESOS nodes %s (%s)" %
            (self._server_ip, output))
        return None

    for name in vnode_names:
        # Illustrative values for Clues, since the node is not running, we
        # cannot know the real values
        slots_count = 1
        slots_free = 1
        memory_total = 1572864000
        memory_free = 1572864000
        # Create a fake queue
        keywords = {
            'hostname': TypedClass.auto(name),
            'queues': TypedList([TypedClass.auto("default")]),
        }
        nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                      memory_total, memory_free, keywords)
        nodeinfolist[name].state = NodeInfo.OFF

    if json_data:
        for details in json_data.values():
            used_nodes = obtain_used_nodes()
            for element in details:
                name = element['hostname']
                for known_name in nodeinfolist:
                    if name != nodeinfolist[known_name].name:
                        continue
                    state = infer_clues_node_state(element["id"],
                                                   element["active"],
                                                   used_nodes)
                    slots_count = float(element['resources']['cpus'])
                    memory_total = element['resources']['mem'] * 1048576
                    used_cpu, used_mem = obtain_cpu_mem_used(element["id"])
                    slots_free = slots_count - used_cpu
                    memory_free = memory_total - used_mem
                    # Create a fake queue
                    keywords = {
                        'hostname': TypedClass.auto(name),
                        'queues': TypedList([TypedClass.auto("default")]),
                    }
                    nodeinfolist[name] = NodeInfo(name, slots_count,
                                                  slots_free, memory_total,
                                                  memory_free, keywords)
                    nodeinfolist[name].state = state

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the HTCondor worker nodes as name -> ``NodeInfo``.

    Workers are located through ``htcondor.Collector``.  Each one is added
    as IDLE (activity "Idle") or USED (any other activity); for a USED
    worker the CPUs and memory of the jobs placed on it are subtracted from
    its free resources.  When no workers are located, the names in
    ``/etc/clues2/condor_vnodes.info`` are returned as OFF nodes with
    illustrative resources.

    Returns:
        dict of ``NodeInfo`` objects, or ``None`` when a worker has no
        "Activity" attribute or the vnodes file cannot be processed.
    """

    def _attr(ad, key, default):
        # Read one attribute, falling back to `default` when it is missing.
        try:
            return ad[key]
        except Exception:
            return default

    nodeinfolist = {}
    collector = htcondor.Collector()
    try:
        worker_nodes = collector.locateAll(htcondor.DaemonTypes.Startd)
    except Exception:
        worker_nodes = []

    if worker_nodes:
        for worker_node in worker_nodes:
            activity = _attr(worker_node, "Activity", "undefined")
            if activity == "undefined":
                _LOGGER.warning("could not obtain information about nodes.")
                return None

            name = _attr(worker_node, "Name", "")
            slots = _attr(worker_node, "TotalSlots", 0)
            slots_free = slots
            memory = _attr(worker_node, "Memory", 0)
            memory_free = memory
            keywords = {'hostname': TypedClass.auto(name)}

            if activity == "Idle":
                state = NodeInfo.IDLE
            else:
                state = NodeInfo.USED
                try:
                    schedulers = collector.locateAll(
                        htcondor.DaemonTypes.Schedd)
                except Exception:
                    schedulers = []
                # Discount the resources of every job running on this node.
                for scheduler in schedulers:
                    for job in htcondor.Schedd(scheduler).query():
                        try:
                            nodes = job["AllRemoteHosts"].split(",")
                        except Exception:
                            try:
                                nodes = [job["RemoteHost"]]
                            except Exception:
                                nodes = []
                        if name not in nodes:
                            continue
                        slots_free -= _attr(job, "RequestCpus", 0)
                        try:
                            mem = (job["ImageSize"] + 1023) / 1024
                        except Exception:
                            mem = 0
                        memory_free -= mem
                if slots_free < 0:
                    slots_free = 0
                if memory_free < 0:
                    memory_free = 0

            keywords['queues'] = TypedList([TypedClass.auto("default")])
            nodeinfolist[name] = NodeInfo(name, slots, slots_free, memory,
                                          memory_free, keywords)
            nodeinfolist[name].state = state
    else:
        try:
            # 'with' closes the file even when building a node fails.
            with open('/etc/clues2/condor_vnodes.info', 'r') as infile:
                for line in infile:
                    name = line.rstrip('\n')
                    # Illustrative values for Clues, since the node is not
                    # running, we cannot know the real values
                    slots_count = 1
                    slots_free = 1
                    memory_total = 1572864000
                    memory_free = 1572864000
                    # Create a fake queue
                    keywords = {
                        'hostname': TypedClass.auto(name),
                        'queues': TypedList([TypedClass.auto("default")]),
                    }
                    nodeinfolist[name] = NodeInfo(name, slots_count,
                                                  slots_free, memory_total,
                                                  memory_free, keywords)
                    nodeinfolist[name].state = NodeInfo.OFF
        except Exception:
            _LOGGER.warning("could not obtain information about nodes.")
            return None

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the nodes known to CLUES for a Mesos cluster.

    ``/etc/clues2/mesos_vnodes.info`` is read as a JSON list of node
    descriptions (``name`` plus optional ``cpu``, ``memory`` and
    ``keywords``); each one is registered as an OFF node.  A failure while
    processing the file is logged and the nodes read so far are kept.  A
    slave returned by ``self._obtain_mesos_nodes()`` then replaces the entry
    whose name, or whose resolved IP address, equals the slave ``hostname``.

    Returns:
        ``collections.OrderedDict``: node name -> ``NodeInfo``.
    """
    nodeinfolist = collections.OrderedDict()

    try:
        # 'with' closes the file as soon as it has been parsed.
        with open('/etc/clues2/mesos_vnodes.info', 'r') as vnodes_file:
            vnodes = json.load(vnodes_file)
        for vnode in vnodes:
            name = vnode["name"]
            if name in nodeinfolist:
                continue
            keywords = {'hostname': TypedClass(name, TypedClass.STRING)}

            slots_count = self._node_slots
            slots_free = self._node_slots
            if "cpu" in vnode:
                slots_count = int(vnode["cpu"])
                slots_free = int(vnode["cpu"])

            memory_total = self._node_memory
            memory_free = self._node_memory
            if "memory" in vnode:
                memory_total = get_memory_in_bytes(vnode["memory"])
                memory_free = get_memory_in_bytes(vnode["memory"])

            if "keywords" in vnode:
                # Comma separated list of key=value pairs
                for keypair in vnode["keywords"].split(','):
                    parts = keypair.split('=')
                    keywords[parts[0].strip()] = TypedClass(
                        parts[1].strip(), TypedClass.STRING)

            nodeinfolist[name] = NodeInfo(name, slots_count, slots_free,
                                          memory_total, memory_free, keywords)
            nodeinfolist[name].state = NodeInfo.OFF
    except Exception as ex:
        _LOGGER.error(
            "Error processing file /etc/clues2/mesos_vnodes.info: %s" %
            str(ex))

    mesos_slaves = self._obtain_mesos_nodes()
    if mesos_slaves:
        # node name -> IP (None when unresolved).  Resolving once per node
        # avoids one DNS query per (slave, node) pair.
        resolved_ips = {}
        for mesos_slave in mesos_slaves['slaves']:
            name = mesos_slave['hostname']
            for node in nodeinfolist:
                node_name = nodeinfolist[node].name
                if node_name not in resolved_ips:
                    try:
                        resolved_ips[node_name] = socket.gethostbyname(
                            node_name)
                    except Exception:
                        _LOGGER.warning("Error resolving node ip %s" %
                                        node_name)
                        resolved_ips[node_name] = None

                if name == node_name or name == resolved_ips[node_name]:
                    # Report the node under the name used in the vnodes file.
                    name = node_name
                    slots_count = float(mesos_slave['resources']['cpus'])
                    memory_total = calculate_memory_bytes(
                        mesos_slave['resources']['mem'])
                    used_cpu = float(mesos_slave['used_resources']['cpus'])
                    used_mem = calculate_memory_bytes(
                        mesos_slave['used_resources']['mem'])

                    if not mesos_slave["active"]:
                        state = NodeInfo.OFF
                    elif used_cpu > 0 or used_mem > 0:
                        state = NodeInfo.USED
                    else:
                        state = NodeInfo.IDLE

                    slots_free = slots_count - used_cpu
                    memory_free = memory_total - used_mem
                    keywords = {'hostname': TypedClass.auto(name)}
                    nodeinfolist[name] = NodeInfo(name, slots_count,
                                                  slots_free, memory_total,
                                                  memory_free, keywords)
                    nodeinfolist[name].state = state

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the SGE execution hosts as name -> ``NodeInfo``.

    The hosts come from the DOM returned by ``self._parse_qhost_xml()``;
    the generic host "global" is ignored.  For every host the slots of its
    queues are added up (queues whose state string contains 'd' or 's' are
    skipped), and the queue names and the host groups the host belongs to
    are stored as keywords.

    Returns:
        ``collections.OrderedDict`` of ``NodeInfo`` objects, or ``None``
        when the qhost output could not be parsed.
    """
    hostgroups = self._get_hostgroups()
    dom = self._parse_qhost_xml()
    if dom is None:
        return None

    nodeinfolist = collections.OrderedDict()
    for h in dom.getElementsByTagName("host"):
        hostname = h.getAttribute("name")
        # ignore the generic host "global"
        if hostname == "global":
            continue

        keywords = {}
        node_queues = []
        memory_total = 0
        memory_used = 0

        # get the host values to get the information
        powered_on = False
        for hv in h.getElementsByTagName("hostvalue"):
            valuename = hv.getAttribute("name")
            if valuename == "load_avg":
                # If the load_avg is defined, the node is considered to be on
                # TODO: Try to improve this
                if hv.firstChild.nodeValue != "-":
                    powered_on = True
            elif valuename == "mem_total":
                if hv.firstChild.nodeValue != "-":
                    memory_total = self._translate_mem_value(
                        hv.firstChild.nodeValue)
            elif valuename == "mem_used":
                if hv.firstChild.nodeValue != "-":
                    memory_used = self._translate_mem_value(
                        hv.firstChild.nodeValue)

        used_slots = 0
        total_slots = 0
        # Get the info about the queues
        for q in h.getElementsByTagName("queue"):
            queue_name = q.getAttribute("name")
            node_queues.append(TypedClass.auto(str(queue_name)))

            # Get the queue values
            queue_used_slots = 0
            queue_total_slots = 0
            queue_state = None
            for qv in q.getElementsByTagName("queuevalue"):
                queuevaluename = qv.getAttribute("name")
                if queuevaluename == "slots_used":
                    queue_used_slots = int(qv.firstChild.nodeValue)
                if queuevaluename == "slots":
                    queue_total_slots = int(qv.firstChild.nodeValue)
                if queuevaluename == "state_string":
                    if qv.firstChild is not None:
                        queue_state = qv.firstChild.nodeValue

            lowered = queue_state.lower() if queue_state is not None else ""
            # if some of the queues are in "Alarm Unknown" state the node is
            # down
            if 'au' in lowered:
                powered_on = False
            # This slots are disabled/suspended
            if 'd' in lowered or 's' in lowered:
                _LOGGER.debug(queue_name + "@" + hostname + " is in " +
                              queue_state + " state. Ignoring this slots")
            else:
                used_slots += queue_used_slots
                total_slots += queue_total_slots

        keywords['hostname'] = TypedClass.auto(hostname)
        if node_queues:
            keywords['queues'] = TypedList(node_queues)

        # items() instead of iteritems(): the latter does not exist on
        # Python 3 dicts.
        node_hgs = [TypedClass.auto(str(hg))
                    for hg, nodelist in hostgroups.items()
                    if hostname in nodelist]
        keywords['hostgroups'] = TypedList(node_hgs)

        free_slots = total_slots - used_slots
        if not powered_on:
            state = NodeInfo.OFF
        elif free_slots > 0:
            state = NodeInfo.IDLE
        else:
            state = NodeInfo.USED

        memory_free = -1
        if memory_total != -1:
            memory_free = memory_total - memory_used

        nodeinfolist[hostname] = NodeInfo(hostname, total_slots, free_slots,
                                          memory_total, memory_free, keywords)
        nodeinfolist[hostname].state = state

    return nodeinfolist
def _get_nodeinfolist(self, server_nodes_info):
    """Build the CLUES node list from the Nomad servers and the nodes file.

    ``self._nodes_info_file`` is read as a JSON list of node descriptions
    (``name`` plus optional ``cpu``, ``memory``, ``keywords`` and
    ``queues``) that provide the default values of every node.  The clients
    of each server in ``server_nodes_info`` that appear in that file are
    converted with ``self._get_NodeInfo``; nodes of the file that no server
    reported are added with their default values.

    Args:
        server_nodes_info: iterable of server nodes to query.

    Returns:
        ``collections.OrderedDict``: node name -> ``NodeInfo``.
    """
    nodeinfolist = collections.OrderedDict()
    default_node_info = collections.OrderedDict()

    # DEFAULT NODE INFO
    try:
        # 'with' closes the file as soon as it has been parsed.
        with open(self._nodes_info_file, 'r') as nodes_file:
            vnodes = json.load(nodes_file)
        for vnode in vnodes:
            defaults = {
                'name': vnode["name"],
                'state': NodeInfo.OFF,
                'keywords': {},
            }

            defaults['cpus'] = float(self._default_cpu_node)
            if "cpu" in vnode:
                defaults['cpus'] = int(vnode["cpu"])

            defaults['memory'] = _get_memory_in_bytes(
                self._default_memory_node)
            if "memory" in vnode:
                defaults['memory'] = _get_memory_in_bytes(vnode["memory"])

            if "keywords" in vnode:
                # Comma separated list of key=value pairs
                for keypair in vnode["keywords"].split(','):
                    parts = keypair.split('=')
                    defaults['keywords'][parts[0].strip()] = TypedClass(
                        parts[1].strip(), TypedClass.STRING)

            if "queues" in vnode:
                queues = vnode["queues"].split(",")
                if queues:
                    defaults['keywords']['queues'] = TypedList(
                        [TypedClass.auto(q) for q in queues])
            else:
                # All queues to the node
                defaults['keywords']['queues'] = TypedList(
                    [TypedClass.auto(q) for q in self._queues[:]])

            default_node_info[defaults['name']] = defaults
    except Exception as ex:
        _LOGGER.error("Error processing file %s: %s" %
                      (self._nodes_info_file, str(ex)))

    clients_by_server = {}
    for server_node in server_nodes_info:
        # Obtain ID, Name, Status, NodeClass and if the Client is running
        # some job
        clients_by_server[server_node] = self._get_Clients_by_Server(
            server_node)

        # Obtain Resources and Queues
        for client_id in clients_by_server[server_node]:
            info_client = clients_by_server[server_node][client_id]
            if info_client['state'] in [NodeInfo.IDLE, NodeInfo.USED]:
                # Client is ON: obtain Client node address for checking
                # used resources
                info_client['resources'] = self._get_Client_resources(
                    server_node, client_id)

            client_name = info_client['name']
            if client_name in default_node_info:
                # Valid node for CLUES and IM
                nodeinfolist[client_name] = self._get_NodeInfo(
                    info_client, default_node_info[client_name])
            else:
                _LOGGER.warning(
                    "Nomad Client with name '%s' founded using Nomad Server API but not exists this node in the configuration file %s"
                    % (client_name, self._nodes_info_file))

    # Add nodes from nomad_info file to the list
    for namenode, node_info in default_node_info.items():
        if namenode not in nodeinfolist:
            nodeinfolist[namenode] = NodeInfo(namenode, node_info['cpus'],
                                              node_info['cpus'],
                                              node_info['memory'],
                                              node_info['memory'],
                                              node_info['keywords'])
            nodeinfolist[namenode].state = node_info['state']

    # Print all nodes in log with keywords
    for value in nodeinfolist.values():
        pairs = "".join(key2 + ":" + str(value.keywords[key2]) + ","
                        for key2 in value.keywords)
        string = "%s + keywords={ " % (str(value)) + pairs
        # Drop the last character (trailing comma) before closing the brace.
        _LOGGER.debug(string[:-1] + "}")

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the Kubernetes nodes plus the configured "virtual" nodes.

    Nodes are requested from ``self._nodes_api_url_path``; a node carrying a
    NoSchedule, PreferNoSchedule or NoExecute taint is skipped.  A node that
    is not "Ready" is reported as OFF; a ready node is USED when it runs
    pods other than system pods and IDLE otherwise.  The entries of
    ``/etc/clues2/kubernetes_vnodes.info`` that were not returned by the API
    are then added as OFF nodes.

    Returns:
        ``collections.OrderedDict``: node name -> ``NodeInfo``.
    """
    nodeinfolist = collections.OrderedDict()

    nodes_data = self._create_request('GET', self._nodes_api_url_path,
                                      self.auth_data)
    if nodes_data:
        pods_data = self._create_request('GET', self._pods_api_url_path,
                                         self.auth_data)
        if not pods_data:
            _LOGGER.error(
                "Error getting Kubernetes pod list. Node usage will not be obtained."
            )

        for node in nodes_data["items"]:
            name = node["metadata"]["name"]
            allocatable = node["status"]["allocatable"]
            memory_total = self._get_memory_in_bytes(allocatable["memory"])
            slots_total = int(allocatable["cpu"])
            pods_total = int(allocatable["pods"])

            # Get Taints
            skip_node = False
            if 'taints' in node["spec"] and node["spec"]['taints']:
                for taint in node["spec"]['taints']:
                    if taint['effect'] in [
                            "NoSchedule", "PreferNoSchedule", "NoExecute"
                    ]:
                        skip_node = True
                        _LOGGER.debug(
                            "Node %s is tainted with %s, skiping." %
                            (name, taint['effect']))
            if skip_node:
                continue

            used_mem, used_cpus, used_pods, system_pods = \
                self._get_node_used_resources(name, pods_data)
            memory_free = memory_total - used_mem
            slots_free = slots_total - used_cpus
            pods_free = pods_total - used_pods

            is_ready = True
            for conditions in node["status"]["conditions"]:
                if (conditions['type'] == "Ready"
                        and conditions['status'] != "True"):
                    is_ready = False

            keywords = {
                'pods_free': TypedNumber(pods_free),
                'nodeName': TypedClass(name, TypedClass.STRING)
            }
            # Add labels as keywords
            for key, value in node["metadata"]["labels"].items():
                keywords[key] = TypedClass(value, TypedClass.STRING)

            nodeinfolist[name] = NodeInfo(name, slots_total, slots_free,
                                          memory_total, memory_free, keywords)
            if not is_ready:
                nodeinfolist[name].state = NodeInfo.OFF
            elif (used_pods - system_pods) > 0:
                nodeinfolist[name].state = NodeInfo.USED
            else:
                nodeinfolist[name].state = NodeInfo.IDLE
    else:
        _LOGGER.error("Error getting Kubernetes node list.")

    # Add the "virtual" nodes
    try:
        # 'with' closes the file as soon as it has been parsed.
        with open('/etc/clues2/kubernetes_vnodes.info', 'r') as vnodes_file:
            vnodes = json.load(vnodes_file)
        for vnode in vnodes:
            name = vnode["name"]
            if name in nodeinfolist:
                continue
            keywords = {
                'pods_free': TypedNumber(self._node_pods),
                'nodeName': TypedClass(name, TypedClass.STRING)
            }

            cpus = self._node_slots
            if "cpu" in vnode:
                cpus = int(vnode["cpu"])

            memory = self._node_memory
            if "memory" in vnode:
                memory = self._get_memory_in_bytes(vnode["memory"])

            if "queues" in vnode:
                queues = vnode["queues"].split(",")
                if queues:
                    keywords['queues'] = TypedList(
                        [TypedClass.auto(q) for q in queues])

            if "keywords" in vnode:
                # Comma separated list of key=value pairs
                for keypair in vnode["keywords"].split(','):
                    parts = keypair.split('=')
                    keywords[parts[0].strip()] = TypedClass(
                        parts[1].strip(), TypedClass.STRING)

            nodeinfolist[name] = NodeInfo(name, cpus, cpus, memory, memory,
                                          keywords)
            nodeinfolist[name].state = NodeInfo.OFF
    except Exception as ex:
        _LOGGER.error(
            "Error processing file /etc/clues2/kubernetes_vnodes.info: %s" %
            str(ex))

    return nodeinfolist
def get_nodeinfolist(self):
    """Return the SGE execution hosts as a dict name -> ``NodeInfo``.

    The hosts come from the DOM returned by ``self._parse_qhost_xml()``;
    the generic host "global" is ignored.  For every host the slots of its
    queues are added up (queues whose state string contains 'd' or 's' are
    skipped), and the queue names and the host groups the host belongs to
    are stored as keywords.

    Returns:
        dict of ``NodeInfo`` objects, or ``None`` when the qhost output
        could not be parsed.
    """
    hostgroups = self._get_hostgroups()
    dom = self._parse_qhost_xml()
    if dom is None:
        return None

    nodeinfolist = {}
    for h in dom.getElementsByTagName("host"):
        hostname = h.getAttribute("name")
        # ignore the generic host "global"
        if hostname == "global":
            continue

        keywords = {}
        node_queues = []
        memory_total = 0
        memory_used = 0

        # get the host values to get the information
        powered_on = False
        for hv in h.getElementsByTagName("hostvalue"):
            valuename = hv.getAttribute("name")
            if valuename == "load_avg":
                # If the load_avg is defined, the node is considered to be on
                # TODO: Try to improve this
                if hv.firstChild.nodeValue != "-":
                    powered_on = True
            elif valuename == "mem_total":
                if hv.firstChild.nodeValue != "-":
                    memory_total = self._translate_mem_value(
                        hv.firstChild.nodeValue)
            elif valuename == "mem_used":
                if hv.firstChild.nodeValue != "-":
                    memory_used = self._translate_mem_value(
                        hv.firstChild.nodeValue)

        used_slots = 0
        total_slots = 0
        # Get the info about the queues
        for q in h.getElementsByTagName("queue"):
            queue_name = q.getAttribute("name")
            node_queues.append(TypedClass.auto(str(queue_name)))

            # Get the queue values
            queue_used_slots = 0
            queue_total_slots = 0
            queue_state = None
            for qv in q.getElementsByTagName("queuevalue"):
                queuevaluename = qv.getAttribute("name")
                if queuevaluename == "slots_used":
                    queue_used_slots = int(qv.firstChild.nodeValue)
                if queuevaluename == "slots":
                    queue_total_slots = int(qv.firstChild.nodeValue)
                if queuevaluename == "state_string":
                    if qv.firstChild is not None:
                        queue_state = qv.firstChild.nodeValue

            lowered = queue_state.lower() if queue_state is not None else ""
            # if some of the queues are in "Alarm Unknown" state the node is
            # down
            if 'au' in lowered:
                powered_on = False
            # This slots are disabled/suspended
            if 'd' in lowered or 's' in lowered:
                _LOGGER.debug(queue_name + "@" + hostname + " is in " +
                              queue_state + " state. Ignoring this slots")
            else:
                used_slots += queue_used_slots
                total_slots += queue_total_slots

        keywords['hostname'] = TypedClass.auto(hostname)
        if node_queues:
            keywords['queues'] = TypedList(node_queues)

        # items() instead of iteritems(): the latter does not exist on
        # Python 3 dicts.
        node_hgs = [TypedClass.auto(str(hg))
                    for hg, nodelist in hostgroups.items()
                    if hostname in nodelist]
        keywords['hostgroups'] = TypedList(node_hgs)

        free_slots = total_slots - used_slots
        if not powered_on:
            state = NodeInfo.OFF
        elif free_slots > 0:
            state = NodeInfo.IDLE
        else:
            state = NodeInfo.USED

        memory_free = -1
        if memory_total != -1:
            memory_free = memory_total - memory_used

        nodeinfolist[hostname] = NodeInfo(hostname, total_slots, free_slots,
                                          memory_total, memory_free, keywords)
        nodeinfolist[hostname].state = state

    return nodeinfolist