def test_node_scontrol(): """Node: Compare scontrol values to PySlurm values.""" all_node_ids = pyslurm.node().ids() test_node = all_node_ids[-1] test_node_info = pyslurm.node().find_id(test_node) assert_equals(test_node, test_node_info["name"]) sctl_dict = scontrol_show('node', test_node) assert_equals(test_node_info["alloc_mem"], int(sctl_dict["AllocMem"])) assert_equals(test_node_info["boards"], int(sctl_dict["Boards"])) assert_equals(test_node_info["alloc_cpus"], int(sctl_dict["CPUAlloc"])) assert_equals(test_node_info["cpus"], int(sctl_dict["CPUTot"])) assert_equals(test_node_info["cores"], int(sctl_dict["CoresPerSocket"])) assert_equals(test_node_info["energy"]["current_watts"], int(sctl_dict["CurrentWatts"])) assert_equals(test_node_info["name"], sctl_dict["NodeName"]) assert_equals(test_node_info["node_addr"], sctl_dict["NodeAddr"]) assert_equals(test_node_info["node_hostname"], sctl_dict["NodeHostName"]) assert_equals(test_node_info["partitions"], sctl_dict["Partitions"].split(",")) assert_equals(test_node_info["real_memory"], int(sctl_dict["RealMemory"])) assert_equals(test_node_info["sockets"], int(sctl_dict["Sockets"])) assert_equals(test_node_info["state"], sctl_dict["State"]) assert_equals(test_node_info["threads"], int(sctl_dict["ThreadsPerCore"])) assert_equals(test_node_info["tmp_disk"], int(sctl_dict["TmpDisk"])) assert_equals(test_node_info["weight"], int(sctl_dict["Weight"]))
def test_node_scontrol(): """Node: Compare scontrol values to PySlurm values.""" all_node_ids = pyslurm.node().ids() test_node = all_node_ids[-1] test_node_info = pyslurm.node().find_id(test_node) assert_equals(test_node, test_node_info["name"]) sctl = subprocess.Popen(["scontrol", "-d", "show", "node", test_node], stdout=subprocess.PIPE).communicate() sctl_stdout = sctl[0].strip().decode("UTF-8").split() sctl_dict = dict((value.split("=")[0], value.split("=")[1]) for value in sctl_stdout) assert_equals(test_node_info["alloc_mem"], int(sctl_dict["AllocMem"])) assert_equals(test_node_info["boards"], int(sctl_dict["Boards"])) assert_equals(test_node_info["alloc_cpus"], int(sctl_dict["CPUAlloc"])) assert_equals(test_node_info["err_cpus"], int(sctl_dict["CPUErr"])) assert_equals(test_node_info["cpus"], int(sctl_dict["CPUTot"])) assert_equals(test_node_info["energy"]["consumed_energy"], int(sctl_dict["ConsumedJoules"])) assert_equals(test_node_info["cores"], int(sctl_dict["CoresPerSocket"])) assert_equals(test_node_info["energy"]["current_watts"], int(sctl_dict["CurrentWatts"])) assert_equals(test_node_info["name"], sctl_dict["NodeName"]) assert_equals(test_node_info["node_addr"], sctl_dict["NodeAddr"]) assert_equals(test_node_info["node_hostname"], sctl_dict["NodeHostName"]) assert_equals(test_node_info["partitions"], sctl_dict["Partitions"].split(",")) assert_equals(test_node_info["real_memory"], int(sctl_dict["RealMemory"])) assert_equals(test_node_info["sockets"], int(sctl_dict["Sockets"])) assert_equals(test_node_info["state"], sctl_dict["State"]) assert_equals(test_node_info["threads"], int(sctl_dict["ThreadsPerCore"])) assert_equals(test_node_info["tmp_disk"], int(sctl_dict["TmpDisk"])) assert_equals(test_node_info["weight"], int(sctl_dict["Weight"]))
def test_node_scontrol(): """Node: Compare scontrol values to PySlurm values.""" all_node_ids = pyslurm.node().ids() test_node = all_node_ids[0] test_node_info = pyslurm.node().find_id(test_node) assert_equals(test_node, test_node_info["name"]) sctl = subprocess.Popen(["scontrol", "-d", "show", "node", test_node], stdout=subprocess.PIPE).communicate() sctl_stdout = sctl[0].strip().decode("UTF-8").split() sctl_dict = dict((value.split("=")[0], value.split("=")[1]) for value in sctl_stdout) assert_equals(test_node_info["alloc_mem"], int(sctl_dict["AllocMem"])) assert_equals(test_node_info["boards"], int(sctl_dict["Boards"])) assert_equals(test_node_info["alloc_cpus"], int(sctl_dict["CPUAlloc"])) assert_equals(test_node_info["err_cpus"], int(sctl_dict["CPUErr"])) assert_equals(test_node_info["cpus"], int(sctl_dict["CPUTot"])) assert_equals(test_node_info["energy"]["consumed_energy"], int(sctl_dict["ConsumedJoules"])) assert_equals(test_node_info["cores"], int(sctl_dict["CoresPerSocket"])) assert_equals(test_node_info["energy"]["current_watts"], int(sctl_dict["CurrentWatts"])) assert_equals(test_node_info["name"], sctl_dict["NodeName"]) assert_equals(test_node_info["node_addr"], sctl_dict["NodeAddr"]) assert_equals(test_node_info["node_hostname"], sctl_dict["NodeHostName"]) assert_equals(test_node_info["real_memory"], int(sctl_dict["RealMemory"])) assert_equals(test_node_info["sockets"], int(sctl_dict["Sockets"])) assert_equals(test_node_info["state"], sctl_dict["State"]) assert_equals(test_node_info["threads"], int(sctl_dict["ThreadsPerCore"])) assert_equals(test_node_info["tmp_disk"], int(sctl_dict["TmpDisk"])) assert_equals(test_node_info["weight"], int(sctl_dict["Weight"]))
def test_node_update(): """Node: Test node().update().""" node_test_before = pyslurm.node().find_id("c10") assert_equals(node_test_before["state"], "IDLE") node_test_update = { "node_names": "c10", "node_state": pyslurm.NODE_STATE_DRAIN, "reason": "unit testing" } rc = pyslurm.node().update(node_test_update) assert_equals(rc, 0) node_test_during = pyslurm.node().find_id("c10") assert_equals(node_test_during["state"], "IDLE+DRAIN") node_test_update = { "node_names": "c10", "node_state": pyslurm.NODE_RESUME } rc = pyslurm.node().update(node_test_update) assert_equals(rc, 0) node_test_after = pyslurm.node().find_id("c10") assert_equals(node_test_after["state"], "IDLE")
def is_node(n):
    try:
        pyslurm.node().find_id(n)
    except IndexError:
        return False
    return True
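# Illustrative usage of is_node() (node names are assumptions): pyslurm's
# find_id() raises IndexError for an unknown node, which is what this wraps.
assert is_node("c10") is True
assert is_node("no-such-node") is False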
def test_gres_used_parser(): """Node: Test node().parse_gres().""" assert_equals( pyslurm.node().parse_gres("gpu:p100:2(IDX:1,3),lscratch:0"), ["gpu:p100:2(IDX:1,3)", "lscratch:0"] ) assert_equals( pyslurm.node().parse_gres("gpu:0,hbm:0"), ["gpu:0", "hbm:0"] ) assert_equals( pyslurm.node().parse_gres("gpu:p100:0(IDX:N/A),hbm:0"), ["gpu:p100:0(IDX:N/A)", "hbm:0"] ) assert_equals( pyslurm.node().parse_gres("gpu:p100:1(IDX:0),hbm:0"), ["gpu:p100:1(IDX:0)", "hbm:0"] ) assert_equals( pyslurm.node().parse_gres("gpu:p100:1(IDX:1),hbm:0"), ["gpu:p100:1(IDX:1)", "hbm:0"] ) assert_equals( pyslurm.node().parse_gres("gpu:p100:2(IDX:0-1),hbm:0"), ["gpu:p100:2(IDX:0-1)", "hbm:0"] ) assert_equals( pyslurm.node().parse_gres("hbm:0"), ["hbm:0"] ) assert_equals( pyslurm.node().parse_gres("lscratch:0,hbm:0"), ["lscratch:0", "hbm:0"] )
def get_available_nodes(self, slices_size=1):
    """
    Return a list of currently available (IDLE) nodes, grouped into slices
    of slices_size, e.g. for slices of size 4: ['cn[100-103]', 'cn[109,150-152]']

    :param slices_size: slice size
    :type slices_size: int
    :returns: list of node set strings
    :rtype: list
    """
    node_list = []
    a = pyslurm.node()
    node_dict = a.get()
    node_count = 0
    nodeset = NodeSet()
    if len(node_dict) > 0:
        for key, value in sorted(node_dict.iteritems()):
            if value['state'] == 'IDLE':
                nodeset.update(key)
                node_count += 1
                if node_count == slices_size:
                    node_list.append(str(nodeset))
                    nodeset = NodeSet()
                    node_count = 0
    return node_list
def collect(self):
    # Metric declarations
    NODES_CPUS = GaugeMetricFamily(
        'slurm_nodes_cpus',
        'Numbers of CPUs on nodes in the cluster grouped by {}'.format(', '.join(self.labels)),
        labels=self.labels)
    NODES_CPUS_ALLOC = GaugeMetricFamily(
        'slurm_nodes_cpus_alloc',
        'Numbers of CPUs allocated on nodes in the cluster grouped by {}'.format(', '.join(self.labels)),
        labels=self.labels)
    NODES_CPU_LOAD = GaugeMetricFamily(
        'slurm_nodes_cpu_load',
        'CPU loads on nodes in the cluster grouped by {}'.format(', '.join(self.labels)),
        labels=self.labels)
    NODES_MEM_TOTAL = GaugeMetricFamily(
        'slurm_nodes_mem_total',
        'Total amounts of memory available on nodes in the cluster grouped by {}'.format(', '.join(self.labels)),
        labels=self.labels, unit='bytes')
    NODES_MEM_FREE = GaugeMetricFamily(
        'slurm_nodes_mem_free',
        'Amounts of free memory on nodes in the cluster grouped by {}'.format(', '.join(self.labels)),
        labels=self.labels, unit='bytes')
    NODES_MEM_ALLOC = GaugeMetricFamily(
        'slurm_nodes_mem_alloc',
        'Amounts of memory allocated on nodes in the cluster grouped by {}'.format(', '.join(self.labels)),
        labels=self.labels, unit='bytes')

    # Load node info from Slurm
    nodes = pyslurm.node().get()
    cluster = pyslurm.config().get()['cluster_name']

    include_null = ('METRIC_VALUE_NULL' in os.environ
                    and os.environ['METRIC_VALUE_NULL'].lower() == 'include')

    for node in nodes.keys():
        for partition in nodes[node]['partitions']:
            labels_ = [cluster, partition] + [str(nodes[node][prop]) for prop in self.props]
            if include_null:
                NODES_CPUS.add_metric(labels_, nodes[node]['cpus'])
                NODES_CPUS_ALLOC.add_metric(labels_, nodes[node]['alloc_cpus'])
                NODES_CPU_LOAD.add_metric(labels_, nodes[node]['cpu_load'] / 100.0)
                NODES_MEM_TOTAL.add_metric(labels_, nodes[node]['real_memory'] * 1000**2)  # MB to bytes
                NODES_MEM_ALLOC.add_metric(labels_, nodes[node]['alloc_mem'] * 1000**2)  # MB to bytes
                NODES_MEM_FREE.add_metric(labels_, nodes[node]['free_mem'] * 2**20)  # MiB to bytes
            else:
                # Only emit samples whose source value is present and non-zero.
                if nodes[node]['cpus']:
                    NODES_CPUS.add_metric(labels_, nodes[node]['cpus'])
                if nodes[node]['alloc_cpus']:
                    NODES_CPUS_ALLOC.add_metric(labels_, nodes[node]['alloc_cpus'])
                if nodes[node]['cpu_load']:
                    NODES_CPU_LOAD.add_metric(labels_, nodes[node]['cpu_load'] / 100.0)
                if nodes[node]['real_memory']:
                    NODES_MEM_TOTAL.add_metric(labels_, nodes[node]['real_memory'] * 1000**2)  # MB to bytes
                if nodes[node]['alloc_mem']:
                    NODES_MEM_ALLOC.add_metric(labels_, nodes[node]['alloc_mem'] * 1000**2)  # MB to bytes
                if nodes[node]['free_mem']:
                    NODES_MEM_FREE.add_metric(labels_, nodes[node]['free_mem'] * 2**20)  # MiB to bytes

    yield NODES_CPUS
    yield NODES_CPUS_ALLOC
    yield NODES_CPU_LOAD
    yield NODES_MEM_TOTAL
    yield NODES_MEM_FREE
    yield NODES_MEM_ALLOC
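# A minimal wiring sketch for the collector above, assuming it lives in a
# class named NodeCollector (the class name and port are illustrative, not
# from the original source):
import time
from prometheus_client import start_http_server
from prometheus_client.core import REGISTRY

if __name__ == '__main__':
    REGISTRY.register(NodeCollector())
    start_http_server(9100)  # exposes metrics at http://localhost:9100/metrics
    while True:
        time.sleep(60)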
def retrieveInfluxPoints(self):
    global ignore_count
    #self.pyslurmQueryTime = datetime.now().timestamp()
    #pdb.set_trace()
    rp_points = DDict(list)  # {'autogen': [point, ...]}
    self.nodeData = pyslurm.node().get()

    #autogen.cpu_info: time, hostname, total_socket, total_thread, total_cores, cpu_model, is_vm
    #hostinfo_msgs = self.consume_hostinfo_msgs()
    #hostinfo = self.process_list(hostinfo_msgs, self.hostinfo2point)
    #rp_points['autogen'] = hostinfo
    #logger.debug("MQTTReader generate {} hostinfo points".format(len(hostinfo)))

    #autogen.cpu_load: time, hostname, proc_*, load_*, cpu_*, mem_*, net_*, disk_*
    hostperf_msgs = self.consume_hostperf_msgs()
    hostperf = self.process_list(hostperf_msgs, self.hostperf2point)
    rp_points['autogen'] = hostperf
    logger.debug("MQTTReader generate {} hostperf points".format(len(hostperf)))

    #autogen.cpu_proc_info, job_proc_mon
    hostproc_msgs = self.consume_hostproc_msgs()
    ignore_count = 0
    self.process_list_add(rp_points, hostproc_msgs, self.hostproc2point)
    logger.debug("MQTTReader generate {} {} points after adding hostproc, ignore {} points".format(
        len(rp_points['autogen']), len(rp_points['one_month']), ignore_count))

    #points = hostinfo + hostperf + hostproc
    if self.cpu_up_ts_count > 0:
        with open(self.TS_FNAME, 'w') as f:
            json.dump(self.cpu_up_ts, f)
        self.cpu_up_ts_count = 0

    self.node2tsPidsCache.writeFile()  # save intermediate data structure, node2

    return rp_points
def test_node_scontrol():
    all_node_ids = pyslurm.node().ids()
    test_node = all_node_ids[0]
    assert type(test_node) is StringType

    test_node_info = pyslurm.node().find_id(test_node)
    assert test_node == test_node_info["name"]

    scontrol = subprocess.Popen(
        ["scontrol", "-d", "show", "node", test_node],
        stdout=subprocess.PIPE,
    ).communicate()
    scontrol_stdout = scontrol[0].strip().split()
    scontrol_dict = {
        value.split("=")[0]: value.split("=")[1]
        for value in scontrol_stdout
    }

    assert test_node_info["alloc_mem"] == int(scontrol_dict["AllocMem"])
    assert test_node_info["arch"] == scontrol_dict["Arch"]
    assert test_node_info["boards"] == int(scontrol_dict["Boards"])
    #BootTime=2016-01-12T23:56:26
    assert test_node_info["alloc_cpus"] == int(scontrol_dict["CPUAlloc"])
    assert test_node_info["err_cpus"] == int(scontrol_dict["CPUErr"])
    assert test_node_info["cpus"] == int(scontrol_dict["CPUTot"])
    #CPULoad=0.01
    #CapWatts=n/a
    assert test_node_info["energy"]["consumed_energy"] == int(scontrol_dict["ConsumedJoules"])
    assert test_node_info["cores"] == int(scontrol_dict["CoresPerSocket"])
    assert test_node_info["energy"]["current_watts"] == int(scontrol_dict["CurrentWatts"])
    # TODO: skipping some
    assert test_node_info["features"] == scontrol_dict["Features"]
    assert test_node_info["free_mem"] == int(scontrol_dict["FreeMem"])
    # TODO: skipping some
    assert test_node_info["name"] == scontrol_dict["NodeName"]
    assert test_node_info["node_addr"] == scontrol_dict["NodeAddr"]
    assert test_node_info["node_hostname"] == scontrol_dict["NodeHostName"]
    assert test_node_info["os"] == scontrol_dict["OS"]
    assert test_node_info["real_memory"] == int(scontrol_dict["RealMemory"])
    assert test_node_info["sockets"] == int(scontrol_dict["Sockets"])
    assert test_node_info["state"] == scontrol_dict["State"]
    assert test_node_info["threads"] == int(scontrol_dict["ThreadsPerCore"])
    assert test_node_info["tmp_disk"] == int(scontrol_dict["TmpDisk"])
    assert test_node_info["version"] == scontrol_dict["Version"]
    assert test_node_info["weight"] == int(scontrol_dict["Weight"])
def __init__(self):
    self.noFarm = ['slurmweb', 'huematrix']
    self.nodeInformations = {}
    self.__hosts = hostlist()
    self.__node = node()
    self.SearchNodes()
    self.nodes = sorted(self.nodeInformations.keys())
    self.state = ''
    self.node = ''
def get_cluster():
    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    return jsonify(cluster)
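# Hedged wiring sketch: get_cluster() above is a Flask view (it returns
# jsonify(...)); the app object and the '/cluster' URL below are illustrative
# assumptions, not part of the original source.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/cluster', view_func=get_cluster)
# GET /cluster -> {"name": "...", "nodes": <count>, "cores": <count>}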
def getJobAllocGPU(job, node_dict=None):
    if not node_dict:
        node_dict = pyslurm.node().get()
    if not job['cpus_allocated']:
        return None
    node_list = [node_dict[node] for node in job['cpus_allocated']]
    gpus_allocated = MyTool.getGPUAlloc_layout(node_list, job['gres_detail'])
    return gpus_allocated
def get(id, limit=100):
    node_o = pyslurm.node()
    try:
        this_node = node_o.get_node(str(id))
    except Exception as e:
        resp = {"code": 500, "message": str(e)}
        return resp, 500
    if len(this_node.values()) < 1:
        return NoContent, 204
    return dict(this_node[str(id)]), 200
def update_nodes_from_tests(prefix, test_type, fail_nodes=None, error_nodes=None, down=False):
    fail_nodes_path = os.path.join(prefix, test_type, 'fail_nodes')
    try:
        fail_nodes_ = set(bench.util.read_node_list(fail_nodes_path))
    except IOError as ex:
        logger.info('unable to read {0}'.format(fail_nodes_path))
        logger.debug(ex, exc_info=True)
        fail_nodes_ = set()

    error_nodes_path = os.path.join(prefix, test_type, 'error_nodes')
    try:
        error_nodes_ = set(bench.util.read_node_list(error_nodes_path))
    except IOError as ex:
        logger.info('unable to read {0}'.format(error_nodes_path))
        logger.debug(ex, exc_info=True)
        error_nodes_ = set()

    # by default, reserve fail_nodes and error_nodes
    if not (fail_nodes or error_nodes):
        nodes_to_update = fail_nodes_ | error_nodes_
    else:
        nodes_to_update = set()
        if fail_nodes:
            nodes_to_update |= fail_nodes_
        if error_nodes:
            nodes_to_update |= error_nodes_

    if down:
        node_state = pyslurm.NODE_STATE_DOWN
        node_state_s = 'DOWN'
    else:
        node_state = pyslurm.NODE_STATE_DRAIN
        node_state_s = 'DRAINED'

    pyslurm_node = pyslurm.node()
    for node in sorted(nodes_to_update):
        current_node_state = pyslurm_node.find_id(node)['node_state']
        if current_node_state.startswith(node_state_s):
            continue
        node_update = {
            'node_names': node,
            'node_state': node_state,
            'reason': 'bench:{0}'.format(test_type),
        }
        rc = pyslurm_node.update(node_update)
        if rc != 0:
            logger.error('unable to update node {0}: {1}'.format(
                node, pyslurm.slurm_strerror(pyslurm.slurm_get_errno())))
        else:
            logger.info('{0} set to {1}'.format(node, node_state_s))
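# Hypothetical invocations (the prefix path and test name are illustrative):
# drain every node listed in <prefix>/<test_type>/{fail,error}_nodes ...
update_nodes_from_tests('./bench-results', 'linpack')
# ... or act only on the failed nodes, and mark them DOWN instead of DRAIN.
update_nodes_from_tests('./bench-results', 'linpack', fail_nodes=True, down=True)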
def get_core_counts():
    data = dict()
    n = pyslurm.node()
    nodes = n.get()
    data['total'] = [0, 0, 0]
    for id, node in nodes.iteritems():
        data[id] = [
            node['alloc_cpus'],
            node['cpus'] - node['alloc_cpus'],
            node['cpus'],
        ]
        data['total'] = [x + y for x, y in zip(data['total'], data[id])]
    return data
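# Shape of the mapping returned by get_core_counts(), with illustrative
# values; each entry is [allocated, idle, total] cores:
#   {'total': [12, 52, 64],
#    'c1':    [4, 28, 32],
#    'c2':    [8, 24, 32]}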
def run(self):
    #pdb.set_trace()
    while True:
        curr_ts = time.time()
        slurmJobs = pyslurm.job().get()
        slurmNodes = pyslurm.node().get()
        msgs = self.source.retrieveMsgs()
        if msgs:
            self.dealData(curr_ts, slurmJobs, slurmNodes, msgs)
        time.sleep(self.INTERVAL)
def pre_update(self):
    # Ganglia
    self.ganglia_data = ganglia.Stats(do_cpus=True).all
    # Slurm
    self.pyslurm_node = pyslurm.node().get()
    self.pyslurm_job = pyslurm.job().get()
    # Influx
    self.update_mem_data()
    self.prune_mem_max()
def sinfo():
    # Partition and node lists are required to compute the sinfo information
    partitions = pyslurm.partition().get()
    nodes = pyslurm.node().get()

    # Retrieve the state of each node
    nodes_state = dict(
        (node.lower(), attributes['state'].lower())
        for node, attributes in nodes.iteritems()
    )

    # For all partitions, retrieve the states of their nodes
    sinfo_data = {}
    for name, attr in partitions.iteritems():
        for node in list(NodeSet(attr['nodes'])):
            key = (name, nodes_state[node])
            if key not in sinfo_data.keys():
                sinfo_data[key] = []
            sinfo_data[key].append(node)

    # Prepare the response
    resp = []
    for k, nodes in sinfo_data.iteritems():
        name, state = k
        partition = partitions[name]
        avail = partition['state'].lower()
        min_nodes = partition['min_nodes']
        max_nodes = partition['max_nodes']
        total_nodes = partition['total_nodes']
        job_size = "{0}-{1}".format(min_nodes, max_nodes)
        job_size = job_size.replace('UNLIMITED', 'infinite')
        time_limit = partition['max_time_str'].replace('UNLIMITED', 'infinite')

        # Create the nodeset
        nodeset = NodeSet()
        map(nodeset.update, nodes)

        resp.append({
            'name': name,
            'avail': avail,
            'job_size': job_size,
            'time_limit': time_limit,
            'nodes': total_nodes,
            'state': state,
            'nodelist': str(nodeset),
        })

    # jsonify cannot serialize a top-level list, so serialize manually and
    # set the MIME type on the response object (Flask's make_response() does
    # not accept a mimetype keyword).
    response = make_response(json.dumps(resp))
    response.mimetype = 'application/json'
    return response
def sinfo():
    # Partition and node lists are required to compute the sinfo information
    partitions = get_from_cache(pyslurm.partition().get, 'get_partitions')
    nodes = get_from_cache(pyslurm.node().get, 'get_nodes')

    # Retrieve the state of each node
    nodes_state = dict(
        (node.lower(), attributes['state'].lower())
        for node, attributes in nodes.iteritems()
    )

    # For all partitions, retrieve the states of their nodes
    sinfo_data = {}
    for name, attr in partitions.iteritems():
        for node in list(NodeSet(attr['nodes'])):
            key = (name, nodes_state[node])
            if key not in sinfo_data.keys():
                sinfo_data[key] = []
            sinfo_data[key].append(node)

    # Prepare the response
    resp = []
    for k, nodes in sinfo_data.iteritems():
        name, state = k
        partition = partitions[name]
        avail = partition['state'].lower()
        min_nodes = partition['min_nodes']
        max_nodes = partition['max_nodes']
        total_nodes = partition['total_nodes']
        job_size = "{0}-{1}".format(min_nodes, max_nodes)
        job_size = job_size.replace('UNLIMITED', 'infinite')
        time_limit = partition['max_time_str'].replace('UNLIMITED', 'infinite')

        # Create the nodeset
        nodeset = NodeSet()
        map(nodeset.update, nodes)

        resp.append({
            'name': name,
            'avail': avail,
            'job_size': job_size,
            'time_limit': time_limit,
            'nodes': total_nodes,
            'state': state,
            'nodelist': str(nodeset),
        })

    # jsonify cannot serialize a top-level list, so serialize manually and
    # set the MIME type on the response object (Flask's make_response() does
    # not accept a mimetype keyword).
    response = make_response(json.dumps(resp))
    response.mimetype = 'application/json'
    return response
def get_cluster():
    if mocking:
        return mock('cluster.json')

    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    return cluster
def test_node_update(): """Node: Test node().update().""" time.sleep(3) test_node = pyslurm.node().ids()[-1] node_test_before = pyslurm.node().find_id(test_node) assert node_test_before["state"] == "IDLE" node_test_update = { "node_names": "c10", "node_state": pyslurm.NODE_STATE_DRAIN, "reason": "unit testing", } rc = pyslurm.node().update(node_test_update) assert rc == 0 node_test_during = pyslurm.node().find_id("c10") assert node_test_during["state"] == "IDLE+DRAIN" node_test_update = {"node_names": "c10", "node_state": pyslurm.NODE_RESUME} rc = pyslurm.node().update(node_test_update) assert rc == 0 node_test_after = pyslurm.node().find_id("c10") assert node_test_after["state"] == "IDLE"
def nodes(ids=None): """ Either return all nodes or a set of nodes from SLURM. """ nodes = pyslurm.node().get() if ids is None: return nodes else: nodes_dict = {} if not isinstance(ids, list): ids = [ids] for idx in ids: nodes_dict[idx] = nodes[idx] return nodes_dict
def search(limit=100):
    nodes = []
    node_o = pyslurm.node()
    try:
        all_nodes = node_o.get()
    except Exception as e:
        resp = {"code": 500, "message": str(e)}
        return resp, 500
    if len(all_nodes.values()) < 1:
        return NoContent, 204
    for node in all_nodes.values():
        nodes.append(node)
    return nodes[0:limit], 200
def _set_node_state(self, state, reason=None):
    """Set the node state to the provided argument."""
    node_dict = {
        'node_names': self.name,
        'node_state': state,
    }
    if reason:
        node_dict['reason'] = reason
    rc = pyslurm.node().update(node_dict)
    if rc == -1:
        return False
    return True
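# Hedged usage sketch, assuming `n` is an instance of the class that defines
# _set_node_state(). Slurm generally requires a reason when draining a node,
# while resuming does not.
n._set_node_state(pyslurm.NODE_STATE_DRAIN, reason='scheduled maintenance')
n._set_node_state(pyslurm.NODE_RESUME)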
def main(): """ Do some stuff, eventually printing output to stdout... """ # Parse command-line arguments arguments = parse_arguments() # Logging setup if arguments.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.WARNING) Nodes = pyslurm.node() node_dict = Nodes.get() Jobs = pyslurm.job() job_dict = Jobs.get() #print(job_dict) if len(node_dict) > 0 and len(job_dict) > 0: nt = get_nodetag(node_dict, job_dict, arguments) pc = get_pending(job_dict) if arguments.csv: print_csv(arguments.csv_header_suppress, nt, pc) else: js = get_aggregated_jobs(job_dict, arguments) print_usage(js) else: print "No Nodes and/or no Jobs found !" sys.exit() node_reservations = get_node_reservations() jobs = get_jobs(all_jobs=arguments.all_jobs) cred_totals, public_cores, public_nodes, public_nodes_free = get_counts( node_reservations, jobs) if arguments.free_cores: print_free_cores(cred_totals, public_cores) elif arguments.csv: print_csv(arguments.csv_header_suppress, cred_totals, public_cores, public_nodes, public_nodes_free) else: print_output(cred_totals, public_cores, public_nodes, public_nodes_free)
def main(): """ Do some stuff, eventually printing output to stdout... """ # Parse command-line arguments arguments = parse_arguments() # Logging setup if arguments.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.WARNING) Nodes = pyslurm.node() node_dict = Nodes.get() Jobs = pyslurm.job() job_dict = Jobs.get() #print(job_dict) if len(node_dict) > 0 and len(job_dict) > 0: nt = get_nodetag(node_dict, job_dict, arguments) pc = get_pending(job_dict) if arguments.csv: print_csv(arguments.csv_header_suppress, nt, pc) else: js=get_aggregated_jobs(job_dict, arguments) print_usage(js) else: print "No Nodes and/or no Jobs found !" sys.exit() node_reservations = get_node_reservations() jobs = get_jobs(all_jobs=arguments.all_jobs) cred_totals, public_cores, public_nodes, public_nodes_free = get_counts(node_reservations, jobs) if arguments.free_cores: print_free_cores(cred_totals, public_cores) elif arguments.csv: print_csv(arguments.csv_header_suppress, cred_totals, public_cores, public_nodes, public_nodes_free) else: print_output(cred_totals, public_cores, public_nodes, public_nodes_free)
def get_cluster():
    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']

    resp = jsonify({
        'authentication': {
            'enabled': auth_enabled,
            'guest': guests_allowed,
        },
        'data': cluster,
    })
    return resp
def getUserAllocGPU(uid, node_dict=None):
    if not node_dict:
        node_dict = pyslurm.node().get()
    rlt = {}
    rlt_jobs = []
    jobs = PyslurmQuery.getUserCurrJobs(uid)
    if jobs:
        for job in jobs:
            job_gpus = PyslurmQuery.getJobAllocGPU(job, node_dict)
            if job_gpus:  # job has gpu
                rlt_jobs.append(job)
                for node, gpu_ids in job_gpus.items():
                    if node in rlt:
                        rlt[node].extend(gpu_ids)
                    else:
                        rlt[node] = gpu_ids
    return rlt, rlt_jobs
def index_view(request):
    data = pyslurm.job().get()
    jobs = list(data.values())
    table = QueueTable(jobs, user_logged_in=get_current_user_id(request.user))
    table.order_by = '-submit_time'

    stats = {}
    nodes = pyslurm.node().get()
    stats["cpus_util"] = nodes["gpu-server"]["alloc_cpus"]    # FIXME: hardcoded node name
    stats["cpus_total"] = nodes["gpu-server"]["cpus"]         # FIXME
    stats["ram_util"] = nodes["gpu-server"]["alloc_mem"]      # FIXME
    stats["ram_total"] = nodes["gpu-server"]["real_memory"]   # FIXME
    stats["gpus_util"] = nodes["gpu-server"]["gres_used"]     # FIXME
    stats["gpus_total"] = nodes["gpu-server"]["gres"]         # FIXME

    return render(request, 'job_submission/job_queue.html',
                  {'table': table, 'stats': stats})
def run(self):
    #pdb.set_trace()
    logger.info("Start running PyslurmReader ...")
    while True:
        # pyslurm query
        ts = int(datetime.now().timestamp())
        job_dict = pyslurm.job().get()
        node_dict = pyslurm.node().get()
        part_dict = pyslurm.partition().get()
        qos_dict = pyslurm.qos().get()
        #res_dict = pyslurm.reservation().get()
        res_dict = {}  # TODO: pyslurm reservation coredump ERROR
        #js_dict = pyslurm.jobstep().get()

        # convert to points
        points = []
        for jid, job in job_dict.items():
            self.slurmJob2point(ts, job, points)
        finishJob = [jid for jid in self.sav_job_dict.keys() if jid not in job_dict.keys()]
        #logger.debug("PyslurmReader::run: Finish jobs {}".format(finishJob))
        for jid in finishJob:
            del self.sav_job_dict[jid]

        for node in node_dict.values():
            self.slurmNode2point(ts, node, points)

        if json.dumps(part_dict) != json.dumps(self.sav_part_dict):
            for pname, part in part_dict.items():
                self.slurmPartition2point(ts, pname, part, points)
            self.sav_part_dict = part_dict

        if json.dumps(qos_dict) != json.dumps(self.sav_qos_dict):
            for qname, qos in qos_dict.items():
                self.slurmQOS2point(ts, qname, qos, points)
            self.sav_qos_dict = qos_dict

        if res_dict and (json.dumps(res_dict) != json.dumps(self.sav_res_dict)):
            for rname, res in res_dict.items():
                self.slurmReservation2point(ts, rname, res, points)
            self.sav_res_dict = res_dict

        with self.lock:
            self.points.extend(points)

        time.sleep(PyslurmReader.INTERVAL)
def get_jobs_by_nodes():
    jobs = pyslurm.job().get()
    nodes = pyslurm.node().get()
    returned_nodes = {}
    for node_id, node in nodes.iteritems():
        returned_jobs = {}
        # filter jobs by node
        for jobid, job in jobs.iteritems():
            nodes_list = job['cpus_allocated'].keys()
            if node_id in nodes_list:
                returned_jobs[jobid] = job
        returned_nodes[node_id] = returned_jobs
    return returned_nodes
def get_jobs_by_nodes():
    jobs = get_from_cache(pyslurm.job().get, 'get_jobs')
    nodes = get_from_cache(pyslurm.node().get, 'get_nodes')
    returned_nodes = {}
    for node_id, node in nodes.iteritems():
        returned_jobs = {}
        # filter jobs by node
        for jobid, job in jobs.iteritems():
            nodes_list = job['cpus_allocated'].keys()
            if node_id in nodes_list:
                returned_jobs[jobid] = job
        returned_nodes[node_id] = filter_entities('jobs', returned_jobs)
    return returned_nodes
def display_nodes_from_github():
    try:
        Nodes = pyslurm.node()
        node_dict = Nodes.get()
        if len(node_dict) > 0:
            display(node_dict)
            print()
            print("Node IDs - {0}".format(Nodes.ids()))
        else:
            print("No Nodes found !")
    except ValueError as e:
        print("Error - {0}".format(e.args[0]))
def test_node_count():
    all_nodes = pyslurm.node().get()
    all_node_ids = pyslurm.node().ids()
    assert len(all_nodes) == len(all_node_ids)
def get_nodes():
    if mocking:
        return mock('nodes.json')
    nodes = pyslurm.node().get()
    return nodes
Valid States:

    NODE_RESUME
    NODE_STATE_DRAIN
    NODE_STATE_COMPLETING
    NODE_STATE_NO_RESPOND
    NODE_STATE_POWER_SAVE
    NODE_STATE_FAIL
    NODE_STATE_POWER_UP

Some states are not valid on a Blue Gene.
"""
from __future__ import print_function

import pyslurm

Node_dict = {
    'node_names': 'bps000',
    'node_state': pyslurm.NODE_STATE_DRAIN,
    'reason': 'API test',
}

try:
    a = pyslurm.node()
    rc = a.update(Node_dict)
except ValueError as e:
    print("Node Update error - {0}".format(e.args[0]))
else:
    print("Node {0} successfully updated".format(Node_dict["node_names"]))
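# A hedged follow-up sketch: re-read the node to confirm the drain took
# effect (the exact state string depends on the node's prior state).
node_info = pyslurm.node().find_id(Node_dict["node_names"])
print("State after update: {0}".format(node_info["state"]))  # e.g. "IDLE+DRAIN"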
def test_node_ids(): """Node: Test node().ids() return type.""" all_node_ids = pyslurm.node().ids() assert_true(isinstance(all_node_ids, list))
def get_nodes():
    nodes = pyslurm.node().get()
    return nodes
def test_node_count(): """Node: Test node count.""" all_nodes = pyslurm.node().get() all_node_ids = pyslurm.node().ids() assert_equals(len(all_nodes), len(all_node_ids))
print("Total Configured CPUs : {0:>8}".format( metrics["total_cpus_config"])) print("Cluster CPU Utilization : {0:>7}%".format( metrics["total_cpus_alloc"] * 100 / metrics["total_cpus_avail"])) print() # MEMORY print("Total Allocated Memory : {0:>8}".format( human_readable(metrics["total_memory_alloc"] * 1024 * 1024))) print("Total Eligible Memory : {0:>8}".format( human_readable(metrics["total_memory_avail"] * 1024 * 1024))) print("Total Configured Memory : {0:>8}".format( human_readable(metrics["total_memory_config"] * 1024 * 1024))) print("Cluster Memory Utilization : {0:>7}%".format( metrics["total_memory_alloc"] * 100 / metrics["total_memory_avail"])) print() if __name__ == "__main__": try: # Make sure pyslurm works, or else exit here pyslurmnode = pyslurm.node() # Get all node info nodes = pyslurmnode.get() except ValueError as e: print('Query failed - {0}').format(e) sys.exit(1) metrics = get_util(nodes) display_metrics(metrics)
def get_nodes():
    nodes = pyslurm.node().get()
    return jsonify(nodes)
def test_node_ids():
    all_node_ids = pyslurm.node().ids()
    assert type(all_node_ids) is ListType
def test_node_get():
    all_nodes = pyslurm.node().get()
    assert type(all_nodes) is DictType
            ddate = pyslurm.epoch2date(ddate)
            print("\t{0:<17} : {1}".format(part_key, ddate))
        elif 'reason_uid' in part_key and value['reason'] is None:
            print("\t{0:<17} : ".format(part_key))
        else:
            print("\t{0:<17} : {1}".format(part_key, value[part_key]))
    print('{0:*^80}'.format(''))


if __name__ == "__main__":
    import pyslurm

    try:
        Nodes = pyslurm.node()
        node_dict = Nodes.get()
        if len(node_dict) > 0:
            display(node_dict)
            print()
            print("Node IDs - {0}".format(Nodes.ids()))
        else:
            print("No Nodes found !")
    except ValueError as e:
        print("Error - {0}".format(e.args[0]))
def test_node_get(): """Node: Test node().get() return type.""" all_nodes = pyslurm.node().get() assert_true(isinstance(all_nodes, dict))