Example #1
def test_node_scontrol():
    """Node: Compare scontrol values to PySlurm values."""
    all_node_ids = pyslurm.node().ids()
    test_node = all_node_ids[-1]

    test_node_info = pyslurm.node().find_id(test_node)
    assert_equals(test_node, test_node_info["name"])

    sctl_dict = scontrol_show('node', test_node)

    assert_equals(test_node_info["alloc_mem"], int(sctl_dict["AllocMem"]))
    assert_equals(test_node_info["boards"], int(sctl_dict["Boards"]))
    assert_equals(test_node_info["alloc_cpus"], int(sctl_dict["CPUAlloc"]))
    assert_equals(test_node_info["cpus"], int(sctl_dict["CPUTot"]))
    assert_equals(test_node_info["cores"], int(sctl_dict["CoresPerSocket"]))
    assert_equals(test_node_info["energy"]["current_watts"], int(sctl_dict["CurrentWatts"]))
    assert_equals(test_node_info["name"], sctl_dict["NodeName"])
    assert_equals(test_node_info["node_addr"], sctl_dict["NodeAddr"])
    assert_equals(test_node_info["node_hostname"], sctl_dict["NodeHostName"])
    assert_equals(test_node_info["partitions"], sctl_dict["Partitions"].split(","))
    assert_equals(test_node_info["real_memory"], int(sctl_dict["RealMemory"]))
    assert_equals(test_node_info["sockets"], int(sctl_dict["Sockets"]))
    assert_equals(test_node_info["state"], sctl_dict["State"])
    assert_equals(test_node_info["threads"], int(sctl_dict["ThreadsPerCore"]))
    assert_equals(test_node_info["tmp_disk"], int(sctl_dict["TmpDisk"]))
    assert_equals(test_node_info["weight"], int(sctl_dict["Weight"]))
Example #2
def test_node_scontrol():
    """Node: Compare scontrol values to PySlurm values."""
    all_node_ids = pyslurm.node().ids()
    test_node = all_node_ids[-1]

    test_node_info = pyslurm.node().find_id(test_node)
    assert_equals(test_node, test_node_info["name"])

    sctl = subprocess.Popen(["scontrol", "-d", "show", "node", test_node],
                            stdout=subprocess.PIPE).communicate()
    sctl_stdout = sctl[0].strip().decode("UTF-8").split()
    sctl_dict = dict((value.split("=")[0], value.split("=")[1])
                     for value in sctl_stdout)

    assert_equals(test_node_info["alloc_mem"], int(sctl_dict["AllocMem"]))
    assert_equals(test_node_info["boards"], int(sctl_dict["Boards"]))
    assert_equals(test_node_info["alloc_cpus"], int(sctl_dict["CPUAlloc"]))
    assert_equals(test_node_info["err_cpus"], int(sctl_dict["CPUErr"]))
    assert_equals(test_node_info["cpus"], int(sctl_dict["CPUTot"]))
    assert_equals(test_node_info["energy"]["consumed_energy"], int(sctl_dict["ConsumedJoules"]))
    assert_equals(test_node_info["cores"], int(sctl_dict["CoresPerSocket"]))
    assert_equals(test_node_info["energy"]["current_watts"], int(sctl_dict["CurrentWatts"]))
    assert_equals(test_node_info["name"], sctl_dict["NodeName"])
    assert_equals(test_node_info["node_addr"], sctl_dict["NodeAddr"])
    assert_equals(test_node_info["node_hostname"], sctl_dict["NodeHostName"])

    assert_equals(test_node_info["partitions"], sctl_dict["Partitions"].split(","))

    assert_equals(test_node_info["real_memory"], int(sctl_dict["RealMemory"]))
    assert_equals(test_node_info["sockets"], int(sctl_dict["Sockets"]))
    assert_equals(test_node_info["state"], sctl_dict["State"])
    assert_equals(test_node_info["threads"], int(sctl_dict["ThreadsPerCore"]))
    assert_equals(test_node_info["tmp_disk"], int(sctl_dict["TmpDisk"]))
    assert_equals(test_node_info["weight"], int(sctl_dict["Weight"]))
Example #3
def test_node_scontrol():
    """Node: Compare scontrol values to PySlurm values."""
    all_node_ids = pyslurm.node().ids()
    test_node = all_node_ids[0]

    test_node_info = pyslurm.node().find_id(test_node)
    assert_equals(test_node, test_node_info["name"])

    sctl = subprocess.Popen(["scontrol", "-d", "show", "node", test_node],
                            stdout=subprocess.PIPE).communicate()
    sctl_stdout = sctl[0].strip().decode("UTF-8").split()
    sctl_dict = dict((value.split("=")[0], value.split("=")[1])
                     for value in sctl_stdout)

    assert_equals(test_node_info["alloc_mem"], int(sctl_dict["AllocMem"]))
    assert_equals(test_node_info["boards"], int(sctl_dict["Boards"]))
    assert_equals(test_node_info["alloc_cpus"], int(sctl_dict["CPUAlloc"]))
    assert_equals(test_node_info["err_cpus"], int(sctl_dict["CPUErr"]))
    assert_equals(test_node_info["cpus"], int(sctl_dict["CPUTot"]))
    assert_equals(test_node_info["energy"]["consumed_energy"], int(sctl_dict["ConsumedJoules"]))
    assert_equals(test_node_info["cores"], int(sctl_dict["CoresPerSocket"]))
    assert_equals(test_node_info["energy"]["current_watts"], int(sctl_dict["CurrentWatts"]))
    assert_equals(test_node_info["name"], sctl_dict["NodeName"])
    assert_equals(test_node_info["node_addr"], sctl_dict["NodeAddr"])
    assert_equals(test_node_info["node_hostname"], sctl_dict["NodeHostName"])
    assert_equals(test_node_info["real_memory"], int(sctl_dict["RealMemory"]))
    assert_equals(test_node_info["sockets"], int(sctl_dict["Sockets"]))
    assert_equals(test_node_info["state"], sctl_dict["State"])
    assert_equals(test_node_info["threads"], int(sctl_dict["ThreadsPerCore"]))
    assert_equals(test_node_info["tmp_disk"], int(sctl_dict["TmpDisk"]))
    assert_equals(test_node_info["weight"], int(sctl_dict["Weight"]))
Example #4
def test_node_update():
    """Node: Test node().update()."""
    node_test_before = pyslurm.node().find_id("c10")
    assert_equals(node_test_before["state"], "IDLE")

    node_test_update = {
        "node_names": "c10",
        "node_state": pyslurm.NODE_STATE_DRAIN,
        "reason": "unit testing"
    }

    rc = pyslurm.node().update(node_test_update)
    assert_equals(rc, 0)

    node_test_during = pyslurm.node().find_id("c10")
    assert_equals(node_test_during["state"], "IDLE+DRAIN")

    node_test_update = {
        "node_names": "c10",
        "node_state": pyslurm.NODE_RESUME
    }

    rc = pyslurm.node().update(node_test_update)
    assert_equals(rc, 0)

    node_test_after = pyslurm.node().find_id("c10")
    assert_equals(node_test_after["state"], "IDLE")
Example #5
def is_node(n):
    try:
        pyslurm.node().find_id(n)
    except IndexError:
        return False

    return True
Example #6
def test_node_update():
    """Node: Test node().update()."""
    node_test_before = pyslurm.node().find_id("c10")
    assert_equals(node_test_before["state"], "IDLE")

    node_test_update = {
        "node_names": "c10",
        "node_state": pyslurm.NODE_STATE_DRAIN,
        "reason": "unit testing"
    }

    rc = pyslurm.node().update(node_test_update)
    assert_equals(rc, 0)

    node_test_during = pyslurm.node().find_id("c10")
    assert_equals(node_test_during["state"], "IDLE+DRAIN")

    node_test_update = {
        "node_names": "c10",
        "node_state": pyslurm.NODE_RESUME
    }

    rc = pyslurm.node().update(node_test_update)
    assert_equals(rc, 0)

    node_test_after = pyslurm.node().find_id("c10")
    assert_equals(node_test_after["state"], "IDLE")
Example #7
def test_gres_used_parser():
    """Node: Test node().parse_gres()."""
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:2(IDX:1,3),lscratch:0"),
        ["gpu:p100:2(IDX:1,3)", "lscratch:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:0,hbm:0"),
        ["gpu:0", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:0(IDX:N/A),hbm:0"),
        ["gpu:p100:0(IDX:N/A)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:1(IDX:0),hbm:0"),
        ["gpu:p100:1(IDX:0)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:1(IDX:1),hbm:0"),
        ["gpu:p100:1(IDX:1)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:2(IDX:0-1),hbm:0"),
        ["gpu:p100:2(IDX:0-1)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("hbm:0"),
        ["hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("lscratch:0,hbm:0"),
        ["lscratch:0", "hbm:0"]
    )
Example #8
def test_gres_used_parser():
    """Node: Test node().parse_gres()."""
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:2(IDX:1,3),lscratch:0"),
        ["gpu:p100:2(IDX:1,3)", "lscratch:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:0,hbm:0"),
        ["gpu:0", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:0(IDX:N/A),hbm:0"),
        ["gpu:p100:0(IDX:N/A)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:1(IDX:0),hbm:0"),
        ["gpu:p100:1(IDX:0)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:1(IDX:1),hbm:0"),
        ["gpu:p100:1(IDX:1)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("gpu:p100:2(IDX:0-1),hbm:0"),
        ["gpu:p100:2(IDX:0-1)", "hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("hbm:0"),
        ["hbm:0"]
    )
    assert_equals(
        pyslurm.node().parse_gres("lscratch:0,hbm:0"),
        ["lscratch:0", "hbm:0"]
    )
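
The splitting behaviour exercised by these tests (commas separate GRES entries, but commas inside parentheses such as "IDX:1,3" do not) can be reproduced with a small standalone helper. The sketch below only shows equivalent logic under that assumption; it is not pyslurm's actual parse_gres() implementation.

import re

def split_gres(gres_str):
    """Split a GRES string on commas that are not inside parentheses."""
    if not gres_str:
        return []
    return re.split(r",(?![^(]*\))", gres_str)

# Matches the expectations in the tests above:
print(split_gres("gpu:p100:2(IDX:1,3),lscratch:0"))  # ['gpu:p100:2(IDX:1,3)', 'lscratch:0']
print(split_gres("gpu:0,hbm:0"))                     # ['gpu:0', 'hbm:0']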
Example #9
     def get_available_nodes(self, slices_size=1):
          """ Returns a list of currently available nodes in slices of slices_size
          ex: for slices of size 4: ['cn[100-103]', 'cn[109,150-152]']
          :param slices_size: slice size
          :type slices_size: int
          :returns: list of node ids
          :rtype: list """

          node_list=[]
          a = pyslurm.node()
          node_dict = a.get()
          node_count=0
          nodeset = NodeSet()
          if len(node_dict) > 0:
               for key, value in sorted(node_dict.iteritems()):
                    if value['state']=='IDLE':
                         nodetype=value
                         nodeset.update(key)
                         node_count+=1
                    if node_count==slices_size:
                         node_list.append(str(nodeset))
                         nodeset=NodeSet()
                         slice_str=None
                         node_count=0


          return node_list
Example #10
    def get_available_nodes(self, slices_size=1):
        """ Returns a list of currently available nodes in slices of slices_size
          ex: for slices of size 4: ['cn[100-103]', 'cn[109,150-152]']
          :param slices_size: slice size
          :type slices_size: int
          :returns: list of node ids
          :rtype: list """

        node_list = []
        a = pyslurm.node()
        node_dict = a.get()
        node_count = 0
        nodeset = NodeSet()
        if len(node_dict) > 0:
            for key, value in sorted(node_dict.iteritems()):
                if value['state'] == 'IDLE':
                    nodetype = value
                    nodeset.update(key)
                    node_count += 1
                if node_count == slices_size:
                    node_list.append(str(nodeset))
                    nodeset = NodeSet()
                    slice_str = None
                    node_count = 0

        return node_list
Example #11
	def collect(self):
		# Metric declarations
		NODES_CPUS = GaugeMetricFamily('slurm_nodes_cpus', 'Numbers of CPUs on nodes in the cluster grouped by {}'.format(', '.join(self.labels)), labels=self.labels)
		NODES_CPUS_ALLOC = GaugeMetricFamily('slurm_nodes_cpus_alloc', 'Numbers of CPUs allocated on nodes in the cluster grouped by {}'.format(', '.join(self.labels)), labels=self.labels)
		NODES_CPU_LOAD = GaugeMetricFamily('slurm_nodes_cpu_load', 'CPU loads on nodes in the cluster grouped by {}'.format(', '.join(self.labels)), labels=self.labels)
		NODES_MEM_TOTAL = GaugeMetricFamily('slurm_nodes_mem_total', 'Total amounts of memory available on nodes in the cluster grouped by {}'.format(', '.join(self.labels)), labels=self.labels, unit='bytes')
		NODES_MEM_FREE = GaugeMetricFamily('slurm_nodes_mem_free', 'Amounts of free memory allocated on nodes in the cluster grouped by {}'.format(', '.join(self.labels)), labels=self.labels, unit='bytes')
		NODES_MEM_ALLOC = GaugeMetricFamily('slurm_nodes_mem_alloc', 'Amounts of memory allocated on nodes in the cluster grouped by {}'.format(', '.join(self.labels)), labels=self.labels, unit='bytes')
		
		# Load node info from Slurm
		nodes = pyslurm.node().get()
		cluster = pyslurm.config().get()['cluster_name']
		for node in nodes.keys():
			for partition in nodes[node]['partitions']:
				labels_ = [cluster, partition] +  [str(nodes[node][prop]) for prop in self.props]
				if 'METRIC_VALUE_NULL' in os.environ and os.environ['METRIC_VALUE_NULL'].lower() == 'include':
					NODES_CPUS.add_metric(labels_, nodes[node]['cpus'])
					NODES_CPUS_ALLOC.add_metric(labels_, nodes[node]['alloc_cpus'])
					NODES_CPU_LOAD.add_metric(labels_, nodes[node]['cpu_load']/100.0)
					NODES_MEM_TOTAL.add_metric(labels_, nodes[node]['real_memory']*1000**2) # MB to Bytes
					NODES_MEM_ALLOC.add_metric(labels_, nodes[node]['alloc_mem']*1000**2)   # MB to Bytes
					NODES_MEM_FREE.add_metric(labels_,  nodes[node]['free_mem']*2**20)      # MiB to Bytes
				else:
					NODES_CPUS.add_metric(labels_, nodes[node]['cpus'])			if nodes[node]['cpus'] else None
					NODES_CPUS_ALLOC.add_metric(labels_, nodes[node]['alloc_cpus'])		if nodes[node]['alloc_cpus'] else None
					NODES_CPU_LOAD.add_metric(labels_, nodes[node]['cpu_load']/100.0) 	if nodes[node]['cpu_load'] else None
					NODES_MEM_TOTAL.add_metric(labels_, nodes[node]['real_memory']*1000**2)	if nodes[node]['real_memory'] else None
					NODES_MEM_ALLOC.add_metric(labels_, nodes[node]['alloc_mem']*1000**2)	if nodes[node]['alloc_mem'] else None
					NODES_MEM_FREE.add_metric(labels_,  nodes[node]['free_mem']*2**20)	if nodes[node]['free_mem'] else None
		yield NODES_CPUS
		yield NODES_CPUS_ALLOC
		yield NODES_CPU_LOAD
		yield NODES_MEM_TOTAL
		yield NODES_MEM_FREE
		yield NODES_MEM_ALLOC
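
For context, a collector like the one above is normally registered with prometheus_client and served over HTTP. The following is a minimal, self-contained sketch of how that wiring might look; the NodeCollector class, its label set, and port 9100 are illustrative assumptions, not taken from the original project.

import time
import pyslurm
from prometheus_client import start_http_server
from prometheus_client.core import GaugeMetricFamily, REGISTRY

class NodeCollector(object):
    """Illustrative collector exposing one per-node CPU gauge."""
    def collect(self):
        nodes_cpus = GaugeMetricFamily('slurm_nodes_cpus', 'CPUs per node',
                                       labels=['cluster', 'partition', 'node'])
        cluster = pyslurm.config().get()['cluster_name']
        for name, node in pyslurm.node().get().items():
            for partition in node['partitions']:
                nodes_cpus.add_metric([cluster, partition, name], node['cpus'])
        yield nodes_cpus

if __name__ == '__main__':
    REGISTRY.register(NodeCollector())
    start_http_server(9100)  # metrics then available at http://localhost:9100/metrics
    while True:
        time.sleep(60)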
Example #12
    def retrieveInfluxPoints (self):
        global ignore_count
        #self.pyslurmQueryTime= datetime.now().timestamp()
        #pdb.set_trace()
        rp_points = DDict(list)      #{'autogen':[point...]}

        self.nodeData = pyslurm.node().get()

        #autogen.cpu_info: time, hostname, total_socket, total_thread, total_cores, cpu_model, is_vm
        #hostinfo_msgs = self.consume_hostinfo_msgs()
        #hostinfo      = self.process_list(hostinfo_msgs, self.hostinfo2point)
        #rp_points['autogen'] = hostinfo
        #logger.debug("MQTTReader generate {} hostinfo points".format(len(hostinfo)))

        #autogen.cpu_load: time, hostname, proc_*, load_*, cpu_*, mem_*, net_*, disk_*
        hostperf_msgs = self.consume_hostperf_msgs()
        hostperf      = self.process_list(hostperf_msgs, self.hostperf2point)
        rp_points['autogen'] = hostperf
        logger.debug("MQTTReader generate {} hostperf points".format(len(hostperf)))

        #autogen.cpu_proc_info, job_proc_mon
        hostproc_msgs = self.consume_hostproc_msgs()
        ignore_count  = 0
        self.process_list_add(rp_points, hostproc_msgs, self.hostproc2point)
        logger.debug("MQTTReader generate {} {} points after adding hostproc, ignore {} points".format(len(rp_points['autogen']), len(rp_points['one_month']), ignore_count))

        #points = hostinfo + hostperf + hostproc
        if self.cpu_up_ts_count > 0:
           with open (self.TS_FNAME, 'w') as f:
                json.dump(self.cpu_up_ts, f)
           self.cpu_up_ts_count = 0

        self.node2tsPidsCache.writeFile ()         #save intermediate data structure, node2
        return rp_points
Example #13
def test_node_scontrol():
    all_node_ids = pyslurm.node().ids()
    test_node = all_node_ids[0]
    assert type(test_node) is StringType

    test_node_info = pyslurm.node().find_id(test_node)
    assert test_node == test_node_info["name"]

    scontrol = subprocess.Popen(["scontrol", "-d", "show", "node", test_node],
                                stdout=subprocess.PIPE).communicate()
    scontrol_stdout = scontrol[0].strip().split()
    scontrol_dict = {
        value.split("=")[0]: value.split("=")[1]
        for value in scontrol_stdout
    }

    assert test_node_info["alloc_mem"] == int(scontrol_dict["AllocMem"])
    assert test_node_info["arch"] == scontrol_dict["Arch"]
    assert test_node_info["boards"] == int(scontrol_dict["Boards"])
    #BootTime=2016-01-12T23:56:26
    assert test_node_info["alloc_cpus"] == int(scontrol_dict["CPUAlloc"])
    assert test_node_info["err_cpus"] == int(scontrol_dict["CPUErr"])
    assert test_node_info["cpus"] == int(scontrol_dict["CPUTot"])
    #CPULoad=0.01
    #CapWatts=n/a
    assert test_node_info["energy"]["consumed_energy"] == int(
        scontrol_dict["ConsumedJoules"])
    assert test_node_info["cores"] == int(scontrol_dict["CoresPerSocket"])
    assert test_node_info["energy"]["current_watts"] == int(
        scontrol_dict["CurrentWatts"])
    # TODO: skipping some
    assert test_node_info["features"] == scontrol_dict["Features"]
    assert test_node_info["free_mem"] == int(scontrol_dict["FreeMem"])
    # TODO: skipping some
    assert test_node_info["name"] == scontrol_dict["NodeName"]
    assert test_node_info["node_addr"] == scontrol_dict["NodeAddr"]
    assert test_node_info["node_hostname"] == scontrol_dict["NodeHostName"]
    assert test_node_info["os"] == scontrol_dict["OS"]
    assert test_node_info["real_memory"] == int(scontrol_dict["RealMemory"])
    assert test_node_info["sockets"] == int(scontrol_dict["Sockets"])
    assert test_node_info["state"] == scontrol_dict["State"]
    assert test_node_info["threads"] == int(scontrol_dict["ThreadsPerCore"])
    assert test_node_info["tmp_disk"] == int(scontrol_dict["TmpDisk"])
    assert test_node_info["version"] == scontrol_dict["Version"]
    assert test_node_info["weight"] == int(scontrol_dict["Weight"])
Example #14
 def __init__(self):
     self.noFarm = ['slurmweb', 'huematrix']
     self.nodeInformations = {}
     self.__hosts = hostlist()
     self.__node = node()
     self.SearchNodes()
     self.nodes = sorted(self.nodeInformations.keys())
     self.state = ''
     self.node = ''
Example #15
def get_cluster():
    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    return jsonify(cluster)
Example #16
 def getJobAllocGPU(job, node_dict=None):
     if not node_dict:
         node_dict = pyslurm.node().get()
     if not job['cpus_allocated']:
         return None
     node_list = [node_dict[node] for node in job['cpus_allocated']]
     gpus_allocated = MyTool.getGPUAlloc_layout(node_list,
                                                job['gres_detail'])
     return gpus_allocated
Example #17
def get(id, limit=100):
    node_o = pyslurm.node()
    try:
        this_node = node_o.get_node(str(id))
    except Exception as e:
        resp = {"code": 500, "message": str(e)}
        return resp, 500
    if len(this_node.values()) < 1:
        return NoContent, 204
    return dict(this_node[str(id)]), 200
Example #18
def update_nodes_from_tests(prefix,
                            test_type,
                            fail_nodes=None,
                            error_nodes=None,
                            down=False):
    fail_nodes_path = os.path.join(prefix, test_type, 'fail_nodes')
    try:
        fail_nodes_ = set(bench.util.read_node_list(fail_nodes_path))
    except IOError as ex:
        logger.info('unable to read {0}'.format(fail_nodes_path))
        logger.debug(ex, exc_info=True)
        fail_nodes_ = set()

    error_nodes_path = os.path.join(prefix, test_type, 'error_nodes')
    try:
        error_nodes_ = set(bench.util.read_node_list(error_nodes_path))
    except IOError as ex:
        logger.info('unable to read {0}'.format(error_nodes_path))
        logger.debug(ex, exc_info=True)
        error_nodes_ = set()

    # by default, reserve fail_nodes and error_nodes
    if not (fail_nodes or error_nodes):
        nodes_to_update = fail_nodes_ | error_nodes_
    else:
        nodes_to_update = set()
        if fail_nodes:
            nodes_to_update |= fail_nodes_
        if error_nodes:
            nodes_to_update |= error_nodes_

    if down:
        node_state = pyslurm.NODE_STATE_DOWN
        node_state_s = 'DOWN'
    else:
        node_state = pyslurm.NODE_STATE_DRAIN
        node_state_s = 'DRAINED'

    pyslurm_node = pyslurm.node()

    for node in sorted(nodes_to_update):
        current_node_state = pyslurm_node.find_id(node)['node_state']
        if current_node_state.startswith(node_state_s):
            continue
        node_update = {
            'node_names': node,
            'node_state': node_state,
            'reason': 'bench:{0}'.format(test_type),
        }
        rc = pyslurm_node.update(node_update)
        if rc != 0:
            logger.error('unable to update node {0}: {1}'.format(
                node, pyslurm.slurm_strerror(pyslurm.slurm_get_errno())))
        else:
            logger.info('{0} set to {1}'.format(node, node_state_s))
Example #19
def update_nodes_from_tests (prefix, test_type, fail_nodes=None,
                             error_nodes=None, down=False):
    fail_nodes_path = os.path.join(prefix, test_type, 'fail_nodes')
    try:
        fail_nodes_ = set(bench.util.read_node_list(fail_nodes_path))
    except IOError as ex:
        logger.info('unable to read {0}'.format(fail_nodes_path))
        logger.debug(ex, exc_info=True)
        fail_nodes_ = set()

    error_nodes_path = os.path.join(prefix, test_type, 'error_nodes')
    try:
        error_nodes_ = set(bench.util.read_node_list(error_nodes_path))
    except IOError as ex:
        logger.info('unable to read {0}'.format(error_nodes_path))
        logger.debug(ex, exc_info=True)
        error_nodes_ = set()

    # by default, reserve fail_nodes and error_nodes
    if not (fail_nodes or error_nodes):
        nodes_to_update = fail_nodes_ | error_nodes_
    else:
        nodes_to_update = set()
        if fail_nodes:
            nodes_to_update |= fail_nodes_
        if error_nodes:
            nodes_to_update |= error_nodes_

    if down:
        node_state = pyslurm.NODE_STATE_DOWN
        node_state_s = 'DOWN'
    else:
        node_state = pyslurm.NODE_STATE_DRAIN
        node_state_s = 'DRAINED'

    pyslurm_node = pyslurm.node()

    for node in sorted(nodes_to_update):
        current_node_state = pyslurm_node.find_id(node)['node_state']
        if current_node_state.startswith(node_state_s):
            continue
        node_update = {
            'node_names': node,
            'node_state': node_state,
            'reason': 'bench:{0}'.format(test_type),
        }
        rc = pyslurm_node.update(node_update)
        if rc != 0:
            logger.error('unable to update node {0}: {1}'.format(
                node,
                pyslurm.slurm_strerror(pyslurm.slurm_get_errno())
            ))
        else:
            logger.info('{0} set to {1}'.format(
                node, node_state_s))
Example #20
def get_core_counts():
    data = dict()
    n = pyslurm.node()
    nodes = n.get()
    data['total'] = [0, 0, 0]
    for id, node in nodes.iteritems():
        data[id] = [
            node['alloc_cpus'], node['cpus'] - node['alloc_cpus'], node['cpus']
        ]
        data['total'] = [x + y for x, y in zip(data['total'], data[id])]
    return data
Example #21
    def run(self):
        #pdb.set_trace()
        while True:
            curr_ts = time.time()
            slurmJobs = pyslurm.job().get()
            slurmNodes = pyslurm.node().get()
            msgs = self.source.retrieveMsgs()
            if msgs:
                self.dealData(curr_ts, slurmJobs, slurmNodes, msgs)

            time.sleep(self.INTERVAL)
Example #22
    def pre_update(self):
        # Ganglia
        self.ganglia_data = ganglia.Stats(do_cpus=True).all

        # Slurm
        self.pyslurm_node = pyslurm.node().get()
        self.pyslurm_job = pyslurm.job().get()

        # Influx
        self.update_mem_data()
        self.prune_mem_max()
Example #23
def test_node_scontrol():
    all_node_ids = pyslurm.node().ids()
    test_node = all_node_ids[0]
    assert type(test_node) is StringType

    test_node_info = pyslurm.node().find_id(test_node)
    assert test_node == test_node_info["name"]

    scontrol = subprocess.Popen(["scontrol", "-d", "show", "node", test_node],
                                stdout=subprocess.PIPE).communicate()
    scontrol_stdout = scontrol[0].strip().split()
    scontrol_dict = {value.split("=")[0]: value.split("=")[1]
                     for value in scontrol_stdout}

    assert test_node_info["alloc_mem"] == int(scontrol_dict["AllocMem"])
    assert test_node_info["arch"] == scontrol_dict["Arch"]
    assert test_node_info["boards"] == int(scontrol_dict["Boards"])
    #BootTime=2016-01-12T23:56:26
    assert test_node_info["alloc_cpus"] == int(scontrol_dict["CPUAlloc"])
    assert test_node_info["err_cpus"] == int(scontrol_dict["CPUErr"])
    assert test_node_info["cpus"] == int(scontrol_dict["CPUTot"])
    #CPULoad=0.01
    #CapWatts=n/a
    assert test_node_info["energy"]["consumed_energy"] == int(scontrol_dict["ConsumedJoules"])
    assert test_node_info["cores"] == int(scontrol_dict["CoresPerSocket"])
    assert test_node_info["energy"]["current_watts"] == int(scontrol_dict["CurrentWatts"])
    # TODO: skipping some
    assert test_node_info["features"] == scontrol_dict["Features"]
    assert test_node_info["free_mem"] == int(scontrol_dict["FreeMem"])
    # TODO: skipping some
    assert test_node_info["name"] == scontrol_dict["NodeName"]
    assert test_node_info["node_addr"] == scontrol_dict["NodeAddr"]
    assert test_node_info["node_hostname"] == scontrol_dict["NodeHostName"]
    assert test_node_info["os"] == scontrol_dict["OS"]
    assert test_node_info["real_memory"] == int(scontrol_dict["RealMemory"])
    assert test_node_info["sockets"] == int(scontrol_dict["Sockets"])
    assert test_node_info["state"] == scontrol_dict["State"]
    assert test_node_info["threads"] == int(scontrol_dict["ThreadsPerCore"])
    assert test_node_info["tmp_disk"] == int(scontrol_dict["TmpDisk"])
    assert test_node_info["version"] == scontrol_dict["Version"]
    assert test_node_info["weight"] == int(scontrol_dict["Weight"])
Example #24
def sinfo():

    # Partition and node lists are required
    # to compute the sinfo information
    partitions = pyslurm.partition().get()
    nodes = pyslurm.node().get()

    # Retrieving the state of each node
    nodes_state = dict(
        (node.lower(), attributes['state'].lower())
        for node, attributes in nodes.iteritems()
    )

    # For all partitions, retrieving the state of each node
    sinfo_data = {}
    for name, attr in partitions.iteritems():

        for node in list(NodeSet(attr['nodes'])):
            key = (name, nodes_state[node])
            if key not in sinfo_data.keys():
                sinfo_data[key] = []
            sinfo_data[key].append(node)

    # Preparing the response
    resp = []
    for k, nodes in sinfo_data.iteritems():
        name, state = k
        partition = partitions[name]
        avail = partition['state'].lower()
        min_nodes = partition['min_nodes']
        max_nodes = partition['max_nodes']
        total_nodes = partition['total_nodes']
        job_size = "{0}-{1}".format(min_nodes, max_nodes)
        job_size = job_size.replace('UNLIMITED', 'infinite')
        time_limit = partition['max_time_str'].replace('UNLIMITED', 'infinite')

        # Creating the nodeset
        nodeset = NodeSet()
        map(nodeset.update, nodes)

        resp.append({
          'name': name,
          'avail': avail,
          'job_size': job_size,
          'time_limit': time_limit,
          'nodes': total_nodes,
          'state': state,
          'nodelist': str(nodeset),
        })

    # jsonify cannot work on a list, so use json.dumps
    # and make sure headers are properly set
    return make_response(json.dumps(resp), mimetype='application/json')
Example #25
def sinfo():

    # Partition and node lists are required
    # to compute the sinfo information
    partitions = get_from_cache(pyslurm.partition().get, 'get_partitions')
    nodes = get_from_cache(pyslurm.node().get, 'get_nodes')

    # Retrieving the state of each node
    nodes_state = dict(
        (node.lower(), attributes['state'].lower())
        for node, attributes in nodes.iteritems()
    )

    # For all partitions, retrieving the state of each node
    sinfo_data = {}
    for name, attr in partitions.iteritems():

        for node in list(NodeSet(attr['nodes'])):
            key = (name, nodes_state[node])
            if key not in sinfo_data.keys():
                sinfo_data[key] = []
            sinfo_data[key].append(node)

    # Preparing the response
    resp = []
    for k, nodes in sinfo_data.iteritems():
        name, state = k
        partition = partitions[name]
        avail = partition['state'].lower()
        min_nodes = partition['min_nodes']
        max_nodes = partition['max_nodes']
        total_nodes = partition['total_nodes']
        job_size = "{0}-{1}".format(min_nodes, max_nodes)
        job_size = job_size.replace('UNLIMITED', 'infinite')
        time_limit = partition['max_time_str'].replace('UNLIMITED', 'infinite')

        # Creating the nodeset
        nodeset = NodeSet()
        map(nodeset.update, nodes)

        resp.append({
          'name': name,
          'avail': avail,
          'job_size': job_size,
          'time_limit': time_limit,
          'nodes': total_nodes,
          'state': state,
          'nodelist': str(nodeset),
        })

    # jsonify cannot work on a list, so use json.dumps
    # and make sure headers are properly set
    return make_response(json.dumps(resp), mimetype='application/json')
Example #26
def get_cluster():
    if mocking:
        return mock('cluster.json')

    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    return cluster
Example #27
def get_cluster():
    if mocking:
        return mock('cluster.json')

    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    return cluster
Example #28
def test_node_update():
    """Node: Test node().update()."""

    time.sleep(3)
    test_node = pyslurm.node().ids()[-1]
    node_test_before = pyslurm.node().find_id(test_node)
    assert node_test_before["state"] == "IDLE"

    node_test_update = {
        "node_names": "c10",
        "node_state": pyslurm.NODE_STATE_DRAIN,
        "reason": "unit testing",
    }

    rc = pyslurm.node().update(node_test_update)
    assert rc == 0

    node_test_during = pyslurm.node().find_id("c10")
    assert node_test_during["state"] == "IDLE+DRAIN"

    node_test_update = {"node_names": "c10", "node_state": pyslurm.NODE_RESUME}

    rc = pyslurm.node().update(node_test_update)
    assert rc == 0

    node_test_after = pyslurm.node().find_id("c10")
    assert node_test_after["state"] == "IDLE"
Example #29
def nodes(ids=None):
    """
    Either return all nodes or a set of nodes from SLURM.
    """
    nodes = pyslurm.node().get()
    if ids is None:
        return nodes
    else:
        nodes_dict = {}
        if not isinstance(ids, list): ids = [ids]
        for idx in ids:
            nodes_dict[idx] = nodes[idx]
        return nodes_dict
Example #30
def search(limit=100):
    nodes = []
    node_o = pyslurm.node()
    try:
        all_nodes = node_o.get()
    except Exception as e:
        resp = {"code": 500, "message": str(e)}
        return resp, 500
    if len(all_nodes.values()) < 1:
        return NoContent, 204
    for node in all_nodes.values():
        nodes.append(node)
    return nodes[0:limit], 200
Example #31
    def _set_node_state(self, state, reason=None):
        """Set the node state to the provided argument."""
        node_dict = {
            'node_names': self.name,
            'node_state': state,
        }
        if reason:
            node_dict['reason'] = reason

        rc = pyslurm.node().update(node_dict)
        if rc == -1:
            return False
        return True
Example #32
def main():
    """
    Do some stuff, eventually printing output to stdout...
    """
    # Parse command-line arguments
    arguments = parse_arguments()

    # Logging setup
    if arguments.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    Nodes = pyslurm.node()
    node_dict = Nodes.get()
    Jobs = pyslurm.job()
    job_dict = Jobs.get()
    #print(job_dict)

    if len(node_dict) > 0 and len(job_dict) > 0:

        nt = get_nodetag(node_dict, job_dict, arguments)
        pc = get_pending(job_dict)

        if arguments.csv:
            print_csv(arguments.csv_header_suppress, nt, pc)
        else:
            js = get_aggregated_jobs(job_dict, arguments)
            print_usage(js)

    else:

        print "No Nodes and/or no Jobs found !"

    sys.exit()

    node_reservations = get_node_reservations()
    jobs = get_jobs(all_jobs=arguments.all_jobs)
    cred_totals, public_cores, public_nodes, public_nodes_free = get_counts(
        node_reservations, jobs)

    if arguments.free_cores:
        print_free_cores(cred_totals, public_cores)
    elif arguments.csv:
        print_csv(arguments.csv_header_suppress, cred_totals, public_cores,
                  public_nodes, public_nodes_free)
    else:
        print_output(cred_totals, public_cores, public_nodes,
                     public_nodes_free)
Example #33
def main():
    """
    Do some stuff, eventually printing output to stdout...
    """
    # Parse command-line arguments
    arguments = parse_arguments()

    # Logging setup
    if arguments.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)


    Nodes = pyslurm.node()
    node_dict = Nodes.get()
    Jobs = pyslurm.job()
    job_dict = Jobs.get()
    #print(job_dict)

    if len(node_dict) > 0 and len(job_dict) > 0:

        nt = get_nodetag(node_dict, job_dict, arguments)
        pc = get_pending(job_dict)

        if arguments.csv:
            print_csv(arguments.csv_header_suppress, nt, pc)
        else:
            js=get_aggregated_jobs(job_dict, arguments)
            print_usage(js)
            

    else:

        print "No Nodes and/or no Jobs found !"

    sys.exit()


    node_reservations = get_node_reservations()
    jobs = get_jobs(all_jobs=arguments.all_jobs)
    cred_totals, public_cores, public_nodes, public_nodes_free = get_counts(node_reservations, jobs)

    if arguments.free_cores:
        print_free_cores(cred_totals, public_cores)
    elif arguments.csv:
        print_csv(arguments.csv_header_suppress, cred_totals, public_cores, public_nodes, public_nodes_free)
    else:
        print_output(cred_totals, public_cores, public_nodes, public_nodes_free) 
Example #34
def get_cluster():

    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    resp = jsonify({
        'authentication': {
            'enabled': auth_enabled,
            'guest': guests_allowed
        },
        'data': cluster
    })
    return resp
Example #35
 def getUserAllocGPU(uid, node_dict=None):
     if not node_dict:
         node_dict = pyslurm.node().get()
     rlt = {}
     rlt_jobs = []
     jobs = PyslurmQuery.getUserCurrJobs(uid)
     if jobs:
         for job in jobs:
             job_gpus = PyslurmQuery.getJobAllocGPU(job, node_dict)
             if job_gpus:  # job has gpu
                 rlt_jobs.append(job)
                 for node, gpu_ids in job_gpus.items():
                     if node in rlt:
                         rlt[node].extend(gpu_ids)
                     else:
                         rlt[node] = gpu_ids
     return rlt, rlt_jobs
Example #36
def get_cluster():

    nodes = pyslurm.node().get()
    cluster = {}
    cluster['name'] = pyslurm.config().get()['cluster_name']
    cluster['nodes'] = len(nodes.keys())
    cluster['cores'] = 0
    for nodename, node in nodes.iteritems():
        cluster['cores'] += node['cpus']
    resp = jsonify({
        'authentication': {
            'enabled': auth_enabled,
            'guest': guests_allowed
        },
        'data': cluster
    })
    return resp
Example #37
def index_view(request):
    data = pyslurm.job().get()
    jobs = list(data.values())
    table = QueueTable(jobs, user_logged_in=get_current_user_id(request.user))

    table.order_by = '-submit_time'

    stats = {}
    nodes = pyslurm.node().get()
    stats["cpus_util"] = nodes["gpu-server"]["alloc_cpus"]  #FIXME
    stats["cpus_total"] = nodes["gpu-server"]["cpus"]  #FIXME
    stats["ram_util"] = nodes["gpu-server"]["alloc_mem"]  #FIXME
    stats["ram_total"] = nodes["gpu-server"]["real_memory"]  #FIXME
    stats["gpus_util"] = nodes["gpu-server"]["gres_used"]  #FIXME
    stats["gpus_total"] = nodes["gpu-server"]["gres"]  #FIXME

    return render(request, 'job_submission/job_queue.html', {'table': table, 'stats': stats})
Example #38
    def run(self):
        #pdb.set_trace()
        logger.info("Start running PyslurmReader ...")
        while True:
          # pyslurm query
          ts       = int(datetime.now().timestamp())
          job_dict = pyslurm.job().get()
          node_dict= pyslurm.node().get()
          part_dict= pyslurm.partition().get()
          qos_dict = pyslurm.qos().get()
          #res_dict = pyslurm.reservation().get()
          res_dict = {}  #TODO: pyslurm reservation coredump ERROR
          #js_dict  = pyslurm.jobstep().get()

          #convert to points
          points   = []
          for jid,job in job_dict.items():
              self.slurmJob2point(ts, job, points)
          finishJob = [jid for jid in self.sav_job_dict.keys() if jid not in job_dict.keys()]
          #logger.debug ("PyslurmReader::run: Finish jobs {}".format(finishJob))
          for jid in finishJob:
              del self.sav_job_dict[jid]

          for node in node_dict.values():
              self.slurmNode2point(ts, node, points)

          if json.dumps(part_dict) != json.dumps(self.sav_part_dict):
              for pname, part in part_dict.items():
                 self.slurmPartition2point(ts, pname, part, points)
              self.sav_part_dict = part_dict

          if json.dumps(qos_dict) != json.dumps(self.sav_qos_dict):
              for qname, qos in qos_dict.items():
                 self.slurmQOS2point(ts, qname, qos, points)
              self.sav_qos_dict = qos_dict

          if res_dict and (json.dumps(res_dict) != json.dumps(self.sav_res_dict)):
              for rname, res in res_dict.items():
                 self.slurmReservation2point(ts, rname, res, points)
              self.sav_res_dict = res_dict

          with self.lock:
              self.points.extend(points)

          time.sleep (PyslurmReader.INTERVAL)
Example #39
def get_jobs_by_nodes():

    jobs = pyslurm.job().get()
    nodes = pyslurm.node().get()

    returned_nodes = {}

    for node_id, node in nodes.iteritems():
        returned_jobs = {}
        # filter jobs by node
        for jobid, job in jobs.iteritems():
            nodes_list = job['cpus_allocated'].keys()
            if node_id in nodes_list:
                returned_jobs[jobid] = job

        returned_nodes[node_id] = returned_jobs

    return returned_nodes
Example #40
def get_jobs_by_nodes():

    jobs = get_from_cache(pyslurm.job().get, 'get_jobs')
    nodes = get_from_cache(pyslurm.node().get, 'get_nodes')

    returned_nodes = {}

    for node_id, node in nodes.iteritems():
        returned_jobs = {}
        # filter jobs by node
        for jobid, job in jobs.iteritems():
            nodes_list = job['cpus_allocated'].keys()
            if node_id in nodes_list:
                returned_jobs[jobid] = job

        returned_nodes[node_id] = filter_entities('jobs', returned_jobs)

    return returned_nodes
Example #41
def get_jobs_by_nodes():

    jobs = pyslurm.job().get()
    nodes = pyslurm.node().get()

    returned_nodes = {}

    for node_id, node in nodes.iteritems():
        returned_jobs = {}
        # filter jobs by node
        for jobid, job in jobs.iteritems():
            nodes_list = job['cpus_allocated'].keys()
            if node_id in nodes_list:
                returned_jobs[jobid] = job

        returned_nodes[node_id] = returned_jobs

    return returned_nodes
Example #42
def display_nodes_from_github():
    try:

        Nodes = pyslurm.node()
        node_dict = Nodes.get()

        if len(node_dict) > 0:

            display(node_dict)

            print()
            print("Node IDs - {0}".format(Nodes.ids()))

        else:

            print("No Nodes found !")

    except ValueError as e:
        print("Error - {0}".format(e.args[0]))
Example #43
def test_node_count():
    all_nodes = pyslurm.node().get()
    all_node_ids = pyslurm.node().ids()
    assert len(all_nodes) == len(all_node_ids)
Example #44
def get_nodes():
    if mocking:
        return mock('nodes.json')

    nodes = pyslurm.node().get()
    return nodes
Example #45
   Valid States :

          NODE_RESUME
          NODE_STATE_DRAIN
          NODE_STATE_COMPLETING
          NODE_STATE_NO_RESPOND
          NODE_STATE_POWER_SAVE
          NODE_STATE_FAIL
          NODE_STATE_POWER_UP

   Some states are not valid on a Blue Gene
"""

from __future__ import print_function

import pyslurm

Node_dict = {
    'node_names': 'bps000',
    'node_state': pyslurm.NODE_STATE_DRAIN, 
    'reason': 'API test'
    }

try:
    a = pyslurm.node()
    rc = a.update(Node_dict)
except ValueError as e:
    print("Node Update error - {0}".format(e.args[0]))
else:
    print("Node {0} successfully updated".format(Node_dict["node_names"]))
Example #46
def test_node_ids():
    """Node: Test node().ids() return type."""
    all_node_ids = pyslurm.node().ids()
    assert_true(isinstance(all_node_ids, list))
Example #47
def get_nodes():

    nodes = pyslurm.node().get()
    return nodes
Example #48
def test_node_count():
    """Node: Test node count."""
    all_nodes = pyslurm.node().get()
    all_node_ids = pyslurm.node().ids()
    assert_equals(len(all_nodes), len(all_node_ids))
    print("Total Configured CPUs      : {0:>8}".format(
          metrics["total_cpus_config"]))
    print("Cluster CPU Utilization    : {0:>7}%".format(
          metrics["total_cpus_alloc"] * 100 / metrics["total_cpus_avail"]))
    print()

    # MEMORY
    print("Total Allocated Memory     : {0:>8}".format(
          human_readable(metrics["total_memory_alloc"] * 1024 * 1024)))
    print("Total Eligible Memory      : {0:>8}".format(
          human_readable(metrics["total_memory_avail"] * 1024 * 1024)))
    print("Total Configured Memory    : {0:>8}".format(
          human_readable(metrics["total_memory_config"] * 1024 * 1024)))
    print("Cluster Memory Utilization : {0:>7}%".format(
          metrics["total_memory_alloc"] * 100 / metrics["total_memory_avail"]))
    print()


if __name__ == "__main__":
    try:
        # Make sure pyslurm works, or else exit here
        pyslurmnode = pyslurm.node()
        # Get all node info
        nodes = pyslurmnode.get()
    except ValueError as e:
        print('Query failed - {0}'.format(e))
        sys.exit(1)

    metrics = get_util(nodes)
    display_metrics(metrics)
Example #50
def get_nodes():
    nodes = pyslurm.node().get()
    return jsonify(nodes)
Example #51
def test_node_ids():
    all_node_ids = pyslurm.node().ids()
    assert type(all_node_ids) is ListType
Example #52
def test_node_get():
    all_nodes = pyslurm.node().get()
    assert type(all_nodes) is DictType
Example #53
						ddate = pyslurm.epoch2date(ddate)
						print("\t{0:<17} : {1}".format(part_key, ddate))
				elif ('reason_uid' in part_key and value['reason'] is None):
					print("\t{0:<17} : ".format(part_key))
				else: 
					print("\t{0:<17} : {1}".format(part_key, value[part_key]))

			print('{0:*^80}'.format(''))

if __name__ == "__main__":

	import pyslurm

	try:

		Nodes = pyslurm.node()
		node_dict = Nodes.get()

		if len(node_dict) > 0:

			display(node_dict)

			print()
			print("Node IDs - {0}".format(Nodes.ids()))

		else:
	
			print("No Nodes found !")

	except ValueError as e:
		print("Error - {0}".format(e.args[0]))
Example #54
def test_node_get():
    """Node: Test node().get() return type."""
    all_nodes = pyslurm.node().get()
    assert_true(isinstance(all_nodes, dict))