def get_available_nodes(self, slices_size=1):
    """Return the currently idle nodes, split into groups of about
    slices_size nodes.

    The idle nodes are read from the output of ``sinfo -h -t IDLE`` (sixth
    whitespace-separated column of each line) and merged in one NodeSet.
    That set is then split in ``len(nodeset) // slices_size`` sub-sets, so
    a group may hold more than slices_size nodes when the division is not
    exact.

    ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

    Args:
        (int) slices_size: slices size, must be >= 1

    Returns:
        (list of str) nodes_id, one folded node set per slice. The list is
        empty when sinfo fails or when fewer than slices_size nodes are
        idle.

    Raises:
        ValueError: if slices_size is lower than 1
    """
    if slices_size < 1:
        raise ValueError("slices_size must be a positive integer")

    cmd = "sinfo -h -t IDLE"
    cmd_output = Popen(cmd, cwd=os.getcwd(), shell=True, stdout=PIPE,
                       universal_newlines=True)
    # communicate() drains stdout while waiting for the child; calling
    # wait() first can deadlock once the child fills the pipe buffer.
    stdout, _ = cmd_output.communicate()
    if cmd_output.returncode:
        return []

    nodeset = NodeSet()
    for line in stdout.splitlines():
        fields = line.split()
        # Ignore blank or truncated lines instead of failing on fields[5]
        if len(fields) < 6:
            continue
        nodeset.update(fields[5])

    split_c = len(nodeset) // slices_size
    # NodeSet.split() requires a strictly positive number of sub-sets
    if split_c < 1:
        return []

    return [str(ns) for ns in nodeset.split(split_c)]
def get_available_nodes(self, slices_size=1):
    """
    Returns a list of currently available nodes by slice of slices_size
    ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

    The node table is read from pyslurm and walked in sorted key order.
    Only entries whose 'state' is 'IDLE' are kept; each time slices_size
    of them have been collected they are folded in one node set string.
    Trailing idle nodes that do not fill a complete slice are not returned.

    :param slices_size: slices size
    :type slices_size: int
    :returns: list of nodes_id
    :rtype: list of str
    """
    node_list = []
    nodeset = NodeSet()
    node_count = 0

    node_dict = pyslurm.node().get()

    # items() rather than iteritems() so this runs on Python 2 and 3
    for key, value in sorted(node_dict.items()):
        if value['state'] != 'IDLE':
            continue
        nodeset.update(key)
        node_count += 1
        if node_count == slices_size:
            # Slice complete: store it and start a new one
            node_list.append(str(nodeset))
            nodeset = NodeSet()
            node_count = 0

    return node_list
def get_available_nodes(self, slices_size=1):
    """Return the currently idle nodes, split into groups of about
    slices_size nodes.

    The idle nodes are read from the output of ``sinfo -h -t IDLE`` (sixth
    whitespace-separated column of each line) and merged in one NodeSet.
    That set is then split in ``len(nodeset) // slices_size`` sub-sets, so
    a group may hold more than slices_size nodes when the division is not
    exact.

    ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

    Args:
        (int) slices_size: slices size, must be >= 1

    Returns:
        (list of str) nodes_id, one folded node set per slice. The list is
        empty when the command fails or when fewer than slices_size nodes
        are idle.

    Raises:
        ValueError: if slices_size is lower than 1
    """
    if slices_size < 1:
        raise ValueError("slices_size must be a positive integer")

    cmd_str = "sinfo -h -t IDLE"
    ret_code, stdout, _ = utils.run_cmd(cmd_str, os.getcwd())
    if ret_code:
        print("!!Warning: unclebech was not able to get avaiable nodes")
        return []

    nodeset = NodeSet()
    # NOTE(review): assumes run_cmd returns stdout as an iterable of
    # lines - confirm against utils.run_cmd
    for line in stdout:
        fields = line.split()
        # Ignore blank or truncated lines instead of failing on fields[5]
        if len(fields) < 6:
            continue
        nodeset.update(fields[5])

    split_c = len(nodeset) // slices_size
    # NodeSet.split() requires a strictly positive number of sub-sets
    if split_c < 1:
        return []

    return [str(ns) for ns in nodeset.split(split_c)]
def get_available_nodes(self, slices_size=1):
    """
    Returns a list of currently available nodes by slice of slices_size
    ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

    Nodes come from the pyslurm node table, visited in sorted key order
    and filtered on the 'IDLE' state. A slice is emitted every time
    slices_size idle nodes have been gathered; idle nodes left over at
    the end (fewer than slices_size) are dropped.

    :param slices_size: slices size
    :type slices_size: int
    :returns: list of nodes_id
    :rtype: list of str
    """
    slurm_nodes = pyslurm.node()
    node_dict = slurm_nodes.get()

    node_list = []
    node_count = 0
    nodeset = NodeSet()

    # items() is available on both Python 2 and 3, unlike iteritems()
    for key, value in sorted(node_dict.items()):
        if value['state'] == 'IDLE':
            nodeset.update(key)
            node_count += 1
            if node_count == slices_size:
                # Current slice is full: record it and reset accumulators
                node_list.append(str(nodeset))
                nodeset = NodeSet()
                node_count = 0

    return node_list
def table_generate(self, root, topology):
    """Build the routing table used by the router.

    self.table is reset and then filled with one (destinations, gateways)
    tuple per direct child group of the root group: `destinations` is the
    NodeSet of every node found below that child in the topology and
    `gateways` is the child's own nodeset, i.e. the next hop to use to
    reach these destinations.

    Raises RouteResolvingError when `root` cannot be found in `topology`.
    """
    try:
        root_group = topology.find_nodegroup(root)
    except TopologyError:
        raise RouteResolvingError("Invalid root or gateway node: %s" % root)

    self.table = []
    for gateway_group in root_group.children():
        reachable = NodeSet()
        # Depth-first walk of the whole subtree hanging off this gateway
        pending = [gateway_group]
        while pending:
            current = pending.pop()
            reachable.update(current.children_ns())
            pending.extend(current.children())
        self.table.append((reachable, gateway_group.nodeset))
def get_truncated_nodes_lists(self, nnodes_list, nodes_id):
    """
    From a list of nodes number and a list of nodes id returns a list of
    nodes_id truncated according to nodes number

    For every count in nnodes_list, a NodeSet is built from nodes_id and
    its first `count` nodes are returned as a folded string.

    NOTE(review): the example below suggests that counts and node ids are
    paired one to one, but the code applies every count to the whole
    nodes_id value - confirm which one is intended.

    :param nnodes_list: ex [2,4]
    :type nnodes_list: list of int
    :param nodes_id: ex ['cn[100-104]','cn[50-84]']
    :type nodes_id: list of str
    :returns: truncated node list ex: ['cn[100-101]','cn[50-53]']
    :rtype: list of str
    :raises Exception: when a count is larger than the number of nodes
    """
    truncated = []
    for wanted in nnodes_list:
        pool = NodeSet()
        pool.update(nodes_id)
        if len(pool) < wanted:
            raise Exception('Number of nodes is greater than the giver number of nodes id')
        head = pool[:wanted]
        truncated.append(str(head))
    return truncated
def get_truncated_nodes_lists(self, nnodes_list, nodes_id):
    """
    From a list of nodes number and a list of nodes id returns a list of
    nodes_id truncated according to nodes number

    A single NodeSet is built from nodes_id; for every count in
    nnodes_list its first `count` nodes are returned as a folded string.

    NOTE(review): the example below suggests that counts and node ids are
    paired one to one, but every count is applied to the whole nodes_id
    value - confirm which one is intended.

    :param nnodes_list: ex [2,4]
    :type nnodes_list: list of int
    :param nodes_id: ex ['cn[100-104]','cn[50-84]']
    :type nodes_id: list of str
    :returns: truncated node list ex: ['cn[100-101]','cn[50-53]']
    :rtype: list of str
    :raises Exception: when a count is larger than the number of nodes
    """
    # Nothing requested: do not even parse nodes_id
    if not nnodes_list:
        return []

    # The node set does not depend on the requested count, build it once
    # (slicing below returns a new set and leaves this one untouched).
    nodeset = NodeSet()
    nodeset.update(nodes_id)
    available = len(nodeset)

    nodes_id_list = []
    for nnode in nnodes_list:
        if nnode > available:
            raise Exception(
                'Number of nodes is greater than the giver number of nodes id'
            )
        nodes_id_list.append(str(nodeset[:nnode]))
    return nodes_id_list
def set_test_environment(args):
    """Set up the test environment.

    Updates os.environ in place (PATH, CRT_CTX_SHARE_ADDR, OFI_INTERFACE,
    DAOS_TEST_LOG_DIR, D_LOG_FILE, PYTHONPATH) and creates the log
    directory on the local host and on the hosts listed in
    args.test_clients and args.test_servers.

    When OFI_INTERFACE is not set, the interface is picked among the
    /sys/class/net devices (lo excluded) that are "up": the one with the
    highest reported speed wins. The program exits with status 1 if no
    such interface is found.

    Args:
        args (argparse.Namespace): command line arguments for this program

    Returns:
        None

    """
    base_dir = get_build_environment()["PREFIX"]
    bin_dir = os.path.join(base_dir, "bin")
    sbin_dir = os.path.join(base_dir, "sbin")
    # /usr/sbin is not setup on non-root user for CI nodes.
    # SCM formatting tool mkfs.ext4 is located under
    # /usr/sbin directory.
    usr_sbin = os.path.sep + os.path.join("usr", "sbin")
    # NOTE(review): path is None when PATH is unset, which would make the
    # ":".join() below raise a TypeError
    path = os.environ.get("PATH")

    # Get the default interface to use if OFI_INTERFACE is not set
    interface = os.environ.get("OFI_INTERFACE")
    if interface is None:
        # Find all the /sys/class/net interfaces on the launch node
        # (excluding lo)
        print("Detecting network devices - OFI_INTERFACE not set")
        # Maps a speed to the first (alphabetically) active device
        # reporting that speed
        available_interfaces = {}
        net_path = os.path.join(os.path.sep, "sys", "class", "net")
        net_list = [dev for dev in os.listdir(net_path) if dev != "lo"]
        for device in sorted(net_list):
            # Get the interface state - only include active (up) interfaces
            with open(os.path.join(net_path, device, "operstate"), "r") as \
                    fileh:
                state = fileh.read().strip()
            # Only include interfaces that are up
            if state.lower() == "up":
                # Get the interface speed - used to select the fastest available
                with open(os.path.join(net_path, device, "speed"), "r") as \
                        fileh:
                    try:
                        speed = int(fileh.read().strip())
                        # KVM/Qemu/libvirt returns an EINVAL
                    except IOError as ioerror:
                        # Fall back to a speed of 1000 on EINVAL, any
                        # other read error is propagated
                        if ioerror.errno == errno.EINVAL:
                            speed = 1000
                        else:
                            raise
                print(" - {0:<5} (speed: {1:>6} state: {2})".format(
                    device, speed, state))
                # Only include the first active interface for each speed - first
                # is determined by an alphabetic sort: ib0 will be checked
                # before ib1
                if speed not in available_interfaces:
                    available_interfaces[speed] = device
        print("Available interfaces: {}".format(available_interfaces))
        try:
            # Select the fastest active interface available by sorting the speed
            interface = available_interfaces[sorted(available_interfaces)[-1]]
        except IndexError:
            # No active interface was found: sorted() returned an empty list
            print("Error obtaining a default interface from: {}".format(
                os.listdir(net_path)))
            exit(1)
    print("Using {} as the default interface".format(interface))

    # Update env definitions
    os.environ["PATH"] = ":".join([bin_dir, sbin_dir, usr_sbin, path])
    os.environ["CRT_CTX_SHARE_ADDR"] = "1"
    os.environ["OFI_INTERFACE"] = os.environ.get("OFI_INTERFACE", interface)

    # Set the default location for daos log files written during testing if not
    # already defined.
    if "DAOS_TEST_LOG_DIR" not in os.environ:
        os.environ["DAOS_TEST_LOG_DIR"] = DEFAULT_DAOS_TEST_LOG_DIR
    os.environ["D_LOG_FILE"] = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                            "daos.log")

    # Ensure the daos log files directory exists on each possible test node
    # (the local host is identified by its short name, domain stripped)
    test_hosts = NodeSet(socket.gethostname().split(".")[0])
    test_hosts.update(args.test_clients)
    test_hosts.update(args.test_servers)
    spawn_commands(test_hosts,
                   "mkdir -p {}".format(os.environ["DAOS_TEST_LOG_DIR"]))

    # Python paths required for functional testing
    # The directory name is "python<major>.<minor>" for Python 2 and
    # "python<major>" otherwise
    python_version = "python{}{}".format(
        version_info.major,
        "" if version_info.major > 2 else ".{}".format(version_info.minor))
    required_python_paths = [
        os.path.abspath("util/apricot"),
        os.path.abspath("util"),
        os.path.join(base_dir, "lib64", python_version, "site-packages"),
    ]

    # Check the PYTHONPATH env definition
    python_path = os.environ.get("PYTHONPATH")
    if python_path is None or python_path == "":
        # Use the required paths to define the PYTHONPATH env if it is not set
        os.environ["PYTHONPATH"] = ":".join(required_python_paths)
    else:
        # Append any missing required paths to the existing PYTHONPATH env
        # (existing entries are compared after user expansion and
        # conversion to absolute paths)
        defined_python_paths = [
            os.path.abspath(os.path.expanduser(path))
            for path in python_path.split(":")
        ]
        for required_path in required_python_paths:
            if required_path not in defined_python_paths:
                python_path += ":" + required_path
        os.environ["PYTHONPATH"] = python_path
def get_metrics_results(self, cluster, job, metrics, period):
    """Get the metrics of the job on the cluster for the period in
       parameters.

    It sends an HTTP request to InfluxDB service to download the metric
    values in JSON format and aggregates them over all nodes.

    :param cluster: cluster name, used as the `cluster` tag in the query
    :param job: job object, only its `jobid` attribute is read
    :param metrics: list of measurement names to request; its order is
                    the order of the values in each result row
    :param period: key of the `periods` mapping; used as the time window
                   of the query and to select the `group by` interval
    :returns: tuple (results, nodeset) where results maps every timestamp
              (as str) to a list holding one value per entry of
              `metrics`, and nodeset is the NodeSet of all the nodes
              found in the response
    :raises KeyError: if period is not in `periods`, or if the response
                      contains no 'results'/'series' entry
    :raises LookupError: if the service answers with HTTP status 404
    """
    time_group = periods[period]
    profiler = Profiler()

    metrics_s = "\"" + "\", \"".join(metrics) + "\""
    req = "select mean(value) from {metrics} " \
          "where time > now() - {period} " \
          "and cluster = '{cluster}' " \
          "and job = 'job_{job}' " \
          "group by time({time_group}), node fill(0)" \
          .format(metrics=metrics_s,
                  period=period,
                  cluster=cluster,
                  job=job.jobid,
                  time_group=time_group)

    profiler.meta('metrics_req', req)

    payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

    profiler.start('metrics_req')
    resp = requests.get(url=self.url, params=payload)
    profiler.stop('metrics_req')
    if resp.status_code == 404:
        raise LookupError("metrics not found for job {job} on cluster "
                          "{cluster}"
                          .format(job=job.jobid, cluster=cluster))

    profiler.start('metrics_proc')
    data = json.loads(resp.text)

    # data is a dict with 'results' key that is itself a list of dict with
    # 'series' key that is as well a list of dict, one dict per
    # metric/node association. Each dict has its own list of values. We
    # have to compute the sum of the values for all nodes at every
    # timestamp, for each metric.
    #
    # Ex:
    #
    # { "results": [
    #   { "series": [
    #     { "name": "cpu-system",
    #       "tags": {"node":"cn1"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then cpu-system for cn[2-3], cpu-user for cn[1-3] ...)
    #
    #     { "name": "cpus",
    #       "tags": {"node":"admin"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",6],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then memory-pss for cn[1-3] ...)
    #
    #   ]}
    # ]}

    series = data['results'][0]['series']

    results = {}
    nodeset = NodeSet()
    nb_metrics = len(metrics)

    for serie in series:
        metric = serie['name']
        # Position of this metric in every result row
        metric_idx = metrics.index(metric)
        node = serie['tags']['node'].encode('utf-8')
        if node not in nodeset:
            nodeset.update(node)

        for pair in serie['values']:
            timestamp = str(pair[0])
            value = pair[1]
            if timestamp not in results:
                # First value seen for this timestamp: every other metric
                # starts at 0
                row = results[timestamp] = [0] * nb_metrics
                row[metric_idx] = value
            elif metric in ['cpus', 'nodes']:
                # The cpus/nodes metrics can be produced by several batch
                # servers and thus returned multiple times by InfluxDB
                # server in the result of the request. We must take care to
                # not add the multiple results of this metric here!
                results[timestamp][metric_idx] = value
            else:
                results[timestamp][metric_idx] += value

    profiler.stop('metrics_proc')

    return (results, nodeset)
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command.

    The loop runs while the task default "USER_running" is set or, in
    interactive mode, until the user enters 'quit'. Each iteration reads
    one line with raw_input(), then either reports the task progress (if
    the task is still running) or interprets the line:

      +<nodes>   add <nodes> to the working node set
      -<nodes>   remove <nodes> from the working node set
      @<nodes>   replace the working node set by <nodes>
      =          toggle display.gather (gathered/standard output)
      ?...       only display the working node set again
      !<cmd>     call run_command() with None as node set
      quit       leave the loop
      <cmd>      call run_command() on the working node set

    task is accessed through its default()/set_default() settings and its
    buffers, return codes and gateways; nodeset initializes the working
    node set; timeout, display and remote are forwarded to run_command(),
    display being also used for all messages printed here.

    Returns rc, which is 0 on the normal exit path (it is only rebound in
    the KeyboardInterrupt handler, which always re-raises). None is
    returned on EOF, or on UpdatePromptException when not interactive.
    """
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        # readline is optional: without it the prompt still works
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD, \
            "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    # When True, the working node set is displayed before the next prompt
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET, \
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                # SIGUSR1 is ignored again as soon as the read is over
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                # NOTE: this loop rebinds the `nodeset` parameter
                for buf, nodeset in sorted(map(nodesetify,
                                               task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(VERB_STD, \
                            "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        # NOTE(review): this also rebinds the working node
                        # set `ns` used just below - confirm it is intended
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % (
                            nsdisp, rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(verbexit, \
                        "clush: %s: command timeout" % \
                            NodeSet._fromlist1(task.iter_keys_timeout()))
            # The interrupt is always propagated to the caller
            raise kbe

        if task.default("USER_running"):
            # Task still running: the line read above is not interpreted,
            # only a progress report is displayed
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg), ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(
                    VERB_QUIET,
                    "clush: in progress(%d): %s%s"
                    % (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(
                    VERB_QUIET,
                    "clush: in progress(%d): %s%s\n"
                    "clush: [tree] open gateways(%d): %s"
                    % (len(ns_reg), ns_reg, pending,
                       len(gws), NodeSet._fromlist1(gws)))
            # One additional line per gateway that still has targets
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(
                        VERB_QUIET,
                        "clush: [tree] in progress(%d) on %s: %s"
                        % (len(act_targets), gw, act_targets))
        else:
            # NOTE: the node set commands below work on the lower-cased
            # line, so node names given to +, - and @ are lower-cased
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(VERB_STD, \
                            "Switching to gathered output format")
                    else:
                        display.vprint(VERB_STD, \
                            "Switching to standard output format")
                    task.set_default("stdout_msgtree", \
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'): # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET, \
                    "clush: nodeset parse error (ignoring)")

            # The line was a node set command (or '?'): nothing to run,
            # the node set is displayed at the next prompt
            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command.

    The loop runs while the task default "USER_running" is set or, in
    interactive mode, until the user enters 'quit'. Each iteration reads
    one line with raw_input(), then either reports the task progress (if
    the task is still running) or interprets the line:

      +<nodes>   add <nodes> to the working node set
      -<nodes>   remove <nodes> from the working node set
      @<nodes>   replace the working node set by <nodes>
      =          toggle display.gather (gathered/standard output)
      ?...       only display the working node set again
      !<cmd>     call run_command() with None as node set
      quit       leave the loop
      <cmd>      call run_command() on the working node set

    task is accessed through its default()/set_default() settings and its
    buffers, return codes and gateways; nodeset initializes the working
    node set; timeout, display and remote are forwarded to run_command(),
    display being also used for all messages printed here.

    Returns rc, which is 0 on the normal exit path (it is only rebound in
    the KeyboardInterrupt handler, which always re-raises). None is
    returned on EOF, or on UpdatePromptException when not interactive.
    """
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        # readline is optional: without it the prompt still works
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD, \
            "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    # When True, the working node set is displayed before the next prompt
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET, \
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                # SIGUSR1 is ignored again as soon as the read is over
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                # NOTE: this loop rebinds the `nodeset` parameter
                for buf, nodeset in sorted(map(nodesetify,
                                               task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(VERB_STD, \
                            "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        # NOTE(review): this also rebinds the working node
                        # set `ns` used just below - confirm it is intended
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % (nsdisp,
                                                                         rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(verbexit, \
                        "clush: %s: command timeout" % \
                            NodeSet._fromlist1(task.iter_keys_timeout()))
            # The interrupt is always propagated to the caller
            raise kbe

        if task.default("USER_running"):
            # Task still running: the line read above is not interpreted,
            # only a progress report is displayed
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg), ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s"
                                   % (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s\n"
                                   "clush: [tree] open gateways(%d): %s"
                                   % (len(ns_reg), ns_reg, pending,
                                      len(gws), NodeSet._fromlist1(gws)))
            # One additional line per gateway that still has targets
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(VERB_QUIET,
                                       "clush: [tree] in progress(%d) on %s: %s"
                                       % (len(act_targets), gw, act_targets))
        else:
            # NOTE: the node set commands below work on the lower-cased
            # line, so node names given to +, - and @ are lower-cased
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(VERB_STD, \
                            "Switching to gathered output format")
                    else:
                        display.vprint(VERB_STD, \
                            "Switching to standard output format")
                    task.set_default("stdout_msgtree", \
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'): # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET, \
                    "clush: nodeset parse error (ignoring)")

            # The line was a node set command (or '?'): nothing to run,
            # the node set is displayed at the next prompt
            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
# Check if node file exist ipxe_file = os.path.join(pxe_nodes_path, '{node}.ipxe'.format(node=node)) if not os.path.exists(ipxe_file): logging.warning(bcolors.WARNING + 'File ' + ipxe_file + ' does not exist. Skipping.' + bcolors.ENDC) continue else: with open(ipxe_file, 'r') as f: ipxe_conf = f.read() # Search the default boot type in the ipxe file boot = re.search(r"^set menu-default boot(.+)", ipxe_conf, re.MULTILINE).group(1) if boot == 'disk': diskfull.update(node) elif boot == 'osdeploy': osdeploy.update(node) elif boot == 'diskless': # If diskless, group nodes per image image = re.search(r"^set node-image (.+)", ipxe_conf, re.MULTILINE).group(1) if image not in diskless: diskless[image] = NodeSet() diskless[image].update(node) # Display NodeSet per boot type if len(diskfull): print('Diskfull: {nodes}'.format(nodes=diskfull)) if len(osdeploy): print('Next boot deployment: {nodes}'.format(nodes=osdeploy))
# Check if node file exist ipxe_file = os.path.join(pxe_nodes_path, '{node}.ipxe'.format(node=node)) if not os.path.exists(ipxe_file): logging.warning(bcolors.WARNING + 'File ' + ipxe_file + ' does not exist. Skipping.' + bcolors.ENDC) continue else: with open(ipxe_file, 'r') as f: ipxe_conf = f.read() # Search the default boot type in the ipxe file boot = re.search(r"^set menu-default boot(.+)", ipxe_conf, re.MULTILINE).group(1) if boot == 'disk': diskfull.update(node) elif boot == 'next': bootnext.update(node) elif boot == 'osdeploy': osdeploy.update(node) elif boot == 'diskless': # If diskless, group nodes per image image = re.search(r"^set node-image (.+)", ipxe_conf, re.MULTILINE).group(1) if image not in diskless: diskless[image] = NodeSet() diskless[image].update(node) # Display NodeSet per boot type if len(diskfull): print('Diskfull: {nodes}'.format(nodes=diskfull))
def get_metrics_results(self, cluster, job, metrics, period):
    """Get the metrics of the job on the cluster for the period in
       parameters.

    It sends an HTTP request to InfluxDB service to download the metric
    values in JSON format and aggregates them over all nodes.

    :param cluster: cluster name, used as the `cluster` tag in the query
    :param job: job object, only its `jobid` attribute is read
    :param metrics: list of measurement names to request; its order is
                    the order of the values in each result row
    :param period: key of the `periods` mapping; used as the time window
                   of the query and to select the `group by` interval
    :returns: tuple (results, nodeset) where results maps every timestamp
              (as str) to a list holding one value per entry of
              `metrics`, and nodeset is the NodeSet of all the nodes
              found in the response
    :raises KeyError: if period is not in `periods`, or if the response
                      contains no 'results'/'series' entry
    :raises LookupError: if the service answers with HTTP status 404
    """
    time_group = periods[period]
    profiler = Profiler()

    metrics_s = "\"" + "\", \"".join(metrics) + "\""
    req = "select mean(value) from {metrics} " \
          "where time > now() - {period} " \
          "and cluster = '{cluster}' " \
          "and job = 'job_{job}' " \
          "group by time({time_group}), node fill(0)" \
          .format(metrics=metrics_s,
                  period=period,
                  cluster=cluster,
                  job=job.jobid,
                  time_group=time_group)

    profiler.meta('metrics_req', req)

    payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

    profiler.start('metrics_req')
    resp = requests.get(url=self.url, params=payload)
    profiler.stop('metrics_req')
    if resp.status_code == 404:
        raise LookupError("metrics not found for job {job} on cluster "
                          "{cluster}".format(job=job.jobid,
                                             cluster=cluster))

    profiler.start('metrics_proc')
    data = json.loads(resp.text)

    # data is a dict with 'results' key that is itself a list of dict with
    # 'series' key that is as well a list of dict, one dict per
    # metric/node association. Each dict has its own list of values. We
    # have to compute the sum of the values for all nodes at every
    # timestamp, for each metric.
    #
    # Ex:
    #
    # { "results": [
    #   { "series": [
    #     { "name": "cpu-system",
    #       "tags": {"node":"cn1"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then cpu-system for cn[2-3], cpu-user for cn[1-3] ...)
    #
    #     { "name": "cpus",
    #       "tags": {"node":"admin"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",6],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then memory-pss for cn[1-3] ...)
    #
    #   ]}
    # ]}

    series = data['results'][0]['series']

    results = {}
    nodeset = NodeSet()
    row_width = len(metrics)

    for serie in series:
        metric = serie['name']
        # Column of this metric in every result row
        column = metrics.index(metric)
        node = serie['tags']['node'].encode('utf-8')
        if node not in nodeset:
            nodeset.update(node)

        for pair in serie['values']:
            timestamp = str(pair[0])
            value = pair[1]
            if timestamp not in results:
                # First value seen for this timestamp: every other metric
                # starts at 0
                row = results[timestamp] = [0] * row_width
                row[column] = value
            elif metric in ['cpus', 'nodes']:
                # The cpus/nodes metrics can be produced by several batch
                # servers and thus returned multiple times by InfluxDB
                # server in the result of the request. We must take care to
                # not add the multiple results of this metric here!
                results[timestamp][column] = value
            else:
                results[timestamp][column] += value

    profiler.stop('metrics_proc')

    return (results, nodeset)
def get_metrics_results(self, cluster, job, metrics, period):
    """Get the metrics of the job on the cluster for the period in
       parameters.

    It sends an HTTP request to InfluxDB service to download the metric
    values in JSON format and aggregates them over all nodes.

    The requested period is overridden for short jobs: "1h" when the job
    lasted less than 3600 seconds, "6h" when it lasted less than 21600
    seconds.

    :param cluster: cluster name, used as the `cluster` tag in the query
    :param job: job object; its `jobid`, `nodeset`, `start_time` and
                `end_time` attributes are read
    :param metrics: list of measurement names to request; its order is
                    the order of the values in each result row
    :param period: key of the `periods` mapping; used as the time window
                   of the query and to select the `group by` interval
    :returns: tuple (results, nodeset) where results maps every timestamp
              (as str) to a list holding one value per entry of
              `metrics`, and nodeset is the NodeSet of all the nodes
              found in the response
    :raises KeyError: if period is not in `periods`, or if the response
                      has no 'results' entry
    :raises LookupError: if the service answers with HTTP status 404
    """
    timejob = job.end_time - job.start_time
    logger.debug("time job: %d", timejob)
    # elif closes the gap of the former test pair, which left a job of
    # exactly 3600 seconds with the caller's period
    if timejob < 3600:
        period = "1h"
    elif timejob < 21600:
        period = "6h"

    time_group = periods[period]
    profiler = Profiler()

    metrics_s = "\"" + "\", \"".join(metrics) + "\""
    req = "select mean(value) from {metrics} " \
          "where cluster = '{cluster}' " \
          "and (( job = 'job_{job}' and time > now() - {period} ) or" \
          " ( job = 'none' and plugin = 'cuda' and time >= {start_time}000000000 and time <= {end_time}000000000 and node = '{nodes}' )) " \
          "group by time({time_group}), node fill(0)" \
          .format(metrics=metrics_s,
                  period=period,
                  cluster=cluster,
                  job=job.jobid,
                  nodes=job.nodeset,
                  start_time=job.start_time,
                  end_time=job.end_time,
                  time_group=time_group)

    logger.debug("req influx: %s", req)

    profiler.meta('metrics_req', req)

    payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

    profiler.start('metrics_req')
    resp = requests.get(url=self.url, params=payload)
    profiler.stop('metrics_req')
    if resp.status_code == 404:
        raise LookupError("metrics not found for job {job} on cluster "
                          "{cluster}".format(job=job.jobid,
                                             cluster=cluster))

    profiler.start('metrics_proc')

    json_data = json.loads(resp.text)

    # data is a dict with 'results' key that is itself a list of dict with
    # 'series' key that is as well a list of dict, one dict per
    # metric/node association. Each dict has its own list of values. We
    # have to compute the sum of the values for all nodes at every
    # timestamp, for each metric.
    #
    # Ex:
    #
    # { "results": [
    #   { "series": [
    #     { "name": "cpu-system",
    #       "tags": {"node":"cn1"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then cpu-system for cn[2-3], cpu-user for cn[1-3] ...)
    #
    #     { "name": "cpus",
    #       "tags": {"node":"admin"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",6],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then memory-pss for cn[1-3] ...)
    #
    #   ]}
    # ]}

    results = {}
    nodeset = NodeSet()
    row_width = len(metrics)

    for result in json_data['results']:
        if 'series' not in result:
            logger.warning("No series in one result for query: %s", req)
            continue

        for serie in result['series']:
            metric = serie['name']
            # Column of this metric in every result row
            column = metrics.index(metric)
            node = serie['tags']['node'].encode('utf-8')
            if node not in nodeset:
                nodeset.update(node)

            for pair in serie['values']:
                timestamp = str(pair[0])
                value = pair[1]
                if timestamp not in results:
                    # init all values for timestamp to 0
                    results[timestamp] = [0] * row_width
                # The cpus/nodes metrics can be produced by several
                # batch servers and thus returned multiple times by
                # InfluxDB server in the result of the request. We
                # must take care to not add the multiple results of
                # this metric here!
                if metric in ['cpus', 'nodes']:
                    results[timestamp][column] = value
                else:
                    results[timestamp][column] += value

    profiler.stop('metrics_proc')

    return (results, nodeset)