Пример #1
0
    def get_available_nodes(self, slices_size=1):
        """ Returns a list of currently available nodes by slice of slices_size
        ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

        Runs ``sinfo -h -t IDLE``, gathers the sixth column of every output
        line into a single NodeSet and splits that NodeSet into
        ``len(nodeset) / slices_size`` sub-nodesets.

        Args:
            (int) slices_size: slices size

        Returns:
            (list of str) one nodeset string per slice. The list is empty
            when sinfo exits with a non-zero status or when there are fewer
            idle nodes than slices_size.
        """

        cmd = "sinfo -h -t IDLE"
        process = Popen(cmd,
                        cwd=os.getcwd(),
                        shell=True,
                        stdout=PIPE,
                        universal_newlines=True)

        # communicate() drains the pipe while waiting for the child; calling
        # wait() before reading can block forever once the pipe buffer fills.
        output, _ = process.communicate()
        if process.returncode:
            return []

        nodeset = NodeSet()
        for line in output.splitlines():
            fields = line.split()
            # The node list is the sixth column; ignore blank or short lines
            # instead of failing with an IndexError.
            if len(fields) < 6:
                continue
            nodeset.update(fields[5])

        split_c = int(len(nodeset) / slices_size)
        if split_c < 1:
            # Not enough idle nodes to build a single slice; NodeSet.split()
            # cannot be asked for zero sub-nodesets.
            return []

        return [str(ns) for ns in nodeset.split(split_c)]
Пример #2
0
     def get_available_nodes(self, slices_size=1):
          """ Returns a list of currently available nodes by slice of slices_size
          ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

          Nodes reported by pyslurm are visited in sorted name order. Each
          time slices_size nodes whose state is 'IDLE' have been collected,
          they are appended to the result as one nodeset string. Idle nodes
          left over after the last complete slice are not returned.

          :param slices_size: slices size
          :type slices_size: int
          :returns: list of nodes_id
          :rtype: list of str """

          node_list = []
          node_dict = pyslurm.node().get()
          node_count = 0
          nodeset = NodeSet()
          # items() exists on both Python 2 and 3 dicts, whereas iteritems()
          # is Python 2 only. An empty dict simply yields no iteration.
          for key, value in sorted(node_dict.items()):
               if value['state'] == 'IDLE':
                    nodeset.update(key)
                    node_count += 1
               if node_count == slices_size:
                    # A slice is complete: emit it and start a new one.
                    node_list.append(str(nodeset))
                    nodeset = NodeSet()
                    node_count = 0

          return node_list
Пример #3
0
    def get_available_nodes(self, slices_size=1):
        """ Returns a list of currently available nodes by slice of slices_size
        ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

        Runs ``sinfo -h -t IDLE`` through utils.run_cmd, gathers the sixth
        column of every output line into a single NodeSet and splits that
        NodeSet into ``len(nodeset) / slices_size`` sub-nodesets.

        Args:
            (int) slices_size: slices size

        Returns:
            (list of str) one nodeset string per slice. The list is empty
            when the command returns a non-zero code or when there are fewer
            idle nodes than slices_size.
        """

        cmd_str = "sinfo -h -t IDLE"
        ret_code, stdout, _ = utils.run_cmd(cmd_str, os.getcwd())

        if ret_code:
            print("!!Warning: unclebech was not able to get avaiable nodes")
            return []

        nodeset = NodeSet()

        # NOTE(review): assumes utils.run_cmd returns stdout as an iterable of
        # lines - confirm against its implementation.
        for line in stdout:
            fields = line.split()
            # The node list is the sixth column; ignore blank or short lines
            # instead of failing with an IndexError.
            if len(fields) < 6:
                continue
            nodeset.update(fields[5])

        split_c = int(len(nodeset) / slices_size)
        if split_c < 1:
            # Not enough idle nodes to build a single slice; NodeSet.split()
            # cannot be asked for zero sub-nodesets.
            return []

        return [str(ns) for ns in nodeset.split(split_c)]
    def get_available_nodes(self, slices_size=1):
        """ Returns a list of currently available nodes by slice of slices_size
        ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']

        The nodes returned by pyslurm are walked in sorted name order and
        those whose state is 'IDLE' are accumulated; every time slices_size
        of them have been gathered, the group is stored as one nodeset
        string. A trailing incomplete group is dropped.

        :param slices_size: slices size
        :type slices_size: int
        :returns: list of nodes_id
        :rtype: list of str
        """
        node_list = []
        node_dict = pyslurm.node().get()
        node_count = 0
        nodeset = NodeSet()
        # Iterating over the sorted keys avoids dict.iteritems(), which only
        # exists on Python 2; an empty dict simply yields no iteration.
        for key in sorted(node_dict):
            if node_dict[key]['state'] == 'IDLE':
                nodeset.update(key)
                node_count += 1
            if node_count == slices_size:
                # A slice is complete: store it and start a fresh one.
                node_list.append(str(nodeset))
                nodeset = NodeSet()
                node_count = 0

        return node_list
Пример #5
0
    def table_generate(self, root, topology):
        """Build the routing table used by the router.

        self.table becomes a list of (destinations, gateway) tuples, one per
        child group of the node group found for `root`: destinations is the
        union of children_ns() of that child group and of every group below
        it, and gateway is the child group's own nodeset.

        Raises RouteResolvingError when `root` cannot be found in `topology`.
        """
        try:
            root_group = topology.find_nodegroup(root)
        except TopologyError:
            raise RouteResolvingError(
                "Invalid root or gateway node: %s" % root)

        self.table = []
        for gateway_group in root_group.children():
            reachable = NodeSet()
            # Stack-based walk over the whole subtree below this gateway
            pending = [gateway_group]
            while pending:
                node_group = pending.pop()
                reachable.update(node_group.children_ns())
                pending.extend(node_group.children())
            self.table.append((reachable, gateway_group.nodeset))
Пример #6
0
     def get_truncated_nodes_lists(self, nnodes_list, nodes_id):
          """ Build one truncated nodeset string per requested node count.

          For each entry of nnodes_list, a NodeSet is built from nodes_id and
          its first `entry` nodes are kept.

          :param nnodes_list: ex [2,4]
          :type nnodes_list: list of int
          :param nodes_id: ex ['cn[100-104]','cn[50-84]']
          :type nodes_id: list of str
          :returns: truncated node list ex: ['cn[100-101]','cn[50-53]']
          :rtype: list of str
          :raises Exception: when a requested count exceeds the number of
                             nodes in nodes_id
          """
          truncated = []
          for wanted in nnodes_list:
               available = NodeSet()
               available.update(nodes_id)
               if len(available) < wanted:
                    raise Exception('Number of nodes is greater than the giver number of nodes id')
               # Keep only the first `wanted` nodes of the set
               head = available[:wanted]
               truncated.append(str(head))

          return truncated
    def get_truncated_nodes_lists(self, nnodes_list, nodes_id):
        """ Truncate the nodes of nodes_id to each node count of nnodes_list.

        Every count of nnodes_list yields one string: the first `count`
        nodes of the NodeSet built from nodes_id.

        :param nnodes_list: ex [2,4]
        :type nnodes_list: list of int
        :param nodes_id: ex ['cn[100-104]','cn[50-84]']
        :type nodes_id: list of str
        :returns: truncated node list ex: ['cn[100-101]','cn[50-53]']
        :rtype: list of str
        :raises Exception: if a count is larger than the number of nodes
                           available in nodes_id
        """
        error_msg = 'Number of nodes is greater than the giver number of nodes id'
        shortened = []
        for count in nnodes_list:
            pool = NodeSet()
            pool.update(nodes_id)
            pool_size = len(pool)
            if count > pool_size:
                raise Exception(error_msg)
            shortened.append(str(pool[:count]))

        return shortened
Пример #8
0
def set_test_environment(args):
    """Set up the test environment.

    Prepends the build's bin and sbin directories and /usr/sbin to PATH,
    selects a default network interface when OFI_INTERFACE is not set,
    defines the DAOS log environment variables, creates the log directory on
    every test host and makes sure PYTHONPATH contains the paths required
    for functional testing.

    Args:
        args (argparse.Namespace): command line arguments for this program;
            its test_clients and test_servers attributes are read.

    Returns:
        None

    Raises:
        SystemExit: if OFI_INTERFACE is not set and no active (up) network
            interface is found under /sys/class/net.

    """
    base_dir = get_build_environment()["PREFIX"]
    bin_dir = os.path.join(base_dir, "bin")
    sbin_dir = os.path.join(base_dir, "sbin")
    # /usr/sbin is not setup on non-root user for CI nodes.
    # SCM formatting tool mkfs.ext4 is located under
    # /usr/sbin directory.
    usr_sbin = os.path.sep + os.path.join("usr", "sbin")
    path = os.environ.get("PATH")

    # Get the default interface to use if OFI_INTERFACE is not set
    interface = os.environ.get("OFI_INTERFACE")
    if interface is None:
        # Find all the /sys/class/net interfaces on the launch node
        # (excluding lo)
        print("Detecting network devices - OFI_INTERFACE not set")
        available_interfaces = {}
        net_path = os.path.join(os.path.sep, "sys", "class", "net")
        net_list = [dev for dev in os.listdir(net_path) if dev != "lo"]
        for device in sorted(net_list):
            device_path = os.path.join(net_path, device)
            # Get the interface state - only include active (up) interfaces
            with open(os.path.join(device_path, "operstate"), "r") as fileh:
                state = fileh.read().strip()
            if state.lower() != "up":
                continue
            # Get the interface speed - used to select the fastest available
            with open(os.path.join(device_path, "speed"), "r") as fileh:
                try:
                    speed = int(fileh.read().strip())
                except IOError as ioerror:
                    # KVM/Qemu/libvirt returns an EINVAL
                    if ioerror.errno == errno.EINVAL:
                        speed = 1000
                    else:
                        raise
            print("  - {0:<5} (speed: {1:>6} state: {2})".format(
                device, speed, state))
            # Only include the first active interface for each speed - first
            # is determined by an alphabetic sort: ib0 will be checked
            # before ib1
            if speed not in available_interfaces:
                available_interfaces[speed] = device
        print("Available interfaces: {}".format(available_interfaces))
        if not available_interfaces:
            print("Error obtaining a default interface from: {}".format(
                os.listdir(net_path)))
            # SystemExit is raised directly: the exit() builtin is only
            # installed by the site module.
            raise SystemExit(1)
        # Select the fastest active interface available
        interface = available_interfaces[max(available_interfaces)]
    print("Using {} as the default interface".format(interface))

    # Update env definitions. PATH may be unset, in which case it must not
    # be passed to join().
    os.environ["PATH"] = ":".join(
        [item for item in (bin_dir, sbin_dir, usr_sbin, path) if item])
    os.environ["CRT_CTX_SHARE_ADDR"] = "1"
    os.environ["OFI_INTERFACE"] = os.environ.get("OFI_INTERFACE", interface)

    # Set the default location for daos log files written during testing if not
    # already defined.
    if "DAOS_TEST_LOG_DIR" not in os.environ:
        os.environ["DAOS_TEST_LOG_DIR"] = DEFAULT_DAOS_TEST_LOG_DIR
    os.environ["D_LOG_FILE"] = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                            "daos.log")

    # Ensure the daos log files directory exists on each possible test node
    test_hosts = NodeSet(socket.gethostname().split(".")[0])
    test_hosts.update(args.test_clients)
    test_hosts.update(args.test_servers)
    spawn_commands(test_hosts,
                   "mkdir -p {}".format(os.environ["DAOS_TEST_LOG_DIR"]))

    # Python paths required for functional testing. The directory name is
    # "python<major>" for Python 3 and "python<major>.<minor>" for Python 2.
    python_version = "python{}{}".format(
        version_info.major,
        "" if version_info.major > 2 else ".{}".format(version_info.minor))
    required_python_paths = [
        os.path.abspath("util/apricot"),
        os.path.abspath("util"),
        os.path.join(base_dir, "lib64", python_version, "site-packages"),
    ]

    # Check the PYTHONPATH env definition
    python_path = os.environ.get("PYTHONPATH")
    if not python_path:
        # Use the required paths to define the PYTHONPATH env if it is not set
        os.environ["PYTHONPATH"] = ":".join(required_python_paths)
    else:
        # Append any missing required paths to the existing PYTHONPATH env
        defined_python_paths = [
            os.path.abspath(os.path.expanduser(item))
            for item in python_path.split(":")
        ]
        for required_path in required_python_paths:
            if required_path not in defined_python_paths:
                python_path += ":" + required_path
        os.environ["PYTHONPATH"] = python_path
Пример #9
0
    def get_metrics_results(self, cluster, job, metrics, period):
        """Get the metrics of the job on the cluster for the period in parameters.

           It sends an HTTP request to InfluxDB service to download the metric
           values in JSON format and aggregates them per timestamp.

           Args:
               cluster: cluster name used in the query filter
               job: object whose jobid attribute identifies the job
               metrics: list of metric names; it also fixes the order of the
                   values in each result row
               period: key of the module-level `periods` mapping, also
                   inserted as is in the `now() - <period>` clause

           Returns:
               A (results, nodeset) tuple. results maps each timestamp (str)
               to a list holding one value per entry of `metrics`: the sum
               over all nodes, except for the 'cpus' and 'nodes' metrics
               whose last returned value is kept. nodeset is the NodeSet of
               all nodes found in the response. Both are empty when
               InfluxDB returns no series.

           Raises:
               LookupError: if the InfluxDB service answers with HTTP 404.
        """

        time_group = periods[period]

        profiler = Profiler()

        metrics_s = "\"" + "\", \"".join(metrics) + "\""
        req = "select mean(value) from {metrics} " \
              "where time > now() - {period} " \
              "and cluster = '{cluster}' " \
              "and job = 'job_{job}' " \
              "group by time({time_group}), node fill(0)" \
              .format(metrics=metrics_s,
                      period=period,
                      cluster=cluster,
                      job=job.jobid,
                      time_group=time_group)

        profiler.meta('metrics_req', req)

        payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

        profiler.start('metrics_req')
        resp = requests.get(url=self.url, params=payload)
        profiler.stop('metrics_req')
        if resp.status_code == 404:
            raise LookupError("metrics not found for job {job} on cluster "
                              "{cluster}"
                              .format(job=job.jobid,
                                      cluster=cluster))

        profiler.start('metrics_proc')
        data = json.loads(resp.text)

        # data is a dict with 'results' key that is itself a list of dict with
        # 'series' key that is as well a list of dict, one dict per
        # metric/node association. Each dict has its own list of values. We
        # have to compute the sum of the values for all nodes at every
        # timestamp, for each metric.
        #
        # Ex:
        #
        # { "results": [
        #   { "series": [
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn2"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-system for cn3 ...)
        #
        #     { "name": "cpu-user",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-user for cn[2-3] ...)
        #
        #     { "name": "cpus",
        #       "tags": {"node":"admin"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",6],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "memory-pss",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then memory-pss for cn[2-3] ...)
        #
        #   ]}
        # ]}
        #
        # The 'series' key is absent from the result when the query matched
        # no data, hence the default empty list.
        series = data['results'][0].get('series', [])

        results = {}
        nodeset = NodeSet()
        nb_metrics = len(metrics)

        for serie in series:
            metric = serie['name']
            node = serie['tags']['node'].encode('utf-8')
            # Position of this metric in every result row; computed once per
            # serie rather than once per value.
            metric_idx = metrics.index(metric)

            if node not in nodeset:
                nodeset.update(node)

            for pair in serie['values']:
                timestamp = str(pair[0])
                value = pair[1]
                if timestamp not in results:
                    # First value seen for this timestamp: all the other
                    # metrics start at 0.
                    row = [0] * nb_metrics
                    row[metric_idx] = value
                    results[timestamp] = row
                elif metric in ('cpus', 'nodes'):
                    # The cpus/nodes metrics can be produced by several batch
                    # servers and thus returned multiple times by InfluxDB
                    # server in the result of the request. We must take care to
                    # not add the multiple results of this metric here!
                    results[timestamp][metric_idx] = value
                else:
                    results[timestamp][metric_idx] += value

        profiler.stop('metrics_proc')
        return (results, nodeset)
Пример #10
0
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command.

    Lines are read from standard input for as long as the task default
    "USER_running" is set or, when "USER_interactive" is set, until the user
    enters 'quit'.

    When a line is read while "USER_running" is set, a progress report
    (registered and pending clients, open gateways and their active targets)
    is printed. Otherwise the line is interpreted as follows:

        +NODES   add NODES to the working nodeset
        -NODES   remove NODES from the working nodeset
        @NODES   replace the working nodeset by NODES
        =        toggle between gathered and standard output format
        ?...     only display the working nodeset
        !CMD     call run_command() for CMD with None as nodes
        quit     leave the loop
        other    call run_command() for the line on the working nodeset

    Args:
        task: task object providing defaults, buffers, return codes,
            gateways and engine clients
        nodeset: initial working nodes, passed to NodeSet()
        timeout: passed through to run_command()
        display: object used for all output (vprint, vprint_err,
            print_gather) and holding the gather, line_mode, maxrc and
            verbosity settings
        remote: passed through to run_command()

    Returns:
        0 when the loop ends normally; None on EOF, or when
        UpdatePromptException is caught while "USER_interactive" is not set.

    Raises:
        KeyboardInterrupt: always re-raised. When display.gather is set and
            the task is still running, the gathered output, the non-zero
            return codes and the timeouts are printed first, and the nodes
            of the working nodeset without return code are stored in the
            exception's uncompleted_nodes attribute.
    """
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD,
                       "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            # NOTE(review): signal_handler presumably raises
            # UpdatePromptException to interrupt raw_input() - confirm.
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET,
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                # SIGUSR1 is only handled while waiting for input
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                for buf, buf_nodes in sorted(map(nodesetify,
                                                 task.iter_buffers()),
                                             key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(
                            VERB_STD, "Warning: Caught keyboard interrupt!")
                    display.print_gather(buf_nodes, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for retcode, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if retcode != 0:
                        # Display return code if not ok ( != 0)
                        # A dedicated name is used so that the working
                        # nodeset `ns` is left untouched for the
                        # uncompleted nodes computation below.
                        ns_rc = NodeSet._fromlist1(nodelist)
                        nsdisp = ns_rc
                        if display.verbosity >= VERB_QUIET and len(ns_rc) > 1:
                            nsdisp = "%s (%d)" % (ns_rc, len(ns_rc))
                        msgrc = "clush: %s: exited with exit code %d" % (
                            nsdisp, retcode)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(
                        verbexit, "clush: %s: command timeout" %
                        NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            # A command is still running: print a progress report
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg),
                                                        ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(
                    VERB_QUIET, "clush: in progress(%d): %s%s" %
                    (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(
                    VERB_QUIET, "clush: in progress(%d): %s%s\n"
                    "clush: [tree] open gateways(%d): %s" %
                    (len(ns_reg), ns_reg, pending, len(gws),
                     NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(
                        VERB_QUIET, "clush: [tree] in progress(%d) on %s: %s" %
                        (len(act_targets), gw, act_targets))
        else:
            cmdl = cmd.lower()
            try:
                # ns_info stays True for the nodeset commands so that the
                # working nodeset is displayed at the next iteration
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(
                            VERB_STD, "Switching to gathered output format")
                    else:
                        display.vprint(
                            VERB_STD, "Switching to standard output format")
                    task.set_default("stdout_msgtree",
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'):  # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET,
                                   "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
Пример #11
0
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command.

    Input lines are read while the task default "USER_running" is set or,
    when "USER_interactive" is set, until 'quit' is entered.

    A line read while "USER_running" is set only triggers a progress report
    (registered and pending clients, open gateways and their active
    targets). Otherwise the line is handled as:

        +NODES   add NODES to the working nodeset
        -NODES   remove NODES from the working nodeset
        @NODES   replace the working nodeset by NODES
        =        toggle between gathered and standard output format
        ?...     only display the working nodeset
        !CMD     call run_command() for CMD with None as nodes
        quit     leave the loop
        other    call run_command() for the line on the working nodeset

    Args:
        task: task object providing defaults, buffers, return codes,
            gateways and engine clients
        nodeset: initial working nodes, passed to NodeSet()
        timeout: passed through to run_command()
        display: object used for all output (vprint, vprint_err,
            print_gather) and holding the gather, line_mode, maxrc and
            verbosity settings
        remote: passed through to run_command()

    Returns:
        0 when the loop ends normally; None on EOF, or when
        UpdatePromptException is caught while "USER_interactive" is not set.

    Raises:
        KeyboardInterrupt: always re-raised. When display.gather is set and
            the task is still running, the gathered output, the non-zero
            return codes and the timeouts are printed first, and the nodes
            of the working nodeset without return code are stored in the
            exception's uncompleted_nodes attribute.
    """
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD,
                       "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            # NOTE(review): signal_handler presumably raises
            # UpdatePromptException to interrupt raw_input() - confirm.
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET,
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                # SIGUSR1 is only handled while waiting for input
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                gathered = sorted(map(nodesetify, task.iter_buffers()),
                                  key=bufnodeset_cmpkey)
                for buf, buf_nodes in gathered:
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(
                            VERB_STD, "Warning: Caught keyboard interrupt!")
                    display.print_gather(buf_nodes, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for retcode, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if retcode != 0:
                        # Display return code if not ok ( != 0)
                        # The failing nodes get their own name: rebinding
                        # `ns` here would corrupt the uncompleted nodes
                        # computed from the working nodeset below.
                        ns_failed = NodeSet._fromlist1(nodelist)
                        nsdisp = ns_failed
                        if display.verbosity >= VERB_QUIET and \
                                len(ns_failed) > 1:
                            nsdisp = "%s (%d)" % (ns_failed, len(ns_failed))
                        msgrc = "clush: %s: exited with exit code %d" % (
                            nsdisp, retcode)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(
                        verbexit, "clush: %s: command timeout" %
                        NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            # A command is still running: print a progress report
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg), ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s"
                                   % (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s\n"
                                   "clush: [tree] open gateways(%d): %s"
                                   % (len(ns_reg), ns_reg, pending,
                                      len(gws), NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(VERB_QUIET,
                                       "clush: [tree] in progress(%d) on %s: %s"
                                       % (len(act_targets), gw, act_targets))
        else:
            cmdl = cmd.lower()
            try:
                # ns_info stays True for the nodeset commands so that the
                # working nodeset is displayed at the next iteration
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(
                            VERB_STD, "Switching to gathered output format")
                    else:
                        display.vprint(
                            VERB_STD, "Switching to standard output format")
                    task.set_default("stdout_msgtree",
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'):  # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET,
                                   "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
Пример #12
0
        # Check if node file exist
        ipxe_file = os.path.join(pxe_nodes_path,
                                 '{node}.ipxe'.format(node=node))
        if not os.path.exists(ipxe_file):
            logging.warning(bcolors.WARNING + 'File ' + ipxe_file +
                            ' does not exist. Skipping.' + bcolors.ENDC)
            continue
        else:
            with open(ipxe_file, 'r') as f:
                ipxe_conf = f.read()
                # Search the default boot type in the ipxe file
                boot = re.search(r"^set menu-default boot(.+)", ipxe_conf,
                                 re.MULTILINE).group(1)

                if boot == 'disk':
                    diskfull.update(node)
                elif boot == 'osdeploy':
                    osdeploy.update(node)
                elif boot == 'diskless':
                    # If diskless, group nodes per image
                    image = re.search(r"^set node-image (.+)", ipxe_conf,
                                      re.MULTILINE).group(1)
                    if image not in diskless:
                        diskless[image] = NodeSet()
                    diskless[image].update(node)

    # Display NodeSet per boot type
    if len(diskfull):
        print('Diskfull: {nodes}'.format(nodes=diskfull))
    if len(osdeploy):
        print('Next boot deployment: {nodes}'.format(nodes=osdeploy))
Пример #13
0
        # Check if node file exist
        ipxe_file = os.path.join(pxe_nodes_path,
                                 '{node}.ipxe'.format(node=node))
        if not os.path.exists(ipxe_file):
            logging.warning(bcolors.WARNING + 'File ' + ipxe_file +
                            ' does not exist. Skipping.' + bcolors.ENDC)
            continue
        else:
            with open(ipxe_file, 'r') as f:
                ipxe_conf = f.read()
                # Search the default boot type in the ipxe file
                boot = re.search(r"^set menu-default boot(.+)", ipxe_conf,
                                 re.MULTILINE).group(1)

                if boot == 'disk':
                    diskfull.update(node)
                elif boot == 'next':
                    bootnext.update(node)
                elif boot == 'osdeploy':
                    osdeploy.update(node)
                elif boot == 'diskless':
                    # If diskless, group nodes per image
                    image = re.search(r"^set node-image (.+)", ipxe_conf,
                                      re.MULTILINE).group(1)
                    if image not in diskless:
                        diskless[image] = NodeSet()
                    diskless[image].update(node)

    # Display NodeSet per boot type
    if len(diskfull):
        print('Diskfull: {nodes}'.format(nodes=diskfull))
Пример #14
0
    def get_metrics_results(self, cluster, job, metrics, period):
        """Get the metrics of the job on the cluster for the given period.

        Sends an HTTP request to the InfluxDB service to download the metric
        values in JSON format, then aggregates them over all nodes for every
        timestamp.

        Args:
            cluster: cluster name, matched against the ``cluster`` tag.
            job: job object; only its ``jobid`` attribute is read.
            metrics: list of metric names to query. The position of a metric
                in this list is its position in every value list of the
                result.
            period: key of the ``periods`` mapping defined outside this
                method; used as the look-back window of the query and to
                select the ``group by`` time interval.

        Returns:
            A ``(results, nodeset)`` tuple. ``results`` maps each timestamp
            (converted with ``str()``) to a list holding one aggregated value
            per entry of ``metrics``; ``nodeset`` is the NodeSet of all nodes
            found in the response. Both are empty when the response carries
            no series.

        Raises:
            LookupError: if the InfluxDB service answers with HTTP 404.
            KeyError: if ``period`` is not a key of ``periods``.
        """

        time_group = periods[period]

        profiler = Profiler()

        metrics_s = "\"" + "\", \"".join(metrics) + "\""
        req = "select mean(value) from {metrics} " \
              "where time > now() - {period} " \
              "and cluster = '{cluster}' " \
              "and job = 'job_{job}' " \
              "group by time({time_group}), node fill(0)" \
              .format(metrics=metrics_s,
                      period=period,
                      cluster=cluster,
                      job=job.jobid,
                      time_group=time_group)

        profiler.meta('metrics_req', req)

        payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

        profiler.start('metrics_req')
        resp = requests.get(url=self.url, params=payload)
        profiler.stop('metrics_req')
        if resp.status_code == 404:
            raise LookupError("metrics not found for job {job} on cluster "
                              "{cluster}".format(job=job.jobid,
                                                 cluster=cluster))

        profiler.start('metrics_proc')
        data = json.loads(resp.text)

        # data is a dict with a 'results' key that is itself a list of dicts
        # with a 'series' key that is as well a list of dicts, one dict per
        # metric/node association. Each dict has its own list of values. We
        # have to compute the sum of the values for all nodes at every
        # timestamp, for each metric.
        #
        # Ex:
        #
        # { "results": [
        #   { "series": [
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn2"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-system for cn3 ...)
        #
        #     { "name": "cpu-user",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-user for cn[2-3] ...)
        #
        #     { "name": "cpus",
        #       "tags": {"node":"admin"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",6],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "memory-pss",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then memory-pss for cn[2-3] ...)
        #
        #   ]}
        # ]}
        #
        # A result may come back without any 'series' key (nothing matched
        # the query); treat that as an empty list of series instead of
        # failing with KeyError.
        series = data['results'][0].get('series')
        if series is None:
            import logging
            logging.getLogger(__name__).warning(
                "No series in result for query: %s", req)
            series = []

        results = {}
        nodeset = NodeSet()

        for serie in series:
            metric = serie['name']
            node = serie['tags']['node'].encode('utf-8')

            if node not in nodeset:
                nodeset.update(node)

            # Slot of this metric in every per-timestamp value list; it is
            # the same for the whole serie, so look it up only once.
            metric_idx = metrics.index(metric)
            # The cpus/nodes metrics can be produced by several batch
            # servers and thus returned multiple times by InfluxDB server in
            # the result of the request. We must take care to not add the
            # multiple results of these metrics: they are overwritten.
            overwrite = metric in ('cpus', 'nodes')

            for pair in serie['values']:
                timestamp = str(pair[0])
                value = pair[1]
                if timestamp not in results:
                    # init all values for timestamp to 0
                    results[timestamp] = [0] * len(metrics)
                if overwrite:
                    results[timestamp][metric_idx] = value
                else:
                    results[timestamp][metric_idx] += value

        profiler.stop('metrics_proc')
        return (results, nodeset)
Пример #15
0
    def get_metrics_results(self, cluster, job, metrics, period):
        """Get the metrics of the job on the cluster for the given period.

        Sends an HTTP request to the InfluxDB service to download the metric
        values in JSON format, then aggregates them over all nodes for every
        timestamp.

        Args:
            cluster: cluster name, matched against the ``cluster`` tag.
            job: job object; its ``jobid``, ``nodeset``, ``start_time`` and
                ``end_time`` attributes are read.
            metrics: list of metric names to query. The position of a metric
                in this list is its position in every value list of the
                result.
            period: key of the ``periods`` mapping defined outside this
                method. It is overridden with "1h" for jobs shorter than
                3600 seconds and with "6h" for jobs shorter than 21600
                seconds.

        Returns:
            A ``(results, nodeset)`` tuple. ``results`` maps each timestamp
            (converted with ``str()``) to a list holding one aggregated value
            per entry of ``metrics``; ``nodeset`` is the NodeSet of all nodes
            found in the response. Both are empty when the response carries
            no series.

        Raises:
            LookupError: if the InfluxDB service answers with HTTP 404.
            KeyError: if the selected period is not a key of ``periods``.
        """
        timejob = job.end_time - job.start_time
        logger.debug("time job: %d", timejob)
        # Shrink the look-back window for short jobs. The chain is written
        # with elif so that a job lasting exactly 3600 seconds falls in the
        # "6h" window instead of silently keeping the caller's period.
        if timejob < 3600:
            period = "1h"
        elif timejob < 21600:
            period = "6h"

        time_group = periods[period]

        profiler = Profiler()

        # NOTE(review): start/end times get nine zeros appended in the query,
        # presumably to turn epoch seconds into nanoseconds -- confirm that
        # job.start_time/job.end_time are integer seconds.
        # NOTE(review): node = '{nodes}' is an equality test against the
        # formatted job.nodeset; a folded multi-node nodeset string would
        # presumably match no single node tag -- verify against the data.
        metrics_s = "\"" + "\", \"".join(metrics) + "\""
        req = "select mean(value) from {metrics} " \
              "where cluster = '{cluster}' " \
              "and (( job = 'job_{job}' and time > now() - {period} ) or" \
              " ( job = 'none' and plugin = 'cuda' and time >= {start_time}000000000 and time <= {end_time}000000000 and node = '{nodes}' )) " \
              "group by time({time_group}), node fill(0)" \
              .format(metrics=metrics_s,
                      period=period,
                      cluster=cluster,
                      job=job.jobid,
                      nodes=job.nodeset,
                      start_time=job.start_time,
                      end_time=job.end_time,
                      time_group=time_group)

        logger.debug("req influx: %s", req)
        profiler.meta('metrics_req', req)

        payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

        profiler.start('metrics_req')
        resp = requests.get(url=self.url, params=payload)
        profiler.stop('metrics_req')
        if resp.status_code == 404:
            raise LookupError("metrics not found for job {job} on cluster "
                              "{cluster}".format(job=job.jobid,
                                                 cluster=cluster))

        profiler.start('metrics_proc')

        json_data = json.loads(resp.text)

        # json_data is a dict with a 'results' key that is itself a list of
        # dicts with a 'series' key that is as well a list of dicts, one dict
        # per metric/node association. Each dict has its own list of values.
        # We have to compute the sum of the values for all nodes at every
        # timestamp, for each metric.
        #
        # Ex:
        #
        # { "results": [
        #   { "series": [
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn2"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-system for cn3 ...)
        #
        #     { "name": "cpu-user",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-user for cn[2-3] ...)
        #
        #     { "name": "cpus",
        #       "tags": {"node":"admin"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",6],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "memory-pss",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then memory-pss for cn[2-3] ...)
        #
        #   ]}
        # ]}

        results = {}
        nodeset = NodeSet()
        for result in json_data['results']:
            if 'series' in result:
                series = result['series']
            else:
                logger.warning("No series in one result for query: %s", req)
                series = []

            for serie in series:
                metric = serie['name']
                node = serie['tags']['node'].encode('utf-8')

                if node not in nodeset:
                    nodeset.update(node)

                # Slot of this metric in every per-timestamp value list; it
                # is the same for the whole serie, so look it up only once.
                metric_idx = metrics.index(metric)
                # The cpus/nodes metrics can be produced by several batch
                # servers and thus returned multiple times by InfluxDB
                # server in the result of the request. We must take care to
                # not add the multiple results of these metrics: they are
                # overwritten instead of summed.
                overwrite = metric in ('cpus', 'nodes')

                for pair in serie['values']:
                    timestamp = str(pair[0])
                    value = pair[1]
                    if timestamp not in results:
                        # init all values for timestamp to 0
                        results[timestamp] = [0] * len(metrics)
                    if overwrite:
                        results[timestamp][metric_idx] = value
                    else:
                        results[timestamp][metric_idx] += value

        profiler.stop('metrics_proc')
        return (results, nodeset)