Пример #1
0
    def testConfigurationLongSyntax(self):
        """test detailed topology description syntax"""
        # Full topology written as one bytes literal instead of line-by-line.
        topology = (
            b'# this is a comment\n'
            b'[routes]\n'
            b'admin: proxy\n'
            b'proxy: STA[0-1]\n'
            b'STA0: STB[0-1]\n'
            b'STB0: nodes[0-2]\n'
            b'STB1: nodes[3-5]\n'
            b'STA1: STB[2-3]\n'
            b'STB2: nodes[6-7]\n'
            b'STB3: nodes[8-10]\n'
        )
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(topology)
        tmpfile.flush()

        parser = TopologyParser()
        parser.load(tmpfile.name)

        tree = parser.tree('admin')
        self.assertEqual(tree.inner_node_count(), 8)
        self.assertEqual(tree.leaf_node_count(), 11)

        ns_expected = NodeSet('admin,proxy,STA[0-1],STB[0-3],nodes[0-10]')
        ns_seen = NodeSet()
        for nodegroup in tree:
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #2
0
    def testMultipleAdminGroups(self):
        """test topology with several admin groups"""
        ## -------------------
        # TODO : uncommenting following lines should not produce an error. This
        # is a valid topology!!
        # ----------
        routes = [
            b'[routes]\n',
            b'admin0: nodes[0-1]\n',
            # b'admin1: nodes[0-1]\n',
            b'admin2: nodes[2-3]\n',
            # b'admin3: nodes[2-3]\n',
            b'nodes[0-1]: nodes[10-19]\n',
            b'nodes[2-3]: nodes[20-29]\n',
        ]
        tmpfile = tempfile.NamedTemporaryFile()
        for route in routes:
            tmpfile.write(route)
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        tree = parser.tree('admin2')
        self.assertEqual(tree.inner_node_count(), 3)
        self.assertEqual(tree.leaf_node_count(), 10)

        ns_expected = NodeSet('admin2,nodes[2-3,20-29]')
        ns_seen = NodeSet()
        for nodegroup in tree:
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #3
0
    def print_action_results(self, action, error_only=False):
        '''Remove the current line and write grouped results of an action'''
        # Header line: colored action name, its parent's full name, duration.
        line = ['%s %s ran in %.2f s' % \
            (self.string_color(action.name, 'MAGENTA'),
             action.parent.fullname(),
             action.duration)]
        buffers = []
        retcodes = []
        timeout = NodeSet()
        # Local action
        # (current_node is None when the worker ran on the local host only)
        if action.worker.current_node is None:
            buffers = [(action.worker.read(), 'localhost')]
            if action.worker.did_timeout():
                timeout.add('localhost')
            # retcode() is None when the command timed out; skip it then.
            if action.worker.retcode() is not None:
                retcodes.append((action.worker.retcode(), 'localhost'))
        # Remote action
        else:
            # Iterables of (buffer, nodes) and (rc, nodes) pairs, plus the
            # set of nodes whose command timed out.
            buffers = action.worker.iter_buffers()
            retcodes = action.worker.iter_retcodes()
            timeout = NodeSet.fromlist(action.worker.iter_keys_timeout())

        line += self.__gen_action_output(buffers, retcodes, timeout,
                                         error_only)
        self.output("\n".join(line))
Пример #4
0
def check_file_exists(hosts, filename, user=None, directory=False):
    """Check if a specified file exists on each specified host.

    Args:
        hosts (list): list of hosts
        filename (str): file to check for the existence of on each host
        user (str, optional): when set, check ownership instead of plain
            existence. Defaults to None.
            NOTE(review): the *value* of ``user`` is never interpolated into
            the command -- ``test -O`` checks ownership by the effective user
            running the command, not by this named user; confirm intent.
        directory (bool, optional): when True, additionally (or solely)
            verify the path is a directory. Defaults to False.

    Returns:
        (bool, NodeSet): A tuple of:
            - True if the file exists on each of the hosts; False otherwise
            - A NodeSet of hosts on which the file does not exist

    """
    missing_file = NodeSet()
    command = "test -e {0}".format(filename)
    if user is not None and not directory:
        command = "test -O {0}".format(filename)
    elif user is not None and directory:
        command = "test -O {0} && test -d {0}".format(filename)
    elif directory:
        # NOTE(review): only this branch quotes the path; the other branches
        # would break on filenames containing spaces.
        command = "test -d '{0}'".format(filename)

    task = run_task(hosts, command)
    # Non-zero return codes identify the hosts where the check failed.
    for ret_code, node_list in task.iter_retcodes():
        if ret_code != 0:
            missing_file.add(NodeSet.fromlist(node_list))

    return len(missing_file) == 0, missing_file
Пример #5
0
    def testMultipleAdminGroups(self):
        """test topology with several admin groups"""
        ## -------------------
        # TODO : uncommenting following lines should not produce an error. This
        # is a valid topology!!
        # ----------
        topology = (
            b'[routes]\n'
            b'admin0: nodes[0-1]\n'
            # b'admin1: nodes[0-1]\n'
            b'admin2: nodes[2-3]\n'
            # b'admin3: nodes[2-3]\n'
            b'nodes[0-1]: nodes[10-19]\n'
            b'nodes[2-3]: nodes[20-29]\n'
        )
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(topology)
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        tree = parser.tree('admin2')
        self.assertEqual(tree.inner_node_count(), 3)
        self.assertEqual(tree.leaf_node_count(), 10)

        collected = NodeSet()
        for nodegroup in tree:
            collected.add(nodegroup.nodeset)
        expected = NodeSet('admin2,nodes[2-3,20-29]')
        self.assertEqual(str(expected), str(collected))
Пример #6
0
    def testConfigurationLongSyntax(self):
        """test detailed topology description syntax"""
        routes = [
            b'# this is a comment\n',
            b'[routes]\n',
            b'admin: proxy\n',
            b'proxy: STA[0-1]\n',
            b'STA0: STB[0-1]\n',
            b'STB0: nodes[0-2]\n',
            b'STB1: nodes[3-5]\n',
            b'STA1: STB[2-3]\n',
            b'STB2: nodes[6-7]\n',
            b'STB3: nodes[8-10]\n',
        ]
        tmpfile = tempfile.NamedTemporaryFile()
        for route in routes:
            tmpfile.write(route)
        tmpfile.flush()

        parser = TopologyParser()
        parser.load(tmpfile.name)

        tree = parser.tree('admin')
        self.assertEqual(tree.inner_node_count(), 8)
        self.assertEqual(tree.leaf_node_count(), 11)

        collected = NodeSet()
        for nodegroup in tree:
            collected.add(nodegroup.nodeset)
        expected = NodeSet('admin,proxy,STA[0-1],STB[0-3],nodes[0-10]')
        self.assertEqual(str(expected), str(collected))
Пример #7
0
    def start(self, args, ctrl):
        dependanceManager = dep.dep()
        nodes = NodeSet()
        depNode = NodeSet()

        nbNoeud = len(args) - 3
        #print'nbNoeud: %d'%nbNoeud
        for i in range(1, nbNoeud):
            node0.add(args[i])

        dependance = 1
        if os.path.isfile("cfg/" + args[nbNoeud + 1]):
            #verification de la dependance
            dependanceManager.toInstall("cfg/" + args[nbNoeud + 1])
            dependanceManager.toStart("cfg/" + args[nbNoeud + 1])

            #recuperation des dependances
            startNode = dependanceManager.getNodeStarted()
            startServices = dependanceManager.getStarted()
            installNode = dependanceManager.getNodeIs_install()
            installService = dependanceManager.getIs_install()

            #pour chaque noeud dependant

            for node in installNode:
                depNode.add(node)
                for service in installService:
                    task_self().run('sudo service ' + service + ' status',
                                    nodes=depNode)
                    ret = self.status([node, service, 'status'], 2)
                    depNode = NodeSet()
                    if ret == 0:
                        print 'Service : ' + service + ' sur : ' + node + ' status : non-installe'
                        dependance = 0
                    elif ret == 1:
                        print 'Service : ' + service + ' sur : ' + node + ' status : installe'

            #pour chaque noeud dependant
            for node in startNode:
                depNode.add(node)
                for service in startServices:
                    task_self().run('sudo service ' + service + ' status',
                                    nodes=depNode)
                    ret = self.status([node, service, 'status'], 1)
                    depNode = NodeSet()
                    if ret == 0:
                        print 'Service : ' + service + ' sur : ' + node + ' status : non-demarre'
                        dependance = 0
                    elif ret == 1:
                        print 'Service : ' + service + ' sur : ' + node + ' status : demarre'
        print dependance

        if dependance == 1:
            print 'dependance OK'
            print 'sudo service ' + args[nbNoeud + 1] + ' start'
            task_self().run('sudo service ' + args[nbNoeud + 1] + ' start',
                            nodes=nodes)
        else:
            print 'dependance KO'
Пример #8
0
    def stop(self, args):
        nodes = NodeSet()

        nbNoeud = len(args) - 2
        # print'nbNoeud: %d'%nbNoeud
        for i in range(1, nbNoeud + 1):
            nodes.add(args[i])
            print args[i] + ' : sudo service ' + args[nbNoeud + 1] + ' stop'

        task_self().run('sudo service ' + args[nbNoeud + 1] + ' stop',
                        nodes=nodes)
Пример #9
0
    def check_mount_state(self, nodes=None):
        """Check the dfuse mount point mounted state on the hosts.

        Args:
            nodes (NodeSet, optional): hosts on which to check if dfuse is
                mounted. Defaults to None, which will use all of the hosts.

        Returns:
            dict: a dictionary of NodeSets of hosts with the dfuse mount point
                either "mounted", "unmounted", or "nodirectory" (the mount
                directory does not exist and no dfuse entry is mounted there)

        """
        state = {
            "mounted": NodeSet(),
            "unmounted": NodeSet(),
            "nodirectory": NodeSet()
        }
        if not nodes:
            nodes = NodeSet.fromlist(self.hosts)
        check_mounted = NodeSet()

        # Detect which hosts have mount point directories defined
        command = "test -d {0} -a ! -L {0}".format(self.mount_dir.value)
        retcodes = pcmd(nodes, command, expect_rc=None)
        for retcode, hosts in list(retcodes.items()):
            for host in hosts:
                if retcode == 0:
                    check_mounted.add(host)
                else:
                    # No directory: it may still be an active dfuse mount;
                    # look for an entry in /proc/mounts.
                    # BUGFIX: use distinct names for the nested pcmd call so
                    # the 'retcodes'/'command' of the outer loop are not
                    # silently clobbered while still being iterated.
                    grep_command = "grep 'dfuse {}' /proc/mounts".format(
                        self.mount_dir.value)
                    grep_retcodes = pcmd([host], grep_command, expect_rc=None)
                    for ret_code, host_names in list(grep_retcodes.items()):
                        for node in host_names:
                            if ret_code == 0:
                                check_mounted.add(node)
                            else:
                                state["nodirectory"].add(node)

        if check_mounted:
            # Detect which hosts with mount point directories have it mounted as
            # a fuseblk device (grep -v inverts: rc 1 means 'fuseblk' found)
            command = "stat -c %T -f {0} | grep -v fuseblk".format(
                self.mount_dir.value)
            retcodes = pcmd(check_mounted, command, expect_rc=None)
            for retcode, hosts in list(retcodes.items()):
                for host in hosts:
                    if retcode == 1:
                        state["mounted"].add(host)
                    else:
                        state["unmounted"].add(host)

        return state
Пример #10
0
 def nodes_error(self):
     """Get nodeset of error nodes for this action."""
     failed = NodeSet()
     if not self.worker:
         # No worker ran for this action: nothing failed.
         return failed
     if isinstance(self.worker, WorkerPopen):
         # Local (popen) worker: single return code maps to 'localhost'.
         # A retcode of None means timeout and is deliberately not counted.
         if self.worker.retcode() not in (None, 0):
             failed = NodeSet("localhost")
     else:
         # Distributed worker: merge every nodeset with a non-zero rc.
         for rc, nds in self.worker.iter_retcodes():
             if rc != 0:
                 failed.add(nds)
     return failed
Пример #11
0
    def status(self, args, afficher):
        node0 = NodeSet()
        nbNoeud = len(args) - 2

        #print'nbNoeud: %d'%nbNoeud

        for i in range(1, nbNoeud + 1):
            node0.add(args[i])

        print 'sudo service ' + args[nbNoeud + 1] + ' status'
        task_self().run('sudo service ' + args[nbNoeud + 1] + ' status',
                        nodes=node0)

        return self.recevoir(afficher)
Пример #12
0
 def nodes_error(self):
     """Get nodeset of error nodes for this action."""
     if not self.worker:
         return NodeSet()
     if isinstance(self.worker, WorkerPopen):
         # Local worker: timeout is reported as retcode None and is not an
         # error here; any other non-zero code maps to 'localhost'.
         rc = self.worker.retcode()
         return NodeSet("localhost") if rc not in (None, 0) else NodeSet()
     # Distributed worker: merge every nodeset that returned non-zero.
     error_nodes = NodeSet()
     for rc, nds in self.worker.iter_retcodes():
         if rc != 0:
             error_nodes.add(nds)
     return error_nodes
Пример #13
0
    def run(self):
        """Drive the whole log-collection workflow.

        Sequence: print banner, configure ClusterShell task debugging, probe
        the source hosts, create a local directory per host, list matching
        log files, then sync/extract/analyze them and launch jmeter.
        """
        self.print_banner()

        self.logger.debug("\nSource Hosts : " + str(self.config.source_hosts) +
                          "\n")
        self.logger.debug("Target Host : " + str(self.config.target_host) +
                          "\n")

        def print_cs_debug(t, s):
            # Callback used by the ClusterShell task to route debug output.
            logging.debug("Task Debug : " + s)

        # Creating Task
        self.task.set_info('debug', True)
        self.task.set_info("print_debug", print_cs_debug)

        # Assigning shell command to task
        nodeset = NodeSet()
        for sourceHost in self.config.source_hosts:
            nodeset.add(sourceHost)

        self.task.run("/bin/uname -r",
                      nodes=nodeset,
                      handler=HostCheckHandler())

        # Getting pwd
        pwd_worker = self.task.shell("pwd")
        self.task.resume()
        pwd = pwd_worker.current_msg
        self.logger.info("\nPresent working directory : " + pwd + "\n")

        # Creating Directories if not present already
        # (one local directory per source host, named after the host)
        for node in nodeset:
            self.current_node = node
            mkdir_command = "mkdir " + str(node)
            self.task.run(mkdir_command, handler=MkdirHandler())

        # Getting local files
        # NOTE(review): the pattern is interpolated into a shell pipeline
        # unquoted; assumes patterns contain no shell metacharacters.
        for node in nodeset:
            self.current_node = node
            for logFilesNamePattern in self.config.log_files_name_patterns:
                command = "ls " + node + "/ | grep " + logFilesNamePattern + " | grep -v \'\.tar.gz\'"
                self.task.run(command, handler=LsLogsHandler())
        self.logger.info("\nLocal Files -> " + str(len(self.local_logs)) +
                         "\n")

        self.sync_files(nodeset, pwd)
        self.extract_zip_files(nodeset)
        self.extract_parameters_from_logs(nodeset)
        self.prepare_and_execute_jmeter()
Пример #14
0
def _report_unexec(a_model, execution):
    """
    Display the 'unexec' type of report

    One row per action present in the model but absent from the execution,
    with the count and set of its 'missing' dependencies (those in error or
    themselves unexecuted).
    """
    all_actions_set = set(a_model.actions.keys())
    all_actions_set_nb = len(all_actions_set)
    executed_actions_set = set(execution.executed_actions.keys())
    unexecuted_actions_set = all_actions_set.difference(executed_actions_set)
    unexecuted_actions_nb = len(unexecuted_actions_set)
    try:
        percentage = (float(unexecuted_actions_nb) / all_actions_set_nb) * 100
    except ZeroDivisionError:
        # Empty model: avoid division by zero.
        percentage = 0.0
    _LOGGER.output("\nUnexecuted Actions: %d (%2.1f %%)\t" + \
                       "Legend: mDeps=missings (error or unexecuted)" + \
                       " dependencies",
                   unexecuted_actions_nb, percentage)
    tab_values = []
    # Sort by len() first then alphabetically so:
    # b1, b2, b20, c1, c2, c10, c100 appears in that order
    # NOTE(review): the second sorted() below re-sorts purely alphabetically,
    # discarding the length ordering established here -- the final order is
    # plain lexicographic, not the order the comment above describes.
    sorted_list = sorted(unexecuted_actions_set, key = len)
    for id_ in sorted(sorted_list):
        action = a_model.actions[id_]
        all_deps = action.all_deps()
        all_deps_nb = len(all_deps)
        # Missing deps = unexecuted ones plus those that ended in error.
        unexec = set(all_deps) - set(execution.executed_actions.keys())
        error = set(all_deps) & set(execution.error_actions.keys())
        missings = unexec.union(error)
        nodeset = NodeSet()
        missing_nb = len(missings)
        for missing in missings:
            if len(missing) != 0:
                nodeset.add(missing)
        try:
            percentage = ((float(missing_nb) / all_deps_nb) * 100)
        except ZeroDivisionError:
            percentage = 0.0
        tab_values.append([id_, str(len(all_deps)),
                           str(missing_nb),
                           u"%2.1f" % percentage,
                           str(nodeset)])
    output = smart_display([u"Id", u"#Deps",
                            u"#mDeps", u"%mDeps",
                            u"mDeps"],
                           tab_values, vsep=u" | ",
                           justify=[str.center, str.center,
                                    str.center, str.center,
                                    str.ljust])
    _LOGGER.output(output)
Пример #15
0
def prepare_slurm_structure(topology, debug=None):
    """ reduce the topology to a table
    SwitchName=SwitchID Nodes=NodeSet(nodes)  # for leaf switches
    SwitchName=SwitchID Switches=NodeSet(switches)  # for spine switches

    'reduce' step

    topology maps node name -> info dict with 'node_type', 'label' and, for
    switches, 'switch_type' and 'ports' (a list of
    (local_port, remote_node, remote_port) tuples).
    When debug is truthy, each generated line is also echoed to stdout.

    return a list of topology.conf lines
    """
    slurm_structure = []
    for node in topology.keys():
        node_info = topology[node]
        node_type = node_info['node_type']
        label = node_info['label']
        if node_type == "switch":
            if node_info['switch_type'] == 'spine':
                # Spine: list the labels of the switches behind each port.
                # BUGFIX: the original unpacked '(port1, remote, port1)',
                # reusing 'port1' for both port fields; use distinct names
                # (behavior unchanged since the ports are unused here).
                switches = NodeSet()
                for (port1, remote, port2) in node_info['ports']:
                    switches.add(topology[remote]['label'])

                slurm_structure.append('SwitchName={} Switches={}\n'.format(
                    label, switches))
                if debug:
                    print('SwitchName={} Switches={}'.format(label, switches))

            elif node_info['switch_type'] == 'leaf':
                # Leaf: list the labels of the attached HCA (compute) nodes.
                nodes = NodeSet()
                for (port1, host, port2) in node_info['ports']:
                    if topology[host]['node_type'] == 'hca':
                        nodes.add(topology[host]['label'])

                if not nodes:  # if nodes is empty, we probably have a spine switch
                    slurm_structure.append(
                        'SwitchName={} Nodes="no nodes, probably a spine, please check"\n'
                        .format(label))
                else:
                    slurm_structure.append('SwitchName={} Nodes={}\n'.format(
                        label, nodes))

                if debug:
                    print('SwitchName={} Nodes={}'.format(label, nodes))

    return slurm_structure
Пример #16
0
    def testConfigurationShortSyntax(self):
        """test short topology specification syntax"""
        topology = ('# this is a comment\n'
                    '[routes]\n'
                    'admin: nodes[0-9]\n'
                    'nodes[0-3,5]: nodes[10-19]\n'
                    'nodes[4,6-9]: nodes[30-39]\n')
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(topology)
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_expected = NodeSet('admin,nodes[0-19,30-39]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #17
0
    def testConfigurationParserBigTree(self):
        """test configuration parser against big propagation tree"""
        routes = ['# this is a comment\n',
                  '[routes]\n',
                  'admin: ST[0-4]\n',
                  'ST[0-4]: STA[0-49]\n',
                  'STA[0-49]: nodes[0-10000]\n']
        tmpfile = tempfile.NamedTemporaryFile()
        for route in routes:
            tmpfile.write(route)
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_expected = NodeSet('admin,ST[0-4],STA[0-49],nodes[0-10000]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #18
0
    def testConfigurationParserCompatMain(self):
        """test configuration parsing (Main section compat)"""
        config = ('# this is a comment\n'
                  '[Main]\n'
                  'admin: nodes[0-1]\n'
                  'nodes[0-1]: nodes[2-5]\n'
                  'nodes[4-5]: nodes[6-9]\n')
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(config)
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        # First call exercises tree construction; its result is intentionally
        # discarded, mirroring the original test.
        parser.tree('admin')
        ns_expected = NodeSet('admin,nodes[0-9]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #19
0
    def testConfigurationParser(self):
        """test configuration parsing"""
        entries = ['# this is a comment\n',
                   '[Main]\n',
                   'admin: nodes[0-1]\n',
                   'nodes[0-1]: nodes[2-5]\n',
                   'nodes[4-5]: nodes[6-9]\n']
        tmpfile = tempfile.NamedTemporaryFile()
        for entry in entries:
            tmpfile.write(entry)
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        # First call exercises tree construction; result is discarded,
        # matching the original test.
        parser.tree('admin')
        ns_expected = NodeSet('admin,nodes[0-9]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #20
0
    def testConfigurationParserBigTree(self):
        """test configuration parser against big propagation tree"""
        config = ('# this is a comment\n'
                  '[Main]\n'
                  'admin: ST[0-4]\n'
                  'ST[0-4]: STA[0-49]\n'
                  'STA[0-49]: nodes[0-10000]\n')
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(config)
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_expected = NodeSet('admin,ST[0-4],STA[0-49],nodes[0-10000]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #21
0
    def testConfigurationShortSyntax(self):
        """test short topology specification syntax"""
        entries = ['# this is a comment\n',
                   '[Main]\n',
                   'admin: nodes[0-9]\n',
                   'nodes[0-3,5]: nodes[10-19]\n',
                   'nodes[4,6-9]: nodes[30-39]\n']
        tmpfile = tempfile.NamedTemporaryFile()
        for entry in entries:
            tmpfile.write(entry)
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_expected = NodeSet('admin,nodes[0-19,30-39]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #22
0
    def print_summary(self, actions, report='default'):
        """Print the errors summary of the array actions

        With report='full', also lists the failing target nodes and command
        of each failed action, plus the nodes where everything succeeded.
        """
        lines = []

        errors = 0
        others = 0
        to_spell = 'action'
        error_nodes = NodeSet()
        all_error_nodes = NodeSet()
        all_nodes = NodeSet()

        for ent in actions:
            # error_nodes is rebuilt per action; clear before reuse.
            error_nodes.clear()
            errs = NodeSet(ent.nodes_error())
            timeouts = NodeSet(ent.nodes_timeout())
            all_nodes.add(ent.target)

            if ent.status in (TIMEOUT, ERROR, DEP_ERROR):
                # Timed-out nodes count as errors for reporting purposes.
                error_nodes.add(errs)
                error_nodes.add(timeouts)
                lines.append(" + %s" % self.string_color(
                                                ent.longname().strip(), 'RED'))
                if report == 'full':
                    msg = "    %s: %s\n" % (self.string_color("Target",
                                                              'YELLOW'),
                                          error_nodes)
                    msg += "    %s: %s" % (self.string_color("Command",
                                                             'YELLOW'),
                                          ent.worker.command)
                    lines.append(msg)

                errors += 1
            elif ent.status not in (SKIPPED, LOCKED):
                # Ran but neither failed nor was skipped/locked.
                others += 1
            all_error_nodes.add(error_nodes)

        # manage 'action(s)' spelling
        if (errors + others) > 1:
            to_spell += 's'

        header = "\n %s - %s %s (%s failed)" % (
                       self.string_color('Summary'.upper(), 'MAGENTA'),
                       self.string_color('%d' % (errors + others), 'CYAN'),
                       to_spell,
                       self.string_color(errors, (errors and 'RED' or 'GREEN')))
        lines.insert(0, header)
        # Nodes never seen in any error set are fully successful.
        good_nodes = all_nodes - all_error_nodes
        if report == 'full' and good_nodes:
            lines.append(" + %s" % self.string_color('Success on all services',
                                                     'GREEN'))
            lines.append("    %s" % good_nodes)
        self.output("\n".join(lines), raw=True)
Пример #23
0
    def testConfigurationParserDeepTree(self):
        """test a configuration that generates a deep tree"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('# this is a comment\n')
        tmpfile.write('[Main]\n')
        tmpfile.write('admin: nodes[0-9]\n')

        depth = 15  # how deep do you want the tree to be?
        # Chain groups of ten: nodes[0-9] -> nodes[10-19] -> ...
        for base in xrange(0, depth * 10, 10):
            tmpfile.write('nodes[%d-%d]: nodes[%d-%d]\n'
                          % (base, base + 9, base + 10, base + 19))
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_expected = NodeSet('admin,nodes[0-159]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #24
0
    def testConfigurationParserDeepTree(self):
        """test a configuration that generates a deep tree"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('# this is a comment\n'
                      '[routes]\n'
                      'admin: nodes[0-9]\n')

        depth = 15  # how deep do you want the tree to be?
        # Chain groups of ten: nodes[0-9] -> nodes[10-19] -> ...
        for base in xrange(0, depth * 10, 10):
            tmpfile.write('nodes[%d-%d]: nodes[%d-%d]\n'
                          % (base, base + 9, base + 10, base + 19))
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_expected = NodeSet('admin,nodes[0-159]')
        ns_seen = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_seen.add(nodegroup.nodeset)
        self.assertEqual(str(ns_expected), str(ns_seen))
Пример #25
0
    def testNodeString(self):
        """test loading a linear string topology"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('[Main]\n')

        # TODO : increase the size
        ns = NodeSet('node[0-10]')

        # Chain one route per hop: admin -> node0 -> node1 -> ... -> node10.
        prev = 'admin'
        for n in ns:
            tmpfile.write('%s: %s\n' % (prev, str(n)))
            prev = n
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        tree = parser.tree('admin')

        ns.add('admin')
        ns_tree = NodeSet()
        for nodegroup in tree:
            ns_tree.add(nodegroup.nodeset)
        # BUGFIX: assertEquals is a deprecated alias (removed in
        # Python 3.12); use assertEqual.
        self.assertEqual(ns, ns_tree)
Пример #26
0
def _report_error(execution):
    """
    Display the 'error' type of report

    One row per failed action: its return code, the number of actions that
    depend on it (reverse dependencies), and the set of those actions.
    """
    actions_nb = len(execution.model.actions)
    error_actions = execution.error_actions.values()
    error_actions_nb = len(error_actions)
    try:
        percentage = (float(error_actions_nb) / actions_nb) * 100
    except ZeroDivisionError:
        # Empty model: avoid division by zero.
        percentage = 0.0

    _LOGGER.output("\nErrors: %d (%2.1f %%)\tLegend: " + \
                   "rDeps=reverse dependencies, RC=returned code",
                   error_actions_nb, percentage)
    tab_values = []
    # Sort by len() first then alphabetically so:
    # b1, b2, b20, c1, c2, c10, c100 appears in that order
    # NOTE(review): the second sorted() below re-orders purely by id,
    # discarding the length ordering -- the final order is plain
    # lexicographic, not the order the comment above describes.
    sorted_list = sorted(error_actions,
                         key=lambda error_action: len(error_action.id))
    for error_action in sorted(sorted_list,
                               key=lambda error_action: error_action.id):
        # NOTE(review): next() appears to be the action's reverse-dependency
        # accessor, not the iterator protocol -- confirm against the action
        # class.
        rdeps = error_action.next()
        rdeps_nb = len(rdeps)
        percentage = (float(rdeps_nb) / actions_nb) * 100
        nodeset = NodeSet()
        for rdep in error_action.next():
            if len(rdep) != 0:
                nodeset.add(rdep)
        tab_values.append([error_action.id, str(error_action.rc),
                           str(rdeps_nb), u"%2.1f" % percentage, str(nodeset)])
    output = smart_display([u"Id", u"RC",
                            u"#rDeps", u"%rDeps",
                            u"rDeps"],
                           tab_values, vsep=u" | ",
                           justify=[str.center, str.center,
                                    str.center, str.center,
                                    str.ljust])
    _LOGGER.output(output)
Пример #27
0
    def __init__(self, db, config, cluster_name):
        """Connect to the Slurm accounting (SlurmDBD) MySQL database and
        build the partition map for this cluster from its archfile.

        Raises:
            RuntimeError: if the MySQL connection cannot be established.
        """

        JobImporter.__init__(self, db, config, cluster_name)

        slurm_section = self._cluster_name + "/slurm"

        # SlurmDBD connection parameters from the '<cluster>/slurm' section.
        self._dbhost = config.get(slurm_section,"host")
        self._dbport = int(config.get(slurm_section,"port"))
        self._dbname = config.get(slurm_section,"name")
        self._dbuser = config.get(slurm_section,"user")
        self._dbpass = config.get(slurm_section,"password")
        try:
            self._conn = MySQLdb.connect( host = self._dbhost,
                                          user = self._dbuser,
                                          passwd = self._dbpass,
                                          db = self._dbname,
                                          port = self._dbport )
        except _mysql_exceptions.OperationalError as e:
            logging.error("connection to Slurm DBD MySQL failed: %s", e)
            raise RuntimeError
        # DictCursor: rows come back as dicts keyed by column name.
        self._cur = self._conn.cursor(MySQLdb.cursors.DictCursor)

        # get it from archfile
        self._partitions = {}
        archfile_section = self._cluster_name + "/archfile"
        archfile_name = config.get(archfile_section, "file")
        archfile = ConfigParser.ConfigParser()
        archfile.read(archfile_name)
        partitions_list = archfile.get(self._cluster_name,"partitions").split(',')
        for partition_name in partitions_list:
            partition_section_name = self._cluster_name + "/" + partition_name
            nodesets_list = archfile.get(partition_section_name, "nodesets").split(',')
            slurm_partitions_list = archfile.get(partition_section_name, "slurm_partitions").split(',')
            # Merge all nodesets of the partition into one NodeSet, and map
            # its string form to the list of slurm partitions it belongs to.
            ns_nodeset = NodeSet()
            for nodeset_name in nodesets_list:
                nodeset_section_name = self._cluster_name + "/" + partition_name + "/" + nodeset_name
                str_nodenames = archfile.get(nodeset_section_name, "names")
                ns_nodeset.add(str_nodenames)
            self._partitions[str(ns_nodeset)] = slurm_partitions_list
Пример #28
0
    def testNodeString(self):
        """test loading a linear string topology"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('[routes]\n')

        # TODO : increase the size
        ns = NodeSet('node[0-10]')

        # Write one route per hop: admin -> node0 -> node1 -> ... -> node10.
        prev = 'admin'
        for hop in ns:
            tmpfile.write('%s: %s\n' % (prev, str(hop)))
            prev = hop
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        tree = parser.tree('admin')

        ns.add('admin')
        collected = NodeSet()
        for nodegroup in tree:
            collected.add(nodegroup.nodeset)
        self.assertEquals(ns, collected)
Пример #29
0
    def print_summary(self, actions, report='default'):
        """Print the errors summary of the array actions

        With report='full', also prints the failing target nodes and command
        of each failed action, plus the nodes that succeeded everywhere.
        """
        lines = []

        errors = 0
        others = 0
        to_spell = 'action'
        error_nodes = NodeSet()
        all_error_nodes = NodeSet()
        all_nodes = NodeSet()

        for ent in actions:
            # error_nodes is rebuilt per action; clear before reuse.
            error_nodes.clear()
            errs = NodeSet(ent.nodes_error())
            timeouts = NodeSet(ent.nodes_timeout())
            all_nodes.add(ent.target)

            if ent.status in (TIMEOUT, ERROR, DEP_ERROR):
                # Timed-out nodes are counted alongside error nodes.
                error_nodes.add(errs)
                error_nodes.add(timeouts)
                lines.append(" + %s" %
                             self.string_color(ent.longname().strip(), 'RED'))
                if report == 'full':
                    msg = "    %s: %s\n" % (self.string_color(
                        "Target", 'YELLOW'), error_nodes)
                    msg += "    %s: %s" % (self.string_color(
                        "Command", 'YELLOW'), ent.worker.command)
                    lines.append(msg)

                errors += 1
            elif ent.status not in (SKIPPED, LOCKED):
                # Ran but neither failed nor was skipped/locked.
                others += 1
            all_error_nodes.add(error_nodes)

        # manage 'action(s)' spelling
        if (errors + others) > 1:
            to_spell += 's'

        header = "\n %s - %s %s (%s failed)" % (
            self.string_color('Summary'.upper(), 'MAGENTA'),
            self.string_color('%d' % (errors + others), 'CYAN'), to_spell,
            self.string_color(errors, (errors and 'RED' or 'GREEN')))
        lines.insert(0, header)
        # Nodes never seen in any error set are fully successful.
        good_nodes = all_nodes - all_error_nodes
        if report == 'full' and good_nodes:
            lines.append(" + %s" %
                         self.string_color('Success on all services', 'GREEN'))
            lines.append("    %s" % good_nodes)
        self.output("\n".join(lines), raw=True)
Пример #30
0
def _report_model(a_model):
    """
    Display the 'model' type of report.

    Prints one table row per action of the model: its id, its component
    set (prefixed with '@' when remote), the aggregated dependency
    nodeset and its description, followed by the average number of
    dependencies per action.
    """
    actions = a_model.actions.values()
    actions_nb = len(actions)
    _LOGGER.output("Actions in Model: %d\tLegend: @=remote, Deps=Dependencies",
                   actions_nb)

    def _natural_key(action):
        """Natural sort key: split the id into text/number chunks so that
        numeric parts compare as integers, giving the order
        b1, b2, b20, c1, c2, c10, c100."""
        import re
        return [int(part) if part.isdigit() else part
                for part in re.split(r'(\d+)', action.id)]

    tab_values = []
    deps_total_nb = 0
    # BUGFIX: the former code chained two full sorted() calls (by len()
    # then by id).  Python's sort being stable, the *last* sort dominates,
    # which yielded a plain alphabetical order (b1, b2, b20, c1, c10,
    # c100, c2) instead of the documented natural order.  A single
    # natural-sort key produces the intended ordering.
    for action in sorted(actions, key=_natural_key):
        nodeset = NodeSet()
        deps = action.all_deps()
        deps_total_nb += len(deps)
        for dep in deps:
            if len(dep) != 0:
                nodeset.add(dep)
        tab_values.append([action.id,
                           ("@" if action.remote else "")+action.component_set,
                           str(nodeset),
                           action.description])
    tab_values.append([HSEP, HSEP, HSEP, HSEP])
    try:
        average_deps = float(deps_total_nb) / actions_nb
    except ZeroDivisionError:
        # empty model: no actions, hence no dependencies
        average_deps = 0
    tab_values.append(["Average #Deps:", "-",
                       "%2.1f" % average_deps,
                       "-"])
    _LOGGER.output(smart_display([u"Id",
                                  u"[@]Component Set",
                                  u"Deps",
                                  u"Description"],
                                 tab_values, vsep=u" | ",
                                 left_align=[False, False, True, False]))
Пример #31
0
    def testMultipleAdminGroups(self):
        """test topology with several admin groups"""
        ## -------------------
        # TODO : uncommenting following lines should not produce an error. This
        # is a valid topology!!
        # ----------
        # NamedTemporaryFile defaults to binary mode ('w+b'): write bytes
        # literals so this test works on both Python 2 and Python 3.
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(b"[Main]\n")
        tmpfile.write(b"admin0: nodes[0-1]\n")
        # tmpfile.write(b'admin1: nodes[0-1]\n')
        tmpfile.write(b"admin2: nodes[2-3]\n")
        # tmpfile.write(b'admin3: nodes[2-3]\n')
        tmpfile.write(b"nodes[0-1]: nodes[10-19]\n")
        tmpfile.write(b"nodes[2-3]: nodes[20-29]\n")
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet("admin2,nodes[2-3,20-29]")
        ns_tree = NodeSet()
        for nodegroup in parser.tree("admin2"):
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Пример #32
0
    def testConfigurationLongSyntax(self):
        """test detailed topology description syntax"""
        # NamedTemporaryFile defaults to binary mode ('w+b'): write bytes
        # literals so this test works on both Python 2 and Python 3.
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(b"# this is a comment\n")
        tmpfile.write(b"[Main]\n")
        tmpfile.write(b"admin: proxy\n")
        tmpfile.write(b"proxy: STA[0-1]\n")
        tmpfile.write(b"STA0: STB[0-1]\n")
        tmpfile.write(b"STB0: nodes[0-2]\n")
        tmpfile.write(b"STB1: nodes[3-5]\n")
        tmpfile.write(b"STA1: STB[2-3]\n")
        tmpfile.write(b"STB2: nodes[6-7]\n")
        tmpfile.write(b"STB3: nodes[8-10]\n")

        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        # the tree walked from 'admin' must span every declared node
        ns_all = NodeSet("admin,proxy,STA[0-1],STB[0-3],nodes[0-10]")
        ns_tree = NodeSet()
        for nodegroup in parser.tree("admin"):
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Пример #33
0
    def print_action_results(self, action, error_only=False):
        '''Remove the current line and write grouped results of an action'''
        worker = action.worker
        header = '%s %s ran in %.2f s' % \
            (self.string_color(action.name, 'MAGENTA'),
             action.parent.fullname(),
             action.duration)
        output_lines = [header]

        if worker.current_node is None:
            # Local action: results come from a single 'localhost' pseudo-node
            buffers = [(worker.read(), 'localhost')]
            retcodes = []
            timeout = NodeSet()
            if worker.did_timeout():
                timeout.add('localhost')
            if worker.retcode() is not None:
                retcodes.append((worker.retcode(), 'localhost'))
        else:
            # Remote action: ask the worker for its grouped buffers/retcodes
            buffers = worker.iter_buffers()
            retcodes = worker.iter_retcodes()
            timeout = NodeSet.fromlist(worker.iter_keys_timeout())

        output_lines += self.__gen_action_output(buffers, retcodes, timeout,
                                                 error_only)
        self.output("\n".join(output_lines))
Пример #34
0
    def dispatch(self, dst):
        """dispatch nodes from a target nodeset to the directly
        connected gateways.

        The method acts as an iterator, returning a gateway and the
        associated hosts. It should provide a rather good load balancing
        between the gateways.
        """
        # Check for directly connected targets
        # (plain loop instead of a side-effect-only list comprehension)
        nexthop = NodeSet()
        for connected in self.table.values():
            nexthop.add(connected & dst)
        if len(nexthop) > 0:
            yield nexthop, nexthop

        # Check for remote targets, that require a gateway to be reached.
        # Iterating the dict directly yields its keys and avoids the
        # Python 2-only iterkeys() method.
        for network in self.table:
            dst_inter = network & dst
            # NOTE: dst is mutated in place (callers rely on this behaviour)
            dst.difference_update(dst_inter)
            for host in dst_inter.nsiter():
                yield self.next_hop(host), host
Пример #35
0
class BaseEntity(object):
    '''
    This class is abstract and shall not be instanciated.
    A BaseEntity object basically represents a node of graph with reference
    on parents and children.
    '''

    LOCAL_VARIABLES = {
        'NAME':    'name',
        'FANOUT':  'fanout',
        'TIMEOUT': 'timeout',
        'TARGET':  'target',
        'DESC':    'desc',
        'TAGS':    'tags',
    }

    def __init__(self, name, target=None, delay=0):
        # Entity name
        self.name = name

        # Each entity has a status which it state
        self.status = NO_STATUS

        # Description of an entity
        self.desc = None

        # Maximum window for parallelism. A None fanout means
        # that the task will be limited by the default value of
        # ClusterShell 64
        self.fanout = None

        # Nodes on which the entity is launched
        self._target = None
        self.target = target
        self._target_backup = self.target

        # Special mode which change entity behaviour
        # 'delegate' means manage targets but run localy.
        self.mode = None

        self.remote = True

        # Maximum error authorized for the entity.
        self.errors = 0

        # Error threshold before reaching the warning status
        # (should be <= self.errors)
        self.warnings = 0

        # Max time allowed to compute an entity, None means no timeout
        self.timeout = None

        # Delay to wait before launching an action
        self.delay = delay

        self.maxretry = 0

        self.failed_nodes = NodeSet()

        # Parent of the current object. Must be a subclass of BaseEntity
        self.parent = None

        # Parents dependencies (e.g A->B so B is the parent of A)
        self.parents = {}

        # Children dependencies (e.g A<-B) so A is a child of B)
        self.children = {}

        self.simulate = False

        # Agorithm's direction used
        # False : go in parent's direction
        # True : go in children direction
        self._algo_reversed = False

        # Tag the entity. By this way we know if the entity have to be
        # call by her dependencies
        self._tagged = False

        # Variables
        self.variables = {}

        # Tags the entity. The tags set define if the entity should run
        self.tags = set()

    def filter_nodes(self, nodes):
        """
        Add error nodes to skip list.

        Nodes in this list will not be used when launching actions.
        """
        self.failed_nodes.add(nodes)

    def add_var(self, varname, value):
        '''Add a new variable within the entity context'''
        if varname in self.LOCAL_VARIABLES:
            msg = "%s is a reserved variable name" % varname
            raise VariableAlreadyExistError(msg)
        elif varname in self.variables:
            raise VariableAlreadyExistError()
        else:
            self.variables[varname] = value

    def remove_var(self, varname):
        '''Remove an existing var from the entity'''
        if varname in self.variables:
            del self.variables[varname]

    def update_target(self, nodeset, mode=None):
        '''Update the attribute target of an entity'''
        assert nodeset is not None
        if not mode:
            self.target = NodeSet(nodeset)
        elif mode is 'DIF' and self.target:
            self.target.difference_update(nodeset)
        elif mode is 'INT' and self.target:
            self.target.intersection_update(nodeset)

    def _get_target(self):
        '''Return self._target'''
        return self._target

    def _set_target(self, value):
        '''Assign nodeset to _target'''
        self._target = None
        if value is not None:
            self._target = NodeSet(self._resolve(value))

    target = property(fset=_set_target, fget=_get_target)

    def reset(self):
        '''Reset values of attributes in order to perform multiple exec.'''
        self._tagged = False
        self.target = self._target_backup
        self.status = NO_STATUS
        self.failed_nodes = NodeSet()
        self.algo_reversed = False

    def search(self, name, reverse=False):
        '''
        Search an entity through the overall graph. This recursive algorithm
        stops as soon as the node searched is reached.
        '''
        target = None
        deps = self.parents
        if reverse:
            deps = self.children
        if name in deps:
            return deps[name].target
        else:    
            for dep in deps.values():
                target = dep.target.search(name, reverse)
                if target:
                    return target
        return target

    def add_dep(self, target, sgth=REQUIRE, parent=True):
        '''
        Add a dependency in both direction. This method allow the user to
        specify the dependency type. It is also possible to specify that
        the target is the parent or the child of the current entity.
        '''
        assert target, "target must not be None"
        if sgth in (CHECK, REQUIRE, REQUIRE_WEAK, FILTER):
            if parent:
                if target.name in self.parents:
                    raise DependencyAlreadyReferenced()
                else:
                    # This dependency is considered as a parent
                    self.parents[target.name] = Dependency(target, sgth, False)
                    target.children[self.name] = Dependency(self, sgth, False)
            else:
                if target.name in self.children:
                    raise DependencyAlreadyReferenced()
                else:
                    # This dependency is considered as a child
                    self.children[target.name] = Dependency(target, sgth, False)
                    target.parents[self.name] = Dependency(self, sgth, False)
        else:
            raise IllegalDependencyTypeError(sgth)

    def remove_dep(self, dep_name, parent=True):
        '''
        Remove a dependency on both side, in the current object and in the
        target object concerned by the dependency.
        '''
        assert dep_name, "Dependency specified must not be None"
        if parent and dep_name in self.parents:
            dep = self.parents[dep_name]
            del self.parents[dep_name]
            del dep.target.children[self.name]
        elif dep_name in self.children:
            dep = self.children[dep_name]
            del self.children[dep_name]
            del dep.target.parents[self.name]

    def clear_parent_deps(self):
        '''Remove all parent dependencies of an entity'''
        for dpname in self.parents.keys():
            self.remove_dep(dpname)

    def clear_child_deps(self):
        '''Remove all child dependencies of an entity'''
        for dpname in self.children.keys():
            self.remove_dep(dep_name=dpname, parent=False)

    def has_child_dep(self, dep_name=None):
        '''
        Determine whether the current object has a child dependency called
        dep_name.
        '''
        return dep_name in self.children

    def has_parent_dep(self, dep_name=None):
        '''
        Determine whether the current object has a parent dependency called
        dep_name
        '''
        return dep_name in self.parents

    def clear_deps(self):
        '''Clear parent/child dependencies.'''
        self.parents.clear()
        self.children.clear()

    def deps(self):
        """
        Return parent dependency list.

        Return children deps as parent if algo is reversed.
        """
        if self._algo_reversed:
            return self.children
        else:
            return self.parents

    def is_ready(self):
        '''
        Determine if the current services has to wait before to
        start due to unterminated dependencies.
        '''
        for dep in self.deps().values():
            if dep.target.status in (NO_STATUS, WAITING_STATUS):
                return False
        return True

    def match_tags(self, tags):
        """
        Check if at least one provided tag matches entity tags.

        Return True if both lists are empty.
        """
        if not self.tags and not tags:
            return True
        else:
            assert type(tags) is set
            return bool(self.tags & tags)

    def search_deps(self, symbols=None):
        '''
        Look for parent/child dependencies matching to the symbols. The
        search direction depends on the direction specified for the entiy.
        '''
        # No selection criteria, return everything
        if not symbols:
            return self.deps().values()

        # Else, only keep matching deps
        else:
            dep_list = self.deps().values()
            return [dep for dep in dep_list if dep.target.status in symbols]

    def graph_info(self):
        """ Return a tuple to manage dependencies output """
        return (self.fullname(), None)

    def graph(self, excluded=None):
        """ Generate a graph of dependencies"""
        grph = ""
        # If the entity has a no dependency we just return the entity fullname
        if not self.deps().values():
            grph += '"%s";\n' % self.fullname()
        else:
            for dep in self.deps().values():
                if not dep.target.excluded(excluded):
                    if not dep.target.simulate:
                        grph += dep.graph(self)
                    else:
                        grph += '"%s";\n' % self.fullname()
        return grph

    def excluded(self, excluded=None):
        """Is the entity ecluded recusively"""
        if not excluded:
            return False
        if not self.deps().values():
            return self.fullname() in excluded

        # FIXME: Better loop detection
        if self.search(self.name):
            return True

        for dep in self.deps().values():
            if dep.target.excluded(excluded):
                return True

        return self.fullname() in excluded

    def eval_deps_status(self):
        '''
        Evaluate the result of the dependencies in order to establish
        a status.
        '''
        if len(self.deps()):
            order = lambda dep: DEP_ORDER[dep.status()]
            sorted_deps = sorted(self.deps().values(), key=order)
            return sorted_deps[-1].status()
        else:
            return MISSING

    def set_algo_reversed(self, flag):
        '''Assign the right values for the property algo_reversed'''
        self._algo_reversed = flag

    algo_reversed = property(fset=set_algo_reversed)

    def longname(self):
        '''Return entity fullname and descrition if available '''
        label = self.fullname()
        if self.desc:
            label += " - %s" % self.desc
        return label

    def fullname(self):
        '''Return the fullname of the current entity'''
        names = []
        if self.parent and self.parent.fullname():
            names.append(self.parent.fullname())
        names.append(self.name)
        return '.'.join(names)

    def _lookup_variable(self, varname):
        '''
        Return the value of the specified variable name.

        If is not found in current object, it searches recursively in the
        parent object.
        If it cannot solve the variable name, it raises UndefinedVariableError.
        '''
        if varname in self.variables:
            return self.variables[varname]
        elif varname.upper() in self.LOCAL_VARIABLES:
            value = self.LOCAL_VARIABLES[varname.upper()]
            return self.resolve_property(value)
        elif self.parent:
            return self.parent._lookup_variable(varname)
        else:
            raise UndefinedVariableError(varname)

    def _substitute(self, template):
        """Substitute %xxx patterns from the provided template."""
        delimiter = '%'
        pattern = r"""
          %(delim)s(?:
            (?P<escaped>%(delim)s) | # Escape sequence of two delimiters
            (?P<named>%(id)s)      | # delimiter and a Python identifier
            {(?P<braced>%(id)s)}   | # delimiter and a braced identifier
            \((?P<parenth>.+?)\)   | # delimiter and parenthesis
            (?P<invalid>)            # Other ill-formed delimiter exprs
          )""" % {
                'delim' : delimiter,
                'id' : r'[_a-z][_a-z0-9]*',
            }
        pattern = re.compile(pattern, re.IGNORECASE | re.VERBOSE)

        # Command substitution
        def _cmd_repl(raw):
            '''Replace a command execution pattern by its result.'''
            logger = logging.getLogger('milkcheck')
            cmd = Popen(raw, stdout=PIPE, stderr=PIPE, shell=True)
            stdout = cmd.communicate()[0]
            logger.debug("External command exited with %d: '%s'" %
                         (cmd.returncode, stdout))
            if cmd.returncode >= 126:
                raise InvalidVariableError(raw)
            return stdout.rstrip('\n')

        def _invalid(mobj, template):
            '''Helper to raise a detail error message'''
            i = mobj.start('invalid')
            lines = template[:i].splitlines(True)
            # With the current regexp, it is impossible that lines is empty.
            assert lines, "invalid pattern as the begining of template"
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
            raise ValueError('Invalid placeholder in string: line %d, col %d' %
                             (lineno, colno))

        def _convert(mobj):
            """Helper function for .sub()"""
            # Check the mobjst commobjn path first.
            named = mobj.group('named') or mobj.group('braced')
            if named is not None:
                val = str(self._lookup_variable(named))
                return self._resolve(val)
            if mobj.group('escaped') is not None:
                return delimiter
            if mobj.group('parenth') is not None:
                val = self._resolve(mobj.group('parenth'))
                return _cmd_repl(val)
            if mobj.group('invalid') is not None:
                _invalid(mobj, template)
            raise ValueError('Unrecognized named group in pattern', pattern)

        # Check if content is only a variable pattern
        mobj = re.match(pattern, template)
        name = mobj and (mobj.group('named') or mobj.group('braced'))
        if name is not None and template == mobj.group(0):
            # In this case, simply replace it by variable content
            # (useful for list and dict)
            return self._resolve(self._lookup_variable(name))
        else:
            return pattern.sub(_convert, template)

    def _resolve(self, value):
        '''
        This method takes a string containing symbols. Those strings may
        look like to : 
            + %(nodeset -f epsilon[5-8] -x epsilon7)
            + %CMD echo %(nodeset -f epsilon[5-8])
            + ps -e | grep myprogram
        After computation this method return a string with all the symbols
        resolved.
        The '%' character could be inserted using '%%'.
        '''
        # For compat: if provided value is not a str, we should not convert
        # it to a str if nothing matches.
        if type(value) is not str:
            return value

        # Replace all %xxx patterns
        origvalue = value
        value = self._substitute(value)

        # Debugging
        if origvalue != value:
            logger = logging.getLogger('milkcheck')
            logger.info("Variable content '%s' replaced by '%s'",
                        origvalue, value)

        return value

    def resolve_property(self, prop):
        '''
        Resolve the variables contained within the property. It proceeds by
        looking for the values required to replace the symbols. This method
        returns None whether the property does not exist.
        '''
        pvalue = None
        if hasattr(self, prop):
            pvalue = self._resolve(getattr(self, prop))
        return pvalue

    def inherits_from(self, entity):
        '''Inheritance of properties between entities'''

        # Beware to check the default value of all of theses properties.
        # Some of theses have a two possible 'false' value (None or '').
        # * The init value should always be None
        # * '' is set by the user
        if self.fanout is None:
            self.fanout = entity.fanout
        self.errors = self.errors or entity.errors
        self.warnings = self.warnings or entity.warnings
        if self.timeout is None:
            self.timeout = entity.timeout
        if self.target is None:
            self.target = entity.target
        self.mode = self.mode or entity.mode
        self.remote = self.remote and entity.remote
        if self.desc is None:
            self.desc = entity.desc
        self.delay = self.delay or entity.delay
        self.maxretry = self.maxretry or entity.maxretry
        self.tags = self.tags or entity.tags

    def fromdict(self, entdict):
        """Populate entity attributes from dict."""
        for item, prop in entdict.items():
            if item == 'target':
                self.target = prop
                self._target_backup = prop
            elif item == 'mode':
                self.mode = prop
            elif item == 'remote':
                self.remote = prop
            elif item == 'fanout':
                self.fanout = prop
            elif item == 'timeout':
                self.timeout = prop
            elif item == 'delay':
                self.delay = prop
            elif item == 'retry':
                self.maxretry = prop
            elif item == 'errors':
                self.errors = prop
            elif item == 'warnings':
                self.warnings = prop
            elif item == 'desc':
                self.desc = prop
            elif item == 'tags':
                self.tags = set(prop)
            elif item == 'variables':
                for varname, value in prop.items():
                    self.add_var(varname, value)

    def resolve_all(self):
        """Resolve all properties from the entity"""
        # Resolve local variables first.
        # Ensure they are computed only once and not each time they are used.
        for name, value in self.variables.items():
            self.variables[name] = self._resolve(value)

        # Resolve properties
        properties = ['fanout', 'maxretry', 'errors', 'warnings', 'timeout',
                      'delay', 'target', '_target_backup', 'mode', 'desc']
        for item in properties:
            setattr(self, item, self._resolve(getattr(self, item)))
            if item == 'target':
                self._target_backup = self.resolve_property('target')
Пример #36
0
class TopologyRoutingTable(object):
    """This class provides a convenient way to store and manage topology
    routes
    """
    def __init__(self):
        """Initialize a new TopologyRoutingTable instance."""
        self._routes = []
        self.aggregated_src = NodeSet()
        self.aggregated_dst = NodeSet()

    def add_route(self, route):
        """add a new route to the table. The route argument is expected to be a
        TopologyRoute instance
        """
        # refuse routes that would make the topology inconsistent
        if self._introduce_circular_reference(route):
            raise TopologyError('Loop detected! Cannot add route %s' %
                                str(route))
        if self._introduce_convergent_paths(route):
            raise TopologyError(
                'Convergent path detected! Cannot add route %s' % str(route))

        self._routes.append(route)
        self.aggregated_src.add(route.src)
        self.aggregated_dst.add(route.dst)

    def connected(self, src_ns):
        """find out and return the aggregation of directly connected children
        from src_ns.
        Argument src_ns is expected to be a NodeSet instance. Result is returned
        as a NodeSet instance
        """
        hops = NodeSet()
        for route in self._routes:
            route_dst = route.dest(src_ns)
            if route_dst is not None:
                hops.add(route_dst)
        if len(hops) == 0:
            return None
        return hops

    def __str__(self):
        """printable representation"""
        return '\n'.join(str(route) for route in self._routes)

    def __iter__(self):
        """return an iterator over the list of routes"""
        return iter(self._routes)

    def _introduce_circular_reference(self, route):
        """check whether the last added route adds a topology loop or not"""
        # follow the destinations hop by hop until we either fall off the
        # graph (no loop) or come back onto the route's source (loop)
        hop = route.dst
        while True:
            hop = self.connected(hop)
            if hop is None or len(hop) == 0:
                return False
            if len(hop & route.src) != 0:
                return True

    def _introduce_convergent_paths(self, route):
        """check for undesired convergent paths"""
        for known_route in self._routes:
            # source cannot be a superset of an already known destination
            src_covers_known_dst = route.src > known_route.dst
            # same thing the other way around
            dst_within_known_src = route.dst < known_route.src
            # two different nodegroups cannot point to the same one
            shared_destination = (len(route.dst & known_route.dst) != 0
                                  and route.src != known_route.src)
            if src_covers_known_dst or dst_within_known_src \
               or shared_destination:
                return True
        return False
Пример #37
0
class PdshClient(ExecClient):
    """EngineClient which run 'pdsh'"""

    MODE = 'pdsh'

    def __init__(self,
                 node,
                 command,
                 worker,
                 stderr,
                 timeout,
                 autoclose=False,
                 rank=None):
        """Initialize a pdsh-based EngineClient.

        All parameters are forwarded untouched to ExecClient.__init__().
        _closed_nodes tracks the nodes whose exit status has already been
        reported through pdsh output, so _close() can flag the remaining
        ones as timed out or successful.
        """
        ExecClient.__init__(self, node, command, worker, stderr, timeout,
                            autoclose, rank)
        self._closed_nodes = NodeSet()

    def _build_cmd(self):
        """
        Build the shell command line used to launch pdsh.

        Return a tuple (argument list, environment dict) suitable for
        process spawning.
        """
        task = self.worker.task
        env = {}

        # Start from the pdsh executable (configurable through task info)
        # with batch mode enabled
        cmdline = [task.info("pdsh_path") or "pdsh", "-b"]

        fanout = task.info("fanout", 0)
        if fanout > 0:
            cmdline.append("-f %d" % fanout)

        # Pdsh flag '-t' do not really works well. Better to use
        # PDSH_SSH_ARGS_APPEND variable to transmit ssh ConnectTimeout
        # flag.
        connect_timeout = task.info("connect_timeout", 0)
        if connect_timeout > 0:
            env['PDSH_SSH_ARGS_APPEND'] = ("-o ConnectTimeout=%d"
                                           % connect_timeout)

        command_timeout = task.info("command_timeout", 0)
        if command_timeout > 0:
            cmdline.append("-u %d" % command_timeout)

        # target nodes and the command itself come last
        cmdline.append("-w %s" % self.key)
        cmdline.append("%s" % self.command)

        return (cmdline, env)

    def _close(self, abort, timeout):
        """Close client. See EngineClient._close().

        On abort, kill the pdsh process if it is still running, then reap
        it.  Every node whose exit status was never reported by pdsh
        output (i.e. not in self._closed_nodes) is then flagged either as
        timed out or as successful (rc 0) depending on the timeout flag.
        """
        if abort:
            # it's safer to call poll() first for long time completed processes
            prc = self.popen.poll()
            # if prc is None, process is still running
            if prc is None:
                try:  # try to kill it
                    self.popen.kill()
                except OSError:
                    pass
        prc = self.popen.wait()

        # a positive exit code means pdsh itself failed to run
        if prc > 0:
            raise WorkerError("Cannot run pdsh (error %d)" % prc)

        self.streams.clear()

        if timeout:
            assert abort, "abort flag not set on timeout"
            for node in (self.key - self._closed_nodes):
                self.worker._on_node_timeout(node)
        else:
            # nodes with no explicit exit status are considered successful
            for node in (self.key - self._closed_nodes):
                self.worker._on_node_rc(node, 0)

        self.worker._check_fini()

    def _parse_line(self, line, fname):
        """
        Parse Pdsh line syntax.
        """
        if line.startswith("pdsh@") or \
           line.startswith("pdcp@") or \
           line.startswith("sending "):
            try:
                # pdsh@cors113: cors115: ssh exited with exit code 1
                #       0          1      2     3     4    5    6  7
                # corsUNKN: ssh: corsUNKN: Name or service not known
                #     0      1       2       3  4     5     6    7
                # pdsh@fortoy0: fortoy101: command timeout
                #     0             1         2       3
                # sending SIGTERM to ssh fortoy112 pid 32014
                #     0      1     2  3      4      5    6
                # pdcp@cors113: corsUNKN: ssh exited with exit code 255
                #     0             1      2    3     4    5    6    7
                # pdcp@cors113: cors115: fatal: /var/cache/shine/...
                #     0             1      2                   3...

                words = line.split()
                # Set return code for nodename of worker
                if self.MODE == 'pdsh':
                    if len(words) == 4 and words[2] == "command" and \
                       words[3] == "timeout":
                        pass
                    elif len(words) == 8 and words[3] == "exited" and \
                         words[7].isdigit():
                        self._closed_nodes.add(words[1][:-1])
                        self.worker._on_node_rc(words[1][:-1], int(words[7]))
                elif self.MODE == 'pdcp':
                    self._closed_nodes.add(words[1][:-1])
                    self.worker._on_node_rc(words[1][:-1], errno.ENOENT)

            except Exception, exc:
                print >> sys.stderr, exc
                raise EngineClientError()
        else:
Пример #38
0
class Action(BaseEntity):
    '''
    This class models an action. An action is generally hooked to a service
    and contains the code and parameters to execute commands over one or
    several nodes of a cluster. An action might have dependencies with other
    actions.
    '''

    LOCAL_VARIABLES = BaseEntity.LOCAL_VARIABLES.copy()
    LOCAL_VARIABLES['ACTION'] = 'name'

    def __init__(self, name, target=None, command=None, timeout=-1, delay=0):
        '''
        Initialize an action.

        name    -- action name (e.g. 'start', 'stop')
        target  -- nodes on which the command should run
        command -- command line to execute
        timeout -- maximum execution time in seconds (-1 means no limit)
        delay   -- delay in seconds to wait before launching the action
        '''
        BaseEntity.__init__(self, name=name, target=target, delay=delay)

        # Action's timeout in seconds
        self.timeout = timeout

        # Number of times the action has been fired (see schedule())
        self.tries = 0

        # Command line that we would like to run
        self.command = command

        # Worker object holding results and retcodes once the action has run
        self.worker = None

        # Allow us to determine time used by an action within the master task
        self.start_time = None
        self.stop_time = None

        # Store pending targets
        self.pending_target = NodeSet()

    def reset(self):
        '''
        Reset values of attributes in order to use the action multiple times.
        '''
        BaseEntity.reset(self)
        self.start_time = None
        self.stop_time = None
        self.worker = None
        self.tries = 0

    def run(self):
        '''Prepare the current action and set up the master task.'''
        self.prepare()
        action_manager_self().run()

    def to_skip(self):
        """Tell if action has an empty target list and should be skipped."""
        # 'is not None' avoids relying on NodeSet equality semantics for the
        # None check (the former code used '!= None')
        return self.target is not None and len(self.target) == 0

    def prepare(self):
        '''
        Prepare is a recursive method allowing the current action to prepare
        the actions it depends on first. An action can only be prepared if
        its dependencies are not currently running and if it does not
        already have a status.
        '''
        deps_status = self.eval_deps_status()
        # NO_STATUS and not any dep in progress for the current action
        if self.status is NO_STATUS and deps_status is not WAITING_STATUS:
            if self.to_skip():
                self.update_status(SKIPPED)
            elif deps_status is DEP_ERROR or not self.parents:
                # No pending dependency: fire the action right away
                self.update_status(WAITING_STATUS)
                self.schedule()
            elif deps_status is DONE:
                # No need to do the action so just make it DONE
                self.update_status(DONE)
            else:
                # Look for uncompleted dependencies and prepare them first
                for dep in self.search_deps([NO_STATUS]):
                    dep.target.prepare()

    def update_status(self, status):
        '''
        Update the current status of the action. Whenever a final status is
        set (i.e. neither NO_STATUS nor WAITING_STATUS), the action triggers
        its direct dependencies.
        '''
        self.status = status
        call_back_self().notify(self, EV_STATUS_CHANGED)
        if status not in (NO_STATUS, WAITING_STATUS):
            if not self.parent.simulate:
                call_back_self().notify(self, EV_COMPLETE)
            if self.children:
                for dep in self.children.values():
                    if dep.target.is_ready():
                        if not self.parent.simulate:
                            call_back_self().notify((self, dep.target),
                                                    EV_TRIGGER_DEP)
                        dep.target.prepare()
            else:
                # No child dependency: propagate the status to the parent
                self.parent.update_status(self.status)

    def nb_timeout(self):
        '''Return the number of timed out runs.'''
        if self.worker:
            if isinstance(self.worker, WorkerPopen):
                # Local worker: at most one (possibly timed out) process
                if self.worker.did_timeout():
                    return 1
            else:
                return len(list(self.worker.iter_keys_timeout()))
        return 0

    def nb_errors(self):
        '''
        Return the number of errors reported by the worker.
        '''
        error_count = 0
        if self.worker:
            if isinstance(self.worker, WorkerPopen):
                retcode = self.worker.retcode()
                # We don't count timeout (retcode=None)
                if retcode not in (None, 0):
                    error_count = 1
            else:
                for retcode, nds in self.worker.iter_retcodes():
                    if retcode != 0:
                        error_count += len(nds)
        return error_count

    @property
    def duration(self):
        '''
        Action duration in seconds (float, microsecond resolution), readable
        as soon as the action is done; returns None otherwise.
        '''
        if not self.start_time or not self.stop_time:
            return None
        delta = self.stop_time - self.start_time
        # total_seconds() also accounts for durations longer than one day,
        # which the former 'seconds + microseconds' computation dropped.
        return delta.total_seconds()

    def schedule(self, allow_delay=True):
        '''
        Schedule the current action within the master task. The action is
        either delayed or fired right now depending on its properties.
        '''
        if not self.start_time:
            self.start_time = datetime.now()

        self.pending_target.add(self.target)

        if self.delay > 0 and allow_delay:
            # Action will be started as soon as the timer is done
            action_manager_self().perform_delayed_action(self)
        else:
            # Fire this action
            self.tries += 1
            action_manager_self().perform_action(self)

    def fromdict(self, actdict):
        """Populate action attributes from dict."""
        BaseEntity.fromdict(self, actdict)

        if 'cmd' in actdict:
            self.command = actdict['cmd']
Пример #39
0
class PropagationTreeRouter(object):
    """performs routes resolving operations within a propagation tree.
    This object provides a next_hop method, that will look for the best
    directly connected node to use to forward a message to a remote
    node.

    Upon instanciation, the router will parse the topology tree to
    generate its routing table.
    """
    def __init__(self, root, topology, fanout=0):
        """
        root     -- name of the root (admin) node
        topology -- parsed topology object exposing its groups
        fanout   -- maximum number of connections per gateway (advisory,
                    see _best_next_hop)
        """
        self.root = root
        self.topology = topology
        self.fanout = fanout
        # gateway -> number of established connections (load tracking)
        self.nodes_fanin = {}
        self.table = None

        self.table_generate(root, topology)
        self._unreachable_hosts = NodeSet()

    def table_generate(self, root, topology):
        """The router relies on a routing table. The keys are the
        destination nodes and the values are the next hop gateways to
        use to reach these nodes.
        """
        self.table = {}
        root_group = None

        # locate the topology group the root node belongs to
        for entry in topology.groups:
            if root in entry.nodeset:
                root_group = entry
                break

        if root_group is None:
            raise RouteResolvingError('Invalid admin node: %s' % root)

        # for each directly connected group, collect every node reachable
        # through it (iterative depth-first traversal)
        for group in root_group.children():
            self.table[group.nodeset] = NodeSet()
            stack = [group]
            while stack:
                curr = stack.pop()
                self.table[group.nodeset].add(curr.children_ns())
                stack += curr.children()

        # reverse table (it was crafted backward)
        # NB: items() instead of iteritems() for Python 2/3 portability
        self.table = dict((v, k) for k, v in self.table.items())

    def dispatch(self, dst):
        """dispatch nodes from a target nodeset to the directly
        connected gateways.

        The method acts as an iterator, returning a gateway and the
        associated hosts. It should provide a rather good load balancing
        between the gateways.
        """
        # Check for directly connected targets
        nexthop = NodeSet()
        for connected in self.table.values():
            nexthop.add(connected & dst)
        if len(nexthop) > 0:
            yield nexthop, nexthop

        # Check for remote targets, that require a gateway to be reached
        for network in self.table:
            dst_inter = network & dst
            dst.difference_update(dst_inter)
            for host in dst_inter.nsiter():
                yield self.next_hop(host), host

    def next_hop(self, dst):
        """perform the next hop resolution. If several hops are
        available, then, the one with the least number of current jobs
        will be returned
        """
        if dst in self._unreachable_hosts:
            raise RouteResolvingError(
                'Invalid destination: %s, host is unreachable' % dst)

        # can't resolve if source == destination
        if self.root == dst:
            raise RouteResolvingError(
                'Invalid resolution request: %s -> %s' % (self.root, dst))

        ## ------------------
        # the routing table is organized this way:
        # 
        #  NETWORK    | NEXT HOP
        # ------------+-----------
        # node[0-9]   | gateway0
        # node[10-19] | gateway[1-2]
        #            ...
        # ---------
        for network, nexthops in self.table.items():
            # destination contained in current network
            if dst in network:
                res = self._best_next_hop(nexthops)
                if res is None:
                    raise RouteResolvingError('No route available to %s' % \
                        str(dst))
                self.nodes_fanin[res] += len(dst)
                return res
            # destination contained in current next hops (ie. directly
            # connected)
            if dst in nexthops:
                return dst

        raise RouteResolvingError(
            'No route from %s to host %s' % (self.root, dst))

    def mark_unreachable(self, dst):
        """mark node dst as unreachable and don't advertise routes
        through it anymore. The cache will be updated only when
        necessary to avoid performing expensive traversals.
        """
        # Simply mark dst as unreachable in a dedicated NodeSet. This
        # list will be consulted by the resolution method
        self._unreachable_hosts.add(dst)

    def _best_next_hop(self, candidates):
        """find out a good next hop gateway"""
        backup = None
        # float('inf') is the portable spelling of the former 1e400 hack
        backup_connections = float('inf')

        candidates = candidates.difference(self._unreachable_hosts)

        for host in candidates:
            # the router tracks established connections in the
            # nodes_fanin table to avoid overloading a gateway
            connections = self.nodes_fanin.setdefault(host, 0)
            # FIXME
            #if connections < self.fanout:
            #    # currently, the first one is the best
            #    return host
            if backup_connections > connections:
                backup = host
                backup_connections = connections
        return backup
Пример #40
0
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command.

    Reads command lines in a loop while the task is running or until the
    user types 'quit' in interactive mode. Special prompt prefixes adjust
    the working node set ('+', '-', '@'), toggle gathered output ('='),
    print node info ('?') or run a local command ('!'); anything else is
    dispatched to the nodes via run_command(). Returns on EOF, prompt
    update, quit or task completion.
    """
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            # readline is optional: the prompt still works without history
            pass
        display.vprint(VERB_STD, \
            "Enter 'quit' to leave this interactive mode")

    rc = 0
    # working node set, mutable through the '+', '-' and '@' prompt commands
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET, \
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                # always restore SIGUSR1 to ignore outside of the prompt read
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                # NOTE(review): the loop variable below shadows the 'nodeset'
                # parameter; harmless since the parameter is not used past
                # this point, but renaming it would be safer.
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                for buf, nodeset in sorted(map(nodesetify,
                                               task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(VERB_STD, \
                            "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        # NOTE(review): 'ns' (the working node set) is
                        # rebound here, so 'ns - ns_ok' below uses the last
                        # nonzero-rc nodeset rather than the original
                        # working set -- confirm this is intended.
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % (
                            nsdisp, rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(verbexit, \
                        "clush: %s: command timeout" % \
                            NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            # ^C pressed while the command is still running: report progress
            # (registered vs pending clients, and open tree gateways if any)
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg),
                                                        ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(
                    VERB_QUIET, "clush: in progress(%d): %s%s" %
                    (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(
                    VERB_QUIET, "clush: in progress(%d): %s%s\n"
                    "clush: [tree] open gateways(%d): %s" %
                    (len(ns_reg), ns_reg, pending, len(gws),
                     NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(
                        VERB_QUIET, "clush: [tree] in progress(%d) on %s: %s" %
                        (len(act_targets), gw, act_targets))
        else:
            # Task idle: interpret the prompt command
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(VERB_STD, \
                            "Switching to gathered output format")
                    else:
                        display.vprint(VERB_STD, \
                            "Switching to standard output format")
                    task.set_default("stdout_msgtree", \
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'):  # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET, \
                    "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                # '!cmd' runs cmd locally, without a target node set
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
Пример #41
0
    def testBadTopologies(self):
        """test detecting invalid topologies"""
        graph = TopologyGraph()
        admin = NodeSet('admin')
        src = NodeSet('nodes[0-9]')
        dst_a = NodeSet('nodes[10-19]')
        dst_b = NodeSet('nodes[20-29]')

        graph.add_route(admin, src)
        graph.add_route(src, dst_a)
        graph.add_route(src, dst_b)

        # A superset of an already known destination cannot become a source
        bigger = NodeSet('somenode[0-10]')
        bigger.add(dst_b)
        self.assertRaises(TopologyError, graph.add_route, bigger,
                          NodeSet('foo1'))

        # A known destination nodeset may be reused as a source
        dst_c = NodeSet('nodes[30-39]')
        graph.add_route(dst_a, dst_c)

        # A subset of a known source nodeset is a valid source too
        src_part = NodeSet(','.join(src[:3:]))
        dst_d = NodeSet('nodes[40-49]')
        graph.add_route(src_part, dst_d)

        # Subsets of known sources/destinations are rejected as destinations
        dst_a_part = NodeSet(','.join(dst_a[:3:]))
        self.assertRaises(TopologyError, graph.add_route, dst_d, dst_a_part)
        self.assertRaises(TopologyError, graph.add_route, dst_d, src_part)
        self.assertRaises(TopologyError, graph.add_route, dst_d, dst_a_part)

        # A nodeset and a subset of itself cannot be linked, either way
        loop_ns = NodeSet('nodes[50-59]')
        loop_part = NodeSet(','.join(loop_ns[:3:]))
        self.assertRaises(TopologyError, graph.add_route, loop_ns, loop_part)
        self.assertRaises(TopologyError, graph.add_route, loop_part, loop_ns)

        # Verify the resulting routes
        self.assertEqual(graph.dest(src), (dst_a | dst_b))
        self.assertEqual(graph.dest(dst_a), dst_c)
        self.assertEqual(graph.dest(dst_b), None)
        self.assertEqual(graph.dest(dst_c), None)
        self.assertEqual(graph.dest(dst_d), None)
        self.assertEqual(graph.dest(loop_ns), None)
        self.assertEqual(graph.dest(src_part), (dst_a | dst_b | dst_d))

        # Fresh graph: cycles and self-routes must be rejected as well
        graph = TopologyGraph()
        root = NodeSet('root')
        grp01 = NodeSet('nodes[0-1]')
        grp23 = NodeSet('nodes[2-3]')
        grp45 = NodeSet('nodes[4-5]')
        grp67 = NodeSet('nodes[6-7]')
        grp89 = NodeSet('nodes[8-9]')

        graph.add_route(root, grp01)
        graph.add_route(root, grp23 | grp45)
        self.assertRaises(TopologyError, graph.add_route, grp23, grp23)
        self.assertRaises(TopologyError, graph.add_route, grp45, root)
        graph.add_route(grp23, grp67)
        graph.add_route(grp67, grp89)
        self.assertRaises(TopologyError, graph.add_route, grp89, grp67)
        self.assertRaises(TopologyError, graph.add_route, grp89, grp89)
        self.assertRaises(TopologyError, graph.add_route, grp89, grp23)

        # Every node must be covered once the graph is converted to a tree
        remaining = NodeSet('root,nodes[0-9]')
        for nodegroup in graph.to_tree('root'):
            remaining.difference_update(nodegroup.nodeset)
        self.assertEqual(len(remaining), 0)
Пример #42
0
class PdshClient(ExecClient):
    """EngineClient which run 'pdsh'"""

    MODE = 'pdsh'

    def __init__(self, node, command, worker, stderr, timeout, autoclose=False,
                 rank=None):
        """Initialize the pdsh engine client."""
        ExecClient.__init__(self, node, command, worker, stderr, timeout,
                            autoclose, rank)
        # nodes for which a return code has already been reported
        self._closed_nodes = NodeSet()

    def _build_cmd(self):
        """
        Build the shell command line to start the commmand.
        Return an array of command and arguments.
        """
        task = self.worker.task
        env = {}

        # pdsh executable, possibly a user-configured path with arguments
        pdsh_exec = task.info("pdsh_path") or "pdsh"
        cmdline = [os.path.expanduser(arg) for arg in shlex.split(pdsh_exec)]
        cmdline.append("-b")

        fanout = task.info("fanout", 0)
        if fanout > 0:
            cmdline.append("-f %d" % fanout)

        # Pdsh flag '-t' do not really works well. Better to use
        # PDSH_SSH_ARGS_APPEND variable to transmit ssh ConnectTimeout
        # flag.
        connect_timeout = task.info("connect_timeout", 0)
        if connect_timeout > 0:
            env['PDSH_SSH_ARGS_APPEND'] = "-o ConnectTimeout=%d" % \
                connect_timeout

        command_timeout = task.info("command_timeout", 0)
        if command_timeout > 0:
            cmdline.append("-u %d" % command_timeout)

        cmdline.append("-w %s" % self.key)
        cmdline.append("%s" % self.command)

        return (cmdline, env)

    def _close(self, abort, timeout):
        """Close client. See EngineClient._close()."""
        if abort:
            # it's safer to call poll() first for long time completed processes
            if self.popen.poll() is None:
                # process is still running: try to kill it
                try:
                    self.popen.kill()
                except OSError:
                    pass
        prc = self.popen.wait()

        if prc > 0:
            raise WorkerError("Cannot run pdsh (error %d)" % prc)

        self.streams.clear()

        if timeout:
            assert abort, "abort flag not set on timeout"

        # report a final event for every node not yet accounted for
        for node in (self.key - self._closed_nodes):
            if timeout:
                self.worker._on_node_timeout(node)
            else:
                self.worker._on_node_close(node, 0)

        self.worker._check_fini()

    def _parse_line(self, line, sname):
        """
        Parse Pdsh line syntax.
        """
        if not line.startswith((b"pdsh@", b"pdcp@", b"sending ")):
            # regular output: split pdsh reply "nodename: msg"
            nodename, msg = line.split(b': ', 1)
            self.worker._on_node_msgline(nodename.decode(), msg, sname)
            return

        try:
            # pdsh@cors113: cors115: ssh exited with exit code 1
            #       0          1      2     3     4    5    6  7
            # corsUNKN: ssh: corsUNKN: Name or service not known
            #     0      1       2       3  4     5     6    7
            # pdsh@fortoy0: fortoy101: command timeout
            #     0             1         2       3
            # sending SIGTERM to ssh fortoy112 pid 32014
            #     0      1     2  3      4      5    6
            # pdcp@cors113: corsUNKN: ssh exited with exit code 255
            #     0             1      2    3     4    5    6    7
            # pdcp@cors113: cors115: fatal: /var/cache/shine/...
            #     0             1      2                   3...
            words = line.split()
            # Set return code for nodename of worker
            if self.MODE == 'pdcp':
                nodename = words[1][:-1].decode()
                self._closed_nodes.add(nodename)
                self.worker._on_node_close(nodename, errno.ENOENT)
            elif self.MODE == 'pdsh':
                if len(words) == 8 and words[3] == b"exited" and \
                        words[7].isdigit():
                    nodename = words[1][:-1].decode()
                    self._closed_nodes.add(nodename)
                    self.worker._on_node_close(nodename, int(words[7]))
                # 4-word "command timeout" lines are intentionally ignored
                # here; timeouts are reported at close time
        except Exception as exc:
            raise EngineClientError("Pdsh parser error: %s" % exc)

    def _flush_read(self, sname):
        """Called at close time to flush stream read buffer."""
        pass

    def _handle_read(self, sname):
        """Engine is telling us a read is available."""
        task = self.worker.task
        debug = task.info("debug", False)
        print_debug = task.info("print_debug") if debug else None

        suffix = "@STDERR" if sname == 'stderr' else ""

        for msg in self._readlines(sname):
            if debug:
                print_debug(task, "PDSH%s: %s" % (suffix, msg))
            self._parse_line(msg, sname)
Пример #43
0
class Dfuse(DfuseCommand):
    """Class defining an object of type DfuseCommand."""

    def __init__(self, hosts, tmp):
        """Create a dfuse object.

        Args:
            hosts (NodeSet): hosts on which dfuse will run
            tmp (str): temporary directory path
        """
        super(Dfuse, self).__init__("/run/dfuse/*", "dfuse")

        # set params
        self.hosts = hosts
        self.tmp = tmp
        # hosts on which dfuse is currently known to be running
        self.running_hosts = NodeSet()

    def __del__(self):
        """Destruct the object."""
        if len(self.running_hosts):
            self.log.error('Dfuse object deleted without shutting down')

    def create_mount_point(self):
        """Create dfuse directory.

        Raises:
            CommandFailure: In case of error creating directory

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        # only create the directory on hosts where it is missing
        _, missing_nodes = check_file_exists(self.hosts,
                                             self.mount_dir.value,
                                             directory=True)
        if len(missing_nodes):
            cmd = "mkdir -p {}".format(self.mount_dir.value)
            ret_code = pcmd(missing_nodes, cmd, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                error_hosts = NodeSet(",".join([
                    str(node_set) for code, node_set in ret_code.items()
                    if code != 0
                ]))
                raise CommandFailure(
                    "Error creating the {} dfuse mount point on the following "
                    "hosts: {}".format(self.mount_dir.value, error_hosts))

    def remove_mount_point(self, fail=True):
        """Remove dfuse directory.

        Try once with a simple rmdir which should succeed, if this does not then
        try again with rm -rf, but still raise an error.

        Args:
            fail (bool): raise CommandFailure on error when True

        Raises:
            CommandFailure: In case of error deleting directory

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, clean_nodes = check_file_exists(self.hosts,
                                                    self.mount_dir.value,
                                                    directory=True)
        if dir_exists:
            target_nodes = list(self.hosts)
            if clean_nodes:
                # NOTE(review): assumes 'clean_nodes' matches a single entry
                # of the host list -- confirm against check_file_exists API
                target_nodes.remove(clean_nodes)

            cmd = "rmdir {}".format(self.mount_dir.value)
            ret_code = pcmd(target_nodes, cmd, timeout=30)
            if len(ret_code) == 1 and 0 in ret_code:
                return

            failed_nodes = NodeSet(",".join([
                str(node_set) for code, node_set in ret_code.items()
                if code != 0
            ]))

            # rmdir failed somewhere: retry with rm -rf on those hosts
            cmd = "rm -rf {}".format(self.mount_dir.value)
            ret_code = pcmd(failed_nodes, cmd, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                error_hosts = NodeSet(",".join([
                    str(node_set) for code, node_set in ret_code.items()
                    if code != 0
                ]))
                if fail:
                    raise CommandFailure(
                        "Error removing the {} dfuse mount point with rm on "
                        "the following hosts: {}".format(
                            self.mount_dir.value, error_hosts))
            if fail:
                raise CommandFailure(
                    "Error removing the {} dfuse mount point with rmdir on the "
                    "following hosts: {}".format(self.mount_dir.value,
                                                 failed_nodes))

    def run(self, check=True):
        """Run the dfuse command.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.
        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # A log file must be defined to ensure logs are captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        # create dfuse dir if does not exist
        self.create_mount_point()

        # run dfuse command
        cmd = "".join([self.env.get_export_str(), self.__str__()])
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        # hosts with return code 0 are now running dfuse
        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if len(ret_code):
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in ret_code.items()
                if code != 0
            ]))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if check:
            # Dfuse will block in the command for the mount to complete, even
            # if run in background mode so it should be possible to start using
            # it immediately after the command returns.
            if not self.check_running(fail_on_error=False):
                self.log.info('Waiting two seconds for dfuse to start')
                time.sleep(2)
                if not self.check_running(fail_on_error=False):
                    self.log.info('Waiting five seconds for dfuse to start')
                    time.sleep(5)
                    self.check_running()

    def check_running(self, fail_on_error=True):
        """Check dfuse is running.

        Run a command to verify dfuse is running on hosts where it is supposed
        to be.  Use grep -v and rc=1 here so that if it isn't, then we can
        see what is being used instead.

        Args:
            fail_on_error (bool): raise CommandFailure when dfuse is not
                running if True; return False instead if False

        """
        retcodes = pcmd(self.running_hosts,
                        "stat -c %T -f {0} | grep -v fuseblk".format(
                            self.mount_dir.value),
                        expect_rc=1)
        # rc=1 means grep matched nothing, i.e. the mount is a fuse mount
        if 1 in retcodes:
            del retcodes[1]
        if len(retcodes):
            self.log.error('Errors checking running: %s', retcodes)
            if not fail_on_error:
                return False
            raise CommandFailure('dfuse not running')
        return True

    def stop(self):
        """Stop dfuse.

        Try to stop dfuse.  Try once nicely by using fusermount, then if that
        fails try to pkill it to see if that works.  Abort based on the result
        of the fusermount, as if pkill is necessary then dfuse itself has
        not worked correctly.

        Finally, try and remove the mount point, and that itself should work.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        self.log.info('Stopping dfuse at %s on %s', self.mount_dir.value,
                      self.running_hosts)

        if self.mount_dir.value is None:
            return

        if not len(self.running_hosts):
            return

        self.check_running()
        umount_cmd = [
            "if [ -x '$(command -v fusermount)' ]",
            "then fusermount -u {0}".format(self.mount_dir.value),
            "else fusermount3 -u {0}".format(self.mount_dir.value), "fi"
        ]
        # join once so both pcmd calls below use the same shell command string
        umount_cmdline = "; ".join(umount_cmd)
        ret_code = pcmd(self.running_hosts, umount_cmdline, timeout=30)

        if 0 in ret_code:
            self.running_hosts.remove(ret_code[0])
            del ret_code[0]

        if len(self.running_hosts):
            cmd = "pkill dfuse --signal KILL"
            pcmd(self.running_hosts, cmd, timeout=30)
            # BUGFIX: the raw 'umount_cmd' list was previously passed to pcmd
            # here instead of the joined shell string, producing an invalid
            # command for this retry.
            pcmd(self.running_hosts, umount_cmdline, timeout=30)
            self.remove_mount_point(fail=False)
            raise CommandFailure(
                "Error stopping dfuse on the following hosts: {}".format(
                    self.running_hosts))
        time.sleep(2)
        self.remove_mount_point()
Пример #44
0
class BaseEntity(object):
    '''
    Abstract base class; it shall not be instantiated directly.

    A BaseEntity object basically represents a node of a dependency graph,
    holding references to its parent and child entities.
    '''

    # Reserved variable names, mapped to the entity property each one
    # exposes (see _lookup_variable()). User variables must not reuse them.
    LOCAL_VARIABLES = {
        'NAME': 'name',
        'FANOUT': 'fanout',
        'TIMEOUT': 'timeout',
        'TARGET': 'target',
        'DESC': 'desc',
        'TAGS': 'tags',
    }

    def __init__(self, name, target=None, delay=0):
        # Entity name
        self.name = name

        # Each entity has a status reflecting its current state
        self.status = NO_STATUS

        # Description of an entity
        self.desc = None

        # Maximum window for parallelism. A None fanout means
        # that the task will be limited by the default value of
        # ClusterShell 64
        self.fanout = None

        # Nodes on which the entity is launched
        self._target = None
        self.target = target
        self._target_backup = self.target

        # Special mode which changes entity behaviour
        # 'delegate' means manage targets but run locally.
        self.mode = None

        self.remote = True

        # Maximum number of errors authorized for the entity.
        self.errors = 0

        # Error threshold before reaching the warning status
        # (should be <= self.errors)
        self.warnings = 0

        # Max time allowed to compute an entity, None means no timeout
        self.timeout = None

        # Delay to wait before launching an action
        self.delay = delay

        # Maximum number of retries after a failure
        self.maxretry = 0

        # Nodes flagged on error by dependencies; they are skipped when
        # launching actions (see filter_nodes()).
        self.failed_nodes = NodeSet()

        # Parent of the current object. Must be a subclass of BaseEntity
        self.parent = None

        # Parent dependencies (e.g. A->B so B is a parent of A)
        self.parents = {}

        # Children dependencies (e.g. A<-B so A is a child of B)
        self.children = {}

        self.simulate = False

        # Algorithm's traversal direction
        # False : go in parents' direction
        # True : go in children's direction
        self._algo_reversed = False

        # Tag the entity. This tells whether the entity has already been
        # visited by its dependencies
        self._tagged = False

        # User-defined variables available for %xxx substitution
        self.variables = {}

        # Tags of the entity. The tags set defines if the entity should run
        self.tags = set()

    def filter_nodes(self, nodes):
        """
        Add error nodes to skip list.

        Nodes in this list will not be used when launching actions.
        """
        self.failed_nodes.add(nodes)

    def add_var(self, varname, value):
        '''Add a new variable within the entity context'''
        # Reserved and already-defined names are both rejected
        if varname in self.LOCAL_VARIABLES:
            msg = "%s is a reserved variable name" % varname
            raise VariableAlreadyExistError(msg)
        elif varname in self.variables:
            raise VariableAlreadyExistError()
        else:
            self.variables[varname] = value

    def remove_var(self, varname):
        '''Remove an existing var from the entity'''
        if varname in self.variables:
            del self.variables[varname]

    def update_var(self, varname, value):
        """ Update existing variable """
        # Debugging
        logger = logging.getLogger('milkcheck')
        logger.info("Variable '%s' updating '%s' (was '%s')", varname, value,
                    self.variables[varname])
        self.remove_var(varname)
        self.add_var(varname, value)

    def update_target(self, nodeset, mode=None):
        '''
        Update the attribute target of an entity.

        Without a mode, the target is replaced; 'DIF' removes the given
        nodes and 'INT' intersects with them.
        '''
        assert nodeset is not None
        if not mode:
            self.target = NodeSet(nodeset)
        elif mode == 'DIF' and self.target:
            self.target.difference_update(nodeset)
        elif mode == 'INT' and self.target:
            self.target.intersection_update(nodeset)

    def _get_target(self):
        '''Return self._target'''
        return self._target

    def _set_target(self, value):
        '''Assign nodeset to _target, resolving %xxx patterns first'''
        self._target = None
        if value is not None:
            self._target = NodeSet(self._resolve(value))

    target = property(fset=_set_target, fget=_get_target)

    def reset(self):
        '''Reset values of attributes in order to perform multiple exec.'''
        self._tagged = False
        self.target = self._target_backup
        self.status = NO_STATUS
        self.failed_nodes = NodeSet()
        self.algo_reversed = False

    def search(self, name, reverse=False):
        '''
        Search an entity through the overall graph. This recursive algorithm
        stops as soon as the node searched is reached.
        '''
        target = None
        deps = self.parents
        if reverse:
            deps = self.children
        if name in deps:
            return deps[name].target
        else:
            # Not a direct dependency: recurse through each dependency target
            for dep in deps.values():
                target = dep.target.search(name, reverse)
                if target:
                    return target
        return target

    def add_dep(self, target, sgth=REQUIRE, parent=True):
        '''
        Add a dependency in both directions. This method allows the user to
        specify the dependency type. It is also possible to specify whether
        the target is the parent or the child of the current entity.
        '''
        assert target, "target must not be None"
        if sgth in (CHECK, REQUIRE, REQUIRE_WEAK, FILTER):
            if parent:
                if target.name in self.parents:
                    raise DependencyAlreadyReferenced()
                else:
                    # This dependency is considered as a parent
                    self.parents[target.name] = Dependency(target, sgth, False)
                    target.children[self.name] = Dependency(self, sgth, False)
            else:
                if target.name in self.children:
                    raise DependencyAlreadyReferenced()
                else:
                    # This dependency is considered as a child
                    self.children[target.name] = Dependency(
                        target, sgth, False)
                    target.parents[self.name] = Dependency(self, sgth, False)
        else:
            raise IllegalDependencyTypeError(sgth)

    def remove_dep(self, dep_name, parent=True):
        '''
        Remove a dependency on both sides, in the current object and in the
        target object concerned by the dependency.
        '''
        assert dep_name, "Dependency specified must not be None"
        if parent and dep_name in self.parents:
            dep = self.parents[dep_name]
            del self.parents[dep_name]
            del dep.target.children[self.name]
        elif dep_name in self.children:
            dep = self.children[dep_name]
            del self.children[dep_name]
            del dep.target.parents[self.name]

    def clear_parent_deps(self):
        '''Remove all parent dependencies of an entity'''
        # Iterate over a copy since remove_dep() mutates self.parents
        for dpname in list(self.parents.keys()):
            self.remove_dep(dpname)

    def clear_child_deps(self):
        '''Remove all child dependencies of an entity'''
        # Iterate over a copy since remove_dep() mutates self.children
        for dpname in list(self.children.keys()):
            self.remove_dep(dep_name=dpname, parent=False)

    def has_child_dep(self, dep_name=None):
        '''
        Determine whether the current object has a child dependency called
        dep_name.
        '''
        return dep_name in self.children

    def has_parent_dep(self, dep_name=None):
        '''
        Determine whether the current object has a parent dependency called
        dep_name
        '''
        return dep_name in self.parents

    def clear_deps(self):
        '''Clear parent/child dependencies.'''
        self.parents.clear()
        self.children.clear()

    def deps(self):
        """
        Return parent dependency list.

        Return children deps as parent if algo is reversed.
        """
        if self._algo_reversed:
            return self.children
        else:
            return self.parents

    def is_ready(self):
        '''
        Determine if the current service has to wait before starting
        due to unterminated dependencies.
        '''
        for dep in self.deps().values():
            if dep.target.status in (NO_STATUS, WAITING_STATUS):
                return False
        return True

    def match_tags(self, tags):
        """
        Check if at least one provided tag matches entity tags.

        Return True if both lists are empty.
        """
        if not self.tags and not tags:
            return True
        else:
            assert type(tags) is set
            # Non-empty intersection means at least one tag matches
            return bool(self.tags & tags)

    def search_deps(self, symbols=None):
        '''
        Look for parent/child dependencies matching the symbols. The
        search direction depends on the direction specified for the entity.
        '''
        # No selection criteria, return everything
        if not symbols:
            return self.deps().values()

        # Else, only keep matching deps
        else:
            dep_list = self.deps().values()
            return [dep for dep in dep_list if dep.target.status in symbols]

    def graph_info(self):
        """ Return a tuple to manage dependencies output """
        return (self.fullname(), None)

    def graph(self, excluded=None):
        """ Generate a graph of dependencies"""
        grph = ""
        # If the entity has no dependency we just return the entity fullname
        if not self.deps().values():
            grph += '"%s";\n' % self.fullname()
        else:
            for dep in self.deps().values():
                if not dep.target.excluded(excluded):
                    if not dep.target.simulate:
                        grph += dep.graph(self)
                    else:
                        grph += '"%s";\n' % self.fullname()
        return grph

    def excluded(self, excluded=None):
        """Tell whether the entity is recursively excluded."""
        if not excluded:
            return False
        if not self.deps().values():
            return self.fullname() in excluded

        # FIXME: Better loop detection
        if self.search(self.name):
            return True

        # Excluded only if every dependency path leads to an excluded entity
        for dep in self.deps().values():
            if dep.target.excluded(excluded):
                return True

        return self.fullname() in excluded

    def eval_deps_status(self):
        '''
        Evaluate the result of the dependencies in order to establish
        a status.
        '''
        if len(self.deps()):
            # Keep the dependency status with the highest DEP_ORDER priority
            order = lambda dep: DEP_ORDER[dep.status()]
            sorted_deps = sorted(self.deps().values(), key=order)
            return sorted_deps[-1].status()
        else:
            return MISSING

    def set_algo_reversed(self, flag):
        '''Assign the right values for the property algo_reversed'''
        self._algo_reversed = flag

    algo_reversed = property(fset=set_algo_reversed)

    def longname(self):
        '''Return entity fullname and description if available '''
        label = self.fullname()
        if self.desc:
            label += " - %s" % self.desc
        return label

    def fullname(self):
        '''Return the fullname (dotted path from the root) of the entity'''
        names = []
        if self.parent and self.parent.fullname():
            names.append(self.parent.fullname())
        names.append(self.name)
        return '.'.join(names)

    def _lookup_variable(self, varname):
        '''
        Return the value of the specified variable name.

        If it is not found in the current object, it searches recursively in
        the parent object.
        If it cannot solve the variable name, it raises UndefinedVariableError.
        '''
        if varname in self.variables:
            return self.variables[varname]
        elif varname.upper() in self.LOCAL_VARIABLES:
            # Reserved names map to entity properties (see LOCAL_VARIABLES)
            value = self.LOCAL_VARIABLES[varname.upper()]
            return self.resolve_property(value)
        elif self.parent:
            return self.parent._lookup_variable(varname)
        else:
            raise UndefinedVariableError(varname)

    def _substitute(self, template):
        """Substitute %xxx patterns from the provided template."""
        delimiter = '%'
        pattern = r"""
          %(delim)s(?:
            (?P<escaped>%(delim)s) | # Escape sequence of two delimiters
            (?P<named>%(id)s)      | # delimiter and a Python identifier
            {(?P<braced>%(id)s)}   | # delimiter and a braced identifier
            \((?P<parenth>.+?)\)   | # delimiter and parenthesis
            (?P<invalid>)            # Other ill-formed delimiter exprs
          )""" % {
            'delim': delimiter,
            'id': r'[_a-z][_a-z0-9]*',
        }
        pattern = re.compile(pattern, re.IGNORECASE | re.VERBOSE)

        # Command substitution
        def _cmd_repl(raw):
            '''Replace a command execution pattern by its result.'''
            logger = logging.getLogger('milkcheck')
            cmd = Popen(raw, stdout=PIPE, stderr=PIPE, shell=True)
            stdout = cmd.communicate()[0].decode()
            logger.debug("External command exited with %d: '%s'" %
                         (cmd.returncode, stdout))
            # 126/127 mean the command could not be run or was not found
            if cmd.returncode >= 126:
                raise InvalidVariableError(raw)
            return stdout.rstrip('\n')

        def _invalid(mobj, template):
            '''Helper to raise a detailed error message'''
            i = mobj.start('invalid')
            lines = template[:i].splitlines(True)
            # With the current regexp, it is impossible that lines is empty.
            assert lines, "invalid pattern as the begining of template"
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
            raise ValueError('Invalid placeholder in string: line %d, col %d' %
                             (lineno, colno))

        def _convert(mobj):
            """Helper function for .sub()"""
            # Check the most common path first: a plain or braced identifier
            named = mobj.group('named') or mobj.group('braced')
            if named is not None:
                val = str(self._lookup_variable(named))
                return self._resolve(val)
            if mobj.group('escaped') is not None:
                return delimiter
            if mobj.group('parenth') is not None:
                # %(...) patterns are resolved then executed as a command
                val = self._resolve(mobj.group('parenth'))
                return _cmd_repl(val)
            if mobj.group('invalid') is not None:
                _invalid(mobj, template)
            raise ValueError('Unrecognized named group in pattern', pattern)

        # Check if content is only a variable pattern
        mobj = re.match(pattern, template)
        name = mobj and (mobj.group('named') or mobj.group('braced'))
        if name is not None and template == mobj.group(0):
            # In this case, simply replace it by variable content
            # (useful for list and dict)
            return self._resolve(self._lookup_variable(name))
        else:
            return pattern.sub(_convert, template)

    def _resolve(self, value):
        '''
        This method takes a string containing symbols. Those strings may
        look like:
            + %(nodeset -f epsilon[5-8] -x epsilon7)
            + %CMD echo %(nodeset -f epsilon[5-8])
            + ps -e | grep myprogram
        After computation this method returns a string with all the symbols
        resolved.
        The '%' character can be inserted using '%%'.
        '''
        # For compat: if provided value is not a str, we should not convert
        # it to a str if nothing matches.
        if type(value) is not str:
            return value

        # Replace all %xxx patterns
        origvalue = value
        value = self._substitute(value)

        # Debugging
        if origvalue != value:
            logger = logging.getLogger('milkcheck')
            logger.info("Variable content '%s' replaced by '%s'", origvalue,
                        value)

        return value

    def resolve_property(self, prop):
        '''
        Resolve the variables contained within the property. It proceeds by
        looking for the values required to replace the symbols. This method
        returns None when the property does not exist.
        '''
        pvalue = None
        if hasattr(self, prop):
            pvalue = self._resolve(getattr(self, prop))
        return pvalue

    def inherits_from(self, entity):
        '''Inheritance of properties between entities'''

        # Beware to check the default value of all of these properties.
        # Some of these have two possible 'false' values (None or '').
        # * The init value should always be None
        # * '' is set by the user
        if self.fanout is None:
            self.fanout = entity.fanout
        self.errors = self.errors or entity.errors
        self.warnings = self.warnings or entity.warnings
        if self.timeout is None:
            self.timeout = entity.timeout
        if self.target is None:
            self.target = entity.target
        self.mode = self.mode or entity.mode
        self.remote = self.remote and entity.remote
        if self.desc is None:
            self.desc = entity.desc
        self.delay = self.delay or entity.delay
        self.maxretry = self.maxretry or entity.maxretry
        self.tags = self.tags or entity.tags

    def fromdict(self, entdict):
        """Populate entity attributes from dict."""
        for item, prop in entdict.items():
            if item == 'target':
                self.target = prop
                self._target_backup = prop
            elif item == 'mode':
                self.mode = prop
            elif item == 'remote':
                self.remote = prop
            elif item == 'fanout':
                self.fanout = prop
            elif item == 'timeout':
                self.timeout = prop
            elif item == 'delay':
                self.delay = prop
            elif item == 'retry':
                self.maxretry = prop
            elif item == 'errors':
                self.errors = prop
            elif item == 'warnings':
                self.warnings = prop
            elif item == 'desc':
                self.desc = prop
            elif item == 'tags':
                self.tags = set(prop)
            elif item == 'variables':
                for varname, value in prop.items():
                    self.add_var(varname, value)

    def resolve_all(self):
        """Resolve all properties from the entity"""
        # Resolve local variables first.
        # Ensure they are computed only once and not each time they are used.
        for name, value in self.variables.items():
            self.variables[name] = self._resolve(value)

        # Resolve properties
        properties = [
            'fanout', 'maxretry', 'errors', 'warnings', 'timeout', 'delay',
            'target', '_target_backup', 'mode', 'desc'
        ]
        for item in properties:
            setattr(self, item, self._resolve(getattr(self, item)))
            if item == 'target':
                # Keep the backup in sync so reset() restores a resolved value
                self._target_backup = self.resolve_property('target')
# Example #45
class Action(BaseEntity):
    """
    This class models an action. An action is generally hooked to a service
    and contains the code and parameters to execute commands over one or several
    nodes of a cluster. An action might have dependencies with other actions.
    """

    # Same reserved variables as BaseEntity, plus ACTION (the action name).
    LOCAL_VARIABLES = BaseEntity.LOCAL_VARIABLES.copy()
    LOCAL_VARIABLES['ACTION'] = 'name'

    def __init__(self, name, target=None, command=None, timeout=None, delay=0):
        BaseEntity.__init__(self, name=name, target=target, delay=delay)

        # Action's timeout in seconds/milliseconds
        self.timeout = timeout

        # Number of action tries
        self.tries = 0

        # Command lines that we would like to run
        self.command = command

        # Results and retcodes (ClusterShell worker, set when scheduled)
        self.worker = None

        # Allow us to determine time used by an action within the master task
        self.start_time = None
        self.stop_time = None

        # Store pending targets
        self.pending_target = NodeSet()

    def reset(self):
        '''
        Reset values of attributes in order to use the action multiple times.
        '''
        BaseEntity.reset(self)
        self.start_time = None
        self.stop_time = None
        self.worker = None
        self.tries = 0

    def run(self):
        '''Prepare the current action and set up the master task'''
        self.prepare()
        action_manager_self().run()

    def skip(self):
        """Skip this action"""
        # XXX AD: This should use a dedicated flag, should not hack self.target
        self.target = NodeSet()

    def to_skip(self):
        """Tell if action has an empty target list and should be skipped."""
        return (self.target != None and len(self.target) == 0)

    def prepare(self):
        '''
        Prepare is a recursive method allowing the current action to prepare
        the actions it depends on first. An action can only be prepared
        if its dependencies are not currently running and if the current
        action does not already have a status.
        '''
        deps_status = self.eval_deps_status()
        # NO_STATUS and not any dep in progress for the current action
        if self.status is NO_STATUS and deps_status is not WAITING_STATUS:

            # Remove nodes marked on error by our filter dependencies
            if self.target:
                self.target -= self.parent.failed_nodes

            if self.to_skip():
                self.update_status(SKIPPED)
            elif deps_status is DEP_ERROR or not self.parents:
                # Deps are resolved (or absent): fire the action now
                self.update_status(WAITING_STATUS)
                self.schedule()
            elif deps_status is DONE:
                # No need to do the action so just make it DONE
                self.update_status(DONE)
            else:
                # Look for uncompleted dependencies
                deps = self.search_deps([NO_STATUS])
                # For each existing deps just prepare it
                for dep in deps:
                    dep.target.prepare()

    def update_status(self, status):
        '''
        This method updates the current status of an action. When a status
        meaning that the action is done is specified, the current action
        triggers its direct dependencies.
        '''
        self.status = status
        call_back_self().notify(self, EV_STATUS_CHANGED)
        if status not in (NO_STATUS, WAITING_STATUS):
            # The action is terminated: notify and propagate to dependencies
            if not self.parent.simulate:
                call_back_self().notify(self, EV_COMPLETE)
            if self.children:
                for dep in self.children.values():
                    # Propagate failed nodes so children skip them
                    dep.filter_nodes(self.failed_nodes)

                    if dep.target.is_ready():
                        if not self.parent.simulate:
                            call_back_self().notify(
                            (self, dep.target), EV_TRIGGER_DEP)
                        dep.target.prepare()
            else:
                # Leaf action: report result to the owning service
                self.parent.filter_nodes(self.failed_nodes)
                self.parent.update_status(self.status)

    def nodes_timeout(self):
        """Get nodeset of timeout nodes for this action."""
        if self.worker:
            # Local workers (WorkerPopen) run on "localhost" only
            if isinstance(self.worker, WorkerPopen):
                if self.worker.did_timeout():
                    return NodeSet("localhost")
            else:
                return NodeSet.fromlist(list(self.worker.iter_keys_timeout()))
        return NodeSet()

    def nb_timeout(self):
        """Get timeout node count."""
        return len(self.nodes_timeout())

    def nodes_error(self):
        """Get nodeset of error nodes for this action."""
        error_nodes = NodeSet()
        if self.worker:
            # Local workers (WorkerPopen) run on "localhost" only
            if isinstance(self.worker, WorkerPopen):
                retcode = self.worker.retcode()
                # We don't count timeout (retcode=None)
                if retcode not in (None, 0):
                    error_nodes = NodeSet("localhost")
            else:
                for retcode, nds in self.worker.iter_retcodes():
                    if retcode != 0:
                        error_nodes.add(nds)
        return error_nodes

    def nb_errors(self):
        """Get error node count."""
        return len(self.nodes_error())

    @property
    def duration(self):
        """
        Action duration in seconds and microseconds if done, None otherwise.
        """
        if self.start_time and self.stop_time:
            return self.stop_time - self.start_time
        else:
            return None

    def schedule(self, allow_delay=True):
        '''
        Schedule the current action within the master task. The current action
        could be delayed or fired right now depending on its properties.
        '''
        if not self.start_time:
            self.start_time = time.time()

        self.pending_target.add(self.target)

        if self.delay > 0 and allow_delay:
            # Action will be started as soon as the timer is done
            action_manager_self().perform_delayed_action(self)
        else:
            # Fire this action
            self.tries += 1
            action_manager_self().perform_action(self)

    def fromdict(self, actdict):
        """Populate action attributes from dict."""
        BaseEntity.fromdict(self, actdict)

        if 'cmd' in actdict:
            self.command = actdict['cmd']

    def resolve_all(self):
        """Resolve all properties from the entity"""
        BaseEntity.resolve_all(self)
        self.command = self.resolve_property('command')
# Example #46
    def testBadTopologies(self):
        """test detecting invalid topologies"""
        g = TopologyGraph()
        admin = NodeSet('admin')
        # Build a small valid tree first: admin -> ns0 -> (ns1, ns2)
        ns0 = NodeSet('nodes[0-9]')
        ns1 = NodeSet('nodes[10-19]')
        ns2 = NodeSet('nodes[20-29]')

        g.add_route(admin, ns0)
        g.add_route(ns0, ns1)
        g.add_route(ns0, ns2)

        # add a superset of a known destination as source
        ns2_sup = NodeSet('somenode[0-10]')
        ns2_sup.add(ns2)
        self.assertRaises(TopologyError, g.add_route, ns2_sup, NodeSet('foo1'))

        # Add a known dst nodeset as a src nodeset
        ns3 = NodeSet('nodes[30-39]')
        g.add_route(ns1, ns3)

        # Add a subset of a known src nodeset as src
        ns0_sub = NodeSet(','.join(ns0[:3:]))
        ns4 = NodeSet('nodes[40-49]')
        g.add_route(ns0_sub, ns4)

        # Add a subset of a known dst nodeset as src
        ns1_sub = NodeSet(','.join(ns1[:3:]))
        self.assertRaises(TopologyError, g.add_route, ns4, ns1_sub)
        # Add a subset of a known src nodeset as dst
        self.assertRaises(TopologyError, g.add_route, ns4, ns0_sub)
        # Add a subset of a known dst nodeset as dst
        self.assertRaises(TopologyError, g.add_route, ns4, ns1_sub)
        # src <- subset of -> dst
        ns5 = NodeSet('nodes[50-59]')
        ns5_sub = NodeSet(','.join(ns5[:3:]))
        self.assertRaises(TopologyError, g.add_route, ns5, ns5_sub)
        self.assertRaises(TopologyError, g.add_route, ns5_sub, ns5)

        # Check destinations: the failed add_route calls above must not
        # have modified the graph.
        self.assertEqual(g.dest(ns0), (ns1 | ns2))
        self.assertEqual(g.dest(ns1), ns3)
        self.assertEqual(g.dest(ns2), None)
        self.assertEqual(g.dest(ns3), None)
        self.assertEqual(g.dest(ns4), None)
        self.assertEqual(g.dest(ns5), None)
        self.assertEqual(g.dest(ns0_sub), (ns1 | ns2 | ns4))

        # Second graph: check cycle and self-route detection
        g = TopologyGraph()
        root = NodeSet('root')
        ns01 = NodeSet('nodes[0-1]')
        ns23 = NodeSet('nodes[2-3]')
        ns45 = NodeSet('nodes[4-5]')
        ns67 = NodeSet('nodes[6-7]')
        ns89 = NodeSet('nodes[8-9]')

        g.add_route(root, ns01)
        g.add_route(root, ns23 | ns45)
        self.assertRaises(TopologyError, g.add_route, ns23, ns23)
        self.assertRaises(TopologyError, g.add_route, ns45, root)
        g.add_route(ns23, ns67)
        g.add_route(ns67, ns89)
        self.assertRaises(TopologyError, g.add_route, ns89, ns67)
        self.assertRaises(TopologyError, g.add_route, ns89, ns89)
        self.assertRaises(TopologyError, g.add_route, ns89, ns23)

        # Every node of the graph must appear exactly once in the tree
        ns_all = NodeSet('root,nodes[0-9]')
        for nodegroup in g.to_tree('root'):
            ns_all.difference_update(nodegroup.nodeset)
        self.assertEqual(len(ns_all), 0)
# Example #47
class PdshClient(ExecClient):
    """EngineClient which run 'pdsh'"""

    MODE = 'pdsh'

    def __init__(self, node, command, worker, stderr, timeout, autoclose=False,
                 rank=None):
        """Initialize a pdsh engine client for the given nodes and command."""
        ExecClient.__init__(self, node, command, worker, stderr, timeout,
                            autoclose, rank)
        # Nodes whose return code was already reported while parsing pdsh
        # output; the remaining nodes are completed in _close().
        self._closed_nodes = NodeSet()

    def _build_cmd(self):
        """
        Build the shell command line to start the commmand.
        Return an array of command and arguments.
        """
        task = self.worker.task
        env = {}

        # Locate the pdsh binary (honoring any task-level override), split
        # it into argv tokens and expand '~' in each of them.
        binary = task.info("pdsh_path") or "pdsh"
        argv = [os.path.expanduser(token) for token in shlex.split(binary)]
        argv.append("-b")

        # Limit pdsh's own parallelism when a fanout is configured
        fanout = task.info("fanout", 0)
        if fanout > 0:
            argv.append("-f %d" % fanout)

        # Pdsh flag '-t' does not really work well. Better to use the
        # PDSH_SSH_ARGS_APPEND variable to transmit the ssh ConnectTimeout
        # flag.
        connect_timeout = task.info("connect_timeout", 0)
        if connect_timeout > 0:
            env['PDSH_SSH_ARGS_APPEND'] = "-o ConnectTimeout=%d" % \
                    connect_timeout

        command_timeout = task.info("command_timeout", 0)
        if command_timeout > 0:
            argv.append("-u %d" % command_timeout)

        # Target nodes and the remote command itself come last
        argv.append("-w %s" % self.key)
        argv.append("%s" % self.command)

        return (argv, env)

    def _close(self, abort, timeout):
        """Close client. See EngineClient._close()."""
        if abort:
            # it's safer to call poll() first for long time completed processes
            prc = self.popen.poll()
            # if prc is None, process is still running
            if prc is None:
                try: # try to kill it
                    self.popen.kill()
                except OSError:
                    pass
        # Reap the pdsh process and get its exit status
        prc = self.popen.wait()

        if prc > 0:
            raise WorkerError("Cannot run pdsh (error %d)" % prc)

        self.streams.clear()

        if timeout:
            assert abort, "abort flag not set on timeout"
            # Nodes not yet reported by pdsh output are flagged as timed out
            for node in (self.key - self._closed_nodes):
                self.worker._on_node_timeout(node)
        else:
            # Nodes not yet reported completed successfully (rc 0)
            for node in (self.key - self._closed_nodes):
                self.worker._on_node_rc(node, 0)

        self.worker._check_fini()

    def _parse_line(self, line, sname):
        """
        Parse Pdsh line syntax.
        """
        if line.startswith("pdsh@") or \
           line.startswith("pdcp@") or \
           line.startswith("sending "):
            try:
                # pdsh@cors113: cors115: ssh exited with exit code 1
                #       0          1      2     3     4    5    6  7
                # corsUNKN: ssh: corsUNKN: Name or service not known
                #     0      1       2       3  4     5     6    7
                # pdsh@fortoy0: fortoy101: command timeout
                #     0             1         2       3
                # sending SIGTERM to ssh fortoy112 pid 32014
                #     0      1     2  3      4      5    6
                # pdcp@cors113: corsUNKN: ssh exited with exit code 255
                #     0             1      2    3     4    5    6    7
                # pdcp@cors113: cors115: fatal: /var/cache/shine/...
                #     0             1      2                   3...

                words  = line.split()
                # Set return code for nodename of worker
                if self.MODE == 'pdsh':
                    if len(words) == 4 and words[2] == "command" and \
                       words[3] == "timeout":
                        pass
                    elif len(words) == 8 and words[3] == "exited" and \
                         words[7].isdigit():
                        self._closed_nodes.add(words[1][:-1])
                        self.worker._on_node_rc(words[1][:-1], int(words[7]))
                elif self.MODE == 'pdcp':
                    self._closed_nodes.add(words[1][:-1])
                    self.worker._on_node_rc(words[1][:-1], errno.ENOENT)

            except Exception, exc:
                print >> sys.stderr, exc
                raise EngineClientError()
        else:
# Example #48
class Dfuse(DfuseCommand):
    """Class defining an object of type DfuseCommand.

    Manages the dfuse lifecycle across a set of hosts: creating the mount
    point, starting dfuse, checking its mounted state, and unmounting /
    cleaning up on stop.
    """
    def __init__(self, hosts, tmp):
        """Create a dfuse object.

        Args:
            hosts (list): hosts on which dfuse will be managed
            tmp (str): temporary directory path
        """
        super().__init__("/run/dfuse/*", "dfuse")

        # set params
        self.hosts = hosts
        self.tmp = tmp
        # hosts currently expected to have dfuse mounted (maintained by
        # run()/stop())
        self.running_hosts = NodeSet()

    def __del__(self):
        """Destruct the object."""
        # stop() should have emptied running_hosts; flag improper teardown
        if self.running_hosts:
            self.log.error('Dfuse object deleted without shutting down')

    def check_mount_state(self, nodes=None):
        """Check the dfuse mount point mounted state on the hosts.

        Args:
            nodes (NodeSet, optional): hosts on which to check if dfuse is
                mounted. Defaults to None, which will use all of the hosts.

        Returns:
            dict: a dictionary of NodeSets of hosts with the dfuse mount point
                either "mounted" or "unmounted"

        """
        state = {
            "mounted": NodeSet(),
            "unmounted": NodeSet(),
            "nodirectory": NodeSet()
        }
        if not nodes:
            nodes = NodeSet.fromlist(self.hosts)
        check_mounted = NodeSet()

        # Detect which hosts have mount point directories defined
        # (succeeds for a real directory that is not a symlink)
        command = "test -d {0} -a ! -L {0}".format(self.mount_dir.value)
        # pcmd appears to return a dict mapping exit codes to host sets
        # (inferred from the iteration below -- confirm against pcmd's docs)
        retcodes = pcmd(nodes, command, expect_rc=None)
        for retcode, hosts in list(retcodes.items()):
            for host in hosts:
                if retcode == 0:
                    check_mounted.add(host)
                else:
                    # No directory found: the mount may still be listed in
                    # /proc/mounts (rebinding 'retcodes' here is safe because
                    # the outer loop iterates over a list() snapshot)
                    command = "grep 'dfuse {}' /proc/mounts".format(
                        self.mount_dir.value)
                    retcodes = pcmd([host], command, expect_rc=None)
                    for ret_code, host_names in list(retcodes.items()):
                        for node in host_names:
                            if ret_code == 0:
                                check_mounted.add(node)
                            else:
                                state["nodirectory"].add(node)

        if check_mounted:
            # Detect which hosts with mount point directories have it mounted as
            # a fuseblk device
            command = "stat -c %T -f {0} | grep -v fuseblk".format(
                self.mount_dir.value)
            retcodes = pcmd(check_mounted, command, expect_rc=None)
            for retcode, hosts in list(retcodes.items()):
                for host in hosts:
                    # grep -v exits 1 when every line matched "fuseblk",
                    # i.e. the mount point is a fuse mount
                    if retcode == 1:
                        state["mounted"].add(host)
                    else:
                        state["unmounted"].add(host)

        return state

    def get_umount_command(self, force=False):
        """Get the command to umount the dfuse mount point.

        Args:
            force (bool, optional): whether to force the umount with a lazy
                unmount. Defaults to False.

        Returns:
            str: the dfuse umount command

        """
        umount = "-uz" if force else "-u"
        # NOTE(review): the single quotes around '$(command -v fusermount)'
        # suppress command substitution, so the test checks for a file with
        # that literal name and the fusermount3 branch is effectively always
        # taken -- confirm whether this is intentional.
        command = [
            "if [ -x '$(command -v fusermount)' ]",
            "then fusermount {0} {1}".format(umount, self.mount_dir.value),
            "else fusermount3 {0} {1}".format(umount,
                                              self.mount_dir.value), "fi"
        ]
        return ";".join(command)

    def create_mount_point(self):
        """Create dfuse directory.

        Raises:
            CommandFailure: In case of error creating directory

        """
        # Raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        # Create the mount point on any host without dfuse already mounted
        state = self.check_mount_state()
        if state["nodirectory"]:
            command = "mkdir -p {}".format(self.mount_dir.value)
            ret_code = pcmd(state["nodirectory"], command, timeout=30)
            # full success is a single {0: all hosts} entry; anything else
            # means at least one host failed
            if len(ret_code) > 1 or 0 not in ret_code:
                failed_nodes = [
                    str(node_set) for code, node_set in list(ret_code.items())
                    if code != 0
                ]
                error_hosts = NodeSet(",".join(failed_nodes))
                raise CommandFailure(
                    "Error creating the {} dfuse mount point on the "
                    "following hosts: {}".format(self.mount_dir.value,
                                                 error_hosts))

    def remove_mount_point(self, fail=True):
        """Remove dfuse directory.

        Try once with a simple rmdir which should succeed, if this does not then
        try again with rm -rf, but still raise an error.

        Args:
            fail (bool, optional): whether to raise on removal errors.
                Defaults to True.

        Raises:
            CommandFailure: In case of error deleting directory

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, clean_nodes = check_file_exists(self.hosts,
                                                    self.mount_dir.value,
                                                    directory=True)
        if dir_exists:
            target_nodes = list(self.hosts)
            if clean_nodes:
                # NOTE(review): list.remove() removes a single matching
                # element; if clean_nodes can hold several hosts this will
                # not remove them all (and may raise ValueError) -- verify
                # the check_file_exists() contract.
                target_nodes.remove(clean_nodes)

            self.log.info("Removing the %s dfuse mount point on %s",
                          self.mount_dir.value, target_nodes)

            cmd = "rmdir {}".format(self.mount_dir.value)
            ret_code = pcmd(target_nodes, cmd, timeout=30)
            # rmdir succeeded everywhere: nothing more to do
            if len(ret_code) == 1 and 0 in ret_code:
                return

            failed_nodes = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))

            # fall back to a recursive delete on the hosts where rmdir failed
            cmd = "rm -rf {}".format(self.mount_dir.value)
            ret_code = pcmd(failed_nodes, cmd, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                error_hosts = NodeSet(",".join([
                    str(node_set) for code, node_set in list(ret_code.items())
                    if code != 0
                ]))
                if fail:
                    raise CommandFailure(
                        "Error removing the {} dfuse mount point with rm on "
                        "the following hosts: {}".format(
                            self.mount_dir.value, error_hosts))
            # per the docstring, the rmdir failure is still reported even
            # when the rm -rf fallback succeeded
            if fail:
                raise CommandFailure(
                    "Error removing the {} dfuse mount point with rmdir on the "
                    "following hosts: {}".format(self.mount_dir.value,
                                                 failed_nodes))
        else:
            self.log.info("No %s dfuse mount point directory found on %s",
                          self.mount_dir.value, self.hosts)

    def run(self, check=True):
        # pylint: disable=arguments-differ
        """Run the dfuse command.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.
        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # A log file must be defined to ensure logs are captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        # create dfuse dir if does not exist
        self.create_mount_point()

        # run dfuse command
        cmd = "".join([self.env.get_export_str(), self.__str__()])
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        # hosts that returned 0 are now running dfuse; drop them from the
        # error set before reporting failures
        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if ret_code:
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if check:
            # Dfuse will block in the command for the mount to complete, even
            # if run in background mode so it should be possible to start using
            # it immediately after the command returns.
            if not self.check_running(fail_on_error=False):
                self.log.info('Waiting two seconds for dfuse to start')
                time.sleep(2)
                if not self.check_running(fail_on_error=False):
                    self.log.info('Waiting five seconds for dfuse to start')
                    time.sleep(5)
                    self.check_running()

    def check_running(self, fail_on_error=True):
        """Check dfuse is running.

        Run a command to verify dfuse is running on hosts where it is supposed
        to be.  Use grep -v and rc=1 here so that if it isn't, then we can
        see what is being used instead.

        Args:
            fail_on_error (bool, optional): should an exception be raised if an
                error is detected. Defaults to True.

        Raises:
            CommandFailure: raised if dfuse is found not running on any expected
                nodes and fail_on_error is set.

        Returns:
            bool: whether or not dfuse is running

        """
        status = True
        state = self.check_mount_state(self.running_hosts)
        if state["unmounted"] or state["nodirectory"]:
            self.log.error("Error: dfuse not running on %s",
                           str(state["unmounted"].union(state["nodirectory"])))
            status = False
            if fail_on_error:
                raise CommandFailure("dfuse not running")
        return status

    def stop(self):
        """Stop dfuse.

        Try to stop dfuse.  Try once nicely by using fusermount, then if that
        fails try to pkill it to see if that works.  Abort based on the result
        of the fusermount, as if pkill is necessary then dfuse itself has
        not worked correctly.

        Finally, try and remove the mount point, and that itself should work.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        # Include all hosts when stopping to ensure all mount points in any
        # state are properly removed
        self.running_hosts.add(NodeSet.fromlist(self.hosts))

        self.log.info("Stopping dfuse at %s on %s", self.mount_dir.value,
                      self.running_hosts)

        if self.mount_dir.value and self.running_hosts:
            error_list = []

            # Loop until all fuseblk mounted devices are unmounted
            # (escalation: iteration 1 umount, iteration 2 pkill + lazy umount)
            counter = 0
            while self.running_hosts and counter < 3:
                # Attempt to kill dfuse on after first unmount fails
                if self.running_hosts and counter > 1:
                    kill_command = "pkill dfuse --signal KILL"
                    pcmd(self.running_hosts, kill_command, timeout=30)

                # Attempt to unmount any fuseblk mounted devices after detection
                if self.running_hosts and counter > 0:
                    pcmd(self.running_hosts,
                         self.get_umount_command(counter > 1),
                         expect_rc=None)
                    time.sleep(2)

                # Detect which hosts have fuseblk mounted devices and remove any
                # hosts which no longer have the dfuse mount point mounted
                state = self.check_mount_state(self.running_hosts)
                for host in state["unmounted"].union(state["nodirectory"]):
                    self.running_hosts.remove(host)

                # Increment the loop counter
                counter += 1

            if self.running_hosts:
                error_list.append("Error stopping dfuse on {}".format(
                    self.running_hosts))

            # Remove mount points
            try:
                self.remove_mount_point()
            except CommandFailure as error:
                error_list.append(error)

            # Report any errors
            if error_list:
                raise CommandFailure("\n".join(error_list))

        elif self.mount_dir.value is None:
            self.log.info("No dfuse mount directory defined - nothing to stop")

        else:
            self.log.info("No hosts running dfuse - nothing to stop")
Пример #49
0
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command.

    Loops while the task runs (reporting progress on ^C) and, in interactive
    mode, reads commands until 'quit'.  Special inputs: '+ns' extends the
    working nodeset, '-ns' shrinks it, '@ns' replaces it, '=' toggles
    gathered output, '?' prints the working nodeset and '!cmd' runs cmd
    locally.
    """
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            # readline is optional; interactive mode works without history
            pass
        display.vprint(VERB_STD, \
            "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET, \
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                # ignore SIGUSR1 while the input is being processed
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                for buf, nodeset in sorted(map(nodesetify, task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(VERB_STD, \
                            "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % (nsdisp,
                                                                         rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(verbexit, \
                        "clush: %s: command timeout" % \
                            NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            # ^C pressed while a command is running: report progress instead
            # of reading a new command
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg), ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s"
                                   % (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s\n"
                                   "clush: [tree] open gateways(%d): %s"
                                   % (len(ns_reg), ns_reg, pending,
                                      len(gws), NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(VERB_QUIET,
                                       "clush: [tree] in progress(%d) on %s: %s"
                                       % (len(act_targets), gw, act_targets))
        else:
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    # '+nodeset': extend the working set
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    # '-nodeset': shrink the working set
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    # '@nodeset': replace the working set
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(VERB_STD, \
                            "Switching to gathered output format")
                    else:
                        display.vprint(VERB_STD, \
                            "Switching to standard output format")
                    task.set_default("stdout_msgtree", \
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'): # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET, \
                    "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                # '!command': run the command locally
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                # save history before running so it survives an abort
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
Пример #50
0
    def testCompletePropagation(self):
        """test a complete command propagation trip"""
        #
        # This test relies on configured parameters (topology2.conf)
        # NOTE: Python 2 style test (print statements, assert_/assertEquals),
        # so writing str (not bytes) to the temp file is fine here.
        tmpfile = tempfile.NamedTemporaryFile()

        logging.basicConfig(
                level=logging.DEBUG
                )
        logging.debug("STARTING")

        # build a two-hop topology file: local host -> neighbors -> targets
        hostname = my_node()
        cfgparser = load_cfg('topology2.conf')
        neighbors = cfgparser.get('CONFIG', 'NEIGHBORS')
        targets = cfgparser.get('CONFIG', 'TARGETS')

        tmpfile.write('[DEFAULT]\n')
        tmpfile.write('%s: %s\n' % (hostname, neighbors))
        tmpfile.write('%s: %s\n' % (neighbors, targets))
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)
        tmpfile.close()

        # shared directory used to collect per-node result files
        # (presumably NFS-shared across nodes -- required for the check below)
        nfs_tmpdir = os.path.expanduser('~/.clustershell/tests/tmp')

        tree = parser.tree(hostname)
        print tree

        ptree = PropagationTree(tree, hostname)
        ptree.upchannel = None
        ptree.edgehandler = DirectHandler()

        ptree.fanout = 20
        ptree.invoke_gateway = \
            'cd %s; PYTHONPATH=../lib python -m ClusterShell/Gateway -Bu' % \
                os.getcwd()
        #print ptree.invoke_gateway

        ## delete remaining files from previous tests
        for filename in os.listdir(nfs_tmpdir):
            if filename.startswith("fortoy"):
                os.remove(os.path.join(nfs_tmpdir, filename))

        # each target writes its own timestamp file; 20s timeout
        dst = NodeSet(targets)
        task = ptree.execute('python -c "import time; print time.time()" > ' + \
                             os.path.join(nfs_tmpdir, '$(hostname)'), dst, 20)
        #task = ptree.execute('sleep 2; echo "output from $(hostname)"', \
        #                      dst, 20)
        self.assert_(task)

        # collect the timestamps written by the target nodes
        res = NodeSet()
        times = []
        for filename in os.listdir(nfs_tmpdir):
            for k in dst:
                if filename.startswith(str(k)):
                    res.add(k)
                    fd = open(os.path.join(nfs_tmpdir, filename))
                    times.append(float(fd.read()))
                    fd.close()

        # every target must have reported back
        self.assertEquals(str(res), str(dst))
        print "Complete propagation time: %fs for %d nodes" % \
                (max(times) - min(times), len(dst))
Пример #51
0
class TopologyNodeGroup(object):
    """In-memory element of the propagation tree.

    Wraps a nodeset and keeps parent/children relationships with other
    TopologyNodeGroup instances.
    """
    def __init__(self, nodeset=None):
        """initialize a new TopologyNodeGroup instance."""
        # nodeset represented by this tree element
        self.nodeset = nodeset
        # parent TopologyNodeGroup (TNG) instance; None for the root
        self.parent = None
        # children TNG instances and their aggregated nodeset, kept in sync
        # by add_child()/clear_child() (None while there are no children)
        self._children = []
        self._children_len = 0
        self._children_ns = None

    def printable_subtree(self, prefix=''):
        """Return a printable version of the subtree rooted at this node,
        with a nice presentation.

        Recursion is acceptable here as tree depth is considered small.
        """
        if self.parent is None:
            # root node: no connector
            line = '%s\n' % str(self.nodeset)
        elif self.parent.parent is None:
            # first level: connector only, no prefix
            connector = '`- ' if self._is_last() else '|- '
            line = '%s%s\n' % (connector, str(self.nodeset))
        else:
            # deeper levels: extend the prefix depending on whether our
            # parent closes its own sibling list
            prefix += '   ' if self.parent._is_last() else '|  '
            connector = '`- ' if self._is_last() else '|- '
            line = '%s%s%s\n' % (prefix, connector, str(self.nodeset))
        # render every subtree below this node
        for child in self._children:
            line += child.printable_subtree(prefix)
        return line

    def add_child(self, child):
        """Attach child below this instance and set ourselves as its
        parent (no-op if already attached).
        """
        assert isinstance(child, TopologyNodeGroup)

        if child not in self._children:
            child.parent = self
            self._children.append(child)
            if self._children_ns is None:
                self._children_ns = NodeSet()
            self._children_ns.add(child.nodeset)

    def clear_child(self, child, strict=False):
        """Detach a child; unknown children are silently ignored unless
        strict is set, in which case ValueError is raised.
        """
        try:
            self._children.remove(child)
            self._children_ns.difference_update(child.nodeset)
            if not self._children_ns:
                self._children_ns = None
        except ValueError:
            if strict:
                raise

    def clear_children(self):
        """Drop every child reference."""
        self._children = []
        self._children_ns = None

    def children(self):
        """Return the list of children instances."""
        return self._children

    def children_ns(self):
        """Return the aggregated children nodeset (None if no children)."""
        return self._children_ns

    def children_len(self):
        """Return the total node count over all children nodesets."""
        if self._children_ns is None:
            return 0
        return len(self._children_ns)

    def _is_last(self):
        """Whether this instance is the last child of its parent; used to
        pick the connector shape when rendering the subtree.
        """
        return self.parent._children[-1] == self

    def __str__(self):
        """printable representation of the nodegroup"""
        return '<TopologyNodeGroup (%s)>' % str(self.nodeset)
Пример #52
0
class NodeBrowser(Prompter):

    ## constructor
    def __init__(self):
        self.ns = NodeSet()
        self.service = ''
        self.action = ''

    ## launch the prompt menu
    def promptMenu(self):
        try:
            # launch the prompter for node selection
            self.submitNodeList()
        # catch Ctrl + C
        except KeyboardInterrupt :
            print '\n Bye bye !\n'
            sys.exit(1)
    
    ## ask the user to submit the node list
    def submitNodeList(self):
        # info msg
        print '\n# Step 1 of 3 : Please enter nodes name below (using the clustershell syntax <azur1>, <azur[1-2]>) :' 
        # retrieve keyboard input
        try:
            self.ns = NodeSet(self.input_request(''))
            repeat = True
                
            # ask if the user wants to add another node/node group
            while repeat :
                # print added nodes
                for node in self.ns:
                    print 'node : %s' % node
                # user want to add nodes ?
                print '\n### Add nodes ? (yes | no)'
                # retrieve answer
                ans = self.input_request('')
                # check the ans
                if ans == 'Yes' or ans == 'Y' or ans == 'y' or ans == 'yes':
                   print '### Please enter the node/group list below : '
                   # retrieve and append nodes
                   self.ns.add(self.input_request(''))
                # the user don't want to add additionnal nodes
                else:
                   # unset flag
                   repeat = False
                   # check submitted nodes
                   self.ns = self.checkSubmittedNodes(self.ns)

        # invalid submitted node list / syntax error
        except NodeSetException :
            print >> sys.stderr, '\n(!) Error : the submitted node list is not valid\n' % self.ns

    ## retrieve the service to be performed
    def submitService(self):

        # specify the service
        print '\n# Step 2 of 3: Please enter the service to be launched'
        self.service = self.input_request('')
   
    ## retrieve the action to be performed
    def submitAction(self):
        # choose action to be executed
        print '\n# Step 3 of 3 : Please choose the action to perform '
        actionList = ['start','stop','restart','status']
        self.print_choice(actionList)

        # flag for the prompter
        repeat = True        
        # retrieve user's choice
        while repeat :
            # show prompter
            choice = self.input_request('')
            if choice == '1' or choice == 'start':
               self.action = 'start'
               repeat = False
            elif choice == '2' or choice == 'stop':
               self.action = 'stop'
               repeat = False
            elif choice == '3' or choice == 'restart':
               self.action = 'restart'
               repeat = False
            elif choice == '4' or choice == 'status':
               self.action = 'status'
               repeat = False
            else:
               print >> sys.stderr,'Error : invalid choice' 

    ## check and retrieves ok nodes
    def checkSubmittedNodes(self, nodes):
        # create nodeset
        ns = Nodes(nodes)
        # ping nodes
        ns.checkNodes() 
        # return active nodes
        return ns.getActiveNodes()
Пример #53
0
class TopologyRoutingTable(object):
    """Convenient storage for topology routes.

    Keeps the set of routes acyclic and free of convergent paths, and
    maintains the aggregated source/destination nodesets.
    """
    def __init__(self):
        """Create an empty routing table."""
        self._routes = []
        # union of every route source / destination, kept for fast lookups
        self.aggregated_src = NodeSet()
        self.aggregated_dst = NodeSet()

    def add_route(self, route):
        """Register a new TopologyRoute instance in the table.

        Raises TopologyError whenever the route would introduce a loop or
        make two different sources converge on the same destination.
        """
        if self._introduce_circular_reference(route):
            raise TopologyError(
                'Loop detected! Cannot add route %s' % str(route))
        if self._introduce_convergent_paths(route):
            raise TopologyError(
                'Convergent path detected! Cannot add route %s' % str(route))

        self._routes.append(route)

        self.aggregated_src.add(route.src)
        self.aggregated_dst.add(route.dst)

    def connected(self, src_ns):
        """Return the aggregated NodeSet of children directly connected to
        src_ns (a NodeSet instance), or None when nothing is reachable.
        """
        reachable = NodeSet()
        for route in self._routes:
            dst = route.dest(src_ns)
            if dst is not None:
                reachable.add(dst)
        if len(reachable) == 0:
            return None
        return reachable

    def __str__(self):
        """printable representation: one route per line"""
        lines = [str(route) for route in self._routes]
        return '\n'.join(lines)

    def __iter__(self):
        """iterate over the registered routes"""
        return iter(self._routes)

    def _introduce_circular_reference(self, route):
        """Walk down from route.dst; True if route.src is ever reached."""
        hop = route.dst
        # follow the destinations until a dead end or back onto the source
        while True:
            nxt = self.connected(hop)
            if nxt is None or len(nxt) == 0:
                return False
            if len(nxt & route.src) != 0:
                return True
            hop = nxt

    def _introduce_convergent_paths(self, route):
        """True when adding route would make distinct paths converge."""
        for existing in self._routes:
            # a source cannot be a superset of a known destination
            if route.src > existing.dst:
                return True
            # symmetrically, a destination cannot lie inside a known source
            if route.dst < existing.src:
                return True
            # overlapping destinations require an identical source
            if len(route.dst & existing.dst) != 0 \
                and route.src != existing.src:
                return True
        return False
Пример #54
0
    def load(self):
        """Load Cluster, Nodes and partitions from Architecture files. Raises
           HPCStatsRuntimeError or HPCStatsSourceError if error is encountered
           while loading data from sources. It sets attributes cluster, nodes
           and partitions with loaded data.
        """

        self.cluster = Cluster(self.cluster_name)
        self.nodes = []
        self.partitions = {}

        self.read_arch()
        config_get = self.config_get
        partitions = config_get(self.cluster.name, "partitions").split(',')

        for partition in partitions:

            part_sect = self.cluster.name + "/" + partition

            nodegroups = config_get(part_sect, "nodegroups").split(',')
            job_partitions = config_get(part_sect, "job_partitions") \
                               .split(',')

            nodeset_part = NodeSet() # nodeset for the partitions attribute

            for nodegroup in nodegroups:
                # parse the nodegroup section, create its Node objects and
                # collect its node names into the partition nodeset
                nodenames = self._load_nodegroup(partition, nodegroup)
                nodeset_part.add(nodenames)

            self.partitions[str(nodeset_part)] = job_partitions

    def _load_nodegroup(self, partition, nodegroup):
        """Parse one nodegroup config section: create the corresponding Node
           objects, append them to self.nodes and return the nodegroup names
           string. Raises HPCStatsSourceError when the frequency or memory
           format found in configuration is invalid.
        """
        config_get = self.config_get
        nodegroup_sect = self.cluster.name + "/" + partition \
                         + "/" + nodegroup
        nodenames = config_get(nodegroup_sect, "names")

        sockets = config_get(nodegroup_sect, "sockets", isint=True)
        cores_per_socket = config_get(nodegroup_sect,
                                      "corespersocket",
                                      isint=True)
        # total cores per node
        cpu = sockets * cores_per_socket

        float_instructions = config_get(nodegroup_sect,
                                        "floatinstructions",
                                        isint=True)

        freq_str = config_get(nodegroup_sect, "frequency")
        freq = ArchitectureImporterArchfile.convert_freq(freq_str)
        if freq is None:
            raise HPCStatsSourceError( \
                    "format of frequency for nodeset %s/%s/%s (%s) " \
                    "'%s' is not valid" \
                      % ( self.cluster.name,
                          partition,
                          nodegroup,
                          nodenames,
                          freq_str ))

        # theoretical flops per node; cpu already holds sockets * cores
        flops = cpu * float_instructions * freq

        mem_str = config_get(nodegroup_sect, "memory")
        mem = ArchitectureImporterArchfile.convert_mem(mem_str)
        if mem is None:
            raise HPCStatsSourceError( \
                    "format of memory for nodeset %s/%s/%s (%s) " \
                    "'%s' is not valid" \
                      % ( self.cluster.name,
                          partition,
                          nodegroup,
                          nodenames,
                          mem_str ))

        model = config_get(nodegroup_sect, "model")

        # create and append one Node per name in the nodegroup
        for nodename in NodeSet(nodenames):
            new_node = Node(name=nodename,
                            cluster=self.cluster,
                            model=model,
                            partition=partition,
                            cpu=cpu,
                            memory=mem,
                            flops=flops)
            self.nodes.append(new_node)

        return nodenames
Пример #55
0
class TopologyNodeGroup(object):
    """Base element for in-memory representation of the propagation tree.
    Contains a nodeset, with parent-children relationships with other
    instances.
    """
    def __init__(self, nodeset=None):
        """initialize a new TopologyNodeGroup instance."""
        # nodeset this group stands for
        self.nodeset = nodeset
        # parent TopologyNodeGroup instance (None for the root)
        self.parent = None
        # children TopologyNodeGroup instances
        self._children = []
        self._children_len = 0
        # aggregated children nodeset, kept for convenience
        self._children_ns = None

    def printable_subtree(self, prefix=''):
        """recursively build a nicely presented, printable version of the
        subtree rooted at the current node
        """
        # Recursion is acceptable here as propagation trees stay shallow.
        label = str(self.nodeset)
        if self.parent is None:
            # root: no connector at all
            out = '%s\n' % label
        elif self.parent.parent is None:
            # first level: connector without any leading prefix
            connector = '`- ' if self._is_last() else '|- '
            out = '%s%s\n' % (connector, label)
        else:
            # deeper levels: grow the prefix depending on the parent position
            prefix += '   ' if self.parent._is_last() else '|  '
            connector = '`- ' if self._is_last() else '|- '
            out = '%s%s%s\n' % (prefix, connector, label)
        # append every child's own subtree
        for child in self._children:
            out += child.printable_subtree(prefix)
        return out

    def add_child(self, child):
        """attach child to this node and record this node as its parent"""
        assert isinstance(child, TopologyNodeGroup)

        if child in self._children:
            # already attached: nothing to do
            return
        child.parent = self
        self._children.append(child)
        if self._children_ns is None:
            self._children_ns = NodeSet()
        self._children_ns.add(child.nodeset)

    def clear_child(self, child, strict=False):
        """detach a child from this node"""
        try:
            self._children.remove(child)
            self._children_ns.difference_update(child.nodeset)
            if not len(self._children_ns):
                self._children_ns = None
        except ValueError:
            # unknown child: ignore unless strict mode was requested
            if strict:
                raise

    def clear_children(self):
        """drop every child reference"""
        self._children = []
        self._children_ns = None

    def children(self):
        """list of children TopologyNodeGroup instances"""
        return self._children

    def children_ns(self):
        """aggregated children nodeset (None when childless)"""
        return self._children_ns

    def children_len(self):
        """total number of nodes across the children nodesets"""
        if self._children_ns is None:
            return 0
        return len(self._children_ns)

    def _is_last(self):
        """whether this instance is the last child of its parent; used to
        pick the proper connector when displaying the subtree
        """
        return self.parent._children[-1] == self

    def __str__(self):
        """printable representation of the nodegroup"""
        return '<TopologyNodeGroup (%s)>' % str(self.nodeset)