Example #1
    def moabOwnerForIP(self, vmIP):
        """
        Deprecated: gets the owner for one specific job. This takes too many SSH
        connections (three per queried host instead of three in total).
        """
        # SSH into NEMO with each of the three accounts to check which host/IP belongs to which account
        freiburg_users = ["fr_ms1414", "fr_herten", "fr_cs97"]
        freiburg_key = self.config.get("freiburg_cloud",
                                       self.configFreiburgKey)
        freiburg_server = self.config.get("freiburg_cloud",
                                          self.configFreiburgServer)

        if self.args["verbose"]:
            print("Considering the following NEMO user accounts: {}".format(
                ', '.join(freiburg_users)))

        for user in freiburg_users:
            frSsh = ScaleTools.Ssh(freiburg_server, user, freiburg_key)
            cmd = "checkjob ALL --xml"
            frResult = frSsh.handleSshCall(call=cmd, quiet=True)
            if frResult[0] == 0:
                itemlist = minidom.parseString(
                    frResult[1]).getElementsByTagName('job')
                #print( "{} {}".format(user,frSsh.handleSshCall(call=cmd, quiet=True)[1]))
                #if the VM-IP
                for li in itemlist:
                    if li.attributes['State'].value == "Running":
                        var = li.getElementsByTagName('Variable')
                        for v in var:
                            if v.getAttribute('name') == 'VM_IP':
                                if v.childNodes[0].nodeValue == vmIP:
                                    return user
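    # Hedged illustration of the checkjob XML shape that moabOwnerForIP() relies
    # on (the sample document is made up; real `checkjob ALL --xml` output carries
    # many more attributes): every <job> element has a State attribute and
    # <Variable> children, one of which is named VM_IP.
    #
    #     sample = ('<Data><job State="Running">'
    #               '<Variable name="VM_IP">10.18.1.7</Variable>'
    #               '</job></Data>')
    #     for job in minidom.parseString(sample).getElementsByTagName('job'):
    #         for v in job.getElementsByTagName('Variable'):
    #             if v.getAttribute('name') == 'VM_IP':
    #                 print(v.childNodes[0].nodeValue)   # -> 10.18.1.7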
    def requirement(self):
        cmd = "qstat | egrep \"Q %s|R %s\" | wc -l" % (self.torqQName,
                                                       self.torqQName)

        if self.torqKey is None:
            (res1, count1) = self.countLocalQ(cmd)
        else:
            ssh = ScaleTools.Ssh(self.torqIp, "root", self.torqKey, None, 1)
            # handleSshCall returns (status, stdout, stderr); only the first two are needed here
            (res1, count1) = ssh.handleSshCall(cmd)[:2]

        # dangerous, if not all instances are run by us...

        if res1 == 0:
            self._curRequirement = int(count1) - self.qsizeOffset

            if self._curRequirement < 0:
                self._curRequirement = 0

            # apply divider
            self._curRequirement //= self.qsizeDivider

            logging.info("torq needs %d nodes. qsizeoffset is %d." %
                         (self._curRequirement, self.qsizeOffset))
            return self._curRequirement
        else:
            return None
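        # Worked example of the sizing arithmetic above (numbers are illustrative):
        # with 25 matching qstat entries, qsizeOffset = 5 and qsizeDivider = 2,
        #     req = max(25 - 5, 0) // 2   # -> 10 machines
        # a negative intermediate value is clamped to 0 before the division.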
Example #3
    def sinfoFromSlurm(self):

        slurm_server = self.config.get("slurm_req_freiburg",
                                       self.configSlurmServer)
        slurm_partition = self.config.get("slurm_req_freiburg",
                                          self.configSlurmPartition)
        slurm_user = self.config.get("slurm_req_freiburg",
                                     self.configSlurmUser)
        slurm_key = self.config.get("slurm_req_freiburg", self.configSlurmKey)
        slurm_ssh = ScaleTools.Ssh(slurm_server, slurm_user, slurm_key)

        # one comma-separated record per node: <hostname>,<CPU alloc/idle/other/total>,<node state>
        cmd = (
            "sinfo -h -l -N -p {} --format %n,%C,%T").format(slurm_partition)
        # this covers all nodes, including those that are down
        results_sinfo = slurm_ssh.handleSshCall(call=cmd, quiet=True)
        slurm_result_status = results_sinfo[0]
        slurm_result_sinfos = results_sinfo[1] + "\n"
        slurm_ssh_error = str(results_sinfo[2])

        # the sinfo output above covers ALL machines, even those that are down;
        # assemble slurm_result in the shape callers expect:
        slurm_result = (slurm_result_status, slurm_result_sinfos,
                        slurm_ssh_error)
        return slurm_result
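    # Hedged illustration of what sinfoFromSlurm() hands back (values are made
    # up): the second tuple element is the raw comma-separated sinfo dump, one
    # record per node in the partition.
    #     (0,
    #      "host-10-18-1-12,4/0/0/4,allocated\nhost-10-18-1-50,0/0/4/4,down*\n",
    #      "None")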
Example #4
    def runCommandOnPbs(self, cmd):
        if self.torqKey is None:
            rc, stdout, stderr = ScaleTools.Shell.executeCommand(cmd)
            return rc, stdout
        else:
            ssh = ScaleTools.Ssh(self.torqIp, "root", self.torqKey, None, 1)
            return ssh.handleSshCall(cmd)
    def runCommandOnGridEngineServer(self, cmd):

        ssh = ScaleTools.Ssh(self.getConfig(self.ConfigGridEngineIp), "root",
                             self.getConfig(self.ConfigGridEngineKey), None, 1)

        ssh.copyToRemote("gridengineconf.py")

        return ssh.handleSshCall(cmd)
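    # Hedged usage sketch for the wrappers above: both are assumed to hand back
    # the ScaleTools call result, whose first two fields are exit status and stdout.
    #     status, output = self.runCommandOnGridEngineServer("qstat -f")[:2]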
Example #6
    def getSusers(self):
        slurm_server = self.config.get("slurm_req_freiburg",
                                       self.configSlurmServer)
        slurm_partition = self.config.get("slurm_req_freiburg",
                                          self.configSlurmPartition)
        slurm_user = self.config.get("slurm_req_freiburg",
                                     self.configSlurmUser)
        slurm_key = self.config.get("slurm_req_freiburg", self.configSlurmKey)
        slurm_ssh = ScaleTools.Ssh(slurm_server, slurm_user, slurm_key)

        results_status = 0
        results_squeues = ""
        results_ssh_error = ""
        for slurm_partition in [
                "nemo_vm_atlsch", "nemo_vm_atljak", "nemo_vm_atlher"
        ]:
            cmd = ("squeue -p {} -h --format %u,%T,%C").format(slurm_partition)
            results_squeue_q = slurm_ssh.handleSshCall(call=cmd, quiet=False)
            results_status += results_squeue_q[0]
            results_squeues += results_squeue_q[1]
            results_ssh_error += str(results_squeue_q[2])
        results_squeue = (results_status, results_squeues, results_ssh_error)
        print(results_squeue)

        if results_squeue[0] != 0:
            raise ValueError(
                "SSH connection to Slurm collector could not be established.")

        userlist = self.parse_squeue(results_squeue[1])

        utext = []
        utext.append("\n")

        utext.append("User".ljust(10) + "RUNNING".ljust(10) +
                     "PENDING".ljust(10))
        utext.append(30 * "=")

        _statepr = {}

        for u in self.listOfUsers:
            _us = u.ljust(10)
            for _s in ["RUNNING", "PENDING"]:
                _statepr[_s] = 10 * " "
                if _s in self.userDict[u].keys():
                    _statepr[_s] = str(self.userDict[u][_s]).ljust(10)
            utext.append(_us + _statepr["RUNNING"] + _statepr["PENDING"])
        for i in utext:
            print(i)
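    # What follows is a hypothetical sketch of the parser used above; the adapter's
    # real parse_squeue() is not part of this excerpt and presumably also fills
    # self.listOfUsers / self.userDict, which getSusers() prints.
    @staticmethod
    def parse_squeue_sketch(squeue_output):
        """Illustrative sketch only. Assuming records of the form
        "<user>,<STATE>,<CPUs>" (squeue format %u,%T,%C), this counts jobs per
        user and state; the real implementation may aggregate CPUs instead."""
        userDict = {}
        for line in squeue_output.splitlines():
            fields = line.split(',')
            if len(fields) != 3:
                continue
            user, state, _cpus = fields
            userDict.setdefault(user, {})
            userDict[user][state] = userDict[user].get(state, 0) + 1
        return sorted(userDict.keys()), userDict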
Example #7
    def fillHostDict(self):
        """
        Fills a dictionary with the { hostname : { "owner": owner, "status" : status} }
        from the information gathered from slurm (hostnames, status) and from nemo/Moab (owner)
        """
        for nodename in self.listOfHosts:
            if nodename not in self.hostDict.keys():
                self.hostDict[nodename] = {}

        freiburg_users = ["fr_ms1414", "fr_herten", "fr_cw97"]
        freiburg_key = self.config.get("freiburg_cloud",
                                       self.configFreiburgKey)
        freiburg_server = self.config.get("freiburg_cloud",
                                          self.configFreiburgServer)

        print("Considering the following NEMO user accounts: {}".format(
            ', '.join(freiburg_users)))

        for user in freiburg_users:
            frSsh = ScaleTools.Ssh(freiburg_server, user, freiburg_key)
            cmd = "checkjob ALL --xml"
            frResult = frSsh.handleSshCall(call=cmd, quiet=False)
            if self.args["verbose"]:
                print("trying to log into {} for account {}".format(
                    freiburg_server, user))
            if frResult[0] != 0:
                print(
                    "SSH connection to NEMO via {} could not be established, error {}."
                    .format(freiburg_server, frResult[0]))
                for nodename in self.listOfHosts:
                    self.hostDict[nodename]["owner"] = "unknown"

            elif frResult[0] == 0:
                itemlist = minidom.parseString(
                    frResult[1]).getElementsByTagName('job')
                for li in itemlist:
                    if li.attributes['State'].value == "Running":
                        var = li.getElementsByTagName('Variable')
                        for v in var:
                            if v.getAttribute('name') == 'VM_IP':
                                vmIP = v.childNodes[0].nodeValue
                                hostname = "host-" + vmIP.replace('.', '-')
                                if hostname in self.hostDict.keys():
                                    self.hostDict[hostname]["owner"] = user

        return self.hostDict
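    # Illustration of the hostname mapping used in fillHostDict() above: a Moab
    # job variable VM_IP of "10.18.1.7" corresponds to the Slurm node name
    #     "host-" + "10.18.1.7".replace('.', '-')   # -> 'host-10-18-1-7'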
    def condorList(self):
        # type: () -> Defaultdict(List)
        """Return list of condor machines {machine name : [[state, activity], [state, activity], ..]}

        :return: condor_machines
        """

        # load the connection settings from config
        condor_server = self.getConfig(self.configCondorServer)
        condor_user = self.getConfig(self.configCondorUser)
        condor_key = self.getConfig(self.configCondorKey)
        condor_constraint = self.getConfig(self.configCondorConstraint)
        condor_ssh = ScaleTools.Ssh(condor_server, condor_user, condor_key)

        cmd = ("condor_status -constraint '%s' %s" %
               (condor_constraint, self._query_format_string))

        # get a list of the condor machines (SSH)
        condor_result = condor_ssh.handleSshCall(call=cmd, quiet=True)
        condor_ssh.debugOutput(self.logger, "EKP-manage", condor_result)

        if condor_result[0] != 0:
            raise ValueError(
                "SSH connection to HTCondor collector could not be established."
            )
        elif self.collector_error_string in condor_result[1]:
            raise ValueError("Collector(s) didn't answer.")

        # prepare list of condor machines
        tmp_condor_machines = self.regex_queue_parser.findall(condor_result[1])

        # transform list into dictionary with one list per slot
        # {machine name : [[state, activity], [state, activity], ..]}
        condor_machines = defaultdict(list)
        if len(tmp_condor_machines) > 1 and any(tmp_condor_machines[0]):
            for machine_name, state, activity in tmp_condor_machines:
                condor_machines[machine_name].append([state, activity])

        return condor_machines
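    # Hedged illustration of the structure condorList() returns (names and states
    # are made up): a machine exposing two slots, one claimed and busy, one
    # unclaimed and idle, would show up as
    #     {'vm-host-1.example': [['Claimed', 'Busy'], ['Unclaimed', 'Idle']]}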
Example #9
    def slurmList(self):
        # type: () -> Defaultdict(List)
        """Return list of slurm machines {machine name : [[state, activity], [state, activity], ..]}

        :return: slurm_machines
        """

        # load the connection settings from config
        slurm_server = self.getConfig(self.configSlurmServer)
        slurm_user = self.getConfig(self.configSlurmUser)
        slurm_key = self.getConfig(self.configSlurmKey)
        slurm_ssh = ScaleTools.Ssh(slurm_server, slurm_user, slurm_key)
        slurm_partition = self.getConfig(self.configSlurmPartition)

        self.logger.debug("Getting slurmList for jobs requested on partition {}".format(slurm_partition))

        # This lists nodes in use by jobs of this queue; it ignores most "down", "drained"
        # and "draining" machines. Alternatively one could query only draining or drained
        # machines; it may not matter which service cancels the job first.
        # First determine the list of nodes, then issue one sinfo SSH call per node.
        # Find all nodes assigned a job in this particular slurm_partition queue
        cmd = ("squeue -p {} -h --format=%N | sort | uniq".format(slurm_partition))
        nodes_from_squeue = slurm_ssh.handleSshCall(call=cmd, quiet=True)
        # handleSshCall returns a (status, stdout, stderr) tuple; an empty stdout
        # means no node currently runs a job of this partition
        if len(nodes_from_squeue[1].strip()) == 0:
            nodes_this_partition = []
        else:
            nodes_this_partition = nodes_from_squeue[1].split('\n')

        self.logger.debug("Querying information for these nodes {}".format( nodes_this_partition))

        slurm_result_status = 0
        slurm_result_sinfos = ""
        slurm_ssh_error = ""
        for nn in nodes_this_partition:
            if nn == "":
                continue
            # for each of these nodes, query its sinfo status in the form
            #     <hostname>,<CPU state: allocated/idle/other/total>,<host state>
            cmd = ("sinfo -h -l -N -p {} -n {} --format %n,%C,%T").format(slurm_partition, nn)
            slurm_result_nn = slurm_ssh.handleSshCall(call=cmd, quiet=True)
            slurm_result_status += slurm_result_nn[0]
            slurm_result_sinfos += slurm_result_nn[1] + "\n"
            slurm_ssh_error += str(slurm_result_nn[2])

        # put slurm_result together in the shape callers expect:
        slurm_result = (slurm_result_status, slurm_result_sinfos, slurm_ssh_error)
        self.logger.debug("slurm_result: {}".format(slurm_result))

        # get a list of the slurm machines (SSH)
        slurm_ssh.debugOutput(self.logger, "EKP-manage", slurm_result)

        if slurm_result[0] != 0:
            raise ValueError("SSH connection to Slurm collector could not be established.")
        #elif self.collector_error_string in slurm_result[1]:
        #    raise ValueError("Collector(s) didn't answer.")
        
        # Example output: <hostname>,<CPU state: allocated/idle/other/total>,<host state>
        # host-10-18-1-0,0/0/4/4,down*

        # prepare list of slurm machines
        self.logger.debug("Trying to parse sinfo output")
        tmp_slurm_machines = self.parse_sinfo_output(slurm_result[1])

        slurm_machines = defaultdict(list)
        for node in tmp_slurm_machines:

            # ignore invalid lines
            if len(node) != 3:
                continue

            machine_name = node[0]
            cpus = node[1].split('/')
            state = node[2]

            # ignore down machines: host-10-18-1-50,0/0/4/4,down*
            if "down" in state:
                continue

            # ignore machines with no allocated and idle cpus
            # if int(cpus[0]) == 0 and int(cpus[1]) == 0:
            #     continue

            allocatedCPUs = int(cpus[0])
            idleCPUs = int(cpus[1])
            totalCPUs = int(cpus[3])

            # expand the per-node CPU counters into one [state, activity] entry per core
            for slot in range(totalCPUs):
                if allocatedCPUs > 0:
                    slurm_machines[machine_name].append(['allocated', None])
                    allocatedCPUs -= 1
                elif idleCPUs > 0:
                    slurm_machines[machine_name].append(['idle', None])
                    idleCPUs -= 1
                elif "draining" in state:
                    slurm_machines[machine_name].append(['draining', None])
                elif "drained" in state:
                    slurm_machines[machine_name].append(['drained', None])
                else:
                    self.logger.warning(
                        "Unexpected CPU state on %s (node state %s); this should not happen.",
                        machine_name, state)

        return slurm_machines
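    # The following is a hypothetical sketch of the helper used above; the real
    # parse_sinfo_output() is not part of this excerpt.
    @staticmethod
    def parse_sinfo_output_sketch(sinfo_blob):
        """Illustrative sketch only. slurmList() merely requires one
        (hostname, "alloc/idle/other/total", state) triple per valid sinfo line,
        e.g. "host-10-18-1-0,0/4/0/4,idle"."""
        return [line.split(',') for line in sinfo_blob.splitlines() if line.strip()]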
    def requirement(self):
        ssh = ScaleTools.Ssh(host=self.getConfig(self.configCondorServer),
                             username=self.getConfig(self.configCondorUser),
                             key=self.getConfig(self.configCondorKey))

        # Target.Requirements can't be filtered with -constraints since it would require ClassAd based regex matching.
        # TODO: Find a more generic way to match resources/requirements (condor_q -slotads ??)
        # cmd_idle = "condor_q -constraint 'JobStatus == 1' -slotads slotads_bwforcluster " \
        #            "-analyze:summary,reverse | tail -n1 | awk -F ' ' " \
        #            "'{print $3 "\n" $4}'| sort -n | head -n1"
        constraint = "( %s ) && ( %s )" % (self._query_constraints,
                                           self.getConfig(
                                               self.configCondorConstraint))

        cmd = ("condor_q -global -constraint '%s' %s" %
               (constraint, self._query_format_string))
        result = ssh.handleSshCall(call=cmd, quiet=True)
        if result[0] != 0:
            self.logger.warning("Could not get HTCondor queue status! %d: %s" %
                                (result[0], result[2]))
            return None
        elif any(error_string in result[1]
                 for error_string in self._CLI_error_strings):
            self.logger.warning("condor_q request timed out.")
            return None

        queue_line = (entry.split(",", 3)
                      for entry in str(result[1]).splitlines())
        converted_line = ((int(status), int(cores), requirement)
                          for status, cores, requirement in queue_line)
        if self.getConfig(self.configCondorRequirement):
            # TODO: We could use ClassAd bindings, to check requirement(s)
            filtered_line = (
                (status, cores)
                for status, cores, requirement in converted_line
                if self.getConfig(self.configCondorRequirement) in requirement)
        else:
            filtered_line = ((status, cores)
                             for status, cores, requirement in converted_line)

        required_cpus_total = 0
        required_cpus_idle_jobs = 0
        required_cpus_running_jobs = 0
        try:
            for job_status, requested_cpus in filtered_line:
                required_cpus_total += requested_cpus
                if job_status == self.condorStatusIdle:
                    required_cpus_idle_jobs += requested_cpus
                elif job_status == self.condorStatusRunning:
                    required_cpus_running_jobs += requested_cpus
        except ValueError:
            # This error should only occur, if the result was empty AND CondorRequirement is initial
            required_cpus_total = 0
            required_cpus_idle_jobs = 0
            required_cpus_running_jobs = 0

        self.logger.debug(
            "HTCondor queue: Idle: %d; Running: %d." %
            (required_cpus_idle_jobs, required_cpus_running_jobs))

        # cores->machines: machine definition required for RequirementAdapter
        n_cores = -int(
            self.getConfig(
                self.configMachines)[self.getNeededMachineType()]["cores"])
        self._curRequirement = -(required_cpus_total // n_cores)

        with Logging.JsonLog() as json_log:
            json_log.addItem(self.getNeededMachineType(), "jobs_idle",
                             required_cpus_idle_jobs)
            json_log.addItem(self.getNeededMachineType(), "jobs_running",
                             required_cpus_running_jobs)

        return self._curRequirement
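        # The cores->machines step above is a ceiling division written with negated
        # operands (numbers are illustrative): for 4-core machines and 10 requested CPUs,
        #     n_cores = -4
        #     -(10 // n_cores)   # -> 3 machines, i.e. ceil(10 / 4)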
Example #11
    def requirement(self):
        ssh = ScaleTools.Ssh(host=self.getConfig(self.configSlurmServer),
                             username=self.getConfig(self.configSlurmUser),
                             key=self.getConfig(self.configSlurmKey))

        # Target.Requirements can't be filtered with -constraints since it would require ClassAd based regex matching.
        # TODO: Find a more generic way to match resources/requirements (condor_q -slotads ??)
        # cmd_idle = "condor_q -constraint 'JobStatus == 1' -slotads slotads_bwforcluster " \
        #            "-analyze:summary,reverse | tail -n1 | awk -F ' ' " \
        #            "'{print $3 "\n" $4}'| sort -n | head -n1"
        #constraint = "( %s ) && ( %s )" % (self._query_constraints, self.getConfig(self.configCondorConstraint))

        #cmd = ("condor_q -global -allusers -nobatch -constraint '%s' %s" % (constraint, self._query_format_string))
        #cmd = 'squeue -p nemo_vm_atlsch --noheader --format="%T %r %c"'
        self.logger.info("Checking requirements in partition {}".format(
            self.getConfig(self.configSlurmPartition)))
        cmd = 'squeue -p {} --noheader --format="%T %r %c"'.format(
            self.getConfig(self.configSlurmPartition))
        result = ssh.handleSshCall(call=cmd, quiet=True)
        if result[0] != 0:
            self.logger.warning("Could not get Slurm queue status! %d: %s" %
                                (result[0], result[2]))
            return None
        elif any(error_string in result[1]
                 for error_string in self._CLI_error_strings):
            self.logger.warning("squeue request timed out.")
            return None

        required_cpus_total = 0
        required_cpus_idle_jobs = 0
        required_cpus_running_jobs = 0
        cpus_dependency_jobs = 0

        for line in result[1].splitlines():
            values = line.split()
            #self.logger.debug(values)

            if len(values) != 3:
                continue

            if "Dependency" in values[1]:
                cpus_dependency_jobs = cpus_dependency_jobs + int(values[2])
                continue
            if "PartitionTimeLimit" in values[1]:
                continue
            elif "PENDING" in values[0]:
                required_cpus_total = required_cpus_total + int(values[2])
                required_cpus_idle_jobs = required_cpus_idle_jobs + int(
                    values[2])
                continue
            elif "RUNNING" in values[0]:
                required_cpus_total = required_cpus_total + int(values[2])
                required_cpus_running_jobs = required_cpus_running_jobs + int(
                    values[2])
                continue
            else:
                self.logger.warning("unknown job state: %s. Ignoring.",
                                    values[0])

        self.logger.debug(
            "Slurm queue: Idle: %d; Running: %d. in partition: %s." %
            (required_cpus_idle_jobs, required_cpus_running_jobs,
             self.getConfig(self.configSlurmPartition)))

        # cores->machines: machine definition required for RequirementAdapter
        n_cores = -int(
            self.getConfig(
                self.configMachines)[self.getNeededMachineType()]["cores"])
        self._curRequirement = -(required_cpus_total // n_cores)

        self.logger.debug("Required CPUs total=%s" % required_cpus_total)
        self.logger.debug("Required CPUs idle Jobs=%s" %
                          required_cpus_idle_jobs)
        self.logger.debug("Required CPUs running Jobs=%s" %
                          required_cpus_running_jobs)
        self.logger.debug("CPUs dependency Jobs=%s" % cpus_dependency_jobs)
        with Logging.JsonLog() as json_log:
            json_log.addItem(self.getNeededMachineType(), "jobs_idle",
                             required_cpus_idle_jobs)
            json_log.addItem(self.getNeededMachineType(), "jobs_running",
                             required_cpus_running_jobs)

        return self._curRequirement
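        # Example of the squeue records consumed above ('%T %r %c' = state, reason,
        # CPUs; the sample lines are made up):
        #     "PENDING Resources 4"   -> adds 4 CPUs to the total and idle counters
        #     "RUNNING None 8"        -> adds 8 CPUs to the total and running counters
        #     "PENDING Dependency 2"  -> only counted in cpus_dependency_jobs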
Example #12
    def manage(self):
        """manage method is called every manage cycle.
        this is the right place to survey the booting status and set it to StatusUp if machine is up"""

        """
        logging.info("Querying OpenNebula Server for running instances...")
        
        vm_pool = self.VMPoolInfo()
        print vm_pool
        
        running_instances = filter(lambda x: self.hostname_prefix in x[1]["NAME"], vm_pool[1:])
        print running_instances
        
        running_instances2 = (filter(lambda x: "ubuntu" in x[1]["NAME"], vm_pool[1:]))[0]
        print running_instances2
        """

        myMachines = self.getSiteMachines()

        # print myMachines
        # {'8e661aac-fc4e-450f-9cbb-e57ec6e4adb2': {'status': 'working', 'site_type': 'one', 'hostname': '141.52.208.174', 'ssh_key': 'one_host_key', 'one_vmid': 910, 'status_last_update': datetime.datetime(2011, 1, 13, 15, 48, 39, 328084), 'machine_type': 'euca-default', 'site': 'one_site_scc'}}

        for mid in myMachines:
            if myMachines[mid]["status"] == "booting":
                vm_info = self.VMInfo(myMachines[mid]["one_vmid"])
                logging.debug(myMachines[mid]["vpn_ip"])
                logging.debug(myMachines[mid]["vpn_cert_is_valid"])
                logging.debug(myMachines[mid]["vpn_cert"])
                if vm_info[0] is True:

                    if vm_info[1]["STATE"] == "3" and vm_info[1]["LCM_STATE"] == "3":
                        if self.checkIfMachineIsUp(mid):
                            vpn = ScaleTools.Vpn()

                            if myMachines[mid]["vpn_cert_is_valid"] is None:
                                if vpn.makeCertificate(myMachines[mid]["vpn_cert"]) == 0:
                                    myMachines[mid]["vpn_cert_is_valid"] = True

                            if myMachines[mid]["vpn_cert_is_valid"] is True and \
                                            myMachines[mid]["vpn_ip"] is None:
                                if (vpn.copyCertificate(myMachines[mid]["vpn_cert"],
                                                        myMachines[mid]) == 0):
                                    if (vpn.connectVPN(myMachines[mid]["vpn_cert"],
                                                       myMachines[mid]) == 0):
                                        (res, ip) = vpn.getIP(myMachines[mid])
                                        logging.debug(res)
                                        logging.debug(ip)
                                        if res == 0 and ip != "":
                                            myMachines[mid]["vpn_ip"] = ip
                                        else:
                                            logging.debug("getting VPN IP failed!!")

                            if (myMachines[mid]["vpn_cert_is_valid"] is True and myMachines[mid][
                                "vpn_ip"] is not None):
                                # if( vpn.revokeCertificate(myMachines[k]["vpn_cert"]) == 0):
                                #    myMachines[k]["vpn_cert_is_valid"] = False
                                self.mr.updateMachineStatus(mid, self.mr.statusUp)

                            logging.debug(myMachines[mid]["vpn_ip"])
                            logging.debug(myMachines[mid]["vpn_cert_is_valid"])
                            logging.debug(myMachines[mid]["vpn_cert"])

                        else:
                            self.checkForDeadMachine(mid)
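        # Note on the magic numbers in the STATE/LCM_STATE check above: assuming the
        # usual OpenNebula numbering, VM STATE "3" means ACTIVE and LCM_STATE "3"
        # means RUNNING, so that branch fires once the hypervisor reports the VM as
        # actually running.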