def moabOwnerForIP(self, vmIP):
    """Return the NEMO account whose running job carries the given VM IP.

    Deprecated: resolves the owner for a single IP only and opens a fresh
    SSH connection per account on every call.

    Each account in the hard-coded list is queried with
    ``checkjob ALL --xml``; the first account that has a job in state
    "Running" with a ``VM_IP`` variable equal to ``vmIP`` is returned.
    Accounts whose SSH call reports a non-zero status are skipped.

    :param vmIP: IP string compared against each job's VM_IP variable
    :return: the matching account name, or None if no account matches
    """
    # NOTE(review): fillHostDict lists "fr_cw97" where this list has
    # "fr_cs97" - confirm which account name is correct.
    accounts = ["fr_ms1414", "fr_herten", "fr_cs97"]
    ssh_key = self.config.get("freiburg_cloud", self.configFreiburgKey)
    ssh_host = self.config.get("freiburg_cloud", self.configFreiburgServer)

    if self.args["verbose"]:
        print("Considering the following NEMO user accounts: {}".format(
            ', '.join(accounts)))

    checkjob_cmd = "checkjob ALL --xml"
    for account in accounts:
        connection = ScaleTools.Ssh(ssh_host, account, ssh_key)
        reply = connection.handleSshCall(call=checkjob_cmd, quiet=True)
        if reply[0] != 0:
            # this account could not be queried; try the next one
            continue

        jobs = minidom.parseString(reply[1]).getElementsByTagName('job')
        for job in jobs:
            if job.attributes['State'].value != "Running":
                continue
            for variable in job.getElementsByTagName('Variable'):
                if variable.getAttribute('name') != 'VM_IP':
                    continue
                if variable.childNodes[0].nodeValue == vmIP:
                    return account
    return None
def requirement(self):
    """Return the number of nodes torque currently needs, or None on failure.

    Counts the jobs of queue ``self.torqQName`` that qstat lists as queued
    (Q) or running (R), either locally through ``self.countLocalQ`` (when
    no torque key is configured) or over SSH as root on ``self.torqIp``.
    The count is reduced by ``self.qsizeOffset``, clamped at zero and
    floor-divided by ``self.qsizeDivider``; the result is stored in
    ``self._curRequirement``.

    :return: the computed requirement, or None if the count command
             reported a non-zero status or its output is not an integer
    """
    cmd = "qstat | egrep \"Q %s|R %s\" | wc -l" % (self.torqQName,
                                                     self.torqQName)
    if self.torqKey is None:
        (res1, count1) = self.countLocalQ(cmd)
    else:
        ssh = ScaleTools.Ssh(self.torqIp, "root", self.torqKey, None, 1)
        (res1, count1) = ssh.handleSshCall(cmd)

    if res1 != 0:
        return None

    # An empty or garbled reply must be treated like a failed call instead
    # of crashing the caller with a ValueError.
    try:
        queued = int(count1)
    except (TypeError, ValueError):
        logging.warning("Could not parse torq queue size from %r.", count1)
        return None

    # dangerous, if not all instances are run by us...
    # clamp at zero, then apply divider
    self._curRequirement = max(queued - self.qsizeOffset,
                               0) // self.qsizeDivider
    logging.info("torq needs %d nodes. qsizeoffset is %d.",
                 self._curRequirement, self.qsizeOffset)
    return self._curRequirement
def sinfoFromSlurm(self):
    """Query sinfo for every node of the configured Slurm partition.

    Connection settings and the partition name are read from the
    "slurm_req_freiburg" config section. The sinfo output covers ALL
    machines of the partition, including those that are down.

    The command is sent exactly once; previously the same call was issued
    twice and the first reply was thrown away, costing an extra SSH round
    trip per invocation.

    :return: tuple (status, sinfo output with a trailing newline appended,
             SSH error converted to str)
    """
    section = "slurm_req_freiburg"
    slurm_server = self.config.get(section, self.configSlurmServer)
    slurm_partition = self.config.get(section, self.configSlurmPartition)
    slurm_user = self.config.get(section, self.configSlurmUser)
    slurm_key = self.config.get(section, self.configSlurmKey)
    slurm_ssh = ScaleTools.Ssh(slurm_server, slurm_user, slurm_key)

    # one line per node: <hostname>,<CPU state counts>,<node state>
    cmd = (
        "sinfo -h -l -N -p {} --format %n,%C,%T ").format(slurm_partition)
    results_sinfo = slurm_ssh.handleSshCall(call=cmd, quiet=True)

    # put the result together in the way callers need it
    slurm_result_status = results_sinfo[0]
    slurm_result_sinfos = results_sinfo[1] + "\n"
    slurm_ssh_error = str(results_sinfo[2])
    return (slurm_result_status, slurm_result_sinfos, slurm_ssh_error)
def runCommandOnPbs(self, cmd):
    """Execute ``cmd`` for the PBS/torque server and return the call result.

    With a torque key configured the command is sent over SSH as root to
    ``self.torqIp`` and the value of ``handleSshCall`` is passed through
    unchanged. Without a key the command runs through
    ``ScaleTools.Shell.executeCommand`` and only the return code and
    stdout are returned.

    :param cmd: command line to execute
    :return: (rc, stdout) for the local case, otherwise whatever
             ``handleSshCall`` returns
    """
    if self.torqKey is not None:
        remote = ScaleTools.Ssh(self.torqIp, "root", self.torqKey, None, 1)
        return remote.handleSshCall(cmd)

    # local execution: stderr is intentionally not handed to the caller
    exit_code, output, _stderr = ScaleTools.Shell.executeCommand(cmd)
    return exit_code, output
def runCommandOnGridEngineServer(self, cmd):
    """Run ``cmd`` as root on the configured grid engine server via SSH.

    Before the command is executed, "gridengineconf.py" is copied to the
    remote host on every call.

    :param cmd: command line to execute remotely
    :return: the result of ``handleSshCall`` for ``cmd``
    """
    server_ip = self.getConfig(self.ConfigGridEngineIp)
    server_key = self.getConfig(self.ConfigGridEngineKey)
    connection = ScaleTools.Ssh(server_ip, "root", server_key, None, 1)

    # NOTE(review): presumably cmd relies on this helper script being
    # present remotely - confirm
    connection.copyToRemote("gridengineconf.py")
    return connection.handleSshCall(cmd)
def getSusers(self):
    """Print a per-user table of RUNNING and PENDING counts from squeue.

    squeue is queried over SSH once for each of the hard-coded NEMO VM
    partitions; statuses are summed and outputs/errors concatenated. The
    combined output is passed to ``self.parse_squeue`` and a table is
    printed from ``self.listOfUsers`` and ``self.userDict``.

    Changes compared to the previous version: print statements were turned
    into single-argument print calls (same output on Python 2, valid on
    Python 3), and the configured partition is no longer read because the
    loop over the hard-coded partitions shadowed it anyway.

    :raises ValueError: if the summed squeue status is not zero
    """
    section = "slurm_req_freiburg"
    slurm_server = self.config.get(section, self.configSlurmServer)
    slurm_user = self.config.get(section, self.configSlurmUser)
    slurm_key = self.config.get(section, self.configSlurmKey)
    slurm_ssh = ScaleTools.Ssh(slurm_server, slurm_user, slurm_key)

    results_status = 0
    results_squeues = ""
    results_ssh_error = ""
    for partition in ("nemo_vm_atlsch", "nemo_vm_atljak", "nemo_vm_atlher"):
        # one line per job: <user>,<job state>,<cpus>
        cmd = ("squeue -p {} -h --format %u,%T,%C").format(partition)
        results_squeue_q = slurm_ssh.handleSshCall(call=cmd, quiet=False)
        results_status += results_squeue_q[0]
        results_squeues += results_squeue_q[1]
        results_ssh_error += str(results_squeue_q[2])

    results_squeue = (results_status, results_squeues, results_ssh_error)
    print(results_squeue)
    if results_squeue[0] != 0:
        raise ValueError(
            "SSH connection to Slurm collector could not be established.")

    # NOTE(review): the return value was never used; presumably
    # parse_squeue fills self.listOfUsers / self.userDict as a side
    # effect - confirm.
    self.parse_squeue(results_squeue[1])

    utext = []
    utext.append("\n")
    utext.append("User".ljust(10) + "RUNNING".ljust(10) +
                 "PENDING".ljust(10))
    utext.append(30 * "=")
    for user in self.listOfUsers:
        counts = self.userDict[user]
        row = user.ljust(10)
        for state in ("RUNNING", "PENDING"):
            # blank column when the user has no jobs in this state
            if state in counts:
                row += str(counts[state]).ljust(10)
            else:
                row += 10 * " "
        utext.append(row)

    for line in utext:
        print(line)
def fillHostDict(self):
    """Fill ``self.hostDict`` with the owner of every known host.

    The dictionary has the form
    { hostname : { "owner": owner, "status" : status } }; hostnames come
    from ``self.listOfHosts`` and the owner from NEMO/Moab
    (``checkjob ALL --xml`` run once per account).

    For every running job the ``VM_IP`` variable is turned into a hostname
    ("host-" + IP with dots replaced by dashes); if that hostname is in
    ``self.hostDict`` its owner is set to the queried account.

    If the SSH call for an account fails, hosts whose owner has not been
    resolved during THIS invocation are marked "unknown". Previously a
    failing account reset the owner of every host, wiping the owners found
    through accounts that had been queried successfully before it.

    :return: ``self.hostDict``
    """
    for nodename in self.listOfHosts:
        if nodename not in self.hostDict:
            self.hostDict[nodename] = {}

    freiburg_users = ["fr_ms1414", "fr_herten", "fr_cw97"]
    freiburg_key = self.config.get("freiburg_cloud", self.configFreiburgKey)
    freiburg_server = self.config.get("freiburg_cloud",
                                      self.configFreiburgServer)
    print("Considering the following NEMO user accounts: {}".format(
        ', '.join(freiburg_users)))

    cmd = "checkjob ALL --xml"
    # hosts whose owner was determined during this call
    resolved_hosts = set()
    for user in freiburg_users:
        # announce the attempt before it is made
        if self.args["verbose"]:
            print("trying to log into {} for account {}".format(
                freiburg_server, user))
        frSsh = ScaleTools.Ssh(freiburg_server, user, freiburg_key)
        frResult = frSsh.handleSshCall(call=cmd, quiet=False)

        if frResult[0] != 0:
            print(
                "SSH connection to NEMO via {} could not be established, error {}."
                .format(freiburg_server, frResult[0]))
            for nodename in self.listOfHosts:
                if nodename not in resolved_hosts:
                    self.hostDict[nodename]["owner"] = "unknown"
            continue

        itemlist = minidom.parseString(
            frResult[1]).getElementsByTagName('job')
        for li in itemlist:
            if li.attributes['State'].value != "Running":
                continue
            for v in li.getElementsByTagName('Variable'):
                if v.getAttribute('name') != 'VM_IP':
                    continue
                if not v.childNodes:
                    # empty <Variable name="VM_IP"/> carries no address
                    continue
                vmIP = v.childNodes[0].nodeValue
                hostname = "host-" + vmIP.replace('.', '-')
                if hostname in self.hostDict:
                    self.hostDict[hostname]["owner"] = user
                    resolved_hosts.add(hostname)
    return self.hostDict
def condorList(self):
    # type: () -> Defaultdict(List)
    """Query condor_status over SSH and group the slots by machine.

    The result maps each machine name to one ``[state, activity]`` pair
    per slot:
    {machine name : [[state, activity], [state, activity], ..]}

    :return: defaultdict(list) of slots per machine
    :raises ValueError: if the SSH call reports a non-zero status or the
                        output contains ``self.collector_error_string``
    """
    # connection settings come from the adapter configuration
    server = self.getConfig(self.configCondorServer)
    user = self.getConfig(self.configCondorUser)
    key = self.getConfig(self.configCondorKey)
    constraint = self.getConfig(self.configCondorConstraint)
    connection = ScaleTools.Ssh(server, user, key)

    cmd = "condor_status -constraint '%s' %s" % (
        constraint, self._query_format_string)

    # fetch the machine list from the collector (SSH)
    reply = connection.handleSshCall(call=cmd, quiet=True)
    connection.debugOutput(self.logger, "EKP-manage", reply)

    if reply[0] != 0:
        raise ValueError(
            "SSH connection to HTCondor collector could not be established."
        )
    if self.collector_error_string in reply[1]:
        raise ValueError("Collector(s) didn't answer.")

    parsed_slots = self.regex_queue_parser.findall(reply[1])

    slots_by_machine = defaultdict(list)
    # NOTE(review): with "> 1" a reply containing exactly one slot yields
    # an empty result - confirm this is intended.
    if not (len(parsed_slots) > 1 and any(parsed_slots[0])):
        return slots_by_machine

    for entry in parsed_slots:
        machine_name, state, activity = entry
        slots_by_machine[machine_name].append([state, activity])
    return slots_by_machine
def slurmList(self):
    # type: () -> Defaultdict(List)
    """Return the slurm machines that currently serve jobs of the partition.

    The result maps each machine name to one ``[state, None]`` entry per
    CPU:
    {machine name : [[state, activity], [state, activity], ..]}

    First squeue lists the nodes that have a job of the configured
    partition assigned (this skips most "down", "drained" and "draining"
    machines), then sinfo is called once per node. Nodes whose state
    contains "down" are dropped. Lines that do not have three fields or
    whose CPU field cannot be parsed are ignored.

    Changes compared to the previous version: a malformed CPU field no
    longer raises IndexError/ValueError (the line is skipped with a
    warning), and the print statement became a print call so the module
    also parses on Python 3.

    :return: defaultdict(list) of per-CPU slot entries per machine
    :raises ValueError: if the summed status of the sinfo calls is not zero
    """
    # load the connection settings from config
    slurm_server = self.getConfig(self.configSlurmServer)
    slurm_user = self.getConfig(self.configSlurmUser)
    slurm_key = self.getConfig(self.configSlurmKey)
    slurm_ssh = ScaleTools.Ssh(slurm_server, slurm_user, slurm_key)
    slurm_partition = self.getConfig(self.configSlurmPartition)
    self.logger.debug(
        "Getting slurmList for jobs requested on partition {}".format(
            slurm_partition))

    # Find all nodes assigned a job in this particular partition queue.
    # NOTE(review): the status of this squeue call is not checked; a
    # failed call looks like "no nodes" - confirm this is acceptable.
    cmd = ("squeue -p {} -h --format=%N | sort | uniq".format(
        slurm_partition))
    nodes_from_squeue = slurm_ssh.handleSshCall(call=cmd, quiet=True)
    if len(nodes_from_squeue) <= 1:
        nodes_this_partition = []
    else:
        nodes_this_partition = nodes_from_squeue[1].split('\n')
    self.logger.debug("Querying information for these nodes {}".format(
        nodes_this_partition))

    slurm_result_status = 0
    slurm_result_sinfos = ""
    slurm_ssh_error = ""
    for nn in nodes_this_partition:
        if nn == "":
            continue
        # for each of these nodes, query its sinfo status in the form
        # <hostname>,<CPU-State: allocated/idle/other/total>,<host state>
        cmd = ("sinfo -h -l -N -p {} -n {} --format %n,%C,%T").format(
            slurm_partition, nn)
        slurm_result_nn = slurm_ssh.handleSshCall(call=cmd, quiet=True)
        slurm_result_status += slurm_result_nn[0]
        slurm_result_sinfos += slurm_result_nn[1] + "\n"
        slurm_ssh_error += str(slurm_result_nn[2])

    # put slurm_result together in the way it is needed
    slurm_result = (slurm_result_status, slurm_result_sinfos,
                    slurm_ssh_error)
    self.logger.debug("slurm_result: {}".format(slurm_result))
    slurm_ssh.debugOutput(self.logger, "EKP-manage", slurm_result)

    if slurm_result[0] != 0:
        raise ValueError(
            "SSH connection to Slurm collector could not be established.")

    # Example Output: <hostname>,<CPU-State: allocated/idle/other/total>
    # host-10-18-1-0,0/0/4/4
    logging.debug("Trying to parse sinfo output")
    tmp_slurm_machines = self.parse_sinfo_output(slurm_result[1])

    slurm_machines = defaultdict(list)
    for node in tmp_slurm_machines:
        # ignore invalid lines
        if len(node) != 3:
            continue
        machine_name = node[0]
        cpus = node[1].split('/')
        state = node[2]

        # ignore down machines: host-10-18-1-50,0/0/4/4,down*
        if "down" in state:
            continue

        try:
            allocatedCPUs = int(cpus[0])
            idleCPUs = int(cpus[1])
            totalCPUs = int(cpus[3])
        except (IndexError, ValueError):
            # a garbled CPU field is just another invalid line
            self.logger.warning(
                "Ignoring sinfo line with malformed CPU field: %s", node)
            continue

        # one entry per CPU: allocated ones first, then idle ones, the
        # remainder is labelled by the node state
        for slot in range(totalCPUs):
            if allocatedCPUs > 0:
                slurm_machines[machine_name].append(['allocated', None])
                allocatedCPUs = allocatedCPUs - 1
            elif idleCPUs > 0:
                slurm_machines[machine_name].append(['idle', None])
                idleCPUs = idleCPUs - 1
            elif "draining" in state:
                slurm_machines[machine_name].append(['draining', None])
            elif "drained" in state:
                slurm_machines[machine_name].append(['drained', None])
            else:
                print("WARNING!!! this is a bug!")
    return slurm_machines
def requirement(self):
    """Return the number of machines needed for the HTCondor queue.

    condor_q is called over SSH; every output line is expected to be
    "<job status>,<requested cpus>,<requirements expression>". CPUs of all
    (optionally requirement-filtered) jobs are summed and converted to a
    machine count by ceiling division through the core count of the needed
    machine type. Idle and running CPU sums are written to the JSON log.

    Fix: lines are now split with maxsplit 2. The previous maxsplit of 3
    produced four fields whenever the requirements expression itself
    contained a comma, which made the three-way unpacking raise ValueError
    and silently reset all counters to zero.

    :return: required machine count (also stored in
             ``self._curRequirement``), or None if the queue could not
             be queried
    """
    ssh = ScaleTools.Ssh(host=self.getConfig(self.configCondorServer),
                         username=self.getConfig(self.configCondorUser),
                         key=self.getConfig(self.configCondorKey))

    # Target.Requirements can't be filtered with -constraints since it
    # would require ClassAd based regex matching.
    # TODO: Find a more generic way to match resources/requirements
    # (condor_q -slotads ??)
    # cmd_idle = "condor_q -constraint 'JobStatus == 1' -slotads slotads_bwforcluster " \
    #            "-analyze:summary,reverse | tail -n1 | awk -F ' ' " \
    #            "'{print $3 "\n" $4}'| sort -n | head -n1"
    constraint = "( %s ) && ( %s )" % (
        self._query_constraints,
        self.getConfig(self.configCondorConstraint))
    cmd = ("condor_q -global -constraint '%s' %s" %
           (constraint, self._query_format_string))
    result = ssh.handleSshCall(call=cmd, quiet=True)

    if result[0] != 0:
        self.logger.warning("Could not get HTCondor queue status! %d: %s",
                            result[0], result[2])
        return None
    elif any(error_string in result[1]
             for error_string in self._CLI_error_strings):
        self.logger.warning("condor_q request timed out.")
        return None

    # TODO: We could use ClassAd bindings, to check requirement(s)
    required_substring = self.getConfig(self.configCondorRequirement)

    required_cpus_total = 0
    required_cpus_idle_jobs = 0
    required_cpus_running_jobs = 0
    try:
        for entry in str(result[1]).splitlines():
            # maxsplit=2 keeps commas inside the requirements expression
            status, cores, job_requirement = entry.split(",", 2)
            job_status = int(status)
            requested_cpus = int(cores)
            if required_substring and \
                    required_substring not in job_requirement:
                continue
            required_cpus_total += requested_cpus
            if job_status == self.condorStatusIdle:
                required_cpus_idle_jobs += requested_cpus
            elif job_status == self.condorStatusRunning:
                required_cpus_running_jobs += requested_cpus
    except ValueError:
        # A line that cannot be parsed invalidates the whole result.
        required_cpus_total = 0
        required_cpus_idle_jobs = 0
        required_cpus_running_jobs = 0

    self.logger.debug("HTCondor queue: Idle: %d; Running: %d.",
                      required_cpus_idle_jobs, required_cpus_running_jobs)

    # cores->machines: machine definition required for RequirementAdapter.
    # Floor division by the negated core count, negated again, rounds up.
    n_cores = -int(
        self.getConfig(
            self.configMachines)[self.getNeededMachineType()]["cores"])
    self._curRequirement = -(required_cpus_total // n_cores)

    with Logging.JsonLog() as json_log:
        json_log.addItem(self.getNeededMachineType(), "jobs_idle",
                         required_cpus_idle_jobs)
        json_log.addItem(self.getNeededMachineType(), "jobs_running",
                         required_cpus_running_jobs)
    return self._curRequirement
def requirement(self):
    """Return the number of machines needed for the Slurm partition queue.

    squeue is called over SSH for the configured partition with the format
    "%T %r %c", so every line is split into three whitespace-separated
    values (named state, reason and cpus below). Lines that do not have
    exactly three values are skipped. Jobs whose reason contains
    "Dependency" are only counted separately, jobs whose reason contains
    "PartitionTimeLimit" are ignored, PENDING and RUNNING jobs add their
    CPUs to the total. The total is converted to a machine count by
    ceiling division through the core count of the needed machine type.

    :return: required machine count (also stored in
             ``self._curRequirement``), or None if the queue could not
             be queried
    """
    connection = ScaleTools.Ssh(
        host=self.getConfig(self.configSlurmServer),
        username=self.getConfig(self.configSlurmUser),
        key=self.getConfig(self.configSlurmKey))

    partition = self.getConfig(self.configSlurmPartition)
    self.logger.info(
        "Checking requirements in partition {}".format(partition))
    cmd = 'squeue -p {} --noheader --format="%T %r %c"'.format(partition)
    result = connection.handleSshCall(call=cmd, quiet=True)

    if result[0] != 0:
        self.logger.warning("Could not get Slurm queue status! %d: %s" %
                            (result[0], result[2]))
        return None
    if any(marker in result[1] for marker in self._CLI_error_strings):
        self.logger.warning("squeue request timed out.")
        return None

    required_cpus_total = 0
    required_cpus_idle_jobs = 0
    required_cpus_running_jobs = 0
    cpus_dependency_jobs = 0
    for line in result[1].splitlines():
        fields = line.split()
        if len(fields) != 3:
            continue
        state, reason, cpus = fields

        if "Dependency" in reason:
            # waiting on other jobs: tracked, but not part of the demand
            cpus_dependency_jobs += int(cpus)
        elif "PartitionTimeLimit" in reason:
            pass
        elif "PENDING" in state:
            required_cpus_total += int(cpus)
            required_cpus_idle_jobs += int(cpus)
        elif "RUNNING" in state:
            required_cpus_total += int(cpus)
            required_cpus_running_jobs += int(cpus)
        else:
            self.logger.warning("unknown job state: %s. Ignoring.", state)

    self.logger.debug(
        "Slurm queue: Idle: %d; Running: %d. in partition: %s." %
        (required_cpus_idle_jobs, required_cpus_running_jobs,
         self.getConfig(self.configSlurmPartition)))

    # cores->machines: machine definition required for RequirementAdapter.
    # Dividing by the negated core count and negating again rounds up.
    machine_definitions = self.getConfig(self.configMachines)
    n_cores = -int(machine_definitions[self.getNeededMachineType()]["cores"])
    self._curRequirement = -(required_cpus_total // n_cores)

    self.logger.debug("Required CPUs total=%s" % required_cpus_total)
    self.logger.debug("Required CPUs idle Jobs=%s" %
                      required_cpus_idle_jobs)
    self.logger.debug("Required CPUs running Jobs=%s" %
                      required_cpus_running_jobs)
    self.logger.debug("CPUs dependency Jobs=%s" % cpus_dependency_jobs)

    with Logging.JsonLog() as json_log:
        json_log.addItem(self.getNeededMachineType(), "jobs_idle",
                         required_cpus_idle_jobs)
        json_log.addItem(self.getNeededMachineType(), "jobs_running",
                         required_cpus_running_jobs)
    return self._curRequirement
def manage(self):
    """Survey booting machines and mark them as up once they are reachable.

    Called every manage cycle. For every site machine whose status is
    "booting", the VM info is fetched via ``self.VMInfo``. When the VM
    reports STATE "3" and LCM_STATE "3" and ``checkIfMachineIsUp``
    succeeds, the VPN is set up step by step (create certificate, copy
    it, connect, read the VPN IP). As soon as the certificate is valid
    and a VPN IP is known, the machine status is set to
    ``self.mr.statusUp``.

    NOTE(review): the nesting below was reconstructed from a source
    without indentation - verify the placement of the ``else`` branches
    and of the trailing debug calls against the repository.
    """
    # Disabled debugging code, kept as a bare string expression (no effect).
    """ logging.info("Querying OpenNebula Server for running instances...") vm_pool = self.VMPoolInfo() print vm_pool running_instances = filter(lambda x: self.hostname_prefix in x[1]["NAME"], vm_pool[1:]) print running_instances running_instances2 = (filter(lambda x: "ubuntu" in x[1]["NAME"], vm_pool[1:]))[0] print running_instances2 """
    myMachines = self.getSiteMachines()
    # print myMachines
    # {'8e661aac-fc4e-450f-9cbb-e57ec6e4adb2': {'status': 'working', 'site_type': 'one', 'hostname': '141.52.208.174', 'ssh_key': 'one_host_key', 'one_vmid': 910, 'status_last_update': datetime.datetime(2011, 1, 13, 15, 48, 39, 328084), 'machine_type': 'euca-default', 'site': 'one_site_scc'}}
    for mid in myMachines:
        # only machines that are still booting are of interest here
        if myMachines[mid]["status"] == "booting":
            vm_info = self.VMInfo(myMachines[mid]["one_vmid"])
            logging.debug(myMachines[mid]["vpn_ip"])
            logging.debug(myMachines[mid]["vpn_cert_is_valid"])
            logging.debug(myMachines[mid]["vpn_cert"])
            # vm_info[0] flags whether the query succeeded, vm_info[1]
            # holds the VM attributes
            if vm_info[0] is True:
                # NOTE(review): "3"/"3" presumably mean ACTIVE/RUNNING in
                # OpenNebula - confirm against the OpenNebula state table
                if vm_info[1]["STATE"] == "3" and vm_info[1]["LCM_STATE"] == "3":
                    if self.checkIfMachineIsUp(mid):
                        vpn = ScaleTools.Vpn()
                        # step 1: create the certificate if that has not
                        # been attempted successfully yet
                        if myMachines[mid]["vpn_cert_is_valid"] is None:
                            if vpn.makeCertificate(myMachines[mid]["vpn_cert"]) == 0:
                                myMachines[mid]["vpn_cert_is_valid"] = True
                        # step 2: with a valid certificate but no VPN IP,
                        # copy the certificate, connect and read the IP
                        if myMachines[mid]["vpn_cert_is_valid"] is True and \
                                myMachines[mid]["vpn_ip"] is None:
                            if (vpn.copyCertificate(myMachines[mid]["vpn_cert"], myMachines[mid]) == 0):
                                if (vpn.connectVPN(myMachines[mid]["vpn_cert"], myMachines[mid]) == 0):
                                    (res, ip) = vpn.getIP(myMachines[mid])
                                    logging.debug(res)
                                    logging.debug(ip)
                                    if res == 0 and ip != "":
                                        myMachines[mid]["vpn_ip"] = ip
                                    else:
                                        logging.debug("getting VPN IP failed!!")
                        # step 3: certificate valid and IP known -> the
                        # machine is considered up
                        if (myMachines[mid]["vpn_cert_is_valid"] is True and myMachines[mid][
                                "vpn_ip"] is not None):
                            # if( vpn.revokeCertificate(myMachines[k]["vpn_cert"]) == 0):
                            # myMachines[k]["vpn_cert_is_valid"] = False
                            self.mr.updateMachineStatus(mid, self.mr.statusUp)
                        logging.debug(myMachines[mid]["vpn_ip"])
                        logging.debug(myMachines[mid]["vpn_cert_is_valid"])
                        logging.debug(myMachines[mid]["vpn_cert"])
                    else:
                        # NOTE(review): assumed to pair with
                        # checkIfMachineIsUp(mid) - confirm
                        self.checkForDeadMachine(mid)