def load(self):
    """Rebuild ``self.hosts`` from stored hosts and the current resource list.

    Every ``Pilot`` row returned by the session is deleted first.  Each
    resource returned by ``obtainGWResources()`` is then matched by host name
    (case-insensitively, ignoring surrounding whitespace) against the stored
    ``Host`` rows, and the first matching stored host is kept.  A stored host
    whose LRMS is ``"jobmanager-pilot"`` is only kept when the resource
    reports a positive ``FREENODECOUNT``.  When ``self.lrms`` is not None,
    hosts with a different LRMS are discarded afterwards.

    If no resources are available, ``self.hosts`` falls back to every stored
    host and the method returns early.
    """
    pastHosts = Session.query(Host).all()
    print("INFRASTRUCTURE-LOAD: LOADING INFORMATION FROM PAST EXECUTIONS")
    # str() keeps this message working when no LRMS is configured (None).
    print(" Desired LRMS: " + str(self.lrms))

    # PILOTS
    pastPilots = Session.query(Pilot).all()
    print(" Deleting all past pilots from database")
    for pilot in pastPilots:
        base.Session.delete(pilot)

    # Obtain the hosts currently published by gridway.
    GWHosts = obtainGWResources()
    if not GWHosts:
        print("Error when parsing host information file, employing information from past executions")
        self.hosts = pastHosts
        return

    # Walk the parsed XML and keep the stored hosts that are still published.
    presentHosts = []
    for hostInfo in GWHosts:
        hostName = hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data  # TODO: remove "unicode" from TEXT
        wantedName = hostName.strip().lower()

        # For every found host, check if it existed on a previous execution.
        for host in pastHosts:
            if host.hostname.strip().lower() != wantedName:
                continue

            # In the case of pilot jobs, only employ the ones with FREENODECOUNT > 0.
            if host.lrms == "jobmanager-pilot":
                try:
                    freeNodeCount = int(hostInfo.getElementsByTagName("FREENODECOUNT")[0].firstChild.data)
                except (IndexError, AttributeError, ValueError, TypeError):
                    # Missing or malformed counter: treat the pilot as having no free node.
                    freeNodeCount = 0
                if freeNodeCount <= 0:
                    continue

            presentHosts.append(host)
            # Without this break, a host stored more than once would be loaded
            # once per stored copy.
            break

    # Now, if only hosts with a certain LRMS are desired, drop the rest.
    # A new list is built instead of removing from the list being iterated,
    # which used to skip the element following each removed host.
    if self.lrms is not None:
        keptHosts = []
        for host in presentHosts:
            if host.lrms != self.lrms:
                print(" Removing host " + host.hostname + ", wrong LRMS found: " + str(host.lrms))
            else:
                print(" Keeping host " + host.hostname)
                keptHosts.append(host)
        presentHosts = keptHosts

    # The hosts that we will first employ are the resulting ones.
    self.hosts = presentHosts
def updateStatus (self, gridTasks):
    """Refresh ``self.pilots`` with the stored pilots that are currently active.

    A resource counts as an active pilot when its ``LRMS_NAME`` is
    ``"jobmanager-pilot"`` and its ``FREENODECOUNT`` is greater than zero.
    For each such resource, the first stored ``Pilot`` whose ``hostname``
    equals the resource's lower-cased, stripped host name is kept.

    ``gridTasks`` is accepted for interface compatibility and is not read.

    When no resources are available ``self.pilots`` is left untouched.
    NOTE(review): that failure path returns the (empty) resource list while
    the normal path returns None; callers presumably ignore the value, confirm.
    """
    print("UPDATE STATUS")

    GWHosts = obtainGWResources()
    if not GWHosts:
        print("Error when parsing host information file, employing information from past executions")
        return GWHosts

    # PILOTS
    pastPilots = Session.query(Pilot).all()
    presentPilots = []

    # Walk the parsed XML and extract data from every resource.
    for resource in GWHosts:
        hostname = resource.getElementsByTagName("HOSTNAME")[0].firstChild.data.strip().lower()  # TODO: remove "unicode" from TEXT

        try:
            foundLrms = resource.getElementsByTagName("LRMS_NAME")[0].firstChild.data.strip().lower()
        except (IndexError, AttributeError):
            # Tag missing or empty.
            print("Could not obtain resource LRMS, skipping it")
            continue

        if foundLrms != "jobmanager-pilot":
            continue

        # From here on every resource is a pilot.
        try:
            freeNodeCount = int(resource.getElementsByTagName("FREENODECOUNT")[0].firstChild.data)
        except (IndexError, AttributeError, ValueError, TypeError):
            freeNodeCount = 0
        if freeNodeCount <= 0:
            continue

        # Keep the first stored pilot published under this host name, if any.
        knownPilot = next((pilot for pilot in pastPilots if pilot.hostname == hostname), None)
        if knownPilot is not None:
            presentPilots.append(knownPilot)

    # The pilots that we will first employ are the resulting ones.
    self.pilots = presentPilots
def createInfrastructureTasks(self, infrastructureTasks):
    """Build the profiling tasks needed for the currently published resources.

    Each resource from ``obtainGWResources()`` is compared with the hosts
    known to this object (``self.getHost``):

    * unknown host: a ``Host`` is created, appended to ``self.hosts``, added
      to the session and scheduled for profiling;
    * known host whose ``arch`` or ``cpuMHz`` changed: the stored values are
      overwritten, the host is re-added to the session and profiled again;
    * known host for which ``shouldBeProfiled()`` is true: profiled again.

    Pilot resources (``"jobmanager-pilot"``) without a free node are ignored,
    and when ``self.lrms`` is not None every resource with a different LRMS
    is ignored as well.

    Args:
        infrastructureTasks: tasks that already exist; a host that already
            appears in one of them (compared by ``task.host.hostname``) does
            not get a new profiling task.

    Returns:
        The list of new tasks.  When ``self.lrms`` is ``"jobmanager-pilot"``
        it is extended with wake-up tasks bound to a ``FakeHost``.
    """

    def tagText(element, tagName, default):
        # Text of the first <tagName> element, or `default` when missing/empty.
        try:
            return element.getElementsByTagName(tagName)[0].firstChild.data
        except (IndexError, AttributeError):
            return default

    def tagInt(element, tagName):
        # Integer value of the first <tagName> element, 0 when missing/malformed.
        try:
            return int(element.getElementsByTagName(tagName)[0].firstChild.data)
        except (IndexError, AttributeError, ValueError, TypeError):
            return 0

    print("---------------------")
    print("---------------------")
    print("---------------------")
    print("CREATE INFRASTRUCTURE TASKS")

    hostsToProfile = []
    hostList = obtainGWResources()

    for hostInfo in hostList:
        hostName = hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data  # TODO: remove "unicode" from TEXT
        foundArch = tagText(hostInfo, "ARCH", "")
        foundCpuMHz = tagInt(hostInfo, "CPU_MHZ")
        foundLrms = tagText(hostInfo, "LRMS_NAME", None)
        freeNodeCount = tagInt(hostInfo, "FREENODECOUNT")

        # Only pilots with at least one free slot are taken into account.
        if foundLrms == "jobmanager-pilot" and freeNodeCount <= 0:
            continue

        # If a certain LRMS is desired, skip the hosts with a different one.
        if self.lrms is not None and foundLrms != self.lrms:
            continue

        currentHost = self.getHost(hostName)
        if currentHost is None:
            # If the host is unknown, create it and schedule a profiling task.
            newHost = Host(hostName, arch=foundArch, cpuMHz=foundCpuMHz, lrms=foundLrms)
            self.hosts.append(newHost)
            hostsToProfile.append(newHost)
            # Store the new host in the database (failure resistance).
            Session.add(newHost)

        elif (currentHost.arch != foundArch) or (currentHost.cpuMHz != foundCpuMHz):
            # If information has changed, update the host information.
            # TODO: decide what to do here. Should the old host be deleted, or
            # only its information overwritten? If the old one is deleted,
            # what happens to the tasks executed on it? It is not trivial.
            currentHost.arch = foundArch
            currentHost.cpuMHz = foundCpuMHz
            if currentHost.lrms is None:
                currentHost.lrms = foundLrms
            hostsToProfile.append(currentHost)
            Session.add(currentHost)

        elif currentHost.shouldBeProfiled():
            if currentHost.lrms is None:
                currentHost.lrms = foundLrms
            hostsToProfile.append(currentHost)

    # Host profiling: submission of 1 task per host.
    hostProfilingTasks = [ExecutionManager.createHostProfilingTask(host) for host in hostsToProfile]

    # Drop the profiling tasks whose host already has an infrastructure task.
    siteTasks = [
        task
        for task in hostProfilingTasks
        if not any(gridTask.host.hostname == task.host.hostname for gridTask in infrastructureTasks)
    ]

    # This is for the first montera + gwpilot experiment: we want pilots
    # running, so they are started with these extra tasks.
    if self.lrms == "jobmanager-pilot":
        print("creating fake profiling tasks")
        existingFakeTasks = len([task for task in infrastructureTasks if task.host.hostname == "" and task.status != "PENDING"])
        existingGoodPilots = len(self.getGoodHosts())

        # Never report (or request) a negative amount when more fake tasks
        # already exist than the desired total.
        # fakeTasksToCreate = base.maxRunningTasks - (existingFakeTasks + existingGoodPilots + existingProfilingTasks)
        fakeTasksToCreate = max(0, base.maxRunningTasks - existingFakeTasks)

        print(" Desired tasks: " + str(base.maxRunningTasks))
        print(" Existing fake tasks: " + str(existingFakeTasks))
        print(" Existing good pilots: " + str(existingGoodPilots))
        print(" created: " + str(fakeTasksToCreate))

        emptyHost = FakeHost()
        # NOTE(review): `createWakeUpask` is spelled exactly as in the original
        # call; confirm the method name against ExecutionManager.
        fakeHostProfilingTasks = [ExecutionManager.createWakeUpask(emptyHost) for _ in range(fakeTasksToCreate)]
        siteTasks += fakeHostProfilingTasks

    return siteTasks
def getGoodHosts(self):
    """Return the hosts of ``self.hosts`` that are currently usable.

    A host is returned when all of the following hold:

    * it has at least one successful execution;
    * its ``hostname`` is among the host names currently reported by
      ``obtainGWResources()``;
    * its LRMS equals ``self.lrms`` (not checked when ``self.lrms`` is None,
      matching how the other methods of this class treat an unset LRMS);
    * it is not banned because of failed profilings: each of the first 23
      failures bans the host for one hour per failure counted from
      ``lastFailedProfiling``, 24 or more failures ban it for seven days;
    * ``getWhetstones()`` is greater than 1, i.e. it has been profiled.

    Every filter builds a new list.  The previous implementation removed
    elements from the list it was iterating, which skipped hosts and could
    try to remove the same host twice.

    Returns:
        list of the accepted host objects.
    """
    print("GET GOOD HOSTS.")
    candidates = [host for host in self.hosts if host.successfulExecutions > 0]
    hostList = obtainGWResources()

    # Walk the parsed XML and collect the names of the published hosts.
    presentHostNames = set()
    for hostInfo in hostList:
        hostName = hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data  # TODO: remove "unicode" from TEXT
        presentHostNames.add(hostName)

    # Keep only hosts that are still published and run the desired LRMS.
    candidates = [
        host
        for host in candidates
        if host.hostname in presentHostNames
        and (self.lrms is None or host.lrms == self.lrms)
    ]

    # Banning of failing hosts.
    now = datetime.now()
    notBanned = []
    for host in candidates:
        failures = host.failedProfilings
        neverFailed = failures < 0 or (failures == 0 and host.successfulExecutions > 0)
        if not neverFailed:
            # First 24 failures: one hour of ban per failure; afterwards a week.
            if failures < 24:
                bannedTime = timedelta(hours=failures)
            else:
                bannedTime = timedelta(days=7)
            if (now - bannedTime) < host.lastFailedProfiling:
                # print (" Host "+ host.hostname + " is banned due to failures for " + str(bannedTime))
                continue
        notBanned.append(host)

    # Ban hosts with no whetstones, which means no profiling.
    # (else: print (" Host "+ host.hostname + " is banned, no whetstone info"))
    hostsWithWhetstones = [host for host in notBanned if host.getWhetstones() > 1]

    # A bit of a hack (as the original author noted): pilots get an extra,
    # logged LRMS check.
    if self.lrms == "jobmanager-pilot":
        defList = []
        print(" filtering hosts.")
        for host in hostsWithWhetstones:
            if host.lrms == self.lrms:
                defList.append(host)
                print(" keeped host " + host.hostname + " with lrms=" + host.lrms)
            else:
                print(" deleted host " + host.hostname + " with lrms=" + str(host.lrms))
    else:
        defList = hostsWithWhetstones

    print(" Returning list of size " + str(len(defList)))
    for host in defList:
        print(" " + host.hostname)
    return defList
def createInfrastructureTasks(self, infrastructureTasks):
    """Register newly published hosts/pilots and create the start-up tasks.

    For every resource returned by ``obtainGWResources()``:

    * resources without ``LRMS_NAME`` are skipped;
    * pilot resources (``"jobmanager-pilot"``) are skipped when they have no
      free node; otherwise their worker node, site and published whetstones
      are read from the ``GENERIC_VAR_STR`` / ``GENERIC_VAR_INT`` elements;
    * an unknown pilot is stored as a ``Pilot`` (creating its
      ``PilotResource`` if the database has none for that site and worker
      node) and appended to ``self.pilots``;
    * an unknown non-pilot resource is stored as a ``Host`` and appended to
      ``self.hosts``;
    * a known host gets its whetstones updated when they differ, or else its
      LRMS filled in when it was still unset.

    Args:
        infrastructureTasks: already existing tasks; those whose host has an
            empty ``hostname`` and whose status is not ``"PENDING"`` are
            counted as existing fake tasks.

    Returns:
        Profiling tasks for ``hostsToProfile`` (nothing is added to that list
        at present, so there are none) followed by enough fake host profiling
        tasks to reach ``base.maxRunningTasks`` together with the existing
        fake ones.
    """

    def tagText(element, tagName, default):
        # Text of the first <tagName> element, or `default` when missing/empty.
        try:
            return element.getElementsByTagName(tagName)[0].firstChild.data
        except (IndexError, AttributeError):
            return default

    def tagInt(element, tagName):
        # Integer value of the first <tagName> element, 0 when missing/malformed.
        try:
            return int(element.getElementsByTagName(tagName)[0].firstChild.data)
        except (IndexError, AttributeError, ValueError, TypeError):
            return 0

    print("-------------------")
    print("-------------------")
    print("createInfrastructureTasks- NewPilotInfrastructure")

    # self.showHosts()
    hostList = obtainGWResources()
    hostsToProfile = []

    # Name of the per-user variable in which pilots publish their whetstones.
    # Read once; an unset USER yields a name no pilot publishes instead of a
    # TypeError while concatenating None.
    username = os.getenv("USER") or ""
    whetstonesVarName = "PILOT_" + username + "_VAR_5"

    print("Analyzing resources ")
    for hostInfo in hostList:
        hostName = hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data.strip().lower()  # TODO: remove "unicode" from TEXT
        whetstones = 0
        # Reset per resource so values never leak from the previous pilot.
        workerNode = None
        site = None

        foundArch = tagText(hostInfo, "ARCH", "")
        foundCpuMHz = tagInt(hostInfo, "CPU_MHZ")

        foundLrms = tagText(hostInfo, "LRMS_NAME", None)
        if foundLrms is None:
            print("Could not find LRMS for host " + hostName + ", skipping it")
            continue
        foundLrms = foundLrms.strip().lower()

        freeNodeCount = tagInt(hostInfo, "FREENODECOUNT")

        isPilot = foundLrms == "jobmanager-pilot"
        if isPilot:
            # Only pilots with at least one free slot are taken into account.
            if freeNodeCount <= 0:
                continue

            for node in hostInfo.getElementsByTagName("GENERIC_VAR_STR"):
                varName = node.attributes['NAME'].value
                if varName == "PILOT_REAL_HOSTNAME":
                    workerNode = node.attributes['VALUE'].value.strip().lower()
                if varName == "PILOT_REAL_RESOURCE":
                    site = node.attributes['VALUE'].value.strip().lower()

            for node in hostInfo.getElementsByTagName("GENERIC_VAR_INT"):
                if node.attributes['NAME'].value == whetstonesVarName:
                    try:
                        whetstones = int(node.attributes['VALUE'].value.strip().lower())
                    except ValueError:
                        # Unparseable value: treat it as not published.
                        whetstones = 0
                    if whetstones > 65534:
                        whetstones = 0

        currentHost = self.getHost(hostName)
        if currentHost is None:
            print("Host/Pilot not found. hostname: " + hostName + ", LRMS: " + foundLrms)
            if isPilot:
                if site is None or workerNode is None:
                    # Without both values the pilot cannot be tied to a resource.
                    print("Could not find PILOT_REAL_HOSTNAME/PILOT_REAL_RESOURCE for pilot " + hostName + ", skipping it")
                    continue

                # A pilot was found: first look for its resource, creating it
                # if it does not exist, then create a pilot using that resource.
                pilotResource = base.Session.query(PilotResource).filter(PilotResource.site == site, PilotResource.workerNode == workerNode).first()
                if pilotResource is None:
                    print(" PilotResource was not found, creating a new one")
                    pilotResource = PilotResource(site, workerNode)
                print(" Creating a new Pilot in NewPilotInfrastructure.createInfrastructureTasks")
                newHost = Pilot(hostName, arch=foundArch, cpuMHz=foundCpuMHz, pilotResource=pilotResource, whetstones=whetstones)
                self.pilots.append(newHost)
                Session.add(newHost)
            else:
                print(" Creating a new Host in NewPilotInfrastructure.createInfrastructureTasks")
                newHost = Host(hostName, arch=foundArch, cpuMHz=foundCpuMHz, lrms=foundLrms)
                self.hosts.append(newHost)
                Session.add(newHost)

            # Disabled: profiling of pilots that have NOT published their
            # whetstones; not needed while every pilot publishes them.
            # ===============================================================
            # if whetstones == 0 or whetstones > 65534:
            #     whetstones = 0
            #     print (" Host to profile: " + hostName + ": whetstone value not initialized ")
            #     hostsToProfile.append(newHost)
            #     # store new host on database (failure resistance)
            #     Session.add(newHost)
            # ===============================================================

        elif currentHost.getWhetstones() != whetstones:
            # If information has changed, update the host information. A setter
            # is used because the operation is more involved and is
            # encapsulated in that method.
            currentHost.setWhetstones(whetstones)
            Session.add(currentHost)
            print("Host: " + hostName + " UPDATED, new whetstones=" + str(whetstones))

        elif currentHost.lrms is None:
            currentHost.lrms = foundLrms

    # Profiling of new sites.
    hostProfilingTasks = [
        ExecutionManager.createHostProfilingTask(host)
        for host in hostsToProfile
        for _ in range(base.profilingTasksPerHost)
    ]

    # We are assuming that every pilot publishes that variable with its
    # performance, so nothing has to be profiled. In this new approach we
    # still want a few benchmarks around to get pilots started.
    print("creating fake profiling tasks")
    existingFakeTasks = len([task for task in infrastructureTasks if task.host.hostname == "" and task.status != "PENDING"])
    existingGoodPilots = len(self.getGoodHosts())

    # Never report (or request) a negative amount when more fake tasks already
    # exist than the desired total.
    # fakeTasksToCreate = base.maxRunningTasks - (existingFakeTasks + existingGoodPilots + existingProfilingTasks)
    fakeTasksToCreate = max(0, base.maxRunningTasks - existingFakeTasks)

    print(" Desired tasks: " + str(base.maxRunningTasks))
    print(" Existing fake tasks: " + str(existingFakeTasks))
    print(" Existing good pilots: " + str(existingGoodPilots))
    print(" created: " + str(fakeTasksToCreate))

    emptyHost = FakeHost()
    fakeHostProfilingTasks = [ExecutionManager.createFakeHostProfilingTask(emptyHost) for _ in range(fakeTasksToCreate)]
    hostProfilingTasks += fakeHostProfilingTasks
    return hostProfilingTasks
def load(self):
    """Rebuild ``self.hosts`` and ``self.pilots`` from the published resources.

    All stored ``Pilot`` rows are deleted first.  Then, for each resource
    returned by ``obtainGWResources()``:

    * resources without ``LRMS_NAME`` are skipped;
    * a pilot resource (``"jobmanager-pilot"``) with a positive
      ``FREENODECOUNT`` produces a new ``Pilot`` bound to the
      ``PilotResource`` for its site and worker node (looked up in the
      database, created when absent);
    * any other resource is a site: the first stored ``Host`` (``type ==
      "hosts"``) with the same host name is kept.

    If anything fails while processing the resources, ``self.hosts`` falls
    back to every stored site host and ``self.pilots`` is emptied.
    """
    print("LOAD - pilotInfrastructures")

    # SITES
    pastHosts = Session.query(Host).filter(Host.type == "hosts").all()
    presentHosts = []

    # PILOTS
    pastPilots = Session.query(Pilot).all()
    print("Deleting all past pilots from database")
    for pilot in pastPilots:
        base.Session.delete(pilot)
    presentPilots = []

    GWHosts = obtainGWResources()

    print("Updating information")
    # Walk the parsed XML and extract data from every resource.
    try:
        for resource in GWHosts:
            hostname = resource.getElementsByTagName("HOSTNAME")[0].firstChild.data.strip().lower()  # TODO: remove "unicode" from TEXT

            try:
                # Normalised like the other readers of LRMS_NAME so stray
                # whitespace or capitalisation cannot hide a pilot.
                foundLrms = resource.getElementsByTagName("LRMS_NAME")[0].firstChild.data.strip().lower()
            except (IndexError, AttributeError):
                print("Could not obtain resource LRMS for site " + hostname + ", skipping it")
                continue

            if foundLrms != "jobmanager-pilot":
                # It is a site: keep the stored host with this name, if any.
                for host in pastHosts:
                    if host.hostname == hostname:
                        presentHosts.append(host)
                        break
                continue

            # Number of free nodes. This works well for pilots and is what is
            # used to know whether a pilot is active or switched off.
            try:
                freeNodeCount = int(resource.getElementsByTagName("FREENODECOUNT")[0].firstChild.data)
            except (IndexError, AttributeError, ValueError, TypeError):
                freeNodeCount = 0
            if freeNodeCount <= 0:
                continue

            # Pilot name: worker node and site, reset per resource so values
            # never leak from the previous pilot.
            workerNode = None
            site = None
            for node in resource.getElementsByTagName("GENERIC_VAR_STR"):
                varName = node.attributes['NAME'].value
                if varName == "PILOT_REAL_HOSTNAME":
                    workerNode = node.attributes['VALUE'].value.strip().lower()
                if varName == "PILOT_REAL_RESOURCE":
                    site = node.attributes['VALUE'].value.strip().lower()
            if site is None or workerNode is None:
                # Skip only this pilot instead of aborting the whole load.
                print("Could not find PILOT_REAL_HOSTNAME/PILOT_REAL_RESOURCE for pilot " + hostname + ", skipping it")
                continue

            # Load the pilot: first look for its PilotResource, creating it
            # if it does not exist, then create the pilot that uses it.
            pilotResource = base.Session.query(PilotResource).filter(PilotResource.site == site, PilotResource.workerNode == workerNode).first()
            if pilotResource is None:
                pilotResource = PilotResource(site, workerNode)
            presentPilots.append(Pilot(hostname, pilotResource=pilotResource))
    except Exception:
        # Deliberate best-effort fallback; KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        print("Error when parsing host information file, employing information from past executions")
        self.hosts = pastHosts
        self.pilots = []
        return

    # The hosts and pilots that we will first employ are the resulting ones.
    self.pilots = presentPilots
    self.hosts = presentHosts