示例#1
0
	def load(self):
		"""Initialise self.hosts with the stored hosts that are currently reported.

		Reads every Host stored by past executions, deletes every stored Pilot
		through base.Session, and then keeps the stored hosts whose hostname
		(compared stripped and case-insensitively) appears among the resources
		returned by obtainGWResources(). Hosts whose LRMS is "jobmanager-pilot"
		are only kept when the resource reports FREENODECOUNT > 0. When
		self.lrms is set, hosts with a different LRMS are dropped as well.

		If obtainGWResources() returns nothing, self.hosts falls back to all the
		hosts from past executions.
		"""
		pastHosts = Session.query(Host).all()

		print ("INFRASTRUCTURE-LOAD: LOADING INFORMATION FROM PAST EXECUTIONS")
		# self.lrms may be None (no LRMS filter requested), so it cannot be
		# concatenated directly
		print ("	Desired LRMS: " + str(self.lrms))

		# PILOTS: they are not reused between executions
		pastPilots = Session.query(Pilot).all()
		print ("	Deleting all past pilots from database")
		for pilot in pastPilots:
			base.Session.delete(pilot)

		presentHosts = []
		GWHosts = obtainGWResources()

		if not GWHosts:
			print("Error when parsing host information file, employing information from past executions")
			self.hosts = pastHosts
			return

		# extract data from the reported resources
		for hostInfo in GWHosts:
			hostName = hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data #TODO: remove "unicode" from TEXT
			wantedName = hostName.strip().lower()

			# for every found host, check if it existed on a previous execution
			for host in pastHosts:
				if host.hostname.strip().lower() != wantedName:
					continue

				# in the case of pilot jobs, only employ the ones with FREENODECOUNT > 0
				if host.lrms == "jobmanager-pilot":
					try:
						freeNodeCount = int(hostInfo.getElementsByTagName("FREENODECOUNT")[0].firstChild.data)
					except Exception:
						# best effort: a missing or malformed value counts as no free node
						freeNodeCount = 0
					if not freeNodeCount > 0:
						continue

				presentHosts.append(host)
				# without this break, a host stored more than once would be loaded several times
				break

		# now, if only hosts with a certain LRMS are desired, we remove the rest.
		# A new list is built instead of calling remove() on the list being
		# iterated, which would skip the element following every removed host.
		if self.lrms is not None:
			keptHosts = []
			for host in presentHosts:
				if host.lrms != self.lrms:
					print ("	Removing host " + host.hostname + ", wrong LRMS found: " + str(host.lrms))
				else:
					print ("	Keeping host " + host.hostname)
					keptHosts.append(host)
			presentHosts = keptHosts

		# the hosts that we will first employ are the resulting ones
		self.hosts = presentHosts
	def updateStatus (self, gridTasks):
		"""Refresh self.pilots with the stored pilots that are currently active.

		A stored Pilot is kept when obtainGWResources() reports a resource with
		the same hostname (stripped, lower-cased), LRMS_NAME equal to
		"jobmanager-pilot" and FREENODECOUNT greater than zero.

		gridTasks is not used by this method.

		Returns the empty resource list when obtainGWResources() yields nothing
		(self.pilots is left untouched in that case); otherwise returns None.
		"""
		def tagText(node, tag):
			# text content of the first <tag> element found under node
			return node.getElementsByTagName(tag)[0].firstChild.data

		print ("UPDATE STATUS")
		resources = obtainGWResources()
		if resources == []:
			print("Error when parsing host information file, employing information from past executions")	
			return resources

		# PILOTS
		knownPilots = Session.query(Pilot).all()
		alivePilots = []

		for entry in resources:
			name = tagText(entry, "HOSTNAME").strip().lower() #TODO: remove "unicode" from TEXT

			try:
				lrmsName = tagText(entry, "LRMS_NAME").strip().lower()
			except:
				print ("Could not obtain resource LRMS, skipping it")
				continue

			# from here on only pilots are considered
			if lrmsName != "jobmanager-pilot":
				continue

			try:
				freeSlots = int(tagText(entry, "FREENODECOUNT"))
			except:
				freeSlots = 0
			if freeSlots <= 0:
				continue

			# first stored pilot with this hostname, if any
			match = next((pilot for pilot in knownPilots if pilot.hostname == name), None)
			if match is not None:
				alivePilots.append(match)

		# the pilots that we will first employ are the resulting ones
		self.pilots = alivePilots
示例#3
0
	def createInfrastructureTasks(self, infrastructureTasks):	
		"""Build the profiling tasks required by the currently reported resources.

		Every resource returned by obtainGWResources() is inspected. Pilots
		("jobmanager-pilot") without free nodes are ignored, and so are
		resources whose LRMS differs from self.lrms when that is set. Unknown
		hosts are created, appended to self.hosts and added to the Session;
		known hosts whose ARCH or CPU_MHZ changed are updated. Both kinds, plus
		the hosts for which shouldBeProfiled() is true, get one profiling task,
		unless infrastructureTasks already holds a task on the same hostname.

		When self.lrms is "jobmanager-pilot", base.maxRunningTasks minus the
		number of existing non-PENDING tasks with an empty hostname extra tasks
		are appended, all of them bound to a single FakeHost.

		Returns the list of tasks to submit.
		"""
		def tagValue(node, tag, fallback, cast=None):
			# text of the first <tag> child, optionally converted; fallback on any failure
			try:
				text = node.getElementsByTagName(tag)[0].firstChild.data
				return cast(text) if cast else text
			except:
				return fallback

		for _ in range(3):
			print ("---------------------")

		print ("CREATE INFRASTRUCTURE TASKS")

		pendingProfile = []

		for resource in obtainGWResources():
			name = resource.getElementsByTagName("HOSTNAME")[0].firstChild.data #TODO: remove "unicode" from TEXT

			arch = tagValue(resource, "ARCH", "")
			mhz = tagValue(resource, "CPU_MHZ", 0, int)
			lrms = tagValue(resource, "LRMS_NAME", None)
			freeSlots = tagValue(resource, "FREENODECOUNT", 0, int)

			# only pilots with at least one available slot are taken into account
			if lrms == "jobmanager-pilot" and not freeSlots > 0:
				continue

			# if a certain LRMS is desired, ignore the hosts with a different one
			if self.lrms is not None and lrms != self.lrms:
				continue

			known = self.getHost(name)
			if known is None:
				# unknown host: create it, profile it and store it on the database (failure resistance)
				fresh = Host(name, arch=arch, cpuMHz = mhz, lrms=lrms)
				self.hosts.append(fresh)
				pendingProfile.append(fresh)
				Session.add(fresh)
				continue

			changed = (known.arch != arch) or (known.cpuMHz != mhz)
			if not (changed or known.shouldBeProfiled()):
				continue

			if changed:
				#TODO: decide what to do here: delete the old host or just overwrite its information?
				# If the old one is deleted, what happens to the tasks executed there? Not trivial.
				known.arch = arch
				known.cpuMHz = mhz
			if known.lrms is None:
				known.lrms = lrms
			pendingProfile.append(known)
			if changed:
				Session.add(known)

		# one profiling task per host
		profilingTasks = [ExecutionManager.createHostProfilingTask(host) for host in pendingProfile]

		# discard the tasks aimed at a host that already has an infrastructure task
		siteTasks = [
			task for task in profilingTasks
			if not any(gridTask.host.hostname == task.host.hostname
					for gridTask in infrastructureTasks)
		]

		# This is for the first montera + gwpilot experiment:
		# we want to have pilots running, so they are started with these tasks
		if self.lrms=="jobmanager-pilot":
			print ("creating fake profiling tasks")

			fakeCount = sum(1 for task in infrastructureTasks
					if task.host.hostname=="" and task.status != "PENDING")
			goodPilotCount = len (self.getGoodHosts())
			toCreate = base.maxRunningTasks - fakeCount

			print ("	Desired tasks: " + str(base.maxRunningTasks))
			print ("	Existing fake tasks: " + str(fakeCount))
			print ("	Existing good pilots: " + str(goodPilotCount))
			print ("	created: " + str(toCreate))

			placeholder = FakeHost()
			siteTasks += [ExecutionManager.createWakeUpask(placeholder)
					for _ in range(toCreate)]

		return siteTasks
示例#4
0
	def getGoodHosts(self):
		"""Return the hosts from self.hosts that are usable right now.

		A host is returned when all of the following hold:
		  * it has at least one successful execution,
		  * its hostname is among those reported by obtainGWResources(),
		  * its LRMS equals self.lrms,
		  * it is not inside a ban window caused by failed profilings
		    (one hour per failure below 24 failures, seven days from then on,
		    counted from host.lastFailedProfiling),
		  * host.getWhetstones() is greater than 1.

		Every filter builds a new list: the previous implementation called
		remove() on the list it was iterating, which skipped the element after
		each removal and could try to remove the same host twice (ValueError).
		"""
		print ("GET GOOD HOSTS.")
		pastHostsList = [host for host in self.hosts if host.successfulExecutions > 0]
		hostList = obtainGWResources()

		# hostnames of the resources that are present at this moment
		presentHostNames = set(
			hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data #TODO: remove "unicode" from TEXT
			for hostInfo in hostList)

		# keep the hosts that are still present and use the desired LRMS
		# NOTE(review): with self.lrms == None this keeps only hosts without an
		# LRMS, exactly as the original comparison did - confirm that is intended
		candidates = [host for host in pastHostsList
				if host.hostname in presentHostNames and host.lrms == self.lrms]

		# banning failure hosts
		notBanned = []
		for host in candidates:
			failures = host.failedProfilings
			if failures < 0 or (failures == 0 and host.successfulExecutions > 0):
				notBanned.append(host)
				continue

			# first 24 failures: one hour of ban per failure; afterwards a whole week
			if failures < 24:
				bannedTime = timedelta(hours=failures)
			else:
				bannedTime = timedelta(days=7)

			lastFailure = host.lastFailedProfiling
			# a host with no recorded failure date cannot be inside a ban window
			if lastFailure is not None and (datetime.now() - bannedTime) < lastFailure:
				continue
			notBanned.append(host)

		# ban hosts with no whetstones, which means no profiling
		hostsWithWhetstones = [host for host in notBanned if host.getWhetstones() > 1]

		# Every remaining host already has lrms == self.lrms, so for pilots this
		# step only reports the hosts that were kept
		defList = hostsWithWhetstones
		if self.lrms == "jobmanager-pilot":
			print ("	filtering hosts.")
			for host in defList:
				print ("		keeped host " + host.hostname + " with lrms=" + host.lrms)

		print ("	Returning list of size " + str(len(defList)))
		for host in defList:
			print ("		" + host.hostname)
		return defList
	def createInfrastructureTasks(self, infrastructureTasks):	
		"""Register newly reported hosts/pilots and return the tasks to submit.

		Every resource returned by obtainGWResources() is inspected. Resources
		without LRMS_NAME are skipped, and so are pilots ("jobmanager-pilot")
		with no free node. For pilots, the worker node, the site and the
		published whetstones are read from the GENERIC_VAR_STR / GENERIC_VAR_INT
		entries. Unknown resources are stored as a new Pilot (appended to
		self.pilots) or a new Host (appended to self.hosts); known ones get
		their whetstones updated when the value differs, or else their LRMS
		filled in when it was empty.

		infrastructureTasks is only used to count the already existing fake
		tasks (empty hostname and status other than "PENDING").

		Returns the host profiling tasks (none at the moment, because nothing is
		added to hostsToProfile) followed by the fake tasks needed to reach
		base.maxRunningTasks.
		"""
		print ("-------------------")
		print ("-------------------")

		print ("createInfrastructureTasks- NewPilotInfrastructure")

		hostList = obtainGWResources()

		# Left empty on purpose: pilots are assumed to publish their own
		# whetstones, so no profiling is requested for them
		hostsToProfile = []

		# Loop invariant. An unset USER must not break the concatenation; it
		# simply matches no published variable.
		username = os.getenv("USER") or ""
		whetstonesVarName = "PILOT_" + username + "_VAR_5"

		print ("Analyzing resources ")
		for hostInfo in hostList:
			hostName = hostInfo.getElementsByTagName("HOSTNAME")[0].firstChild.data.strip().lower() #TODO: remove "unicode" from TEXT
			whetstones = 0
			# reset for every resource so that a pilot never inherits the
			# values read for a previous one
			site = None
			workerNode = None

			try:
				foundArch = hostInfo.getElementsByTagName("ARCH")[0].firstChild.data
			except Exception:
				foundArch = ""

			try:
				foundCpuMHz = int(hostInfo.getElementsByTagName("CPU_MHZ")[0].firstChild.data)
			except Exception:
				foundCpuMHz = 0

			try:
				foundLrms = hostInfo.getElementsByTagName("LRMS_NAME")[0].firstChild.data.strip().lower()
			except Exception:
				print ("Could not find LRMS for host " + hostName + ", skipping it")
				continue

			try:
				freeNodeCount = int(hostInfo.getElementsByTagName("FREENODECOUNT")[0].firstChild.data)
			except Exception:
				freeNodeCount = 0

			if foundLrms == "jobmanager-pilot":
				# only pilots with at least one available slot are taken into account
				if not freeNodeCount > 0:
					continue

				for node in hostInfo.getElementsByTagName("GENERIC_VAR_STR"):
					varName = node.attributes['NAME'].value
					if varName == "PILOT_REAL_HOSTNAME":
						workerNode = node.attributes['VALUE'].value.strip().lower()
					elif varName == "PILOT_REAL_RESOURCE":
						site = node.attributes['VALUE'].value.strip().lower()

				for node in hostInfo.getElementsByTagName("GENERIC_VAR_INT"):
					if node.attributes['NAME'].value == whetstonesVarName:
						try:
							whetstones = int(node.attributes['VALUE'].value.strip())
						except ValueError:
							# malformed value: handled like a value that was not published
							whetstones = 0
						# values above 65534 are treated as not initialised
						if whetstones > 65534:
							whetstones = 0

			currentHost = self.getHost(hostName)
			if currentHost is None:
				print ("Host/Pilot  not found. hostname: " + hostName + ", LRMS: " + foundLrms)
				if foundLrms == "jobmanager-pilot":
					if site is None or workerNode is None:
						print ("	Pilot " + hostName + " does not publish PILOT_REAL_RESOURCE / PILOT_REAL_HOSTNAME, skipping it")
						continue

					# a pilot was found: first look for its resource, creating
					# it if it does not exist, then create a pilot using it
					pilotResource = base.Session.query(PilotResource).filter(PilotResource.site == site, PilotResource.workerNode == workerNode).first()
					if pilotResource is None:
						print ("	PilotResource was not found, creating a new one")
						pilotResource = PilotResource(site, workerNode)
					print ("	Creating a new Pilot in NewPilotInfrastructure.createInfrastructureTasks")
					newHost = Pilot(hostName, arch=foundArch, cpuMHz = foundCpuMHz, pilotResource = pilotResource, whetstones = whetstones)
					self.pilots.append(newHost)
					Session.add(newHost)

				else:
					print ("	Creating a new Host in NewPilotInfrastructure.createInfrastructureTasks")
					newHost = Host(hostName, arch=foundArch, cpuMHz = foundCpuMHz, lrms=foundLrms)
					self.hosts.append(newHost)
					Session.add(newHost)

			# if information has changed, update host information
			elif currentHost.getWhetstones() != whetstones:
				# done through a setter because it is a more complex operation,
				# encapsulated in that function
				currentHost.setWhetstones(whetstones)
				Session.add(currentHost)
				print ("Host: " + hostName + " UPDATED, new whetstones=" + str(whetstones))

			elif currentHost.lrms is None:
				currentHost.lrms = foundLrms

		# profiling of new sites
		hostProfilingTasks = [ExecutionManager.createHostProfilingTask(host)
							for host in hostsToProfile
							for i in range(base.profilingTasksPerHost)]

		# In this approach some benchmark-like fake tasks are submitted so that
		# pilots get started
		print ("creating fake profiling tasks")

		existingFakeTasks = len([task for task in infrastructureTasks if task.host.hostname=="" and task.status != "PENDING"])
		existingGoodPilots = len (self.getGoodHosts())
		# never negative, so that the reported number matches what is created
		fakeTasksToCreate = max(0, base.maxRunningTasks - existingFakeTasks)

		print ("	Desired tasks: " + str(base.maxRunningTasks))
		print ("	Existing fake tasks: " + str(existingFakeTasks))
		print ("	Existing good pilots: " + str(existingGoodPilots))
		print ("	created: " + str(fakeTasksToCreate))

		emptyHost = FakeHost()
		fakeHostProfilingTasks = [ExecutionManager.createFakeHostProfilingTask(emptyHost)
					for i in range(fakeTasksToCreate)]

		hostProfilingTasks += fakeHostProfilingTasks

		return hostProfilingTasks
	def load(self):	
		"""Initialise self.hosts and self.pilots from the reported resources.

		Stored hosts (Host.type == "hosts") are kept when a non-pilot resource
		with the same hostname is reported by obtainGWResources(). Every stored
		Pilot is deleted through base.Session, and a new Pilot is created for
		each reported "jobmanager-pilot" resource that has FREENODECOUNT > 0 and
		publishes PILOT_REAL_HOSTNAME and PILOT_REAL_RESOURCE.

		If no resource is reported, or an error is raised while reading them,
		self.hosts falls back to all the stored hosts and self.pilots is
		emptied.
		"""
		print ("LOAD - pilotInfrastructures")
		# SITES
		pastHosts = Session.query(Host).filter(Host.type=="hosts").all()
		presentHosts = []

		# PILOTS
		pastPilots = Session.query(Pilot).all()
		print ("Deleting all past pilots from database")
		for pilot in pastPilots:
			base.Session.delete(pilot)

		presentPilots = []

		GWHosts = obtainGWResources()
		# the sibling methods treat an empty resource list as a failure to
		# parse the host information, so the same fallback is applied here
		if not GWHosts:
			print("Error when parsing host information file, employing information from past executions")
			self.hosts = pastHosts
			self.pilots = []
			return

		print ("Updating information")
		# extract data from the reported resources
		try:
			for resource in GWHosts:

				hostname = resource.getElementsByTagName("HOSTNAME")[0].firstChild.data.strip().lower() #TODO: remove "unicode" from TEXT
				try:
					# normalised like in the other methods that read LRMS_NAME
					foundLrms = resource.getElementsByTagName("LRMS_NAME")[0].firstChild.data.strip().lower()
				except Exception:
					print ("Could not obtain resource LRMS for site " + hostname + ", skipping it")
					continue

				if foundLrms == "jobmanager-pilot":
					# reset for every pilot so that it never inherits the
					# values read for a previous one
					site = None
					workerNode = None
					for node in resource.getElementsByTagName("GENERIC_VAR_STR"):
						varName = node.attributes['NAME'].value
						if varName == "PILOT_REAL_HOSTNAME":
							workerNode = node.attributes['VALUE'].value.strip().lower()
						elif varName == "PILOT_REAL_RESOURCE":
							site = node.attributes['VALUE'].value.strip().lower()

					# number of free nodes: for pilots this value is reliable and
					# is what tells whether the pilot is active or switched off
					try:
						freeNodeCount = int(resource.getElementsByTagName("FREENODECOUNT")[0].firstChild.data)
					except Exception:
						freeNodeCount = 0
					if not freeNodeCount > 0:
						continue

					if site is None or workerNode is None:
						print ("Pilot " + hostname + " does not publish PILOT_REAL_RESOURCE / PILOT_REAL_HOSTNAME, skipping it")
						continue

					# load the pilot: first look for its pilotResource, creating
					# it if it does not exist, then create the pilot using it
					pilotResource = base.Session.query(PilotResource).filter(PilotResource.site == site, PilotResource.workerNode == workerNode).first()

					if pilotResource is None:
						pilotResource = PilotResource(site, workerNode)

					pilot = Pilot(hostname, pilotResource=pilotResource)
					presentPilots.append(pilot)

				else:	#it is a site
					for host in pastHosts:
						if host.hostname == hostname:
							presentHosts.append(host)
							break

		except Exception:
			# deliberate all-or-nothing fallback, but no longer swallowing
			# KeyboardInterrupt / SystemExit
			print("Error when parsing host information file, employing information from past executions")
			self.hosts = pastHosts
			self.pilots = []
			return

		# the hosts that we will first employ are the resulting ones
		self.pilots = presentPilots
		self.hosts = presentHosts