def recoverExecution(self):
    """Resume an execution that was interrupted before the application finished.

    Loads the first ``Application`` row whose ``finished`` column equals
    ``"0"``, makes sure the temporary execution folder exists, reloads the
    scheduling algorithm (passing ``initialized=True``) and restores the lists
    of application and infrastructure tasks whose status is not ``"CLEAR"``.

    The process exits with status 1 when no unfinished application or no
    scheduling algorithm can be loaded.

    Raises:
        OSError: if the temporary folder does not exist and cannot be created.
    """
    print("")
    print("Starting recovery from past execution")
    print("")
    print("Loading application to execute")
    self.myApplication = base.Session.query(Application).filter(Application.finished == "0").first()
    if self.myApplication is None:
        print("Could not load application, exiting")
        sys.exit(1)

    try:
        os.makedirs(base.tmpExecutionFolder)
    except OSError:
        # Only an already-existing folder is harmless; any other failure
        # (permissions, bad path, ...) must not be reported as "ok".
        if not os.path.isdir(base.tmpExecutionFolder):
            raise
        print("Temporary folder for the application already exists, ok")

    self.myApplication.calculateInputFileSize()
    # TODO: this is a hack, but I do not know a better way to load it
    self.myApplication.profile.application = self.myApplication
    print("Application Loaded")

    print("")
    print("Loading scheduling algorithm")
    self.mySchedulingAlgorithm = InformationManager.loadSchedulingAlgorithm(
        self.myApplication.schedulingAlgorithm, initialized=True
    )
    if self.mySchedulingAlgorithm is None:
        print("Could not find an appropriate scheduling algorithm")
        sys.exit(1)
    print("Scheduling Algorithm loaded")

    print("")
    print("Loading tasks being executed")
    # NOTE: "== None" inside filter() is deliberate; it is a column
    # expression built by the ORM, not a Python identity test.
    self.applicationTasks = (
        base.Session.query(GridTask)
        .filter(
            GridTask.status != "CLEAR",
            or_(GridTask.applicationID == self.myApplication.id, GridTask.applicationID == None),
            or_(GridTask.type == "applicationExecution", GridTask.type == "applicationProfiling"),
        )
        .all()
    )
    self.infrastructureTasks = (
        base.Session.query(GridTask)
        .filter(
            GridTask.status != "CLEAR",
            or_(GridTask.applicationID == self.myApplication.id, GridTask.applicationID == None),
            or_(GridTask.type == "hostProfiling", GridTask.type == "pilot"),
        )
        .all()
    )
    print("Tasks loaded")
def loadAllvalues(sys): global progress_count progress_count=progress_count+1 con= ConnectionManager.createConnection(sys) if con != None: print "Connection established to "+sys._primary_ip sys= InformationManager.getHostName(con,sys) print "Received hostname : "+sys._hostname showProgress(1) sys= InformationManager.getOSName(con,sys) print "Received OS details : " +ensure_str(sys._os) showProgress(2) sys= InformationManager.getModel(con,sys) print "Received Model : "+sys._type showProgress(3) sys= InformationManager.getIPAddr(con,sys) print "Received IP Address details : "+sys._ipaddr showProgress(4) sys= InformationManager.getMacAddr(con,sys) print "Received MAC address details : "+sys._macaddr showProgress(5) #sys=InformationManager.getWWN(con,sys) sys= InformationManager.getStorage(con,sys,storageconnection) print "Received Storage details : "+sys._wwn, sys._tparhost, sys._tparvolumes showProgress(6) #print sys._hostname ConnectionManager.closeConnection(con,sys) return sys else : sys._hostname = "Error in "+sys._primary_ip return sys
def execute(self):
    """Run the main scheduling loop until the application finishes.

    Every iteration:

    1. stops if ``InformationManager.checkForValidCertificates()`` is false;
    2. refreshes the status of the infrastructure tasks, processes the ones
       that are ``"DONE"`` and asks for new infrastructure tasks;
    3. refreshes the status of the application tasks and processes the ones
       that are ``"DONE"``;
    4. if ``remainingSamples`` has dropped to zero or below, marks the
       application as finished, stores the host slot counts, removes the
       remaining application tasks from GridWay, commits and leaves the loop;
    5. otherwise asks the scheduling algorithm for new tasks, submits them,
       commits and sleeps 15 seconds.

    A failing commit is rolled back and reported; the loop keeps going.
    """
    while True:
        print("---")
        print("date:" + str(datetime.now()))

        print("")
        print("CHECKING FOR GRID CERTIFICATE")
        if not InformationManager.checkForValidCertificates():
            print("Could not find a valid certificate")
            print("Finishing execution now :(")
            break

        print("")
        print("UPDATING INFRASTRUCTURE STATUS")
        for gridTask in self.infrastructureTasks:
            ExecutionManager.updateGridTaskStatus(gridTask)

        # Work on a snapshot: the loop below removes items from the live list.
        finishedTasks = [gridTask for gridTask in self.infrastructureTasks if gridTask.status == "DONE"]
        for gridTask in finishedTasks:
            if gridTask.type == "hostProfiling":
                self.myInfrastructure.updateInfoAfterProfiling(gridTask)
            # and update gridTask status
            ExecutionManager.removeTaskFromGW(gridTask)
            self.infrastructureTasks.remove(gridTask)

        self.myInfrastructure.updateStatus(self.infrastructureTasks)

        print("")
        print("CREATING INFRASTRUCTURE TASKS")
        self.executeInfrastructureTasks()

        print("UPDATE APPLICATION STATUS")
        for gridTask in self.applicationTasks:
            ExecutionManager.updateGridTaskStatus(gridTask)

        # this is a kludge
        totalTasks = self.infrastructureTasks + self.applicationTasks
        self.myInfrastructure.updateStatus(totalTasks)

        # process recently finished tasks
        finishedTasks = [gridTask for gridTask in self.applicationTasks if gridTask.status == "DONE"]
        for gridTask in finishedTasks:
            if gridTask.type == "applicationProfiling":
                self.myApplication.updateInfoAfterProfiling(gridTask)
            elif gridTask.type == "applicationExecution":
                self.myApplication.updateInfoAfterExecution(gridTask)
            # and update gridTask status
            ExecutionManager.removeTaskFromGW(gridTask)
            self.applicationTasks.remove(gridTask)

        # check for execution finish
        if self.myApplication.remainingSamples <= 0:
            print("Starting the exit of execution loop")
            # TODO: make each host's overall maximum the max of the stored
            # total and the maximum seen this time
            self.myApplication.finished = 1
            base.Session.add(self.myApplication)
            print("Application marked as finished")

            print("")
            print("Storing information about available hosts on remote sites")
            for host in self.myInfrastructure.hosts:
                host.maxSlotCount = max(host.maxSlotCount, host.maxSlotCountThisTime)
                base.Session.add(host)

            print("")
            print("Removing finished tasks from gridWay")
            # This is debatable: if a profiling task reaches the end of the
            # execution without having completed, it is marked as failed.
            # That penalises sites with very long response times; doing the
            # opposite would leave an unresponsive site pending profiling
            # forever.
            # NOTE(review): this walks applicationTasks but tests for type
            # "hostProfiling"; if such tasks only ever live in
            # infrastructureTasks the branch never fires - confirm intent.
            for gridTask in self.applicationTasks:
                ExecutionManager.removeTaskFromGW(gridTask)
                if gridTask.type == "hostProfiling":
                    self.myInfrastructure.updateInfoAfterProfiling(gridTask)

            try:
                base.Session.commit()
            except Exception:
                # Not a bare except: Ctrl-C / SystemExit must still be able
                # to stop the program instead of being reported as a DB loss.
                base.Session.rollback()
                print("Lost connection with database, not storing anything!")

            print("Exiting execution loop")
            break

        print("")
        print("CREATING NEW EXECUTION TASKS")
        applicationExecutionTasks = self.mySchedulingAlgorithm.createApplicationTasks(
            self.myInfrastructure, self.myApplication, self.applicationTasks
        )
        for gridTask in applicationExecutionTasks:
            ExecutionManager.submit(gridTask)
            self.applicationTasks.append(gridTask)
            base.Session.add(gridTask)

        try:
            base.Session.commit()
        except Exception:
            base.Session.rollback()
            print("Lost connection with database, not storing anything!")

        print("...")
        sleep(15)
def newExecution(self, requirementsFile):
    """Prepare a brand-new execution described by ``requirementsFile``.

    Purges the database, reads the application requirements with
    ``UserInterface.readRequirements``, stores the application and its
    profile, makes sure the temporary execution folder exists, profiles the
    application when its profile has no profilings yet, and finally loads the
    scheduling algorithm named by the application.

    The process exits with status 1 when no scheduling algorithm can be
    loaded.

    Raises:
        OSError: if the temporary folder does not exist and cannot be created.
    """
    print("")
    print("Starting the execution of a new Montera template")
    print("")

    print("")
    print("Cleaning the DB from past executions")
    self.purgeDB()
    print("Database cleaned")

    print("")
    print("Reading application requirements")
    self.myApplication = UserInterface.readRequirements(requirementsFile)
    base.Session.add(self.myApplication.profile)
    base.Session.add(self.myApplication)
    base.Session.commit()
    # TODO: this is a hack, but base.Session.commit breaks this pointer
    self.myApplication.profile.application = self.myApplication

    print("application requirements read an stored")
    print("")
    print("name: " + self.myApplication.name)
    print("desired samples: " + str(self.myApplication.desiredSamples))

    try:
        os.makedirs(base.tmpExecutionFolder)
    except OSError:
        # Only an already-existing folder is harmless; any other failure
        # (permissions, bad path, ...) must not be reported as "ok".
        if not os.path.isdir(base.tmpExecutionFolder):
            raise
        print("Temporary folder for the application already exists, ok")
    else:
        print("Creating temporary folder for application results, " + base.tmpExecutionFolder)

    print("")
    print("Checking if a profiling of the application is neccesary")
    if self.myApplication.profile.numProfilings == 0:
        print("it is necessary. Starting app profile!")
        self.profileApplication()
        try:
            base.Session.commit()
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must not be reported as
            # a lost database connection.
            base.Session.rollback()
            print("Lost connection with database, not storing anything!")
        print("Application profiling finished")
    else:
        print("Application profiling loaded from past executions, profiling is not needed")

    print("Constant effort, sample effort:")
    print(
        str(self.myApplication.profile.constantEffort)
        + " whetstones, "
        + str(self.myApplication.profile.sampleEffort)
        + " whetstones/sample"
    )

    print("")
    print("Loading scheduling algorithm")
    self.mySchedulingAlgorithm = InformationManager.loadSchedulingAlgorithm(self.myApplication.schedulingAlgorithm)
    if self.mySchedulingAlgorithm is None:
        print("Could not find an appropriate scheduling algorithm")
        sys.exit(1)
    print("Scheduling Algorithm loaded")
def createApplicationTasks(self, infrastructure, application, gridTasks):
    """Build the grid tasks to submit in this scheduling round (DyTSS).

    Returns an empty list when ``infrastructure.getGoodHosts()`` yields no
    hosts.

    On the first call (``self.initialized`` false) one copy of the first task
    picked for each good host is created per ``host.maxSlotCount``; the batch
    is prioritised and cut to ``InformationManager.readMaxRunningTasks()``,
    and ``self.initialized`` is set.

    On later calls each good host is topped up to
    ``max(1, ceil(host.currentSlotCount * base.spareTasks))`` submitted
    tasks; the batch is prioritised and cut to
    ``int(maxRunning * (1 + base.spareTasks) - currentlyRunning)`` entries,
    or emptied when that figure is not positive.

    ``gridTasks`` is the list of tasks already known, used to count the
    ``"RUNNING"`` and ``"SUBMITTED"`` tasks of every host.
    """
    for banner in ("-------------", "-------------", "DyTSS"):
        print(banner)

    # In this algorithm every free slot is treated as a host (it simplifies
    # the maths).
    usableHosts = infrastructure.getGoodHosts()
    if not len(usableHosts):
        print("No available hosts, will not schedule this time")
        return []

    batch = []
    candidateTasks = self.createDyTSSTaskList(infrastructure, application)

    if self.initialized:
        runningOverall = 0
        for host in usableHosts:
            runningHere = 0
            queuedHere = 0
            for known in gridTasks:
                if known.host is not host:
                    continue
                if known.status == "RUNNING":
                    runningHere += 1
                    runningOverall += 1
                elif known.status == "SUBMITTED":
                    queuedHere += 1

            wantedInQueue = max(1, math.ceil(host.currentSlotCount * base.spareTasks))
            missing = wantedInQueue - queuedHere
            if missing <= 0:
                continue

            print("In host " + host.hostname + " we have " + str(runningHere) + " running and " + str(queuedHere) + " submitted, so we create " + str(missing))
            for _ in range(int(missing)):
                picked = self.pickFirstTaskForHost(candidateTasks, host)
                batch.append(picked)
                createGWTemplate(picked)

        runningLimit = InformationManager.readMaxRunningTasks()
        self.priorize(infrastructure, application, batch)
        tasksWeWant = int(runningLimit * (1 + base.spareTasks) - runningOverall)
        print("Max running tasks: " + str(runningLimit) + "; currentRunningTasks: " + str(runningOverall) + " ; taskl we want: " + str(tasksWeWant) + "; submitting: " + str(len(batch)))
        if tasksWeWant <= 0:
            print("No tasks submitted")
            return []
        return batch[:tasksWeWant]

    # First execution: the free slots of each host are assumed to be the
    # ones stored from the last execution, so one task is created per slot.
    for host in usableHosts:
        template = self.pickFirstTaskForHost(candidateTasks, host)
        for _ in range(host.maxSlotCount):
            clone = GridTask(None, None, None)
            clone.duplicate(template)
            batch.append(clone)
            createGWTemplate(clone)

    runningLimit = InformationManager.readMaxRunningTasks()
    self.priorize(infrastructure, application, batch)
    batch = batch[:runningLimit]
    self.initialized = True
    return batch
def updateInfoAfterExecution(self, gridTask):
    """Process the result of a finished application-execution task.

    The task is always marked ``"CLEAR"``.  Its result file
    (``execution_result_<gwID>.xml`` in ``base.tmpExecutionFolder``) is
    parsed and every file named in ``self.outputFiles`` (comma separated,
    with ``JOB_ID`` standing for the task's ``gwID``) must exist.

    The host is told about a failed execution when the result file cannot be
    parsed, an output file is missing, the execution time is zero, or the
    active/queue time read from GridWay is ``-1``.  A malformed result file
    or a wrong task type / host name only aborts the update.

    On success the host performance is updated, ``self.remainingSamples`` is
    reduced by the samples actually computed and both the task and the
    application are added to the session.
    """
    print("Updating info after exetcution of task " + gridTask.gwID + " on host " + gridTask.host.hostname + " (hostID " + str(gridTask.host.id) + ")")
    gridTask.status = "CLEAR"
    hostToUpdate = gridTask.host

    # 1.- open the result file that belongs to this task
    execution_file = base.tmpExecutionFolder + "/execution_result_" + gridTask.gwID + ".xml"
    try:
        doc = xml.dom.minidom.parse(execution_file)
    except Exception:
        # Missing or malformed file; not a bare except, so Ctrl-C and
        # SystemExit are no longer swallowed here.
        print("failed when updating info after execution. File " + execution_file + " could not be found")
        hostToUpdate.updateInfoAfterFailedExecution()
        Session.add(gridTask)
        return

    # If the expected output files do not exist, the task also counts as failed.
    for outputFile in self.outputFiles.split(","):
        # JOB_ID has to be replaced by gwID, as happens along the execution
        output = outputFile.replace("JOB_ID", gridTask.gwID)
        if not os.path.exists(base.tmpExecutionFolder + "/" + output):
            print("failed when updating info after execution. output file " + base.tmpExecutionFolder + "/" + output + " could not be found")
            hostToUpdate.updateInfoAfterFailedExecution()
            Session.add(gridTask)
            return

    executionInfoList = doc.getElementsByTagName('execution_info')

    gridTaskType = None
    remoteHostName = None
    executionTime = None
    dataSize = None
    realSamples = None

    for executionData in executionInfoList:
        try:
            gridTaskType = executionData.getElementsByTagName("type")[0].firstChild.data  # TODO: remove "unicode" from TEXT
            remoteHostName = executionData.getElementsByTagName("hostname")[0].firstChild.data
            executionTime = float(executionData.getElementsByTagName("execution_time")[0].firstChild.data)
            dataSize = float(executionData.getElementsByTagName("data_size")[0].firstChild.data)
            realSamples = int(executionData.getElementsByTagName("real_samples")[0].firstChild.data)
        except (IndexError, AttributeError, ValueError):
            # missing tag, empty tag or non-numeric content
            print("Error when reading execution file, exiting")
            Session.add(gridTask)
            return

    # 2.- process the results
    if gridTaskType != "execution":
        print("ERROR when updating info from an application execution")
        print("Incorrect task type, expecting \"execution\"")
        Session.add(gridTask)
        return

    if remoteHostName != hostToUpdate.hostname:
        print("ERROR when updating info from a application execution")
        print("Incorrect host name, expecting " + hostToUpdate.hostname)
        Session.add(gridTask)
        return

    if executionTime == 0:
        print("ERROR when updating info from an application execution")
        print("Execution time appears to be zero, and that's quite strange")
        hostToUpdate.updateInfoAfterFailedExecution()
        Session.add(gridTask)
        return

    totalActiveTime = InformationManager.readTotalActiveTime(gridTask.gwID)
    if totalActiveTime == -1:
        print("ERROR when updating info from an application execution")
        print("Could not read active time from GridWay log, considering that task failed")
        hostToUpdate.updateInfoAfterFailedExecution()
        Session.add(gridTask)
        return

    queueTime = InformationManager.readQueueTime(gridTask.gwID)
    if queueTime == -1:
        print("ERROR when updating info from an application execution")
        print("Could not read queue time from GridWay log, considering that task failed")
        hostToUpdate.updateInfoAfterFailedExecution()
        Session.add(gridTask)
        return

    transferTime = totalActiveTime - executionTime
    # -1 is passed on when no positive transfer time could be derived
    bandwidth = dataSize / transferTime if transferTime > 0 else -1

    # 3.- update the host performance
    computationalEffort = self.profile.constantEffort + self.profile.sampleEffort * realSamples
    whetstones = computationalEffort / executionTime
    hostToUpdate.updateInfoFromSuccessFulExecution(whetstones, queueTime, bandwidth)
    hostToUpdate.failedProfilings -= 1

    # 4.- update the state of the task and of the application
    gridTask.realSamples = realSamples
    gridTask.status = "CLEAR"
    self.remainingSamples -= realSamples
    print("APPLICATION UPDATE: " + str(self.remainingSamples) + "/" + str(self.desiredSamples) + " left")

    # 5.- temporary files: deletion is currently disabled, only reported
    print("In Application.py, I would be deletign" + execution_file)
    print("Execution file has been successfully deleted: " + execution_file)

    # 6.- update info on DB
    gridTask.endDate = datetime.now()
    Session.add(gridTask)
    Session.add(self)
def updateInfoAfterProfiling(self, gridTask):
    """Process the result of a finished host-profiling (benchmark) task.

    The task is always marked ``"CLEAR"``.  Its result file
    (``execution_result_<gwID>.xml`` inside ``base.tmpExecutionFolder``) is
    parsed; when the file cannot be parsed, holds no ``execution_info``
    element, is malformed, or reports a task type other than ``"benchmark"``
    or a host name different from the task's host, the host is told the
    profiling failed and the task is added to the session.

    Otherwise the measured whetstones, the queue time read from GridWay and
    the derived bandwidth (``-1`` when no positive transfer time is
    available) are handed to the host, and the task's ``endDate`` is set.
    """
    import os  # local import: only needed to build the result-file path

    print("Updating info after profiling site " + gridTask.host.hostname)
    print(" Task info:")
    print(" Task id: " + str(gridTask.id))
    print(" GW ID: " + gridTask.gwID)
    print(" desired host: " + gridTask.host.hostname)
    print(" Host type: " + str(gridTask.host.__class__))

    gridTask.status = "CLEAR"

    # 1.- open the result file that belongs to this task.
    # os.path.join yields the same path as before when the folder already
    # ends with a separator and a valid one when it does not.
    execution_file = os.path.join(base.tmpExecutionFolder, "execution_result_" + gridTask.gwID + ".xml")
    try:
        doc = xml.dom.minidom.parse(execution_file)
    except Exception:
        # Missing or malformed file; not a bare except, so Ctrl-C and
        # SystemExit are no longer swallowed here.
        print("failed when profiling host " + gridTask.host.hostname + ". File " + execution_file + " could not be found")
        gridTask.host.updateInfoAfterFailedProfiling()
        Session.add(gridTask)
        return

    executionInfoList = doc.getElementsByTagName('execution_info')
    if len(executionInfoList) == 0:
        # Without this guard the names read in the loop below would be
        # unbound and the checks after it would raise NameError.
        print("failed when profiling host " + gridTask.host.hostname + ". File " + execution_file + " has no execution_info element")
        gridTask.host.updateInfoAfterFailedProfiling()
        Session.add(gridTask)
        return

    for executionData in executionInfoList:
        try:
            gridTaskType = executionData.getElementsByTagName("type")[0].firstChild.data  # TODO: remove "unicode" from TEXT
            remoteHostName = executionData.getElementsByTagName("hostname")[0].firstChild.data
            whetstones = float(executionData.getElementsByTagName("whetstones")[0].firstChild.data)
            waitingTime = float(executionData.getElementsByTagName("execution_time")[0].firstChild.data)
            dataSize = float(executionData.getElementsByTagName("data_size")[0].firstChild.data)
        except (IndexError, AttributeError, ValueError):
            # missing tag, empty tag or non-numeric content
            print("failed when profiling host " + gridTask.host.hostname + ". File " + execution_file + " could not be found")
            gridTask.host.updateInfoAfterFailedProfiling()
            Session.add(gridTask)
            return

    # 2.- process the results
    if gridTaskType != "benchmark":
        print("ERROR when updating info from a site profiling")
        print("Incorrect task type, readed " + gridTaskType + " and should be \"benchmark\"")
        print(" considering the execution as failed")
        gridTask.host.updateInfoAfterFailedProfiling()
        Session.add(gridTask)
        return

    if remoteHostName != gridTask.host.hostname:
        print("ERROR when updating info from a site profiling")
        print("Incorrect host name, readed" + remoteHostName + " and should be " + gridTask.host.hostname)
        print(" considering the execution as failed")
        gridTask.host.updateInfoAfterFailedProfiling()
        Session.add(gridTask)
        return

    totalActiveTime = InformationManager.readTotalActiveTime(gridTask.gwID)
    transferTime = totalActiveTime - waitingTime
    queueTime = InformationManager.readQueueTime(gridTask.gwID)
    # -1 is passed on when no positive transfer time could be derived
    bandwidth = dataSize / transferTime if transferTime > 0 else -1

    # 3.- hand that information to the host
    gridTask.host.updateInfoFromSuccessFulExecution(whetstones, queueTime, bandwidth)

    # 4.- temporary files: deletion is currently disabled, only reported
    print("IN application.py, I would be deleting " + execution_file)
    print("Profiling file has been successfully deleted: " + execution_file)

    gridTask.endDate = datetime.now()
    # Register the task on the success path too, as every failure path does.
    Session.add(gridTask)