Пример #1
0
 def ProccessResults(self,Verbose=True):
     """Run the application kernel parser on the job output and write result.xml.

     If self.ReportFormat is already "Error", the failure recorded earlier is
     written with WriteErrorXML() and nothing is parsed.  Otherwise the parser
     module named by self.app['parser'] is loaded from
     <akrr.curdir>/appkernelsparsers and its processAppKerOutput() is called:
     the value it returns is written to <taskDir>/result.xml, or an error
     report is written when it returns None.  Every path sets
     self.ToDoNextString to "PushToDB".

     Verbose -- print a progress message when true.

     Returns datetime.timedelta(seconds=3) on every path (presumably the delay
     before the next step is executed -- confirm against the caller).
     """
     if Verbose:print("Processing the output")
     #resolved before the try block so that the error handler can always use it
     resultFile=os.path.join(self.taskDir,"result.xml")
     try:
         if hasattr(self, 'ReportFormat') and self.ReportFormat=="Error":
             #i.e. fatal error and the last one is already in status/statusinfo
             self.ToDoNextString="PushToDB"
             self.WriteErrorXML(resultFile)
             return datetime.timedelta(seconds=3)
         
         (batchJobDir,stdoutFile,stderrFile,appstdoutFile,taskexeclogFile)=self.GetResultFiles(raiseError=True)
         
         #load the parser of this application kernel from its source file
         parserfilename=os.path.join(akrr.curdir,"appkernelsparsers",self.app['parser'])
         import imp
         with open(parserfilename, 'rb') as fp:
             thisAppKerParser = imp.load_module(
                 'thisAppKerParser', fp, parserfilename,
                 ('.py', 'rb', imp.PY_SOURCE)
             )
         
         #NOTE: self.resource and self.app are passed as they are (not copied),
         #so update() merges the task specific parameters into them in place
         appKerNResVars={}
         appKerNResVars['resource']=self.resource
         appKerNResVars['resource'].update(self.resourceParam)
         appKerNResVars['app']=self.app
         appKerNResVars['app'].update(self.appParam)
         appKerNResVars['taskId']=self.task_id
         appKerNResVars['subTasksId']=self.subTasksId
         
         #get the performance data
         performance=thisAppKerParser.processAppKerOutput(appstdout=appstdoutFile,geninfo=os.path.join(batchJobDir,"gen.info"),appKerNResVars=appKerNResVars)
         if performance is None:
             self.status="ERROR: Job have not finished successfully"
             self.statusinfo=""
             self.ToDoNextString="PushToDB"
             self.WriteErrorXML(resultFile)
         else:
             #the with block closes the file even when the write fails
             with open(resultFile,"w") as fout:
                 fout.write(performance)
             self.status="Output was processed and found that kernel either exited with error or executed successfully."
             self.statusinfo="Done"
             self.ToDoNextString="PushToDB"
         return datetime.timedelta(seconds=3)
     except Exception:
         #KeyboardInterrupt/SystemExit are deliberately not caught here
         errorInfo=traceback.format_exc()
         print(errorInfo)
         self.status="ERROR: Error happens during processing of output."
         self.statusinfo=errorInfo
         self.FatalErrorsCount+=1
         akrr.printException(self.status)
         self.ToDoNextString="PushToDB"
         self.WriteErrorXML(resultFile)
         return  datetime.timedelta(seconds=3)
Пример #2
0
 def PushToDB(self,Verbose=True):
     """Push the results of this task to the export DB.

     The actual writing is delegated to self.PushToDBRaw(); this method only
     provides the cursor, the finish time and the commit/rollback handling.
     The finish time is self.TimeJobPossiblyCompleted when that attribute
     exists, otherwise the current time.

     Verbose -- forwarded to PushToDBRaw().

     Returns None with self.ToDoNextString set to "IamDone" on success, or
     after more than akrr.export_db_max_repeat_attempts failed attempts.
     Returns akrr.export_db_repeat_attempt_in while attempts are left
     (presumably the delay before the next attempt -- confirm against the
     caller).  Failed attempts are counted in self.PushToDBAttemps.
     """
     db,cur=akrr.getExportDB()
     try:
         if hasattr(self,'TimeJobPossiblyCompleted'):
             time_finished=self.TimeJobPossiblyCompleted
         else:
             time_finished=datetime.datetime.today()
         self.PushToDBRaw(cur,self.task_id,time_finished,Verbose)
         db.commit()
         cur.close()
         del db
         self.ToDoNextString="IamDone"
         return None
     except Exception:
         #take the traceback before the clean-up gets a chance to disturb it
         errorInfo=traceback.format_exc()
         print(errorInfo)
         try:
             db.rollback()
             db.commit()
         finally:
             #the cursor is released even when the rollback itself fails
             cur.close()
             del db
         
         self.PushToDBAttemps=getattr(self,'PushToDBAttemps',0)+1
         
         if self.PushToDBAttemps <= akrr.export_db_max_repeat_attempts:
             akrr.printException("AKRR server was not able to push to external DB.")
             self.status="ERROR: Can not push to external DB, will try again"
             self.statusinfo=errorInfo
             return akrr.export_db_repeat_attempt_in
         else:
             akrr.printException("AKRR server was not able to push to external DB will only update local.")
             #no further attempt is made on this path, so the status must not promise one
             self.status="ERROR: Can not push to external DB, will only update local"
             self.statusinfo=errorInfo
             self.ToDoNextString="IamDone"
             return None
Пример #3
0
    def CreateBatchJobScriptAndSubmitIt(self):
        """Generate the batch job script of a bundle task and submit it to the queue.

        Steps, all done over one ssh session to self.resource:
          * make sure the remote akrrdata, application and task directories exist;
          * collect the template variables (resource, app, DB default walllimit,
            resource and app parameters -- later ones override earlier ones);
          * chain the job scripts of all subtasks into 'subTasksExecution';
          * render self.resource["batchJobTemplate"], store the script in
            <taskDir>/jobfiles and copy it to self.remoteTaskDir;
          * submit it with the command from submitCommands and extract the job id
            with the pattern from jidExtractPatterns;
          * write job.id remotely, copy it to every subtask directory, copy the
            remote task directory back and record the submission time in
            ACTIVETASKS.

        Returns datetime.timedelta(minutes=1) on success, with
        self.ToDoNextString set to "CheckTheJobOnRemoteMachine".  On failure
        returns akrr.RepeateAfterFailsToSubmitToTheQueue, or
        datetime.timedelta(seconds=3) with self.ToDoNextString set to "PushToDB"
        once self.FailsToSubmitToTheQueue exceeds
        akrr.max_fails_to_submit_to_the_queue.  (The returned values are
        presumably delays before the next step -- confirm against the caller.)
        """
        self.JobScriptName=self.appName+".job"
        print("### Creating batch job script and submitting it to remote machine")
        
        #the job script is generated locally and then copied to the remote resource
        #get ssh to remote resource
        sh=None
        try:
            sh=akrr.sshResource(self.resource)
            #Create remote directories if needed
            def CheckAndCreateDir(self,sh,d):
                #create the directory when it is missing, then verify that it exists
                cmd="if [ ! -d  \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
                akrr.sshCommand(sh,cmd)
                cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
                msg=akrr.sshCommand(sh,cmd)
                if msg.find("DOESNOTEXIST")>=0:
                    raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))
            #akrrdata
            CheckAndCreateDir(self,sh,self.resource['akrrdata'])
            #dir for app
            CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
            #dir for task
            CheckAndCreateDir(self,sh,self.remoteTaskDir)
            
            #cd to remoteTaskDir
            akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))
            
            #get walltime from DB
            #this lookup is best effort: when it fails the walllimit has to come
            #from the resource/app configuration, but the reason is reported now
            dbdefaults={}
            try:
                db,cur=akrr.getDB()
                try:
                    cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS
                WHERE task_id=%s ;''',(self.task_id,))
                    raw=cur.fetchall()
                    if len(raw)>0:
                        (resource,app,resource_param,app_param)=raw[0]
                        
                        cur.execute("""SELECT walllimit
                    FROM akrr_default_walllimit
                    WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
                        raw=cur.fetchall()
                        
                        if len(raw)>0:
                            dbdefaults['walllimit']=raw[0][0]
                finally:
                    #release the cursor also when one of the queries fails
                    cur.close()
                    del db
            except Exception as e:
                print("Can not get default walllimit from DB (%s), will use the configured one"%(e,))
            
            #create job-script
            batchvars={}
            for di in [self.resource,self.app,dbdefaults,self.resourceParam, self.appParam]:
                batchvars.update(di)
                
            #stack the subtasks
            subTaskInfo=self.GetSubTaskInfo()
            if batchvars['shuffleSubtasks']:
                random.shuffle(subTaskInfo)
            subTasksCommands=[]
            for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
                remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
                SubTaskJobScriptName=self.GetJobScriptName(subtask_app)
                SubTaskJobScriptPath=os.path.join(remoteSubTaskDir,SubTaskJobScriptName)
                
                #each subtask runs from its own directory with its output redirected there
                subTasksCommands.append("cd "+remoteSubTaskDir+"\n")
                subTasksCommands.append("echo Starting "+subtask_app+"\n")
                subTasksCommands.append(self.resource['shell']+" "+SubTaskJobScriptPath+" > stdout 2> stderr\n")
                subTasksCommands.append("echo Done with "+subtask_app+"\n"+"\n")
            batchvars['subTasksExecution']="".join(subTasksCommands)
            
            #calculate NNodes and NCores
            if 'nnodes' in batchvars:
                tmpNNodes=batchvars['nnodes']
                tmpNCores=tmpNNodes*batchvars['ppn']
            else:
                #round the number of nodes up when ncores is not a multiple of ppn
                tmpNCores=batchvars['ncores']
                tmpNNodes=tmpNCores//batchvars['ppn']
                if tmpNCores%batchvars['ppn']!=0:
                    tmpNNodes+=1
            
            batchvars['akrrNCores']=tmpNCores
            batchvars['akrrNNodes']=tmpNNodes
            
            #Set batchvars remaps
            batchvars['akrrPPN']=batchvars['ppn']
            batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
            batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
            #walllimit is handled as minutes here and rendered as HH:MM:00
            batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(int(batchvars['walllimit'])//60,int(batchvars['walllimit'])%60)
            batchvars['localPATH']=akrr.sshCommand(sh,"echo $PATH").strip()
            batchvars['akrrAppKerName']=self.app['name']
            batchvars['akrrResourceName']=self.resource['name']
            batchvars['akrrTimeStamp']= self.timeStamp
            if batchvars['akrrNNodes']==1:
                batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
            else:
                batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
            if 'nodeListSetterTemplate' not in batchvars:
                batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]
            
            #process templates
            batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
            batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)
            
            #do parameters adjustment
            if 'process_params' in batchvars:
                batchvars['process_params'](batchvars)
            #generate job script
            jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
            localJobScriptPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
            with open(localJobScriptPath,"w") as fout:
                fout.write(jobScript)
            akrr.scpToResource(self.resource,localJobScriptPath,os.path.join(self.remoteTaskDir))
            
            akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))
            
            #send to queue
            from string import Template
            sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
            msg=akrr.sshCommand(sh,sendToQueue)
            matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)
            
            if not matchObj:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
            try:
                JobID=int(matchObj.group(1))
            except Exception:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
            
            #NOTE(review): from here on the job is already in the remote queue; a
            #failure below still goes through the "failed to submit" handling and
            #can lead to a second submission -- confirm that this is acceptable
            akrr.sshCommand(sh,"echo %d > job.id"%(JobID))
            
            #cp job id to subtasks
            for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
                remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
                akrr.sshCommand(sh,"cp job.id %s"%(remoteSubTaskDir))
            
            self.RemoteJobID=JobID
            self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()
            
            sh.sendline("exit")
            sh.close(force=True)
            del sh
            sh=None
            print("\nRemoteJobID= %s"%(self.RemoteJobID,))
            print("copying files from remote machine")
            akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
            
            #update DB time_submitted_to_queue
            #NOTE(review): there is no db.commit() here; this presumably relies on
            #the connection from akrr.getDB() being in autocommit mode -- confirm
            db,cur=akrr.getDB()
            
            cur.execute('''UPDATE ACTIVETASKS
            SET time_submitted_to_queue=%s
            WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
            
            cur.close()
            del db
            
            self.status="Created batch job script and have submitted it to remote queue."
            self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
            self.ToDoNextString="CheckTheJobOnRemoteMachine"
            
            #check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)
        except Exception:
            #take the traceback before the clean-up gets a chance to disturb it
            errorInfo=traceback.format_exc()
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
                del sh
            self.status="ERROR Can not created batch job script and submit it to remote queue"
            self.statusinfo=errorInfo
            if akrr.max_fails_to_submit_to_the_queue>=0:
                if hasattr(self, "FailsToSubmitToTheQueue"):
                    self.FailsToSubmitToTheQueue+=1
                    if self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue:
                        #Stop execution of the task and submit results to db
                        self.ToDoNextString="PushToDB"
                        resultFile=os.path.join(self.taskDir,"result.xml")
                        self.WriteErrorXML(resultFile)
                        return  datetime.timedelta(seconds=3)
                else:
                    self.FailsToSubmitToTheQueue=1
            else:
                #a negative limit disables the give-up logic, failures are only counted
                self.FatalErrorsCount+=1
            
            akrr.printException(self.status)
            return akrr.RepeateAfterFailsToSubmitToTheQueue
Пример #4
0
 def CheckTheJobOnRemoteMachine(self):
     """Check whether the remote job is still in the queue and react to it.

     The entry of waitExprs for the batch scheduler of the resource is used as
     (command template, match function, pattern template, extra argument): the
     command is executed over ssh and the match function is called as
     match(pattern, output, extra argument).  A true result means that the job
     is still queued or running.

     * Still in queue and within the time limit: the status is updated and
       the module level active_task_default_attempt_repeat is returned.
     * Still in queue for longer than taskParam['MaxTimeInQueue'] (default
       akrr.max_time_in_queue): the job is terminated, the remote files are
       copied back and deleted, self.ReportFormat is set to "Error".
     * Not in queue any more: the remote files are copied back and deleted and
       self.TimeJobPossiblyCompleted is set.
     The last two cases set self.ToDoNextString to
     "CheckIfSubtasksDoneProccessingResults", call UpdateSubTasks() and return
     datetime.timedelta(seconds=3).

     On any error the failure is recorded, self.FatalErrorsCount is
     incremented and active_task_default_attempt_repeat is returned, so the
     same step is attempted again.
     """
     sh=None
     try:
         print("### Checking the job status on remote machine")
         from string import Template
         wE=waitExprs[self.resource['batchScheduler']]
         cmd =Template(wE[0]).substitute(jobId=str(self.RemoteJobID))
         rege=Template(wE[2]).substitute(jobId=str(self.RemoteJobID))
         
         sh=akrr.sshResource(self.resource)
         msg=akrr.sshCommand(sh,cmd)
         sh.sendline("exit")
         sh.close(force=True)
         del sh
         #mark the session as closed so the error handler does not close it again
         sh=None
         
         matchObj= wE[1](rege,msg,wE[3])
         if matchObj:
             print("Still in queue. Either waiting or running")
             maxTimeInQueue=self.taskParam.get('MaxTimeInQueue',akrr.max_time_in_queue)
             if datetime.datetime.today()-self.TimeJobSubmetedToRemoteQueue>maxTimeInQueue:
                 print("ERROR:")
                 print("Job exceeds the maximal time in queue (%s). And will be terminated."%(str(maxTimeInQueue)))
                 print("Removing job from remote queue.")
                 self.Terminate()
                 print("copying files from remote machine")
                 akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
                 print("Deleting all files from remote machine")
                 self.DeleteRemoteFolder()
                 self.status="ERROR: Job exceeds the maximal time in queue (%s) and was terminated."%(str(maxTimeInQueue))
                 self.statusinfo="\nLast Status report:\n"+msg
                 self.ReportFormat="Error"
                 self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"
                 
                 self.UpdateSubTasks()
                 return datetime.timedelta(seconds=3)
             
             self.status="Still in queue. Either waiting or running"
             self.statusinfo=msg
             return active_task_default_attempt_repeat
         else:
             print("Not in queue. Either exited with error or executed successfully.")
             print("copying files from remote machine")
             akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
             print("Deleting all files from remote machine")
             self.DeleteRemoteFolder()
             self.status="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
             self.statusinfo="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
             self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"
             self.UpdateSubTasks()
             self.TimeJobPossiblyCompleted=datetime.datetime.today()
             return datetime.timedelta(seconds=3)
     except Exception:
         #take the traceback before the clean-up gets a chance to disturb it
         errorInfo=traceback.format_exc()
         if sh is not None:
             sh.sendline("exit")
             sh.close(force=True)
             del sh
         self.status="ERROR Can not check the status of the job on remote resource"
         self.statusinfo=errorInfo
         self.FatalErrorsCount+=1
         akrr.printException(self.status)
         self.ToDoNextString="CheckTheJobOnRemoteMachine"
         return active_task_default_attempt_repeat
Пример #5
0
 def ProccessResultsOld(self,Verbose=True):
     """Produce result.xml from the raw stdout of the job (no parser module).

     The stdout file returned by GetResultFiles() is expected to contain a
     <rep:report ...> ... </rep:report> document:
       * if self.ReportFormat is already "Error", the recorded failure is
         written with WriteErrorXML();
       * stdout shorter than 5 characters is an error; stderr is searched for
         the "job killed: walltime" message to report the more specific cause;
       * stdout without "<rep:report" is an error; stdout (and the application
         stdout, when there is one) is stored in self.statusinfo;
       * otherwise stdout is copied to <taskDir>/result.xml and, when there is
         text around the report, the file is rewritten with the report only.
     Every path sets self.ToDoNextString to "PushToDB".

     Verbose -- print progress messages when true.

     Returns datetime.timedelta(seconds=3) on every path (presumably the delay
     before the next step is executed -- confirm against the caller).
     """
     if Verbose:print("Processing the output")
     #resolved before the try block so that the error handler can always use it
     resultFile=os.path.join(self.taskDir,"result.xml")
     try:
         if hasattr(self, 'ReportFormat') and self.ReportFormat=="Error":
             #i.e. fatal error and the last one is already in status/statusinfo
             self.ToDoNextString="PushToDB"
             self.WriteErrorXML(resultFile)
             return datetime.timedelta(seconds=3)
         
         (batchJobDir,stdoutFile,stderrFile,appstdoutFile,taskexeclogFile)=self.GetResultFiles(raiseError=True)
         
         #now check if stdoutFile is empty or not
         with open(stdoutFile,"r") as fin:
             remstdout=fin.read()
         
         if len(remstdout) < 5:
             with open(stderrFile,"r") as fin:
                 remstderr=fin.readlines()
             for line in remstderr:
                 if re.search(r'job killed: walltime *\d+ *exceeded limit *\d+',line):
                     self.status="ERROR: Job was killed on remote resource due to walltime exceeded limit"
                     self.statusinfo=line
                     self.ToDoNextString="PushToDB"
                     self.WriteErrorXML(resultFile)
                     return datetime.timedelta(seconds=3)
             
             if Verbose:print("stdout is too short meaning that application kernel exit prematurely")
             self.status="ERROR: stdout is too short meaning that application kernel exit prematurely"
             self.statusinfo="stdout is too short meaning that application kernel exit prematurely"
             self.WriteErrorXML(resultFile)
             self.ToDoNextString="PushToDB"
             return datetime.timedelta(seconds=3)
         
         #here we need to check file
         if remstdout.count("<rep:report")==0:
             self.status="ERROR: unknown error"
             self.statusinfo="stdout:\n"+remstdout
             if appstdoutFile is not None:
                 with open(appstdoutFile,"r") as fin:
                     self.statusinfo+="\nappstdout:\n"+fin.read()
             
             self.WriteErrorXML(resultFile)
             self.ToDoNextString="PushToDB"
             return datetime.timedelta(seconds=3)
         
         self.status="Output was processed and found that kernel either exited with error or executed successfully."
         self.statusinfo="Done"
         self.ToDoNextString="PushToDB"
         import shutil
         
         shutil.copy2(stdoutFile,resultFile)
         #need to extract xml part file, some resource put service information above and below
         #(result.xml was just copied from stdout, so the text read above is reused)
         content=remstdout
         if content[0]!='<' or content[-2]!='>':
             #need to reformat
             #NOTE(review): a missing closing tag (find() returns -1) is not
             #detected here and leads to a truncated document
             i0=content.find("<rep:report")
             i1=content.find("</rep:report>")
             
             with open(resultFile,"w") as fout:
                 fout.write("<?xml version='1.0'?>\n"+content[i0:i1+len("</rep:report>")]+"\n")
         return datetime.timedelta(seconds=3)
     except Exception:
         #KeyboardInterrupt/SystemExit are deliberately not caught here
         self.status="ERROR: Error happens during processing of output."
         self.statusinfo=traceback.format_exc()
         self.FatalErrorsCount+=1
         akrr.printException(self.status)
         self.ToDoNextString="PushToDB"
         self.WriteErrorXML(resultFile)
         return  datetime.timedelta(seconds=3)
Пример #6
0
 def ProccessResults(self,Verbose=True):
     """Run the application kernel parser on the job output and write result.xml.

     Application kernels without a 'parser' entry in self.app are delegated to
     ProccessResultsOld().  For the others:
       * when self.RemoteJobID is 0 and <taskDir>/jobfiles/job.id exists, the
         job id is read from that file;
       * when self.ReportFormat is already "Error", the recorded failure is
         written with WriteErrorXML() and nothing is parsed;
       * otherwise the parser module named by self.app['parser'] is loaded
         from <akrr.curdir>/appkernelsparsers and its processAppKerOutput() is
         called; the value it returns is written to <taskDir>/result.xml, or
         an error report is written when it returns None.
     Every path of this method itself sets self.ToDoNextString to "PushToDB".

     Verbose -- print a progress message when true.

     Returns datetime.timedelta(seconds=3), or whatever ProccessResultsOld()
     returns on the delegated path.
     """
     if 'parser' not in self.app:
         return self.ProccessResultsOld(Verbose)
     if Verbose:print("Processing the output")
     #resolved before the try block so that the error handler can always use them
     jobfilesDir=os.path.join(self.taskDir,"jobfiles")
     resultFile=os.path.join(self.taskDir,"result.xml")
     try:
         print(resultFile)
         #get job.id (from remote machine) of master node
         if self.RemoteJobID==0: #i.e. this is a subtask of a bundle
             jobIdFile=os.path.join(jobfilesDir,"job.id")
             if os.path.isfile(jobIdFile):
                 #the with block closes the file even when the content is not a number
                 with open(jobIdFile,"r") as fin:
                     self.RemoteJobID=int(fin.read().strip())
                 print("Master task's RemoteJobID is  %s"%(self.RemoteJobID,))
         
         if hasattr(self, 'ReportFormat') and self.ReportFormat=="Error":
             #i.e. fatal error and the last one is already in status/statusinfo
             self.ToDoNextString="PushToDB"
             self.WriteErrorXML(resultFile)
             return datetime.timedelta(seconds=3)
         
         (batchJobDir,stdoutFile,stderrFile,appstdoutFile,taskexeclogFile)=self.GetResultFiles(raiseError=True)
         
         #load the parser of this application kernel from its source file
         parserfilename=os.path.join(akrr.curdir,"appkernelsparsers",self.app['parser'])
         import imp
         with open(parserfilename, 'rb') as fp:
             thisAppKerParser = imp.load_module(
                 'thisAppKerParser', fp, parserfilename,
                 ('.py', 'rb', imp.PY_SOURCE)
             )
         
         #NOTE: self.resource and self.app are passed as they are (not copied),
         #so update() merges the task specific parameters into them in place
         appKerNResVars={}
         appKerNResVars['resource']=self.resource
         appKerNResVars['resource'].update(self.resourceParam)
         appKerNResVars['app']=self.app
         appKerNResVars['app'].update(self.appParam)
         
         #get the performance data
         performance=thisAppKerParser.processAppKerOutput(appstdout=appstdoutFile,
                                                          stdout=stdoutFile,
                                                          stderr=stderrFile,
                                                          geninfo=os.path.join(batchJobDir,"gen.info"),
                                                          appKerNResVars=appKerNResVars)
         if performance is None:
             self.status="ERROR: Job have not finished successfully"
             self.statusinfo=""
             self.ToDoNextString="PushToDB"
             self.WriteErrorXML(resultFile)
         else:
             with open(resultFile,"w") as fout:
                 fout.write(performance)
             self.status="Output was processed and found that kernel either exited with error or executed successfully."
             self.statusinfo="Done"
             self.ToDoNextString="PushToDB"
             #NOTE(review): performance is written to the file as text above; a
             #nodeList attribute exists only if the parser returns a richer
             #string-like object -- confirm against the parsers
             if hasattr(performance,'nodeList'):
                 self.nodesList=performance.nodeList
             else:
                 self.nodesList=None
         return datetime.timedelta(seconds=3)
     except Exception:
         #KeyboardInterrupt/SystemExit are deliberately not caught here
         errorInfo=traceback.format_exc()
         print(errorInfo)
         self.status="ERROR: Error happens during processing of output."
         self.statusinfo=errorInfo
         self.FatalErrorsCount+=1
         akrr.printException(self.status)
         self.ToDoNextString="PushToDB"
         self.WriteErrorXML(resultFile)
         return  datetime.timedelta(seconds=3)
Пример #7
0
 def GenerateBatchJobScript(self):
     """Render the batch job script of this task into <taskDir>/jobfiles.

     The template variables are merged, in this order (later ones override
     earlier ones), from self.resource, self.app, the resource specific (or
     'default') entry of self.app['appkernelOnResource'], the default
     walllimit stored in the DB, self.resourceParam and self.appParam.  With
     'autoWalltimeLimit' switched on, the walllimit is replaced by an estimate
     based on the last runs found in akrr_xdmod_instanceinfo (needs at least 5
     runs, none of the 5 most recent ones failed).  The node/core counts and
     the akrr* helper variables are derived from them, and
     self.resource["batchJobTemplate"] is rendered and written to
     <taskDir>/jobfiles/<self.JobScriptName>.

     self.JobScriptName is set from GetJobScriptName() when it is not set yet.

     Raises whatever the DB lookup or the script generation raises; for the
     latter self.status/self.statusinfo are filled in first.
     """
     if not hasattr(self, 'JobScriptName'):
         self.JobScriptName=self.GetJobScriptName(self.appName)
     #get walltime from DB
     dbdefaults={}
     db,cur=akrr.getDB()
     try:
         cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS
         WHERE task_id=%s ;''',(self.task_id,))
         raw=cur.fetchall()
         if len(raw)>0:
             (resource,app,resource_param,app_param)=raw[0]
         
             cur.execute("""SELECT walllimit
                 FROM akrr_default_walllimit
                 WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
             raw=cur.fetchall()
         
             if len(raw)>0:
                 dbdefaults['walllimit']=raw[0][0]
     finally:
         #errors still propagate to the caller (with their original traceback),
         #but the cursor is released in any case
         cur.close()
         del db
     #create job-script
     try:
         batchvars={}
         appkernelOnResource={}
         if 'appkernelOnResource' in self.app:
             if  self.resourceName in self.app['appkernelOnResource']:
                 appkernelOnResource=self.app['appkernelOnResource'][self.resourceName]
             elif 'default' in self.app['appkernelOnResource']:
                 appkernelOnResource=self.app['appkernelOnResource']['default']
             
         for di in [self.resource,self.app,appkernelOnResource,dbdefaults,self.resourceParam, self.appParam]:
             batchvars.update(di)
         
         #get autowalltime limit
         #best effort: on any problem the configured walllimit is kept
         try:
             if 'autoWalltimeLimit' in batchvars and batchvars['autoWalltimeLimit']==True:
                 print("\nautoWalltimeLimit is on, trying to estimate walltime limit...")
                 autoWalltimeLimitOverhead=1.2
                 if 'autoWalltimeLimitOverhead' in batchvars:
                     autoWalltimeLimitOverhead=batchvars['autoWalltimeLimitOverhead']+1.0
                 #query last 20 executions of this appkernel on that resource with that node count
                 db,cur=akrr.getDB(True)
                 try:
                     cur.execute('''SELECT resource,reporter,reporternickname,collected,status,walltime FROM akrr_xdmod_instanceinfo
                     WHERE  `resource`=%s AND `reporternickname` =  %s
                     ORDER BY  `akrr_xdmod_instanceinfo`.`collected` DESC 
                     LIMIT 0 , 20''',(self.resource['name'],"%s.%d"%(self.app['name'],batchvars['nnodes'])))
                     
                     raw=cur.fetchall()
                 finally:
                     cur.close()
                     del db
                 
                 #rows are ordered from the most recent run to the oldest one
                 lastFiveRunsSuccessfull=True
                 maxwalltime=0.0
                 for runIndex,r in enumerate(raw):
                     if runIndex<5 and r['status']==0:
                         lastFiveRunsSuccessfull=False
                     if r['status']==1 and r['walltime']>maxwalltime:
                         maxwalltime=r['walltime']
                 nRuns=len(raw)
                 if nRuns<5:
                     print("There are only %d previous run, need at least 5 for walltime limit autoset"%(nRuns))
                 elif lastFiveRunsSuccessfull == False:
                     print("One of last 5 runs have failed. Would not use autoset.")
                 else:
                     #longest successful run plus the overhead, rounded up to whole minutes
                     newWallLimit=int(autoWalltimeLimitOverhead*maxwalltime/60.0+0.99)
                     print("Max walltime was %.1f s, will change walltime limit from %.1f minutes to %d minutes"%(maxwalltime,batchvars['walllimit'],newWallLimit))
                     batchvars['walllimit']=newWallLimit
                 print("")
         except Exception as e:
             print("Can not estimate walltime limit automatically (%s), will use the configured one"%(e,))
         
         #calculate NNodes and NCores
         if 'nnodes' in batchvars:
             tmpNNodes=batchvars['nnodes']
             tmpNCores=tmpNNodes*batchvars['ppn']
         else:
             #round the number of nodes up when ncores is not a multiple of ppn
             tmpNCores=batchvars['ncores']
             tmpNNodes=tmpNCores//batchvars['ppn']
             if tmpNCores%batchvars['ppn']!=0:
                 tmpNNodes+=1
         
         batchvars['akrrNCores']=tmpNCores
         batchvars['akrrNNodes']=tmpNNodes
         
         #Set batchvars remaps
         batchvars['akrrPPN']=batchvars['ppn']
         batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
         batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
         #walllimit is handled as minutes here and rendered as HH:MM:00
         batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(int(batchvars['walllimit'])//60,int(batchvars['walllimit'])%60)
         batchvars['akrrAppKerName']=self.app['name']
         batchvars['akrrResourceName']=self.resource['name']
         batchvars['akrrTimeStamp']= self.timeStamp
         if batchvars['akrrNNodes']==1:
             batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
         else:
             batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
         
         if 'nodeListSetterTemplate' not in batchvars:
             batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]
         
         #process templates
         batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
         batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)
         
         #specially for IOR request two nodes for single node benchmark, one for read and one for write
         if batchvars['requestTwoNodesForOneNodeAppKer']==True and batchvars['akrrNNodes']==1 and 'batchJobHeaderTemplate' in batchvars:
             #only the job header is rendered with the doubled counts, the rest
             #of the script keeps the single node values
             batchvars2=copy.deepcopy(batchvars)
             batchvars2['akrrNCores']=2*batchvars['akrrNCores']
             batchvars2['akrrNNodes']=2*batchvars['akrrNNodes']
             batchvars2['akrrNCoresToBorder']=2*batchvars['akrrNCoresToBorder']
             batchvars2['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
             batchvars['batchJobHeaderTemplate']=akrr.formatRecursively(batchvars2['batchJobHeaderTemplate'],batchvars2)
         
         #do parameters adjustment
         if 'process_params' in batchvars:
             batchvars['process_params'](batchvars)
         
         #generate job script
         jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
         jobScriptFullPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
         with open(jobScriptFullPath,"w") as fout:
             fout.write(jobScript)
     except Exception:
         self.status="ERROR: Can not created batch job script"
         self.statusinfo=traceback.format_exc()
         akrr.printException(self.status)
         #bare raise keeps the traceback of the original error
         raise
Пример #8
0
 def CreateBatchJobScriptAndSubmitIt(self,doNotSubmitToQueue=False):
     """Generate the batch job script, copy it to the remote resource and queue it.

     Steps, in order:
       1. open an ssh session to ``self.resource`` and make sure the remote
          directories ``akrrdata``, ``akrrdata/<appName>`` and
          ``self.remoteTaskDir`` exist;
       2. call ``self.GenerateBatchJobScript()`` and scp the script from
          ``<taskDir>/jobfiles`` into ``self.remoteTaskDir``;
       3. unless ``self.taskParam`` contains ``masterTaskID``, run the submit
          command of the resource's batch scheduler and extract the job id
          from its output (a task with ``masterTaskID`` keeps job id 0);
       4. write the job id to the remote file ``job.id``, copy the remote
          task directory back into ``<taskDir>/jobfiles`` and store the
          submission time in the ``ACTIVETASKS`` table.

     Parameters:
         doNotSubmitToQueue -- when true, stop right after the script has
             been copied to the remote resource; nothing is submitted.

     Returns:
         * ``None`` when ``doNotSubmitToQueue`` is true;
         * ``datetime.timedelta(minutes=1)`` after an independent task was
           submitted (``ToDoNextString`` is "CheckTheJobOnRemoteMachine");
         * ``datetime.timedelta(days=111*365)`` for a subtask;
         * on failure ``datetime.timedelta(seconds=3)`` when the task gives up
           (``ToDoNextString`` is "PushToDB" and an error XML is written),
           otherwise ``akrr.repeat_after_fails_to_submit_to_the_queue``.

     Any ``Exception`` raised while doing the work is caught here and turned
     into ``self.status``/``self.statusinfo`` plus failure accounting.
     """
     self.JobScriptName=self.GetJobScriptName(self.appName)
     #single-argument print(...) behaves the same as the print statement
     print("### Creating batch job script and submitting it to remote machine")

     def closeSession(session):
         #leave the remote shell, then force-close the connection
         session.sendline("exit")
         session.close(force=True)

     sh=None
     try:
         #get ssh to remote resource
         sh=akrr.sshResource(self.resource)

         def checkAndCreateDir(d):
             #create the remote directory if it is missing, then verify that it exists
             cmd="if [ ! -d  \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
             akrr.sshCommand(sh,cmd)
             cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
             msg=akrr.sshCommand(sh,cmd)
             if msg.find("DOESNOTEXIST")>=0:
                 raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))

         #akrrdata
         checkAndCreateDir(self.resource['akrrdata'])
         #dir for app
         checkAndCreateDir(os.path.join(self.resource['akrrdata'],self.appName))
         #dir for task
         checkAndCreateDir(self.remoteTaskDir)

         #cd to remoteTaskDir, later commands use paths relative to it
         akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

         #generate the job script locally and copy it to the remote task directory
         self.GenerateBatchJobScript()
         akrr.scpToResource(self.resource,os.path.join(self.taskDir,"jobfiles",self.JobScriptName),os.path.join(self.remoteTaskDir))

         if doNotSubmitToQueue:
             #nothing else to do; do not leave the ssh session open
             closeSession(sh)
             sh=None
             return

         #show the copied script in the ssh session
         akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

         #send to queue
         from string import Template
         JobID=0
         independentTask='masterTaskID' not in self.taskParam
         if independentTask:
             #i.e. submit to queue only if task is independent
             sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
             msg=akrr.sshCommand(sh,sendToQueue)
             matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)

             if not matchObj:
                 raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)
             try:
                 JobID=int(matchObj.group(1))
             except (IndexError,TypeError,ValueError):
                 #only parsing problems mean "no job id"; do not trap
                 #KeyboardInterrupt/SystemExit as a bare except would
                 raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)

         akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

         self.RemoteJobID=JobID
         self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

         #the interactive session is not needed any more
         closeSession(sh)
         sh=None

         print("\nRemoteJobID= %s"%(self.RemoteJobID,))
         print("copying files from remote machine")
         akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

         #update DB time_submitted_to_queue
         db,cur=akrr.getDB()
         try:
             cur.execute('''UPDATE ACTIVETASKS
         SET time_submitted_to_queue=%s
         WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
         finally:
             #release the cursor even if the update fails
             cur.close()
             del db

         if independentTask:
             self.status="Created batch job script and have submitted it to remote queue."
             self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
             self.ToDoNextString="CheckTheJobOnRemoteMachine"

             #check first time in 1 minute
             return datetime.timedelta(days=0, hours=0, minutes=1)

         #i.e. this is a subtask
         self.status="Created batch job script."
         self.statusinfo="Created batch job script. Waiting for master task to execute it."
         self.ToDoNextString="CheckTheJobOnRemoteMachine"

         #master task will update the time when it will finish task execution
         return datetime.timedelta(days=111*365)

     except Exception:
         if sh is not None:
             closeSession(sh)
             sh=None
         self.status="ERROR Can not created batch job script and submit it to remote queue"
         self.statusinfo=traceback.format_exc()
         if akrr.max_fails_to_submit_to_the_queue>=0:
             if hasattr(self, "FailsToSubmitToTheQueue"):
                 self.FailsToSubmitToTheQueue+=1
                 if (self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue or
                         (self.taskParam['test_run']==True and self.FailsToSubmitToTheQueue>=2)):
                     #Stop execution of the task and submit results to db
                     self.ToDoNextString="PushToDB"
                     resultFile=os.path.join(self.taskDir,"result.xml")
                     self.WriteErrorXML(resultFile)
                     return  datetime.timedelta(seconds=3)
             else:
                 #first failure of this task
                 self.FailsToSubmitToTheQueue=1
         else:
             #negative limit: the failure is counted as a fatal error instead
             self.FatalErrorsCount+=1

         akrr.printException(self.status)
         return akrr.repeat_after_fails_to_submit_to_the_queue