############################################################################################### #copy exec sources and inputs to remote resource log("#"*80) log("Preparing to copy application signature calculator,\n app. kernel input files and \n HPCC,IMB,IOR and Graph500 source code to remote resource\n\n") str_io=cStringIO.StringIO() try: #sys.stdout = sys.stderr = str_io akrr.sshCommand(rsh,"cd %s"%resource['appKerDir']) out=akrr.sshCommand(rsh,"ls "+resource['appKerDir']) files_in_appKerDir=out.strip().split() if not ("inputs" in files_in_appKerDir or "inputs/" in files_in_appKerDir): log("Copying app. kernel input tarball to %s"%resource['appKerDir']) akrr.scpToResource(resource,curdir+"/../../appker_repo/inputs.tar.gz",resource['appKerDir'],logfile=str_io) log("Unpacking app. kernel input files to %s/inputs"%resource['appKerDir']) print >>str_io, akrr.sshCommand(rsh,"tar xvfz %s/inputs.tar.gz"%resource['appKerDir']) out=akrr.sshCommand(rsh,"df -h %s/inputs"%resource['appKerDir']) if out.count("No such file or directory")==0: log("App. kernel input files are in %s/inputs\n"%resource['appKerDir'],highlight="ok") else: print >>str_io, out raise Exception("files are not copied!") else: log("WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n"%(warningCount+1,resource['appKerDir']),highlight='warning') warningCount+=1 str_io=cStringIO.StringIO()
def CreateBatchJobScriptAndSubmitIt(self):
    """Create the batch job script for this task and its subtasks and submit it.

    Over a single interactive ssh session to ``self.resource`` this method:

    1. makes sure the remote akrrdata, application and task directories exist;
    2. collects template variables from the resource, the application, the
       default wall limit stored in the DB (if any) and the task parameters;
    3. appends, for every subtask returned by ``self.GetSubTaskInfo()``, a
       shell fragment that runs the subtask's own job script;
    4. renders ``self.resource["batchJobTemplate"]``, writes it to
       ``<taskDir>/jobfiles/<JobScriptName>`` and copies it to the resource;
    5. submits the script with the scheduler specific command, extracts the
       job id, stores it in ``job.id`` and copies that file to every subtask
       directory;
    6. copies the remote task directory back and records the submission time
       in ACTIVETASKS.

    Returns:
        datetime.timedelta: one minute after a successful submission; three
        seconds when the number of failed submissions exceeded
        ``akrr.max_fails_to_submit_to_the_queue`` (the task then continues
        with "PushToDB"); otherwise ``akrr.RepeateAfterFailsToSubmitToTheQueue``.

    No exception leaves this method: every failure is recorded in
    ``self.status`` / ``self.statusinfo`` and turned into a retry delay.
    """
    from string import Template

    def ensure_remote_dir(session, path):
        """Create ``path`` on the remote side if missing; raise if it still is."""
        akrr.sshCommand(session, "if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi" % (path, path))
        answer = akrr.sshCommand(session, "if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi" % (path))
        if answer.find("DOESNOTEXIST") >= 0:
            raise akrr.akrrError(akrr.ERROR_REMOTE_FILES, "Can not create directory %s on %s." % (path, self.resource['name']))

    def close_session(session, best_effort=False):
        """Log out and close the ssh session; optionally ignore failures."""
        try:
            session.sendline("exit")
            session.close(force=True)
        except Exception:
            if not best_effort:
                raise

    self.JobScriptName = self.appName + ".job"
    print("### Creating batch job script and submitting it to remote machine")

    sh = None
    try:
        sh = akrr.sshResource(self.resource)

        # Create remote directories if needed: akrrdata, app dir, task dir.
        ensure_remote_dir(sh, self.resource['akrrdata'])
        ensure_remote_dir(sh, os.path.join(self.resource['akrrdata'], self.appName))
        ensure_remote_dir(sh, self.remoteTaskDir)

        # All following relative paths (job script, job.id) are resolved
        # against the remote task directory.
        akrr.sshCommand(sh, "cd %s" % (self.remoteTaskDir))

        # Default wall limit from the DB. This lookup is best effort: on any
        # failure the wall limit configured elsewhere is used, but the
        # problem is reported instead of being silently dropped.
        dbdefaults = {}
        try:
            db, cur = akrr.getDB()
            try:
                cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS WHERE task_id=%s ;''', (self.task_id,))
                (resource, app, resource_param, app_param) = cur.fetchall()[0]
                cur.execute("""SELECT walllimit FROM akrr_default_walllimit WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """, (resource, app, resource_param, app_param))
                rows = cur.fetchall()
                if len(rows) > 0:
                    dbdefaults['walllimit'] = rows[0][0]
            finally:
                cur.close()
                del db
        except Exception as e:
            print("WARNING: can not read default walllimit from DB: %s" % (e,))

        # Later dictionaries override earlier ones.
        batchvars = {}
        for source in (self.resource, self.app, dbdefaults, self.resourceParam, self.appParam):
            batchvars.update(source)

        # Stack the subtasks: one shell fragment per subtask.
        subTaskInfo = self.GetSubTaskInfo()
        if batchvars['shuffleSubtasks']:
            random.shuffle(subTaskInfo)
        fragments = []
        for subtask_id, subtask_status, subtask_datetimestamp, subtask_resource, subtask_app, subtask_task_param in subTaskInfo:
            remoteSubTaskDir = self.GetRemoteTaskDir(self.resource['akrrdata'], subtask_app, subtask_datetimestamp)
            subTaskJobScriptPath = os.path.join(remoteSubTaskDir, self.GetJobScriptName(subtask_app))
            fragments.append("cd " + remoteSubTaskDir + "\n")
            fragments.append("echo Starting " + subtask_app + "\n")
            fragments.append(self.resource['shell'] + " " + subTaskJobScriptPath + " > stdout 2> stderr\n")
            fragments.append("echo Done with " + subtask_app + "\n" + "\n")
        batchvars['subTasksExecution'] = "".join(fragments)

        # Number of nodes and cores: either nnodes is given and the cores
        # follow from it, or ncores is given and the nodes are rounded up.
        if 'nnodes' in batchvars:
            nnodes = batchvars['nnodes']
            ncores = nnodes * batchvars['ppn']
        else:
            ncores = batchvars['ncores']
            nnodes = (ncores + batchvars['ppn'] - 1) // batchvars['ppn']
        batchvars['akrrNCores'] = ncores
        batchvars['akrrNNodes'] = nnodes

        # Derived template variables.
        batchvars['akrrPPN'] = batchvars['ppn']
        batchvars['akrrNCoresToBorder'] = batchvars['akrrPPN'] * batchvars['akrrNNodes']
        batchvars['akrrTaskWorkingDir'] = self.remoteTaskDir
        # walllimit is divided by 60 and rendered as "<quotient>:<remainder>:00".
        batchvars['akrrWallTimeLimit'] = "%02d:%02d:00" % divmod(int(batchvars['walllimit']), 60)
        batchvars['localPATH'] = akrr.sshCommand(sh, "echo $PATH").strip()
        batchvars['akrrAppKerName'] = self.app['name']
        batchvars['akrrResourceName'] = self.resource['name']
        batchvars['akrrTimeStamp'] = self.timeStamp
        if batchvars['akrrNNodes'] == 1:
            batchvars['akrrPPN4NodesOrCores4OneNode'] = batchvars['akrrNCores']
        else:
            batchvars['akrrPPN4NodesOrCores4OneNode'] = batchvars['akrrPPN']
        if 'nodeListSetterTemplate' not in batchvars:
            batchvars['nodeListSetterTemplate'] = batchvars['nodeListSetter'][batchvars['batchScheduler']]

        # Process templates.
        batchvars['akrrCommonCommands'] = akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'], batchvars, keepDoubleBrakets=True)
        batchvars['akrrCommonCleanup'] = akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'], batchvars, keepDoubleBrakets=True)

        # Optional hook that may adjust the variables in place.
        if 'process_params' in batchvars:
            batchvars['process_params'](batchvars)

        # Generate the job script locally, then copy it to the resource.
        jobScript = akrr.formatRecursively(self.resource["batchJobTemplate"], batchvars)
        localJobScript = os.path.join(self.taskDir, "jobfiles", self.JobScriptName)
        with open(localJobScript, "w") as fout:
            fout.write(jobScript)
        akrr.scpToResource(self.resource, localJobScript, self.remoteTaskDir)
        akrr.sshCommand(sh, "cat %s " % (self.JobScriptName))

        # Send to queue and extract the job id from the scheduler's answer.
        sendToQueue = Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
        msg = akrr.sshCommand(sh, sendToQueue)
        matchObj = re.search(jidExtractPatterns[self.resource['batchScheduler']], msg, re.M | re.S)
        if not matchObj:
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id. " + msg)
        try:
            JobID = int(matchObj.group(1))
        except (ValueError, TypeError, IndexError):
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id. " + msg)

        akrr.sshCommand(sh, "echo %d > job.id" % (JobID))

        # Copy the job id to the subtasks.
        for subtask_id, subtask_status, subtask_datetimestamp, subtask_resource, subtask_app, subtask_task_param in subTaskInfo:
            remoteSubTaskDir = self.GetRemoteTaskDir(self.resource['akrrdata'], subtask_app, subtask_datetimestamp)
            akrr.sshCommand(sh, "cp job.id %s" % (remoteSubTaskDir))

        self.RemoteJobID = JobID
        self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

        close_session(sh)
        sh = None
        print("\nRemoteJobID= %s" % (self.RemoteJobID,))
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource, os.path.join(self.remoteTaskDir, "*"), os.path.join(self.taskDir, "jobfiles"), "-r")

        # Update time_submitted_to_queue in the DB.
        # NOTE(review): no explicit commit is issued here; presumably the
        # connection returned by akrr.getDB() autocommits -- confirm.
        db, cur = akrr.getDB()
        try:
            cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''', (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))
        finally:
            cur.close()
        del db

        self.status = "Created batch job script and have submitted it to remote queue."
        self.statusinfo = "Remote job ID is %d" % (self.RemoteJobID)
        self.ToDoNextString = "CheckTheJobOnRemoteMachine"

        # Check first time in 1 minute.
        return datetime.timedelta(minutes=1)
    except Exception:
        # Record the failure before any cleanup, so that a broken ssh
        # session can neither replace the traceback nor prevent the status
        # from being set.
        self.status = "ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo = traceback.format_exc()
        if sh is not None:
            close_session(sh, best_effort=True)
            sh = None

        if akrr.max_fails_to_submit_to_the_queue >= 0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue += 1
                if self.FailsToSubmitToTheQueue > akrr.max_fails_to_submit_to_the_queue:
                    # Stop execution of the task and submit results to db.
                    self.ToDoNextString = "PushToDB"
                    resultFile = os.path.join(self.taskDir, "result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue = 1
        else:
            self.FatalErrorsCount += 1

        akrr.printException(self.status)
        return akrr.RepeateAfterFailsToSubmitToTheQueue
def CreateBatchJobScriptAndSubmitIt(self, doNotSubmitToQueue=False):
    """Create the batch job script on the resource and, if allowed, submit it.

    Over a single interactive ssh session to ``self.resource`` this method
    makes sure the remote akrrdata, application and task directories exist,
    calls ``self.GenerateBatchJobScript()`` and copies the resulting script
    from ``<taskDir>/jobfiles`` to the remote task directory.

    The script is submitted to the queue only when ``'masterTaskID'`` is not
    in ``self.taskParam``; otherwise ``self.RemoteJobID`` is set to 0 and the
    status says the task waits for its master task.

    Args:
        doNotSubmitToQueue: when true, stop right after the job script has
            been copied to the resource and return ``None``.

    Returns:
        datetime.timedelta or None: one minute after a successful
        submission; 111*365 days for a subtask; three seconds when the
        failure limit is exceeded (the task then continues with "PushToDB");
        ``akrr.repeat_after_fails_to_submit_to_the_queue`` after any other
        failure; ``None`` when ``doNotSubmitToQueue`` is true.

    No exception leaves this method: every failure is recorded in
    ``self.status`` / ``self.statusinfo`` and turned into a retry delay.
    """
    from string import Template

    def ensure_remote_dir(session, path):
        """Create ``path`` on the remote side if missing; raise if it still is."""
        akrr.sshCommand(session, "if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi" % (path, path))
        answer = akrr.sshCommand(session, "if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi" % (path))
        if answer.find("DOESNOTEXIST") >= 0:
            raise akrr.akrrError(akrr.ERROR_REMOTE_FILES, "Can not create directory %s on %s." % (path, self.resource['name']))

    def close_session(session, best_effort=False):
        """Log out and close the ssh session; optionally ignore failures."""
        try:
            session.sendline("exit")
            session.close(force=True)
        except Exception:
            if not best_effort:
                raise

    self.JobScriptName = self.GetJobScriptName(self.appName)
    print("### Creating batch job script and submitting it to remote machine")

    sh = None
    try:
        sh = akrr.sshResource(self.resource)

        # Create remote directories if needed: akrrdata, app dir, task dir.
        ensure_remote_dir(sh, self.resource['akrrdata'])
        ensure_remote_dir(sh, os.path.join(self.resource['akrrdata'], self.appName))
        ensure_remote_dir(sh, self.remoteTaskDir)

        # All following relative paths (job script, job.id) are resolved
        # against the remote task directory.
        akrr.sshCommand(sh, "cd %s" % (self.remoteTaskDir))

        # Generate the job script locally and copy it to the resource.
        self.GenerateBatchJobScript()
        akrr.scpToResource(self.resource, os.path.join(self.taskDir, "jobfiles", self.JobScriptName), self.remoteTaskDir)

        if doNotSubmitToQueue:
            # The ssh session must not outlive this call; the script is
            # already in place, so a failing logout is not treated as a
            # failed submission.
            close_session(sh, best_effort=True)
            sh = None
            return

        akrr.sshCommand(sh, "cat %s " % (self.JobScriptName))

        # Send to queue -- only if the task is independent; a subtask is
        # executed by its master task and keeps job id 0 here.
        independent = 'masterTaskID' not in self.taskParam
        JobID = 0
        if independent:
            sendToQueue = Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
            msg = akrr.sshCommand(sh, sendToQueue)
            matchObj = re.search(jidExtractPatterns[self.resource['batchScheduler']], msg, re.M | re.S)
            if not matchObj:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id:\n" + msg)
            try:
                JobID = int(matchObj.group(1))
            except (ValueError, TypeError, IndexError):
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id:\n" + msg)

            akrr.sshCommand(sh, "echo %d > job.id" % (JobID))

        self.RemoteJobID = JobID
        self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

        close_session(sh)
        sh = None
        print("\nRemoteJobID= %s" % (self.RemoteJobID,))
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource, os.path.join(self.remoteTaskDir, "*"), os.path.join(self.taskDir, "jobfiles"), "-r")

        # Update time_submitted_to_queue in the DB.
        # NOTE(review): no explicit commit is issued here; presumably the
        # connection returned by akrr.getDB() autocommits -- confirm.
        db, cur = akrr.getDB()
        try:
            cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''', (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))
        finally:
            cur.close()
        del db

        if independent:
            self.status = "Created batch job script and have submitted it to remote queue."
            self.statusinfo = "Remote job ID is %d" % (self.RemoteJobID)
            self.ToDoNextString = "CheckTheJobOnRemoteMachine"

            # Check first time in 1 minute.
            return datetime.timedelta(minutes=1)

        # This is a subtask.
        self.status = "Created batch job script."
        self.statusinfo = "Created batch job script. Waiting for master task to execute it."
        self.ToDoNextString = "CheckTheJobOnRemoteMachine"

        # The master task will update the time when it finishes task execution.
        return datetime.timedelta(days=111 * 365)
    except Exception:
        # Record the failure before any cleanup, so that a broken ssh
        # session can neither replace the traceback nor prevent the status
        # from being set.
        self.status = "ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo = traceback.format_exc()
        if sh is not None:
            close_session(sh, best_effort=True)
            sh = None

        if akrr.max_fails_to_submit_to_the_queue >= 0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue += 1
                # A missing 'test_run' key is treated as "not a test run"
                # instead of raising KeyError from inside the handler. The
                # comparison with True (not plain truthiness) is kept as is.
                isTestRun = self.taskParam.get('test_run', False) == True
                if (self.FailsToSubmitToTheQueue > akrr.max_fails_to_submit_to_the_queue or
                        (isTestRun and self.FailsToSubmitToTheQueue >= 2)):
                    # Stop execution of the task and submit results to db.
                    self.ToDoNextString = "PushToDB"
                    resultFile = os.path.join(self.taskDir, "result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue = 1
        else:
            self.FatalErrorsCount += 1

        akrr.printException(self.status)
        return akrr.repeat_after_fails_to_submit_to_the_queue