Пример #1
0
def validate_resource_name(resource_name):
    """Check whether ``resource_name`` can be used for a new resource.

    The name is rejected when it is empty or whitespace only, when a path
    with that name already exists under ``resources_dir``, or when a row with
    that name is already present in ``resource.nickname`` (queried through
    ``akrr.getAKDB``) or in ``resources.name`` (queried through
    ``akrr.getDB``). The reason for a rejection is logged as an error.

    :param resource_name: candidate resource name (a string).
    :return: True if the name is free to use, False otherwise.
    """
    if resource_name.strip()=="":
        logging.error("Bad name for resource, try a different name")
        return False

    #check config file presence
    file_path = os.path.abspath(os.path.join(resources_dir, resource_name))
    if os.path.exists(file_path):
        logging.error("Resource configuration directory (%s) for resource with name %s already present on file system, try a different name"%(file_path,resource_name,))
        return False

    #check the entry in mod_appkernel
    dbAK,curAK=akrr.getAKDB(True)
    try:
        curAK.execute('''SELECT * FROM resource WHERE nickname=%s''', (resource_name,))
        resource_in_AKDB = curAK.fetchall()
    finally:
        #always release the cursor, even when the query fails
        curAK.close()
    if len(resource_in_AKDB)!=0:
        logging.error("Resource with name %s already present in mod_appkernel DB, try a different name"%(resource_name,))
        return False

    #check the entry in mod_akrr
    db,cur=akrr.getDB(True)
    try:
        cur.execute('''SELECT * FROM resources WHERE name=%s''', (resource_name,))
        resource_in_DB = cur.fetchall()
    finally:
        #always release the cursor, even when the query fails
        cur.close()
    if len(resource_in_DB)!=0:
        logging.error("Resource with name %s already present in mod_akrr DB, try a different name"%(resource_name,))
        return False

    return True
Пример #2
0
 def UpdateSubTasks(self):
     """Force an immediate re-check of every subtask of this task.

     For each subtask returned by ``GetSubTaskInfo`` the ``next_check_time``
     column of ACTIVETASKS is set to the current time; the updates are
     committed together at the end.
     """
     subTaskInfo=self.GetSubTaskInfo()

     db,cur=akrr.getDB()
     try:
         for subtask in subTaskInfo:
             #only the task id (first field of the subtask record) is needed
             cur.execute('''UPDATE ACTIVETASKS
                         SET next_check_time=%s
                         WHERE task_id=%s ;''',(datetime.datetime.today(),subtask[0]))

         db.commit()
     finally:
         #release the cursor even when an UPDATE or the commit fails
         cur.close()
         del db
Пример #3
0
 def GetSubTaskInfo(self):
     """Return the active subtasks whose master task is this task.

     Rows of ACTIVETASKS whose ``task_param`` text contains both this task's
     id and the word ``masterTaskID`` are fetched in ascending ``task_id``
     order. That text match is only a prefilter: each ``task_param`` is then
     evaluated and the row is kept only when its ``masterTaskID`` entry equals
     ``self.task_id``.

     :return: list of ``[task_id,status,datetimestamp,resource,app,task_param]``
         lists, where ``task_param`` is the evaluated object, not the stored text.
     """
     db,cur=akrr.getDB()
     try:
         cur.execute('''SELECT task_id,status,datetimestamp,resource,app,task_param FROM ACTIVETASKS
                     WHERE task_param LIKE %s AND task_param LIKE '%%masterTaskID%%'
                     ORDER BY  task_id ASC 
                     ''',("%%%d%%"%(self.task_id,),))
         raws=cur.fetchall()
     finally:
         #release the cursor even when the query fails
         cur.close()
         del db

     subTaskInfo=[]
     for task_id,status,datetimestamp,resource,app,task_param in raws:
         #SECURITY NOTE(review): eval() executes whatever text is stored in the
         #task_param column; this is only safe while that column is written
         #exclusively by trusted code - TODO confirm or switch to a safe parser
         task_param=eval(task_param)
         #the LIKE prefilter can match rows that merely mention masterTaskID,
         #so a missing key means "not a subtask of this task", not an error
         if task_param.get('masterTaskID')==self.task_id:
             subTaskInfo.append([task_id,status,datetimestamp,resource,app,task_param])

     return subTaskInfo
Пример #4
0
def generate_resource_config(resource_id, resource_name, queuing_system):
    """Create the configuration file of a new resource and register it in the DBs.

    The slurm and pbs queue templates are both read, the path of the new
    ``resource.inp.py`` is stored in the module-level ``resource_cfg_filename``
    and ``create_resource_template`` is called with the template that matches
    ``queuing_system``. Unless ``args.test`` is set, the resource directory is
    created (mode 0700) and the resource is inserted, when not already present,
    into ``resource`` (via ``akrr.getAKDB``) and ``resources`` (via
    ``akrr.getDB``).

    :param resource_id: value stored in the ``xdmod_resource_id`` columns.
    :param resource_name: name of the resource; also used as directory name.
    :param queuing_system: ``'slurm'`` or ``'pbs'``.
    :raises KeyError: if ``queuing_system`` is neither ``'slurm'`` nor ``'pbs'``.
    """
    logging.info("Initiating %s at AKRR"%(resource_name,))

    template_path = os.path.join(akrr.curdir, 'templates', 'template.{0}.inp.py')
    queues = {
        'slurm': retrieve_queue_template(template_path, 'slurm'),
        'pbs': retrieve_queue_template(template_path, 'pbs'),
    }

    if not args.test:
        #0o700 is the octal spelling accepted by both Python 2.6+ and Python 3
        os.mkdir(os.path.join(resources_dir, resource_name),0o700)

    file_path = os.path.abspath(os.path.join(resources_dir, resource_name, 'resource.inp.py'))
    global resource_cfg_filename
    resource_cfg_filename=file_path

    create_resource_template(file_path, queues[queuing_system], queues[queuing_system])

    if not args.test:
        #add entry to mod_appkernel.resource
        dbAK,curAK=akrr.getAKDB(True)
        try:
            curAK.execute('''SELECT * FROM resource WHERE nickname=%s''', (resource_name,))
            resource_in_AKDB = curAK.fetchall()
            if len(resource_in_AKDB)==0:
                curAK.execute('''INSERT INTO resource (resource,nickname,description,enabled,visible,xdmod_resource_id)
                            VALUES(%s,%s,%s,0,0,%s);''',
                            (resource_name,resource_name,resource_name,resource_id))
                dbAK.commit()
            #read the row back to learn the id it has in the resource table
            curAK.execute('''SELECT * FROM resource WHERE nickname=%s''', (resource_name,))
            resource_in_AKDB = curAK.fetchall()
            resource_id_in_AKDB=resource_in_AKDB[0]['resource_id']
        finally:
            #always release the cursor, even when a statement fails
            curAK.close()

        #add entry to mod_akrr.resources
        db,cur=akrr.getDB(True)
        try:
            cur.execute('''SELECT * FROM resources WHERE name=%s''', (resource_name,))
            resource_in_DB = cur.fetchall()
            if len(resource_in_DB)==0:
                cur.execute('''INSERT INTO resources (id,xdmod_resource_id,name,enabled)
                            VALUES(%s,%s,%s,%s);''',
                            (resource_id_in_AKDB,resource_id,resource_name,0))
                db.commit()

                #NOTE(review): this message is only logged when a new row was
                #inserted; kept as in the original - confirm that is intended
                logging.info("Resource configuration is in "+file_path)
        finally:
            #always release the cursor, even when a statement fails
            cur.close()
Пример #5
0
    def CreateBatchJobScriptAndSubmitIt(self):
        """Build the master batch job script, upload it and submit it to the queue.

        Everything remote is done over one ssh session to ``self.resource``:

        * make sure the remote akrrdata, application and task directories exist;
        * collect template variables from the resource, the app, the default
          wall limit stored in the DB and the resource/app parameters;
        * chain the job scripts of all subtasks (see ``GetSubTaskInfo``) into
          the ``subTasksExecution`` template variable;
        * render ``batchJobTemplate``, save it under ``<taskDir>/jobfiles``,
          copy it to the remote task directory and submit it;
        * write the job id to ``job.id`` (also copied into every subtask
          directory) and record the submission time in ACTIVETASKS.

        :return: a one minute ``datetime.timedelta`` on success. On any failure
            the error is stored in ``self.status``/``self.statusinfo`` and
            ``akrr.RepeateAfterFailsToSubmitToTheQueue`` is returned, unless the
            failure count exceeds ``akrr.max_fails_to_submit_to_the_queue``; in
            that case an error result file is written, ``ToDoNextString`` is set
            to ``"PushToDB"`` and a three second ``timedelta`` is returned.
        """
        self.JobScriptName=self.appName+".job"
        print("### Creating batch job script and submitting it to remote machine")

        #as a current bypass will create a job script remotely and copy it here
        #get ssh to remote resource
        sh=None
        try:
            sh=akrr.sshResource(self.resource)
            #Create remote directories if needed
            def CheckAndCreateDir(self,sh,d):
                cmd="if [ ! -d  \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
                akrr.sshCommand(sh,cmd)
                cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
                msg=akrr.sshCommand(sh,cmd)
                if msg.find("DOESNOTEXIST")>=0:
                    raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))
            #akrrdata
            CheckAndCreateDir(self,sh,self.resource['akrrdata'])
            #dir for app
            CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
            #dir for task
            CheckAndCreateDir(self,sh,self.remoteTaskDir)

            #cd to remoteTaskDir
            akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

            #get walltime from DB
            dbdefaults={}
            try:
                db,cur=akrr.getDB()
                try:
                    cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS
                    WHERE task_id=%s ;''',(self.task_id,))
                    raw=cur.fetchall()
                    #without a row for this task there is no default to look up
                    if len(raw)>0:
                        (resource,app,resource_param,app_param)=raw[0]

                        cur.execute("""SELECT walllimit
                            FROM akrr_default_walllimit
                            WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
                        raw=cur.fetchall()

                        if len(raw)>0:
                            dbdefaults['walllimit']=raw[0][0]
                finally:
                    #release the cursor whether or not the lookup succeeded
                    cur.close()
                    del db
            except Exception:
                #best effort: when the DB lookup fails the wall limit from the
                #resource/app configuration is used instead
                pass

            #create job-script
            batchvars={}
            #later dictionaries override earlier ones
            for di in [self.resource,self.app,dbdefaults,self.resourceParam, self.appParam]:
                batchvars.update(di)

            #stack the subtasks
            subTaskInfo=self.GetSubTaskInfo()
            if batchvars['shuffleSubtasks']:
                random.shuffle(subTaskInfo)
            subTasksExecution=[]
            for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
                remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
                SubTaskJobScriptName=self.GetJobScriptName(subtask_app)
                SubTaskJobScriptPath=os.path.join(remoteSubTaskDir,SubTaskJobScriptName)

                subTasksExecution.append("cd "+remoteSubTaskDir+"\n")
                subTasksExecution.append("echo Starting "+subtask_app+"\n")
                subTasksExecution.append(self.resource['shell']+" "+SubTaskJobScriptPath+" > stdout 2> stderr\n")
                subTasksExecution.append("echo Done with "+subtask_app+"\n"+"\n")

            batchvars['subTasksExecution']="".join(subTasksExecution)

            #calculate NNodes and NCores
            if 'nnodes' in batchvars:
                tmpNNodes=batchvars['nnodes']
                tmpNCores=tmpNNodes*batchvars['ppn']
            else:
                tmpNCores=batchvars['ncores']
                #round up to whole nodes; '//' keeps the division integral on
                #every Python version
                tmpNNodes=tmpNCores//batchvars['ppn']
                if tmpNCores%batchvars['ppn']!=0:
                    tmpNNodes+=1

            batchvars['akrrNCores']=tmpNCores
            batchvars['akrrNNodes']=tmpNNodes

            #Set batchvars remaps
            batchvars['akrrPPN']=batchvars['ppn']
            batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
            batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
            #walllimit is formatted as minutes -> HH:MM:00
            batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(int(batchvars['walllimit'])//60,int(batchvars['walllimit'])%60)
            batchvars['localPATH']=akrr.sshCommand(sh,"echo $PATH").strip()
            batchvars['akrrAppKerName']=self.app['name']
            batchvars['akrrResourceName']=self.resource['name']
            batchvars['akrrTimeStamp']= self.timeStamp
            if batchvars['akrrNNodes']==1:
                batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
            else:
                batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
            if 'nodeListSetterTemplate' not in batchvars:
                batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]

            #process templates
            batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
            batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)

            #do parameters adjustment
            if 'process_params' in batchvars:
                batchvars['process_params'](batchvars)
            #generate job script
            jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
            jobScriptFullPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
            with open(jobScriptFullPath,"w") as fout:
                fout.write(jobScript)
            akrr.scpToResource(self.resource,jobScriptFullPath,os.path.join(self.remoteTaskDir))

            #show the uploaded script in the ssh session output
            akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

            #send to queue
            from string import Template
            sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
            msg=akrr.sshCommand(sh,sendToQueue)
            matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)

            JobID=None
            if matchObj:
                try:
                    JobID=int(matchObj.group(1))
                except Exception:
                    raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
            else:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)

            akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

            #cp job id to subtasks
            for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
                remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
                akrr.sshCommand(sh,"cp job.id %s"%(remoteSubTaskDir))

            self.RemoteJobID=JobID
            self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

            sh.sendline("exit")
            sh.close(force=True)
            del sh
            #mark the session as closed so the error handler does not touch it
            sh=None
            print("\nRemoteJobID= %s"%(self.RemoteJobID,))
            print("copying files from remote machine")
            akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

            #update DB time_submitted_to_queue
            db,cur=akrr.getDB()
            try:
                cur.execute('''UPDATE ACTIVETASKS
                SET time_submitted_to_queue=%s
                WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
                #commit explicitly so the update is not discarded together
                #with the connection
                db.commit()
            finally:
                cur.close()
                del db

            self.status="Created batch job script and have submitted it to remote queue."
            self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
            self.ToDoNextString="CheckTheJobOnRemoteMachine"

            #check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)
        except Exception:
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
                del sh
            self.status="ERROR Can not created batch job script and submit it to remote queue"
            self.statusinfo=traceback.format_exc()
            if akrr.max_fails_to_submit_to_the_queue>=0:
                if hasattr(self, "FailsToSubmitToTheQueue"):
                    self.FailsToSubmitToTheQueue+=1
                    if self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue:
                        #Stop execution of the task and submit results to db
                        self.ToDoNextString="PushToDB"
                        resultFile=os.path.join(self.taskDir,"result.xml")
                        self.WriteErrorXML(resultFile)
                        return  datetime.timedelta(seconds=3)
                else:
                    #first failure: start counting
                    self.FailsToSubmitToTheQueue=1
            else:
                self.FatalErrorsCount+=1

            akrr.printException(self.status)
            return akrr.RepeateAfterFailsToSubmitToTheQueue
Пример #6
0
 def GenerateBatchJobScript(self):
     """Render the batch job script of this task into ``<taskDir>/jobfiles``.

     Template variables are merged, later sources overriding earlier ones,
     from: the resource, the app, the app's ``appkernelOnResource`` entry for
     this resource (or its ``default`` entry), the default wall limit stored in
     the DB, the resource parameters and the app parameters. When
     ``autoWalltimeLimit`` is enabled the wall limit is re-estimated from the
     most recent recorded runs; that estimation is best effort and its failure
     is ignored. The rendered ``batchJobTemplate`` is written to
     ``<taskDir>/jobfiles/<JobScriptName>``.

     :raises Exception: any error of the default wall limit lookup or of the
         script generation is propagated; for the latter ``self.status`` and
         ``self.statusinfo`` are set before re-raising.
     """
     if not hasattr(self, 'JobScriptName'):
         self.JobScriptName=self.GetJobScriptName(self.appName)

     #get walltime from DB
     dbdefaults={}
     db,cur=akrr.getDB()
     try:
         cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS
         WHERE task_id=%s ;''',(self.task_id,))
         raw=cur.fetchall()
         if len(raw)>0:
             (resource,app,resource_param,app_param)=raw[0]

             cur.execute("""SELECT walllimit
                 FROM akrr_default_walllimit
                 WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
             raw=cur.fetchall()

             if len(raw)>0:
                 dbdefaults['walllimit']=raw[0][0]
     finally:
         #release the cursor in every case; errors propagate to the caller
         #with their original traceback
         cur.close()
         del db

     #create job-script
     try:
         batchvars={}
         appkernelOnResource={}
         if 'appkernelOnResource' in self.app:
             if self.resourceName in self.app['appkernelOnResource']:
                 appkernelOnResource=self.app['appkernelOnResource'][self.resourceName]
             elif 'default' in self.app['appkernelOnResource']:
                 appkernelOnResource=self.app['appkernelOnResource']['default']

         #later dictionaries override earlier ones
         for di in [self.resource,self.app,appkernelOnResource,dbdefaults,self.resourceParam, self.appParam]:
             batchvars.update(di)

         #get autowalltime limit
         try:
             if 'autoWalltimeLimit' in batchvars and batchvars['autoWalltimeLimit']==True:
                 print("\nautoWalltimeLimit is on, trying to estimate walltime limit...")
                 autoWalltimeLimitOverhead=1.2
                 if 'autoWalltimeLimitOverhead' in batchvars:
                     autoWalltimeLimitOverhead=batchvars['autoWalltimeLimitOverhead']+1.0

                 #query last 20 executions of this appkernel on that resource with that node count
                 db,cur=akrr.getDB(True)
                 try:
                     cur.execute('''SELECT resource,reporter,reporternickname,collected,status,walltime FROM akrr_xdmod_instanceinfo
                         WHERE  `resource`=%s AND `reporternickname` =  %s
                         ORDER BY  `akrr_xdmod_instanceinfo`.`collected` DESC 
                         LIMIT 0 , 20''',(self.resource['name'],"%s.%d"%(self.app['name'],batchvars['nnodes'])))
                     raw=cur.fetchall()
                 finally:
                     cur.close()
                     del db

                 lastFiveRunsSuccessfull=True
                 maxwalltime=0.0
                 for i,r in enumerate(raw):
                     #rows are newest first, so the first five are the last five runs
                     if i<5 and r['status']==0:
                         lastFiveRunsSuccessfull=False
                     if r['status']==1 and r['walltime']>maxwalltime:
                         maxwalltime=r['walltime']
                 nruns=len(raw)
                 if nruns<5:
                     print("There are only %d previous run, need at least 5 for walltime limit autoset"%(nruns,))
                 elif not lastFiveRunsSuccessfull:
                     print("One of last 5 runs have failed. Would not use autoset.")
                 else:
                     #longest successful run plus overhead, rounded up to minutes
                     newWallLimit=int(autoWalltimeLimitOverhead*maxwalltime/60.0+0.99)
                     print("Max walltime was %.1f s, will change walltime limit from %.1f minutes to %d minutes"%(maxwalltime,batchvars['walllimit'],newWallLimit))
                     batchvars['walllimit']=newWallLimit
                 print("")
         except Exception:
             #best effort: when the estimation fails the configured wall limit is kept
             pass

         #calculate NNodes and NCores
         if 'nnodes' in batchvars:
             tmpNNodes=batchvars['nnodes']
             tmpNCores=tmpNNodes*batchvars['ppn']
         else:
             tmpNCores=batchvars['ncores']
             #round up to whole nodes; '//' keeps the division integral on
             #every Python version
             tmpNNodes=tmpNCores//batchvars['ppn']
             if tmpNCores%batchvars['ppn']!=0:
                 tmpNNodes+=1

         batchvars['akrrNCores']=tmpNCores
         batchvars['akrrNNodes']=tmpNNodes

         #Set batchvars remaps
         batchvars['akrrPPN']=batchvars['ppn']
         batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
         batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
         #walllimit is formatted as minutes -> HH:MM:00
         batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(int(batchvars['walllimit'])//60,int(batchvars['walllimit'])%60)
         batchvars['akrrAppKerName']=self.app['name']
         batchvars['akrrResourceName']=self.resource['name']
         batchvars['akrrTimeStamp']= self.timeStamp
         if batchvars['akrrNNodes']==1:
             batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
         else:
             batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']

         if 'nodeListSetterTemplate' not in batchvars:
             batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]

         #process templates
         batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
         batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)

         #specially for IOR request two nodes for single node benchmark, one for read and one for write
         if batchvars['requestTwoNodesForOneNodeAppKer']==True and batchvars['akrrNNodes']==1 and 'batchJobHeaderTemplate' in batchvars:
             #render the header from a copy with doubled node/core counts; the
             #remaining variables keep their single node values
             batchvars2=copy.deepcopy(batchvars)
             batchvars2['akrrNCores']=2*batchvars['akrrNCores']
             batchvars2['akrrNNodes']=2*batchvars['akrrNNodes']
             batchvars2['akrrNCoresToBorder']=2*batchvars['akrrNCoresToBorder']
             batchvars2['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
             batchvars['batchJobHeaderTemplate']=akrr.formatRecursively(batchvars2['batchJobHeaderTemplate'],batchvars2)

         #do parameters adjustment
         if 'process_params' in batchvars:
             batchvars['process_params'](batchvars)

         #generate job script
         jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
         jobScriptFullPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
         with open(jobScriptFullPath,"w") as fout:
             fout.write(jobScript)
     except Exception:
         self.status="ERROR: Can not created batch job script"
         self.statusinfo=traceback.format_exc()
         akrr.printException(self.status)
         #bare raise keeps the original traceback
         raise
Пример #7
0
 def CreateBatchJobScriptAndSubmitIt(self,doNotSubmitToQueue=False):
     """Generate the batch job script, upload it and, for independent tasks, submit it.

     The remote akrrdata, application and task directories are created when
     missing, the script is produced by ``GenerateBatchJobScript`` and copied
     to the remote task directory. A task without ``masterTaskID`` in
     ``self.taskParam`` is submitted to the queue and its job id is written to
     ``job.id``; a subtask is not submitted and gets job id 0. Finally the
     remote task directory is copied back and the submission time is stored
     in ACTIVETASKS.

     :param doNotSubmitToQueue: when true, stop right after the script has
         been uploaded and return None.
     :return: a one minute ``datetime.timedelta`` for an independent task, or
         ``timedelta(days=111*365)`` for a subtask (its master task is
         expected to update the check time). On failure the error is stored
         in ``self.status``/``self.statusinfo`` and
         ``akrr.repeat_after_fails_to_submit_to_the_queue`` is returned, or a
         three second ``timedelta`` once the task is given up and
         ``ToDoNextString`` is set to ``"PushToDB"``.
     """
     self.JobScriptName=self.GetJobScriptName(self.appName)
     print("### Creating batch job script and submitting it to remote machine")
     #as a current bypass will create a job script remotely and copy it here
     #get ssh to remote resource

     sh=None
     try:
         sh=akrr.sshResource(self.resource)
         #Create remote directories if needed
         def CheckAndCreateDir(self,sh,d):
             cmd="if [ ! -d  \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
             akrr.sshCommand(sh,cmd)
             cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
             msg=akrr.sshCommand(sh,cmd)
             if msg.find("DOESNOTEXIST")>=0:
                 raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))
         #akrrdata
         CheckAndCreateDir(self,sh,self.resource['akrrdata'])
         #dir for app
         CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
         #dir for task
         CheckAndCreateDir(self,sh,self.remoteTaskDir)

         #cd to remoteTaskDir
         akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

         #GenerateBatchJobScript
         self.GenerateBatchJobScript()

         akrr.scpToResource(self.resource,os.path.join(self.taskDir,"jobfiles",self.JobScriptName),os.path.join(self.remoteTaskDir))
         if doNotSubmitToQueue:
             #the script is uploaded; close the ssh session instead of leaving
             #it open on this early return
             sh.sendline("exit")
             sh.close(force=True)
             sh=None
             return

         #show the uploaded script in the ssh session output
         akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

         #send to queue
         from string import Template
         #job id 0 is recorded for subtasks, which are not submitted themselves
         JobID=0
         if 'masterTaskID' not in self.taskParam:
             #i.e. submit to queue only if task is independent
             sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
             msg=akrr.sshCommand(sh,sendToQueue)
             matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)

             if matchObj:
                 try:
                     JobID=int(matchObj.group(1))
                 except Exception:
                     raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)
             else:
                 raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)

         akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

         self.RemoteJobID=JobID
         self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

         sh.sendline("exit")
         sh.close(force=True)
         del sh
         #mark the session as closed so the error handler does not touch it
         sh=None
         print("\nRemoteJobID= %s"%(self.RemoteJobID,))
         print("copying files from remote machine")
         akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

         #update DB time_submitted_to_queue
         db,cur=akrr.getDB()
         try:
             cur.execute('''UPDATE ACTIVETASKS
             SET time_submitted_to_queue=%s
             WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
             #commit explicitly so the update is not discarded together with
             #the connection
             db.commit()
         finally:
             cur.close()
             del db

         if 'masterTaskID' not in self.taskParam:
             #i.e. independent task
             self.status="Created batch job script and have submitted it to remote queue."
             self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
             self.ToDoNextString="CheckTheJobOnRemoteMachine"

             #check first time in 1 minute
             return datetime.timedelta(days=0, hours=0, minutes=1)
         else:
             #i.e. this is subtask
             self.status="Created batch job script."
             self.statusinfo="Created batch job script. Waiting for master task to execute it."
             self.ToDoNextString="CheckTheJobOnRemoteMachine"

             #master task will update the time when it will finish task execution 
             return datetime.timedelta(days=111*365)

     except Exception:
         if sh is not None:
             sh.sendline("exit")
             sh.close(force=True)
             del sh
         self.status="ERROR Can not created batch job script and submit it to remote queue"
         self.statusinfo=traceback.format_exc()
         if akrr.max_fails_to_submit_to_the_queue>=0:
             if hasattr(self, "FailsToSubmitToTheQueue"):
                 self.FailsToSubmitToTheQueue+=1
                 #.get(): a task without a 'test_run' entry must not raise a
                 #second error from inside this handler
                 if (self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue or
                         (self.taskParam.get('test_run',False)==True and self.FailsToSubmitToTheQueue>=2)):
                     #Stop execution of the task and submit results to db
                     self.ToDoNextString="PushToDB"
                     resultFile=os.path.join(self.taskDir,"result.xml")
                     self.WriteErrorXML(resultFile)
                     return  datetime.timedelta(seconds=3)
             else:
                 #first failure: start counting
                 self.FailsToSubmitToTheQueue=1
         else:
             self.FatalErrorsCount+=1

         akrr.printException(self.status)
         return akrr.repeat_after_fails_to_submit_to_the_queue
Пример #8
0
 #check if AK is in DB
 #(``if True`` only groups the statements below; they always run)
 if True:
     #add entry to the app_kernel_def table (DB opened with akrr.getAKDB)
     #unless a row with this ak_base_name already exists
     dbAK,curAK=akrr.getAKDB(True)
         
     curAK.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name,))
     ak_in_AKDB = curAK.fetchall()
     if len(ak_in_AKDB)==0:
         curAK.execute('''INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible)
                     VALUES(%s,%s,'node',0,%s,0);''',
                     (app_name,app_name,app_name))
         dbAK.commit()
     #read the row back; its ak_def_id is used as the id of the app_kernels row below
     curAK.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name,))
     ak_in_AKDB = curAK.fetchall()[0]
     #add entry to the app_kernels table (DB opened with akrr.getDB)
     #unless a row with this name already exists
     db,cur=akrr.getDB(True)
         
     cur.execute('''SELECT * FROM app_kernels WHERE name=%s''', (app_name,))
     ak_in_DB = cur.fetchall()
     if len(ak_in_DB)==0:
         cur.execute('''INSERT INTO app_kernels (id,name,enabled,nodes_list)
                     VALUES(%s,%s,0,'1,2,4,8');''',
                     (ak_in_AKDB['ak_def_id'],app_name))
         db.commit()
         
 ###############################################################################################
 #connect to resource
 log("#"*80)
 log("Validating resource accessibility. Connecting to %s."%(resource['name']))
 if resource['sshPrivateKeyFile']!=None and os.path.isfile(resource['sshPrivateKeyFile'])==False:
     logerr("Can not access ssh private key (%s)"""%(resource['sshPrivateKeyFile'],))