def _relayOneFileLog( LogFile, testid, projectid, testerid ):
    """Echo the lines of LogFile to stdout as tagged log records.

    Every line longer than one character is written as
    "\\n" + LOGFILE_PREFIX + testid + "\\1" + projectid + "\\1" + testerid
    + "\\1" + "0" + "\\1" + line + "\\n", while holding StdOutLock.

    This is best effort: a missing or unreadable file is skipped silently,
    any other failure is reported on stderr and does not abort the caller.
    """
    try:
        fin = open( LogFile )
        try:
            Lines = fin.read().split( "\n" )
        finally:
            #-- always give the descriptor back, even if read() fails
            fin.close()
        RecordHead = LOGFILE_PREFIX + str(testid) + "\1" + str(projectid) + "\1" + str(testerid) + "\1" + "0" + "\1"
        for Line in Lines:
            if len( Line ) > 1:
                StdOutLock.acquire()
                try:
                    sys.stdout.write( "\n" + RecordHead + Line + "\n" )
                finally:
                    #-- never leave the shared lock held if the write fails
                    StdOutLock.release()
    except (IOError, OSError):
        #-- no such file / unreadable file: nothing to relay
        pass
    except Exception:
        sys.stderr.write( "[wl-submit.py] Could not relay %s: %s\n" % (LogFile, sys.exc_info()[1]) )


def runWL( OutputDir, XMLhandler, NPoolThreads, NoSubmit=0, NoBackground=0, OneFile=0, testid=0, projectid = "default", testerid=0, timediff=0.0 ):
    """Build one work item per application of the workload and push them all
    through a thread pool running submitJob.

    Parameters:
      OutputDir    -- directory for the per-job (or shared) stdout/stderr
                      files; created if it does not exist yet.
      XMLhandler   -- object whose getDictionaryOfApplications() returns the
                      applications; each one is read for 'id', 'runTime' and
                      'submitCommand'.
      NPoolThreads -- number of worker threads given to the thread pool.
      NoSubmit     -- stored in every work item under 'NoSubmit'
                      (presumably "only generate commands" -- confirm in
                      submitJob).
      NoBackground -- accepted for compatibility; it currently has no effect
                      on the generated command line.
      OneFile      -- 0: each job redirects to <id>.out / <id>.err;
                      otherwise all jobs append to onefile.out / onefile.err,
                      which are relayed to stdout once the pool is drained.
      testid, projectid, testerid, timediff
                   -- stored in every work item; the first three also tag the
                      relayed 'onefile' records.

    Returns the tuple
      (StartSubmissionTime, EndSubmissionTime, NTotalJobs, NTotalJobsInQueue).
    """
    DictionaryOfApps = XMLhandler.getDictionaryOfApplications()
    ListOfApps = AIStorageUtils.dict_sortbyvalue_dict(
        DictionaryOfApps, 'runTime',
        AIStorageUtils.SORT_TYPE_FLOAT, AIStorageUtils.SORT_ASCENDING )

    for (JobId, App) in ListOfApps:
        print( "Found  %s due to start at %s" % (App['id'], App['runTime']) )
    NTotalJobs = len( ListOfApps )

    #-- shift all run times so that the earliest job starts at time 0;
    #   an empty workload simply has nothing to shift
    startTime = 0.0
    if ListOfApps:
        startTime = float( ListOfApps[0][1]['runTime'] )
        if startTime < 0.0:
            startTime = 0.0
    for (JobId, App) in ListOfApps:
        App['runTime'] = float( App['runTime'] ) - startTime
        if App['runTime'] < 0.0:
            App['runTime'] = 0.0

    #-- generate all work units
    try:
        os.mkdir( OutputDir )
    except OSError:
        #-- typically "directory already exists"; any real problem shows up
        #   later when the jobs try to write their output files
        pass

    FirstSubmission = time.time()
    CommandLinesList = []
    for (JobId, App) in ListOfApps:
        if OneFile == 0:
            StdOutFile = os.path.join( OutputDir, "%s.out" % JobId )
            StdErrFile = os.path.join( OutputDir, "%s.err" % JobId )
            ActualCommand = "%s 1> %s 2> %s" % ( App['submitCommand'], StdOutFile, StdErrFile )
        else:
            StdOutFile = os.path.join( OutputDir, "onefile.out" )
            StdErrFile = os.path.join( OutputDir, "onefile.err" )
            ActualCommand = "%s 1>> %s 2>> %s" % ( App['submitCommand'], StdOutFile, StdErrFile )

        CommandLineItem = {}
        CommandLineItem['id'] = JobId
        CommandLineItem['firstSubmission'] = FirstSubmission
        #-- runTime is divided by 1000 (presumably milliseconds -> seconds)
        CommandLineItem['startTime'] = float( App['runTime']/1000.0 )
        #-- the command is never suffixed with ' &', whatever NoBackground is
        CommandLineItem['commandLine'] = ActualCommand
        CommandLineItem['stdout'] = StdOutFile
        CommandLineItem['stderr'] = StdErrFile
        CommandLineItem['onefile'] = OneFile
        #-- amod v.0.12: just generate commands
        CommandLineItem['NoSubmit'] = NoSubmit
        CommandLineItem['testid'] = testid
        CommandLineItem['projectid'] = projectid
        CommandLineItem['testerid'] = testerid
        CommandLineItem['timediff'] = timediff
        CommandLinesList.append( CommandLineItem )

    #-- build a WorkRequest object for each work unit
    requests = ASPNThreadPool.makeRequests( submitJob, CommandLinesList, printSubmitJobResults )

    #-- create a pool of NPoolThreads worker threads
    StdOutLock.acquire()
    try:
        print( "[wl-submit.py] Starting a thread pool with %s threads" % (NPoolThreads,) )
    finally:
        StdOutLock.release()
    submitThreadPool = ASPNThreadPool.ThreadPool( NPoolThreads, StdOutLock )
    StartSubmissionTime = time.time()

    #-- add all work units into the thread pool
    # NOTE: We expect the thread pool to be based on Queues,
    #       because our applications need to be run at specified times
    #       and the submit job waits until the current work unit is done
    #       -> if we are NOT using Queues, it may happen that a work unit
    #          that needs to be submitted at time T will get submitted much
    #          later, due to other jobs starting the submission before it,
    #          but waiting for their later start time
    for req in requests:
        submitThreadPool.putRequest( req )
        StdOutLock.acquire()
        try:
            print( "[Pool] Work request #%s added (id=%s, start time=%.3f)." %
                   (req.requestID, req.args[0]['id'], req.args[0]['startTime']) )
        finally:
            StdOutLock.release()

    #-- wait for all submissions to be completed
    submitThreadPool.wait()
    while 1:
        try:
            submitThreadPool.poll()
            time.sleep( 0.5 )
        except (KeyboardInterrupt, ASPNThreadPool.NoResultsPending):
            break
    EndSubmissionTime = time.time()
    NTotalJobsInQueue = len( submitThreadPool.workRequests )

    # should send to the database the 'onefile.out' and 'onefile.err' (not tested)
    if OneFile != 0:
        _relayOneFileLog( os.path.join( OutputDir, "onefile.out" ), testid, projectid, testerid )
        _relayOneFileLog( os.path.join( OutputDir, "onefile.err" ), testid, projectid, testerid )

    return StartSubmissionTime, EndSubmissionTime, NTotalJobs, NTotalJobsInQueue
def _submitRunnableRequests(Pool, PendingRequests, CompositeData):
    """Move every request whose job is currently runnable into the pool.

    A request is submitted when CompositeData.isRunnable(<job id>) is true;
    it is then removed from PendingRequests (modified in place) so that it
    is not submitted twice.
    """
    # iterate over a copy because PendingRequests shrinks while we go
    for req in PendingRequests[:]:
        if CompositeData.isRunnable(req.args[0]["id"]):
            Pool.putRequest(req)
            PendingRequests.remove(req)
            print(
                "[Pool] Work request #%s added (id=%s, start time=%.3f)."
                % (req.requestID, req.args[0]["id"], req.args[0]["startTime"])
            )


def runWL(OutputDir, XMLhandler, NPoolThreads, NoSubmit=0, Background=0, OneFile=0):
    """Run a workload whose jobs may depend on each other through a thread pool.

    Each application is registered with a CompositeApplicationData instance;
    a job's work request is handed to the pool (running runJob) only once
    that instance reports the job as runnable.

    Parameters:
      OutputDir    -- directory for the jobs' stdout/stderr files; created if
                      it does not exist yet.
      XMLhandler   -- object whose getDictionaryOfApplications() returns the
                      applications; each one is read for 'runTime',
                      'submitCommand', 'jdf' and (optionally) 'dependsOn'.
      NPoolThreads -- number of worker threads given to the thread pool.
      NoSubmit     -- stored in every work item under 'NoSubmit'
                      (presumably "only generate commands" -- confirm in runJob).
      Background   -- 1 appends " &" to each generated command line.
      OneFile      -- 0: each job redirects to <id>.out / <id>.err; otherwise
                      all jobs append to onefile.out / onefile.err.

    Returns the tuple
      (StartSubmissionTime, EndSubmissionTime, NTotalJobs, NTotalJobsInQueue).
    """
    # --- get applications
    DictionaryOfApps = XMLhandler.getDictionaryOfApplications()

    # --- create composite structure manager
    TheCompositeApplicationData = CompositeApplicationData()

    # -- sort jobs
    ListOfApps = AIStorageUtils.dict_sortbyvalue_dict(
        DictionaryOfApps, "runTime", AIStorageUtils.SORT_TYPE_FLOAT, AIStorageUtils.SORT_ASCENDING
    )
    NTotalJobs = len(ListOfApps)
    print("Found %s apps. Sorting...done." % (NTotalJobs,))

    # Modification - C.S.: make all the tasks have the start time 0
    # -- reset start times and add all applications to the composite structure manager
    for (JobId, App) in ListOfApps:
        App["runTime"] = 0
        print("ID %s starts in %.3fs." % (JobId, float(App["runTime"] / 1000.0)))
        # add the 'dependsOn' key if missing
        if "dependsOn" not in App:
            App["dependsOn"] = []
        TheCompositeApplicationData.addJob(JobId, App)

    # -- create all 'enables' relations
    TheCompositeApplicationData.buildEnablesRelations()

    # -- mark all the starting jobs as 'can run'
    for JobId in TheCompositeApplicationData.JobsWithDeps:
        TheCompositeApplicationData.triggerCanRunCheck(JobId)

    # --- generate all work units
    try:
        os.mkdir(OutputDir)
    except OSError:
        # typically "directory already exists"; any real problem shows up
        # later when the jobs try to write their output files
        pass

    # --- build a WorkRequest object for each work unit
    FirstSubmission = time.time()
    CommandLinesList = []
    for (JobId, App) in ListOfApps:
        if OneFile == 0:
            StdOutFile = os.path.join(OutputDir, "%s.out" % JobId)
            StdErrFile = os.path.join(OutputDir, "%s.err" % JobId)
            ActualCommand = "%s 1> %s 2> %s" % (App["submitCommand"], StdOutFile, StdErrFile)
        else:
            StdOutFile = os.path.join(OutputDir, "onefile.out")
            StdErrFile = os.path.join(OutputDir, "onefile.err")
            ActualCommand = "%s 1>> %s 2>> %s" % (App["submitCommand"], StdOutFile, StdErrFile)
        if Background == 1:
            ActualCommand = ActualCommand + " &"

        CommandLineItem = {}
        CommandLineItem[".CompositeApplicationData"] = TheCompositeApplicationData
        CommandLineItem["id"] = JobId
        CommandLineItem["firstSubmission"] = FirstSubmission
        CommandLineItem["startTime"] = float(App["runTime"] / 1000.0)
        CommandLineItem["commandLine"] = ActualCommand
        # -- amod v.0.12: just generate commands
        CommandLineItem["NoSubmit"] = NoSubmit

        # -- append item
        if os.path.exists(App["jdf"]):
            CommandLinesList.append(CommandLineItem)
        else:
            # NOTE(review): a job skipped here was still registered through
            # addJob() above; if isCompositeApplicationFinished() waits for
            # it, the polling loop below may never terminate -- confirm.
            print("Could not locate JDF %s ... skipping job" % (App["jdf"],))

    requests = ASPNThreadPool.makeRequests(runJob, CommandLinesList, printJobResults)

    # --- create a pool of NPoolThreads worker threads
    print("[wl-exec-dagman.py] Starting a thread pool with %s threads" % (NPoolThreads,))
    submitThreadPool = ASPNThreadPool.ThreadPool(NPoolThreads, StdOutLock)
    StartSubmissionTime = time.time()

    # --- add all work units into the thread pool
    # NOTE: We expect the thread pool to be based on Queues,
    #       because our applications need to be run at specified times
    #       and the submit job waits until the current work unit is done
    #       -> if we are NOT using Queues, it may happen that a work unit
    #          that needs to be submitted at time T will get submitted much
    #          later, due to other jobs starting the submission before it,
    #          but waiting for their later start time
    # Modification - corina: the requests are put in the thread pool only when
    #       their dependencies are satisfied
    _submitRunnableRequests(submitThreadPool, requests, TheCompositeApplicationData)

    # --- wait for all submissions to be completed
    # Defined up front so an interrupt during the very first poll() still
    # leaves a value to return.
    EndSubmissionTime = StartSubmissionTime
    while 1:
        try:
            try:
                submitThreadPool.poll()
                EndSubmissionTime = time.time()
                time.sleep(0.5)
            except ASPNThreadPool.NoResultsPending:
                # -- check that all jobs have actually finished or failed
                if TheCompositeApplicationData.isCompositeApplicationFinished():
                    EndSubmissionTime = time.time()
                    break
                # see if we have some more runnable jobs and add them to the pool
                _submitRunnableRequests(submitThreadPool, requests, TheCompositeApplicationData)
                print("[wl-exec-dagman] Got ASPNThreadPool.NoResultsPending")
                print(
                    " All: %s Done: %s Failed: %s"
                    % (
                        TheCompositeApplicationData.TotalJobs,
                        TheCompositeApplicationData.TotalSuccessful,
                        TheCompositeApplicationData.TotalFailed,
                    )
                )
                time.sleep(2)
        except KeyboardInterrupt:
            # the outer try also covers the back-off sleep above, so Ctrl-C
            # leaves the loop cleanly from either wait
            break
        except Exception:
            # format_exc() returns the traceback text (print_exc() returns
            # None); show it, then let the original error propagate
            print(">>>" + traceback.format_exc())
            raise

    NTotalJobsInQueue = len(submitThreadPool.workRequests)
    return StartSubmissionTime, EndSubmissionTime, NTotalJobs, NTotalJobsInQueue