def WriteSimulationInfo(Ddata, mutAges, mutPops, mutFreqs, nreplicas, suffix,
                        inputParamsFiles, pop2name=pop2name, powerSfx='', getio=None):
    """Write out info files describing what is in the simulation.  These files
    are used by sim_analysis_pipe.pl to define the simulation analysis pipeline.
    """

    stdDir = '../Data/Shari_Data/sim/stdSimAnalConfig'
    configFiles = dict([(stdDir + '/' + test + '_config.txt',
                         Str("$Ddata/power_$test$suffix/config$powerSfx.txt"))
                        for test in ('lrh', 'ihs', 'xpop')])

    cfgDir = Ddata + '/config' + suffix

    if getio:
        return dict(depends_on=list(configFiles.keys()) + list(inputParamsFiles),
                    creates=[cfgDir + '/' + f + powerSfx + '.txt'
                             for f in ('scenarios', 'sims', 'pops')] + list(configFiles.values()))

    neutralParams = reduce(concat, list(map(SlurpFile, inputParamsFiles)), '')

    # Check that the list of pops defined in the param file matches the list of pops we are analyzing
    assert sorted([int(s.split()[1]) for s in neutralParams.split('\n')
                   if s.startswith('pop_define')]) == sorted(mutPops)
    assert set(map(int, list(pop2name.keys()))) == set(mutPops)

    DumpFile(cfgDir + '/scenarios%s.txt' % powerSfx,
             '\n'.join(scen.scenDir() for scen in GetScenarios(mutAges, mutPops, mutFreqs)))
    DumpFile(cfgDir + '/sims%s.txt' % powerSfx, '%d\n%d' % (0, nreplicas - 1))
    DumpFile(cfgDir + '/pops%s.txt' % powerSfx,
             '\n'.join('%s\t%d' % (popName, popNum)
                       for popNum, popName in list(pop2name.items())))

    for fromFile, toFile in list(configFiles.items()):
        copyfile(fromFile, toFile)
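# Usage sketch (illustrative only -- the data directory, ages, populations and
# frequencies below are hypothetical, and the call assumes the module-level
# pop2name maps exactly the population numbers being analyzed):
#
#   WriteSimulationInfo(Ddata='../Data/sim_demo', mutAges=(10, 50),
#                       mutPops=(1, 4, 5), mutFreqs=(0.2, 0.8),
#                       nreplicas=100, suffix='_demo',
#                       inputParamsFiles=['params_neutral_demo.txt'])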
def doSplit(splitFunc, splitFN, outDir, getio=None):
    """Split the file splitFN into chunks using splitFunc, and write the list
    of resulting chunk filenames to chunks.txt in outDir."""

    chunkListFN = os.path.join(outDir, 'chunks.txt')
    if getio:
        return dict(depends_on=splitFN, creates=chunkListFN)

    chunkFNs = splitFunc(splitFN, outDir=outDir)
    DumpFile(chunkListFN, '\n'.join(chunkFNs))
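# Usage sketch (hypothetical splitter): any callable taking (splitFN, outDir)
# and returning the list of chunk filenames it wrote will work here, e.g.:
#
#   def splitByLines(splitFN, outDir, linesPerChunk=10000):
#       ...write chunk files under outDir...
#       return chunkFileNames
#
#   doSplit(splitFunc=splitByLines, splitFN='big_table.tsv', outDir='chunks')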
def CreateSimsParams_neutral(Ddata, suffix, inputParamsFiles, getio=None):
    """Write the neutral parameter file."""

    inputParamsFiles = MakeSeq(inputParamsFiles)
    neutralParamsFile = Ddata + '/params_neutral' + suffix
    if getio:
        return dict(depends_on=inputParamsFiles, creates=neutralParamsFile)

    neutralParams = reduce(concat, map(SlurpFile, inputParamsFiles))
    DumpFile(neutralParamsFile, neutralParams)
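# Note: reduce(concat, ...) above concatenates the slurped file contents in
# order, i.e. it is equivalent to ''.join(map(SlurpFile, inputParamsFiles));
# under Python 3 this relies on functools.reduce and operator.concat being
# imported at module level.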
def checkTableKey(inFN, cols, comparison='lt', writeCheckedFile=True,
                  tsvOpts={}, lineFilter=None, lineFilterCols=(), getio=None):
    """Check that in the given table, record identifiers increase uniformly.

    Params:

       cols - the columns whose tuple should uniformly increase
       comparison - this comparison must be true between each record and the
          next; the comparison is the name of a routine in the operator module.
    """

    cols = tuple(MakeSeq(cols))
    lineFilterCols = tuple(MakeSeq(lineFilterCols))
    checkedFN = Str('$inFN.checked_${comparison}') + Sfx(*cols)
    if getio:
        return dict(depends_on=inFN,
                    creates=checkedFN if writeCheckedFile else (),
                    attrs=dict(piperun_short=True))

    comparisonFunc = getattr(operator, comparison)
    prevRec = None

    loadCols = cols + lineFilterCols

    nskipped = 0
    nchecked = 0
    for i, r in enumerate(IDotData(inFN, ToLoad=loadCols, **tsvOpts)):
        if lineFilter and not lineFilter(r):
            nskipped += 1
            continue

        thisRec = r[cols] if IsSeq(r) else (r,)
        # Compare against the previous unfiltered record, if any.  (Checking
        # 'prevRec is not None' rather than 'i > 0' avoids a spurious failure
        # when the first record is skipped by lineFilter.)
        if prevRec is not None and not comparisonFunc(prevRec, thisRec):
            logging.error(Str('at line $i of $inFN, looking at $cols: '
                              '$prevRec is not $comparison $thisRec'))
            assert False
        else:
            nchecked += 1
        prevRec = thisRec

    dbg('nchecked nskipped')
    DumpFile(checkedFN, 'checked ok.')
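# Usage sketch (file and column names are illustrative): check that the
# (Chrom, Pos) key strictly increases row over row (operator.lt), or that a
# single Pos column never decreases (operator.le):
#
#   checkTableKey('snp_scores.tsv', cols=('Chrom', 'Pos'), comparison='lt')
#   checkTableKey('snp_scores.tsv', cols='Pos', comparison='le')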
def RunTasks( options ):
    """Take tasks from the specified queue directory, and run them.

    Params:

       options - see command-line parameter definition in main() below.
    """

    if haveParamiko:
        Random.atfork()

    startClock = time.time()

    logging.info( 'Starting runner (process id %d on host %s) with options %s' %
                  ( os.getpid(), GetHostName(), options ) )

    stopSignal = [ False ]

    def SetStopSignal( sigNum, stkFrm ):
        logging.info( 'Setting stop signal to stop runners' )
        stopSignal[ 0 ] = True
        dbg( '"aftset" stopSignal' )

    signal.signal( signal.SIGUSR1, SetStopSignal )

    fs = RemoteFileSystem( remote = options.remote, pw = options.password, pkey = options.pkey ) \
        if options.remote else LocalFileSystem()

    # check that all queues exist
    assert all( fs.exists( queue ) and fs.isdir( queue )
                for queue in options.queues.split( fs.pathsep ) )

    # Register a cleanup routine so that, if we claim a task and then
    # crash midway through it, our lock on the task is erased, so that
    # another runner can pick up the task.  Note that if the _task_ fails
    # with an error code, that's fine -- we just report the error code
    # to the mqsub.sh script instance that submitted the task.
    # The cleanup here happens only if the runner crashes before receiving
    # a proper exit code from the task.
    fileToErase = [ None ]

    @atexit.register
    def DoErase( eraseWhat = fileToErase ):
        if eraseWhat[0] and fs.exists( eraseWhat[0] ):
            fs.remove( eraseWhat[0] )

    # var: lastFinish - time when a task last finished.
    lastFinish = time.time()

    queues = options.queues.split( fs.pathsep )
    lastQueueModTime = [ None ] * len( queues )
    skipDirs = set( [ 'newtask.dat' ] )

    numTasksRun = 0

    numProcsAvail = int( os.getenv( 'LSB_DJOB_NUMPROC', 1 ) )
    dbg( 'numProcsAvail' )

    for queue in queues:
        EnsureDirExists( os.path.join( queue, 'succ' ) )
        EnsureDirExists( os.path.join( queue, 'fail' ) )

    while not stopSignal[0]:

        ranCommand = False

        if options.maxRunHours > 0 and ( time.time() - startClock ) / 3600.0 > options.maxRunHours:
            logging.info( 'Runner exiting after CPU time of %s hours' %
                          ( ( time.time() - startClock ) / 3600.0 ) )
            return

        if stopSignal[ 0 ]:
            logging.info( 'Runner stopped by stop signal' )
            return
        else:
            dbg( '"chkstop" stopSignal' )

        dbg( 'queues' )
        for queueNum, queue in enumerate( queues ):
            dbg( 'queueNum queue' )

            # do a quick check to see if any tasks have been added to the queue
            # since we last checked
            newTaskFN = os.path.join( queue, 'newtask.dat' )
            try:
                curQueueModTime = fs.stat( newTaskFN ).st_mtime
                if curQueueModTime == lastQueueModTime[ queueNum ]:
                    continue
                lastQueueModTime[ queueNum ] = curQueueModTime
            except EnvironmentError as e:
                if os.path.exists( newTaskFN ):
                    logging.warning( 'ERROR CHECKING FOR NEW TASKS in queue %s: %s' % ( queue, e ) )

            # find an unclaimed task in this queue, and try to claim it
            taskDirs = sorted( fs.listdir( queue ) )
            dbg( 'len(taskDirs)' )
            #random.shuffle( taskDirs )

            dbg( 'os.environ.get("MQ_FIRST_DIR")' )
            if 'MQ_FIRST_DIR' in os.environ and os.environ[ 'MQ_FIRST_DIR' ] in taskDirs:
                taskDirs = [ os.environ[ 'MQ_FIRST_DIR' ] ] + taskDirs
                logging.info( 'putting specified dir first' )

            for taskDir in taskDirs:
                if taskDir in skipDirs:
                    continue
                if options.maxRunHours > 0 and ( ( time.time() - startClock ) / 3600.0 ) > options.maxRunHours:
                    logging.info( 'Runner exiting after CPU time of %s hours' %
                                  ( ( time.time() - startClock ) / 3600.0 ) )
                    return
                if stopSignal[ 0 ]:
                    logging.info( 'Runner stopped by stop signal' )
                    return
                else:
                    dbg( '"chkstop" stopSignal' )

                try:
                    # if the queue is administratively paused, wait until the
                    # noclaim marker disappears before claiming anything
                    while fs.path.exists( os.path.join( queue, 'noclaim.dat' ) ):
                        time.sleep( 60 + random.normalvariate( 10.0, 5.0 ) )

                    fullTaskDir = fs.path.join( queue, taskDir )
                    claimedFN = fs.path.join( fullTaskDir, options.claimedFN )
                    attrsFN = fs.path.join( fullTaskDir, 'attrs.tsv' )
                    cwdFN = fs.path.join( fullTaskDir, 'submitdir.txt' )

                    failedCond = []
                    def saveVal( name, val, fc = failedCond ):
                        if not val:
                            fc.append( name )
                        return val

                    if saveVal( 'ready', fs.path.exists( fs.path.join( fullTaskDir, options.readyFN ) ) ) \
                       and saveVal( 'not claimed', not fs.path.exists( claimedFN ) ) \
                       and saveVal( 'relocatable',
                                    ( not options.remote or
                                      all( [ not f.startswith( '/' )
                                             for which in ( 'sources', 'targets' )
                                             for f in fs.SlurpFile( fs.path.join( fullTaskDir, which + '.lst' ) ).rstrip( '\n' ).split( '\n' ) ] ) ) ) \
                       and saveVal( 'memOk', GetMemReq( fs, attrsFN ) <= options.maxMem ) \
                       and saveVal( 'minMemOk', options.minMem == 0 or GetMemReq( fs, attrsFN ) >= options.minMem ) \
                       and saveVal( 'minProc', GetProcReq( fs, attrsFN ) >= options.minProc ) \
                       and saveVal( 'maxProc', GetProcReq( fs, attrsFN ) <= numProcsAvail ) \
                       and saveVal( 'local', ( options.local_tasks or
                                               not GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                       and saveVal( 'onlyLocal', ( not options.only_local_tasks or
                                                   GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                       and saveVal( 'short', ( not options.runOnlyShort or
                                               GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) ) \
                       and saveVal( 'long', ( not options.runOnlyLong or
                                              not GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) ) \
                       and saveVal( 'notRequeued', ( not options.noRequeuedTasks or
                                                     not fs.path.exists( fs.path.join( fullTaskDir, 'requeued.dat' ) ) ) ) \
                       and saveVal( 'notFromHost', ( not options.onlyFromHost or
                                                     socket.getfqdn() == options.onlyFromHost ) ) \
                       and saveVal( 'notFromPipeline', ( not options.onlyFromPipelineId or
                                                         GetTaskAttr( fs, attrsFN, 'piperun_pipelineId' ) == options.onlyFromPipelineId ) ):

                        # try to claim the task: O_CREAT|O_EXCL makes creation of the
                        # claim file atomic, so exactly one runner wins the race
                        try:
                            fd = fs.open( claimedFN, os.O_CREAT | os.O_EXCL | os.O_WRONLY )
                        except EnvironmentError:
                            # another runner beat us to this task -- go and check other tasks
                            logging.info( 'another job beat us to claiming ' + fullTaskDir )
                            continue

                        try:
                            fs.write( fd, 'locked by process %d on host %s\n' % ( os.getpid(), GetHostName() ) )
                            for v in list(os.environ.keys()):
                                fs.write( fd, '%s=%s\n' % ( v, os.environ[v] ) )
                        finally:
                            fs.close( fd )

                        # Tell our cleanup code to release this task if we crash.
                        fileToErase[0] = claimedFN

                        # get the command to run the task
                        theCMD = fs.SlurpFile( os.path.join( fullTaskDir, 'command.dat' ) ).strip()
                        theCmdDir = fs.SlurpFile( os.path.join( fullTaskDir, 'submitdir.txt' ) ).strip()
                        theCmdEnvFN = os.path.join( fullTaskDir, 'submitenv.txt' )

                        if options.remote:
                            assert have_fcntl
                            SystemSucceed( 'mkdir -p ' + os.path.join( options.localDataDir, fs.root[1:] ) )
                            for needDir in 'Operations', 'Classes', 'System', 'Other':
                                needDirFull = os.path.join( options.localDataDir, fs.root[1:], '..', needDir )
                                if not os.path.exists( needDirFull ):
                                    os.symlink( os.path.realpath( os.path.join( '..', needDir ) ), needDirFull )

                            # copy source files, taking exclusive locks on them first
                            srcFiles = sorted( set( fs.SlurpFile( os.path.join( fs.root, fullTaskDir,
                                                                                'sources.lst' ) ).rstrip( '\n' ).split( '\n' ) ) )
                            srcLockIds = []
                            srcLockFiles = []
                            for srcFile in srcFiles:
                                lockFile = os.path.join( options.localDataDir, 'mqlocks', srcFile[1:] )
                                if lockFile.endswith( '/' ):
                                    lockFile = lockFile[:-1]
                                lockFile += '.lock'
                                SystemSucceed( 'mkdir -p ' + os.path.dirname( lockFile ) )
                                gotLock = False
                                while not gotLock:
                                    try:
                                        openMode = os.O_CREAT | os.O_EXCL | os.O_WRONLY
                                        logging.info( 'opening ' + lockFile + ' with mode ' + str( openMode ) )
                                        lockId = os.open( lockFile, openMode )
                                        gotLock = True
                                    except EnvironmentError:
                                        logging.info( 'Could not create ' + lockFile + ' , waiting...' )
                                        time.sleep( 10 + random.normalvariate( 3.0, 1.0 ) )
                                fcntl.lockf( lockId, fcntl.LOCK_EX )
                                srcLockIds.append( lockId )
                                srcLockFiles.append( lockFile )
                                logging.info( 'Got lock on ' + lockFile )

                            SystemSucceed( 'rsync -zprv --files-from=:' +
                                           os.path.join( fs.root, fullTaskDir, 'sources.lst' ) + ' ' +
                                           fs.username + '@' + fs.hostname + ':/ ' + options.localDataDir )

                            # release the source locks in reverse acquisition order
                            # (list() is needed: zip() returns an iterator under Python 3)
                            for srcLockId, srcLockFile in list( zip( srcLockIds, srcLockFiles ) )[::-1]:
                                fcntl.lockf( srcLockId, fcntl.LOCK_UN )
                                os.close( srcLockId )
                                SystemSucceed( 'rm -rf ' + srcLockFile )

                            targets = fs.SlurpFile( os.path.join( fs.root, fullTaskDir,
                                                                  'targets.lst' ) ).rstrip( '\n' ).split( '\n' )
                            targetDirs = set( map( os.path.dirname,
                                                   [ _f for _f in map( str.strip, targets ) if _f ] ) )
                            dbg( '"DDDDDD" targetDirs' )
                            for targetDir in targetDirs:
                                assert targetDir.startswith( '/' )
                                tdir = os.path.join( options.localDataDir, targetDir[1:] )
                                SystemSucceed( 'mkdir -p ' + tdir + ' ' + os.path.join( tdir, 'makeinfo' ) )

                            theCMD = 'cd ' + os.path.join( options.localDataDir, fs.root[1:] ) + ' && ' + theCMD

                        logging.info( 'Under ' + claimedFN + ' RUNNING: ' + theCMD )

                        # Actually run the task; get its exit code
                        save_cwd = os.getcwd()
                        try:
                            os.chdir( theCmdDir )
                            logging.info( 'CWD=' + os.getcwd() )
                            runScriptFN = os.path.join( fullTaskDir, 'run.sh' )
                            with open( runScriptFN, 'w' ) as out:
                                out.write( '#!/usr/bin/env bash\n' )
                                out.write( 'set -e -o pipefail\n' )
                                # re-export the submitting shell's environment, skipping
                                # scheduler- and terminal-specific variables
                                with open( theCmdEnvFN ) as envFile:
                                    for line in envFile:
                                        if '=' not in line or line.startswith( 'module=' ):
                                            break
                                        equalIdx = line.index( '=' )
                                        envVarName = line[ :equalIdx ]
                                        if not ( re.search( r'\W', envVarName ) or
                                                 envVarName.startswith( 'LSB_' ) or
                                                 envVarName.startswith( 'LSF_' ) or
                                                 envVarName.startswith( 'LS_' ) or
                                                 envVarName.startswith( 'SLURM' ) or
                                                 envVarName in
                                                 ( 'SYS_TYPE', 'MACHTYPE', 'VENDOR', 'OSTYPE',
                                                   'DOMAINNAME', 'HOSTTYPE', 'SHORTHOST', 'SSH_TTY',
                                                   'HOST', 'HOSTNAME', 'REMOTEHOST', 'STY' ) ):
                                            out.write( 'export ' + envVarName + "='" +
                                                       line[ equalIdx+1 : -1 ] + "'\n" )
                                out.write( theCMD )
                            os.chmod( runScriptFN, stat.S_IXUSR | stat.S_IRWXU )
                            try:
                                exitCode = os.system( runScriptFN )
                            except ( KeyboardInterrupt, SystemExit ):
                                interruptedFN = os.path.join( fullTaskDir, 'interrupted.dat' )
                                DumpFile( interruptedFN, 'interrupted' )
                                raise
                        finally:
                            os.chdir( save_cwd )

                        logging.info( 'Under ' + claimedFN + ' FINISHED RUNNING: ' + theCMD )
                        logging.info( 'Got exit code %d' % exitCode )

                        if options.remote:
                            # copy the target files and the output log back to the correct
                            # dirs on the remote system.  First, make sure the files all
                            # exist and are no longer being written to.
                            time.sleep( options.aftTaskDelay )
                            os.system( 'rsync -zprv --files-from=:' +
                                       os.path.join( fs.root, fullTaskDir, 'targets.lst' ) + ' ' +
                                       options.localDataDir + ' ' + fs.username + '@' + fs.hostname + ':/' )

                        # If we succeeded in running the task (whether the task itself
                        # failed or not), tell the cleanup code to NOT release this
                        # task if we crash.
                        fileToErase[0] = None

                        # Tell the task submitter script that we are done, and what the
                        # task's exit code was.
                        if os.path.exists( os.path.join( fullTaskDir, 'nmq.dat' ) ):
                            time.sleep( 3 )
                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ),
                                          os.O_CREAT | os.O_EXCL | os.O_WRONLY )
                            fs.close( fd )
                            try:
                                shutil.move( fullTaskDir,
                                             os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) )
                            except EnvironmentError as e:
                                logging.warning( 'Error moving ' + fullTaskDir + ' to ' +
                                                 os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) +
                                                 ' : ' + str( e ) )
                        else:
                            exitCodeFN = os.path.join( fullTaskDir, 'exitCode.dat' )
                            fd = fs.open( exitCodeFN, os.O_CREAT | os.O_EXCL | os.O_WRONLY )
                            bytesWritten = fs.write( fd, str( exitCode ) )
                            fs.close( fd )
                            time.sleep( 3 )
                            logging.info( 'Wrote exit code %s to file %s (%s bytes)' %
                                          ( exitCode, exitCodeFN, bytesWritten ) )
                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ),
                                          os.O_CREAT | os.O_EXCL | os.O_WRONLY )
                            fs.close( fd )

                        # Record that we actually ran a task here.
                        ranCommand = True
                        lastFinish = time.time()
                        numTasksRun += 1

                    else:
                        logging.info( 'did not take task ' + taskDir + ' ; reason: ' + str( failedCond ) )

                except:
                    excInfo = sys.exc_info()
                    logging.warning( 'Error trying to grab task from ' + taskDir +
                                     ' (%s), skipping...' % str( excInfo ) )
                    traceback.print_exc()

        dbg( 'ranCommand lastFinish time.time()-lastFinish' )
        if not ranCommand:
            waitTimeHere = time.time() - lastFinish
            if ( numTasksRun > 0 and options.maxWaitTime > 0 and waitTimeHere > options.maxWaitTime ) \
               or ( numTasksRun == 0 and options.maxFirstWaitTime > 0 and waitTimeHere > options.maxFirstWaitTime ):
                logging.info( 'Runner exiting after idle time of %s' % waitTimeHere )
                return
            time.sleep( options.taskCheckInterval + random.normalvariate( 3.0, 1.0 ) )
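# Usage sketch: RunTasks() is normally invoked from main() with parsed
# command-line options; a direct call would need an options object exposing
# the attributes read above.  The field names below mirror those reads, but
# the values are hypothetical, not the defaults defined in main():
#
#   from argparse import Namespace
#   RunTasks(Namespace(
#       queues='/path/to/queue', remote=None, password=None, pkey=None,
#       claimedFN='claimed.dat', readyFN='ready.dat',
#       maxMem=8, minMem=0, minProc=1,
#       local_tasks=False, only_local_tasks=False,
#       runOnlyShort=False, runOnlyLong=False, noRequeuedTasks=False,
#       onlyFromHost=None, onlyFromPipelineId=None,
#       localDataDir='/tmp/mqdata', aftTaskDelay=10, maxRunHours=0,
#       maxWaitTime=600, maxFirstWaitTime=3600, taskCheckInterval=30))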