def terminateNcsScInstances(authToken, instanceIds):
    '''try to terminate instances; return list of instances terminated (empty if none confirmed)

    On failure, the instance ids are appended to badTerminations.log (best-effort)
    so they can be cleaned up manually later.
    '''
    terminationLogFilePath = os.path.join(dataDirPath, 'badTerminations.log')  # using global dataDirPath
    dateTimeStr = datetime.datetime.now(datetime.timezone.utc).isoformat()
    try:
        ncs.terminateInstances(authToken, instanceIds)
        logger.info('terminateInstances returned')
    except Exception as exc:
        logger.warning('got exception terminating %d instances (%s) %s',
                       len(instanceIds), type(exc), exc)
        # best-effort: record the unconfirmed terminations; a failure here must not raise
        try:
            with open(terminationLogFilePath, 'a') as logFile:
                for iid in instanceIds:
                    print(dateTimeStr, iid, sep=',', file=logFile)
        except Exception as logExc:
            # was a bare "except:" that logged the OUTER exception's type;
            # now narrows the catch and reports the actual file-write error
            logger.warning('got exception (%s) appending to terminationLogFile %s',
                           type(logExc), terminationLogFilePath)
        return []  # empty list meaning none may have been terminated
    else:
        return instanceIds
def recruitAndRender():
    '''a threadproc to recruit an instance, render frames on it, and terminate it

    Returns -13 if recruiting raised, -14 if no usable instance was recruited;
    otherwise renders via renderFramesOnInstance() and terminates the instance.
    '''
    # give this worker thread its own asyncio event loop; close it on exit so
    # we don't leak one loop per thread (the original never closed it)
    eLoop = asyncio.new_event_loop()
    asyncio.set_event_loop( eLoop )
    try:
        # unique per-recruitment file names so concurrent threads don't collide
        randomPart = str( uuid.uuid4() )[0:13]
        launchedJsonFilePath = dataDirPath+'/recruitLaunched_' + randomPart + '.json'
        resultsLogFilePath = dataDirPath+'/recruitInstance_' + randomPart + '.jlog'
        try:
            instance = recruitInstance( launchedJsonFilePath, resultsLogFilePath )
        except Exception as exc:
            logger.info( 'got exception from recruitInstance (%s) %s', type(exc), exc )
            return -13
        if not instance:
            logger.warning( 'no good instance from recruit')
            return -14
        else:
            renderFramesOnInstance( instance )
            # done rendering (or gave up); always terminate the recruited instance
            iid = instance['instanceId']
            logOperation( 'terminateFinal', [iid], '<master>' )
            ncs.terminateInstances( args.authToken, [iid] )
    finally:
        eLoop.close()
def renderFramesOnInstance( inst ) -> int:
    '''worker threadproc: claim frames from the shared queue and render them on one instance.

    inst: dict describing a launched instance; must contain 'instanceId' and 'ssh' specs.
    Returns 0 normally, -1 if the initial rsync of the blend file failed (instance
    is terminated in that case). Mutates shared globals: g_workingInstances,
    g_framesToDo, g_framesFinished, and per-frame detail records.
    '''
    # per-frame time budget, capped by the overall job time limit
    timeLimit = min( args.frameTimeLimit, args.timeLimit )
    rsyncTimeLimit = min( 18000, timeLimit )  # was 240; have used 1800 for big files
    iid = inst['instanceId']
    abbrevIid = iid[0:16]  # shortened id for log readability
    g_workingInstances.append( iid )
    saveProgress()
    logger.info( 'would render frames on instance %s', abbrevIid )
    # rsync the blend file, with standardized dest file name
    logFrameState( -1, 'rsyncing', iid, 0 )
    blendFileName = 'render.blend'  # was SpinningCube_002_c.blend 'cube0c.blend'
    (rc, stderr) = rsyncToRemote( args.blendFilePath, blendFileName, inst, timeLimit=rsyncTimeLimit )
    if rc == 0:
        logFrameState( -1, 'rsynced', iid )
    else:
        # could not upload the blend file; log, terminate the worker, and bail out
        logStderr( stderr.rstrip(), iid )
        logFrameState( -1, 'rsyncFailed', iid, rc )
        logger.warning( 'rc from rsync was %d', rc )
        logOperation( 'terminateFailedWorker', iid, '<master>')
        ncs.terminateInstances( args.authToken, [iid] )
        g_workingInstances.remove( iid )
        saveProgress()
        return -1  # go no further if we can't rsync to the worker

    def trackStderr( proc ):
        # reader thread: echo and log each stderr line from the remote ssh process
        for line in proc.stderr:
            print( '<stderr>', abbrevIid, line.strip(), file=sys.stderr )
            logStderr( line.rstrip(), iid )

    def trackStdout( proc ):
        # reader thread: parse blender's stdout for tile progress, log the rest
        nonlocal frameProgress
        for line in proc.stdout:
            #print( '<stdout>', abbrevIid, line.strip(), file=sys.stderr )
            if 'Path Tracing Tile' in line:
                pass
                # yes, this progress-parsing code does work
                pat = r'Path Tracing Tile ([0-9]+)/([0-9]+)'
                match = re.search( pat, line )
                if match:
                    # fraction of tiles done for the current frame, 0.0 - 1.0
                    frameProgress = float( match.group(1) ) / float( match.group(2) )
            elif '| Updating ' in line:
                pass  # noisy scene-update chatter; ignore
            elif '| Synchronizing object |' in line:
                pass  # noisy sync chatter; ignore
            elif line.strip():
                print( '<stdout>', abbrevIid, line.strip(), file=sys.stderr )
                logStdout( line.rstrip(), iid )

    nFailures = 0  # consecutive-ish failure count; 3 failures retires this worker
    # main work loop: keep claiming frames until done, signaled, deadline, or too many failures
    while len( g_framesFinished) < g_nFramesWanted:
        if sigtermSignaled():
            break
        if time.time() >= g_deadline:
            logger.warning( 'exiting thread because global deadline has passed' )
            break
        if nFailures >= 3:
            logger.warning( 'exiting thread because instance has encountered %d failures', nFailures )
            logOperation( 'terminateFailedWorker', iid, '<master>')
            ncs.terminateInstances( args.authToken, [iid] )
            break
        #logger.info( '%s would claim a frame; %d done so far', abbrevIid, len( g_framesFinished) )
        try:
            frameNum = g_framesToDo.popleft()  # claim a frame atomically from the deque
        except IndexError:
            #logger.info( 'empty g_framesToDo' )
            time.sleep(10)
            # nothing queued; if there are far more workers than unfinished frames,
            # this worker is surplus and terminates itself
            overageFactor = 3
            nUnfinished = g_nFramesWanted - len(g_framesFinished)
            nWorkers = len( g_workingInstances )
            if nWorkers > (nUnfinished * overageFactor):
                logger.warning( 'exiting thread because not many left to do (%d unfinished, %d workers)', nUnfinished, nWorkers )
                logOperation( 'terminateExcessWorker', iid, '<master>')
                g_workingInstances.remove( iid )
                ncs.terminateInstances( args.authToken, [iid] )
                break
            continue
        # per-frame progress record, persisted by saveProgress()
        frameDetails = { 'frameNum': frameNum, 'elapsedTime': 0, 'progress': 0 }
        frameDetails[ 'lastDateTime' ] = datetime.datetime.now(datetime.timezone.utc).isoformat()
        # NOTE(review): 'g_.frameDetails' — presumably g_ is a global namespace object;
        # verify this is not a typo for a g_frameDetails dict
        g_.frameDetails[ frameNum ] = frameDetails
        outFileName = g_outFilePattern.replace( '######', '%06d' % frameNum )
        returnCode = None
        # optionally override render resolution via an inline blender python expression
        pyExpr = ''
        if args.width > 0 and args.height > 0:
            pyExpr = '--python-expr "import bpy; scene=bpy.context.scene; '\
                'scene.render.resolution_x=%d; scene.render.resolution_y=%d; '\
                'scene.render.resolution_percentage=100"' % (args.width, args.height)
        #cmd = 'blender -b -noaudio --version'
        cmd = 'blender -b -noaudio --enable-autoexec %s %s -o %s --render-format %s -f %d' % \
            (blendFileName, pyExpr, g_outFilePattern, args.frameFileType, frameNum)
        logger.info( 'commanding %s', cmd )
        sshSpecs = inst['ssh']
        curFrameRendered = False
        logFrameState( frameNum, 'starting', iid )
        frameStartDateTime = datetime.datetime.now(datetime.timezone.utc)
        # run blender remotely over ssh; keepalives guard against silent connection loss
        with subprocess.Popen(['ssh', '-p', str(sshSpecs['port']),
            '-o', 'ServerAliveInterval=360', '-o', 'ServerAliveCountMax=3',
            sshSpecs['user'] + '@' + sshSpecs['host'], cmd],
            encoding='utf8',
            stdout=subprocess.PIPE,  # subprocess.PIPE subprocess.DEVNULL
            stderr=subprocess.PIPE) as proc:
            frameProgress = 0  # updated by trackStdout via nonlocal
            frameProgressReported = 0  # last progress value actually logged
            deadline = min( g_deadline, time.time() + timeLimit )
            stdoutThr = threading.Thread(target=trackStdout, args=(proc,))
            stdoutThr.start()
            stderrThr = threading.Thread(target=trackStderr, args=(proc,))
            stderrThr.start()
            # poll the remote process until it exits, the deadline passes, or sigterm
            while time.time() < deadline:
                proc.poll() # sets proc.returncode
                if proc.returncode == None:
                    # still running; report progress at most every additional 1%
                    if frameProgress > min( .99, frameProgressReported + .01 ):
                        logger.info( 'frame %d on %s is %.1f %% done', frameNum, abbrevIid, frameProgress*100 )
                        frameProgressReported = frameProgress
                        rightNow = datetime.datetime.now(datetime.timezone.utc)
                        frameDetails[ 'lastDateTime' ] = rightNow.isoformat()
                        frameDetails[ 'elapsedTime' ] = (rightNow - frameStartDateTime).total_seconds()
                        frameDetails[ 'progress' ] = frameProgress
                        saveProgress()
                    # past the halfway point of the budget with < 50% done: flag as slow
                    if ((deadline - time.time() < timeLimit/2)) and frameProgress < .5:
                        logger.warning( 'frame %d on %s seems slow', frameNum, abbrevIid )
                        logFrameState( frameNum, 'seemsSlow', iid, frameProgress )
                else:
                    if proc.returncode == 0:
                        logger.info( 'frame %d on %s succeeded', frameNum, abbrevIid )
                        curFrameRendered = True
                    else:
                        logger.warning( 'instance %s gave returnCode %d', abbrevIid, proc.returncode )
                    break
                if sigtermSignaled():
                    break
                time.sleep(10)
            # no returncode after the loop means the process is still running: treat as timeout (124)
            returnCode = proc.returncode if proc.returncode != None else 124
            if returnCode:
                logger.warning( 'renderFailed for frame %d on %s', frameNum, iid )
                logFrameState( frameNum, 'renderFailed', iid, returnCode )
                frameDetails[ 'progress' ] = 0
                g_framesToDo.append( frameNum )  # requeue the frame for another worker
                saveProgress()
                time.sleep(10) # maybe we should retire this instance; at least, making it sleep so it is less competitive
            else:
                logFrameState( frameNum, 'rendered', iid )
                #g_framesFinished.append( frameNum )  # too soon
            # make sure the ssh process is gone before leaving the with-block
            proc.terminate()
            try:
                proc.wait(timeout=5)
                if proc.returncode:
                    logger.warning( 'ssh return code %d', proc.returncode )
            except subprocess.TimeoutExpired:
                logger.warning( 'ssh did not terminate in time' )
        # reader threads finish once the pipes close
        stdoutThr.join()
        stderrThr.join()
        # may not need this logging here
        if returnCode != 0:
            logger.warning( 'blender returnCode %d for %s', returnCode, abbrevIid )
        if curFrameRendered:
            # pull the rendered frame file back from the worker
            logFrameState( frameNum, 'retrieving', iid )
            (returnCode, stderr) = scpFromRemote( outFileName, os.path.join( dataDirPath, outFileName ), inst )
            if returnCode == 0:
                # only now does the frame count as finished
                logFrameState( frameNum, 'retrieved', iid )
                logger.info( 'retrieved frame %d', frameNum )
                logger.info( 'finished %d frames out of %d', len( g_framesFinished), g_nFramesWanted )
                g_framesFinished.append( frameNum )
                rightNow = datetime.datetime.now(datetime.timezone.utc)
                frameDetails[ 'lastDateTime' ] = rightNow.isoformat()
                frameDetails[ 'elapsedTime' ] = (rightNow - frameStartDateTime).total_seconds()
                frameDetails[ 'progress' ] = 1.0
            else:
                # retrieval failed: requeue the frame
                logStderr( stderr.rstrip(), iid )
                logFrameState( frameNum, 'retrieveFailed', iid, returnCode )
                frameDetails[ 'progress' ] = 0
                g_framesToDo.append( frameNum )
            saveProgress()
        if returnCode:
            nFailures += 1
    # deregister this worker (may already be removed on the excess-worker path)
    if iid in g_workingInstances:
        g_workingInstances.remove( iid )
        saveProgress()
    return 0
def recruitInstances( nWorkersWanted, launchedJsonFilePath, launchWanted, resultsLogFilePath='' ):
    '''launch instances and install blender on them; terminate those that could not install; return list of good instances

    nWorkersWanted: how many instances to recruit
    launchedJsonFilePath: json file describing launched instances (written by launch, then read back here)
    launchWanted: if falsy, skip launching and just process the existing json file
    resultsLogFilePath: jlog path for installer output (defaults to dataDirPath/recruitInstances.jlog)
    Raises ValueError if not enough devices are available; may sys.exit on key-upload failure.
    '''
    logger.info( 'recruiting up to %d instances', nWorkersWanted )
    if not resultsLogFilePath:
        resultsLogFilePath = dataDirPath+'/recruitInstances.jlog'
    goodInstances = []
    if launchWanted:
        logger.info( 'recruiting %d instances', nWorkersWanted )
        # check capacity up front so we can fail fast
        nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
        if nWorkersWanted > (nAvail + 0):
            logger.error( 'not enough devices available (%d requested, %d avail)', nWorkersWanted, nAvail )
            raise ValueError( 'not enough devices available')
        # prepare sshClientKey for launch
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            # no key name given: upload the local public key under a throwaway name
            keyContents = loadSshPubKey().strip()
            randomPart = str( uuid.uuid4() )[0:13]
            #keyContents += ' #' + randomPart
            sshClientKeyName = 'bfr_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
            if respCode < 200 or respCode >= 300:
                logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
                sys.exit( 'could not upload SSH client key')
        #launch
        #logResult( 'operation', {'launchInstances': nWorkersWanted}, '<recruitInstances>' )
        logOperation( 'launchInstances', nWorkersWanted, '<recruitInstances>' )
        rc = launchInstances( args.authToken, nWorkersWanted, sshClientKeyName,
            launchedJsonFilePath, filtersJson=args.filter, encryptFiles = args.encryptFiles )
        if rc:
            logger.debug( 'launchInstances returned %d', rc )
        # delete sshClientKey only if we just uploaded it
        if sshClientKeyName != args.sshClientKeyName:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    launchedInstances = []
    # get instances from the launched json file
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # tolerate a bad/empty json file; proceed with an empty list
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )
    # terminate any instances that never reached the 'started' state
    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    # add instances to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )
    # install blender on startedInstances
    if not sigtermSignaled():
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'calling tellInstances to install on %d instances', len(startedInstances))
        stepStatuses = tellInstances.tellInstances( startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None, sshAgent=args.sshAgent,
            timeLimit=min(args.instTimeLimit, args.timeLimit), upload=None,
            stopOnSigterm=not True, knownHostsOnly=True )
        # SHOULD restore our handler because tellInstances may have overridden it
        #signal.signal( signal.SIGTERM, sigtermHandler )
        if not stepStatuses:
            # installer produced nothing; treat all started instances as bad
            logger.warning( 'no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            logOperation( 'terminateBad', startedIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, startedIids )
            return []
        # split installer results into good and bad instances
        (goodOnes, badOnes) = triage( stepStatuses )
        #stepTiming.finish()
        #eventTimings.append(stepTiming)
        logger.info( '%d good installs, %d bad installs', len(goodOnes), len(badOnes) )
        logger.info( 'stepStatuses %s', stepStatuses )
        goodInstances = [inst for inst in startedInstances if inst['instanceId'] in goodOnes ]
        badIids = []
        for status in badOnes:
            badIids.append( status['instanceId'] )
        if badIids:
            logOperation( 'terminateBad', badIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, badIids )
        #if goodInstances:
        #    recycleInstances( goodInstances )
    return goodInstances
def recruitInstance( launchedJsonFilePath, resultsLogFilePathIgnored ):
    '''launch exactly one instance and install blender on it via ssh.

    launchedJsonFilePath: json file that launchInstances writes and this function reads back
    resultsLogFilePathIgnored: accepted for signature compatibility; not used here
    Returns the instance dict on success, None on any failure (failed instances are terminated).
    Raises Exception if the SSH client key could not be uploaded.
    '''
    logger.info( 'recruiting 1 instance' )
    nWorkersWanted = 1
    # prepare sshClientKey for launch
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        # no key name given: upload the local public key under a throwaway name
        keyContents = loadSshPubKey().strip()
        randomPart = str( uuid.uuid4() )[0:13]
        #keyContents += ' #' + randomPart
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            raise Exception( 'could not upload SSH client key')
    #launch
    logOperation( 'launchInstances', 1, '<recruitInstances>' )
    rc = launchInstances( args.authToken, 1, sshClientKeyName,
        launchedJsonFilePath, filtersJson=args.filter, encryptFiles = args.encryptFiles )
    if rc:
        logger.debug( 'launchInstances returned %d', rc )
    # delete sshClientKey only if we just uploaded it
    if sshClientKeyName != args.sshClientKeyName:
        logger.info( 'deleting sshClientKey %s', sshClientKeyName)
        ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    if rc:
        return None  # launch failed
    launchedInstances = []
    # get instances from the launched json file
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )
    # terminate any instances that never reached the 'started' state
    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    if len(startedInstances) != 1:
        logger.warning( 'launched %d instances', len(startedInstances) )
        return None
    inst = startedInstances[0]
    iid = inst['instanceId']
    abbrevIid = iid[0:16]  # shortened id for log readability

    def trackStderr( proc ):
        # reader thread: echo and log each stderr line from the remote installer
        for line in proc.stderr:
            print( '<stderr>', abbrevIid, line.strip(), file=sys.stderr )
            logInstallerEvent( 'stderr', line.strip(), iid )

    if sigtermSignaled():
        logger.warning( 'terminating instance because sigtermSignaled %s', iid )
        logOperation( 'terminateFinal', [iid], '<master>' )
        ncs.terminateInstances( args.authToken, [iid] )
        return None
    else:
        # add instance to knownHosts
        with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
            jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )
        # install blender on startedInstance
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'installerCmd %s', installerCmd )
        sshSpecs = inst['ssh']
        deadline = min( g_deadline, time.time() + args.instTimeLimit )
        logInstallerOperation( iid, ['connect', sshSpecs['host'], sshSpecs['port']] )
        # run the installer remotely over ssh; keepalives guard against connection loss
        with subprocess.Popen(['ssh', '-p', str(sshSpecs['port']),
            '-o', 'ServerAliveInterval=360', '-o', 'ServerAliveCountMax=3',
            sshSpecs['user'] + '@' + sshSpecs['host'], installerCmd],
            encoding='utf8',
            #stdout=subprocess.PIPE,  # subprocess.PIPE subprocess.DEVNULL
            stderr=subprocess.PIPE) as proc:
            logInstallerOperation( iid, ['command', installerCmd] )
            stderrThr = threading.Thread(target=trackStderr, args=(proc,))
            stderrThr.start()
            # poll the installer until it exits, the deadline passes, or sigterm
            while time.time() < deadline:
                proc.poll() # sets proc.returncode
                if proc.returncode == None:
                    logger.info( 'waiting for install')
                else:
                    if proc.returncode == 0:
                        logger.info( 'installer succeeded on instance %s', abbrevIid )
                    else:
                        logger.warning( 'instance %s gave returnCode %d', abbrevIid, proc.returncode )
                    break
                if sigtermSignaled():
                    break
                time.sleep(30)
            proc.poll()
            returnCode = proc.returncode if proc.returncode != None else 124  # declare timeout if no rc
            if returnCode:
                logger.warning( 'installer returnCode %s', returnCode )
            if returnCode == 124:
                logInstallerEvent( 'timeout', args.instTimeLimit, iid )
            else:
                logInstallerEvent('returncode', returnCode, iid )
            # make sure the ssh process is gone before leaving the with-block
            proc.terminate()
            try:
                proc.wait(timeout=5)
                if proc.returncode:
                    logger.warning( 'ssh return code %d', proc.returncode )
            except subprocess.TimeoutExpired:
                logger.warning( 'ssh did not terminate in time' )
            stderrThr.join()
            if returnCode:
                # install failed or timed out; discard the instance
                logger.warning( 'terminating instance because installerFailed %s', iid )
                ncs.terminateInstances( args.authToken, [iid] )
                logOperation( 'terminateBad', [iid], '<recruitInstances>' )
                return None
            else:
                return inst
    return None
parResultList = list( parIter ) logger.info( 'finished initial thread pool') # wait until it is time to exit while len(g_framesFinished) < g_nFramesWanted and sigtermNotSignaled() and time.time()< g_deadline: logger.info( 'waiting for frames to finish') time.sleep( 10 ) if args.launch: if not goodInstances: logger.info( 'no good instances to terminate') else: logger.info( 'terminating %d instances', len( goodInstances) ) iids = [inst['instanceId'] for inst in goodInstances] logOperation( 'terminateFinal', iids, '<master>' ) ncs.terminateInstances( args.authToken, iids ) else: with open( dataDirPath + '/survivingInstances.json','w' ) as outFile: json.dump( list(goodInstances), outFile ) nFramesFinished = len(g_framesFinished) if nFramesFinished: encodeTo264( dataDirPath, settingsToSave['outVideoFileName'], args.frameRate, startFrame=args.startFrame, frameFileType=extensions[args.frameFileType] ) elapsed = time.time() - startTime logger.info( 'rendered %d frames using %d "good" instances', len(g_framesFinished), len(goodInstances) ) logger.info( 'finished; elapsed time %.1f seconds (%.1f minutes)', elapsed, elapsed/60 ) logOperation( 'finished', {'nInstancesRecruited': len(goodInstances),
def renderFrame( frameNum ):
    '''render one frame in its own working subdir via the runDistributedBlender.py subprocess.

    frameNum: frame number to render.
    Returns 0 on success; on failure returns a nonzero return code or the exception
    object that was caught (consistent with how callers inspect the result).
    Always triages, terminates bad, and recycles good instances via the finally block.
    '''
    frameDirPath = os.path.join( dataDirPath, 'frame_%06d' % frameNum )
    frameFileName = frameFilePattern.replace( '######', '%06d' % frameNum )
    installerFilePath = os.path.join(frameDirPath, 'data', 'runDistributedBlender.py.jlog' )
    logResult( 'frameStart', frameNum, frameNum )
    os.makedirs( frameDirPath+'/data', exist_ok=True )
    instances = allocateInstances( args.nWorkers, frameDirPath+'/data/launched.json')
    try:
        # build the child command line; every element is stringified below
        cmd = [ scriptDirPath()+'/runDistributedBlender.py',
            os.path.realpath( args.blendFilePath ),
            '--launch', False,
            '--authToken', args.authToken,
            '--blocks_user', args.blocks_user,
            '--nWorkers', args.nWorkers,
            '--filter', args.filter,
            '--width', args.width,
            '--height', args.height,
            '--seed', args.seed,
            '--timeLimit', args.timeLimit,
            '--useCompositor', args.useCompositor,
            '--frame', frameNum
        ]
        cmd = [ str( arg ) for arg in cmd ]
        logger.info( 'frame %d, %s', frameNum, frameDirPath )
        logger.info( 'cmd %s', cmd )
        # (removed a vestigial "if True:" wrapper that only added a nesting level)
        try:
            retCode = subprocess.call( cmd, cwd=frameDirPath,
                stdout=sys.stdout, stderr=sys.stderr )
        except Exception as exc:
            logger.warning( 'runDistributedBlender call threw exception (%s) %s',type(exc), exc )
            logResult( 'exception', str(exc), frameNum )
            return exc
        else:
            logger.info( 'RC from runDistributed: %d', retCode )
        if retCode:
            logResult( 'retCode', retCode, frameNum )
            #recycleInstances( instances )
            return retCode
        frameFilePath = os.path.join( frameDirPath, 'data', frameFileName)
        if not os.path.isfile( frameFilePath ):
            # subprocess reported success but produced no frame file
            logResult( 'retCode', errno.ENOENT, frameNum )
            return FileNotFoundError( errno.ENOENT, 'could not render frame', frameFileName )
        outFilePath = os.path.join(dataDirPath, frameFileName )
        logger.info( 'moving %s to %s', frameFilePath, dataDirPath )
        try:
            # remove any stale copy first so shutil.move cannot fail on an existing dest
            if os.path.isfile( outFilePath ):
                os.remove( outFilePath )
            # was recomputing os.path.join(frameDirPath, 'data', frameFileName) inline;
            # reuse the frameFilePath computed above
            shutil.move( frameFilePath, dataDirPath )
        except Exception as exc:
            logger.warning( 'trouble moving %s (%s) %s', frameFileName, type(exc), exc )
            logResult( 'exception', str(exc), frameNum )
            return exc
    finally:
        # always runs, even on the early returns above: triage installer results,
        # terminate badly-installed instances, and recycle the rest
        (byInstance, badSet) = demuxResults( installerFilePath )
        if badSet:
            logger.warning( 'instances not well installed: %s', badSet )
        badIids = []
        goodInstances = []
        # remove bad instances from the list of instances to recycle
        for inst in instances:
            iid = inst['instanceId']
            if iid in badSet:
                badIids.append( iid )
            else:
                goodInstances.append( inst )
        # terminate any bad instances
        if badIids:
            logResult( 'renderFrame would terminate bad instances', badIids, frameNum )
            ncs.terminateInstances( args.authToken, badIids )
        instances = goodInstances
        # recycle the (hopefully non-bad) instances
        recycleInstances( instances )
    logResult( 'frameEnd', frameNum, frameNum )
    return 0