Пример #1
0
def allocateInstances( nWorkersWanted, launchedJsonFilePath ):
    '''obtain nWorkersWanted instances, reusing released ones when enough are on hand

    If g_releasedInstances holds at least nWorkersWanted entries, that many are
    popped (under g_releasedInstancesLock), written to launchedJsonFilePath and
    returned. Otherwise new instances are launched, read back from
    launchedJsonFilePath, appended to ~/.ssh/known_hosts and returned.

    nWorkersWanted: the number of instances to reuse or launch
    launchedJsonFilePath: path of the json file that receives the instance list

    returns a list of instances (may be shorter than nWorkersWanted if the launch fell short)
    raises ValueError if fewer devices are available than requested;
        exits via sys.exit if a temporary ssh client key could not be uploaded
    '''
    # see if there are enough released ones to reuse (all or nothing)
    toReuse = []
    with g_releasedInstancesLock:
        if len( g_releasedInstances ) >= nWorkersWanted:
            toReuse = [g_releasedInstances.popleft() for _ in range( nWorkersWanted )]
    if toReuse:
        logger.info( 'REUSING instances')
        with open( launchedJsonFilePath,'w' ) as outFile:
            json.dump( toReuse, outFile )
        return toReuse

    nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
    if nWorkersWanted > nAvail:
        logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
        raise ValueError( 'not enough devices available')

    # use the configured sshClientKey, or upload a temporary one with a random name
    keyWasUploaded = False
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = runDistributedBlender.loadSshPubKey()
        randomPart = str( uuid.uuid4() )[0:13]
        keyContents += ' #' + randomPart
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            sys.exit( 'could not upload SSH client key')
        keyWasUploaded = True

    try:
        logResult( 'launchInstances', nWorkersWanted, 'allocateInstances' )
        rc = runDistributedBlender.launchInstances( args.authToken, nWorkersWanted,
            sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter )
        if rc:
            logger.debug( 'launchInstances returned %d', rc )
    finally:
        # delete sshClientKey only if we just uploaded it; doing this in a
        # finally clause keeps a failed launch from leaving the temporary key behind
        if keyWasUploaded:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )

    # get instances from the launched json file
    launchedInstances = []
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    # add the launched instances to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( launchedInstances, khFile )
    return launchedInstances
Пример #2
0
def getInstancesAvailable(authToken, args):
    '''reports how many devices are currently available to launch

    returns a (jsonify'd body, status code) pair: an error message with 401 when
    authToken is missing or empty, otherwise the available-device count with 200;
    the optional 'filter' entry of args is passed through to ncs
    '''
    if authToken:
        deviceFilter = args.get('filter', None)
        startedAt = time.time()
        availCount = ncs.getAvailableDeviceCount(authToken, deviceFilter)
        elapsed = time.time() - startedAt
        logger.info('ncs.getAvailableDeviceCount took %.1f seconds', elapsed)
        return jsonify(availCount), 200
    return jsonify('no authToken provided'), 401
Пример #3
0
def getInstancesAvailable(authToken, args):
    '''reports how many devices are currently available to launch

    the 'filter' entry of args (if any) is first passed through applyDprIfNone
    (with g_minDpr) and then applyMinRamIfNone (with g_minRamMB) before being
    handed to ncs

    returns a (jsonify'd body, status code) pair: an error message with 401 when
    authToken is missing or empty, otherwise the available-device count with 200
    '''
    if not authToken:
        return jsonify('no authToken provided'), 401

    effectiveFilter = applyMinRamIfNone(
        applyDprIfNone(args.get('filter', None), g_minDpr),
        g_minRamMB)

    startedAt = time.time()
    availCount = ncs.getAvailableDeviceCount(authToken, effectiveFilter)
    elapsed = time.time() - startedAt
    logger.info('ncs.getAvailableDeviceCount took %.1f seconds', elapsed)
    return jsonify(availCount), 200
Пример #4
0
def checkForInstances():
    '''threadproc that polls once a minute and starts a recruitAndRender thread
    whenever there are too few working instances for the unfinished frames'''
    workersPerFrame = 2  # want this many workers for each unfinished frame
    pollSeconds = 60

    def shouldContinue():
        # keep polling until all frames are done, sigterm arrives, or the deadline passes
        return (len(g_framesFinished) < g_nFramesWanted
                and sigtermNotSignaled()
                and time.time() < g_deadline)

    while shouldContinue():
        framesLeft = g_nFramesWanted - len(g_framesFinished)
        workerCount = len( g_workingInstances )
        shortOfWorkers = workerCount < (framesLeft * workersPerFrame)
        # only ask ncs for the device count when more workers are actually needed
        if shortOfWorkers and ncs.getAvailableDeviceCount(
                args.authToken, filtersJson=args.filter ) >= 2:
            logger.warning( 'starting thread because not enough workers (%d unfinished, %d workers)',
                framesLeft, workerCount )
            threading.Thread( target=recruitAndRender, name='recruitAndRender' ).start()
        time.sleep( pollSeconds )
    logger.info( 'finished')
Пример #5
0
            logger.error( 'could not ping target host %s',
                args.targetHost )
            sys.exit(1)
    except Exception as exc:
        logger.warning( 'could not access target host %s',args.targetHost )
        logger.error( 'got exception %s', exc )
        sys.exit(1)

    nWorkersWanted = args.nWorkers
    if launchWanted:
        # overwrite the launchedJson file as empty list, so we won't have problems with stale contents
        with open( launchedJsonFilePath, 'w' ) as outFile:
            json.dump( [], outFile )
    try:
        if launchWanted:
            nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
            if nWorkersWanted > (nAvail + 5):
                logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
                sys.exit(1)
            if nWorkersWanted == 0:
                logger.info( '%d devices available to launch', nAvail )
                nWorkersWanted = nAvail
            if args.sshClientKeyName:
                sshClientKeyName = args.sshClientKeyName
            else:
                keyContents = loadSshPubKey()
                randomPart = str( uuid.uuid4() )[0:13]
                keyContents += ' #' + randomPart
                sshClientKeyName = 'pingtest_%s' % (randomPart)
                respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
                if respCode < 200 or respCode >= 300:
Пример #6
0
def recruitInstances(nWorkersWanted, launchedJsonFilePath, launchWanted,
                     resultsLogFilePath, installerFileName):
    '''launch instances and install fahclient on them;
        terminate those that could not install

    Args:
        nWorkersWanted: the number of instances to launch
        launchedJsonFilePath: path of the json file listing launched instances;
            it is read even when launchWanted is false
        launchWanted: if true, launch new instances before installing
        resultsLogFilePath: passed to tellInstances as its results log path
        installerFileName: name of the installer, run on each instance as './<name>'

    Returns:
        a tuple (goodInstances, badOnes): goodInstances are the started
        instances whose installer status was the integer 0; badOnes are the
        tellInstances status records of the others. Both are empty lists when
        nothing started, when no statuses came back, or when the install step
        was skipped because sigterm was signaled.

    Raises:
        ValueError: if fewer devices are available than requested.
        Exits via sys.exit if a temporary ssh client key could not be uploaded.
    '''
    logger.info('recruiting up to %d instances', nWorkersWanted)
    goodInstances = []
    # bound up front so the final return is valid even when the install step
    # is skipped (previously this raised UnboundLocalError after a sigterm)
    badOnes = []
    if launchWanted:
        nAvail = ncs.getAvailableDeviceCount(args.authToken,
                                             filtersJson=args.filter)
        if nWorkersWanted > nAvail:
            logger.error(
                'not enough devices available (%d requested, %d avail)',
                nWorkersWanted, nAvail)
            raise ValueError('not enough devices available')
        # upload an sshClientKey for launch (unless one was provided)
        keyWasUploaded = False
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str(uuid.uuid4())[0:13]
            sshClientKeyName = 'fah_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey(args.authToken, sshClientKeyName,
                                              keyContents)
            if respCode < 200 or respCode >= 300:
                logger.warning('ncs.uploadSshClientKey returned %s', respCode)
                sys.exit('could not upload SSH client key')
            keyWasUploaded = True
        try:
            #launch
            rc = launchInstances(args.authToken,
                                 nWorkersWanted,
                                 sshClientKeyName,
                                 launchedJsonFilePath,
                                 filtersJson=args.filter,
                                 encryptFiles=args.encryptFiles)
            if rc:
                logger.info('launchInstances returned %d', rc)
        finally:
            # delete sshClientKey only if we just uploaded it; the finally
            # clause keeps a failed launch from leaving the temporary key behind
            if keyWasUploaded:
                logger.info('deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey(args.authToken, sshClientKeyName)
    launchedInstances = []
    # get instances from the launched json file
    with open(launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning('could not load json (%s) %s', type(exc), exc)
    if len(launchedInstances) < nWorkersWanted:
        logger.warning(
            'could not launch as many instances as wanted (%d vs %d)',
            len(launchedInstances), nWorkersWanted)
    nonstartedIids = [
        inst['instanceId'] for inst in launchedInstances
        if inst['state'] != 'started'
    ]
    if nonstartedIids:
        logger.warning('terminating non-started instances %s', nonstartedIids)
        terminateNcsScInstances(args.authToken, nonstartedIids)
        logger.info('done terminating non-started instances')
    # proceed with instances that were actually started
    startedInstances = [
        inst for inst in launchedInstances if inst['state'] == 'started'
    ]
    if not startedInstances:
        return ([], [])
    if not sigtermSignaled():
        installerCmd = './' + installerFileName
        logger.info('calling tellInstances to install on %d instances',
                    len(startedInstances))
        stepStatuses = tellInstances.tellInstances(
            startedInstances,
            installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None,
            downloadDestDir=None,
            jsonOut=None,
            sshAgent=args.sshAgent,
            timeLimit=args.timeLimit,
            upload=args.uploads,
            stopOnSigterm=False,
            knownHostsOnly=False)
        # SHOULD restore our handler because tellInstances may have overridden it
        #signal.signal( signal.SIGTERM, sigtermHandler )
        if not stepStatuses:
            # without any statuses, none of the started instances can be trusted
            logger.warning('no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            terminateNcsScInstances(args.authToken, startedIids)
            return ([], [])
        # separate good tellInstances statuses from bad ones;
        # only an integer status of 0 counts as a good install
        goodOnes = []
        for status in stepStatuses:
            if isinstance(status['status'], int) and status['status'] == 0:
                goodOnes.append(status['instanceId'])
            else:
                badOnes.append(status)
                if isinstance(status['status'], asyncio.TimeoutError):
                    logger.info('installer status asyncio.TimeoutError')

        logger.info('%d good installs, %d bad installs', len(goodOnes),
                    len(badOnes))
        goodInstances = [
            inst for inst in startedInstances if inst['instanceId'] in goodOnes
        ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            terminateNcsScInstances(args.authToken, badIids)

    return goodInstances, badOnes
Пример #7
0
def recruitInstances( nWorkersWanted, launchedJsonFilePath, launchWanted, resultsLogFilePath='' ):
    '''launch instances and install blender on them;
        terminate those that could not install; return list of good instances

    nWorkersWanted: the number of instances to launch
    launchedJsonFilePath: path of the json file listing launched instances;
        it is read even when launchWanted is false
    launchWanted: if true, launch new instances before installing
    resultsLogFilePath: passed to tellInstances as its results log path;
        defaults to recruitInstances.jlog under dataDirPath when empty

    returns the started instances that triage() classified as good installs
        (an empty list if no statuses came back or sigterm was signaled)
    raises ValueError if fewer devices are available than requested;
        exits via sys.exit if a temporary ssh client key could not be uploaded
    '''
    logger.info( 'recruiting up to %d instances', nWorkersWanted )
    if not resultsLogFilePath:
        resultsLogFilePath = dataDirPath+'/recruitInstances.jlog'
    goodInstances = []
    if launchWanted:
        logger.info( 'recruiting %d instances', nWorkersWanted )
        nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
        if nWorkersWanted > nAvail:
            logger.error( 'not enough devices available (%d requested, %d avail)', nWorkersWanted, nAvail )
            raise ValueError( 'not enough devices available')
        # prepare sshClientKey for launch (upload a temporary one unless one was provided)
        keyWasUploaded = False
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str( uuid.uuid4() )[0:13]
            sshClientKeyName = 'bfr_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
            if respCode < 200 or respCode >= 300:
                logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
                sys.exit( 'could not upload SSH client key')
            keyWasUploaded = True
        try:
            #launch
            logOperation( 'launchInstances', nWorkersWanted, '<recruitInstances>' )
            rc = launchInstances( args.authToken, nWorkersWanted,
                sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter,
                encryptFiles = args.encryptFiles
                )
            if rc:
                logger.debug( 'launchInstances returned %d', rc )
        finally:
            # delete sshClientKey only if we just uploaded it; the finally
            # clause keeps a failed launch from leaving the temporary key behind
            if keyWasUploaded:
                logger.info( 'deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    launchedInstances = []
    # get instances from the launched json file
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )
    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    # add instances to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )
    # install blender on startedInstances
    if not sigtermSignaled():
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'calling tellInstances to install on %d instances', len(startedInstances))
        stepStatuses = tellInstances.tellInstances( startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None, sshAgent=args.sshAgent,
            timeLimit=min(args.instTimeLimit, args.timeLimit), upload=None, stopOnSigterm=False,
            knownHostsOnly=True
            )
        # SHOULD restore our handler because tellInstances may have overridden it
        #signal.signal( signal.SIGTERM, sigtermHandler )
        if not stepStatuses:
            # without any statuses, none of the started instances can be trusted
            logger.warning( 'no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            logOperation( 'terminateBad', startedIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, startedIids )
            return []
        (goodOnes, badOnes) = triage( stepStatuses )
        logger.info( '%d good installs, %d bad installs', len(goodOnes), len(badOnes) )
        logger.info( 'stepStatuses %s', stepStatuses )
        goodInstances = [inst for inst in startedInstances if inst['instanceId'] in goodOnes ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            logOperation( 'terminateBad', badIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, badIids )
    return goodInstances