Пример #1
0
def allocateInstances( nWorkersWanted, launchedJsonFilePath ):
    '''get nWorkersWanted instances, reusing released ones if enough are on hand,
        otherwise launching new ones; return a list of instance dicts

    nWorkersWanted: number of instances to obtain
    launchedJsonFilePath: path of the json file that receives the reused instances,
        or that is passed to launchInstances and read back afterwards

    If g_releasedInstances holds at least nWorkersWanted entries, that many are
    popped from its left end, dumped to launchedJsonFilePath and returned; nothing
    is launched. Otherwise instances are launched, read back from
    launchedJsonFilePath and handed to jsonToKnownHosts together with
    ~/.ssh/known_hosts (opened for append). The returned list may be shorter than
    nWorkersWanted; a warning is logged when it is.

    raises ValueError when fewer than nWorkersWanted devices are available;
    exits (sys.exit) when a temporary ssh client key could not be uploaded
    '''
    # see if there are enough released ones to reuse
    toReuse = []
    with g_releasedInstancesLock:
        if len( g_releasedInstances ) >= nWorkersWanted:
            toReuse = [g_releasedInstances.popleft() for _ in range( nWorkersWanted )]
    if toReuse:
        logger.info( 'REUSING instances')
        with open( launchedJsonFilePath,'w' ) as outFile:
            json.dump( toReuse, outFile )
        return toReuse

    nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
    if nWorkersWanted > nAvail:
        logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
        raise ValueError( 'not enough devices available')
    # upload a temporary sshClientKey for the launch (unless one was provided)
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = runDistributedBlender.loadSshPubKey()
        randomPart = str( uuid.uuid4() )[0:13]
        keyContents += ' #' + randomPart
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            sys.exit( 'could not upload SSH client key')
    try:
        logResult( 'launchInstances', nWorkersWanted, 'allocateInstances' )
        rc = runDistributedBlender.launchInstances( args.authToken, nWorkersWanted,
            sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter )
    finally:
        # delete sshClientKey only if we just uploaded it; done in a finally so the
        # temporary key is not left behind when the launch raises
        if sshClientKeyName != args.sshClientKeyName:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    if rc:
        logger.debug( 'launchInstances returned %d', rc )

    # get instances from the launched json file
    launchedInstances = []
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    # hand the launched instances to jsonToKnownHosts for the user's known_hosts file
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( launchedInstances, khFile )
    return launchedInstances
Пример #2
0
        if launchWanted:
            nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
            if nWorkersWanted > (nAvail + 5):
                logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
                sys.exit(1)
            if nWorkersWanted == 0:
                logger.info( '%d devices available to launch', nAvail )
                nWorkersWanted = nAvail
            if args.sshClientKeyName:
                sshClientKeyName = args.sshClientKeyName
            else:
                keyContents = loadSshPubKey()
                randomPart = str( uuid.uuid4() )[0:13]
                keyContents += ' #' + randomPart
                sshClientKeyName = 'pingtest_%s' % (randomPart)
                respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
                if respCode < 200 or respCode >= 300:
                    logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
                    sys.exit( 'could not upload SSH client key')

            rc = launchInstances( args.authToken, launchedJsonFilePath, nWorkersWanted, sshClientKeyName, filtersJson=args.filter )
            # delete sshClientKey only if we just uploaded it
            if sshClientKeyName != args.sshClientKeyName:
                logger.info( 'deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
            if rc:
                logger.warning( 'launchInstances returned %d', rc )
                sys.exit( 'could not launch instances')
        # find out if any instances were started
        startedInstances = getStartedInstances( launchedJsonFilePath )  # a list of instance dicts
        # do the actual work, unless we shouldn't
Пример #3
0
def recruitInstances(nWorkersWanted, launchedJsonFilePath, launchWanted,
                     resultsLogFilePath, installerFileName):
    '''launch instances and install fahclient on them;
        terminate those that did not start or could not install;
        return a tuple (goodInstances, badOnes)

    nWorkersWanted: number of instances to launch
    launchedJsonFilePath: path of the json file listing launched instances;
        passed to launchInstances when launching, and always read here
    launchWanted: when false, nothing is launched and the existing json file is used
    resultsLogFilePath: passed through to tellInstances
    installerFileName: file name of the installer; run on each instance as './<name>'

    goodInstances is the list of instance dicts whose installer status was the
    int 0; badOnes is the list of tellInstances status dicts for all the others.
    Both are empty when no instance started, when the installer returned no
    statuses, or when sigterm was signaled before the install step.

    raises ValueError when fewer than nWorkersWanted devices are available;
    exits (sys.exit) when a temporary ssh client key could not be uploaded
    '''
    logger.info('recruiting up to %d instances', nWorkersWanted)
    goodInstances = []
    # defined up front so the final return is valid even when the install step
    # is skipped because sigterm was signaled
    badOnes = []
    if launchWanted:
        nAvail = ncs.getAvailableDeviceCount(args.authToken,
                                             filtersJson=args.filter)
        if nWorkersWanted > nAvail:
            logger.error(
                'not enough devices available (%d requested, %d avail)',
                nWorkersWanted, nAvail)
            raise ValueError('not enough devices available')
        # upload an sshClientKey for launch (unless one was provided)
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str(uuid.uuid4())[0:13]
            sshClientKeyName = 'fah_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey(args.authToken, sshClientKeyName,
                                              keyContents)
            if respCode < 200 or respCode >= 300:
                logger.warning('ncs.uploadSshClientKey returned %s', respCode)
                sys.exit('could not upload SSH client key')
        #launch
        try:
            rc = launchInstances(args.authToken,
                                 nWorkersWanted,
                                 sshClientKeyName,
                                 launchedJsonFilePath,
                                 filtersJson=args.filter,
                                 encryptFiles=args.encryptFiles)
            if rc:
                logger.info('launchInstances returned %d', rc)
        finally:
            # delete sshClientKey only if we just uploaded it; done in a
            # finally so the temporary key is not left behind if launch raises
            if sshClientKeyName != args.sshClientKeyName:
                logger.info('deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey(args.authToken, sshClientKeyName)
    launchedInstances = []
    # get instances from the launched json file
    with open(launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning('could not load json (%s) %s', type(exc), exc)
    if len(launchedInstances) < nWorkersWanted:
        logger.warning(
            'could not launch as many instances as wanted (%d vs %d)',
            len(launchedInstances), nWorkersWanted)
    nonstartedIids = [
        inst['instanceId'] for inst in launchedInstances
        if inst['state'] != 'started'
    ]
    if nonstartedIids:
        logger.warning('terminating non-started instances %s', nonstartedIids)
        terminateNcsScInstances(args.authToken, nonstartedIids)
        logger.info('done terminating non-started instances')
    # proceed with instances that were actually started
    startedInstances = [
        inst for inst in launchedInstances if inst['state'] == 'started'
    ]
    if not startedInstances:
        return ([], [])
    if not sigtermSignaled():
        installerCmd = './' + installerFileName
        logger.info('calling tellInstances to install on %d instances',
                    len(startedInstances))
        stepStatuses = tellInstances.tellInstances(
            startedInstances,
            installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None,
            downloadDestDir=None,
            jsonOut=None,
            sshAgent=args.sshAgent,
            timeLimit=args.timeLimit,
            upload=args.uploads,
            stopOnSigterm=False,
            knownHostsOnly=False)
        # SHOULD restore our handler because tellInstances may have overridden it
        if not stepStatuses:
            # nothing can be said about any instance, so give them all up
            logger.warning('no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            terminateNcsScInstances(args.authToken, startedIids)
            return ([], [])
        # separate good tellInstances statuses from bad ones;
        # only an int status of 0 counts as a good install
        goodOnes = []
        for status in stepStatuses:
            if isinstance(status['status'], int) and status['status'] == 0:
                goodOnes.append(status['instanceId'])
            else:
                badOnes.append(status)
                if isinstance(status['status'], asyncio.TimeoutError):
                    logger.info('installer status asyncio.TimeoutError')

        logger.info('%d good installs, %d bad installs', len(goodOnes),
                    len(badOnes))
        goodInstances = [
            inst for inst in startedInstances if inst['instanceId'] in goodOnes
        ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            terminateNcsScInstances(args.authToken, badIids)

    return goodInstances, badOnes
Пример #4
0
def recruitInstances( nWorkersWanted, launchedJsonFilePath, launchWanted, resultsLogFilePath='' ):
    '''launch instances and install blender on them;
        terminate those that did not start or could not install;
        return list of good instances

    nWorkersWanted: number of instances to launch
    launchedJsonFilePath: path of the json file listing launched instances;
        passed to launchInstances when launching, and always read here
    launchWanted: when false, nothing is launched and the existing json file is used
    resultsLogFilePath: passed through to tellInstances; when empty,
        dataDirPath+'/recruitInstances.jlog' is used

    The returned list holds the instance dicts that triage() classified as good;
    it is empty when the installer returned no statuses or when sigterm was
    signaled before the install step.

    raises ValueError when fewer than nWorkersWanted devices are available;
    exits (sys.exit) when a temporary ssh client key could not be uploaded
    '''
    logger.info( 'recruiting up to %d instances', nWorkersWanted )
    if not resultsLogFilePath:
        resultsLogFilePath = dataDirPath+'/recruitInstances.jlog'
    goodInstances = []
    if launchWanted:
        logger.info( 'recruiting %d instances', nWorkersWanted )
        nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
        if nWorkersWanted > nAvail:
            logger.error( 'not enough devices available (%d requested, %d avail)', nWorkersWanted, nAvail )
            raise ValueError( 'not enough devices available')
        # prepare sshClientKey for launch
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str( uuid.uuid4() )[0:13]
            sshClientKeyName = 'bfr_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
            if respCode < 200 or respCode >= 300:
                logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
                sys.exit( 'could not upload SSH client key')
        #launch
        try:
            logOperation( 'launchInstances', nWorkersWanted, '<recruitInstances>' )
            rc = launchInstances( args.authToken, nWorkersWanted,
                sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter,
                encryptFiles = args.encryptFiles
                )
            if rc:
                logger.debug( 'launchInstances returned %d', rc )
        finally:
            # delete sshClientKey only if we just uploaded it; done in a finally so
            # the temporary key is not left behind when the launch raises
            if sshClientKeyName != args.sshClientKeyName:
                logger.info( 'deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    launchedInstances = []
    # get instances from the launched json file
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )
    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    # add instances to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )
    # install blender on startedInstances
    if not sigtermSignaled():
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'calling tellInstances to install on %d instances', len(startedInstances))
        # the install is bounded by whichever of the two time limits is shorter
        stepStatuses = tellInstances.tellInstances( startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None, sshAgent=args.sshAgent,
            timeLimit=min(args.instTimeLimit, args.timeLimit), upload=None, stopOnSigterm=False,
            knownHostsOnly=True
            )
        # SHOULD restore our handler because tellInstances may have overridden it
        if not stepStatuses:
            # nothing can be said about any instance, so give them all up
            logger.warning( 'no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            logOperation( 'terminateBad', startedIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, startedIids )
            return []
        (goodOnes, badOnes) = triage( stepStatuses )
        logger.info( '%d good installs, %d bad installs', len(goodOnes), len(badOnes) )
        logger.info( 'stepStatuses %s', stepStatuses )
        goodInstances = [inst for inst in startedInstances if inst['instanceId'] in goodOnes ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            logOperation( 'terminateBad', badIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, badIids )
    return goodInstances
Пример #5
0
def recruitInstance( launchedJsonFilePath, resultsLogFilePathIgnored ):
    '''launch one instance and install blender on it over ssh;
        terminate it if it did not start or could not install;
        return the instance dict, or None on any failure

    launchedJsonFilePath: path of the json file passed to launchInstances and read back here
    resultsLogFilePathIgnored: not used

    None is returned when launchInstances gave a nonzero code, when the number
    of started instances is not exactly 1, when sigterm was signaled, or when
    the installer failed or did not finish before the deadline.

    raises Exception when a temporary ssh client key could not be uploaded
    '''
    logger.info( 'recruiting 1 instance' )
    nWorkersWanted = 1
    # prepare sshClientKey for launch
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = loadSshPubKey().strip()
        randomPart = str( uuid.uuid4() )[0:13]
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            raise Exception( 'could not upload SSH client key')
    #launch
    try:
        logOperation( 'launchInstances', 1, '<recruitInstances>' )
        rc = launchInstances( args.authToken, 1,
            sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter,
            encryptFiles = args.encryptFiles
            )
    finally:
        # delete sshClientKey only if we just uploaded it; done in a finally so the
        # temporary key is not left behind when the launch raises
        if sshClientKeyName != args.sshClientKeyName:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    if rc:
        logger.debug( 'launchInstances returned %d', rc )
        return None
    launchedInstances = []
    # get instances from the launched json file
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )
    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    if len(startedInstances) != 1:
        logger.warning( 'launched %d instances', len(startedInstances) )
        return None

    inst = startedInstances[0]
    iid = inst['instanceId']
    abbrevIid = iid[0:16]
    def trackStderr( proc ):
        '''echo each stderr line of proc to our stderr and to the installer log'''
        for line in proc.stderr:
            print( '<stderr>', abbrevIid, line.strip(), file=sys.stderr )
            logInstallerEvent( 'stderr', line.strip(), iid )

    if sigtermSignaled():
        logger.warning( 'terminating instance because sigtermSignaled %s', iid )
        logOperation( 'terminateFinal', [iid], '<master>' )
        ncs.terminateInstances( args.authToken, [iid] )
        return None

    # add instance to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )
    # install blender on startedInstance
    installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
    logger.info( 'installerCmd %s', installerCmd )
    sshSpecs = inst['ssh']
    # the install may not run past the global deadline nor past its own time limit
    deadline = min( g_deadline, time.time() + args.instTimeLimit )
    logInstallerOperation( iid, ['connect', sshSpecs['host'], sshSpecs['port']] )
    with subprocess.Popen(['ssh',
                    '-p', str(sshSpecs['port']),
                    '-o', 'ServerAliveInterval=360',
                    '-o', 'ServerAliveCountMax=3',
                    sshSpecs['user'] + '@' + sshSpecs['host'], installerCmd],
                    encoding='utf8',
                    stderr=subprocess.PIPE) as proc:
        logInstallerOperation( iid, ['command', installerCmd] )
        stderrThr = threading.Thread(target=trackStderr, args=(proc,))
        stderrThr.start()
        # poll every 30 seconds until the installer exits, the deadline passes,
        # or sigterm is signaled
        while time.time() < deadline:
            proc.poll() # sets proc.returncode
            if proc.returncode is None:
                logger.info( 'waiting for install')
            else:
                if proc.returncode == 0:
                    logger.info( 'installer succeeded on instance %s', abbrevIid )
                else:
                    logger.warning( 'instance %s gave returnCode %d', abbrevIid, proc.returncode )
                break
            if sigtermSignaled():
                break
            time.sleep(30)
        proc.poll()
        # declare timeout if no rc (this also covers leaving the loop on sigterm)
        returnCode = proc.returncode if proc.returncode is not None else 124
        if returnCode:
            logger.warning( 'installer returnCode %s', returnCode )
        if returnCode == 124:
            logInstallerEvent( 'timeout', args.instTimeLimit, iid )
        else:
            logInstallerEvent('returncode', returnCode, iid )
        proc.terminate()
        try:
            proc.wait(timeout=5)
            if proc.returncode:
                logger.warning( 'ssh return code %d', proc.returncode )
        except subprocess.TimeoutExpired:
            logger.warning( 'ssh did not terminate in time' )
            # kill it so the stderr reader sees EOF; otherwise the join below (and
            # the implicit wait on leaving the with block) could block indefinitely
            proc.kill()
        stderrThr.join()
        if returnCode:
            logger.warning( 'terminating instance because installerFailed %s', iid )
            ncs.terminateInstances( args.authToken, [iid] )
            logOperation( 'terminateBad', [iid], '<recruitInstances>' )
            return None
        return inst