def _setQueue(taskName, batchState):
    """Wait for *taskName* to complete, then configure the SGE queue on the
    cluster master named in the batch pipeline config and disable
    autoshutdown on that master.

    Generator-based (uses ``yield`` on Deferreds); presumably wrapped with
    defer.inlineCallbacks at the definition site -- decorator not visible
    in this chunk, TODO confirm.

    :param taskName: name of the task to block on before configuring
    :param batchState: dict holding 'pipeline_config', which supplies
        the cluster name under key 'cluster.CLUSTER_NAME'
    """
    # Block until the named task has finished.
    yield _blockOnTask(taskName)

    # Look up the cluster record; 'guest' appears to be the user context
    # for the lookup -- verify against loadCluster's signature.
    cluster = yield loadCluster(
        'localhost',
        batchState['pipeline_config']['cluster.CLUSTER_NAME'],
        'guest')

    # Run set_queue.sh against the master's public DNS, retrying up to
    # 10 times with a 2 second sleep between failed attempts.
    yield defer_utils.tryUntil(
        10,
        lambda: _getOutput(batchState, [
            '/opt/clovr_pipelines/workflow/project_saved_templates/clovr_lgt_wrapper/set_queue.sh',
            cluster['master']['public_dns']
        ],
                           log=True),
        onFailure=defer_utils.sleep(2))

    # NOTE(review): the file handle opened here is never explicitly closed.
    conf = config.configFromStream(open('/tmp/machine.conf'))

    # Remove autoshutdown, we want no part of that
    yield ssh.runProcessSSH(cluster['master']['public_dns'],
                            'rm -v /var/vappio/runtime/noautoshutdown',
                            stdoutf=None,
                            stderrf=None,
                            sshUser=conf('ssh.user'),
                            sshFlags=conf('ssh.options'),
                            log=True)
def runInstances(credClient, ami, key, iType, groups, availZone, bidPrice,
                 minInstances, maxInstances, userData):
    """Start up to *maxInstances* cloud instances, retrying until at least
    *minInstances* have been accumulated.

    A truthy *bidPrice* selects the spot-instance API, otherwise the
    on-demand API is used.  Retries up to RUN_INSTANCE_TRIES times with a
    30 second sleep between attempts.  If the minimum still cannot be met,
    every instance that was started is terminated (in chunks of 5) and the
    error is re-raised to the caller.

    Generator-based (uses ``yield`` on Deferreds); presumably wrapped with
    defer.inlineCallbacks at the definition site -- decorator not visible
    in this chunk.
    """
    def _runInstances(num):
        # Dispatch to the spot or on-demand API; everything except the
        # instance count is passed through from the enclosing scope.
        if bidPrice:
            return credClient.runSpotInstances(bidPrice=bidPrice,
                                               ami=ami,
                                               key=key,
                                               instanceType=iType,
                                               groups=groups,
                                               availabilityZone=availZone,
                                               numInstances=num,
                                               userData=userData)
        else:
            return credClient.runInstances(ami=ami,
                                           key=key,
                                           instanceType=iType,
                                           groups=groups,
                                           availabilityZone=availZone,
                                           numInstances=num,
                                           userData=userData)

    # Accumulates every instance started across retries so a failure can
    # clean all of them up.
    instances = []

    @defer.inlineCallbacks
    def _startInstances():
        # Only request the current shortfall on each attempt, not the
        # full count again.
        startedInstances = yield _runInstances(maxInstances - len(instances))
        instances.extend(startedInstances)
        if len(instances) < minInstances:
            # BUGFIX: report the required minimum and the running total;
            # the old message showed the post-extend shortfall and only
            # this attempt's count, which was misleading.
            raise InstanceStartError('Wanted %d instances got %d' %
                                     (minInstances, len(instances)))

    try:
        yield defer_utils.tryUntil(RUN_INSTANCE_TRIES,
                                   _startInstances,
                                   onFailure=defer_utils.sleep(30))
    except Exception as err:  # py2.6+/py3-compatible except syntax
        ## If we got an exception then terminate any instances
        ## that were started and reraise exception.
        ## The last thing we want is to leak instances
        ##
        ## This is not completely safe! We should probably
        ## raise an exception with the started instances in it
        ## and let the caller decide what to do with them
        log.err('Error starting instances')
        log.err(err)
        # BUGFIX: wait for the terminations to complete and actually
        # re-raise, as the comment above promises; previously the
        # Deferred from mapSerial was dropped (fire-and-forget) and the
        # error was silently swallowed.
        yield defer_utils.mapSerial(
            lambda iChunk: credClient.terminateInstances(iChunk),
            func.chunk(5, instances))
        raise
def initialize(self):
    """Create the pipelines mongo cache, register this object as a
    dependent of the persist and tag-notify managers, and enqueue every
    known pipeline for dict-conversion and caching.
    """
    def _cacheId(doc):
        # Cache _id is "<user_name>_<pipeline_name>".
        return func.updateDict(doc, {"_id": doc["user_name"] + "_" + doc["pipeline_name"]})

    self.cache = yield mongo_cache.createCache("pipelines_cache", _cacheId)

    # Subscribe to notifications from both managers.
    for manager in (self.persistManager, self.tagNotify):
        manager.addDependent(self)

    # Load every pipeline, retrying up to 10 times with a 2s pause
    # between failed attempts.
    _loadAll = lambda: self.persistManager.loadAllPipelinesByAdmin({})
    pipelines = yield defer_utils.tryUntil(10,
                                           _loadAll,
                                           onFailure=defer_utils.sleep(2))

    for p in pipelines:
        self.workQueue.add(self._pipelineToDictAndCache, "load", p)
def subscribe(mq, state):
    """Start pipeline monitoring, then register the www pipeline-list
    request handler on the message queue.
    """
    # Bring up the monitor first; retry up to 10 times, sleeping 2
    # seconds after each failure.
    def _startMonitor():
        return _monitorAnyPipelines(mq, state)

    yield defer_utils.tryUntil(10, _startMonitor,
                               onFailure=defer_utils.sleep(2))

    # Request pipeline: validate required body keys, forward to the
    # owning cluster, then hand off to the www list handler.
    handler = queue.returnResponse(
        defer_pipe.pipe([queue.keysInBody(['cluster', 'user_name']),
                         _forwardToCluster(state.conf,
                                           state.conf('pipelines.list_www')),
                         handleWWWPipelineList]))

    queue.subscribe(mq,
                    state.conf('pipelines.list_www'),
                    state.conf('pipelines.concurrent_list'),
                    queue.wrapRequestHandler(state, handler))
def subscribe(mq, state):
    """Kick off pipeline monitoring and wire the www pipeline-list
    handler into the message queue.
    """
    # Monitor bootstrap is retried: 10 attempts, 2 seconds apart.
    yield defer_utils.tryUntil(10,
                               lambda: _monitorAnyPipelines(mq, state),
                               onFailure=defer_utils.sleep(2))

    # Stages the incoming request flows through, in order.
    stages = [queue.keysInBody(['cluster', 'user_name']),
              _forwardToCluster(state.conf, state.conf('pipelines.list_www')),
              handleWWWPipelineList]
    processPipelineList = queue.returnResponse(defer_pipe.pipe(stages))

    queue.subscribe(mq,
                    state.conf('pipelines.list_www'),
                    state.conf('pipelines.concurrent_list'),
                    queue.wrapRequestHandler(state, processPipelineList))
def runInstances(credClient, ami, key, iType, groups, availZone, bidPrice,
                 minInstances, maxInstances, userData):
    """Acquire between *minInstances* and *maxInstances* instances from
    *credClient*, retrying RUN_INSTANCE_TRIES times (30s apart).

    Uses the spot API when *bidPrice* is truthy, the on-demand API
    otherwise.  On overall failure, all instances that were started are
    terminated in chunks of 5 and the error is re-raised.

    Generator-based (uses ``yield`` on Deferreds); presumably wrapped
    with defer.inlineCallbacks where defined -- decorator not visible here.
    """
    def _runInstances(num):
        # Choose spot vs on-demand based on bidPrice.
        if bidPrice:
            return credClient.runSpotInstances(bidPrice=bidPrice,
                                               ami=ami,
                                               key=key,
                                               instanceType=iType,
                                               groups=groups,
                                               availabilityZone=availZone,
                                               numInstances=num,
                                               userData=userData)
        else:
            return credClient.runInstances(ami=ami,
                                           key=key,
                                           instanceType=iType,
                                           groups=groups,
                                           availabilityZone=availZone,
                                           numInstances=num,
                                           userData=userData)

    # Running total of everything started, for cleanup on failure.
    instances = []

    @defer.inlineCallbacks
    def _startInstances():
        # Each attempt only asks for the remaining shortfall.
        startedInstances = yield _runInstances(maxInstances - len(instances))
        instances.extend(startedInstances)
        if len(instances) < minInstances:
            # BUGFIX: message now states the minimum required and the
            # total acquired; it previously printed the post-extend
            # shortfall against this attempt's count.
            raise InstanceStartError(
                'Wanted %d instances got %d' % (minInstances,
                                                len(instances)))

    try:
        yield defer_utils.tryUntil(RUN_INSTANCE_TRIES,
                                   _startInstances,
                                   onFailure=defer_utils.sleep(30))
    except Exception as err:  # py2.6+/py3-compatible except syntax
        ## If we got an exception then terminate any instances
        ## that were started and reraise exception.
        ## The last thing we want is to leak instances
        ##
        ## This is not completely safe! We should probably
        ## raise an exception with the started instances in it
        ## and let the caller decide what to do with them
        log.err('Error starting instances')
        log.err(err)
        # BUGFIX: yield the termination work and re-raise as the comment
        # above states; the Deferred was previously discarded and the
        # exception swallowed, leaking the failure from the caller's view.
        yield defer_utils.mapSerial(
            lambda iChunk: credClient.terminateInstances(iChunk),
            func.chunk(5, instances))
        raise
def initialize(self):
    """Create the pipelines mongo cache, register as a dependent of the
    persist and tag-notify managers, and enqueue every pipeline for
    conversion and caching.

    Generator-based (uses ``yield`` on Deferreds); presumably wrapped with
    defer.inlineCallbacks at the definition site -- decorator not visible
    in this chunk.
    """
    # Cache _id is "<user_name>_<pipeline_name>".
    cacheId = lambda d : func.updateDict(d, {'_id': d['user_name'] + '_' + d['pipeline_name']})
    self.cache = yield mongo_cache.createCache('pipelines_cache', cacheId)
    # Receive notifications from both managers.
    self.persistManager.addDependent(self)
    self.tagNotify.addDependent(self)
    # Load all pipelines, retrying up to 10 times with a 2 second pause
    # between failed attempts.
    pipelines = yield defer_utils.tryUntil(10,
                                           lambda : self.persistManager.loadAllPipelinesByAdmin({}),
                                           onFailure=defer_utils.sleep(2))
    for pipeline in pipelines:
        # Each pipeline is dict-converted and cached via the work queue.
        self.workQueue.add(self._pipelineToDictAndCache, 'load', pipeline)
def performQueryNoParse(host, url, var, headers=None, timeout=30, tries=4, debug=False):
    """POST *var* (JSON-encoded under the 'request' form field) to
    http://host+url and return a Deferred firing with the raw page body.

    The request is attempted up to *tries* times with a 10 second sleep
    between failures; any final failure is logged and passed through.
    Raises RetriesFailed immediately when *tries* is 0.
    """
    if tries == 0:
        raise RetriesFailed()

    requestHeaders = {} if headers is None else headers

    def _post():
        # Rebuilt on every retry so each attempt is a fresh request.
        target = ('http://' + host + url).encode('utf_8')
        body = urllib.urlencode({'request': json.dumps(var)})
        return getPage(target,
                       method='POST',
                       postdata=body,
                       headers=func.updateDict(requestHeaders,
                                               {'Content-Type': 'application/x-www-form-urlencoded'}),
                       connectionTimeout=timeout,
                       timeout=timeout)

    def _logAndPassThrough(failure):
        log.err(failure)
        return failure

    deferred = defer_utils.tryUntil(tries, _post,
                                    onFailure=defer_utils.sleep(10))
    deferred.addErrback(_logAndPassThrough)
    return deferred
def _setQueue(taskName, batchState):
    """Block on *taskName*, then run the queue-setup script on the cluster
    master from the batch config and remove its autoshutdown marker.

    Generator-based (uses ``yield`` on Deferreds); presumably wrapped with
    defer.inlineCallbacks where defined -- decorator not visible here.
    """
    # Wait for the prerequisite task to complete.
    yield _blockOnTask(taskName)
    # Resolve the cluster by the name stored in the pipeline config;
    # 'guest' appears to be the lookup's user context.
    cluster = yield loadCluster('localhost',
                                batchState['pipeline_config']['cluster.CLUSTER_NAME'],
                                'guest')
    # Execute set_queue.sh against the master, retrying 10 times with a
    # 2 second sleep after each failure.
    yield defer_utils.tryUntil(10,
                               lambda : _getOutput(batchState,
                                                   ['/opt/clovr_pipelines/workflow/project_saved_templates/clovr_lgt_wrapper/set_queue.sh',
                                                    cluster['master']['public_dns']],
                                                   log=True),
                               onFailure=defer_utils.sleep(2))
    # NOTE(review): this file handle is never explicitly closed.
    conf = config.configFromStream(open('/tmp/machine.conf'))
    # Remove autoshutdown, we want no part of that
    yield ssh.runProcessSSH(cluster['master']['public_dns'],
                            'rm -v /var/vappio/runtime/noautoshutdown',
                            stdoutf=None,
                            stderrf=None,
                            sshUser=conf('ssh.user'),
                            sshFlags=conf('ssh.options'),
                            log=True)
def instantiateCredential(conf, cred):
    """Build an EC2-style credential record from *cred*, converting Nimbus
    certs to EC2 format on first use and registering the cluster keypair.

    Returns a Deferred firing with a func.Record carrying the cert/key
    paths, the ec2 tool path, and the environment for EC2 commands.
    """
    # Load the full config file the first time through.
    if not conf('config_loaded', default=False):
        conf = config.configFromConfig(conf,
                                       base=config.configFromStream(open(conf('conf_file')),
                                                                    base=conf))

    certFile = os.path.join(conf('general.secure_tmp'), cred.name + '_cert.pem')
    keyFile = os.path.join(conf('general.secure_tmp'), cred.name + '_key.pem')

    mainDeferred = defer.succeed(None)

    # Only convert when neither output file exists yet.
    if not os.path.exists(certFile) and not os.path.exists(keyFile):
        tmpCertFile = os.path.join(conf('general.secure_tmp'), cred.name + '_cert-tmp.pem')
        tmpKeyFile = os.path.join(conf('general.secure_tmp'), cred.name + '_key-tmp.pem')
        if 'ec2_url' not in cred.metadata:
            return defer.fail(Exception('You must have an ec2_url'))
        parsedUrl = urlparse.urlparse(cred.metadata['ec2_url'])
        if ':' not in parsedUrl.netloc:
            return defer.fail(Exception('Your URL must contain a port'))
        host, port = parsedUrl.netloc.split(':')

        # Write the raw cert and private key to temp files for conversion.
        fout = open(tmpCertFile, 'w')
        fout.write(cred.cert)
        fout.close()

        fout = open(tmpKeyFile, 'w')
        fout.write(cred.pkey)
        fout.close()

        # Convert Nimbus certs to EC2 format; also installs the java cert.
        d = commands.runProcess(['nimbusCerts2EC2.py',
                                 '--in-cert=' + tmpCertFile,
                                 '--out-cert=' + certFile,
                                 '--in-key=' + tmpKeyFile,
                                 '--out-key=' + keyFile,
                                 '--java-cert-dir=/tmp',
                                 '--java-cert-host=' + host,
                                 '--java-cert-port=' + port],
                                stdoutf=None,
                                stderrf=None,
                                log=True)

        def _chmod(_exitCode):
            # Make the converted key world-readable.
            return commands.runProcess(['chmod', '+r', keyFile],
                                       stdoutf=None,
                                       stderrf=None)

        d.addCallback(_chmod)

        def _unlink(v):
            # Remove the temp files on both success and failure paths;
            # returns its argument so the chain's value/failure passes on.
            os.unlink(tmpCertFile)
            os.unlink(tmpKeyFile)
            return v

        d.addCallback(_unlink)
        # NOTE(review): if _unlink itself fails in the callback above,
        # this errback would call _unlink again on already-removed files.
        d.addErrback(_unlink)

        mainDeferred.addCallback(lambda _ : d)

    ec2Home = cred.metadata.get('ec2_api_tools', '/opt/ec2-api-tools-1.3-57419')
    newCred = func.Record(name=cred.name,
                          conf=conf,
                          cert=certFile,
                          pkey=keyFile,
                          ec2Path=os.path.join(ec2Home, 'bin'),
                          env=dict(EC2_JVM_ARGS='-Djavax.net.ssl.trustStore=/tmp/jssecacerts',
                                   EC2_HOME=ec2Home,
                                   EC2_URL=cred.metadata['ec2_url']))

    # If the cluster public key exists, register it as a keypair,
    # retrying up to 10 times with a 30 second pause between failures.
    if os.path.exists(conf('cluster.cluster_private_key') + '.pub'):
        pubKey = open(conf('cluster.cluster_private_key') + '.pub').read().rstrip()

        def _addKeypair():
            keyPairDefer = ec2.addKeypair(newCred, conf('cluster.key') + '||' + pubKey)

            def _printError(f):
                log.msg('Adding keypair failed, retrying')
                log.err(f)
                return f

            keyPairDefer.addErrback(_printError)
            return keyPairDefer

        mainDeferred.addCallback(lambda _ : defer_utils.tryUntil(10,
                                                                 _addKeypair,
                                                                 onFailure=defer_utils.sleep(30)))

    mainDeferred.addCallback(lambda _ : newCred)
    return mainDeferred
def _(*args, **kwargs):
    """Call f with the given arguments, retried up to n times with a 30
    second sleep between attempts; retryIfTTLError decides whether a
    failure is retryable.
    """
    def _invoke():
        return f(*args, **kwargs)

    return defer_utils.tryUntil(n,
                                _invoke,
                                onFailure=defer_utils.sleep(30),
                                retry=retryIfTTLError)
def instantiateCredential(conf, cred):
    """Produce an EC2 credential record for *cred*: converts Nimbus
    cert/key material into EC2 format (once), optionally registers the
    cluster keypair, and fires the returned Deferred with a func.Record.
    """
    # First call: layer the on-disk config file under the given conf.
    if not conf('config_loaded', default=False):
        conf = config.configFromConfig(conf,
                                       base=config.configFromStream(open(
                                           conf('conf_file')), base=conf))

    certFile = os.path.join(conf('general.secure_tmp'), cred.name + '_cert.pem')
    keyFile = os.path.join(conf('general.secure_tmp'), cred.name + '_key.pem')

    mainDeferred = defer.succeed(None)

    # Skip conversion entirely when both outputs already exist.
    if not os.path.exists(certFile) and not os.path.exists(keyFile):
        tmpCertFile = os.path.join(conf('general.secure_tmp'), cred.name + '_cert-tmp.pem')
        tmpKeyFile = os.path.join(conf('general.secure_tmp'), cred.name + '_key-tmp.pem')
        if 'ec2_url' not in cred.metadata:
            return defer.fail(Exception('You must have an ec2_url'))
        parsedUrl = urlparse.urlparse(cred.metadata['ec2_url'])
        if ':' not in parsedUrl.netloc:
            return defer.fail(Exception('Your URL must contain a port'))
        host, port = parsedUrl.netloc.split(':')

        # Dump the in-memory cert and key to temp files for the converter.
        fout = open(tmpCertFile, 'w')
        fout.write(cred.cert)
        fout.close()

        fout = open(tmpKeyFile, 'w')
        fout.write(cred.pkey)
        fout.close()

        # External converter; also installs java certs for the host:port.
        d = commands.runProcess([
            'nimbusCerts2EC2.py',
            '--in-cert=' + tmpCertFile,
            '--out-cert=' + certFile,
            '--in-key=' + tmpKeyFile,
            '--out-key=' + keyFile,
            '--java-cert-dir=/tmp',
            '--java-cert-host=' + host,
            '--java-cert-port=' + port
        ],
                                stdoutf=None,
                                stderrf=None,
                                log=True)

        def _chmod(_exitCode):
            # Converted key must be readable.
            return commands.runProcess(['chmod', '+r', keyFile],
                                       stdoutf=None,
                                       stderrf=None)

        d.addCallback(_chmod)

        def _unlink(v):
            # Clean up temp files; passes its argument straight through
            # so it works as both callback and errback.
            os.unlink(tmpCertFile)
            os.unlink(tmpKeyFile)
            return v

        d.addCallback(_unlink)
        # NOTE(review): an error raised by the _unlink callback above
        # would reach this errback and attempt a second unlink.
        d.addErrback(_unlink)

        mainDeferred.addCallback(lambda _: d)

    ec2Home = cred.metadata.get('ec2_api_tools', '/opt/ec2-api-tools-1.3-57419')
    newCred = func.Record(
        name=cred.name,
        conf=conf,
        cert=certFile,
        pkey=keyFile,
        ec2Path=os.path.join(ec2Home, 'bin'),
        env=dict(EC2_JVM_ARGS='-Djavax.net.ssl.trustStore=/tmp/jssecacerts',
                 EC2_HOME=ec2Home,
                 EC2_URL=cred.metadata['ec2_url']))

    # Register the cluster public key as a keypair when present;
    # retried 10 times, 30 seconds apart.
    if os.path.exists(conf('cluster.cluster_private_key') + '.pub'):
        pubKey = open(conf('cluster.cluster_private_key')
                      + '.pub').read().rstrip()

        def _addKeypair():
            keyPairDefer = ec2.addKeypair(newCred, conf('cluster.key') + '||' + pubKey)

            def _printError(f):
                log.msg('Adding keypair failed, retrying')
                log.err(f)
                return f

            keyPairDefer.addErrback(_printError)
            return keyPairDefer

        mainDeferred.addCallback(lambda _: defer_utils.tryUntil(
            10, _addKeypair, onFailure=defer_utils.sleep(30)))

    mainDeferred.addCallback(lambda _: newCred)
    return mainDeferred
def _(*args, **kwargs):
    """Retrying proxy for f: up to n attempts, sleeping 30 seconds
    between them, with retryIfTTLError gating which failures retry.
    """
    return defer_utils.tryUntil(
        n,
        lambda: f(*args, **kwargs),
        onFailure=defer_utils.sleep(30),
        retry=retryIfTTLError,
    )