def _createSecurityGroup(self):
    """
    Create the security group named after the cluster, unless it already exists,
    and return the matching groups.

    A group created by this call is opened for SSH (TCP port 22) from any address
    and for all TCP and UDP ports between members of the group itself. A group that
    already exists is left as it is.

    :return: all security groups whose name equals self.clusterName; if the VPC of
             self._vpcSubnet could be looked up, only groups in that VPC.
    :rtype: list
    """
    assert self._ctx

    def groupNotFound(e):
        # Authorizing is retried while EC2 answers that the group does not exist
        return e.status == 400 and 'does not exist in default VPC' in e.body

    # Work out which VPC the configured subnet (if any) belongs to
    vpcId = None
    if self._vpcSubnet:
        vpcConnection = boto.connect_vpc(region=self._ctx.ec2.region)
        matchingSubnets = vpcConnection.get_all_subnets(subnet_ids=[self._vpcSubnet])
        if matchingSubnets:
            vpcId = matchingSubnets[0].vpc_id

    # security group create/get. ssh + all ports open within the group
    try:
        web = self._ctx.ec2.create_security_group(self.clusterName,
                                                  'Toil appliance security group',
                                                  vpc_id=vpcId)
    except EC2ResponseError as e:
        # group exists- nothing to do; anything else is a real failure
        if not (e.status == 400 and 'already exists' in e.body):
            raise
    else:
        ingressRules = (
            # open port 22 for ssh-ing
            dict(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0'),
            # the following authorizes all TCP access within the web security group
            dict(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web),
            # We also want to open up UDP, both for user code and for the RealtimeLogger
            dict(ip_protocol='udp', from_port=0, to_port=65535, src_group=web),
        )
        for rule in ingressRules:
            # Each rule gets its own retry budget
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    web.authorize(**rule)

    return [sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name == self.clusterName and (vpcId is None or sg.vpc_id == vpcId)]
def _dockerKill(containerName, action):
    """
    Deprecated. Stops and/or removes the specified container.

    :param str containerName: The name of the container created by docker_call
    :param int action: What action should be taken on the container? None and FORGO
           leave the container untouched; a value >= STOP stops it if it is running;
           a value >= RM additionally removes it with `docker rm -f`.
    """
    running = containerIsRunning(containerName)
    if running is None:
        # This means that the container doesn't exist. We will see this if the
        # container was run with --rm and has already exited before this call.
        logger.debug('The container with name "%s" appears to have already been '
                     'removed. Nothing to do.', containerName)
        return

    if action in (None, FORGO):
        logger.debug('The container with name %s continues to exist as we '
                     'were asked to forgo a post-job action on it.', containerName)
        return

    logger.debug('The container with name %s exists. Running '
                 'user-specified defer functions.', containerName)
    if running and action >= STOP:
        logger.debug('Stopping container "%s".', containerName)
        for attempt in retry(predicate=dockerPredicate):
            with attempt:
                subprocess.check_call(['docker', 'stop', containerName])
    else:
        logger.debug('The container "%s" was not found to be running.', containerName)

    if action >= RM:
        # If the container was run with --rm, then stop will most likely
        # remove the container. We first check if it is running then
        # remove it.
        running = containerIsRunning(containerName)
        if running is not None:
            logger.debug('Removing container "%s".', containerName)
            for attempt in retry(predicate=dockerPredicate):
                with attempt:
                    subprocess.check_call(['docker', 'rm', '-f', containerName])
        else:
            # The two sentences used to be joined without a separating space
            logger.debug('Container "%s" was not found on the system. '
                         'Nothing to remove.', containerName)
def _getProfileArn(self):
    """
    Return the ARN of the instance profile used for cluster nodes, creating the
    profile and attaching the IAM role to it where necessary.

    :return: the ARN of the instance profile named after the IAM role
    :raise RuntimeError: if the existing profile contains more than one role
    """
    assert self._ctx

    def roleNotFound(err):
        # Only retry on HTTP 404. Exceptions that carry no `status` attribute must
        # propagate unchanged instead of being masked by an AttributeError here.
        return getattr(err, 'status', None) == 404

    # NOTE(review): the key 'sbd_full' looks like a typo for 'sdb_full'; it is kept
    # because it is passed on as a policy name - confirm before renaming.
    policy = dict(iam_full=self.full_policy('iam'),
                  ec2_full=self.full_policy('ec2'),
                  s3_full=self.full_policy('s3'),
                  sbd_full=self.full_policy('sdb'))
    iamRoleName = self._ctx.setup_iam_ec2_role(role_name=_INSTANCE_PROFILE_ROLE_NAME,
                                               policies=policy)

    # Fetch the profile, creating it if IAM reports that it does not exist
    try:
        profile = self._ctx.iam.get_instance_profile(iamRoleName)
    except BotoServerError as e:
        if e.status == 404:
            profile = self._ctx.iam.create_instance_profile(iamRoleName)
            profile = profile.create_instance_profile_response.create_instance_profile_result
        else:
            raise
    else:
        profile = profile.get_instance_profile_response.get_instance_profile_result
    profile = profile.instance_profile
    profile_arn = profile.arn

    if len(profile.roles) > 1:
        raise RuntimeError('Did not expect profile to contain more than one role')
    elif len(profile.roles) == 1:
        # this should be profile.roles[0].role_name
        if profile.roles.member.role_name == iamRoleName:
            return profile_arn
        else:
            # A different role is attached; detach it before attaching ours
            self._ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                            profile.roles.member.role_name)
    for attempt in retry(predicate=roleNotFound):
        with attempt:
            self._ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
    return profile_arn
def _fixPermissions(tool, workDir):
    """
    Deprecated.

    Fix permission of a mounted Docker directory by reusing the tool to change
    ownership. Docker natively runs as a root inside the container, and files
    written to the mounted directory are implicitly owned by root. The directory
    is chown-ed recursively to the uid and gid that own `workDir` itself.

    Does nothing when the effective user is root.

    :param str tool: Name of tool
    :param str workDir: Path of work directory to recursively chown
    """
    if os.geteuid() == 0:
        # we're running as root so this chown is redundant
        return

    ownership = os.stat(workDir)
    owner = '{}:{}'.format(ownership.st_uid, ownership.st_gid)
    mount = os.path.abspath(workDir) + ':/data'
    # Run the tool's image with chown as its entrypoint on the mounted directory
    command = ['docker', 'run', '--log-driver=none', '-v', mount, '--rm',
               '--entrypoint=chown', tool, '-R', owner, '/data']
    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            subprocess.check_call(command)
def retry_ec2(retry_after=a_short_time, retry_for=10 * a_short_time, retry_while=not_found):
    """
    Return the result of retry() configured from the given parameters.

    :param retry_after: base delay; the delays passed to retry() are this value
           twice, then twice this value, then four times this value
    :param retry_for: passed to retry() as its timeout
    :param retry_while: passed to retry() as its predicate
    """
    backoff = (retry_after, retry_after, retry_after * 2, retry_after * 4)
    return retry(delays=backoff, timeout=retry_for, predicate=retry_while)
def _getProfileARN(self):
    """
    Return the ARN of the instance profile for the 'toil' IAM role, creating the
    profile and attaching the role to it where necessary.

    :return: the ARN of the instance profile named after the IAM role
    :raise RuntimeError: if the existing profile contains more than one role
    """
    assert self._ctx

    def addRoleErrors(e):
        # Only retry on HTTP 404. Exceptions that carry no `status` attribute must
        # propagate unchanged instead of being masked by an AttributeError here.
        return getattr(e, 'status', None) == 404

    roleName = 'toil'
    # NOTE(review): the key 'sbd_full' looks like a typo for 'sdb_full'; it is kept
    # because it is passed on as a policy name - confirm before renaming.
    policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                  s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
    iamRoleName = self._ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

    # Fetch the profile, creating it if IAM reports that it does not exist
    try:
        profile = self._ctx.iam.get_instance_profile(iamRoleName)
    except BotoServerError as e:
        if e.status == 404:
            profile = self._ctx.iam.create_instance_profile(iamRoleName)
            profile = profile.create_instance_profile_response.create_instance_profile_result
        else:
            raise
    else:
        profile = profile.get_instance_profile_response.get_instance_profile_result
    profile = profile.instance_profile
    profile_arn = profile.arn

    if len(profile.roles) > 1:
        raise RuntimeError('Did not expect profile to contain more than one role')
    elif len(profile.roles) == 1:
        # this should be profile.roles[0].role_name
        if profile.roles.member.role_name == iamRoleName:
            return profile_arn
        else:
            # A different role is attached; detach it before attaching ours
            self._ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                            profile.roles.member.role_name)
    for attempt in retry(predicate=addRoleErrors):
        with attempt:
            self._ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
    return profile_arn
def _try_kubernetes_expecting_gone(self, method, *args, **kwargs):
    """
    Same as _try_kubernetes, but raises 404 errors as soon as they are
    encountered (because we are waiting for them) instead of retrying on them.

    :param method: callable to invoke with the remaining arguments
    :return: whatever `method` returns
    """
    attempts = retry(predicate=retryable_kubernetes_errors_expecting_gone)
    for attempt in attempts:
        with attempt:
            result = method(*args, **kwargs)
            return result
def _download(self, dstFile):
    """
    Download this resource from its URL to the given file object.

    The download is retried while the server answers with HTTP 400. Nothing is
    written to `dstFile` unless the MD5 digest of the content matches
    self.contentHash.

    :type dstFile: io.BytesIO|io.FileIO
    :raise AssertionError: if the downloaded content does not match self.contentHash
    """
    for attempt in retry(predicate=lambda e: isinstance(e, HTTPError) and e.code == 400):
        with attempt:
            with closing(urlopen(self.url)) as content:
                buf = content.read()
    actualHash = hashlib.md5(buf).hexdigest()
    # An explicit check instead of `assert`, so the integrity check is not stripped
    # when Python runs with -O. AssertionError is kept for existing callers.
    if actualHash != self.contentHash:
        raise AssertionError('Content hash %s of %s does not match expected hash %s'
                             % (actualHash, self.url, self.contentHash))
    dstFile.write(buf)
def _discoverAMI(self): """ :return: The AMI ID (a string like 'ami-0a9a5d2b65cce04eb') for CoreOS or a compatible replacement like Flatcar. :rtype: str """ # Take a user override ami = os.environ.get('TOIL_AWS_AMI') if ami is not None: return ami # CoreOS is dead, long live Flatcar # Flatcar images, however, only live for 9 months. # Rather than hardcode a list of AMIs by region that will die, we use # their JSON feed of the current ones. JSON_FEED_URL = 'https://stable.release.flatcar-linux.net/amd64-usr/current/flatcar_production_ami_all.json' # What region do we care about? region = zoneToRegion(self._zone) for attempt in retry(predicate=lambda e: True): # Until we get parseable JSON # TODO: What errors do we get for timeout, JSON parse failure, etc? with attempt: # Try to get the JSON and parse it. feed = json.loads(urllib.request.urlopen(JSON_FEED_URL).read()) try: for ami_record in feed['amis']: # Scan the klist of regions if ami_record['name'] == region: # When we find ours # Save the AMI ID ami = ami_record['hvm'] # And stop scanning break except KeyError: # We didn't see a field we need raise RuntimeError( 'Flatcar image feed at {} does not have expected format'. format(JSON_FEED_URL)) if ami is None: # We didn't find it raise RuntimeError( 'Flatcar image feed at {} does not have an image for region {}' .format(JSON_FEED_URL, region)) return ami
def _try_kubernetes(self, method, *args, **kwargs):
    """
    Kubernetes API can end abruptly and fail when it could dynamically backoff and retry.

    For example, calling self._api('batch').create_namespaced_job(self.namespace, job),
    Kubernetes can behave inconsistently and fail given a large job. See
    https://github.com/DataBiosphere/toil/issues/2884 .

    This function gives Kubernetes more time to try an executable api.

    :param method: callable to invoke with the remaining arguments
    :return: whatever `method` returns
    """
    attempts = retry(predicate=retryable_kubernetes_errors)
    for attempt in attempts:
        with attempt:
            result = method(*args, **kwargs)
            return result
def _discoverAMI(self): def descriptionMatches(ami): return ami.description is not None and 'stable 1855.5.0' in ami.description coreOSAMI = os.environ.get('TOIL_AWS_AMI') if coreOSAMI is not None: return coreOSAMI for attempt in retry(predicate=lambda e: isinstance(e, SSLError)): # SSLError is thrown when get_all_images times out with attempt: # 679593333241 is the aws marketplace account amis = self._ctx.ec2.get_all_images(owners=['679593333241'], filters={'name': 'CoreOS-stable-1855.5.0-hvm-0d1e0bd0-eaea-4397-9a3a-c56f861d2a14-ami-0f74e41ea6c13f74b.4'}) coreOSAMI = [ami for ami in amis if descriptionMatches(ami)] logger.debug('Found the following matching AMIs: %s', coreOSAMI) assert len(coreOSAMI) == 1, coreOSAMI return coreOSAMI.pop().id
def _obtain_credentials_from_boto3(self):
    """
    We know the current cached credentials are not good, and that we
    need to get them from Boto 3. Fill in our credential fields
    (_access_key, _secret_key, _security_token,
    _credential_expiry_time) from Boto 3.

    :raise RuntimeError: if Boto 3 still yields no credentials once retrying
           (any error, for up to 10 seconds) has been exhausted
    """
    # We get a Credentials object
    # <https://github.com/boto/botocore/blob/8d3ea0e61473fba43774eb3c74e1b22995ee7370/botocore/credentials.py#L227>
    # or a RefreshableCredentials, or None on failure.
    creds = None
    for attempt in retry(timeout=10, predicate=lambda _: True):
        with attempt:
            creds = self._boto3_resolver.load_credentials()

            if creds is None:
                try:
                    resolvers = str(self._boto3_resolver.providers)
                except Exception:
                    # Listing the resolvers is best-effort diagnostics only. A bare
                    # `except:` here would also swallow KeyboardInterrupt/SystemExit.
                    resolvers = "(Resolvers unavailable)"
                raise RuntimeError("Could not obtain AWS credentials from Boto3. Resolvers tried: " + resolvers)

            # Make sure the credentials actually has some credentials if it is lazy
            creds.get_frozen_credentials()

    # Get when the credentials will expire, if ever
    if isinstance(creds, RefreshableCredentials):
        # Credentials may expire.
        # Get a naive UTC datetime like boto 2 uses from the boto 3 time.
        self._credential_expiry_time = creds._expiry_time.astimezone(timezone('UTC')).replace(tzinfo=None)
    else:
        # Credentials never expire
        self._credential_expiry_time = None

    # Then, atomically get all the credentials bits. They may be newer than we think they are, but never older.
    frozen = creds.get_frozen_credentials()

    # Copy them into us
    self._access_key = frozen.access_key
    self._secret_key = frozen.secret_key
    self._security_token = frozen.token
def destroyCluster(self):
    """
    Terminate instances and delete the profile and security group.

    The security group is only deleted when every instance of the cluster was
    terminated, i.e. when awsFilterImpairedNodes() held none of them back.
    """
    assert self._ctx

    def expectedShutdownErrors(e):
        # Exceptions that carry no `status` attribute must propagate unchanged
        # instead of being masked by an AttributeError raised in this predicate.
        return getattr(e, 'status', None) == 400 and 'dependent object' in e.body

    instances = self._getNodesInCluster(nodeType=None, both=True)
    spotIDs = self._getSpotRequestIDs()
    if spotIDs:
        self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
    vpcId = None
    if instancesToTerminate:
        # Remember the VPC before the instances are gone; it selects the group below
        vpcId = instancesToTerminate[0].vpc_id
        self._deleteIAMProfiles(instances=instancesToTerminate)
        self._terminateInstances(instances=instancesToTerminate)
    if len(instances) == len(instancesToTerminate):
        logger.info('Deleting security group...')
        removed = False
        for attempt in retry(timeout=300, predicate=expectedShutdownErrors):
            with attempt:
                for sg in self._ctx.ec2.get_all_security_groups():
                    if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                        try:
                            self._ctx.ec2.delete_security_group(group_id=sg.id)
                            removed = True
                        except BotoServerError as e:
                            # A group that is already gone is not an error
                            if e.error_code != 'InvalidGroup.NotFound':
                                raise
        if removed:
            logger.info('... Successfully deleted security group')
    else:
        assert len(instances) > len(instancesToTerminate)
        # the security group can't be deleted until all nodes are terminated
        logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                       'have failed health checks. As a result, the security group & IAM '
                       'roles will not be deleted.')
def _startMesos(self, numCores=None):
    """
    Start a Mesos master thread and a Mesos agent thread and block until the
    master answers an HTTP request on 127.0.0.1:5050.

    :param numCores: passed to both threads; defaults to cpu_count() when None
    """
    cores = cpu_count() if numCores is None else numCores
    shutil.rmtree('/tmp/mesos', ignore_errors=True)

    self.master = self.MesosMasterThread(cores)
    self.master.start()
    self.agent = self.MesosAgentThread(cores)
    self.agent.start()

    def anyError(e):
        # Every failure is worth another attempt
        return True

    # Wait for the master to come up.
    # Bad Things will happen if the master is not yet ready when Toil tries to use it.
    for attempt in retry(predicate=anyError):
        with attempt:
            log.info('Checking if Mesos is ready...')
            with closing(urlopen('http://127.0.0.1:5050/version')) as response:
                response.read()

    log.info('Mesos is ready! Running test.')
def _discoverAMI(self): def descriptionMatches(ami): return ami.description is not None and 'stable 1632.2.1' in ami.description coreOSAMI = os.environ.get('TOIL_AWS_AMI') if coreOSAMI is not None: return coreOSAMI # that ownerID corresponds to coreOS for attempt in retry(predicate=lambda e: isinstance(e, SSLError)): # SSLError is thrown when get_all_images times out with attempt: amis = self._ctx.ec2.get_all_images(owners=['679593333241']) coreOSAMI = [ami for ami in amis if descriptionMatches(ami)] logger.debug('Found the following matching AMIs: %s', coreOSAMI) assert len(coreOSAMI) == 1 return coreOSAMI.pop().id
def subprocessDockerCall(job, tool, parameters=None, workDir=None, dockerParameters=None,
                         checkOutput=True, outfile=None, errfile=None, defer=None):
    """
    Deprecated. Calls Docker using subprocess.check_output().

    Assumes `docker` is on the PATH. Uses Toil's defer functionality to ensure
    containers are shutdown even in case of job or pipeline failure

    Example of using subprocessDockerCall in toil to index a FASTA file with SAMtools:
        def toil_job(job):
            work_dir = job.fileStore.getLocalTempDir()
            path = job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
            parameters = ['faidx', path]
            subprocessDockerCall(job, tool='quay.io/ucgc_cgl/samtools:latest',
                                 workDir=work_dir, parameters=parameters)

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention
           is '/data'. Defaults to the current working directory.
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
           `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the
           destination convention.
           These defaults are removed if dockerParameters is passed, so be sure to pass them if
           they are desired.
    :param bool checkOutput: If True, run the command with subprocess.check_output() and return
           what it returns; otherwise run it with subprocess.check_call().
    :param file outfile: Pipe output of Docker call to file handle. Note that
           subprocess.check_output() rejects an explicit stdout argument, so this cannot be
           combined with checkOutput=True.
    :param file errfile: Pipe standard error of Docker call to file handle
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None
           and `--rm` is among the Docker parameters.
    :return: the return value of subprocess.check_output() or subprocess.check_call()
    :raise RuntimeError: if a `--name` option without a usable value was passed
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    assert defer in (None, FORGO, STOP, RM)

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                # Form `--name=value`
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                # Form `--name value`
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            containerName = getContainerName(job)
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        # Some parameter merely contains '--name'; there is no actual --name option
        containerName = getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: "
                           + str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    # Compare by value: identity comparison of ints only works by accident
    if '--rm' in baseDockerCall and defer != RM:
        logger.warning('--rm being passed to docker call but defer not set to '
                       'dockerCall.RM, defer set to: ' + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job.
    # We call this explicitly later on in this function,
    # but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and isinstance(parameters[0], list):
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(p) for p in [list(map(pipes.quote, q)) for q in parameters]]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the chain
        call = baseDockerCall + ['--entrypoint', '/bin/bash', tool, '-c',
                                 'set -eo pipefail && {}'.format(' | '.join(chain_params))]
    else:
        call = baseDockerCall + [tool] + parameters
    logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if errfile:
        params['stderr'] = errfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
def wrapper(*args, **kwargs):
    # Call f through retry(): delays come from truncExpBackoff(), the timeout is
    # 300 and awsRetryPredicate is the predicate. Returns whatever f returns.
    attempts = retry(delays=truncExpBackoff(), timeout=300, predicate=awsRetryPredicate)
    for attempt in attempts:
        with attempt:
            result = f(*args, **kwargs)
            return result
def retry_kubernetes(retry_while=retryable_kubernetes_errors):
    """
    A wrapper that sends retryable Kubernetes predicates into a context-manager
    which will allow Kubernetes to keep retrying until a False or an executable
    method is seen.

    :param retry_while: passed to retry() as its predicate
    :return: the result of retry()
    """
    predicate = retry_while
    return retry(predicate=predicate)
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    """
    Launch worker nodes of the given type and tag them as workers.

    :param nodeType: key into E2Instances selecting the instance type to launch
    :param int numNodes: the number of nodes to request
    :param bool preemptable: whether to request spot instances instead of on-demand ones
    :param spotBid: the bid for spot instances; when falsy, the bid registered for
           nodeType in self._spotBidsMap is used
    :return: the number of instances that were actually launched
    :rtype: int
    :raise RuntimeError: if a preemptable node is requested and no spot bid is available
    """
    assert self._leaderPrivateIP
    if preemptable and not spotBid:
        if self._spotBidsMap and nodeType in self._spotBidsMap:
            spotBid = self._spotBidsMap[nodeType]
        else:
            raise RuntimeError("No spot bid given for a preemptable node request.")
    instanceType = E2Instances[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self._nodeStorage)

    keyPath = self._sseKey if self._sseKey else None
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey, keyPath, preemptable)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        userData = userData.encode('utf-8')
    sgs = [sg for sg in self._ctx.ec2.get_all_security_groups()
           if sg.name in self._leaderSecurityGroupNames]
    kwargs = {'key_name': self._keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': self._leaderProfileArn,
              'placement': self._zone,
              'subnet_id': self._subnetID}

    instancesLaunched = []

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            # after we start launching instances we want to ensure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.debug('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self._ctx.ec2,
                                                              image_id=self._discoverAMI(),
                                                              spec=kwargs,
                                                              num_instances=numNodes)
            else:
                logger.debug('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(spotBid, instanceType.name, self._ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self._ctx.ec2,
                                                               price=spotBid,
                                                               image_id=self._discoverAMI(),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self._ctx.ec2, instancesLaunched)

    self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
    AWSProvisioner._addTags(instancesLaunched, self._tags)
    if self._sseKey:
        # Copy the SSE key onto every new worker once it is reachable
        for i in instancesLaunched:
            self._waitForIP(i)
            node = Node(publicIP=i.ip_address, privateIP=i.private_ip_address, name=i.id,
                        launchTime=i.launch_time, nodeType=i.instance_type,
                        preemptable=preemptable, tags=i.tags)
            node.waitForNode('toil_worker')
            node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker')
    numLaunched = len(instancesLaunched)
    # Report what was launched, which may differ from the number requested
    logger.debug('Launched %s new instance(s)', numLaunched)
    return numLaunched
def destroyCluster(self):
    """
    Terminate instances and delete the profile and security group.

    The leader, if one is found, is destroyed before the remaining instances. The
    security group is only deleted when every remaining instance of the cluster
    was terminated, i.e. when awsFilterImpairedNodes() held none of them back.
    """
    assert self._ctx

    def expectedShutdownErrors(e):
        # Exceptions that carry no `status` attribute must propagate unchanged
        # instead of being masked by an AttributeError raised in this predicate.
        return getattr(e, 'status', None) == 400 and 'dependent object' in e.body

    def destroyInstances(instances):
        """
        Similar to _terminateInstances, except that it also cleans up any
        resources associated with the instances (e.g. IAM profiles).
        """
        self._deleteIAMProfiles(instances)
        self._terminateInstances(instances)

    # We should terminate the leader first in case a workflow is still running in the cluster.
    # The leader may create more instances while we're terminating the workers.
    vpcId = None
    try:
        leader = self.getLeader(returnRawInstance=True)
        vpcId = leader.vpc_id
        logger.info('Terminating the leader first ...')
        destroyInstances([leader])
        logger.info('Now terminating any remaining workers ...')
    except (NoSuchClusterException, InvalidClusterStateException):
        # It's ok if the leader is not found. We'll terminate any remaining
        # instances below anyway.
        pass

    instances = self._getNodesInCluster(nodeType=None, both=True)
    spotIDs = self._getSpotRequestIDs()
    if spotIDs:
        self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
    if instancesToTerminate:
        # Fall back to a worker's VPC when no leader was found
        vpcId = vpcId or instancesToTerminate[0].vpc_id
        destroyInstances(instancesToTerminate)
    if len(instances) == len(instancesToTerminate):
        logger.debug('Deleting security group...')
        removed = False
        for attempt in retry(timeout=300, predicate=expectedShutdownErrors):
            with attempt:
                for sg in self._ctx.ec2.get_all_security_groups():
                    if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                        try:
                            self._ctx.ec2.delete_security_group(group_id=sg.id)
                            removed = True
                        except BotoServerError as e:
                            # A group that is already gone is not an error
                            if e.error_code != 'InvalidGroup.NotFound':
                                raise
        if removed:
            logger.debug('... Successfully deleted security group')
    else:
        assert len(instances) > len(instancesToTerminate)
        # the security group can't be deleted until all nodes are terminated
        logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                       'have failed health checks. As a result, the security group & IAM '
                       'roles will not be deleted.')
def setNodeCount(self, nodeType, numNodes, preemptable=False, force=False):
    """
    Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
    the cluster to the given value, or as close a value as possible, and, after performing
    the necessary additions or removals of worker nodes, return the resulting number of
    preemptable or non-preemptable nodes currently in the cluster.

    :param str nodeType: The node type to add or remove.

    :param int numNodes: Desired size of the cluster

    :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
           may be removed spontaneously by the underlying platform at any time.

    :param bool force: If False, the provisioner is allowed to deviate from the given number
           of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
           running if they have active jobs running on them.

    :rtype: int :return: the number of worker nodes in the cluster after making the necessary
            adjustments. This value should be, but is not guaranteed to be, close or equal to
            the `numNodes` argument. It represents the closest possible approximation of the
            actual cluster size at the time this method returns.
    """
    kindLabel = 'preemptable' if preemptable else 'non-preemptable'
    for attempt in retry(predicate=self.provisioner.retryPredicate):
        with attempt:
            allWorkers = self.getNodes(preemptable=preemptable)
            logger.debug("Cluster contains %i instances", len(allWorkers))
            # Reduce to nodes of the correct type
            typedWorkers = {node: allWorkers[node]
                            for node in allWorkers if node.nodeType == nodeType}
            drainingNodes = [node for node in typedWorkers
                             if node.privateIP in self.ignoredNodes]
            ignoredCount = len(drainingNodes)
            currentCount = len(typedWorkers)
            logger.debug("Cluster contains %i instances of type %s (%i ignored and draining jobs until "
                         "they can be safely terminated)", currentCount, nodeType, ignoredCount)

            # Ignored nodes only count towards the cluster size when forcing
            countedNodes = currentCount if force else currentCount - ignoredCount
            delta = numNodes - countedNodes

            if delta > 0 and ignoredCount > 0:
                # We can un-ignore a few nodes to compensate for the additional nodes we want.
                reviveCount = min(delta, ignoredCount)
                logger.debug('Unignoring %i nodes because we want to scale back up again.',
                             reviveCount)
                delta -= reviveCount
                for node in drainingNodes[:reviveCount]:
                    self.ignoredNodes.remove(node.privateIP)
                    self.leader.batchSystem.unignoreNode(node.privateIP)

            if delta > 0:
                logger.info('Adding %i %s nodes to get to desired cluster size of %i.',
                            delta, kindLabel, numNodes)
                added = self._addNodes(nodeType, numNodes=delta, preemptable=preemptable)
                numNodes = currentCount + added
            elif delta < 0:
                logger.info('Removing %i %s nodes to get to desired cluster size of %i.',
                            -delta, kindLabel, numNodes)
                removed = self._removeNodes(typedWorkers, nodeType=nodeType, numNodes=-delta,
                                            preemptable=preemptable, force=force)
                numNodes = currentCount - removed
            elif not force:
                logger.debug('Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.',
                             numNodes)
            else:
                logger.debug('Cluster already at desired size of %i. Nothing to do.', numNodes)
    return numNodes
def retry_azure(delays=(0, 1, 1, 4, 16, 64), timeout=300, predicate=defaultRetryPredicate):
    """
    Call retry() with the given delays, timeout and predicate and return its result.

    :param delays: passed to retry() as `delays`
    :param timeout: passed to retry() as `timeout`
    :param predicate: passed to retry() as `predicate`
    """
    options = dict(delays=delays, timeout=timeout, predicate=predicate)
    return retry(**options)
def retry_s3(delays=default_delays, timeout=default_timeout, predicate=retryable_s3_errors):
    """
    Call retry() with the given delays, timeout and predicate and return its result.

    :param delays: passed to retry() as `delays`
    :param timeout: passed to retry() as `timeout`
    :param predicate: passed to retry() as `predicate`
    """
    options = dict(delays=delays, timeout=timeout, predicate=predicate)
    return retry(**options)