def launchCluster(cls, instanceType, keyName, clusterName, spotBid=None, zone=None):
    """
    Start a Mesos leader instance for a new cluster and block until it is
    reachable, then return it.

    :param instanceType: EC2 instance type name for the leader
    :param keyName: name of the AWS key pair to install on the instance
    :param clusterName: cluster identifier; also used as the security group name
    :param spotBid: if given, launch the leader as a spot instance at this price
    :param zone: availability zone to build the boto context in
    """
    ctx = cls._buildContext(clusterName=clusterName, zone=zone)
    profileARN = cls._getProfileARN(ctx)
    # The security group name doubles as the cluster identifier.
    cls._createSecurityGroup(ctx, clusterName)
    bdm = cls._getBlockDeviceMapping(ec2_instance_types[instanceType])
    leaderData = dict(role='leader',
                      image=applianceSelf(),
                      entrypoint='mesos-master',
                      args=leaderArgs.format(name=clusterName))
    userData = awsUserData.format(**leaderData)
    spec = {'key_name': keyName,
            'security_groups': [clusterName],
            'instance_type': instanceType,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': profileARN}
    if spotBid:
        logger.info('Launching preemptable leader')
        # force generator to evaluate
        list(create_spot_instances(ec2=ctx.ec2,
                                   price=spotBid,
                                   image_id=cls._discoverAMI(ctx),
                                   tags={'clusterName': clusterName},
                                   spec=spec,
                                   num_instances=1))
    else:
        logger.info('Launching non-preemptable leader')
        create_ondemand_instances(ctx.ec2,
                                  image_id=cls._discoverAMI(ctx),
                                  spec=spec,
                                  num_instances=1)
    return cls._getLeader(clusterName=clusterName, wait=True)
def addNodes(self, numNodes, preemptable):
    """
    Launch worker instances and attach them to this cluster.

    :param int numNodes: number of workers to launch
    :param bool preemptable: whether to launch spot instances
    :return: the number of instances launched
    """
    instanceType = self._getInstanceType(preemptable)
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
    arn = self._getProfileARN(self.ctx)
    # When an SSE key is configured, workers run waitForKey.sh so they only
    # start mesos once the key has been propagated to them.
    sseConfigured = bool(self.config and self.config.sseKey)
    keyPath = self.config.sseKey if sseConfigured else ''
    entryPoint = 'waitForKey.sh' if sseConfigured else 'mesos-slave'
    workerData = dict(role='worker',
                      image=applianceSelf(),
                      entrypoint=entryPoint,
                      sshKey=self.masterPublicKey,
                      args=workerArgs.format(ip=self.leaderIP,
                                             preemptable=preemptable,
                                             keyPath=keyPath))
    userData = awsUserData.format(**workerData)
    clusterSGs = [sg for sg in self.ctx.ec2.get_all_security_groups()
                  if sg.name == self.clusterName]
    spec = {'key_name': self.keyName,
            'security_group_ids': [sg.id for sg in clusterSGs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': arn,
            'placement': getCurrentAWSZone()}
    if self.subnetID:
        spec['subnet_id'] = self.subnetID
    else:
        spec['subnet_id'] = self._getClusterInstance(self.instanceMetaData).subnet_id
    instancesLaunched = []
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            # After we start launching instances we want to insure the full
            # setup is done. The biggest obstacle is AWS request throttling, so
            # we retry on those errors at every request in this method.
            if preemptable:
                logger.info('Launching %s preemptable nodes', numNodes)
                spec['placement'] = getSpotZone(self.spotBid, instanceType.name, self.ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(
                    ec2=self.ctx.ec2,
                    price=self.spotBid,
                    image_id=self._discoverAMI(self.ctx),
                    tags={'clusterName': self.clusterName},
                    spec=spec,
                    num_instances=numNodes,
                    tentative=True))
                # flatten the per-request lists of spot instances
                instancesLaunched = [instance for batch in instancesLaunched
                                     for instance in batch]
            else:
                logger.info('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(
                    self.ctx.ec2,
                    image_id=self._discoverAMI(self.ctx),
                    spec=spec,
                    num_instances=numNodes)
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            wait_instances_running(self.ctx.ec2, instancesLaunched)
    # Request throttling retry happens internally to these two methods to
    # insure proper granularity.
    AWSProvisioner._addTags(instancesLaunched, self.tags)
    self._propagateKey(instancesLaunched)
    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def __cleanMounts(self):
    """
    Deletes all files in every mounted directory. Without this step, we risk
    leaking files owned by root on the host. To avoid races, this method should
    be called after the appliance container was stopped, otherwise the running
    container might still be writing files.
    """
    # Delete all files within each mounted directory, but not the directory
    # itself. dotglob makes the shell glob match hidden files too.
    targets = (mountPoint + '/*'
               for hostDir, mountPoint in self.mounts.iteritems()
               if os.path.isdir(hostDir))
    cmd = 'shopt -s dotglob && rm -rf ' + ' '.join(targets)
    self.outer._run('docker', 'run',
                    '--rm',
                    '--entrypoint=/bin/bash',
                    applianceSelf(),
                    '-c', cmd)
def __enter__(self):
    """Start the appliance container and block until it is reported running."""
    with self.lock:
        image = applianceSelf()
        volumes = ['--volume=%s:%s' % mount for mount in self.mounts.iteritems()]
        # Omitting --rm, it's unreliable, see https://github.com/docker/docker/issues/16575
        args = list(concat('docker', 'run',
                           '--entrypoint=' + self._entryPoint(),
                           '--net=host',
                           '-i',
                           '--name=' + self.containerName,
                           volumes,
                           image,
                           self._containerCommand()))
        log.info('Running %r', args)
        self.popen = Popen(args)
    self.start()
    self.__wait_running()
    return self
def needs_appliance(test_item):
    """
    Test-item decorator: mark the test as an appliance test and skip it unless
    Docker is installed and the Toil appliance image is available locally.

    :param test_item: the test function or class to decorate
    :return: the (possibly skip-wrapped) test item
    """
    import json
    test_item = _mark_test('appliance', test_item)
    if next(which('docker'), None):
        image = applianceSelf()
        try:
            inspectOutput = check_output(['docker', 'inspect', image])
        except CalledProcessError:
            # `docker inspect` exits non-zero when the image is absent.
            images = set()
        else:
            # BUG FIX: 'RepoTags' is null for dangling images, which previously
            # made `image in i['RepoTags']` raise TypeError. Guard before the
            # membership test.
            images = {i['Id'] for i in json.loads(inspectOutput)
                      if i.get('RepoTags') and image in i['RepoTags']}
        if len(images) == 0:
            return unittest.skip("Cannot find appliance image %s. Be sure to run 'make docker' "
                                 "prior to running this test." % image)(test_item)
        elif len(images) == 1:
            return test_item
        else:
            assert False, 'Expected `docker inspect` to return zero or one image.'
    else:
        return unittest.skip('Install Docker to include this test.')(test_item)
def _addNodes(self, instances, numNodes, preemptable=False):
    """
    Launch numNodes worker instances for this cluster and wait for them to run.

    :param instances: existing instances (unused here; kept for interface compatibility)
    :param int numNodes: number of workers to launch
    :param bool preemptable: whether to launch spot instances
    :return: the number of launch results (requests for spot, instances otherwise)
    """
    bdm = self._getBlockDeviceMapping(self.instanceType)
    arn = self._getProfileARN(self.ctx)
    workerData = dict(role='worker',
                      image=applianceSelf(),
                      entrypoint='mesos-slave',
                      args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable))
    userData = awsUserData.format(**workerData)
    kwargs = {'key_name': self.keyName,
              'security_groups': [self.clusterName],
              'instance_type': self.instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': arn}
    instancesLaunched = []
    if not preemptable:
        logger.info('Launching %s non-preemptable nodes', numNodes)
        # BUG FIX: this branch previously passed num_instances=1, silently
        # launching a single instance no matter how many were requested.
        instancesLaunched = create_ondemand_instances(self.ctx.ec2,
                                                      image_id=self._discoverAMI(self.ctx),
                                                      spec=kwargs,
                                                      num_instances=numNodes)
    else:
        logger.info('Launching %s preemptable nodes', numNodes)
        kwargs['placement'] = getSpotZone(self.spotBid, self.instanceType.name, self.ctx)
        # force generator to evaluate
        instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                       price=self.spotBid,
                                                       image_id=self._discoverAMI(self.ctx),
                                                       tags={'clusterName': self.clusterName},
                                                       spec=kwargs,
                                                       num_instances=numNodes,
                                                       tentative=True))
    wait_instances_running(self.ctx.ec2, instancesLaunched)
    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def main():
    """
    Entry point for the cluster-launch utility: parse the command line,
    validate the requested node types and zone, launch the leader via the
    chosen provisioner, then add any requested workers.
    """
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName',
                        help="On AWS, the name of the AWS key pair to include on the instance."
                             " On Google/GCE, this is the ssh key pair."
                             " Not needed for Azure.")
    parser.add_argument("--owner", dest='owner',
                        help="The owner tag for all instances. If not given, the value in"
                             " --keyPairName will be used if given.")
    parser.add_argument("--publicKeyFile", dest='publicKeyFile', default="~/.ssh/id_rsa.pub",
                        help="On Azure, the file"
                             " containing the key pairs (the first key pair will be used).")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred "
                             "to all nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[],
                        action='append',
                        help="Tags are added to the AWS cluster for this node and all of its "
                             "children. Tags are of the form:\n"
                             " -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By "
                             "default the cluster is tagged with "
                             " {\n"
                             " \"Name\": clusterName,\n"
                             " \"Owner\": IAM username\n"
                             " }. ")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
                             "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the "
                             "leader. The syntax for each node type depends on the provisioner "
                             "used. For the aws provisioner this is the name of an EC2 instance "
                             "type followed by a colon and the price in dollar to bid for a spot "
                             "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
                             "--workers argument to specify how many workers of each node type "
                             "to create.")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type to "
                             "launch alongside the leader when the cluster is created. This can be "
                             "useful if running toil without auto-scaling but with need of more "
                             "hardware support")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader "
                             "instance. This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker "
                             "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance',
                        action='store_true', default=False,
                        help="Disables sanity checking the existence of the docker image specified "
                             "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
                             "autoscaling.")
    parser.add_argument("--azureStorageCredentials", dest='azureStorageCredentials', type=str,
                        default=credential_file_path,
                        help="The location of the file containing the Azure storage credentials. If not specified,"
                             " the default file is used with Azure provisioning. Use 'None' to disable"
                             " the transfer of credentials.")
    parser.add_argument('--awsEc2ProfileArn', dest='awsEc2ProfileArn', default=None, type=str,
                        help="If provided, the specified ARN is used as the instance profile for EC2 instances."
                             "Useful for setting custom IAM profiles. If not specified, a new IAM role is created "
                             "by default with sufficient access to perform basic cluster operations.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)
    checkValidNodeTypes(config.provisioner, config.nodeTypes)
    checkValidNodeTypes(config.provisioner, config.leaderNodeType)
    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    applianceSelf(forceDockerAppliance=config.forceDockerAppliance)
    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        # BUG FIX: message previously ended with a comma instead of a period.
        raise RuntimeError("The --nodeTypes and --workers options must be specified together.")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node: the part after ':' is the spot bid.
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))
    # set owner (default to keyPairName if not given)
    owner = 'toil'
    if config.owner:
        owner = config.owner
    elif config.keyPairName:
        owner = config.keyPairName
    # Check to see if the user specified a zone. If not, see if one is stored
    # in an environment variable.
    config.zone = config.zone or getZoneFromEnv(config.provisioner)
    if not config.zone:
        # BUG FIX: corrected the 'enviroment' typo in this error message.
        raise RuntimeError('Please provide a value for --zone or set a default in the TOIL_' +
                           config.provisioner.upper() + '_ZONE environment variable.')
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone,
                             nodeStorage=config.nodeStorage)
    cluster.launchCluster(leaderNodeType=config.leaderNodeType,
                          leaderStorage=config.leaderStorage,
                          owner=owner,
                          keyName=config.keyPairName,
                          botoPath=config.botoPath,
                          userTags=tagsDict,
                          vpcSubnet=config.vpcSubnet,
                          publicKeyFile=config.publicKeyFile,
                          azureStorageCredentials=config.azureStorageCredentials,
                          awsEc2ProfileArn=config.awsEc2ProfileArn)
    for nodeType, workers in zip(nodeTypes, numNodes):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=False)
    for nodeType, workers, spotBid in zip(preemptableNodeTypes, numPreemptableNodes, spotBids):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=True, spotBid=spotBid)
def addNodes(self, nodeType, numNodes, preemptable):
    """
    Launch numNodes workers of the given node type and attach them to this
    cluster.

    :param str nodeType: EC2 instance type name for the workers
    :param int numNodes: number of workers to launch
    :param bool preemptable: whether to launch spot instances
    :return: the number of instances launched
    """
    instanceType = ec2_instance_types[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
    arn = self._getProfileARN(self.ctx)
    # When an SSE key is configured, workers run waitForKey.sh so they only
    # start mesos once the key has been propagated to them.
    useEncryption = bool(self.config and self.config.sseKey)
    keyPath = self.config.sseKey if useEncryption else ''
    entryPoint = 'waitForKey.sh' if useEncryption else 'mesos-slave'
    workerData = dict(role='worker',
                      image=applianceSelf(),
                      entrypoint=entryPoint,
                      sshKey=self.masterPublicKey,
                      args=workerArgs.format(ip=self.leaderIP,
                                             preemptable=preemptable,
                                             keyPath=keyPath))
    userData = awsUserData.format(**workerData)
    securityGroups = [sg for sg in self.ctx.ec2.get_all_security_groups()
                      if sg.name == self.clusterName]
    spec = {'key_name': self.keyName,
            'security_group_ids': [sg.id for sg in securityGroups],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': arn,
            'placement': getCurrentAWSZone()}
    spec["subnet_id"] = (self.subnetID if self.subnetID
                         else self._getClusterInstance(self.instanceMetaData).subnet_id)
    instancesLaunched = []
    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            # After we start launching instances we want to insure the full
            # setup is done. The biggest obstacle is AWS request throttling, so
            # we retry on those errors at every request in this method.
            if preemptable:
                logger.info('Launching %s preemptable nodes', numNodes)
                spec['placement'] = getSpotZone(self.spotBids[nodeType],
                                                instanceType.name, self.ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(
                    ec2=self.ctx.ec2,
                    price=self.spotBids[nodeType],
                    image_id=self._discoverAMI(self.ctx),
                    tags={'clusterName': self.clusterName},
                    spec=spec,
                    num_instances=numNodes,
                    tentative=True))
                # flatten the per-request lists of spot instances
                instancesLaunched = [instance for batch in instancesLaunched
                                     for instance in batch]
            else:
                logger.info('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(
                    self.ctx.ec2,
                    image_id=self._discoverAMI(self.ctx),
                    spec=spec,
                    num_instances=numNodes)
    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self.ctx.ec2, instancesLaunched)
    # Request throttling retry happens internally to these two methods to
    # insure proper granularity.
    AWSProvisioner._addTags(instancesLaunched, self.tags)
    self._propagateKey(instancesLaunched)
    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def launchCluster(self, leaderNodeType, leaderSpotBid, nodeTypes, preemptableNodeTypes,
                  keyName, clusterName, numWorkers=0, numPreemptableWorkers=0,
                  spotBids=None, userTags=None, zone=None, vpcSubnet=None,
                  leaderStorage=50, nodeStorage=50):
    """
    Launch the leader instance for a new cluster, record the cluster state on
    this provisioner, then launch the requested non-preemptable and
    preemptable workers. Returns the leader instance.
    """
    # Only honour nodeStorage when called from the cluster utility, i.e. when
    # there is no Toil config yet.
    if self.config is None:
        self.nodeStorage = nodeStorage
    if userTags is None:
        userTags = {}
    ctx = self._buildContext(clusterName=clusterName, zone=zone)
    profileARN = self._getProfileARN(ctx)
    leaderInstanceType = ec2_instance_types[leaderNodeType]
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
    bdm = self._getBlockDeviceMapping(leaderInstanceType, rootVolSize=leaderStorage)
    self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
    leaderData = dict(role='leader',
                      image=applianceSelf(),
                      entrypoint='mesos-master',
                      sshKey=self.masterPublicKey,
                      args=leaderArgs.format(name=clusterName))
    userData = awsUserData.format(**leaderData)
    spec = {'key_name': keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': leaderNodeType,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': profileARN,
            'placement': zone}
    if vpcSubnet:
        spec["subnet_id"] = vpcSubnet
    if leaderSpotBid:
        logger.info('Launching preemptable leader')
        # force generator to evaluate
        instances = list(create_spot_instances(ec2=ctx.ec2,
                                               price=leaderSpotBid,
                                               image_id=self._discoverAMI(ctx),
                                               tags={'clusterName': clusterName},
                                               spec=spec,
                                               num_instances=1))[0]
        leader = instances[0]
    else:
        logger.info('Launching non-preemptable leader')
        instances = create_ondemand_instances(ctx.ec2,
                                              image_id=self._discoverAMI(ctx),
                                              spec=spec,
                                              num_instances=1)
        leader = instances[0]
    wait_instances_running(ctx.ec2, [leader])
    self._waitForNode(leader, 'toil_leader')
    defaultTags = {'Name': clusterName, 'Owner': keyName}
    defaultTags.update(userTags)
    # If we are running launch-cluster we need to save this data, as it won't
    # be generated from the metadata. This data is needed to launch worker
    # nodes.
    self.leaderIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self.ctx = ctx
    if spotBids:
        self.spotBids = dict(zip(preemptableNodeTypes, spotBids))
    self.clusterName = clusterName
    self.keyName = keyName
    self.tags = leader.tags
    self.subnetID = leader.subnet_id
    # Assuming that if the leader was launched without a spot bid then all
    # workers will be non-preemptable.
    workersCreated = 0
    for nodeType, workers in zip(nodeTypes, numWorkers):
        workersCreated += self.addNodes(nodeType=nodeType, numNodes=workers,
                                        preemptable=False)
    for nodeType, workers in zip(preemptableNodeTypes, numPreemptableWorkers):
        workersCreated += self.addNodes(nodeType=nodeType, numNodes=workers,
                                        preemptable=True)
    logger.info('Added %d workers', workersCreated)
    return leader
def main():
    """
    Entry point for the cluster-launch utility: parse the command line, select
    and import the requested provisioner, then launch a leader plus any
    requested workers.
    """
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName', required=True,
                        help="On AWS, the name of the AWS key pair to include on the instance."
                             " On Google/GCE, this is the ssh key pair."
                             " On Azure, this will be used as the owner tag.")
    parser.add_argument("--publicKeyFile", dest='publicKeyFile', default="~/.ssh/id_rsa.pub",
                        help="On Azure, the file"
                             " containing the key pairs (the first key pair will be used).")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred "
                             "to all nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[],
                        action='append',
                        help="Tags are added to the AWS cluster for this node and all of its "
                             "children. Tags are of the form:\n"
                             " -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By "
                             "default the cluster is tagged with "
                             " {\n"
                             " \"Name\": clusterName,\n"
                             " \"Owner\": IAM username\n"
                             " }. ")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
                             "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the "
                             "leader. The syntax for each node type depends on the provisioner "
                             "used. For the aws provisioner this is the name of an EC2 instance "
                             "type followed by a colon and the price in dollar to bid for a spot "
                             "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
                             "--workers argument to specify how many workers of each node type "
                             "to create.")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type to "
                             "launch alongside the leader when the cluster is created. This can be "
                             "useful if running toil without auto-scaling but with need of more "
                             "hardware support")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader "
                             "instance. This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker "
                             "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance',
                        action='store_true', default=False,
                        help="Disables sanity checking the existence of the docker image specified "
                             "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
                             "autoscaling.")
    parser.add_argument("--azureStorageCredentials", dest='azureStorageCredentials', type=str,
                        default=credential_file_path,
                        help="The location of the file containing the Azure storage credentials. If not specified,"
                             " the default file is used with Azure provisioning. Use 'None' to disable"
                             " the transfer of credentials.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)
    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    # (return value is not needed; the call raises/logs on problems)
    applianceSelf(forceDockerAppliance=config.forceDockerAppliance)
    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    leaderSpotBid = None
    if config.provisioner == 'aws':
        logger.info('Using aws provisioner.')
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            logger.error('The aws extra must be installed to use this provisioner')
            raise
        provisioner = AWSProvisioner()
    elif config.provisioner == 'azure':
        try:
            from toil.provisioners.azure.azureProvisioner import AzureProvisioner
        except ImportError:
            # BUG FIX: this message previously said 'aws extra' (copy-paste
            # error from the branch above).
            raise RuntimeError('The azure extra must be installed to use this provisioner')
        provisioner = AzureProvisioner()
    elif config.provisioner == 'gce':
        logger.info('Using a gce provisioner.')
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error('The google extra must be installed to use this provisioner')
            raise
        provisioner = GCEProvisioner()
    else:
        assert False
    # Parse leader node type and spot bid
    parsedBid = config.leaderNodeType.split(':', 1)
    if len(config.leaderNodeType) != len(parsedBid[0]):
        leaderSpotBid = float(parsedBid[1])
        config.leaderNodeType = parsedBid[0]
    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        # BUG FIX: message previously ended with a comma instead of a period.
        raise RuntimeError("The --nodeTypes and --workers options must be specified together.")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node: the part after ':' is the spot bid.
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))
    provisioner.launchCluster(leaderNodeType=config.leaderNodeType,
                              leaderSpotBid=leaderSpotBid,
                              nodeTypes=nodeTypes,
                              preemptableNodeTypes=preemptableNodeTypes,
                              numWorkers=numNodes,
                              numPreemptableWorkers=numPreemptableNodes,
                              keyName=config.keyPairName,
                              botoPath=config.botoPath,
                              clusterName=config.clusterName,
                              spotBids=spotBids,
                              userTags=tagsDict,
                              zone=config.zone,
                              leaderStorage=config.leaderStorage,
                              nodeStorage=config.nodeStorage,
                              vpcSubnet=config.vpcSubnet,
                              publicKeyFile=config.publicKeyFile,
                              azureStorageCredentials=config.azureStorageCredentials)
def launchCluster(self, instanceType, keyName, clusterName, workers=0, spotBid=None,
                  userTags=None, zone=None, vpcSubnet=None, leaderStorage=50,
                  nodeStorage=50):
    """
    Launch the leader for a new cluster, record its details on this
    provisioner, and optionally add an initial batch of workers. Returns the
    leader instance.
    """
    # only use this node storage value if launchCluster is called from cluster utility
    if self.config is None:
        self.nodeStorage = nodeStorage
    if userTags is None:
        userTags = {}
    ctx = self._buildContext(clusterName=clusterName, zone=zone)
    profileARN = self._getProfileARN(ctx)
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
    bdm = self._getBlockDeviceMapping(ec2_instance_types[instanceType],
                                      rootVolSize=leaderStorage)
    self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
    leaderData = dict(role='leader',
                      image=applianceSelf(),
                      entrypoint='mesos-master',
                      sshKey=self.masterPublicKey,
                      args=leaderArgs.format(name=clusterName))
    userData = awsUserData.format(**leaderData)
    spec = {'key_name': keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': profileARN,
            'placement': zone}
    if vpcSubnet:
        spec["subnet_id"] = vpcSubnet
    if spotBid:
        logger.info('Launching preemptable leader')
        # force generator to evaluate
        list(create_spot_instances(ec2=ctx.ec2,
                                   price=spotBid,
                                   image_id=self._discoverAMI(ctx),
                                   tags={'clusterName': clusterName},
                                   spec=spec,
                                   num_instances=1))
    else:
        logger.info('Launching non-preemptable leader')
        create_ondemand_instances(ctx.ec2,
                                  image_id=self._discoverAMI(ctx),
                                  spec=spec,
                                  num_instances=1)
    leader = self._getLeader(clusterName=clusterName, wait=True, zone=zone)
    defaultTags = {'Name': clusterName, 'Owner': keyName}
    defaultTags.update(userTags)
    # If we are running launch-cluster we need to save this data as it won't
    # be generated from the metadata. This data is needed to launch worker
    # nodes.
    self.leaderIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self.ctx = ctx
    self.spotBid = spotBid
    preemptable = bool(spotBid)
    self.instanceType[preemptable] = ec2_instance_types[instanceType]
    self.clusterName = clusterName
    self.keyName = keyName
    self.tags = leader.tags
    self.subnetID = leader.subnet_id
    if workers:
        # assuming that if the leader was launched without a spotbid then all
        # workers will be non-preemptable
        workersCreated = self.addNodes(workers, preemptable=preemptable)
        logger.info('Added %d workers with %d workers requested', workersCreated, workers)
    return leader
def launchCluster(self, instanceType, keyName, clusterName, workers=0, spotBid=None,
                  userTags=None, zone=None, vpcSubnet=None, leaderStorage=50,
                  nodeStorage=50):
    """
    Start the leader of a new cluster, persist the cluster metadata on this
    provisioner instance, and optionally launch an initial set of workers.

    :return: the leader instance
    """
    # only use this node storage value if launchCluster is called from cluster utility
    if self.config is None:
        self.nodeStorage = nodeStorage
    userTags = {} if userTags is None else userTags
    ctx = self._buildContext(clusterName=clusterName, zone=zone)
    profileARN = self._getProfileARN(ctx)
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
    leaderType = ec2_instance_types[instanceType]
    bdm = self._getBlockDeviceMapping(leaderType, rootVolSize=leaderStorage)
    self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
    leaderData = dict(role='leader',
                      image=applianceSelf(),
                      entrypoint='mesos-master',
                      sshKey=self.masterPublicKey,
                      args=leaderArgs.format(name=clusterName))
    launchSpec = {'key_name': keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': instanceType,
                  'user_data': awsUserData.format(**leaderData),
                  'block_device_map': bdm,
                  'instance_profile_arn': profileARN,
                  'placement': zone}
    if vpcSubnet:
        launchSpec["subnet_id"] = vpcSubnet
    if not spotBid:
        logger.info('Launching non-preemptable leader')
        create_ondemand_instances(ctx.ec2,
                                  image_id=self._discoverAMI(ctx),
                                  spec=launchSpec,
                                  num_instances=1)
    else:
        logger.info('Launching preemptable leader')
        # force generator to evaluate
        list(create_spot_instances(ec2=ctx.ec2,
                                   price=spotBid,
                                   image_id=self._discoverAMI(ctx),
                                   tags={'clusterName': clusterName},
                                   spec=launchSpec,
                                   num_instances=1))
    leader = self._getLeader(clusterName=clusterName, wait=True, zone=zone)
    defaultTags = {'Name': clusterName, 'Owner': keyName}
    defaultTags.update(userTags)
    # If we are running launch-cluster we need to save this data as it won't
    # be generated from the metadata. This data is needed to launch worker
    # nodes.
    self.leaderIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self.ctx = ctx
    self.spotBid = spotBid
    preemptable = bool(spotBid)
    self.instanceType[preemptable] = leaderType
    self.clusterName = clusterName
    self.keyName = keyName
    self.tags = leader.tags
    self.subnetID = leader.subnet_id
    if workers:
        # assuming that if the leader was launched without a spotbid then all
        # workers will be non-preemptable
        workersCreated = self.addNodes(workers, preemptable=preemptable)
        logger.info('Added %d workers with %d workers requested', workersCreated, workers)
    return leader
def main():
    """
    CLI entry point for launching a cluster.

    Parses provisioner/leader/worker options, validates node types and the
    zone setting, launches the leader via the provisioner's launchCluster,
    then adds each requested batch of workers (spot-bid node types are of
    the form 'type:price').

    :raises RuntimeError: if no zone is available, if --nodeTypes/--workers
        are not given together, or if their lists differ in length.
    """
    parser = parser_with_common_options(provisioner_options=True, jobstore_option=False)
    parser.add_argument(
        "--leaderNodeType", dest="leaderNodeType", required=True,
        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument(
        "--keyPairName", dest='keyPairName',
        help="On AWS, the name of the AWS key pair to include on the instance."
             " On Google/GCE, this is the ssh key pair.")
    parser.add_argument(
        "--owner", dest='owner',
        help="The owner tag for all instances. If not given, the value in"
             " --keyPairName will be used if given.")
    parser.add_argument(
        "--boto", dest='botoPath',
        help="The path to the boto credentials directory. This is transferred "
             "to all nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument(
        "-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append',
        help="Tags are added to the AWS cluster for this node and all of its "
             "children. Tags are of the form:\n"
             " -t key1=value1 --tag key2=value2\n"
             "Multiple tags are allowed and each tag needs its own flag. By "
             "default the cluster is tagged with "
             " {\n"
             " \"Name\": clusterName,\n"
             " \"Owner\": IAM username\n"
             " }. ")
    parser.add_argument(
        "--vpcSubnet",
        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
             "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument(
        "--nodeTypes", dest='nodeTypes', default=None, type=str,
        help="Comma-separated list of node types to create while launching the "
             "leader. The syntax for each node type depends on the provisioner "
             "used. For the aws provisioner this is the name of an EC2 instance "
             "type followed by a colon and the price in dollar to bid for a spot "
             "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
             "--workers argument to specify how many workers of each node type "
             "to create.")
    parser.add_argument(
        "-w", "--workers", dest='workers', default=None, type=str,
        help="Comma-separated list of the number of workers of each node type to "
             "launch alongside the leader when the cluster is created. This can be "
             "useful if running toil without auto-scaling but with need of more "
             "hardware support")
    parser.add_argument(
        "--leaderStorage", dest='leaderStorage', type=int, default=50,
        help="Specify the size (in gigabytes) of the root volume for the leader "
             "instance. This is an EBS volume.")
    parser.add_argument(
        "--nodeStorage", dest='nodeStorage', type=int, default=50,
        help="Specify the size (in gigabytes) of the root volume for any worker "
             "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument(
        '--forceDockerAppliance', dest='forceDockerAppliance', action='store_true',
        default=False,
        help="Disables sanity checking the existence of the docker image specified "
             "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
             "autoscaling.")
    parser.add_argument(
        '--awsEc2ProfileArn', dest='awsEc2ProfileArn', default=None, type=str,
        help="If provided, the specified ARN is used as the instance profile for EC2 instances."
             "Useful for setting custom IAM profiles. If not specified, a new IAM role is created "
             "by default with sufficient access to perform basic cluster operations.")
    parser.add_argument(
        '--awsEc2ExtraSecurityGroupId', dest='awsEc2ExtraSecurityGroupIds', default=[],
        action='append',
        help="Any additional security groups to attach to EC2 instances. Note that a security group "
             "with its name equal to the cluster name will always be created, thus ensure that "
             "the extra security groups do not have the same name as the cluster name.")
    options = parser.parse_args()
    set_logging_from_options(options)
    tags = create_tags_dict(options.tags) if options.tags else {}
    worker_node_types = options.nodeTypes.split(',') if options.nodeTypes else []
    worker_quantities = options.workers.split(',') if options.workers else []
    check_valid_node_types(options.provisioner,
                           worker_node_types + [options.leaderNodeType])

    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    applianceSelf(forceDockerAppliance=options.forceDockerAppliance)
    owner = options.owner or options.keyPairName or 'toil'

    # Check to see if the user specified a zone. If not, see if one is stored in an environment variable.
    options.zone = options.zone or os.environ.get(
        f'TOIL_{options.provisioner.upper()}_ZONE')
    if not options.zone:
        raise RuntimeError(
            f'Please provide a value for --zone or set a default in the '
            f'TOIL_{options.provisioner.upper()}_ZONE environment variable.')
    if (options.nodeTypes or options.workers) and not (options.nodeTypes and options.workers):
        raise RuntimeError(
            "The --nodeTypes and --workers options must be specified together.")
    # `!=` replaces the non-idiomatic `not len(a) == len(b)`
    if len(worker_node_types) != len(worker_quantities):
        raise RuntimeError(
            "List of node types must be the same length as the list of workers.")

    cluster = cluster_factory(provisioner=options.provisioner,
                              clusterName=options.clusterName,
                              zone=options.zone,
                              nodeStorage=options.nodeStorage)
    cluster.launchCluster(
        leaderNodeType=options.leaderNodeType,
        leaderStorage=options.leaderStorage,
        owner=owner,
        keyName=options.keyPairName,
        botoPath=options.botoPath,
        userTags=tags,
        vpcSubnet=options.vpcSubnet,
        awsEc2ProfileArn=options.awsEc2ProfileArn,
        awsEc2ExtraSecurityGroupIds=options.awsEc2ExtraSecurityGroupIds)

    for worker_node_type, num_workers in zip(worker_node_types, worker_quantities):
        if ':' in worker_node_type:
            # 'type:price' requests preemptable (spot) workers at the given bid
            worker_node_type, bid = worker_node_type.split(':', 1)
            cluster.addNodes(nodeType=worker_node_type,
                             numNodes=int(num_workers),
                             preemptable=True,
                             spotBid=float(bid))
        else:
            cluster.addNodes(nodeType=worker_node_type,
                             numNodes=int(num_workers),
                             preemptable=False)
def launchCluster(self,
                  leaderNodeType,
                  keyName,
                  clusterName,
                  zone,
                  leaderStorage=50,
                  nodeStorage=50,
                  spotBid=None,
                  **kwargs):
    """
    Launches an Azure cluster using Ansible.

    A resource group is created for the cluster. All the virtual machines are created within this
    resource group. Cloud-config is called during vm creation to create directories and launch the
    appliance.

    :param str leaderNodeType: Azure VM size for the leader.
    :param str keyName: owner tag value for the cluster's VMs.
    :param str clusterName: cluster identifier; also the resource group name.
    :param str zone: Azure region to create the cluster in.
    :param int leaderStorage: leader root disk size in GB (not yet implemented).
    :param int nodeStorage: worker root disk size in GB, saved on self.
    :param spotBid: unsupported; must be None.
    :keyword publicKeyFile: required; user's public key to add to authorized_keys.
    :keyword azureStorageCredentials: required; path to credentials to rsync to the
        leader, or None.
    :keyword nodeTypes: node types for the worker batches added at the end.
    :keyword numWorkers: worker counts matching nodeTypes.
    :raises NotImplementedError: if spotBid is given.
    :raises RuntimeError: if clusterName is invalid or the resource group
        cannot be created.
    """
    if spotBid:
        raise NotImplementedError(
            "Ansible does not support provisioning spot instances")
    if not self.isValidClusterName(clusterName):
        raise RuntimeError(
            "Invalid cluster name. See the Azure documentation for information "
            "on cluster naming conventions: "
            "https://docs.microsoft.com/en-us/azure/architecture/best-practices/naming-conventions")
    self.clusterName = clusterName
    self.keyName = keyName
    self.region = zone
    self.nodeStorage = nodeStorage
    self.masterPublicKeyFile = kwargs['publicKeyFile']

    # Try deleting the resource group. This will fail if it exists.
    ansibleArgs = {'resgrp': self.clusterName, 'region': self.region}
    try:
        self.callPlaybook(self.playbook['create-cluster'], ansibleArgs, wait=True)
    except RuntimeError:
        # BUG FIX: message previously read "already exits"
        logger.info(
            "The cluster could not be created. Try deleting the cluster if it already exists.")
        raise

    # Azure VMs must be named, so we need to generate one. Instance names must
    # be composed of only alphanumeric characters, underscores, and hyphens
    # (see https://docs.microsoft.com/en-us/azure/architecture/best-practices/naming-conventions).
    instanceName = 'l' + str(uuid.uuid4())
    cloudConfigArgs = {
        'image': applianceSelf(),
        'role': "leader",
        'entrypoint': "mesos-master",
        '_args': leaderArgs.format(name=self.clusterName),
    }
    ansibleArgs = {
        'vmsize': leaderNodeType,
        'vmname': instanceName,
        # Azure limits the name to 24 characters, no dashes.
        'storagename': instanceName.replace('-', '')[:24],
        # The resource group, which represents the cluster.
        'resgrp': self.clusterName,
        'region': self.region,
        'role': "leader",
        'owner': self.keyName,  # Just a tag.
        'diskSize': str(leaderStorage),  # TODO: not implemented
        # The users public key to be added to authorized_keys
        'publickeyfile': self.masterPublicKeyFile
    }
    ansibleArgs['cloudconfig'] = self._cloudConfig(cloudConfigArgs)
    self.callPlaybook(self.playbook['create'], ansibleArgs, wait=True)
    logger.info('Launched non-preemptable leader')

    # IP available as soon as the playbook finishes
    leader = self._getNodes('leader')[0]
    self.leaderIP = leader.privateIP

    # Make sure leader container is up.
    self._waitForNode(leader.publicIP, 'toil_leader')

    # Transfer credentials
    containerUserPath = '/root/'
    storageCredentials = kwargs['azureStorageCredentials']
    if storageCredentials is not None:
        fullPathCredentials = os.path.expanduser(storageCredentials)
        if os.path.isfile(fullPathCredentials):
            self._rsyncNode(leader.publicIP,
                            [fullPathCredentials, ':' + containerUserPath],
                            applianceName='toil_leader')
    ansibleCredentials = '.azure/credentials'
    fullPathAnsibleCredentials = os.path.expanduser('~/' + ansibleCredentials)
    if os.path.isfile(fullPathAnsibleCredentials):
        self._sshAppliance(leader.publicIP, 'mkdir', '-p',
                           containerUserPath + '.azure')
        self._rsyncNode(leader.publicIP,
                        [fullPathAnsibleCredentials,
                         ':' + containerUserPath + ansibleCredentials],
                        applianceName='toil_leader')

    # Add workers
    workersCreated = 0
    for nodeType, workers in zip(kwargs['nodeTypes'], kwargs['numWorkers']):
        workersCreated += self.addNodes(nodeType=nodeType, numNodes=workers)
    logger.info('Added %d workers', workersCreated)
def addNodes(self, nodeType, numNodes, preemptable):
    """
    Launch worker VMs on GCE, inject worker files into each, and return the
    number of workers successfully set up.

    :param str nodeType: GCE machine type for the workers.
    :param int numNodes: number of workers requested.
    :param bool preemptable: whether to launch preemptible instances.
    :return int: number of workers that launched and were configured successfully.
    """
    # If keys are rsynced, then the mesos-slave needs to be started after the keys have been
    # transferred. The waitForKey.sh script loops on the new VM until it finds the keyPath file,
    # then it starts the mesos-slave. If there are multiple keys to be transferred, then the last
    # one to be transferred must be set to keyPath.
    keyPath = ''
    entryPoint = 'mesos-slave'
    self.botoExists = False
    if self.botoPath is not None and os.path.exists(self.botoPath):
        entryPoint = "waitForKey.sh"
        keyPath = self.nodeBotoPath
        self.botoExists = True
    elif self.config and self.config.sseKey:
        entryPoint = "waitForKey.sh"
        keyPath = self.config.sseKey

    workerData = dict(role='worker',
                      dockerImage=applianceSelf(),
                      entrypoint=entryPoint,
                      sshKey=self.masterPublicKey,
                      dockerArgs=workerDockerArgs.format(ip=self.leaderIP,
                                                         preemptable=preemptable,
                                                         keyPath=keyPath))
    userData = self.gceUserDataWorker.format(**workerData)
    metadata = {'items': [{'key': 'user-data', 'value': userData}]}
    imageType = 'coreos-stable'
    sa_scopes = [{'scopes': ['compute', 'storage-full']}]

    # TODO:
    #  - bug in gce.py for ex_create_multiple_nodes (erroneously, doesn't allow image and disk to specified)
    #  - ex_create_multiple_nodes is limited to 1000 nodes
    #    - use a different function
    #    - or write a loop over the rest of this function, with 1K nodes max on each iteration
    if not preemptable:
        logger.info('Launching %s non-preemptable nodes', numNodes)
    else:
        logger.info('Launching %s preemptable nodes', numNodes)

    # BUG FIX: bytes('...') raises TypeError on Python 3 (string argument without
    # an encoding); the driver expects a plain str URL here.
    disk = {
        'initializeParams': {
            'sourceImage': ('https://www.googleapis.com/compute/v1/projects/coreos-cloud/global'
                            '/images/coreos-stable-1576-4-0-v20171206'),
            'diskSizeGb': self.nodeStorage
        },
        'boot': True,
        'autoDelete': True
    }

    retries = 0
    workersCreated = 0
    # Try a few times to create the requested number of workers
    while numNodes - workersCreated > 0 and retries < 3:
        # NOTE(review): each attempt requests the full numNodes rather than the
        # remaining numNodes - workersCreated; with a partial success this could
        # create more workers than requested — confirm intended behavior.
        instancesLaunched = self.ex_create_multiple_nodes(
            '', nodeType, imageType, numNodes,
            location=self.zone,
            ex_service_accounts=sa_scopes,
            ex_metadata=metadata,
            ex_disks_gce_struct=[disk],
            description=self.tags,
            ex_preemptible=preemptable)
        self.instanceGroup.add_instances(instancesLaunched)
        failedWorkers = []
        for instance in instancesLaunched:
            if self._injectWorkerFiles(instance.public_ips[0]):
                workersCreated += 1
            else:
                failedWorkers.append(instance)
        if failedWorkers:
            # lazy %-args instead of eager string formatting
            logger.error("Terminating %d failed workers", len(failedWorkers))
            self.terminateNodes(failedWorkers)
        retries += 1

    logger.info('Launched %d new instance(s)', numNodes)
    if numNodes != workersCreated:
        logger.error("Failed to launch %d worker(s)", numNodes - workersCreated)
    return workersCreated
def launchCluster(self,
                  leaderNodeType,
                  leaderSpotBid,
                  nodeTypes,
                  preemptableNodeTypes,
                  keyName,
                  clusterName,
                  numWorkers=0,
                  numPreemptableWorkers=0,
                  spotBids=None,
                  userTags=None,
                  zone=None,
                  vpcSubnet=None,
                  leaderStorage=50,
                  nodeStorage=50,
                  botoPath=None):
    """
    Launch a GCE cluster: create the instance group and the leader node, wait
    for the leader appliance, copy keys/credentials to it, then add the
    requested worker batches.

    :param str leaderNodeType: GCE machine type for the leader.
    :param leaderSpotBid: truthy to launch the leader as a preemptible instance.
    :param nodeTypes: machine types for non-preemptable worker batches.
    :param preemptableNodeTypes: machine types for preemptable worker batches.
    :param str keyName: owner tag value.
    :param str clusterName: cluster identifier; also the instance group name.
    :param numWorkers: iterable of worker counts matching nodeTypes.
    :param numPreemptableWorkers: iterable of counts matching preemptableNodeTypes.
    :param spotBids: bids matching preemptableNodeTypes, saved on self.
    :param str zone: GCE zone to launch into.
    :param str vpcSubnet: subnetwork for the leader.
    :param int leaderStorage: leader root disk size in GB.
    :param int nodeStorage: worker root disk size in GB.
    :param str botoPath: optional boto credentials file to inject into the leader.
    :return: the leader node.
    :raises RuntimeError: if the leader fails to start or be configured.
    """
    if self.config is None:
        # only use this node storage value when called from the cluster utility
        self.nodeStorage = nodeStorage
    if userTags is None:
        userTags = {}
    self.zone = zone
    self.clusterName = clusterName
    self.botoPath = botoPath
    self.keyName = keyName

    # GCE doesn't have a dictionary tags field. The tags field is just a string list.
    # Therefore, dumping tags into the description.
    tags = {'Owner': keyName, 'clusterName': self.clusterName}
    tags.update(userTags)
    self.tags = json.dumps(tags)

    # TODO
    # - security group: just for a cluster identifier?
    # - Error thrown if cluster exists. Add an explicit check for an existing cluster? Racey though.
    leaderData = dict(role='leader',
                      dockerImage=applianceSelf(),
                      entrypoint='mesos-master',
                      dockerArgs=leaderDockerArgs.format(name=clusterName))
    userData = gceUserData.format(**leaderData)
    metadata = {'items': [{'key': 'user-data', 'value': userData}]}
    imageType = 'coreos-stable'
    sa_scopes = [{'scopes': ['compute', 'storage-full']}]
    driver = self._getDriver()

    # Throws an error if cluster exists
    self.instanceGroup = driver.ex_create_instancegroup(clusterName, zone)
    preemptable = bool(leaderSpotBid)
    if preemptable:
        logger.info('Launching preemptable leader')
    else:
        logger.info('Launching non-preemptable leader')

    # BUG FIX: bytes('...') raises TypeError on Python 3 (string argument without
    # an encoding); the driver expects a plain str URL here.
    disk = {
        'initializeParams': {
            'sourceImage': ('https://www.googleapis.com/compute/v1/projects/coreos-cloud/'
                            'global/images/coreos-stable-1576-4-0-v20171206'),
            'diskSizeGb': leaderStorage
        },
        'boot': True,
        'autoDelete': True
    }
    # BUG FIX: bytes(uuid.uuid4()) raises TypeError on Python 3; use str().
    name = 'l' + str(uuid.uuid4())
    leader = driver.create_node(name,
                                leaderNodeType,
                                imageType,
                                location=zone,
                                ex_service_accounts=sa_scopes,
                                ex_metadata=metadata,
                                ex_subnetwork=vpcSubnet,
                                ex_disks_gce_struct=[disk],
                                description=self.tags,
                                ex_preemptible=preemptable)
    self.instanceGroup.add_instances([leader])
    logger.info('... toil_leader is running')

    # if we are running launch cluster we need to save this data as it won't be generated
    # from the metadata. This data is needed to launch worker nodes.
    self.leaderIP = leader.private_ips[0]
    if spotBids:
        self.spotBids = dict(zip(preemptableNodeTypes, spotBids))

    # TODO: get subnetID
    # self.subnetID = leader.subnet_id

    if (not self._waitForNode(leader.public_ips[0], 'toil_leader')
            or not self._copySshKeys(leader.public_ips[0], keyName)
            or not self._injectFile(leader.public_ips[0], self.credentialsPath,
                                    GoogleJobStore.nodeServiceAccountJson, 'toil_leader')
            or (self.botoPath
                and not self._injectFile(leader.public_ips[0], self.botoPath,
                                         self.nodeBotoPath, 'toil_leader'))):
        raise RuntimeError("Failed to start leader")

    # assuming that if the leader was launched without a spotbid then all workers
    # will be non-preemptable
    workersCreated = 0
    # `or []` guards against the int defaults (numWorkers=0, numPreemptableWorkers=0),
    # which are not iterable and would make zip() raise TypeError.
    for nodeType, workers in zip(nodeTypes, numWorkers or []):
        workersCreated += self.addNodes(nodeType=nodeType,
                                        numNodes=workers,
                                        preemptable=False)
    for nodeType, workers in zip(preemptableNodeTypes, numPreemptableWorkers or []):
        workersCreated += self.addNodes(nodeType=nodeType,
                                        numNodes=workers,
                                        preemptable=True)
    logger.info('Added %d workers', workersCreated)
    return leader