def getLeader(self, wait=False, returnRawInstance=False):
    assert self._ctx
    instances = self._getNodesInCluster(nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    if (leader.tags.get(_TOIL_NODE_TYPE_TAG_KEY) or 'leader') != 'leader':
        raise InvalidClusterStateException(
            'Invalid cluster state! The first launched instance appears not to be the leader '
            'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
            'and restart the job. Incorrect Leader ID: %s' % leader.id)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=None,
                      preemptable=False, tags=leader.tags)
    if wait:
        logger.debug("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(self._ctx.ec2, [leader])
        logger.debug('... toil_leader is running')
        self._waitForIP(leader)
        leaderNode.waitForNode('toil_leader')
    return leader if returnRawInstance else leaderNode
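
# A minimal usage sketch (illustrative, not part of this module). It assumes a
# provisioner object that has already been pointed at an existing cluster (for
# example by cluster name and zone); getLeader raises NoSuchClusterException when
# no instances are found.
def _exampleGetLeader(provisioner):
    """Illustrative only: look up the leader and optionally block until it is reachable."""
    leaderNode = provisioner.getLeader(wait=True)                  # Node wrapper, waits for ssh
    rawInstance = provisioner.getLeader(returnRawInstance=True)    # underlying EC2 instance object
    return leaderNode, rawInstance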
def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
    """
    In addition to the parameters inherited from the abstractProvisioner,
    the Google launchCluster takes the following parameters:
    keyName: The key used to communicate with instances
    botoPath: Boto credentials for reading an AWS jobStore (optional).
    vpcSubnet: A subnet (optional).
    """
    if 'keyName' not in kwargs:
        raise RuntimeError("A keyPairName is required for the GCE provisioner.")
    self._keyName = kwargs['keyName']
    if 'botoPath' in kwargs:
        self._botoPath = kwargs['botoPath']
    self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

    # Throws an error if cluster exists
    self._instanceGroup = self._gceDriver.ex_create_instancegroup(self.clusterName, self._zone)
    logger.debug('Launching leader')

    # GCE doesn't have a dictionary tags field. The tags field is just a string list.
    # Therefore, dumping tags into the description.
    tags = {'Owner': self._keyName, 'clusterName': self.clusterName}
    if 'userTags' in kwargs:
        tags.update(kwargs['userTags'])
    self._tags = json.dumps(tags)

    userData = self._getCloudConfigUserData('leader')
    metadata = {'items': [{'key': 'user-data', 'value': userData}]}
    imageType = 'flatcar-stable'
    sa_scopes = [{'scopes': ['compute', 'storage-full']}]
    disk = {}
    disk['initializeParams'] = {
        'sourceImage': self.SOURCE_IMAGE,
        'diskSizeGb': leaderStorage
    }
    disk.update({'boot': True,
                 'autoDelete': True})
    name = 'l' + str(uuid.uuid4())

    leader = self._gceDriver.create_node(name, leaderNodeType, imageType,
                                         location=self._zone,
                                         ex_service_accounts=sa_scopes,
                                         ex_metadata=metadata,
                                         ex_subnetwork=self._vpcSubnet,
                                         ex_disks_gce_struct=[disk],
                                         description=self._tags,
                                         ex_preemptible=False)

    self._instanceGroup.add_instances([leader])
    self._leaderPrivateIP = leader.private_ips[0]  # needed if adding workers
    # self.subnetID = leader.subnet_id  # TODO: get subnetID

    # Wait for the appliance to start and inject credentials.
    leaderNode = Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0],
                      name=leader.name, launchTime=leader.created_at, nodeType=leader.size,
                      preemptable=False, tags=self._tags)
    leaderNode.waitForNode('toil_leader', keyName=self._keyName)
    leaderNode.copySshKeys(self._keyName)
    leaderNode.injectFile(self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, 'toil_leader')
    if self._botoPath:
        leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, 'toil_leader')
    logger.debug('Launched leader')
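
# A minimal usage sketch (illustrative, not part of this module). It assumes a
# GCEProvisioner instance that has already been constructed for the target project
# and zone; the machine type, storage size, key name and tag values below are
# placeholders, not prescribed by this code.
def _exampleLaunchGceCluster(provisioner):
    """Illustrative only: launch a leader with the GCE launchCluster."""
    provisioner.launchCluster(leaderNodeType='n1-standard-2',
                              leaderStorage=50,
                              owner='alice',
                              keyName='core',                 # required by the GCE provisioner
                              botoPath='/home/alice/.boto',   # optional: only needed for AWS jobStores
                              vpcSubnet=None,                 # optional
                              userTags={'project': 'demo'})   # optional
    # On return the leader VM is running the toil_leader appliance with the
    # service-account credentials (and optional boto file) injected.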
def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
    """
    In addition to the parameters inherited from the abstractProvisioner,
    the AWS launchCluster takes the following parameters:
    keyName: The key used to communicate with instances
    vpcSubnet: A subnet (optional).
    """
    if 'keyName' not in kwargs:
        raise RuntimeError("A keyPairName is required for the AWS provisioner.")
    self._keyName = kwargs['keyName']
    self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

    profileARN = self._getProfileARN()
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup()
    bdm = self._getBlockDeviceMapping(E2Instances[leaderNodeType], rootVolSize=leaderStorage)

    self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
    userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
    specKwargs = {'key_name': self._keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': leaderNodeType,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': profileARN,
                  'placement': self._zone}
    if self._vpcSubnet:
        specKwargs["subnet_id"] = self._vpcSubnet
    instances = create_ondemand_instances(self._ctx.ec2, image_id=self._discoverAMI(),
                                          spec=specKwargs, num_instances=1)

    # wait for the leader to finish setting up
    leader = instances[0]
    wait_instances_running(self._ctx.ec2, [leader])
    self._waitForIP(leader)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                      preemptable=False, tags=leader.tags)
    leaderNode.waitForNode('toil_leader')

    defaultTags = {'Name': self.clusterName, 'Owner': owner}
    if kwargs.get('userTags'):
        defaultTags.update(kwargs['userTags'])

    # If we are running launchCluster, we need to save this data as it won't be generated
    # from the metadata. This data is needed to launch worker nodes.
    self._leaderPrivateIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self._tags = leader.tags
    self._subnetID = leader.subnet_id
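
# A minimal usage sketch (illustrative, not part of this module). It assumes an
# already-constructed AWS provisioner bound to a cluster name and zone; the instance
# type, key name, subnet ID and tag values below are placeholders. This variant reads
# its optional parameters (vpcSubnet, userTags) out of **kwargs.
def _exampleLaunchAwsClusterKwargsVariant(provisioner):
    """Illustrative only: launch a leader with the kwargs-based AWS launchCluster."""
    provisioner.launchCluster(leaderNodeType='t2.medium',
                              leaderStorage=50,
                              owner='alice',
                              keyName='id_rsa',                       # required
                              vpcSubnet='subnet-0123456789abcdef0',   # optional, placeholder ID
                              userTags={'project': 'demo'})           # optional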
def getLeader(self, wait=False):
    assert self._ctx
    instances = self._getNodesInCluster(nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=None,
                      preemptable=False, tags=leader.tags)
    if wait:
        logger.debug("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(self._ctx.ec2, [leader])
        logger.debug('... toil_leader is running')
        self._waitForIP(leader)
        leaderNode.waitForNode('toil_leader')
    return leaderNode
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    assert self._leaderPrivateIP
    if preemptable and not spotBid:
        if self._spotBidsMap and nodeType in self._spotBidsMap:
            spotBid = self._spotBidsMap[nodeType]
        else:
            raise RuntimeError("No spot bid given for a preemptable node request.")
    instanceType = E2Instances[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self._nodeStorage)

    keyPath = self._sseKey if self._sseKey else None
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey, keyPath, preemptable)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        userData = userData.encode('utf-8')

    sgs = [sg for sg in self._ctx.ec2.get_all_security_groups()
           if sg.name in self._leaderSecurityGroupNames]
    kwargs = {'key_name': self._keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': self._leaderProfileArn,
              'placement': self._zone,
              'subnet_id': self._subnetID}

    instancesLaunched = []
    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            # after we start launching instances we want to ensure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.debug('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self._ctx.ec2,
                                                              image_id=self._discoverAMI(),
                                                              spec=kwargs,
                                                              num_instances=numNodes)
            else:
                logger.debug('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(spotBid, instanceType.name, self._ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self._ctx.ec2,
                                                               price=spotBid,
                                                               image_id=self._discoverAMI(),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self._ctx.ec2, instancesLaunched)

    self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
    AWSProvisioner._addTags(instancesLaunched, self._tags)
    if self._sseKey:
        for i in instancesLaunched:
            self._waitForIP(i)
            node = Node(publicIP=i.ip_address, privateIP=i.private_ip_address, name=i.id,
                        launchTime=i.launch_time, nodeType=i.instance_type,
                        preemptable=preemptable, tags=i.tags)
            node.waitForNode('toil_worker')
            node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker')
    logger.debug('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
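
# A minimal usage sketch (illustrative, not part of this module). It assumes the
# provisioner has already launched (or attached to) a leader, so that the leader's
# private IP, subnet and security groups are populated; the node type, counts and
# spot bid below are placeholders.
def _exampleAddWorkers(provisioner):
    """Illustrative only: add on-demand and preemptable workers."""
    # Two on-demand workers.
    provisioner.addNodes(nodeType='m4.large', numNodes=2, preemptable=False)
    # Three preemptable workers; a spot bid (dollars per hour) is required unless a
    # bid for this node type is already present in the provisioner's spot-bids map.
    provisioner.addNodes(nodeType='m4.large', numNodes=3, preemptable=True, spotBid=0.10)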
def launchCluster(self,
                  leaderNodeType: str,
                  leaderStorage: int,
                  owner: str,
                  keyName: str,
                  botoPath: str,
                  userTags: dict,
                  vpcSubnet: str,
                  awsEc2ProfileArn: str,
                  awsEc2ExtraSecurityGroupIds: list):
    """
    Starts a single leader node and populates this class with the leader's metadata.

    :param leaderNodeType: An AWS instance type, like "t2.medium", for example.
    :param leaderStorage: An integer number of gigabytes to provide the leader instance with.
    :param owner: Resources will be tagged with this owner string.
    :param keyName: The ssh key to use to access the leader node.
    :param botoPath: The path to the boto credentials directory.
    :param userTags: Optionally provided user tags to put on the leader.
    :param vpcSubnet: Optionally specify the VPC subnet.
    :param awsEc2ProfileArn: Optionally provide the profile ARN.
    :param awsEc2ExtraSecurityGroupIds: Optionally provide additional security group IDs.
    :return: None
    """
    self._keyName = keyName
    self._vpcSubnet = vpcSubnet

    profileArn = awsEc2ProfileArn or self._getProfileArn()
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup()
    bdm = [
        {
            'DeviceName': '/dev/xvda',
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': leaderStorage,
                'VolumeType': 'gp2'
            }
        },
    ]

    self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
    userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        # We probably won't have a spot-market leader, but who knows!
        userData = userData.encode('utf-8')

    instances = create_instances(self.ec2,
                                 image_id=self._discoverAMI(),
                                 num_instances=1,
                                 key_name=self._keyName,
                                 security_group_ids=[sg.id for sg in sgs] + awsEc2ExtraSecurityGroupIds,
                                 instance_type=leaderNodeType,
                                 user_data=userData,
                                 block_device_map=bdm,
                                 # instance_profile_arn={'Arn': profileArn},
                                 placement={'AvailabilityZone': self._zone},
                                 subnet_id=self._vpcSubnet)

    # wait for the leader to finish setting up
    leader = instances[0]
    leader.wait_until_running()

    default_tags = {'Name': self.clusterName, 'Owner': owner, _TOIL_NODE_TYPE_TAG_KEY: 'leader'}
    default_tags.update(userTags)

    tags = []
    for user_key, user_value in default_tags.items():
        tags.append({'Key': user_key, 'Value': user_value})
    leader.create_tags(Tags=tags)

    self._tags = leader.tags
    self._leaderPrivateIP = leader.private_ip_address
    self._subnetID = leader.subnet_id
    leaderNode = Node(publicIP=leader.public_ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                      preemptable=False, tags=leader.tags)
    leaderNode.waitForNode('toil_leader')
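
# A minimal usage sketch (illustrative, not part of this module). It assumes an
# already-constructed provisioner for this boto3-based launchCluster variant; the
# instance type, key name, paths and tag values below are placeholders. Passing None
# for awsEc2ProfileArn falls back to self._getProfileArn(), and an empty list for
# awsEc2ExtraSecurityGroupIds adds no extra groups.
def _exampleLaunchAwsClusterExplicitArgs(provisioner):
    """Illustrative only: launch a leader with the explicit-argument launchCluster."""
    provisioner.launchCluster(leaderNodeType='t2.medium',
                              leaderStorage=50,
                              owner='alice',
                              keyName='id_rsa',
                              botoPath='/home/alice/.boto',
                              userTags={'project': 'demo'},
                              vpcSubnet=None,                  # optional
                              awsEc2ProfileArn=None,           # optional; derived if omitted
                              awsEc2ExtraSecurityGroupIds=[])
    # After this returns, the provisioner holds the leader's metadata (private IP,
    # subnet, tags), which later worker launches rely on.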