示例#1
0
    def getLeader(self, wait=False, returnRawInstance=False):
        """
        Locate the leader of this cluster.

        The earliest-launched instance returned by _getNodesInCluster is taken to be the
        leader.

        :param wait: If True, call wait_instances_running and self._waitForIP on the leader
               and then wait for the 'toil_leader' appliance before returning.
        :param returnRawInstance: If True, return the underlying instance object rather
               than the Node built from it.
        :return: A Node describing the leader, or the raw instance when returnRawInstance
                 is True.
        :raises NoSuchClusterException: If the cluster contains no instances.
        :raises InvalidClusterStateException: If the earliest instance has a node-type tag
                whose value is something other than 'leader'.
        """
        assert self._ctx
        instances = self._getNodesInCluster(nodeType=None, both=True)
        if not instances:
            raise NoSuchClusterException(self.clusterName)
        instances.sort(key=lambda x: x.launch_time)
        leader = instances[0]  # assume leader was launched first

        # An instance without the node-type tag is tolerated and treated as the leader;
        # only an explicit, different value is rejected.
        nodeTypeTag = leader.tags.get(_TOIL_NODE_TYPE_TAG_KEY)
        if nodeTypeTag and nodeTypeTag != 'leader':
            raise InvalidClusterStateException(
                'Invalid cluster state! The first launched instance appears not to be the leader '
                'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
                'and restart the job. Incorrect Leader ID: %s' % leader.id)

        if wait:
            logger.debug("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(self._ctx.ec2, [leader])
            logger.debug('... toil_leader is running')
            self._waitForIP(leader)

        # Build the Node only after any waiting, so that it is not populated with
        # addresses captured before the instance had them.
        # NOTE(review): assumes _waitForIP refreshes `leader` in place - confirm.
        leaderNode = Node(publicIP=leader.ip_address,
                          privateIP=leader.private_ip_address,
                          name=leader.id,
                          launchTime=leader.launch_time,
                          nodeType=None,
                          preemptable=False,
                          tags=leader.tags)
        if wait:
            leaderNode.waitForNode('toil_leader')

        return leader if returnRawInstance else leaderNode
示例#2
0
    def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
        """
        In addition to the parameters inherited from the abstractProvisioner,
        the Google launchCluster takes the following parameters:
        keyName: The key used to communicate with instances
        botoPath: Boto credentials for reading an AWS jobStore (optional).
        vpcSubnet: A subnet (optional).
        userTags: Extra tags merged over the default ones (optional).

        Note that `owner` is not used here: the 'Owner' tag is set from keyName.

        :raises RuntimeError: if keyName is not supplied.
        """
        if 'keyName' not in kwargs:
            raise RuntimeError("A keyPairName is required for the GCE provisioner.")
        self._keyName = kwargs['keyName']
        # Always leave self._botoPath defined: it is read unconditionally once the leader
        # is up. A value that was already set on the instance is kept when the caller
        # does not pass one.
        self._botoPath = kwargs.get('botoPath', getattr(self, '_botoPath', None))
        self._vpcSubnet = kwargs.get('vpcSubnet')

        # Throws an error if cluster exists
        self._instanceGroup = self._gceDriver.ex_create_instancegroup(self.clusterName, self._zone)
        logger.debug('Launching leader')

        # GCE doesn't have a dictionary tags field. The tags field is just a string list.
        # Therefore, dumping tags into the description.
        tags = {'Owner': self._keyName, 'clusterName': self.clusterName}
        userTags = kwargs.get('userTags')
        if userTags:  # tolerate both a missing key and an explicit None
            tags.update(userTags)
        self._tags = json.dumps(tags)

        userData = self._getCloudConfigUserData('leader')
        metadata = {'items': [{'key': 'user-data', 'value': userData}]}
        imageType = 'flatcar-stable'
        sa_scopes = [{'scopes': ['compute', 'storage-full']}]
        disk = {
            'initializeParams': {
                'sourceImage': self.SOURCE_IMAGE,
                'diskSizeGb': leaderStorage,
            },
            'boot': True,
            'autoDelete': True,
        }
        name = 'l' + str(uuid.uuid4())
        leader = self._gceDriver.create_node(name, leaderNodeType, imageType,
                                             location=self._zone,
                                             ex_service_accounts=sa_scopes,
                                             ex_metadata=metadata,
                                             ex_subnetwork=self._vpcSubnet,
                                             ex_disks_gce_struct=[disk],
                                             description=self._tags,
                                             ex_preemptible=False)

        self._instanceGroup.add_instances([leader])
        self._leaderPrivateIP = leader.private_ips[0]  # needed if adding workers
        #self.subnetID = leader.subnet_id #TODO: get subnetID

        # Wait for the appliance to start and inject credentials.
        leaderNode = Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0],
                          name=leader.name, launchTime=leader.created_at, nodeType=leader.size,
                          preemptable=False, tags=self._tags)
        leaderNode.waitForNode('toil_leader', keyName=self._keyName)
        leaderNode.copySshKeys(self._keyName)
        leaderNode.injectFile(self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, 'toil_leader')
        if self._botoPath:
            leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, 'toil_leader')
        logger.debug('Launched leader')
示例#3
0
    def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
        """
        In addition to the parameters inherited from the abstractProvisioner,
        the AWS launchCluster takes the following parameters:
        keyName: The key used to communicate with instances
        vpcSubnet: A subnet (optional).
        userTags: Extra tags merged over the default 'Name'/'Owner' tags (optional).

        :raises RuntimeError: if keyName is not supplied.
        """
        if 'keyName' not in kwargs:
            raise RuntimeError("A keyPairName is required for the AWS provisioner.")
        self._keyName = kwargs['keyName']
        self._vpcSubnet = kwargs.get('vpcSubnet')

        profileARN = self._getProfileARN()
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup()
        bdm = self._getBlockDeviceMapping(E2Instances[leaderNodeType], rootVolSize=leaderStorage)

        self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
        userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
        specKwargs = {'key_name': self._keyName,
                      'security_group_ids': [sg.id for sg in sgs],
                      'instance_type': leaderNodeType,
                      'user_data': userData,
                      'block_device_map': bdm,
                      'instance_profile_arn': profileARN,
                      'placement': self._zone}
        if self._vpcSubnet:
            specKwargs["subnet_id"] = self._vpcSubnet
        instances = create_ondemand_instances(self._ctx.ec2, image_id=self._discoverAMI(),
                                              spec=specKwargs, num_instances=1)

        # wait for the leader to finish setting up
        leader = instances[0]
        wait_instances_running(self._ctx.ec2, [leader])
        self._waitForIP(leader)
        leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                          preemptable=False, tags=leader.tags)
        leaderNode.waitForNode('toil_leader')

        defaultTags = {'Name': self.clusterName, 'Owner': owner}
        # userTags is optional: use .get so that omitting it does not raise KeyError here,
        # after the leader is already running and before it has been tagged.
        userTags = kwargs.get('userTags')
        if userTags:
            defaultTags.update(userTags)

        # if we running launch cluster we need to save this data as it won't be generated
        # from the metadata. This data is needed to launch worker nodes.
        self._leaderPrivateIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self._tags = leader.tags
        self._subnetID = leader.subnet_id
示例#4
0
    def getLeader(self, wait=False):
        """
        Locate the leader of this cluster and describe it as a Node.

        The earliest-launched instance returned by _getNodesInCluster is taken to be the
        leader.

        :param wait: If True, call wait_instances_running and self._waitForIP on the leader
               and then wait for the 'toil_leader' appliance before returning.
        :return: A Node describing the leader.
        :raises NoSuchClusterException: If the cluster contains no instances.
        """
        assert self._ctx
        instances = self._getNodesInCluster(nodeType=None, both=True)
        if not instances:
            raise NoSuchClusterException(self.clusterName)
        instances.sort(key=lambda x: x.launch_time)
        leader = instances[0]  # assume leader was launched first

        if wait:
            logger.debug("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(self._ctx.ec2, [leader])
            logger.debug('... toil_leader is running')
            self._waitForIP(leader)

        # Build the Node only after any waiting, so that it is not populated with
        # addresses captured before the instance had them.
        # NOTE(review): assumes _waitForIP refreshes `leader` in place - confirm.
        leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=None,
                          preemptable=False, tags=leader.tags)
        if wait:
            leaderNode.waitForNode('toil_leader')

        return leaderNode
示例#5
0
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        """
        Launch worker instances and tag them as belonging to this cluster.

        :param nodeType: Key into E2Instances naming the instance type to launch.
        :param numNodes: Number of instances to request.
        :param preemptable: If True, request spot instances; otherwise on-demand ones.
        :param spotBid: Bid for spot instances. When omitted for a preemptable request,
               the bid is looked up in self._spotBidsMap by nodeType.
        :return: The number of instances that were launched, which is what the caller
                 should rely on rather than numNodes.
        :raises RuntimeError: If a preemptable request has no spot bid available.
        """
        assert self._leaderPrivateIP
        if preemptable and not spotBid:
            if self._spotBidsMap and nodeType in self._spotBidsMap:
                spotBid = self._spotBidsMap[nodeType]
            else:
                raise RuntimeError(
                    "No spot bid given for a preemptable node request.")
        instanceType = E2Instances[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self._nodeStorage)

        keyPath = self._sseKey or None
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            userData = userData.encode('utf-8')
        # Workers join the same security groups as the leader.
        sgs = [
            sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name in self._leaderSecurityGroupNames
        ]
        kwargs = {
            'key_name': self._keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': self._leaderProfileArn,
            'placement': self._zone,
            'subnet_id': self._subnetID
        }

        instancesLaunched = []

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.debug('Launching %s non-preemptable nodes',
                                 numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self._ctx.ec2,
                        image_id=self._discoverAMI(),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.debug('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(spotBid,
                                                      instanceType.name,
                                                      self._ctx)
                    # force generator to evaluate
                    spotBatches = list(
                        create_spot_instances(
                            ec2=self._ctx.ec2,
                            price=spotBid,
                            image_id=self._discoverAMI(),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in spotBatches
                        for item in sublist
                    ]

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self._ctx.ec2, instancesLaunched)

        # Tag from a copy so that the shared self._tags mapping is not permanently
        # rewritten with the worker node type.
        workerTags = dict(self._tags)
        workerTags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
        AWSProvisioner._addTags(instancesLaunched, workerTags)
        if self._sseKey:
            # Copy the SSE key onto every new worker once its appliance is reachable.
            for i in instancesLaunched:
                self._waitForIP(i)
                node = Node(publicIP=i.ip_address,
                            privateIP=i.private_ip_address,
                            name=i.id,
                            launchTime=i.launch_time,
                            nodeType=i.instance_type,
                            preemptable=preemptable,
                            tags=i.tags)
                node.waitForNode('toil_worker')
                node.coreRsync([self._sseKey, ':' + self._sseKey],
                               applianceName='toil_worker')
        # Report what was actually launched; this can differ from the requested numNodes.
        logger.debug('Launched %s new instance(s)', len(instancesLaunched))
        return len(instancesLaunched)
示例#6
0
    def launchCluster(self,
                      leaderNodeType: str,
                      leaderStorage: int,
                      owner: str,
                      keyName: str,
                      botoPath: str,
                      userTags: dict,
                      vpcSubnet: str,
                      awsEc2ProfileArn: str,
                      awsEc2ExtraSecurityGroupIds: list):
        """
        Starts a single leader node and populates this class with the leader's metadata.

        :param leaderNodeType: An AWS instance type, like "t2.medium", for example.
        :param leaderStorage: An integer number of gigabytes to provide the leader instance with.
        :param owner: Resources will be tagged with this owner string.
        :param keyName: The ssh key to use to access the leader node.
        :param botoPath: The path to the boto credentials directory. Not used by this method.
        :param userTags: Optionally provided user tags to put on the leader. May be None.
        :param vpcSubnet: Optionally specify the VPC subnet.
        :param awsEc2ProfileArn: Optionally provide the profile ARN.
        :param awsEc2ExtraSecurityGroupIds: Optionally provide additional security group IDs.
               May be None.
        :return: None
        """
        self._keyName = keyName
        self._vpcSubnet = vpcSubnet

        # Both of these are documented as optional, so accept None as "nothing extra".
        userTags = userTags or {}
        extraSecurityGroupIds = list(awsEc2ExtraSecurityGroupIds or [])

        # NOTE(review): profileArn is computed but not passed on below, because the
        # instance_profile_arn argument is commented out - confirm whether intended.
        profileArn = awsEc2ProfileArn or self._getProfileArn()
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup()
        bdm = [
            {
                'DeviceName': '/dev/xvda',
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': leaderStorage,
                    'VolumeType': 'gp2'
                }
            },
        ]

        self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
        userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            # We probably won't have a spot-market leader, but who knows!
            userData = userData.encode('utf-8')
        instances = create_instances(self.ec2,
                                     image_id=self._discoverAMI(),
                                     num_instances=1,
                                     key_name=self._keyName,
                                     security_group_ids=[sg.id for sg in sgs] + extraSecurityGroupIds,
                                     instance_type=leaderNodeType,
                                     user_data=userData,
                                     block_device_map=bdm,
                                     # instance_profile_arn={'Arn': profileArn},
                                     placement={'AvailabilityZone': self._zone},
                                     subnet_id=self._vpcSubnet)

        # wait for the leader to finish setting up
        leader = instances[0]
        leader.wait_until_running()

        # User-supplied tags take precedence over the defaults.
        default_tags = {'Name': self.clusterName, 'Owner': owner, _TOIL_NODE_TYPE_TAG_KEY: 'leader'}
        default_tags.update(userTags)

        tags = [{'Key': tag_key, 'Value': tag_value}
                for tag_key, tag_value in default_tags.items()]
        leader.create_tags(Tags=tags)

        self._tags = leader.tags
        self._leaderPrivateIP = leader.private_ip_address
        self._subnetID = leader.subnet_id

        leaderNode = Node(publicIP=leader.public_ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                          preemptable=False, tags=leader.tags)
        leaderNode.waitForNode('toil_leader')