Example #1
 def launchCluster(cls, instanceType, keyName, clusterName, spotBid=None, zone=None):
     ctx = cls._buildContext(clusterName=clusterName, zone=zone)
     profileARN = cls._getProfileARN(ctx)
     # the security group name is used as the cluster identifier
     cls._createSecurityGroup(ctx, clusterName)
     bdm = cls._getBlockDeviceMapping(ec2_instance_types[instanceType])
     leaderData = dict(role='leader',
                       image=applianceSelf(),
                       entrypoint='mesos-master',
                       args=leaderArgs.format(name=clusterName))
     userData = awsUserData.format(**leaderData)
     kwargs = {'key_name': keyName, 'security_groups': [clusterName],
               'instance_type': instanceType,
               'user_data': userData, 'block_device_map': bdm,
               'instance_profile_arn': profileARN}
     if not spotBid:
         logger.info('Launching non-preemptable leader')
         create_ondemand_instances(ctx.ec2, image_id=cls._discoverAMI(ctx),
                                   spec=kwargs, num_instances=1)
     else:
         logger.info('Launching preemptable leader')
         # force generator to evaluate
         list(create_spot_instances(ec2=ctx.ec2,
                                    price=spotBid,
                                    image_id=cls._discoverAMI(ctx),
                                    tags={'clusterName': clusterName},
                                    spec=kwargs,
                                    num_instances=1))
     return cls._getLeader(clusterName=clusterName, wait=True)
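The list(...) wrapper around create_spot_instances above matters because the helper is a generator: the spot requests are only issued as the generator is consumed. A minimal illustration of that behaviour with a stand-in generator (not Toil's actual helper):

def create_instances(num_instances):
    # Stand-in for a lazy helper like create_spot_instances: nothing below runs
    # until the generator is iterated.
    for i in range(num_instances):
        print('launching instance', i)
        yield i

create_instances(2)          # no output; the generator body has not executed yet
list(create_instances(2))    # prints twice and returns [0, 1]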
Example #2
    def addNodes(self, numNodes, preemptable):
        instanceType = self._getInstanceType(preemptable)
        bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
        arn = self._getProfileARN(self.ctx)
        keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
        entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
        workerData = dict(role='worker',
                          image=applianceSelf(),
                          entrypoint=entryPoint,
                          sshKey=self.masterPublicKey,
                          args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable, keyPath=keyPath))
        userData = awsUserData.format(**workerData)
        sgs = [sg for sg in self.ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
        kwargs = {'key_name': self.keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': instanceType.name,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': arn,
                  'placement': getCurrentAWSZone()}
        kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

        instancesLaunched = []

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                # After we start launching instances we want to ensure the full setup is done.
                # The biggest obstacle is AWS request throttling, so we retry on those errors at
                # every request in this method.
                if not preemptable:
                    logger.info('Launching %s non-preemptable nodes', numNodes)
                    instancesLaunched = create_ondemand_instances(self.ctx.ec2, image_id=self._discoverAMI(self.ctx),
                                                                  spec=kwargs, num_instances=numNodes)
                else:
                    logger.info('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(self.spotBid, instanceType.name, self.ctx)
                    # force generator to evaluate
                    instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                                   price=self.spotBid,
                                                                   image_id=self._discoverAMI(self.ctx),
                                                                   tags={'clusterName': self.clusterName},
                                                                   spec=kwargs,
                                                                   num_instances=numNodes,
                                                                   tentative=True)
                                             )
                    # flatten the list
                    instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                wait_instances_running(self.ctx.ec2, instancesLaunched)

        # request throttling retry happens internally to these two methods to ensure proper granularity
        AWSProvisioner._addTags(instancesLaunched, self.tags)
        self._propagateKey(instancesLaunched)

        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
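The retry loops above hinge on a predicate that recognises AWS throttling errors. A minimal sketch of what such a predicate could look like (the exception type and error codes that AWSProvisioner._throttlePredicate actually checks may differ):

from boto.exception import BotoServerError

def throttle_predicate(e):
    # Retry only on AWS request-throttling responses; let every other error propagate.
    return isinstance(e, BotoServerError) and e.error_code in ('Throttling', 'RequestLimitExceeded')

# Would be used the same way as above: for attempt in retry(predicate=throttle_predicate): ...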
Example #3
 def __cleanMounts(self):
     """
     Deletes all files in every mounted directory. Without this step, we risk leaking
     files owned by root on the host. To avoid races, this method should be called after
     the appliance container has been stopped; otherwise the running container might still
     be writing files.
     """
     # Delete all files within each mounted directory, but not the directory itself.
     cmd = 'shopt -s dotglob && rm -rf ' + ' '.join(v + '/*'
                                                    for k, v in self.mounts.iteritems()
                                                    if os.path.isdir(k))
     self.outer._run('docker', 'run',
                     '--rm',
                     '--entrypoint=/bin/bash',
                     applianceSelf(),
                     '-c',
                     cmd)
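For a concrete sense of the command being assembled above, here is the same expression run against a made-up mounts mapping (host path -> container path); the values are hypothetical:

import os

mounts = {'/tmp/toil-workdir': '/data'}
cmd = 'shopt -s dotglob && rm -rf ' + ' '.join(v + '/*' for k, v in mounts.items() if os.path.isdir(k))
# If /tmp/toil-workdir exists on the host, cmd == 'shopt -s dotglob && rm -rf /data/*',
# which the method then executes inside the appliance via `docker run ... /bin/bash -c cmd`.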
Example #4
 def __enter__(self):
     with self.lock:
         image = applianceSelf()
         # Omitting --rm, it's unreliable, see https://github.com/docker/docker/issues/16575
         args = list(concat('docker', 'run',
                            '--entrypoint=' + self._entryPoint(),
                            '--net=host',
                            '-i',
                            '--name=' + self.containerName,
                            ['--volume=%s:%s' % mount for mount in self.mounts.iteritems()],
                            image,
                            self._containerCommand()))
         log.info('Running %r', args)
         self.popen = Popen(args)
     self.start()
     self.__wait_running()
     return self
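The method above is half of a context-manager pair that keeps a long-running appliance container alive for the duration of a with-block. A stripped-down sketch of the same pattern (a generic class, not Toil's actual ApplianceTestSupport code; image and names are up to the caller):

from subprocess import Popen

class ContainerRunner(object):
    """Run a Docker container for the lifetime of a with-block."""
    def __init__(self, image, name, command):
        self.image, self.name, self.command = image, name, command
        self.popen = None

    def __enter__(self):
        args = ['docker', 'run', '--net=host', '-i', '--name=' + self.name, self.image] + self.command
        self.popen = Popen(args)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Clean up explicitly since --rm is not used (see the issue referenced above).
        Popen(['docker', 'rm', '-f', self.name]).wait()
        self.popen.wait()
        return False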
Example #5
def needs_appliance(test_item):
    import json
    test_item = _mark_test('appliance', test_item)
    if next(which('docker'), None):
        image = applianceSelf()
        try:
            images = check_output(['docker', 'inspect', image])
        except CalledProcessError:
            images = []
        else:
            images = {i['Id'] for i in json.loads(images) if image in i['RepoTags']}
        if len(images) == 0:
            return unittest.skip("Cannot find appliance image %s. Be sure to run 'make docker' "
                                 "prior to running this test." % image)(test_item)
        elif len(images) == 1:
            return test_item
        else:
            assert False, 'Expected `docker inspect` to return zero or one image.'
    else:
        return unittest.skip('Install Docker to include this test.')(test_item)
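A hypothetical use of the decorator above on a test case (the import path is assumed; the decorator runs the test only when `docker inspect` finds the appliance image and skips it otherwise):

import unittest
from toil.test import needs_appliance  # assumed location of the decorator above

@needs_appliance
class ApplianceSmokeTest(unittest.TestCase):
    def test_appliance_image_present(self):
        self.assertTrue(True)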
Example #6
    def _addNodes(self, instances, numNodes, preemptable=False):
        bdm = self._getBlockDeviceMapping(self.instanceType)
        arn = self._getProfileARN(self.ctx)
        workerData = dict(role='worker',
                          image=applianceSelf(),
                          entrypoint='mesos-slave',
                          args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable))
        userData = awsUserData.format(**workerData)
        kwargs = {'key_name': self.keyName,
                  'security_groups': [self.clusterName],
                  'instance_type': self.instanceType.name,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': arn}

        instancesLaunched = []

        if not preemptable:
            logger.info('Launching %s non-preemptable nodes', numNodes)
            instancesLaunched = create_ondemand_instances(self.ctx.ec2, image_id=self._discoverAMI(self.ctx),
                                                          spec=kwargs, num_instances=numNodes)
        else:
            logger.info('Launching %s preemptable nodes', numNodes)
            kwargs['placement'] = getSpotZone(self.spotBid, self.instanceType.name, self.ctx)
            # force generator to evaluate
            instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                           price=self.spotBid,
                                                           image_id=self._discoverAMI(self.ctx),
                                                           tags={'clusterName': self.clusterName},
                                                           spec=kwargs,
                                                           num_instances=numNodes,
                                                           tentative=True)
                                     )
        wait_instances_running(self.ctx.ec2, instancesLaunched)
        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #7
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName',
                        help="On AWS, the name of the AWS key pair to include on the instance."
                        " On Google/GCE, this is the ssh key pair."
                        " Not needed for Azure.")
    parser.add_argument("--owner", dest='owner',
                        help="The owner tag for all instances. If not given, the value in"
                        " --keyPairName will be used if given.")
    parser.add_argument("--publicKeyFile", dest='publicKeyFile', default="~/.ssh/id_rsa.pub",
                        help="On Azure, the file"
                        " containing the key pairs (the first key pair will be used).")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred "
                        "to all nodes in order to access the AWS jobStore from non-AWS instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags',
                        default=[], action='append',
                        help="Tags are added to the AWS cluster for this node and all of its "
                             "children. Tags are of the form:\n"
                             " -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By "
                             "default the cluster is tagged with "
                             " {\n"
                             "      \"Name\": clusterName,\n"
                             "      \"Owner\": IAM username\n"
                             " }. ")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
                        "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the "
                             "leader. The syntax for each node type depends on the provisioner "
                             "used. For the aws provisioner this is the name of an EC2 instance "
                             "type followed by a colon and the price in dollar to bid for a spot "
                             "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
                             "--workers argument to specify how many workers of each node type "
                             "to create.")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type to "
                             "launch alongside the leader when the cluster is created. This can be "
                             "useful if running toil without auto-scaling but with need of more "
                             "hardware support")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader "
                             "instance.  This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker "
                             "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance', action='store_true',
                        default=False,
                        help="Disables sanity checking the existence of the docker image specified "
                             "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
                             "autoscaling.")
    parser.add_argument("--azureStorageCredentials", dest='azureStorageCredentials', type=str,
                        default=credential_file_path,
                        help="The location of the file containing the Azure storage credentials. If not specified,"
                             " the default file is used with Azure provisioning. Use 'None' to disable"
                             " the transfer of credentials.")
    parser.add_argument('--awsEc2ProfileArn', dest='awsEc2ProfileArn', default=None, type=str,
                        help="If provided, the specified ARN is used as the instance profile for EC2 instances."
                             "Useful for setting custom IAM profiles. If not specified, a new IAM role is created "
                             "by default with sufficient access to perform basic cluster operations.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)
    checkValidNodeTypes(config.provisioner, config.nodeTypes)
    checkValidNodeTypes(config.provisioner, config.leaderNodeType)


    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    applianceSelf(forceDockerAppliance=config.forceDockerAppliance)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError("The --nodeTypes and --workers options must be specified together,")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    # set owner (default to keyPairName if not given)
    owner = 'toil'
    if config.owner:
        owner = config.owner
    elif config.keyPairName:
        owner = config.keyPairName

    # Check to see if the user specified a zone. If not, see if one is stored in an environment variable.
    config.zone = config.zone or getZoneFromEnv(config.provisioner)

    if not config.zone:
        raise RuntimeError('Please provide a value for --zone or set a default in the TOIL_' +
                           config.provisioner.upper() + '_ZONE environment variable.')

    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone,
                             nodeStorage=config.nodeStorage)

    cluster.launchCluster(leaderNodeType=config.leaderNodeType,
                          leaderStorage=config.leaderStorage,
                          owner=owner,
                          keyName=config.keyPairName,
                          botoPath=config.botoPath,
                          userTags=tagsDict,
                          vpcSubnet=config.vpcSubnet,
                          publicKeyFile=config.publicKeyFile,
                          azureStorageCredentials=config.azureStorageCredentials,
                          awsEc2ProfileArn=config.awsEc2ProfileArn)

    for nodeType, workers in zip(nodeTypes, numNodes):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=False)
    for nodeType, workers, spotBid in zip(preemptableNodeTypes, numPreemptableNodes, spotBids):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=True,
                                           spotBid=spotBid)
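The node-type parsing above splits entries such as 'c3.8xlarge:0.42' into an instance type and a spot bid; entries without a colon are on-demand. The same logic as a small standalone helper (illustrative only, not part of Toil):

def parse_node_type(node_type_str):
    """Return (instanceType, spotBid); spotBid is None for on-demand node types."""
    name, _, bid = node_type_str.partition(':')
    return name, (float(bid) if bid else None)

assert parse_node_type('c3.8xlarge:0.42') == ('c3.8xlarge', 0.42)
assert parse_node_type('t2.medium') == ('t2.medium', None)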
Example #8
    def addNodes(self, nodeType, numNodes, preemptable):
        instanceType = ec2_instance_types[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self.nodeStorage)
        arn = self._getProfileARN(self.ctx)
        keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
        entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
        workerData = dict(role='worker',
                          image=applianceSelf(),
                          entrypoint=entryPoint,
                          sshKey=self.masterPublicKey,
                          args=workerArgs.format(ip=self.leaderIP,
                                                 preemptable=preemptable,
                                                 keyPath=keyPath))
        userData = awsUserData.format(**workerData)
        sgs = [
            sg for sg in self.ctx.ec2.get_all_security_groups()
            if sg.name == self.clusterName
        ]
        kwargs = {
            'key_name': self.keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': arn,
            'placement': getCurrentAWSZone()
        }
        kwargs["subnet_id"] = (self.subnetID if self.subnetID
                               else self._getClusterInstance(self.instanceMetaData).subnet_id)

        instancesLaunched = []

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                # After we start launching instances we want to ensure the full setup is done.
                # The biggest obstacle is AWS request throttling, so we retry on those errors at
                # every request in this method.
                if not preemptable:
                    logger.info('Launching %s non-preemptable nodes', numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self.ctx.ec2,
                        image_id=self._discoverAMI(self.ctx),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.info('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(self.spotBids[nodeType],
                                                      instanceType.name,
                                                      self.ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self.ctx.ec2,
                            price=self.spotBids[nodeType],
                            image_id=self._discoverAMI(self.ctx),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self.ctx.ec2, instancesLaunched)

        # request throttling retry happens internally to these two methods to ensure proper granularity
        AWSProvisioner._addTags(instancesLaunched, self.tags)
        self._propagateKey(instancesLaunched)

        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #9
    def launchCluster(self,
                      leaderNodeType,
                      leaderSpotBid,
                      nodeTypes,
                      preemptableNodeTypes,
                      keyName,
                      clusterName,
                      numWorkers=0,
                      numPreemptableWorkers=0,
                      spotBids=None,
                      userTags=None,
                      zone=None,
                      vpcSubnet=None,
                      leaderStorage=50,
                      nodeStorage=50):
        if self.config is None:
            self.nodeStorage = nodeStorage
        if userTags is None:
            userTags = {}
        ctx = self._buildContext(clusterName=clusterName, zone=zone)
        profileARN = self._getProfileARN(ctx)
        leaderInstanceType = ec2_instance_types[leaderNodeType]
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
        bdm = self._getBlockDeviceMapping(leaderInstanceType,
                                          rootVolSize=leaderStorage)
        self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
        leaderData = dict(role='leader',
                          image=applianceSelf(),
                          entrypoint='mesos-master',
                          sshKey=self.masterPublicKey,
                          args=leaderArgs.format(name=clusterName))
        userData = awsUserData.format(**leaderData)
        kwargs = {
            'key_name': keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': leaderNodeType,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': profileARN,
            'placement': zone
        }
        if vpcSubnet:
            kwargs["subnet_id"] = vpcSubnet
        if not leaderSpotBid:
            logger.info('Launching non-preemptable leader')
            instances = create_ondemand_instances(
                ctx.ec2,
                image_id=self._discoverAMI(ctx),
                spec=kwargs,
                num_instances=1)
            leader = instances[0]
        else:
            logger.info('Launching preemptable leader')
            # force generator to evaluate
            instances = list(
                create_spot_instances(ec2=ctx.ec2,
                                      price=leaderSpotBid,
                                      image_id=self._discoverAMI(ctx),
                                      tags={'clusterName': clusterName},
                                      spec=kwargs,
                                      num_instances=1))[0]
            leader = instances[0]

        wait_instances_running(ctx.ec2, [leader])
        self._waitForNode(leader, 'toil_leader')

        defaultTags = {'Name': clusterName, 'Owner': keyName}
        defaultTags.update(userTags)

        # If we are running launchCluster we need to save this data, as it won't be generated
        # from the instance metadata. This data is needed to launch worker nodes.
        self.leaderIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self.ctx = ctx
        if spotBids:
            self.spotBids = dict(zip(preemptableNodeTypes, spotBids))
        self.clusterName = clusterName
        self.keyName = keyName
        self.tags = leader.tags
        self.subnetID = leader.subnet_id
        # assuming that if the leader was launched without a spotbid then all workers
        # will be non-preemptable
        workersCreated = 0
        for nodeType, workers in zip(nodeTypes, numWorkers):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers,
                                            preemptable=False)
        for nodeType, workers in zip(preemptableNodeTypes,
                                     numPreemptableWorkers):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers,
                                            preemptable=True)
        logger.info('Added %d workers', workersCreated)

        return leader
Example #10
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument(
        "--leaderNodeType",
        dest="leaderNodeType",
        required=True,
        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument(
        "--keyPairName",
        dest='keyPairName',
        required=True,
        help="On AWS, the name of the AWS key pair to include on the instance."
        " On Google/GCE, this is the ssh key pair."
        " On Azure, this will be used as the owner tag.")
    parser.add_argument(
        "--publicKeyFile",
        dest='publicKeyFile',
        default="~/.ssh/id_rsa.pub",
        help="On Azure, the file"
        " containing the key pairs (the first key pair will be used).")
    parser.add_argument(
        "--boto",
        dest='botoPath',
        help="The path to the boto credentials directory. This is transferred "
        "to all nodes in order to access the AWS jobStore from non-AWS instances."
    )
    parser.add_argument(
        "-t",
        "--tag",
        metavar='NAME=VALUE',
        dest='tags',
        default=[],
        action='append',
        help="Tags are added to the AWS cluster for this node and all of its "
        "children. Tags are of the form:\n"
        " -t key1=value1 --tag key2=value2\n"
        "Multiple tags are allowed and each tag needs its own flag. By "
        "default the cluster is tagged with "
        " {\n"
        "      \"Name\": clusterName,\n"
        "      \"Owner\": IAM username\n"
        " }. ")
    parser.add_argument(
        "--vpcSubnet",
        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
        "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument(
        "--nodeTypes",
        dest='nodeTypes',
        default=None,
        type=str,
        help="Comma-separated list of node types to create while launching the "
        "leader. The syntax for each node type depends on the provisioner "
        "used. For the aws provisioner this is the name of an EC2 instance "
        "type followed by a colon and the price in dollar to bid for a spot "
        "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
        "--workers argument to specify how many workers of each node type "
        "to create.")
    parser.add_argument(
        "-w",
        "--workers",
        dest='workers',
        default=None,
        type=str,
        help=
        "Comma-separated list of the number of workers of each node type to "
        "launch alongside the leader when the cluster is created. This can be "
        "useful if running toil without auto-scaling but with need of more "
        "hardware support")
    parser.add_argument(
        "--leaderStorage",
        dest='leaderStorage',
        type=int,
        default=50,
        help="Specify the size (in gigabytes) of the root volume for the leader "
        "instance.  This is an EBS volume.")
    parser.add_argument(
        "--nodeStorage",
        dest='nodeStorage',
        type=int,
        default=50,
        help="Specify the size (in gigabytes) of the root volume for any worker "
        "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument(
        '--forceDockerAppliance',
        dest='forceDockerAppliance',
        action='store_true',
        default=False,
        help=
        "Disables sanity checking the existence of the docker image specified "
        "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
        "autoscaling.")
    parser.add_argument(
        "--azureStorageCredentials",
        dest='azureStorageCredentials',
        type=str,
        default=credential_file_path,
        help=
        "The location of the file containing the Azure storage credentials. If not specified,"
        " the default file is used with Azure provisioning. Use 'None' to disable"
        " the transfer of credentials.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)

    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    checkToilApplianceSelf = applianceSelf(
        forceDockerAppliance=config.forceDockerAppliance)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    leaderSpotBid = None
    if config.provisioner == 'aws':
        logger.info('Using aws provisioner.')
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            logger.error(
                'The aws extra must be installed to use this provisioner')
            raise
        provisioner = AWSProvisioner()
    elif config.provisioner == 'azure':
        try:
            from toil.provisioners.azure.azureProvisioner import AzureProvisioner
        except ImportError:
            raise RuntimeError(
                'The azure extra must be installed to use this provisioner')
        provisioner = AzureProvisioner()
    elif config.provisioner == 'gce':
        logger.info('Using a gce provisioner.')
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error(
                'The google extra must be installed to use this provisioner')
            raise
        provisioner = GCEProvisioner()
    else:
        assert False

    # Parse leader node type and spot bid
    parsedBid = config.leaderNodeType.split(':', 1)
    if len(config.leaderNodeType) != len(parsedBid[0]):
        leaderSpotBid = float(parsedBid[1])
        config.leaderNodeType = parsedBid[0]

    if (config.nodeTypes
            or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError(
            "The --nodeTypes and --workers options must be specified together,"
        )
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError(
                "List of node types must be the same length as the list of workers."
            )
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # Is a preemptable node
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    provisioner.launchCluster(
        leaderNodeType=config.leaderNodeType,
        leaderSpotBid=leaderSpotBid,
        nodeTypes=nodeTypes,
        preemptableNodeTypes=preemptableNodeTypes,
        numWorkers=numNodes,
        numPreemptableWorkers=numPreemptableNodes,
        keyName=config.keyPairName,
        botoPath=config.botoPath,
        clusterName=config.clusterName,
        spotBids=spotBids,
        userTags=tagsDict,
        zone=config.zone,
        leaderStorage=config.leaderStorage,
        nodeStorage=config.nodeStorage,
        vpcSubnet=config.vpcSubnet,
        publicKeyFile=config.publicKeyFile,
        azureStorageCredentials=config.azureStorageCredentials)
Example #11
    def launchCluster(self, instanceType, keyName, clusterName, workers=0,
                      spotBid=None, userTags=None, zone=None, vpcSubnet=None, leaderStorage=50, nodeStorage=50):
        # only use this node storage value if launchCluster is called from cluster utility
        if self.config is None:
            self.nodeStorage = nodeStorage
        if userTags is None:
            userTags = {}
        ctx = self._buildContext(clusterName=clusterName, zone=zone)
        profileARN = self._getProfileARN(ctx)
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
        bdm = self._getBlockDeviceMapping(ec2_instance_types[instanceType], rootVolSize=leaderStorage)
        self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
        leaderData = dict(role='leader',
                          image=applianceSelf(),
                          entrypoint='mesos-master',
                          sshKey=self.masterPublicKey,
                          args=leaderArgs.format(name=clusterName))
        userData = awsUserData.format(**leaderData)
        kwargs = {'key_name': keyName, 'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': instanceType,
                  'user_data': userData, 'block_device_map': bdm,
                  'instance_profile_arn': profileARN,
                  'placement': zone}
        if vpcSubnet:
            kwargs["subnet_id"] = vpcSubnet
        if not spotBid:
            logger.info('Launching non-preemptable leader')
            create_ondemand_instances(ctx.ec2, image_id=self._discoverAMI(ctx),
                                      spec=kwargs, num_instances=1)
        else:
            logger.info('Launching preemptable leader')
            # force generator to evaluate
            list(create_spot_instances(ec2=ctx.ec2,
                                       price=spotBid,
                                       image_id=self._discoverAMI(ctx),
                                       tags={'clusterName': clusterName},
                                       spec=kwargs,
                                       num_instances=1))
        leader = self._getLeader(clusterName=clusterName, wait=True, zone=zone)

        defaultTags = {'Name': clusterName, 'Owner': keyName}
        defaultTags.update(userTags)

        # If we are running launchCluster we need to save this data, as it won't be generated
        # from the instance metadata. This data is needed to launch worker nodes.
        self.leaderIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self.ctx = ctx
        self.spotBid = spotBid
        preemptable = True if spotBid else False
        self.instanceType[preemptable] = ec2_instance_types[instanceType]
        self.clusterName = clusterName
        self.keyName = keyName
        self.tags = leader.tags
        self.subnetID = leader.subnet_id
        if workers:
            # assuming that if the leader was launched without a spotbid then all workers
            # will be non-preemptable
            workersCreated = self.addNodes(workers, preemptable=bool(spotBid))
            logger.info('Added %d workers with %d workers requested', workersCreated, workers)

        return leader
Example #12
    def launchCluster(self,
                      instanceType,
                      keyName,
                      clusterName,
                      workers=0,
                      spotBid=None,
                      userTags=None,
                      zone=None,
                      vpcSubnet=None,
                      leaderStorage=50,
                      nodeStorage=50):
        # only use this node storage value if launchCluster is called from cluster utility
        if self.config is None:
            self.nodeStorage = nodeStorage
        if userTags is None:
            userTags = {}
        ctx = self._buildContext(clusterName=clusterName, zone=zone)
        profileARN = self._getProfileARN(ctx)
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
        bdm = self._getBlockDeviceMapping(ec2_instance_types[instanceType],
                                          rootVolSize=leaderStorage)
        self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
        leaderData = dict(role='leader',
                          image=applianceSelf(),
                          entrypoint='mesos-master',
                          sshKey=self.masterPublicKey,
                          args=leaderArgs.format(name=clusterName))
        userData = awsUserData.format(**leaderData)
        kwargs = {
            'key_name': keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': profileARN,
            'placement': zone
        }
        if vpcSubnet:
            kwargs["subnet_id"] = vpcSubnet
        if not spotBid:
            logger.info('Launching non-preemptable leader')
            create_ondemand_instances(ctx.ec2,
                                      image_id=self._discoverAMI(ctx),
                                      spec=kwargs,
                                      num_instances=1)
        else:
            logger.info('Launching preemptable leader')
            # force generator to evaluate
            list(
                create_spot_instances(ec2=ctx.ec2,
                                      price=spotBid,
                                      image_id=self._discoverAMI(ctx),
                                      tags={'clusterName': clusterName},
                                      spec=kwargs,
                                      num_instances=1))
        leader = self._getLeader(clusterName=clusterName, wait=True, zone=zone)

        defaultTags = {'Name': clusterName, 'Owner': keyName}
        defaultTags.update(userTags)

        # If we are running launchCluster we need to save this data, as it won't be generated
        # from the instance metadata. This data is needed to launch worker nodes.
        self.leaderIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self.ctx = ctx
        self.spotBid = spotBid
        preemptable = True if spotBid else False
        self.instanceType[preemptable] = ec2_instance_types[instanceType]
        self.clusterName = clusterName
        self.keyName = keyName
        self.tags = leader.tags
        self.subnetID = leader.subnet_id
        if workers:
            # assuming that if the leader was launched without a spotbid then all workers
            # will be non-preemptable
            workersCreated = self.addNodes(workers, preemptable=bool(spotBid))
            logger.info('Added %d workers with %d workers requested',
                        workersCreated, workers)

        return leader
Example #13
def main():
    parser = parser_with_common_options(provisioner_options=True,
                                        jobstore_option=False)
    parser.add_argument(
        "--leaderNodeType",
        dest="leaderNodeType",
        required=True,
        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument(
        "--keyPairName",
        dest='keyPairName',
        help="On AWS, the name of the AWS key pair to include on the instance."
        " On Google/GCE, this is the ssh key pair.")
    parser.add_argument(
        "--owner",
        dest='owner',
        help="The owner tag for all instances. If not given, the value in"
        " --keyPairName will be used if given.")
    parser.add_argument(
        "--boto",
        dest='botoPath',
        help="The path to the boto credentials directory. This is transferred "
        "to all nodes in order to access the AWS jobStore from non-AWS instances."
    )
    parser.add_argument(
        "-t",
        "--tag",
        metavar='NAME=VALUE',
        dest='tags',
        default=[],
        action='append',
        help="Tags are added to the AWS cluster for this node and all of its "
        "children. Tags are of the form:\n"
        " -t key1=value1 --tag key2=value2\n"
        "Multiple tags are allowed and each tag needs its own flag. By "
        "default the cluster is tagged with "
        " {\n"
        "      \"Name\": clusterName,\n"
        "      \"Owner\": IAM username\n"
        " }. ")
    parser.add_argument(
        "--vpcSubnet",
        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
        "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument(
        "--nodeTypes",
        dest='nodeTypes',
        default=None,
        type=str,
        help="Comma-separated list of node types to create while launching the "
        "leader. The syntax for each node type depends on the provisioner "
        "used. For the aws provisioner this is the name of an EC2 instance "
        "type followed by a colon and the price in dollar to bid for a spot "
        "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
        "--workers argument to specify how many workers of each node type "
        "to create.")
    parser.add_argument(
        "-w",
        "--workers",
        dest='workers',
        default=None,
        type=str,
        help=
        "Comma-separated list of the number of workers of each node type to "
        "launch alongside the leader when the cluster is created. This can be "
        "useful if running toil without auto-scaling but with need of more "
        "hardware support")
    parser.add_argument(
        "--leaderStorage",
        dest='leaderStorage',
        type=int,
        default=50,
        help="Specify the size (in gigabytes) of the root volume for the leader "
        "instance.  This is an EBS volume.")
    parser.add_argument(
        "--nodeStorage",
        dest='nodeStorage',
        type=int,
        default=50,
        help="Specify the size (in gigabytes) of the root volume for any worker "
        "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument(
        '--forceDockerAppliance',
        dest='forceDockerAppliance',
        action='store_true',
        default=False,
        help=
        "Disables sanity checking the existence of the docker image specified "
        "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
        "autoscaling.")
    parser.add_argument(
        '--awsEc2ProfileArn',
        dest='awsEc2ProfileArn',
        default=None,
        type=str,
        help=
        "If provided, the specified ARN is used as the instance profile for EC2 instances."
        "Useful for setting custom IAM profiles. If not specified, a new IAM role is created "
        "by default with sufficient access to perform basic cluster operations."
    )
    parser.add_argument(
        '--awsEc2ExtraSecurityGroupId',
        dest='awsEc2ExtraSecurityGroupIds',
        default=[],
        action='append',
        help=
        "Any additional security groups to attach to EC2 instances. Note that a security group "
        "with its name equal to the cluster name will always be created, thus ensure that "
        "the extra security groups do not have the same name as the cluster name."
    )
    options = parser.parse_args()
    set_logging_from_options(options)
    tags = create_tags_dict(options.tags) if options.tags else dict()

    worker_node_types = options.nodeTypes.split(
        ',') if options.nodeTypes else []
    worker_quantities = options.workers.split(',') if options.workers else []
    check_valid_node_types(options.provisioner,
                           worker_node_types + [options.leaderNodeType])

    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    applianceSelf(forceDockerAppliance=options.forceDockerAppliance)

    owner = options.owner or options.keyPairName or 'toil'

    # Check to see if the user specified a zone. If not, see if one is stored in an environment variable.
    options.zone = options.zone or os.environ.get(
        f'TOIL_{options.provisioner.upper()}_ZONE')

    if not options.zone:
        raise RuntimeError(
            f'Please provide a value for --zone or set a default in the '
            f'TOIL_{options.provisioner.upper()}_ZONE environment variable.')

    if (options.nodeTypes or
            options.workers) and not (options.nodeTypes and options.workers):
        raise RuntimeError(
            "The --nodeTypes and --workers options must be specified together."
        )

    if not len(worker_node_types) == len(worker_quantities):
        raise RuntimeError(
            "List of node types must be the same length as the list of workers."
        )

    cluster = cluster_factory(provisioner=options.provisioner,
                              clusterName=options.clusterName,
                              zone=options.zone,
                              nodeStorage=options.nodeStorage)

    cluster.launchCluster(
        leaderNodeType=options.leaderNodeType,
        leaderStorage=options.leaderStorage,
        owner=owner,
        keyName=options.keyPairName,
        botoPath=options.botoPath,
        userTags=tags,
        vpcSubnet=options.vpcSubnet,
        awsEc2ProfileArn=options.awsEc2ProfileArn,
        awsEc2ExtraSecurityGroupIds=options.awsEc2ExtraSecurityGroupIds)

    for worker_node_type, num_workers in zip(worker_node_types,
                                             worker_quantities):
        if ':' in worker_node_type:
            worker_node_type, bid = worker_node_type.split(':', 1)
            cluster.addNodes(nodeType=worker_node_type,
                             numNodes=int(num_workers),
                             preemptable=True,
                             spotBid=float(bid))
        else:
            cluster.addNodes(nodeType=worker_node_type,
                             numNodes=int(num_workers),
                             preemptable=False)
Example #14
    def launchCluster(self,
                      leaderNodeType,
                      keyName,
                      clusterName,
                      zone,
                      leaderStorage=50,
                      nodeStorage=50,
                      spotBid=None,
                      **kwargs):
        """
        Launches an Azure cluster using Ansible.
        A resource group is created for the cluster. All the virtual machines are created within this
        resource group.

        Cloud-config is used during VM creation to create directories and launch the appliance.
        """
        if spotBid:
            raise NotImplementedError(
                "Ansible does not support provisioning spot instances")

        if not self.isValidClusterName(clusterName):
            raise RuntimeError(
                "Invalid cluster name. See the Azure documentation for information "
                "on cluster naming conventions: "
                "https://docs.microsoft.com/en-us/azure/architecture/best-practices/naming-conventions"
            )
        self.clusterName = clusterName
        self.keyName = keyName
        self.region = zone
        self.nodeStorage = nodeStorage
        self.masterPublicKeyFile = kwargs['publicKeyFile']

        # Create the resource group for the cluster. This will fail if it already exists.
        ansibleArgs = {'resgrp': self.clusterName, 'region': self.region}
        try:
            self.callPlaybook(self.playbook['create-cluster'],
                              ansibleArgs,
                              wait=True)
        except RuntimeError:
            logger.info(
                "The cluster could not be created. Try deleting the cluster if it already exits."
            )
            raise

        # Azure VMs must be named, so we need to generate one. Instance names must
        # be composed of only alphanumeric characters, underscores, and hyphens
        # (see https://docs.microsoft.com/en-us/azure/architecture/best-practices/naming-conventions).
        instanceName = 'l' + str(uuid.uuid4())

        cloudConfigArgs = {
            'image': applianceSelf(),
            'role': "leader",
            'entrypoint': "mesos-master",
            '_args': leaderArgs.format(name=self.clusterName),
        }
        ansibleArgs = {
            'vmsize': leaderNodeType,
            'vmname': instanceName,
            'storagename': instanceName.replace('-', '')[:24],  # Azure limits the name to 24 characters, no dashes.
            'resgrp': self.clusterName,  # The resource group, which represents the cluster.
            'region': self.region,
            'role': "leader",
            'owner': self.keyName,  # Just a tag.
            'diskSize': str(leaderStorage),  # TODO: not implemented
            'publickeyfile': self.masterPublicKeyFile  # The user's public key, added to authorized_keys.
        }
        ansibleArgs['cloudconfig'] = self._cloudConfig(cloudConfigArgs)
        self.callPlaybook(self.playbook['create'], ansibleArgs, wait=True)

        logger.info('Launched non-preemptable leader')

        # IP available as soon as the playbook finishes
        leader = self._getNodes('leader')[0]
        self.leaderIP = leader.privateIP

        # Make sure leader container is up.
        self._waitForNode(leader.publicIP, 'toil_leader')

        # Transfer credentials
        containerUserPath = '/root/'
        storageCredentials = kwargs['azureStorageCredentials']
        if storageCredentials is not None:
            fullPathCredentials = os.path.expanduser(storageCredentials)
            if os.path.isfile(fullPathCredentials):
                self._rsyncNode(leader.publicIP,
                                [fullPathCredentials, ':' + containerUserPath],
                                applianceName='toil_leader')

        ansibleCredentials = '.azure/credentials'
        fullPathAnsibleCredentials = os.path.expanduser('~/' +
                                                        ansibleCredentials)
        if os.path.isfile(fullPathAnsibleCredentials):
            self._sshAppliance(leader.publicIP, 'mkdir', '-p',
                               containerUserPath + '.azure')
            self._rsyncNode(leader.publicIP, [
                fullPathAnsibleCredentials,
                ':' + containerUserPath + ansibleCredentials
            ],
                            applianceName='toil_leader')
        # Add workers
        workersCreated = 0
        for nodeType, workers in zip(kwargs['nodeTypes'],
                                     kwargs['numWorkers']):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers)
        logger.info('Added %d workers', workersCreated)
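The comment above notes that Azure instance names may only contain alphanumerics, underscores and hyphens, which is why a fresh UUID-based name is generated. A quick sketch of the same idea plus a validity check (the regex is an assumption, not Azure's official rule set):

import re
import uuid

name = 'l' + str(uuid.uuid4())                 # e.g. 'l3f9c2d7e-...'
assert re.match(r'^[A-Za-z0-9_-]+$', name)     # only alphanumerics, '_' and '-'
storage_name = name.replace('-', '')[:24]      # storage account names: at most 24 characters, no dashes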
Example #15
    def addNodes(self, nodeType, numNodes, preemptable):
        # If keys are rsynced, then the mesos-slave needs to be started after the keys have been
        # transferred. The waitForKey.sh script loops on the new VM until it finds the keyPath file, then it starts the
        # mesos-slave. If there are multiple keys to be transferred, then the last one to be transferred must be
        # set to keyPath.
        keyPath = ''
        entryPoint = 'mesos-slave'
        self.botoExists = False
        if self.botoPath is not None and os.path.exists(self.botoPath):
            entryPoint = "waitForKey.sh"
            keyPath = self.nodeBotoPath
            self.botoExists = True
        elif self.config and self.config.sseKey:
            entryPoint = "waitForKey.sh"
            keyPath = self.config.sseKey

        workerData = dict(role='worker',
                          dockerImage=applianceSelf(),
                          entrypoint=entryPoint,
                          sshKey=self.masterPublicKey,
                          dockerArgs=workerDockerArgs.format(
                              ip=self.leaderIP,
                              preemptable=preemptable,
                              keyPath=keyPath))

        #kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

        userData = self.gceUserDataWorker.format(**workerData)
        metadata = {'items': [{'key': 'user-data', 'value': userData}]}

        imageType = 'coreos-stable'
        sa_scopes = [{'scopes': ['compute', 'storage-full']}]

        # TODO:
        #  - bug in gce.py for ex_create_multiple_nodes (erroneously, doesn't allow image and disk to specified)
        #  - ex_create_multiple_nodes is limited to 1000 nodes
        #    - use a different function
        #    - or write a loop over the rest of this function, with 1K nodes max on each iteration

        if not preemptable:
            logger.info('Launching %s non-preemptable nodes', numNodes)
        else:
            logger.info('Launching %s preemptable nodes', numNodes)

        disk = {}
        disk['initializeParams'] = {
            'sourceImage': bytes('https://www.googleapis.com/compute/v1/projects/coreos-cloud/global'
                                 '/images/coreos-stable-1576-4-0-v20171206'),
            'diskSizeGb': self.nodeStorage
        }
        disk.update({
            'boot': True,
            # 'type': bytes('zones/us-central1-a/diskTypes/local-ssd'),  # 'PERSISTENT'
            #'mode': 'READ_WRITE',
            #'deviceName': clusterName,
            'autoDelete': True
        })
        #instancesLaunched = driver.ex_create_multiple_nodes(
        retries = 0
        workersCreated = 0
        # Try a few times to create the requested number of workers
        while numNodes - workersCreated > 0 and retries < 3:
            instancesLaunched = self.ex_create_multiple_nodes(
                '',
                nodeType,
                imageType,
                numNodes,
                location=self.zone,
                ex_service_accounts=sa_scopes,
                ex_metadata=metadata,
                ex_disks_gce_struct=[disk],
                description=self.tags,
                ex_preemptible=preemptable)
            self.instanceGroup.add_instances(instancesLaunched)
            failedWorkers = []
            for instance in instancesLaunched:
                if self._injectWorkerFiles(instance.public_ips[0]):
                    workersCreated += 1
                else:
                    failedWorkers.append(instance)
            if failedWorkers:
                logger.error("Terminating %d failed workers" %
                             len(failedWorkers))
                self.terminateNodes(failedWorkers)
            retries += 1

        logger.info('Launched %d new instance(s)', workersCreated)
        if numNodes != workersCreated:
            logger.error("Failed to launch %d worker(s)",
                         numNodes - workersCreated)
        return workersCreated
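The TODO above notes that ex_create_multiple_nodes is limited to 1000 nodes per call and suggests looping. A minimal sketch of the batching that would imply (generic arithmetic, not tied to libcloud):

def batch_sizes(total, max_per_call=1000):
    """Yield request sizes that sum to `total`, each no larger than `max_per_call`."""
    while total > 0:
        size = min(total, max_per_call)
        yield size
        total -= size

assert list(batch_sizes(2500)) == [1000, 1000, 500]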
Example #16
    def launchCluster(self,
                      leaderNodeType,
                      leaderSpotBid,
                      nodeTypes,
                      preemptableNodeTypes,
                      keyName,
                      clusterName,
                      numWorkers=0,
                      numPreemptableWorkers=0,
                      spotBids=None,
                      userTags=None,
                      zone=None,
                      vpcSubnet=None,
                      leaderStorage=50,
                      nodeStorage=50,
                      botoPath=None):
        if self.config is None:
            self.nodeStorage = nodeStorage
        if userTags is None:
            userTags = {}
        self.zone = zone
        self.clusterName = clusterName
        self.botoPath = botoPath
        self.keyName = keyName

        # GCE doesn't have a dictionary tags field; its tags field is just a list of strings.
        # Therefore, we dump the tags into the instance description instead.
        tags = {'Owner': keyName, 'clusterName': self.clusterName}
        tags.update(userTags)
        self.tags = json.dumps(tags)

        # TODO
        # - security group: just for a cluster identifier?
        # - Error thrown if cluster exists. Add an explicit check for an existing cluster? Racey though.

        leaderData = dict(role='leader',
                          dockerImage=applianceSelf(),
                          entrypoint='mesos-master',
                          dockerArgs=leaderDockerArgs.format(name=clusterName))
        userData = gceUserData.format(**leaderData)
        metadata = {'items': [{'key': 'user-data', 'value': userData}]}

        imageType = 'coreos-stable'
        sa_scopes = [{'scopes': ['compute', 'storage-full']}]

        driver = self._getDriver()

        # Throws an error if cluster exists
        self.instanceGroup = driver.ex_create_instancegroup(clusterName, zone)

        preemptable = False
        if leaderSpotBid:
            logger.info('Launching preemptable leader')
            preemptable = True
        else:
            logger.info('Launching non-preemptable leader')

        disk = {}
        disk['initializeParams'] = {
            'sourceImage': bytes('https://www.googleapis.com/compute/v1/projects/coreos-cloud'
                                 '/global/images/coreos-stable-1576-4-0-v20171206'),
            'diskSizeGb': leaderStorage
        }
        disk.update({
            'boot': True,
            # 'type': bytes('zones/us-central1-a/diskTypes/local-ssd'),  # 'PERSISTENT'
            #'mode': 'READ_WRITE',
            #'deviceName': clusterName,
            'autoDelete': True
        })
        name = 'l' + bytes(uuid.uuid4())
        leader = driver.create_node(name,
                                    leaderNodeType,
                                    imageType,
                                    location=zone,
                                    ex_service_accounts=sa_scopes,
                                    ex_metadata=metadata,
                                    ex_subnetwork=vpcSubnet,
                                    ex_disks_gce_struct=[disk],
                                    description=self.tags,
                                    ex_preemptible=preemptable)

        self.instanceGroup.add_instances([leader])

        logger.info('... toil_leader is running')

        # if we are running launch cluster we need to save this data as it won't be generated
        # from the metadata. This data is needed to launch worker nodes.
        self.leaderIP = leader.private_ips[0]
        if spotBids:
            self.spotBids = dict(zip(preemptableNodeTypes, spotBids))

        #TODO: get subnetID
        #self.subnetID = leader.subnet_id

        if (not self._waitForNode(leader.public_ips[0], 'toil_leader')
                or not self._copySshKeys(leader.public_ips[0], keyName)
                or not self._injectFile(
                    leader.public_ips[0], self.credentialsPath,
                    GoogleJobStore.nodeServiceAccountJson, 'toil_leader') or
            (self.botoPath
             and not self._injectFile(leader.public_ips[0], self.botoPath,
                                      self.nodeBotoPath, 'toil_leader'))):

            raise RuntimeError("Failed to start leader")

        # assuming that if the leader was launched without a spotbid then all workers
        # will be non-preemptable
        workersCreated = 0
        for nodeType, workers in zip(nodeTypes, numWorkers):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers,
                                            preemptable=False)
        for nodeType, workers in zip(preemptableNodeTypes,
                                     numPreemptableWorkers):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers,
                                            preemptable=True)
        logger.info('Added %d workers', workersCreated)

        return leader
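Because GCE only offers a plain string for the instance description, the tags dictionary above is serialised with json.dumps and can be recovered later with json.loads. A tiny round-trip illustration with made-up values:

import json

tags = {'Owner': 'my-keypair', 'clusterName': 'my-toil-cluster'}
description = json.dumps(tags)           # what gets stored on the instance
assert json.loads(description) == tags   # recoverable when listing nodes later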