Example #1
    def _getScript(self):
        def restartScript():
            from toil.job import Job
            import argparse
            import os

            def f0(job):
                if 'FAIL' in os.environ:
                    raise RuntimeError('failed on purpose')

            if __name__ == '__main__':
                parser = argparse.ArgumentParser()
                Job.Runner.addToilOptions(parser)
                options = parser.parse_args()
                rootJob = Job.wrapJobFn(f0,
                                        cores=0.5,
                                        memory='50 M',
                                        disk='50 M')
                Job.Runner.startToil(rootJob, options)

        script = dedent('\n'.join(getsource(restartScript).split('\n')[1:]))
        tempfile_path = '/tmp/temp-or-ary.txt'
        with open(tempfile_path, 'w') as f:
            # write the script to a local temp file so it can be injected into the leader appliance
            f.write(script)
        cluster = clusterFactory(provisioner='aws',
                                 clusterName=self.clusterName)
        leader = cluster.getLeader()
        self.sshUtil([
            'mkdir', '-p', self.scriptDir
        ])  # hot deployment does not appear to work from the usual /tmp or /home directories
        leader.injectFile(tempfile_path, self.scriptName, 'toil_leader')
        if os.path.exists(tempfile_path):
            os.remove(tempfile_path)
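
The self.sshUtil helper used above (and again in Example #5) is not shown in these snippets. A minimal sketch, assuming it simply forwards the command list to the leader appliance over ssh and that clusterFactory is imported as in the examples; the body is an assumption, not the actual Toil implementation:

    def sshUtil(self, command):
        # Hypothetical sketch: run a command list on the cluster leader appliance.
        # Assumes the cluster already exists and is reachable under self.clusterName.
        leader = clusterFactory(provisioner='aws',
                                clusterName=self.clusterName).getLeader()
        leader.sshAppliance(*command)
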
Example #2
    def _getScript(self):
        def userScript():
            from toil.job import Job
            from toil.common import Toil

            # Because this is the only job in the pipeline and because it is preemptable,
            # there will be no non-preemptable jobs. The non-preemptable scaler will therefore
            # not request any nodes initially. And since we made it impossible for the
            # preemptable scaler to allocate any nodes (using an abnormally low spot bid),
            # we will observe a deficit of preemptable nodes that the non-preemptable scaler will
            # compensate for by spinning up non-preemptable nodes instead.
            #
            def job(job, disk='10M', cores=1, memory='10M', preemptable=True):
                pass

            if __name__ == '__main__':
                options = Job.Runner.getDefaultArgumentParser().parse_args()
                with Toil(options) as toil:
                    if toil.config.restart:
                        toil.restart()
                    else:
                        toil.start(Job.wrapJobFn(job))

        script = dedent('\n'.join(getsource(userScript).split('\n')[1:]))
        # use the appliance ssh method instead of sshUtil so we can specify the input parameter
        cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
        leader = cluster.getLeader()
        leader.sshAppliance('tee', '/home/userScript.py', input=script)
Example #3
    def launchCluster(self):
        from toil.lib.ec2 import wait_instances_running
        from boto.ec2.blockdevicemapping import BlockDeviceType
        self.createClusterUtil(args=[
            '--leaderStorage',
            str(self.requestedLeaderStorage), '--nodeTypes', ",".join(
                self.instanceTypes), '-w', ",".join(self.numWorkers),
            '--nodeStorage',
            str(self.requestedNodeStorage)
        ])

        self.cluster = clusterFactory(provisioner='aws',
                                      clusterName=self.clusterName)
        nodes = self.cluster._getNodesInCluster(both=True)
        nodes.sort(key=lambda x: x.launch_time)
        # assume the leader is the oldest instance and therefore sorts first
        workers = nodes[1:]
        # test that two worker nodes were created
        self.assertEqual(2, len(workers))
        # test that workers have expected storage size
        # just use the first worker
        worker = workers[0]
        worker = next(wait_instances_running(self.cluster._ctx.ec2, [worker]))
        rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
        self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
        rootVolume = self.cluster._ctx.ec2.get_all_volumes(
            volume_ids=[rootBlockDevice.volume_id])[0]
        self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
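
Example #3 relies on self.createClusterUtil to actually launch the cluster. A minimal sketch, assuming it shells out to `toil launch-cluster` in the same way Example #11 does; the zone, leader node type and use of TOIL_AWS_KEYNAME here are placeholders, not taken from the examples:

    def createClusterUtil(self, args=None):
        # Hypothetical sketch: invoke `toil launch-cluster` with the test's cluster
        # name plus any extra args (e.g. --leaderStorage, --nodeTypes, -w, --nodeStorage).
        import os
        import subprocess
        args = args or []
        command = ['toil', 'launch-cluster',
                   '--provisioner=aws',
                   '--zone=us-west-2a',                                # placeholder zone
                   '--keyPairName=' + os.environ['TOIL_AWS_KEYNAME'],  # placeholder key pair
                   '--leaderNodeType=t2.medium',                       # placeholder instance type
                   self.clusterName] + args
        subprocess.check_call(command)
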
Example #4
    def _getScript(self):
        def restartScript():
            from toil.job import Job
            import argparse
            import os

            def f0(job):
                if 'FAIL' in os.environ:
                    raise RuntimeError('failed on purpose')

            if __name__ == '__main__':
                parser = argparse.ArgumentParser()
                Job.Runner.addToilOptions(parser)
                options = parser.parse_args()
                rootJob = Job.wrapJobFn(f0,
                                        cores=0.5,
                                        memory='50 M',
                                        disk='50 M')
                Job.Runner.startToil(rootJob, options)

        script = dedent('\n'.join(getsource(restartScript).split('\n')[1:]))
        # use the appliance ssh method instead of sshUtil so we can specify the input parameter
        cluster = clusterFactory(provisioner='aws',
                                 clusterName=self.clusterName)
        leader = cluster.getLeader()
        leader.sshAppliance('tee', self.scriptName, input=script)
Example #5
    def _test(self, preemptableJobs=False):
        """Does the work of the testing.  Many features' tests are thrown in here in no particular order."""
        self.launchCluster()
        # get the leader so we know the IP address - we don't need to wait since create cluster
        # already ensures the leader is running
        self.cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
        self.leader = self.cluster.getLeader()
        self.sshUtil(['mkdir', '-p', self.scriptDir])  # hot deployment does not appear to work from the usual /tmp or /home directories

        assert len(self.getMatchingRoles()) == 1
        # --never-download prevents silent upgrades to pip, wheel and setuptools
        venv_command = ['virtualenv', '--system-site-packages', '--python', exactPython, '--never-download', '/home/venv']
        self.sshUtil(venv_command)

        upgrade_command = ['/home/venv/bin/pip', 'install', 'setuptools==28.7.1', 'pyyaml==3.12']
        self.sshUtil(upgrade_command)

        self._getScript()

        toilOptions = [self.jobStore,
                       '--batchSystem=mesos',
                       '--workDir=/var/lib/toil',
                       '--clean=always',
                       '--retryCount=2',
                       '--clusterStats=/tmp/t/',
                       '--logDebug',
                       '--logFile=/tmp/t/sort.log',
                       '--provisioner=aws']

        toilOptions.extend(['--nodeTypes=' + ",".join(self.instanceTypes),
                            '--maxNodes=' + ",".join(self.numWorkers)])
        if preemptableJobs:
            toilOptions.extend(['--defaultPreemptable'])

        self._runScript(toilOptions)

        assert len(self.getMatchingRoles()) == 1

        # check stats
        self.sshUtil(['/home/venv/bin/python', '-c', 'import json; import os; '
                      'json.load(open("/tmp/t/" + [f for f in os.listdir("/tmp/t/") if f.endswith(".json")].pop()))'])

        from boto.exception import EC2ResponseError
        volumeID = self.getRootVolID()
        self.cluster.destroyCluster()
        for attempt in range(6):
            # https://github.com/BD2KGenomics/toil/issues/1567
            # retry this for up to 1 minute until the volume disappears
            try:
                self.cluster._ctx.ec2.get_all_volumes(volume_ids=[volumeID])
                time.sleep(10)
            except EC2ResponseError as e:
                if e.status == 400 and 'InvalidVolume.NotFound' in e.code:
                    break
                else:
                    raise
        else:
            self.fail('Volume with ID %s was not cleaned up properly' % volumeID)

        assert len(self.getMatchingRoles()) == 0
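
_test() above delegates to two hooks that concrete test classes fill in: _getScript (Examples #1, #2 and #4) and _runScript. A minimal sketch of a matching _runScript, assuming it simply runs the uploaded script with the virtualenv Python created earlier in _test(); this body is an assumption, not shown in the examples:

    def _runScript(self, toilOptions):
        # Hypothetical sketch: execute the user script uploaded by _getScript on the
        # leader, inside the virtualenv, passing through the assembled Toil options.
        runCommand = ['/home/venv/bin/python', self.scriptName]
        runCommand.extend(toilOptions)
        self.sshUtil(runCommand)
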
Example #6
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    cluster.destroyCluster()
Example #7
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--insecure", dest='insecure', action='store_true', required=False,
                        help="Temporarily disable strict host key checking.")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    command = config.args if config.args else ['bash']
    cluster.getLeader().sshAppliance(*command, strict=not config.insecure, tty=sys.stdin.isatty())
Example #8
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--insecure",
                        action='store_true',
                        help="Temporarily disable strict host key checking.")
    parser.add_argument("--sshOption",
                        dest='sshOptions',
                        default=[],
                        action='append',
                        help="Pass an additional option to the SSH command.")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    command = config.args if config.args else ['bash']
    cluster.getLeader().sshAppliance(*command,
                                     strict=not config.insecure,
                                     tty=sys.stdin.isatty(),
                                     sshOptions=config.sshOptions)
Example #9
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--insecure",
                        dest='insecure',
                        action='store_true',
                        required=False,
                        help="Temporarily disable strict host key checking.")
    parser.add_argument(
        "args",
        nargs=argparse.REMAINDER,
        help="Arguments to pass to"
        "`rsync`. Takes any arguments that rsync accepts. Specify the"
        " remote with a colon. For example, to upload `example.py`,"
        " specify `toil rsync-cluster -p aws test-cluster example.py :`."
        "\nOr, to download a file from the remote:, `toil rsync-cluster"
        " -p aws test-cluster :example.py .`")
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    cluster.getLeader().coreRsync(args=config.args, strict=not config.insecure)
Example #10
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument(
        "--leaderNodeType",
        dest="leaderNodeType",
        required=True,
        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument(
        "--keyPairName",
        dest='keyPairName',
        help="On AWS, the name of the AWS key pair to include on the instance."
        " On Google/GCE, this is the ssh key pair.")
    parser.add_argument(
        "--owner",
        dest='owner',
        help="The owner tag for all instances. If not given, the value in"
        " --keyPairName will be used if given.")
    parser.add_argument(
        "--boto",
        dest='botoPath',
        help="The path to the boto credentials directory. This is transferred "
        "to all nodes in order to access the AWS jobStore from non-AWS instances."
    )
    parser.add_argument(
        "-t",
        "--tag",
        metavar='NAME=VALUE',
        dest='tags',
        default=[],
        action='append',
        help="Tags are added to the AWS cluster for this node and all of its "
        "children. Tags are of the form:\n"
        " -t key1=value1 --tag key2=value2\n"
        "Multiple tags are allowed and each tag needs its own flag. By "
        "default the cluster is tagged with "
        " {\n"
        "      \"Name\": clusterName,\n"
        "      \"Owner\": IAM username\n"
        " }. ")
    parser.add_argument(
        "--vpcSubnet",
        help="VPC subnet ID to launch cluster in. Uses default subnet if not "
        "specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument(
        "--nodeTypes",
        dest='nodeTypes',
        default=None,
        type=str,
        help="Comma-separated list of node types to create while launching the "
        "leader. The syntax for each node type depends on the provisioner "
        "used. For the aws provisioner this is the name of an EC2 instance "
        "type followed by a colon and the price in dollar to bid for a spot "
        "instance, for example 'c3.8xlarge:0.42'. Must also provide the "
        "--workers argument to specify how many workers of each node type "
        "to create.")
    parser.add_argument(
        "-w",
        "--workers",
        dest='workers',
        default=None,
        type=str,
        help=
        "Comma-separated list of the number of workers of each node type to "
        "launch alongside the leader when the cluster is created. This can be "
        "useful if running toil without auto-scaling but with need of more "
        "hardware support")
    parser.add_argument(
        "--leaderStorage",
        dest='leaderStorage',
        type=int,
        default=50,
        help="Specify the size (in gigabytes) of the root volume for the leader "
        "instance.  This is an EBS volume.")
    parser.add_argument(
        "--nodeStorage",
        dest='nodeStorage',
        type=int,
        default=50,
        help="Specify the size (in gigabytes) of the root volume for any worker "
        "instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument(
        '--forceDockerAppliance',
        dest='forceDockerAppliance',
        action='store_true',
        default=False,
        help=
        "Disables sanity checking the existence of the docker image specified "
        "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for "
        "autoscaling.")
    parser.add_argument(
        '--awsEc2ProfileArn',
        dest='awsEc2ProfileArn',
        default=None,
        type=str,
        help=
        "If provided, the specified ARN is used as the instance profile for EC2 instances."
        "Useful for setting custom IAM profiles. If not specified, a new IAM role is created "
        "by default with sufficient access to perform basic cluster operations."
    )
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)
    checkValidNodeTypes(config.provisioner, config.nodeTypes)
    checkValidNodeTypes(config.provisioner, config.leaderNodeType)

    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    applianceSelf(forceDockerAppliance=config.forceDockerAppliance)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    if (config.nodeTypes
            or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError(
            "The --nodeTypes and --workers options must be specified together,"
        )
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError(
                "List of node types must be the same length as the list of workers."
            )
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # a spot bid was given, so this is a preemptable node type
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    # set owner (default to keyPairName if not given)
    owner = 'toil'
    if config.owner:
        owner = config.owner
    elif config.keyPairName:
        owner = config.keyPairName

    # Check to see if the user specified a zone. If not, see if one is stored in an environment variable.
    config.zone = config.zone or getZoneFromEnv(config.provisioner)

    if not config.zone:
        raise RuntimeError(
            'Please provide a value for --zone or set a default in the TOIL_' +
            config.provisioner.upper() + '_ZONE environment variable.')

    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone,
                             nodeStorage=config.nodeStorage)

    cluster.launchCluster(leaderNodeType=config.leaderNodeType,
                          leaderStorage=config.leaderStorage,
                          owner=owner,
                          keyName=config.keyPairName,
                          botoPath=config.botoPath,
                          userTags=tagsDict,
                          vpcSubnet=config.vpcSubnet,
                          awsEc2ProfileArn=config.awsEc2ProfileArn)

    for nodeType, workers in zip(nodeTypes, numNodes):
        cluster.addNodes(nodeType=nodeType,
                         numNodes=workers,
                         preemptable=False)
    for nodeType, workers, spotBid in zip(preemptableNodeTypes,
                                          numPreemptableNodes, spotBids):
        cluster.addNodes(nodeType=nodeType,
                         numNodes=workers,
                         preemptable=True,
                         spotBid=spotBid)
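
main() above relies on a createTagsDict helper (not shown in these snippets) to turn the repeated -t NAME=VALUE flags into a dictionary. A minimal sketch under that assumption; the real helper may differ:

def createTagsDict(tagList):
    # Hypothetical sketch: parse ['key1=value1', 'key2=value2'] into
    # {'key1': 'value1', 'key2': 'value2'}, splitting on the first '=' only.
    tagsDict = {}
    for tag in tagList:
        key, value = tag.split('=', 1)
        tagsDict[key] = value
    return tagsDict
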
Example #11
    def testAWSProvisionerUtils(self):
        """
        Runs a number of the cluster utilities in sequence.

        Launches a cluster with custom tags.
        Verifies the tags exist.
        ssh's into the cluster.
        Passes some special-character strings over ssh and checks that they arrive intact.
        Makes certain that TOIL_WORKDIR is set as expected in the ssh'ed cluster.
        Rsyncs a file and verifies it exists on the leader.
        Destroys the cluster.

        """
        # TODO: Run these for the other clouds.
        clusterName = 'cluster-utils-test' + str(uuid.uuid4())
        keyName = os.getenv('TOIL_AWS_KEYNAME')

        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner

            # launch master with an assortment of custom tags
            system([
                self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t',
                'key2=value2', '--tag', 'key3=value3',
                '--leaderNodeType=m3.medium', '--keyPairName=' + keyName,
                clusterName, '--provisioner=aws', '--zone=us-west-2a',
                '--logLevel=DEBUG'
            ])

            cluster = clusterFactory(provisioner='aws',
                                     clusterName=clusterName)
            leader = cluster.getLeader()

            # check that the leader carries the appropriate tags
            tags = {
                'key1': 'value1',
                'key2': 'value2',
                'key3': 'value3',
                'Name': clusterName,
                'Owner': keyName
            }
            for key in tags:
                self.assertEqual(tags[key], leader.tags.get(key))

            # Test strict host key checking
            # Doesn't work when run locally.
            if keyName == 'jenkins@jenkins-master':
                try:
                    leader.sshAppliance(strict=True)
                except RuntimeError:
                    pass
                else:
                    self.fail(
                        "Host key verification passed where it should have failed"
                    )

            # Add the host key to known_hosts so that the rest of the tests can
            # pass without choking on the verification prompt.
            leader.sshAppliance('bash',
                                strict=True,
                                sshOptions=['-oStrictHostKeyChecking=no'])

            system([
                self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName
            ])

            testStrings = [
                "'foo'", '"foo"', '  foo', '$PATH', '"', "'", '\\', '| cat',
                '&& cat', '; cat'
            ]
            for test in testStrings:
                logger.debug('Testing SSH with special string: %s', test)
                compareTo = "import sys; assert sys.argv[1]==%r" % test
                leader.sshAppliance('python', '-', test, input=compareTo)

            try:
                leader.sshAppliance('nonsenseShouldFail')
            except RuntimeError:
                pass
            else:
                self.fail(
                    'The remote command failed silently where it should have raised an error'
                )

            leader.sshAppliance(
                'python', '-c',
                "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'"
            )

            # `toil rsync-cluster`
            # Testing special characters - string.punctuation
            fname = '!"#$%&\'()*+,-.;<=>:\ ?@[\\\\]^_`{|}~'
            testData = os.urandom(3 * (10**6))
            with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
                relpath = os.path.basename(tmpFile.name)
                tmpFile.write(testData)
                tmpFile.flush()
                # Upload file to leader
                leader.coreRsync(args=[tmpFile.name, ":"])
                # Ensure file exists
                leader.sshAppliance("test", "-e", relpath)
            tmpDir = tempfile.mkdtemp()
            # Download the file again and make sure it's the same file
            # `--protect-args` needed because remote bash chokes on special characters
            leader.coreRsync(args=["--protect-args", ":" + relpath, tmpDir])
            with open(os.path.join(tmpDir, relpath), "rb") as f:
                self.assertEqual(
                    f.read(), testData,
                    "Downloaded file does not match original file")
        finally:
            system([
                self.toilMain, 'destroy-cluster', '--provisioner=aws',
                clusterName
            ])
            try:
                shutil.rmtree(tmpDir)
            except NameError:
                pass