def _getScript(self):
    def restartScript():
        from toil.job import Job
        import argparse
        import os

        def f0(job):
            if 'FAIL' in os.environ:
                raise RuntimeError('failed on purpose')

        if __name__ == '__main__':
            parser = argparse.ArgumentParser()
            Job.Runner.addToilOptions(parser)
            options = parser.parse_args()
            rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M')
            Job.Runner.startToil(rootJob, options)

    script = dedent('\n'.join(getsource(restartScript).split('\n')[1:]))
    tempfile_path = '/tmp/temp-or-ary.txt'
    with open(tempfile_path, 'w') as f:
        # use the appliance ssh method instead of sshutil so we can specify the input param
        f.write(script)
    cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
    leader = cluster.getLeader()
    # hot deploy doesn't seem to be permitted in normal /tmp or /home
    self.sshUtil(['mkdir', '-p', self.scriptDir])
    leader.injectFile(tempfile_path, self.scriptName, 'toil_leader')
    if os.path.exists(tempfile_path):
        os.remove(tempfile_path)
def _getScript(self):
    def userScript():
        from toil.job import Job
        from toil.common import Toil

        # Because this is the only job in the pipeline and because it is preemptable,
        # there will be no non-preemptable jobs. The non-preemptable scaler will therefore
        # not request any nodes initially. And since we made it impossible for the
        # preemptable scaler to allocate any nodes (using an abnormally low spot bid),
        # we will observe a deficit of preemptable nodes that the non-preemptable scaler
        # will compensate for by spinning up non-preemptable nodes instead.
        def job(job, disk='10M', cores=1, memory='10M', preemptable=True):
            pass

        if __name__ == '__main__':
            options = Job.Runner.getDefaultArgumentParser().parse_args()
            with Toil(options) as toil:
                if toil.config.restart:
                    toil.restart()
                else:
                    toil.start(Job.wrapJobFn(job))

    script = dedent('\n'.join(getsource(userScript).split('\n')[1:]))
    # use the appliance ssh method instead of sshutil so we can specify the input param
    cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
    leader = cluster.getLeader()
    leader.sshAppliance('tee', '/home/userScript.py', input=script)
def launchCluster(self):
    from toil.lib.ec2 import wait_instances_running
    from boto.ec2.blockdevicemapping import BlockDeviceType
    # pass the requested node storage (not the leader storage) to --nodeStorage,
    # since the assertion below checks worker volumes against requestedNodeStorage
    self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
                                 '--nodeTypes', ",".join(self.instanceTypes),
                                 '-w', ",".join(self.numWorkers),
                                 '--nodeStorage', str(self.requestedNodeStorage)])
    self.cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
    nodes = self.cluster._getNodesInCluster(both=True)
    nodes.sort(key=lambda x: x.launch_time)
    # assuming that the leader is the first node launched
    workers = nodes[1:]
    # test that two worker nodes were created
    self.assertEqual(2, len(workers))
    # test that workers have the expected storage size; just check the first worker
    worker = workers[0]
    worker = next(wait_instances_running(self.cluster._ctx.ec2, [worker]))
    rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
    self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
    rootVolume = self.cluster._ctx.ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0]
    self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
def _getScript(self):
    def restartScript():
        from toil.job import Job
        import argparse
        import os

        def f0(job):
            if 'FAIL' in os.environ:
                raise RuntimeError('failed on purpose')

        if __name__ == '__main__':
            parser = argparse.ArgumentParser()
            Job.Runner.addToilOptions(parser)
            options = parser.parse_args()
            rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M')
            Job.Runner.startToil(rootJob, options)

    script = dedent('\n'.join(getsource(restartScript).split('\n')[1:]))
    # use the appliance ssh method instead of sshutil so we can specify the input param
    cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
    leader = cluster.getLeader()
    leader.sshAppliance('tee', self.scriptName, input=script)
def _test(self, preemptableJobs=False):
    """Does the work of the testing. Many features' tests are thrown in here in no particular order."""
    self.launchCluster()
    # get the leader so we know the IP address - we don't need to wait since create cluster
    # already ensures the leader is running
    self.cluster = clusterFactory(provisioner='aws', clusterName=self.clusterName)
    self.leader = self.cluster.getLeader()
    # hot deploy doesn't seem to be permitted in normal /tmp or /home
    self.sshUtil(['mkdir', '-p', self.scriptDir])

    assert len(self.getMatchingRoles()) == 1
    # --never-download prevents silent upgrades to pip, wheel and setuptools
    venv_command = ['virtualenv', '--system-site-packages', '--python', exactPython,
                    '--never-download', '/home/venv']
    self.sshUtil(venv_command)

    upgrade_command = ['/home/venv/bin/pip', 'install', 'setuptools==28.7.1', 'pyyaml==3.12']
    self.sshUtil(upgrade_command)

    self._getScript()

    toilOptions = [self.jobStore,
                   '--batchSystem=mesos',
                   '--workDir=/var/lib/toil',
                   '--clean=always',
                   '--retryCount=2',
                   '--clusterStats=/tmp/t/',
                   '--logDebug',
                   '--logFile=/tmp/t/sort.log',
                   '--provisioner=aws']
    toilOptions.extend(['--nodeTypes=' + ",".join(self.instanceTypes),
                        '--maxNodes=' + ",".join(self.numWorkers)])
    if preemptableJobs:
        toilOptions.extend(['--defaultPreemptable'])

    self._runScript(toilOptions)

    assert len(self.getMatchingRoles()) == 1

    # check that the cluster stats JSON written to /tmp/t/ parses cleanly
    self.sshUtil(['/home/venv/bin/python', '-c',
                  'import json; import os; '
                  'json.load(open("/tmp/t/" + [f for f in os.listdir("/tmp/t/") if f.endswith(".json")].pop()))'])

    from boto.exception import EC2ResponseError
    volumeID = self.getRootVolID()
    self.cluster.destroyCluster()
    for attempt in range(6):
        # https://github.com/BD2KGenomics/toil/issues/1567
        # retry this for up to 1 minute until the volume disappears
        try:
            self.cluster._ctx.ec2.get_all_volumes(volume_ids=[volumeID])
            time.sleep(10)
        except EC2ResponseError as e:
            if e.status == 400 and 'InvalidVolume.NotFound' in e.code:
                break
            else:
                raise
    else:
        self.fail('Volume with ID %s was not cleaned up properly' % volumeID)

    assert len(self.getMatchingRoles()) == 0
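# For readability, the stats check in _test above, expanded (illustrative only;
# the test actually runs it as a one-liner via sshUtil on the leader):
#
#   import json, os
#   statsFiles = [f for f in os.listdir("/tmp/t/") if f.endswith(".json")]
#   json.load(open("/tmp/t/" + statsFiles.pop()))  # must parse as valid JSON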
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    cluster.destroyCluster()
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--insecure", dest='insecure', action='store_true', required=False,
                        help="Temporarily disable strict host key checking.")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    command = config.args if config.args else ['bash']
    cluster.getLeader().sshAppliance(*command, strict=not config.insecure,
                                     tty=sys.stdin.isatty())
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--insecure", action='store_true',
                        help="Temporarily disable strict host key checking.")
    parser.add_argument("--sshOption", dest='sshOptions', default=[], action='append',
                        help="Pass an additional option to the SSH command.")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    command = config.args if config.args else ['bash']
    cluster.getLeader().sshAppliance(*command, strict=not config.insecure,
                                     tty=sys.stdin.isatty(),
                                     sshOptions=config.sshOptions)
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--insecure", dest='insecure', action='store_true', required=False,
                        help="Temporarily disable strict host key checking.")
    parser.add_argument("args", nargs=argparse.REMAINDER,
                        help="Arguments to pass to `rsync`. Takes any arguments that rsync"
                             " accepts. Specify the remote with a colon. For example, to"
                             " upload `example.py`, specify `toil rsync-cluster -p aws"
                             " test-cluster example.py :`.\nOr, to download a file from the"
                             " remote: `toil rsync-cluster -p aws test-cluster :example.py .`")
    config = parseBasicOptions(parser)
    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone)
    cluster.getLeader().coreRsync(args=config.args, strict=not config.insecure)
def main():
    parser = getBasicOptionParser()
    parser = addBasicProvisionerOptions(parser)
    parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True,
                        help="Non-preemptable node type to use for the cluster leader.")
    parser.add_argument("--keyPairName", dest='keyPairName',
                        help="On AWS, the name of the AWS key pair to include on the instance."
                             " On Google/GCE, this is the ssh key pair.")
    parser.add_argument("--owner", dest='owner',
                        help="The owner tag for all instances. If not given, the value in"
                             " --keyPairName will be used if given.")
    parser.add_argument("--boto", dest='botoPath',
                        help="The path to the boto credentials directory. This is transferred"
                             " to all nodes in order to access the AWS jobStore from non-AWS"
                             " instances.")
    parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags',
                        default=[], action='append',
                        help="Tags are added to the AWS cluster for this node and all of its"
                             " children. Tags are of the form:\n"
                             "  -t key1=value1 --tag key2=value2\n"
                             "Multiple tags are allowed and each tag needs its own flag. By"
                             " default the cluster is tagged with\n"
                             "  {\n"
                             "    \"Name\": clusterName,\n"
                             "    \"Owner\": IAM username\n"
                             "  }.")
    parser.add_argument("--vpcSubnet",
                        help="VPC subnet ID to launch cluster in. Uses default subnet if not"
                             " specified. This subnet needs to have auto assign IPs turned on.")
    parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str,
                        help="Comma-separated list of node types to create while launching the"
                             " leader. The syntax for each node type depends on the provisioner"
                             " used. For the aws provisioner this is the name of an EC2 instance"
                             " type followed by a colon and the price in dollars to bid for a"
                             " spot instance, for example 'c3.8xlarge:0.42'. Must also provide"
                             " the --workers argument to specify how many workers of each node"
                             " type to create.")
    parser.add_argument("-w", "--workers", dest='workers', default=None, type=str,
                        help="Comma-separated list of the number of workers of each node type"
                             " to launch alongside the leader when the cluster is created. This"
                             " can be useful if running toil without auto-scaling but with a"
                             " need for more hardware support.")
    parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for the leader"
                             " instance. This is an EBS volume.")
    parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50,
                        help="Specify the size (in gigabytes) of the root volume for any worker"
                             " instances created when using the -w flag. This is an EBS volume.")
    parser.add_argument('--forceDockerAppliance', dest='forceDockerAppliance',
                        action='store_true', default=False,
                        help="Disables sanity checking the existence of the docker image"
                             " specified by TOIL_APPLIANCE_SELF, which Toil uses to provision"
                             " mesos for autoscaling.")
    parser.add_argument('--awsEc2ProfileArn', dest='awsEc2ProfileArn', default=None, type=str,
                        help="If provided, the specified ARN is used as the instance profile"
                             " for EC2 instances. Useful for setting custom IAM profiles. If"
                             " not specified, a new IAM role is created by default with"
                             " sufficient access to perform basic cluster operations.")
    config = parseBasicOptions(parser)
    tagsDict = None if config.tags is None else createTagsDict(config.tags)
    checkValidNodeTypes(config.provisioner, config.nodeTypes)
    checkValidNodeTypes(config.provisioner, config.leaderNodeType)

    # checks the validity of TOIL_APPLIANCE_SELF before proceeding
    applianceSelf(forceDockerAppliance=config.forceDockerAppliance)

    spotBids = []
    nodeTypes = []
    preemptableNodeTypes = []
    numNodes = []
    numPreemptableNodes = []
    if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers):
        raise RuntimeError("The --nodeTypes and --workers options must be specified together.")
    if config.nodeTypes:
        nodeTypesList = config.nodeTypes.split(",")
        numWorkersList = config.workers.split(",")
        if not len(nodeTypesList) == len(numWorkersList):
            raise RuntimeError("List of node types must be the same length as the list of workers.")
        for nodeTypeStr, num in zip(nodeTypesList, numWorkersList):
            parsedBid = nodeTypeStr.split(':', 1)
            if len(nodeTypeStr) != len(parsedBid[0]):
                # a spot bid was included, so this is a preemptable node type
                preemptableNodeTypes.append(parsedBid[0])
                spotBids.append(float(parsedBid[1]))
                numPreemptableNodes.append(int(num))
            else:
                nodeTypes.append(nodeTypeStr)
                numNodes.append(int(num))

    # set owner (default to keyPairName if not given)
    owner = 'toil'
    if config.owner:
        owner = config.owner
    elif config.keyPairName:
        owner = config.keyPairName

    # Check to see if the user specified a zone. If not, see if one is stored in an environment variable.
    config.zone = config.zone or getZoneFromEnv(config.provisioner)

    if not config.zone:
        raise RuntimeError('Please provide a value for --zone or set a default in the TOIL_' +
                           config.provisioner.upper() + '_ZONE environment variable.')

    cluster = clusterFactory(provisioner=config.provisioner,
                             clusterName=config.clusterName,
                             zone=config.zone,
                             nodeStorage=config.nodeStorage)

    cluster.launchCluster(leaderNodeType=config.leaderNodeType,
                          leaderStorage=config.leaderStorage,
                          owner=owner,
                          keyName=config.keyPairName,
                          botoPath=config.botoPath,
                          userTags=tagsDict,
                          vpcSubnet=config.vpcSubnet,
                          awsEc2ProfileArn=config.awsEc2ProfileArn)

    for nodeType, workers in zip(nodeTypes, numNodes):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=False)
    for nodeType, workers, spotBid in zip(preemptableNodeTypes, numPreemptableNodes, spotBids):
        cluster.addNodes(nodeType=nodeType, numNodes=workers, preemptable=True, spotBid=spotBid)
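# A minimal, self-contained sketch of the nodeType:spotBid parsing used above, for
# illustration only. _parseNodeTypeSpec is a hypothetical helper, not part of Toil's
# CLI; it mirrors the split(':', 1) logic in main(): a node type with a colon and a
# trailing bid is preemptable, one without is not.
def _parseNodeTypeSpec(nodeTypeStr):
    # 'c3.8xlarge:0.42' -> preemptable instance type with a spot bid in dollars
    # 't2.medium'       -> non-preemptable instance type, no bid
    instanceType, _, bid = nodeTypeStr.partition(':')
    return (instanceType, float(bid)) if bid else (instanceType, None)

assert _parseNodeTypeSpec('c3.8xlarge:0.42') == ('c3.8xlarge', 0.42)
assert _parseNodeTypeSpec('t2.medium') == ('t2.medium', None)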
def testAWSProvisionerUtils(self):
    """
    Runs a number of the cluster utilities in sequence.

    Launches a cluster with custom tags.
    Verifies the tags exist.
    ssh's into the cluster.
    Exercises SSH argument quoting with an assortment of special strings.
    Makes certain that TOIL_WORKDIR is set as expected in the ssh'ed cluster.
    Rsyncs a file and verifies it exists on the leader.
    Destroys the cluster.

    :return:
    """
    # TODO: Run these for the other clouds.
    clusterName = 'cluster-utils-test' + str(uuid.uuid4())
    keyName = os.getenv('TOIL_AWS_KEYNAME')

    try:
        from toil.provisioners.aws.awsProvisioner import AWSProvisioner

        # launch leader with an assortment of custom tags
        system([self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t', 'key2=value2',
                '--tag', 'key3=value3', '--leaderNodeType=m3.medium',
                '--keyPairName=' + keyName, clusterName,
                '--provisioner=aws', '--zone=us-west-2a', '--logLevel=DEBUG'])

        cluster = clusterFactory(provisioner='aws', clusterName=clusterName)
        leader = cluster.getLeader()

        # check that the leader carries the appropriate tags
        tags = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3',
                'Name': clusterName, 'Owner': keyName}
        for key in tags:
            self.assertEqual(tags[key], leader.tags.get(key))

        # Test strict host key checking.
        # Doesn't work when run locally.
        if keyName == 'jenkins@jenkins-master':
            try:
                leader.sshAppliance(strict=True)
            except RuntimeError:
                pass
            else:
                self.fail("Host key verification passed where it should have failed")

        # Add the host key to known_hosts so that the rest of the tests can
        # pass without choking on the verification prompt.
        leader.sshAppliance('bash', strict=True, sshOptions=['-oStrictHostKeyChecking=no'])

        system([self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName])

        testStrings = ["'foo'", '"foo"', ' foo', '$PATH', '"', "'", '\\', '| cat', '&& cat', '; cat']
        for test in testStrings:
            logger.debug('Testing SSH with special string: %s', test)
            compareTo = "import sys; assert sys.argv[1]==%r" % test
            leader.sshAppliance('python', '-', test, input=compareTo)

        try:
            leader.sshAppliance('nonsenseShouldFail')
        except RuntimeError:
            pass
        else:
            self.fail('The remote command failed silently where it should have raised an error')

        leader.sshAppliance('python', '-c',
                            "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'")

        # `toil rsync-cluster`
        # Testing special characters - string.punctuation
        fname = '!"#$%&\'()*+,-.;<=>:\ ?@[\\\\]^_`{|}~'
        testData = os.urandom(3 * (10**6))
        with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
            relpath = os.path.basename(tmpFile.name)
            tmpFile.write(testData)
            tmpFile.flush()
            # Upload file to leader
            leader.coreRsync(args=[tmpFile.name, ":"])
            # Ensure file exists
            leader.sshAppliance("test", "-e", relpath)
        tmpDir = tempfile.mkdtemp()
        # Download the file again and make sure it's the same file
        # `--protect-args` needed because remote bash chokes on special characters
        leader.coreRsync(args=["--protect-args", ":" + relpath, tmpDir])
        # read in binary mode, since testData is bytes from os.urandom
        with open(os.path.join(tmpDir, relpath), "rb") as f:
            self.assertEqual(f.read(), testData, "Downloaded file does not match original file")
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
        try:
            shutil.rmtree(tmpDir)
        except NameError:
            pass
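# For reference, the cluster-utility CLI sequence the test above exercises, expressed
# as shell commands. The key pair and zone values are the ones the test uses;
# 'my-cluster' and 'example.py' are placeholder names (the rsync example follows the
# rsync-cluster help text):
#
#   toil launch-cluster -t key1=value1 --leaderNodeType=m3.medium \
#       --keyPairName="$TOIL_AWS_KEYNAME" --provisioner=aws --zone=us-west-2a my-cluster
#   toil ssh-cluster --provisioner=aws my-cluster
#   toil rsync-cluster --provisioner=aws my-cluster example.py :
#   toil destroy-cluster --provisioner=aws my-cluster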