def shutdown(self, driver):
    """
    Kill every task recorded in self.runningTasks, then run resource and worker cleanup.

    :param driver: the executor driver; it is only forwarded to killTask() here
    """
    log.critical("Shutting down executor...")
    # Iterate over a snapshot of the task IDs rather than the live dict view. If the task
    # table is modified while tasks are being killed (whether it is depends on code not
    # visible here), iterating the live view on Python 3 would raise
    # "RuntimeError: dictionary changed size during iteration". The dict values are not
    # needed because killTask() is given the task ID.
    for taskId in list(self.runningTasks):
        self.killTask(driver, taskId)
    Resource.cleanSystem()
    AbstractBatchSystem.workerCleanup(self.workerCleanupInfo)
    log.critical("Executor shut down")
def shutdown(self, driver):
    """
    Kill every task recorded in self.runningTasks, then run resource and worker cleanup.

    :param driver: the executor driver; it is only forwarded to killTask() here
    """
    log.critical('Shutting down executor ...')
    # Iterate over a snapshot of the task IDs rather than the live keys() view. If the task
    # table is modified while tasks are being killed (whether it is depends on code not
    # visible here), iterating the live view on Python 3 would raise
    # "RuntimeError: dictionary changed size during iteration".
    for taskId in list(self.runningTasks):
        self.killTask(driver, taskId)
    Resource.cleanSystem()
    BatchSystemSupport.workerCleanup(self.workerCleanupInfo)
    log.critical('... executor shut down.')
def __init__(self):
    """
    Initialise the executor with an empty task table and prepare the resource system.
    """
    super(MesosExecutor, self).__init__()
    # Neither of these is known at construction time; both start out unset.
    self.address = None
    self.workerCleanupInfo = None
    # No tasks are running yet.
    self.runningTasks = dict()
    # NOTE(review): presumably guards process creation, going by the name -- confirm.
    self.popenLock = threading.Lock()
    Resource.prepareSystem()
def __init__(self):
    """
    Initialise the executor with an empty task table, prepare the resource system and make
    sure TOIL_WORKDIR is set in the environment.
    """
    super(MesosExecutor, self).__init__()
    # Neither of these is known at construction time; both start out unset.
    self.address = None
    self.workerCleanupInfo = None
    # No tasks are running yet.
    self.runningTasks = dict()
    # NOTE(review): presumably guards process creation, going by the name -- confirm.
    self.popenLock = threading.Lock()
    Resource.prepareSystem()
    # Setting this value at this point will ensure that the toil workflow directory will go
    # to the mesos sandbox if the user hasn't specified --workDir on the command line. An
    # empty value is treated the same as a missing one.
    configuredWorkDir = os.environ.get('TOIL_WORKDIR')
    if not configuredWorkDir:
        os.environ['TOIL_WORKDIR'] = os.getcwd()
def executor():
    """
    Main function of the _toil_kubernetes_executor entrypoint.

    Runs inside the Toil container. Responsible for setting up the user script and running the
    command for the job (which may in turn invoke the Toil worker entrypoint).

    Expects exactly one command line argument: a base64-encoded pickled dict with the keys
    'command' and 'environment' and, optionally, 'userScript'.

    Never returns normally; it always ends in sys.exit(). The status is 1 if the argument is
    missing or cannot be decoded, otherwise it is the exit status of the job's command.
    """
    logging.basicConfig(level=logging.DEBUG)
    logger.debug("Starting executor")

    if len(sys.argv) != 2:
        logger.error('Executor requires exactly one base64-encoded argument')
        sys.exit(1)

    # Take in a base64-encoded pickled dict as our first argument and decode it
    try:
        # Make sure to encode the text arguments to bytes before base 64 decoding.
        # NOTE(review): pickle.loads() can execute arbitrary code from its input. This
        # assumes the argument is always produced by a trusted launcher -- confirm.
        job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8')))
    except Exception:
        # Only catch real errors; KeyboardInterrupt and SystemExit are left to propagate.
        # logger.exception() attaches the active traceback for us.
        logger.exception('Exception while unpickling task: ')
        sys.exit(1)

    # Set JTRES_ROOT and other global state needed for resource
    # downloading/deployment to work.
    logger.debug('Preparing system for resource download')
    Resource.prepareSystem()

    if 'userScript' in job:
        job['userScript'].register()

    logger.debug("Invoking command: '%s'", job['command'])

    # Construct the job's environment: our own environment, overridden by the job's.
    jobEnv = dict(os.environ, **job['environment'])
    logger.debug('Using environment variables: %s', jobEnv.keys())

    # Start the child process in its own process group
    child = subprocess.Popen(job['command'],
                             preexec_fn=os.setpgrp,
                             shell=True,
                             env=jobEnv)

    # Reproduce child's exit code
    sys.exit(child.wait())
def executor() -> None:
    """
    Main function of the _toil_contained_executor entrypoint.

    Runs inside the Toil container. Responsible for setting up the user script and running the
    command for the job (which may in turn invoke the Toil worker entrypoint).

    Expects exactly one command line argument: a base64-encoded pickled dict with the key
    'command' and, optionally, 'environment' and 'userScript'.

    Never returns normally; it always ends in sys.exit(). The status is the exit status of the
    job's command if the command ran to completion, and EXIT_STATUS_UNAVAILABLE_VALUE
    otherwise.
    """
    configure_root_logger()
    set_log_level("DEBUG")
    logger.debug("Starting executor")

    # If we don't manage to run the child, what should our exit code be?
    exit_code = EXIT_STATUS_UNAVAILABLE_VALUE

    if len(sys.argv) != 2:
        logger.error('Executor requires exactly one base64-encoded argument')
        sys.exit(exit_code)

    # Take in a base64-encoded pickled dict as our first argument and decode it
    try:
        # Make sure to encode the text arguments to bytes before base 64 decoding.
        # NOTE(review): pickle.loads() can execute arbitrary code from its input. This
        # assumes the argument is always produced by a trusted launcher -- confirm.
        job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8')))
    except Exception:
        # Only catch real errors; KeyboardInterrupt and SystemExit are left to propagate.
        # logger.exception() attaches the active traceback for us.
        logger.exception('Exception while unpickling task: ')
        sys.exit(exit_code)

    if 'environment' in job:
        # Adopt the job environment into the executor.
        # This lets us use things like TOIL_WORKDIR when figuring out how to talk to other
        # executors.
        logger.debug('Adopting environment: %s', job['environment'].keys())
        os.environ.update(job['environment'])

    # Set JTRES_ROOT and other global state needed for resource
    # downloading/deployment to work.
    # TODO: Every worker downloads resources independently.
    # We should have a way to share a resource directory.
    logger.debug('Preparing system for resource download')
    Resource.prepareSystem()
    try:
        if 'userScript' in job:
            job['userScript'].register()

        # Start the child process in its own process group. It inherits os.environ, which
        # already includes the job's environment at this point.
        logger.debug("Invoking command: '%s'", job['command'])
        child = subprocess.Popen(job['command'],
                                 preexec_fn=os.setpgrp,
                                 shell=True)

        # Reproduce child's exit code
        exit_code = child.wait()
    except BaseException:
        # Deliberately as broad as possible: this prints a traceback for us, since the
        # exit() in the finally clause below replaces whatever exception is in flight and
        # would otherwise bypass the normal way of getting one.
        logger.exception('Encountered exception running child')
    finally:
        logger.debug('Cleaning up resources')
        # TODO: Change resource system to use a shared resource directory for everyone.
        # Then move this into worker cleanup somehow
        Resource.cleanSystem()
        logger.debug('Shutting down')
        sys.exit(exit_code)
def _test(self, module_name, shouldBelongToToil=False, expectedContents=None):
    """
    Take the named module through the resource life cycle: save it to a mocked job store,
    register and look up the resulting resource, localize the module from a mocked download
    and finally globalize it again.

    :param module_name: name of the module to build a ModuleDescriptor for
    :param shouldBelongToToil: the expected value of the descriptor's belongsToToil attribute
    :param expectedContents: if not None, the exact set of member names the saved ZIP archive
           must contain
    """
    module = ModuleDescriptor.forModule(module_name)
    # Assert basic attributes and properties
    self.assertEqual(module.belongsToToil, shouldBelongToToil)
    self.assertEqual(module.name, module_name)
    if shouldBelongToToil:
        self.assertTrue(module.dirPath.endswith('/src'))

    # Before the module is saved as a resource, localize() and globalize() are identity
    # methods. This should log warnings.
    self.assertIs(module.localize(), module)
    self.assertIs(module.globalize(), module)
    # Create a mock job store ...
    jobStore = MagicMock()
    # ... to generate a fake URL for the resource ...
    url = 'file://foo.zip'
    jobStore.getSharedPublicUrl.return_value = url
    # ... and save the resource to it.
    resource = module.saveAsResourceTo(jobStore)
    # Ensure that the URL generation method is actually called, ...
    jobStore.getSharedPublicUrl.assert_called_once_with(resource.pathHash)
    # ... and ensure that writeSharedFileStream is called.
    jobStore.writeSharedFileStream.assert_called_once_with(resource.pathHash,
                                                           isProtected=False)
    # Now it gets a bit complicated: Ensure that the context manager returned by the
    # jobStore's writeSharedFileStream() method is entered and that the file handle yielded
    # by the context manager is written to once with the zipped source tree from which
    # 'toil.resource' was originally imported. Keep the zipped tree around such that we can
    # mock the download later.
    file_handle = jobStore.writeSharedFileStream.return_value.__enter__.return_value
    # The first 0 index selects the first call of write(), the second 0 selects positional
    # instead of keyword arguments, and the third 0 selects the first positional, i.e. the
    # contents. This is a bit brittle since it assumes that all the data is written in a
    # single call to write(). If more calls are made we can easily concatenate them.
    zipFile = file_handle.write.call_args_list[0][0][0]
    # The magic header for ZIP files. The data is wrapped in BytesIO below, so it has to be
    # bytes on Python 3, where bytes.startswith() rejects a str prefix.
    self.assertTrue(zipFile.startswith(b'PK'))
    # Check contents if requested
    if expectedContents is not None:
        with ZipFile(BytesIO(zipFile)) as _zipFile:
            self.assertEqual(set(_zipFile.namelist()), expectedContents)
    self.assertEqual(resource.url, url)
    # Now we're on the worker. Prepare the storage for localized resources
    Resource.prepareSystem()
    try:
        # Register the resource for subsequent lookup.
        resource.register()
        # Lookup the resource and ensure that the result is equal to but not the same as the
        # original resource. Lookup will also be used when we localize the module that was
        # originally used to create the resource.
        localResource = Resource.lookup(module._resourcePath)
        self.assertEqual(resource, localResource)
        self.assertIsNot(resource, localResource)
        # Now show that we can localize the module using the registered resource. Set up a
        # mock urlopen() that yields the zipped tree ...
        mock_urlopen = MagicMock()
        mock_urlopen.return_value.read.return_value = zipFile
        with patch('toil.resource.urlopen', mock_urlopen):
            # ... and use it to download and unpack the resource
            localModule = module.localize()
        # The name should be equal between original and localized resource ...
        self.assertEqual(module.name, localModule.name)
        # ... but the directory should be different.
        self.assertNotEqual(module.dirPath, localModule.dirPath)
        # Show that we can 'undo' localization. This is necessary when the user script's jobs
        # are invoked on the worker where they generate more child jobs.
        self.assertEqual(localModule.globalize(), module)
    finally:
        # Undo prepareSystem() even if an assertion above fails, so that one test case does
        # not leave resource state behind for the next.
        Resource.cleanSystem()
def executor():
    """
    Main function of the _toil_kubernetes_executor entrypoint.

    Runs inside the Toil container. Responsible for setting up the user script and running the
    command for the job (which may in turn invoke the Toil worker entrypoint).

    Expects exactly one command line argument: a base64-encoded pickled dict with the keys
    'command' and 'workerCleanupInfo' and, optionally, 'environment' and 'userScript'.

    Never returns normally; it always ends in sys.exit(). The status is the exit status of the
    job's command if the command ran to completion, and EXIT_STATUS_UNAVAILABLE_VALUE
    otherwise.
    """
    logging.basicConfig(level=logging.DEBUG)
    logger.debug("Starting executor")

    # If we don't manage to run the child, what should our exit code be?
    exit_code = EXIT_STATUS_UNAVAILABLE_VALUE

    if len(sys.argv) != 2:
        logger.error('Executor requires exactly one base64-encoded argument')
        sys.exit(exit_code)

    # Take in a base64-encoded pickled dict as our first argument and decode it
    try:
        # Make sure to encode the text arguments to bytes before base 64 decoding.
        # NOTE(review): pickle.loads() can execute arbitrary code from its input. This
        # assumes the argument is always produced by a trusted launcher -- confirm.
        job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8')))
    except Exception:
        # Only catch real errors; KeyboardInterrupt and SystemExit are left to propagate.
        # logger.exception() attaches the active traceback for us.
        logger.exception('Exception while unpickling task: ')
        sys.exit(exit_code)

    if 'environment' in job:
        # Adopt the job environment into the executor.
        # This lets us use things like TOIL_WORKDIR when figuring out how to talk to other
        # executors.
        logger.debug('Adopting environment: %s', job['environment'].keys())
        os.environ.update(job['environment'])

    # Set JTRES_ROOT and other global state needed for resource
    # downloading/deployment to work.
    # TODO: Every worker downloads resources independently.
    # We should have a way to share a resource directory.
    logger.debug('Preparing system for resource download')
    Resource.prepareSystem()
    try:
        if 'userScript' in job:
            job['userScript'].register()

        # We need to tell other workers in this workflow not to do cleanup now that
        # we are here, or else wait for them to finish. So get the cleanup info
        # that knows where the work dir is.
        cleanupInfo = job['workerCleanupInfo']

        # Join a Last Process Standing arena, so we know which process should be
        # responsible for cleanup.
        # We need to use the real workDir, not just the override from cleanupInfo.
        # This needs to happen after the environment is applied.
        arena = LastProcessStandingArena(Toil.getToilWorkDir(cleanupInfo.workDir),
                                         cleanupInfo.workflowID + '-kube-executor')
        arena.enter()
        try:
            # Start the child process in its own process group. It inherits os.environ,
            # which already includes the job's environment at this point.
            logger.debug("Invoking command: '%s'", job['command'])
            child = subprocess.Popen(job['command'],
                                     preexec_fn=os.setpgrp,
                                     shell=True)

            # Reproduce child's exit code
            exit_code = child.wait()
        finally:
            for _ in arena.leave():
                # We are the last concurrent executor to finish.
                # Do batch system cleanup.
                logger.debug('Cleaning up worker')
                BatchSystemSupport.workerCleanup(cleanupInfo)
    except BaseException:
        # Deliberately as broad as possible: the exit() in the finally clause below replaces
        # whatever exception is in flight, so without this handler a failure anywhere above
        # would vanish without a traceback.
        logger.exception('Encountered exception running child')
    finally:
        logger.debug('Cleaning up resources')
        # TODO: Change resource system to use a shared resource directory for everyone.
        # Then move this into the last-process-standing cleanup
        Resource.cleanSystem()
        logger.debug('Shutting down')
        sys.exit(exit_code)
def shutdown(self, driver):
    """
    Kill every task recorded in self.runningTasks, then run resource cleanup.

    :param driver: the executor driver; it is only forwarded to killTask() here
    """
    log.critical("Shutting down executor...")
    # Iterate over a snapshot of the task IDs rather than the live dict view. If the task
    # table is modified while tasks are being killed (whether it is depends on code not
    # visible here), iterating the live view on Python 3 would raise
    # "RuntimeError: dictionary changed size during iteration". The dict values are not
    # needed because killTask() is given the task ID.
    for taskId in list(self.runningTasks):
        self.killTask(driver, taskId)
    Resource.cleanSystem()
    log.critical("Executor shut down")
def __init__(self):
    """
    Initialise the executor with an empty task table and prepare the resource system.
    """
    super(MesosExecutor, self).__init__()
    # No tasks are running yet.
    self.runningTasks = dict()
    # NOTE(review): presumably guards process creation, going by the name -- confirm.
    self.popenLock = threading.Lock()
    Resource.prepareSystem()
def _test(self, module_name, shouldBelongToToil=False, expectedContents=None,
          allowExtraContents=True):
    """
    Take the named module through the resource life cycle: save it to a mocked job store,
    register and look up the resulting resource, localize the module from a mocked download
    and finally globalize it again.

    :param module_name: name of the module to build a ModuleDescriptor for
    :param shouldBelongToToil: the expected value of the descriptor's belongsToToil attribute
    :param expectedContents: if not None, the set of member names expected in the saved ZIP
           archive
    :param allowExtraContents: if True, the archive may contain members in addition to
           expectedContents; if False, the member names must match expectedContents exactly
    """
    module = ModuleDescriptor.forModule(module_name)
    # Assert basic attributes and properties
    self.assertEqual(module.belongsToToil, shouldBelongToToil)
    self.assertEqual(module.name, module_name)
    if shouldBelongToToil:
        self.assertTrue(module.dirPath.endswith('/src'))

    # Before the module is saved as a resource, localize() and globalize() are identity
    # methods. This should log warnings.
    self.assertIs(module.localize(), module)
    self.assertIs(module.globalize(), module)
    # Create a mock job store ...
    jobStore = MagicMock()
    # ... to generate a fake URL for the resource ...
    url = 'file://foo.zip'
    jobStore.getSharedPublicUrl.return_value = url
    # ... and save the resource to it.
    resource = module.saveAsResourceTo(jobStore)
    # Ensure that the URL generation method is actually called, ...
    jobStore.getSharedPublicUrl.assert_called_once_with(resource.pathHash)
    # ... and ensure that writeSharedFileStream is called.
    jobStore.writeSharedFileStream.assert_called_once_with(resource.pathHash,
                                                           isProtected=False)
    # Now it gets a bit complicated: Ensure that the context manager returned by the
    # jobStore's writeSharedFileStream() method is entered and that the file handle yielded
    # by the context manager is written to once with the zipped source tree from which
    # 'toil.resource' was originally imported. Keep the zipped tree around such that we can
    # mock the download later.
    file_handle = jobStore.writeSharedFileStream.return_value.__enter__.return_value
    # The first 0 index selects the first call of write(), the second 0 selects positional
    # instead of keyword arguments, and the third 0 selects the first positional, i.e. the
    # contents. This is a bit brittle since it assumes that all the data is written in a
    # single call to write(). If more calls are made we can easily concatenate them.
    zipFile = file_handle.write.call_args_list[0][0][0]
    # The magic header for ZIP files. The data is wrapped in BytesIO below, so it has to be
    # bytes on Python 3, where bytes.startswith() rejects a str prefix.
    self.assertTrue(zipFile.startswith(b'PK'))
    # Check contents if requested
    if expectedContents is not None:
        with ZipFile(BytesIO(zipFile)) as _zipFile:
            actualContents = set(_zipFile.namelist())
            if allowExtraContents:
                self.assertTrue(actualContents.issuperset(expectedContents))
            else:
                self.assertEqual(actualContents, expectedContents)
    self.assertEqual(resource.url, url)
    # Now we're on the worker. Prepare the storage for localized resources
    Resource.prepareSystem()
    try:
        # Register the resource for subsequent lookup.
        resource.register()
        # Lookup the resource and ensure that the result is equal to but not the same as the
        # original resource. Lookup will also be used when we localize the module that was
        # originally used to create the resource.
        localResource = Resource.lookup(module._resourcePath)
        self.assertEqual(resource, localResource)
        self.assertIsNot(resource, localResource)
        # Now show that we can localize the module using the registered resource. Set up a
        # mock urlopen() that yields the zipped tree ...
        mock_urlopen = MagicMock()
        mock_urlopen.return_value.read.return_value = zipFile
        with patch('toil.resource.urlopen', mock_urlopen):
            # ... and use it to download and unpack the resource
            localModule = module.localize()
        # The name should be equal between original and localized resource ...
        self.assertEqual(module.name, localModule.name)
        # ... but the directory should be different.
        self.assertNotEqual(module.dirPath, localModule.dirPath)
        # Show that we can 'undo' localization. This is necessary when the user script's jobs
        # are invoked on the worker where they generate more child jobs.
        self.assertEqual(localModule.globalize(), module)
    finally:
        Resource.cleanSystem()
def createBatchSystem(config, jobStore=None, userScript=None):
    """
    Creates an instance of the batch system specified in the given config. If a job store and
    a user script are given then the user script can be hot deployed into the workflow. If a
    job store but no user script is given, a previously stored user script resource is read
    back from the job store, if there is one.

    :param toil.common.Config config: the current configuration

    :param toil.jobStores.abstractJobStore.AbstractJobStore jobStore: an instance of a jobStore

    :param ModuleDescriptor userScript: a handle to the Python module defining the root job

    :return: an instance of a concrete subclass of AbstractBatchSystem

    :rtype: batchSystems.abstractBatchSystem.AbstractBatchSystem

    :raises RuntimeError: if config.batchSystem names no known batch system, or if caching is
            enabled but the selected batch system does not support worker cleanup
    """
    kwargs = dict(config=config,
                  maxCores=config.maxCores,
                  maxMemory=config.maxMemory,
                  maxDisk=config.maxDisk)
    batchSystemName = config.batchSystem
    # The batch system modules are imported lazily, so that only the selected one is loaded.
    if batchSystemName == 'parasol':
        from toil.batchSystems.parasol import ParasolBatchSystem
        batchSystemClass = ParasolBatchSystem
    elif batchSystemName in ('single_machine', 'singleMachine'):
        from toil.batchSystems.singleMachine import SingleMachineBatchSystem
        batchSystemClass = SingleMachineBatchSystem
    elif batchSystemName in ('gridengine', 'gridEngine'):
        from toil.batchSystems.gridengine import GridengineBatchSystem
        batchSystemClass = GridengineBatchSystem
    elif batchSystemName in ('lsf', 'LSF'):
        from toil.batchSystems.lsf import LSFBatchSystem
        batchSystemClass = LSFBatchSystem
    elif batchSystemName in ('mesos', 'Mesos'):
        from toil.batchSystems.mesos.batchSystem import MesosBatchSystem
        batchSystemClass = MesosBatchSystem
        kwargs['masterAddress'] = config.mesosMasterAddress
    elif batchSystemName in ('slurm', 'Slurm'):
        from toil.batchSystems.slurm import SlurmBatchSystem
        batchSystemClass = SlurmBatchSystem
    else:
        raise RuntimeError('Unrecognised batch system: %s' % config.batchSystem)

    if not config.disableCaching and not batchSystemClass.supportsWorkerCleanup():
        raise RuntimeError('%s currently does not support shared caching. Set the '
                           '--disableCaching flag if you want to '
                           'use this batch system.' % config.batchSystem)
    # Turn the CamelCase class name into lower case words for the log message. Both the
    # pattern and the replacement are raw strings: '\g' is not a valid escape sequence in a
    # regular string literal.
    logger.info('Using the %s',
                re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", batchSystemClass.__name__).lower())

    if jobStore is not None:
        if userScript is not None:
            if not userScript.belongsToToil and batchSystemClass.supportsHotDeployment():
                userScriptResource = userScript.saveAsResourceTo(jobStore)
                with jobStore.writeSharedFileStream('userScript') as f:
                    f.write(userScriptResource.pickle())
                kwargs['userScript'] = userScriptResource
        else:
            from toil.jobStores.abstractJobStore import NoSuchFileException
            try:
                with jobStore.readSharedFileStream('userScript') as f:
                    userScriptResource = Resource.unpickle(f.read())
                kwargs['userScript'] = userScriptResource
            except NoSuchFileException:
                # No user script was stored in the job store, so there is nothing to pass on.
                pass

    return batchSystemClass(**kwargs)