class WorkerCleanupContext:
    """
    Context manager used by :class:`BatchSystemCleanupSupport` to implement
    cleanup on a node after the last worker is done working.

    Gets wrapped around the worker's work.
    """

    def __init__(self, workerCleanupInfo):
        """
        Wrap the given workerCleanupInfo in a context manager.

        :param WorkerCleanupInfo workerCleanupInfo: Info to use to clean up the
               worker if we are the last to exit the context manager.
        """
        self.workerCleanupInfo = workerCleanupInfo
        # The arena is created lazily in __enter__, so constructing this
        # object has no side effects.
        self.arena = None

    def __enter__(self):
        # Set up an arena so we know who is the last worker to leave
        self.arena = LastProcessStandingArena(
            Toil.getToilWorkDir(self.workerCleanupInfo.workDir),
            self.workerCleanupInfo.workflowID + '-cleanup')
        self.arena.enter()
        # Return self so "with WorkerCleanupContext(...) as ctx" binds the
        # context manager instead of None (backward-compatible fix).
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Parameters renamed from type/value/traceback to avoid shadowing
        # the builtin `type` and the `traceback` module; the with-statement
        # passes them positionally, so callers are unaffected.
        for _ in self.arena.leave():
            # We are the last concurrent worker to finish.
            # Do batch system cleanup.
            logger.debug('Cleaning up worker')
            BatchSystemSupport.workerCleanup(self.workerCleanupInfo)
        # We have nothing to say about exceptions: returning False lets any
        # in-flight exception propagate.
        return False
def _testLastProcessStandingTask(scope, arena_name, number):
    """
    Test worker: enter a LastProcessStandingArena, create a marker file,
    verify no other worker clobbered it, and have the last process standing
    clean up every marker file.

    :param str scope: Directory shared by all competing workers; marker files
           are created here.
    :param str arena_name: Name of the arena all workers join.
    :param int number: This worker's identity, written into its marker file.
    :return: True if everything behaved as expected, False on any failure.
    :rtype: bool
    """
    try:
        arena = LastProcessStandingArena(scope, arena_name)
        arena.enter()
        log.info('PID %d = num %d entered arena', os.getpid(), number)
        try:
            # We all make files
            my_precious = os.path.join(scope, 'precious' + str(number))
            # Put our name there
            with open(my_precious, 'w') as out_stream:
                out_stream.write(str(number))
            # Wait
            time.sleep(random.random() * 0.01)
            # Make sure our file is still there unmodified
            assert os.path.exists(
                my_precious), "Precious file {} has been stolen!".format(
                    my_precious)
            with open(my_precious, 'r') as in_stream:
                seen = in_stream.read().rstrip()
            assert seen == str(
                number
            ), "We are {} but saw {} in our precious file!".format(
                number, seen)
        finally:
            was_last = False
            # leave() yields (at most once) only for the last process standing.
            for _ in arena.leave():
                was_last = True
                log.info('PID %d = num %d is last standing', os.getpid(),
                         number)
                # Clean up all the files
                for filename in os.listdir(scope):
                    if filename.startswith('precious'):
                        log.info('PID %d = num %d cleaning up %s',
                                 os.getpid(), number, filename)
                        os.unlink(os.path.join(scope, filename))
            log.info('PID %d = num %d left arena', os.getpid(), number)
        return True
    except Exception:
        # Was a bare `except:`, which would also capture SystemExit and
        # KeyboardInterrupt and report them as a mere test failure.
        traceback.print_exc()
        return False
def executor():
    """
    Main function of the _toil_kubernetes_executor entrypoint.

    Runs inside the Toil container.

    Responsible for setting up the user script and running the command for the
    job (which may in turn invoke the Toil worker entrypoint).
    """

    logging.basicConfig(level=logging.DEBUG)
    logger.debug("Starting executor")

    # If we don't manage to run the child, what should our exit code be?
    exit_code = EXIT_STATUS_UNAVAILABLE_VALUE

    if len(sys.argv) != 2:
        logger.error('Executor requires exactly one base64-encoded argument')
        sys.exit(exit_code)

    # Take in a base64-encoded pickled dict as our first argument and decode it
    try:
        # Make sure to encode the text arguments to bytes before base 64 decoding
        # SECURITY NOTE: pickle.loads runs arbitrary code from its input; this
        # is only acceptable because the payload is produced by our own leader,
        # not by untrusted users.
        job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8')))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # not swallowed; logger.exception attaches the traceback for us.
        logger.exception('Exception while unpickling task: ')
        sys.exit(exit_code)

    if 'environment' in job:
        # Adopt the job environment into the executor.
        # This lets us use things like TOIL_WORKDIR when figuring out how to talk to other executors.
        logger.debug('Adopting environment: %s', str(job['environment'].keys()))
        for var, value in job['environment'].items():
            os.environ[var] = value

    # Set JTRES_ROOT and other global state needed for resource
    # downloading/deployment to work.
    # TODO: Every worker downloads resources independently.
    # We should have a way to share a resource directory.
    logger.debug('Preparing system for resource download')
    Resource.prepareSystem()
    try:
        if 'userScript' in job:
            job['userScript'].register()

        # We need to tell other workers in this workflow not to do cleanup now that
        # we are here, or else wait for them to finish. So get the cleanup info
        # that knows where the work dir is.
        cleanupInfo = job['workerCleanupInfo']

        # Join a Last Process Standing arena, so we know which process should be
        # responsible for cleanup.
        # We need to use the real workDir, not just the override from cleanupInfo.
        # This needs to happen after the environment is applied.
        arena = LastProcessStandingArena(
            Toil.getToilWorkDir(cleanupInfo.workDir),
            cleanupInfo.workflowID + '-kube-executor')
        arena.enter()
        try:
            # Start the child process
            logger.debug("Invoking command: '%s'", job['command'])
            # os.setpgrp puts the child in its own process group so signals
            # aimed at us don't hit it directly (was a redundant lambda wrapper).
            child = subprocess.Popen(job['command'],
                                     preexec_fn=os.setpgrp,
                                     shell=True)

            # Reproduce child's exit code
            exit_code = child.wait()
        finally:
            for _ in arena.leave():
                # We are the last concurrent executor to finish.
                # Do batch system cleanup.
                logger.debug('Cleaning up worker')
                BatchSystemSupport.workerCleanup(cleanupInfo)
    finally:
        logger.debug('Cleaning up resources')
        # TODO: Change resource system to use a shared resource directory for everyone.
        # Then move this into the last-process-standing cleanup
        Resource.cleanSystem()

    logger.debug('Shutting down')
    sys.exit(exit_code)
def __enter__(self):
    # Join an arena scoped to this workflow so we can tell which worker is
    # the last one to leave (and therefore responsible for cleanup).
    work_dir = Toil.getToilWorkDir(self.workerCleanupInfo.workDir)
    arena_name = self.workerCleanupInfo.workflowID + '-cleanup'
    self.arena = LastProcessStandingArena(work_dir, arena_name)
    self.arena.enter()