Example #1
0
    def formatStdOutErrPath(self, jobID, batchSystem, batchJobIDfmt, fileDesc):
        """
        Build the path for a batch-system stdout/stderr (or similar) file.

        The file goes in the Toil work directory (which may be on a shared
        file system), and its name embeds both the Toil and batch system job
        IDs so failed jobs are easy to trace back.

        :param jobID: Toil job ID.
        :param batchSystem: name of the batch system.
        :param batchJobIDfmt: a string which the particular batch system
            will format into the batch job ID once it is submitted.
        :param fileDesc: file description, e.g. 'std_output' for standard
            output or 'std_error' for standard error.
        :return: the formatted filename; however, if self.config.noStdOutErr
            is true, returns '/dev/null' or equivalent.
        """
        if self.config.noStdOutErr:
            # The user asked to discard the standard streams entirely.
            return os.devnull

        fileName = (f'toil_workflow_{self.config.workflowID}'
                    f'_job_{jobID}'
                    f'_batch_{batchSystem}'
                    f'_{batchJobIDfmt}'
                    f'_{fileDesc}.log')
        return os.path.join(Toil.getToilWorkDir(self.config.workDir), fileName)
Example #2
0
    def formatStdOutErrPath(self, toil_job_id: int, cluster_job_id: str, std: str) -> str:
        """
        Build the path for a batch-system stdout/stderr (or similar) file.

        The file goes in the Toil work directory (which may be on a shared
        file system), and its name embeds both the Toil and batch system job
        IDs so failed jobs are easy to trace back.

        :param toil_job_id: the unique id that Toil gives a job.
        :param cluster_job_id: what the cluster (for example, GridEngine)
            uses as its internal job id.
        :param std: the provenance of the stream (for example: 'err' for
            'stderr' or 'out' for 'stdout').
        :return: the formatted filename; however, if self.config.noStdOutErr
            is true, returns '/dev/null' or equivalent.
        """
        if self.config.noStdOutErr:
            # The user asked to discard the standard streams entirely.
            return os.devnull

        work_dir: str = Toil.getToilWorkDir(self.config.workDir)
        parts = ('toil_{}'.format(self.config.workflowID),
                 str(toil_job_id), cluster_job_id, std, 'log')
        return os.path.join(work_dir, '.'.join(parts))
Example #3
0
File: awsBatch.py  Project: tmooney/toil
    def __init__(self, config: Config, maxCores: float, maxMemory: int,
                 maxDisk: int) -> None:
        """
        Set up an AWS Batch batch system.

        Resolves the AWS region and Batch job queue from ``config`` (falling
        back to the ambient AWS environment for the region), creates a boto3
        Batch client, guesses the worker work directory, and initializes the
        job-tracking state.

        :param config: Toil configuration object.
        :param maxCores: per-job core limit.
        :param maxMemory: per-job memory limit.
        :param maxDisk: per-job disk limit.
        :raises RuntimeError: if no AWS region or no Batch queue can be
            determined from configuration or the environment.
        """
        super().__init__(config, maxCores, maxMemory, maxDisk)

        # Determine region to use.
        # Either it's set specifically or maybe we can get it from the "best" zone.
        # TODO: Parse it from a full queue ARN?
        # Plain attribute access instead of getattr() with a constant name
        # (flake8-bugbear B009); the lookup semantics are identical.
        self.region = config.aws_batch_region
        if self.region is None:
            self.region = get_current_aws_region()
            if self.region is None:
                # Can't proceed without a real region
                raise RuntimeError(
                    'To use AWS Batch, specify --awsBatchRegion or '
                    'TOIL_AWS_REGION or TOIL_AWS_ZONE, or configure '
                    'a default zone in boto')

        # Connect to AWS Batch.
        # TODO: Use a global AWSConnectionManager so we can share a client
        # cache with provisioners, etc.
        self.client = establish_boto3_session(self.region).client('batch')

        # Determine our batch queue
        self.queue = config.aws_batch_queue
        if self.queue is None:
            # Make sure we actually have a queue
            raise RuntimeError(
                "To use AWS Batch, --awsBatchQueue or TOIL_AWS_BATCH_QUEUE must be set"
            )
        # And the role, if any, jobs should assume
        self.job_role_arn = config.aws_batch_job_role_arn
        # And the Owner tag value, if any, to apply to things we create
        self.owner_tag = os.environ.get('TOIL_OWNER_TAG')

        # Try and guess what Toil work dir the workers will use.
        # We need to be able to provision (possibly shared) space there.
        # TODO: Deduplicate with Kubernetes batch system.
        self.worker_work_dir = Toil.getToilWorkDir(config.workDir)
        if (config.workDir is None and os.getenv('TOIL_WORKDIR') is None
                and self.worker_work_dir == tempfile.gettempdir()):

            # We defaulted to the system temp directory. But we think the
            # worker Dockerfiles will make them use /var/lib/toil instead.
            # TODO: Keep this in sync with the Dockerfile.
            self.worker_work_dir = '/var/lib/toil'

        # We assign job names based on a numerical job ID. This functionality
        # is managed by the BatchSystemLocalSupport.

        # Here is where we will store the user script resource object if we get one.
        self.user_script: Optional[Resource] = None

        # Get the image to deploy from Toil's configuration
        self.docker_image = applianceSelf()

        # We can't use AWS Batch without a job definition. But we can use one
        # of them for all the jobs. We want to lazily initialize it. This will
        # be an ARN.
        self.job_definition: Optional[str] = None

        # We need a way to map between our batch system ID numbers, and AWS Batch job IDs from the server.
        self.bs_id_to_aws_id: Dict[int, str] = {}
        self.aws_id_to_bs_id: Dict[str, int] = {}
        # We need to track if jobs were killed so they don't come out as updated
        self.killed_job_aws_ids: Set[str] = set()
Example #4
0
    def __init__(self, config, maxCores, maxMemory, maxDisk):
        """
        Set up a Kubernetes batch system.

        Quiets noisy Kubernetes client logging, loads credentials and the
        target namespace, decides on host-path mounting and the worker work
        directory, and builds the unique job-name prefix for this workflow.

        :param config: Toil configuration object.
        :param maxCores: per-job core limit.
        :param maxMemory: per-job memory limit.
        :param maxDisk: per-job disk limit.
        """
        # Zero-argument super(), consistent with the other batch systems.
        super().__init__(config, maxCores, maxMemory, maxDisk)

        # Turn down log level for Kubernetes modules and dependencies.
        # Otherwise if we are at debug log level, we dump every
        # request/response to Kubernetes, including tokens which we shouldn't
        # reveal on CI.
        logging.getLogger('kubernetes').setLevel(logging.ERROR)
        logging.getLogger('requests_oauthlib').setLevel(logging.ERROR)

        # This will hold the last time our Kubernetes credentials were refreshed
        self.credential_time = None
        # And this will hold our cache of API objects
        self._apis = {}

        # Get our namespace (and our Kubernetes credentials to make sure they exist)
        self.namespace = self._api('namespace')

        # Decide if we are going to mount a Kubernetes host path as /tmp in the workers.
        # If we do this and the work dir is the default of the temp dir, caches will be shared.
        self.host_path = config.kubernetesHostPath
        if self.host_path is None:
            # Fall back to the environment variable; os.environ.get returns
            # None when unset, which matches leaving host_path unset (single
            # lookup instead of the previous check-then-get double lookup).
            self.host_path = os.environ.get("TOIL_KUBERNETES_HOST_PATH")

        # Make a Kubernetes-acceptable version of our username: not too long,
        # and all lowercase letters, numbers, or - or .
        acceptableChars = set(string.ascii_lowercase + string.digits + '-.')

        # Use TOIL_KUBERNETES_OWNER if present in env var
        username = os.environ.get("TOIL_KUBERNETES_OWNER")
        if username is None:
            # Otherwise derive a sanitized, truncated version of the local username.
            username = ''.join([c for c in getpass.getuser().lower() if c in acceptableChars])[:100]

        # Create a prefix for jobs, starting with our username
        self.jobPrefix = '{}-toil-{}-'.format(username, uuid.uuid4())

        # Instead of letting Kubernetes assign unique job names, we assign our
        # own based on a numerical job ID. This functionality is managed by the
        # BatchSystemLocalSupport.

        # Here is where we will store the user script resource object if we get one.
        self.userScript = None

        # Get the image to deploy from Toil's configuration
        self.dockerImage = applianceSelf()

        # Try and guess what Toil work dir the workers will use.
        # We need to be able to provision (possibly shared) space there.
        self.workerWorkDir = Toil.getToilWorkDir(config.workDir)
        if (config.workDir is None and
            os.getenv('TOIL_WORKDIR') is None and
            self.workerWorkDir == tempfile.gettempdir()):

            # We defaulted to the system temp directory. But we think the
            # worker Dockerfiles will make them use /var/lib/toil instead.
            # TODO: Keep this in sync with the Dockerfile.
            self.workerWorkDir = '/var/lib/toil'

        # Get the name of the AWS secret, if any, to mount in containers.
        # TODO: have some way to specify this (env var?)!
        self.awsSecretName = os.environ.get("TOIL_AWS_SECRET_NAME", None)

        # Set this to True to enable the experimental wait-for-job-update code
        self.enableWatching = True

        # Set of job IDs we have issued.
        self.jobIds = set()
Example #5
0
def executor():
    """
    Main function of the _toil_kubernetes_executor entrypoint.

    Runs inside the Toil container.

    Responsible for setting up the user script and running the command for the
    job (which may in turn invoke the Toil worker entrypoint).

    Always terminates the process via sys.exit(), with the child's exit code
    or EXIT_STATUS_UNAVAILABLE_VALUE if the child could not be started.
    """

    logging.basicConfig(level=logging.DEBUG)
    logger.debug("Starting executor")

    # If we don't manage to run the child, what should our exit code be?
    exit_code = EXIT_STATUS_UNAVAILABLE_VALUE

    if len(sys.argv) != 2:
        logger.error('Executor requires exactly one base64-encoded argument')
        sys.exit(exit_code)

    # Take in a base64-encoded pickled dict as our first argument and decode it.
    # SECURITY NOTE: pickle.loads() executes arbitrary code from its input;
    # this is only acceptable because the argument is produced by the trusted
    # Toil leader, never by end users.
    try:
        # Make sure to encode the text arguments to bytes before base 64 decoding
        job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8')))
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt still
        # propagate; log the traceback and bail out with the failure code.
        logger.exception('Exception while unpickling task')
        sys.exit(exit_code)

    if 'environment' in job:
        # Adopt the job environment into the executor.
        # This lets us use things like TOIL_WORKDIR when figuring out how to talk to other executors.
        logger.debug('Adopting environment: %s', str(job['environment'].keys()))
        for var, value in job['environment'].items():
            os.environ[var] = value

    # Set JTRES_ROOT and other global state needed for resource
    # downloading/deployment to work.
    # TODO: Every worker downloads resources independently.
    # We should have a way to share a resource directory.
    logger.debug('Preparing system for resource download')
    Resource.prepareSystem()
    try:
        if 'userScript' in job:
            job['userScript'].register()

        # We need to tell other workers in this workflow not to do cleanup now that
        # we are here, or else wait for them to finish. So get the cleanup info
        # that knows where the work dir is.
        cleanupInfo = job['workerCleanupInfo']

        # Join a Last Process Standing arena, so we know which process should be
        # responsible for cleanup.
        # We need to use the real workDir, not just the override from cleanupInfo.
        # This needs to happen after the environment is applied.
        arena = LastProcessStandingArena(Toil.getToilWorkDir(cleanupInfo.workDir),
            cleanupInfo.workflowID + '-kube-executor')
        arena.enter()
        try:

            # Start the child process in its own process group (plain function
            # reference instead of a lambda wrapper; os.setpgrp takes no args).
            logger.debug("Invoking command: '%s'", job['command'])
            child = subprocess.Popen(job['command'],
                                     preexec_fn=os.setpgrp,
                                     shell=True)

            # Reproduce child's exit code
            exit_code = child.wait()

        finally:
            for _ in arena.leave():
                # We are the last concurrent executor to finish.
                # Do batch system cleanup.
                logger.debug('Cleaning up worker')
                BatchSystemSupport.workerCleanup(cleanupInfo)
    finally:
        logger.debug('Cleaning up resources')
        # TODO: Change resource system to use a shared resource directory for everyone.
        # Then move this into the last-process-standing cleanup
        Resource.cleanSystem()
        logger.debug('Shutting down')
        sys.exit(exit_code)
Example #6
0
 def __enter__(self):
     """
     Context manager entry: join the Last Process Standing arena for this
     workflow, so the last worker process to leave can be identified and
     made responsible for cleanup.
     """
     # Set up an arena so we know who is the last worker to leave.
     # Keyed on the Toil work dir plus the workflow ID, so exactly the
     # workers of this workflow share one arena.
     self.arena = LastProcessStandingArena(
         Toil.getToilWorkDir(self.workerCleanupInfo.workDir),
         self.workerCleanupInfo.workflowID + '-cleanup')
     self.arena.enter()
     # NOTE(review): no `return self` is visible here, so as shown a
     # `with ... as x` target would be None — confirm whether more of this
     # method follows in the full file.