def formatPairs(sample_pairs, work_mount):
    """
    Order a comma-delimited list of paired FASTQ paths as read1,read2,read1,read2,...

    Each path is classified from its base name: the name must contain either an
    "R1"/"R2" token, or a "1."/"2." directly followed by an extension starting with
    "f" (e.g. "_1.fastq"), located at the start of the name or after one of ".", "_"
    or "-". Paths are visited in sorted order, converted with fileURL() and then
    interleaved pairwise.

    :param str sample_pairs: Comma-delimited FASTQ paths.
    :param work_mount: Not used by this function; kept so existing callers keep working.
    :return: Comma-delimited string alternating read-1 and read-2 entries.
    :rtype: str
    :raises SystemExit: with status 1 if a file name does not follow the convention.
        An uneven number of read-1/read-2 files is rejected through require().
    """
    print('sample pairs:{}'.format(sample_pairs))
    log.info('sample pairs:{}'.format(sample_pairs))

    r1, r2 = [], []
    # Pattern convention: Look for "R1" / "R2" in the filename, or "_1" / "_2" before the extension
    # (raw string so that "\." is a regex escape rather than a string escape)
    pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
    for fastq in sorted(sample_pairs.split(',')):
        match = pattern.search(os.path.basename(fastq))
        fastq = fileURL(fastq)
        if not match:
            # This is fatal, so report it as an error rather than at info level
            log.error(
                'FASTQ file name fails to meet required convention for paired reads '
                '(see documentation). ' + fastq)
            # Same effect as exit(1), without depending on the site module's helper
            raise SystemExit(1)
        token = match.group()
        if '1' in token:
            r1.append(fastq)
        elif '2' in token:
            r2.append(fastq)
        else:
            assert False, token

    require(
        len(r1) == len(r2),
        'Check fastq names, uneven number of pairs found.\nr1: {}\nr2: {}'.format(r1, r2))

    # Flatten the (r1, r2) tuples and join them into a comma delimited string
    # https://stackoverflow.com/questions/40993966/python-convert-tuple-to-comma-separated-string
    interleaved_samples = zip(r1, r2)
    comma_delimited_samples = ','.join(
        map(str, chain.from_iterable(interleaved_samples)))
    log.info('comma delimited samples:{}'.format(comma_delimited_samples))
    return comma_delimited_samples
def __init__(self, provisioner, jobBatcher, config):
    """
    Class manages automatically scaling the number of worker nodes.

    A scaler thread is created and started for each kind of node (preemptable and
    non-preemptable) whose configured maximum is greater than zero; the attribute for
    a kind that is disabled is set to None.

    :param AbstractProvisioner provisioner: Provisioner instance to scale.
    :param JobBatcher jobBatcher: The class issuing jobs to the batch system. This is
           monitored to make scaling decisions.
    :param Config config: Config object from which to draw parameters.
    """
    self.provisioner = provisioner
    self.jobBatcher = jobBatcher
    self.config = config
    # Flag that tells the scaling threads to shut down
    self.stop = False

    maxPreemptable = config.maxPreemptableNodes
    maxNonPreemptable = config.maxNodes
    assert maxPreemptable >= 0 and maxNonPreemptable >= 0
    require(maxPreemptable + maxNonPreemptable > 0,
            'Either --maxNodes or --maxPreemptableNodes must be non-zero.')

    # The preemptable scaler is set up first, then the non-preemptable one
    if maxPreemptable <= 0:
        self.preemptableScaler = None
    else:
        self.preemptableScaler = ScalerThread(self, preemptable=True)
        self.preemptableScaler.start()

    if maxNonPreemptable <= 0:
        self.scaler = None
    else:
        self.scaler = ScalerThread(self, preemptable=False)
        self.scaler.start()
def __init__(self, provisioner, leader, config):
    """
    Class manages automatically scaling the number of worker nodes.

    Initialises the runtime bookkeeping, checks that the configuration allows at least
    one node in total, and creates the scaler thread object (it is not started here).

    :param AbstractProvisioner provisioner: Provisioner instance to scale.
    :param toil.leader.Leader leader:
    :param Config config: Config object from which to draw parameters.
    """
    self.provisioner = provisioner
    self.leader = leader
    self.config = config

    # Flag that tells the scaling threads to shut down
    self.stop = False

    # Per-job-name average runtimes, used to estimate the wall time of queued jobs
    # for bin-packing, together with the matching completion counts and overall totals
    self.jobNameToAvgRuntime = {}
    self.jobNameToNumCompleted = {}
    self.totalAvgRuntime = 0.0
    self.totalJobsCompleted = 0

    totalMaxNodes = sum(config.maxNodes)
    require(totalMaxNodes > 0, 'Not configured to create nodes of any type.')

    self.scaler = ScalerThread(scaler=self)
def _requireEphemeralDrives(self, workerType):
    """
    Check that the given worker instance type is usable with this provisioner.

    The type must offer at least one ephemeral volume, and exactly as many as the
    instance type of the leader (resolved from self._instance.instance_type).

    :param workerType: Instance type description exposing ``name`` and ``disks``.
    """
    noDrivesMessage = (
        "This provisioner only supports instance types with one or more ephemeral "
        "volumes. The requested type '%s' does not have any.")
    require(workerType.disks > 0, noDrivesMessage, workerType.name)

    leaderType = self._resolveInstanceType(self._instance.instance_type)
    mismatchMessage = (
        'The instance type selected for worker nodes (%s) offers %i ephemeral volumes but '
        'this type of leader (%s) has %i. The number of drives must match between leader '
        'and worker nodes. Please specify a different worker node type or use a different '
        'leader.')
    require(workerType.disks == leaderType.disks,
            mismatchMessage,
            workerType.name, workerType.disks, leaderType.name, leaderType.disks)
def forModule(cls, name):
    """
    Return an instance of this class representing the module of the given name. If the given
    module name is "__main__", it will be translated to the actual file name of the top-level
    script without the .py or .pyc extension. This method assumes that the module with the
    specified name has already been loaded.

    :param str name: Name under which the module is registered in sys.modules.
    :return: ``cls(dirPath=..., name=..., fromVirtualEnv=...)`` where dirPath is the directory
             containing the script or the module's top-level package.
    """
    module = sys.modules[name]
    filePath = os.path.abspath(module.__file__)
    filePath = filePath.split(os.path.sep)
    # Strip the extension off the last path component and validate it
    filePath[-1], extension = os.path.splitext(filePath[-1])
    require(extension in ('.py', '.pyc'),
            'The name of a user script/module must end in .py or .pyc.')
    if name == '__main__':
        log.debug("Discovering real name of module")
        # User script/module was invoked as the main program
        if module.__package__:
            # Invoked as a module via python -m foo.bar
            log.debug("Script was invoked as a module")
            nameParts = [filePath.pop()]
            # Walk up the directories, checking that they spell out the package name
            for package in reversed(module.__package__.split('.')):
                dirPathTail = filePath.pop()
                assert dirPathTail == package
                nameParts.append(dirPathTail)
            name = '.'.join(reversed(nameParts))
            dirPath = os.path.sep.join(filePath)
        else:
            # Invoked as a script via python foo/bar.py
            name = filePath.pop()
            dirPath = os.path.sep.join(filePath)
            cls._check_conflict(dirPath, name)
    else:
        # User module was imported. Determine the directory containing the top-level package
        if filePath[-1] == '__init__':
            # module is a subpackage
            filePath.pop()
        for package in reversed(name.split('.')):
            dirPathTail = filePath.pop()
            assert dirPathTail == package
        dirPath = os.path.sep.join(filePath)
    log.debug("Module dir is %s", dirPath)
    # The message is built from adjacent literals; a backslash continuation inside the
    # literal would leak the backslash/indentation into the text shown to the user.
    require(os.path.isdir(dirPath),
            'Bad directory path %s for module %s. Note that hot-deployment does not support '
            '.egg-link files yet, or scripts located in the root directory.',
            dirPath, name)
    fromVirtualEnv = inVirtualEnv() and dirPath.startswith(sys.prefix)
    return cls(dirPath=dirPath, name=name, fromVirtualEnv=fromVirtualEnv)
def forModule(cls, name):
    """
    Return an instance of this class representing the module of the given name. If the given
    module name is "__main__", it will be translated to the actual file name of the top-level
    script without the .py or .pyc extension. This method assumes that the module with the
    specified name has already been loaded.

    :param str name: Name under which the module is registered in sys.modules.
    :return: ``cls(dirPath=..., name=..., fromVirtualEnv=...)`` where dirPath is the directory
             containing the script or the module's top-level package.
    """
    module = sys.modules[name]
    filePath = os.path.abspath(module.__file__)
    filePath = filePath.split(os.path.sep)
    # Strip the extension off the last path component and validate it
    filePath[-1], extension = os.path.splitext(filePath[-1])
    require(extension in ('.py', '.pyc'),
            'The name of a user script/module must end in .py or .pyc.')
    if name == '__main__':
        log.debug("Discovering real name of module")
        # User script/module was invoked as the main program
        if module.__package__:
            # Invoked as a module via python -m foo.bar
            log.debug("Script was invoked as a module")
            nameParts = [filePath.pop()]
            # Walk up the directories, checking that they spell out the package name
            for package in reversed(module.__package__.split('.')):
                dirPathTail = filePath.pop()
                assert dirPathTail == package
                nameParts.append(dirPathTail)
            name = '.'.join(reversed(nameParts))
            dirPath = os.path.sep.join(filePath)
        else:
            # Invoked as a script via python foo/bar.py
            name = filePath.pop()
            dirPath = os.path.sep.join(filePath)
            cls._check_conflict(dirPath, name)
    else:
        # User module was imported. Determine the directory containing the top-level package
        if filePath[-1] == '__init__':
            # module is a subpackage
            filePath.pop()
        for package in reversed(name.split('.')):
            dirPathTail = filePath.pop()
            assert dirPathTail == package
        dirPath = os.path.sep.join(filePath)
    log.debug("Module dir is %s", dirPath)
    # The message is built from adjacent literals; a backslash continuation inside the
    # literal would leak the backslash/indentation into the text shown to the user.
    require(os.path.isdir(dirPath),
            'Bad directory path %s for module %s. Note that hot-deployment does not support '
            '.egg-link files yet, or scripts located in the root directory.',
            dirPath, name)
    fromVirtualEnv = inVirtualEnv() and dirPath.startswith(sys.prefix)
    return cls(dirPath=dirPath, name=name, fromVirtualEnv=fromVirtualEnv)
def __init__(self, provisioner, leader, config):
    """
    Class manages automatically scaling the number of worker nodes.

    One scaler thread object is created per kind of node (preemptable and
    non-preemptable) whose configured maximum is greater than zero; the attribute for
    a disabled kind is None. The threads are only constructed here, not started.

    :param AbstractProvisioner provisioner: Provisioner instance to scale.
    :param toil.leader.Leader leader:
    :param Config config: Config object from which to draw parameters.
    """
    self.provisioner = provisioner
    self.leader = leader
    self.config = config
    # Flag that tells the scaling threads to shut down
    self.stop = False

    assert config.maxPreemptableNodes >= 0 and config.maxNodes >= 0
    require(config.maxPreemptableNodes + config.maxNodes > 0,
            'Either --maxNodes or --maxPreemptableNodes must be non-zero.')

    if self.config.maxPreemptableNodes > 0:
        self.preemptableScaler = ScalerThread(self, preemptable=True)
    else:
        self.preemptableScaler = None

    if self.config.maxNodes > 0:
        self.scaler = ScalerThread(self, preemptable=False)
    else:
        self.scaler = None
def forModule(cls, name):
    """
    Return an instance of this class representing the module of the given name. If the given
    module name is "__main__", it will be translated to the actual file name of the top-level
    script without the .py or .pyc extension. This method assumes that the module with the
    specified name has already been loaded.

    :param str name: Name under which the module is registered in sys.modules.
    :return: ``cls(dirPath=..., name=...)`` where dirPath is the directory containing the
             script or the module's top-level package.
    """
    module = sys.modules[name]
    pathParts = os.path.abspath(module.__file__).split(os.path.sep)
    # Replace the file name by its extension-less stem and validate the extension
    stem, extension = os.path.splitext(pathParts[-1])
    pathParts[-1] = stem
    require(extension in ('.py', '.pyc'),
            'The name of a user script/module must end in .py or .pyc.')

    if name != '__main__':
        # User module was imported. Determine the directory containing the top-level package
        for package in reversed(name.split('.')):
            tail = pathParts.pop()
            assert tail == package
        dirPath = os.path.sep.join(pathParts)
    elif module.__package__:
        # Main program, invoked as a module via python -m foo.bar
        components = [pathParts.pop()]
        for package in reversed(module.__package__.split('.')):
            tail = pathParts.pop()
            assert tail == package
            components.append(tail)
        name = '.'.join(reversed(components))
        dirPath = os.path.sep.join(pathParts)
    else:
        # Main program, invoked as a script via python foo/bar.py
        name = pathParts.pop()
        dirPath = os.path.sep.join(pathParts)
        cls._check_conflict(dirPath, name)

    assert os.path.isdir(dirPath)
    return cls(dirPath=dirPath, name=name)
def __init__(self, config, batchSystem):
    """
    Resolve and validate the instance types this provisioner will use.

    The regular node type is mandatory. The preemptable node type is optional and, when
    given, must have the form INSTANCE_TYPE:SPOT_BID. Both types are checked with
    _requireEphemeralDrives().

    :type config: Config
    :type batchSystem: AbstractBatchSystem
    :raises ValueError: if the preemptable node type is not of the form
            INSTANCE_TYPE:SPOT_BID or the bid is not a valid float.
    """
    super(CGCloudProvisioner, self).__init__(config, batchSystem)
    self.batchSystem = batchSystem
    self.imageId = self._instance.image_id

    require(config.nodeType, 'Must pass --nodeType when using the cgcloud provisioner')
    regularType = self._resolveInstanceType(config.nodeType)
    self._requireEphemeralDrives(regularType)

    preemptableSpec = config.preemptableNodeType
    if not preemptableSpec:
        preemptableType = None
        self.spotBid = None
    else:
        # Unpacking fails with ValueError unless there is exactly one ':' separator
        try:
            typeName, bidText = preemptableSpec.split(':')
        except ValueError:
            raise ValueError(
                "Preemptible node type '%s' is not valid for this provisioner. "
                "Use format INSTANCE_TYPE:SPOT_BID, e.g. m3.large:0.10 instead"
                % preemptableSpec)
        preemptableType = self._resolveInstanceType(typeName)
        self._requireEphemeralDrives(preemptableType)
        try:
            self.spotBid = float(bidText)
        except ValueError:
            raise ValueError(
                "The spot bid '%s' is not valid. Use a floating point dollar "
                "amount such as '0.42' instead." % bidText)

    # Instance types keyed by "is preemptable"
    self.instanceType = {False: regularType, True: preemptableType}
def resume(self):
    """
    Check that the job store directory can be resumed from.

    :raises NoSuchJobStoreException: if self.jobStoreDir does not exist. A path that
            exists but is not a directory is rejected through require().
    """
    if not os.path.exists(self.jobStoreDir):
        raise NoSuchJobStoreException(self.jobStoreDir)
    # os.path.isdir has to be called on the path: the bare function object is always
    # truthy, which would turn this check into a no-op.
    require(os.path.isdir(self.jobStoreDir),
            "'%s' is not a directory", self.jobStoreDir)
    logger.debug("Resuming...")
def resume(self):
    """
    Check that the job store directory can be resumed from, then defer to the parent
    class's resume().

    :raises NoSuchJobStoreException: if self.jobStoreDir does not exist. A path that
            exists but is not a directory is rejected through require().
    """
    if not os.path.exists(self.jobStoreDir):
        raise NoSuchJobStoreException(self.jobStoreDir)
    # os.path.isdir has to be called on the path: the bare function object is always
    # truthy, which would turn this check into a no-op.
    require(os.path.isdir(self.jobStoreDir),
            "'%s' is not a directory", self.jobStoreDir)
    super(FileJobStore, self).resume()
def setOptions(self, options):
    """
    Creates a config object from the options object.

    For every recognised option that is present on ``options`` with a value other than
    None, the value is optionally parsed, then validated, and finally stored as the
    attribute of the same name on this object. Options that are absent or None leave the
    current attribute value untouched.

    :param options: Object carrying the option values as attributes (read with getattr).
    :raises RuntimeError: if an option value fails its validity check, or if the stats
            flag is combined with a clean mode other than 'never'.
    """
    # This import is used to convert from human readable quantities to integers
    from bd2k.util.humanize import human2bytes

    def setOption(varName, parsingFn=None, checkFn=None):
        # If options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is None:
            return
        if parsingFn is not None:
            x = parsingFn(x)
        if checkFn is not None:
            # A validator rejects a value by returning a false value or by raising
            # AssertionError. The returned value must be inspected, otherwise the
            # interval checks built by iC/fC below would have no effect.
            try:
                valid = checkFn(x)
            except AssertionError:
                valid = False
            if not valid:
                raise RuntimeError("The %s option has an invalid value: %s" % (varName, x))
        setattr(self, varName, x)

    def h2b(x):
        # Parse an integer from a string expressed in different (human readable) formats
        return human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns function that checks if a given int is in the given half-open interval
        assert isinstance(minValue, int) and isinstance(maxValue, int)
        return lambda x: minValue <= x < maxValue

    def fC(minValue, maxValue=None):
        # Returns function that checks if a given float is in the given half-open interval
        assert isinstance(minValue, float)
        if maxValue is None:
            return lambda x: minValue <= x
        else:
            assert isinstance(maxValue, float)
            return lambda x: minValue <= x < maxValue

    def parseJobStore(s):
        name, rest = Toil.parseLocator(s)
        if name == 'file':
            # We need to resolve relative paths early, on the leader, because the worker process
            # may have a different working directory than the leader, e.g. under Mesos.
            return Toil.buildLocator(name, os.path.abspath(rest))
        else:
            return s

    def checkSse(sseKey):
        # The first line of the key file must be exactly 32 characters long. Returning
        # the result (instead of asserting) keeps the check active under "python -O".
        with open(sseKey) as f:
            return len(f.readline().rstrip()) == 32

    # Core options
    setOption("jobStore", parsingFn=parseJobStore)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    setOption("stats")
    setOption("cleanWorkDir")
    setOption("clean")
    if self.stats:
        # Stats need the job store to survive the run, so only 'never' is compatible
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to 'never'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float, fC(0.0))
    setOption("mesosMasterAddress")
    setOption("parasolCommand")
    setOption("parasolMaxBatches", int, iC(1))
    setOption("environment", parseSetEnv)

    # Autoscaling options
    setOption("provisioner")
    setOption("nodeType")
    setOption("nodeOptions")
    setOption("minNodes", int)
    setOption("maxNodes", int)
    setOption("preemptableNodeType")
    setOption("preemptableNodeOptions")
    setOption("minPreemptableNodes", int)
    setOption("maxPreemptableNodes", int)
    setOption("alphaPacking", float)
    setOption("betaInertia", float)
    setOption("scaleInterval", float)
    setOption("preemptableCompensation", float)
    require(0.0 <= self.preemptableCompensation <= 1.0,
            '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
            self.preemptableCompensation)

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", float, fC(1.0))
    setOption("defaultDisk", h2b, iC(1))
    setOption("readGlobalFileMutableByDefault")
    setOption("maxCores", int, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))
    setOption("defaultPreemptable")

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("disableCaching")
    setOption("maxLogFileSize", h2b, iC(1))
    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)
    setOption("servicePollingInterval", float, fC(0.0))

    # Debug options
    setOption("badWorker", float, fC(0.0, 1.0))
    setOption("badWorkerFailInterval", float, fC(0.0))
def docker_call(job,
                tool,
                parameters=None,
                work_dir='.',
                rm=True,
                detached=False,
                env=None,
                outfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                mock=None,
                defer=None,
                container_name=None,
                mounts=None):
    """
    Calls Docker, passing along parameters and tool.

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention is
           /data
    :param bool rm: Should the container be run with the --rm flag (Should it be removed upon
           container exit)? rm and detached are mutually exclusive in Docker.  This is the flag
           passed to docker and is independent of the defer flag.  If this is set to True and
           `defer` is None, `defer` takes the value `docker_call.RM`.
    :param bool detached: Should the container be run with the --detached flag (Should it be run in
           detached mode)? See `rm` above.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with either None
           or a url. The value is only used if mock=True
    :param dict[str,str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will be
           determined by the environment variable.
    :param int defer: What action should be taken on the container upon job completion?
           docker_call.FORGO will leave the container untouched.
           docker_call.STOP will attempt to stop the container with `docker stop` (useful for
           debugging).
           docker_call.RM will stop the container and then forcefully remove it from the system
           using `docker rm -f`.
           The default value is None and that shadows docker_call.FORGO, unless rm is true.
    :param str container_name: An optional name for your container.
    :param dict mounts: A dictionary of data volumes to mount into the Docker container containing
           host paths as keys and the corresponding container paths as values
    :return: The output of the docker command if check_output is True (and outfile is not
             given), otherwise None.
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    # Docker does not allow the --rm flag to be used when the container is run in detached mode.
    require(not (rm and detached), "Conflicting options 'rm' and 'detached'.")
    # Ensure the user has passed a valid value for defer
    require(defer in (None, docker_call.FORGO, docker_call.STOP, docker_call.RM),
            'Please provide a valid value for defer.')

    for filename in inputs:
        assert (os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        # Mock mode: make sure every declared output exists, then skip Docker entirely
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # create mock file
                if not os.path.exists(file_path):
                    # The context manager closes the handle even if the write fails
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
            else:
                if not os.path.exists(file_path):
                    download_url(job, url, work_dir=work_dir, name=filename, mock=False)
                assert os.path.exists(file_path)
        return

    if not container_name:
        container_name = _get_container_name(job)
    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if mounts:
        require(isinstance(mounts, dict), "'mounts' parameter must be a dictionary object")
        for k, v in mounts.iteritems():
            base_docker_call.extend(['-v', k + ':' + v])

    # Defer the permission fixing function.  We call this explicitly later on in this function, but
    # we defer it as well to handle unexpected job failure.
    job.defer(_fix_permissions, base_docker_call, tool, work_dir)

    base_docker_call.extend(['--name', container_name])
    if rm:
        base_docker_call.append('--rm')
        if defer is None:
            defer = docker_call.RM
    elif detached:
        base_docker_call += ['-d']
    # Defer the container on-exit action
    job.defer(_docker_kill, container_name, action=defer)

    if env:
        for e, v in env.iteritems():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        # NOTE(review): documented as a dict above, but appended to the command line like
        # a list (a dict would contribute only its keys) - confirm the intended type.
        base_docker_call += docker_parameters

    call = base_docker_call + [tool] + parameters
    _log.debug("Calling docker with %s." % " ".join(call))

    if outfile:
        subprocess.check_call(call, stdout=outfile)
    else:
        if check_output:
            # Returns before the explicit permission fix below; the deferred
            # _fix_permissions registered above is relied upon in this case.
            return subprocess.check_output(call)
        else:
            subprocess.check_call(call)
    # Fix root ownership of output files
    _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs:
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert (os.path.isfile(filename))
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --sample-tar sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \\          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # The docstring above doubles as the --help description, which is why maintainer
    # notes live in comments. Flow: parse the command line, verify that the Docker
    # socket is mounted into this container, create the work mount, validate the
    # input paths and hand over to call_pipeline().

    # Define argument parser
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--sample-tar', default=[], action="append",
                        help='Absolute path to sample tarball.')
    parser.add_argument('--sample-single', default=[], action="append",
                        help='Absolute path to sample single-ended FASTQ.')
    parser.add_argument('--sample-paired', default=[], action="append",
                        help='Absolute path to sample paired FASTQs, in the form '
                             '`read1,read2,read1,read2`.')
    parser.add_argument('--star', type=str, required=True,
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem', type=str, required=True,
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto', type=str, required=True,
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument('--disable-cutadapt', action='store_true', default=False,
                        help='Cutadapt fails if samples are improperly paired. '
                             'Use this flag to disable cutadapt.')
    # NOTE(review): the default of the next two flags is the (truthy) string 'false',
    # not the boolean False. Left untouched because the consumer of these values is
    # not visible here - confirm before changing.
    parser.add_argument('--save-bam', action='store_true', default='false',
                        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument('--save-wiggle', action='store_true', default='false',
                        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument('--no-clean', action='store_true',
                        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument('--resume', type=str, default=None,
                        help='Pass the working directory that contains a job store to be resumed.')
    parser.add_argument('--cores', type=int, default=None,
                        help='Will set a cap on number of cores to use, default is all '
                             'available cores.')
    parser.add_argument('--bamqc', action='store_true', default=None,
                        help='Enable BAM QC step. Disabled by default')
    parser.add_argument('--work_mount', required=True,
                        help='Mount where intermediate files should be written. This directory '
                             'should be mirror mounted into the container.')
    parser.add_argument('--output-basename', default="",
                        help='Base name to use for naming the output files ')

    # although we don't actually set the log level in this module, the option is propagated to
    # toil. For this reason we want the logging options to show up with we run --help
    addLoggingOptions(parser)
    # The first argument containing 'log' is taken off the command line and handed on as is
    toilLoggingOption = None
    for arg in sys.argv:
        if 'log' in arg:
            toilLoggingOption = arg
            sys.argv.remove(toilLoggingOption)
            break

    # If no arguments provided, print full help menu. This must happen before
    # parse_args(), which exits with a usage error as soon as a required option is missing.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    args.toilLoggingOption = toilLoggingOption

    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(
            ['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        # Format the exception itself: CalledProcessError carries no useful .message
        raise RuntimeError(
            'No container detected, ensure Docker is being run with: '
            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'.format(e))

    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']
    # Ensure docker.sock is mounted correctly
    sock_mount = [x['Source'] == x['Destination']
                  for x in mounts if 'docker.sock' in x['Source']]
    require(len(sock_mount) == 1,
            'Missing socket mount. Requires the following: '
            'docker run -v /var/run/docker.sock:/var/run/docker.sock')

    work_mount = args.work_mount
    # create work_mount directories if they don't exist yet.
    cmd = ["mkdir", "-p", work_mount]
    log.info('Creating directory: %s', work_mount)
    subprocess.call(cmd)
    curr_mount = os.path.join(os.getcwd(), work_mount)
    cmd = ["mkdir", "-p", curr_mount]
    log.info('Creating directory: %s', curr_mount)
    subprocess.call(cmd)

    for samples in [args.sample_tar, args.sample_paired, args.sample_single]:
        if not samples:
            continue
        # If sample is given as relative path, assume it's in the work directory.
        # Only the local list is rebound; the values stored on args are left as given.
        if not all(x.startswith('/') for x in samples):
            samples = [x if x.startswith('/') else os.path.join(work_mount, x)
                       for x in samples]
            log.info('\nSample given as relative path, assuming sample is in work directory: {}'
                     .format(work_mount))
        # Enforce file input standards
        require(all(x.startswith('/') for x in samples),
                "Sample inputs must point to a file's full path, "
                "e.g. '/full/path/to/sample1.tar'. You provided %s", str(samples))
        if samples == args.sample_tar:
            log.info('TARs to run: {}'.format('\t'.join(args.sample_tar)))
        if samples == args.sample_paired:
            log.info('Paired FASTQS to run: {}'.format('\t'.join(args.sample_paired)))
        if samples == args.sample_single:
            log.info('Single FASTQS to run: {}'.format('\t'.join(args.sample_single)))

    require(all(x.startswith('/') for x in [args.star, args.kallisto, args.rsem]),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/kallisto_hg38.idx'.")

    # Output log information
    log.info('The work mount is: {}'.format(work_mount))
    log.info('Pipeline input locations: \n{}\n{}\n{}'.format(args.star, args.rsem, args.kallisto))
    call_pipeline(work_mount, args)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --sample-tar sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \\          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # The docstring above doubles as the --help description, which is why maintainer
    # notes live in comments. Flow: parse the command line, check the auto-scaling
    # prerequisites, verify that the Docker socket is mounted into this container,
    # validate the input locations and hand over to call_pipeline().

    # Define argument parser
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--sample-tar', default=[], action="append",
                        help='Absolute path to sample tarball.')
    parser.add_argument('--sample-single', default=[], action="append",
                        help='Absolute path to sample single-ended FASTQ.')
    parser.add_argument('--sample-paired', nargs='*', default=[],
                        help='Absolute path to sample paired FASTQs, in the form '
                             '`read1,read2,read1,read2`.')
    parser.add_argument('--output-basenames', nargs='*', default=[],
                        help='Base names to use for naming the output files ')
    parser.add_argument('--star', type=str, default="",
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem', type=str, default="",
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto', type=str, default="",
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument('--hera', type=str, default="",
                        help='Absolute path to hera index (.idx) file.')
    parser.add_argument('--disable-cutadapt', action='store_true', default=False,
                        help='Cutadapt fails if samples are improperly paired. '
                             'Use this flag to disable cutadapt.')
    # NOTE(review): the default of the next two flags is the (truthy) string 'false',
    # not the boolean False. Left untouched because the consumer of these values is
    # not visible here - confirm before changing.
    parser.add_argument('--save-bam', action='store_true', default='false',
                        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument('--save-wiggle', action='store_true', default='false',
                        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument('--no-clean', action='store_true',
                        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument('--resume', type=str, default=None,
                        help='Pass the working directory that contains a job store to be resumed.')
    parser.add_argument('--cores', type=int, default=None,
                        help='Will set a cap on number of cores to use, default is all '
                             'available cores.')
    parser.add_argument('--bamqc', action='store_true', default=None,
                        help='Enable BAM QC step. Disabled by default')
    parser.add_argument('--work_mount', required=True,
                        help='Mount where intermediate files should be written. This directory '
                             'should be mirror mounted into the container.')
    parser.add_argument('--max-sample-size', default="20G",
                        help='Maximum size of sample file using Toil resource requirements '
                             "syntax, e.g '20G'. Standard suffixes like K, Ki, M, Mi, G or Gi "
                             "are supported.")

    auto_scale_options = parser.add_argument_group('Auto-scaling options')
    auto_scale_options.add_argument('--auto-scale', action='store_true', default=False,
                                    help='Enable Toil autoscaling. Disabled by default')
    auto_scale_options.add_argument('--cluster-name', default="",
                                    help='Name of the Toil cluster. Usually the security '
                                         'group name')
    auto_scale_options.add_argument('--job-store',
                                    default="aws:us-west-2:autoscaling-toil-rnaseq-jobstore-2",
                                    help='Directory in cloud where working files will be put; '
                                         'e.g. aws:us-west-2:autoscaling-toil-rnaseq-jobstore')
    auto_scale_options.add_argument('--output-location',
                                    default="s3://toil-rnaseq-cloud-staging-area",
                                    help='Directory in cloud where output files will be put; '
                                         'e.g. s3://toil-rnaseq-cloud-staging-area')
    auto_scale_options.add_argument('--provisioner', default="aws",
                                    help='Cloud provisioner to use. E.g aws')
    auto_scale_options.add_argument('--node-type', default="c3.8xlarge",
                                    help='Cloud worker VM type; e.g. c3.8xlarge')
    auto_scale_options.add_argument('--max-nodes', type=int, default=2,
                                    help='Maximum worker nodes to launch. E.g. 2')
    auto_scale_options.add_argument('--credentials-id', default="",
                                    help='Credentials id')
    auto_scale_options.add_argument('--credentials-secret-key', default="",
                                    help='Credentials secret key')

    # although we don't actually set the log level in this module, the option is propagated to
    # toil. For this reason we want the logging options to show up with we run --help
    addLoggingOptions(parser)
    # The first argument containing 'log' is taken off the command line and handed on
    # as is; debug logging is used when there is none.
    toilLoggingOption = '--logDebug'
    for arg in sys.argv:
        if 'log' in arg:
            toilLoggingOption = arg
            sys.argv.remove(toilLoggingOption)
            break

    # If no arguments provided, print full help menu. This must happen before
    # parse_args(), which exits with a usage error as soon as a required option is missing.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    args.toilLoggingOption = toilLoggingOption

    if args.auto_scale:
        if not args.cluster_name:
            log.info('Auto-scaling requires a cluster name to be input with the '
                     '--cluster-name option')
            parser.error('Auto-scaling requires a cluster name to be input with the '
                         '--cluster-name option')
        if not args.credentials_id or not args.credentials_secret_key:
            log.info('Auto-scaling requires provisioner credentials id and secret key')
            parser.error('Auto-scaling requires provisioner credentials id and secret key')

    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(
            ['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        # Format the exception itself: CalledProcessError carries no useful .message
        raise RuntimeError(
            'No container detected, ensure Docker is being run with: '
            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'.format(e))

    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']
    # Ensure docker.sock is mounted correctly
    sock_mount = [x['Source'] == x['Destination']
                  for x in mounts if 'docker.sock' in x['Source']]
    require(len(sock_mount) == 1,
            'Missing socket mount. Requires the following: '
            'docker run -v /var/run/docker.sock:/var/run/docker.sock')

    work_mount = args.work_mount
    # URL schemes accepted for inputs when auto-scaling (matched case-insensitively).
    # NOTE(review): the error messages below also mention file://, which is not accepted.
    remote_schemes = ('http://', 's3://', 'ftp://')

    for samples in [args.sample_tar, args.sample_paired, args.sample_single]:
        if not samples:
            continue
        # Enforce file input standards
        if args.auto_scale:
            require(len(args.output_basenames) == len(samples),
                    "There must be a "
                    "unique output filename for each sample. You provided {}".format(
                        args.output_basenames))
            require(all(x.lower().startswith(remote_schemes) or not x for x in samples),
                    "Sample inputs must point to a file's full path, "
                    "e.g. 's3://full/path/to/sample_R1.fastq.gz', and should start with "
                    " file://, http://, s3://, or ftp://.  You provided %s", str(samples))
        else:
            # If sample is given as relative path, assume it's in the work directory.
            # Only the local list is rebound; the values stored on args are left as given.
            if not all(x.startswith('/') for x in samples):
                samples = [x if x.startswith('/') else os.path.join(work_mount, x)
                           for x in samples]
                log.info('\nSample given as relative path, assuming sample is in work '
                         'directory: {}'.format(work_mount))
            require(all(x.startswith('/') for x in samples),
                    "Sample inputs must point to a file's full path, "
                    "e.g. '/full/path/to/sample1.tar'. You provided %s", str(samples))
        if samples == args.sample_tar:
            log.info('TARs to run: {}'.format('\t'.join(args.sample_tar)))
        if samples == args.sample_paired:
            log.info('Paired FASTQS to run: {}'.format('\t'.join(args.sample_paired)))
        if samples == args.sample_single:
            log.info('Single FASTQS to run: {}'.format('\t'.join(args.sample_single)))

    # file paths should start with /, file://, http://, s3://, or ftp://
    index_inputs = [args.star, args.kallisto, args.rsem, args.hera]
    if args.auto_scale:
        require(all(x.lower().startswith(remote_schemes) or not x for x in index_inputs),
                "Sample inputs must point to a file's full path, "
                "e.g. 's3://full/path/to/kallisto_hg38.idx', and should start with "
                "file://, http://, s3://, or ftp://.")
    else:
        # Input for star and rsem will be empty if user wants to run kallisto only so test
        # for not x
        require(all((x.startswith('/') or not x) for x in index_inputs),
                "Sample inputs must point to a file's full path, "
                "e.g. '/full/path/to/kallisto_hg38.idx'")

    # Output log information
    log.info('The work mount is: {}'.format(work_mount))
    log.info('Pipeline input locations: \n{}\n{}\n{}\n{}'.format(
        args.star, args.rsem, args.kallisto, args.hera))
    call_pipeline(work_mount, args)
def setOptions(self, options):
    """
    Creates a config object from the options object.

    Every attribute of ``options`` that is present and not None is copied onto this
    config object, after optional parsing and validation. Options that are absent or
    None leave the corresponding config attribute untouched.

    :param options: Object carrying the option values as attributes (read with getattr).
    :raises RuntimeError: If an option fails its validity check, if the path given to
            --workDir does not exist, or if the stats flag is combined with a clean
            value other than 'never'.
    """
    from bd2k.util.humanize import human2bytes  # This import is used to convert
    # from human readable quantities to integers

    def setOption(varName, parsingFn=None, checkFn=None):
        # If options object has the option "varName" specified
        # then set the "varName" attrib to this value in the config object
        x = getattr(options, varName, None)
        if x is not None:
            if parsingFn is not None:
                x = parsingFn(x)
            # Checkers are predicates: a falsy result means the value is rejected.
            # (Previously the result was discarded, so no range was ever enforced.)
            if checkFn is not None and not checkFn(x):
                raise RuntimeError("The %s option has an invalid value: %s" % (varName, x))
            setattr(self, varName, x)

    # Function to parse integer from string expressed in different formats
    h2b = lambda x: human2bytes(str(x))

    def iC(minValue, maxValue=sys.maxint):
        # Returns function that checks if a given int is in the given half-open interval
        assert isinstance(minValue, int) and isinstance(maxValue, int)
        return lambda x: minValue <= x < maxValue

    def fC(minValue, maxValue=None):
        # Returns function that checks if a given float is in the given half-open interval
        assert isinstance(minValue, float)
        if maxValue is None:
            return lambda x: minValue <= x
        else:
            assert isinstance(maxValue, float)
            return lambda x: minValue <= x < maxValue

    def parseJobStore(s):
        name, rest = Toil.parseLocator(s)
        if name == 'file':
            # We need to resolve relative paths early, on the leader, because the worker process
            # may have a different working directory than the leader, e.g. under Mesos.
            return Toil.buildLocator(name, os.path.abspath(rest))
        else:
            return s

    # Core options
    setOption("jobStore", parsingFn=parseJobStore)
    # TODO: LOG LEVEL STRING
    setOption("workDir")
    if self.workDir is not None:
        self.workDir = os.path.abspath(self.workDir)
        if not os.path.exists(self.workDir):
            raise RuntimeError("The path provided to --workDir (%s) does not exist."
                               % self.workDir)
    setOption("stats")
    setOption("cleanWorkDir")
    setOption("clean")
    if self.stats:
        # Stats collection needs the job store after the run, so any cleaning is a conflict.
        if self.clean != "never" and self.clean is not None:
            raise RuntimeError("Contradicting options passed: Clean flag is set to %s "
                               "despite the stats flag requiring "
                               "the jobStore to be intact at the end of the run. "
                               "Set clean to \'never\'" % self.clean)
        self.clean = "never"
    elif self.clean is None:
        self.clean = "onSuccess"

    # Restarting the workflow options
    setOption("restart")

    # Batch system options
    setOption("batchSystem")
    setOption("scale", float, fC(0.0))
    setOption("mesosMasterAddress")
    setOption("parasolCommand")
    setOption("parasolMaxBatches", int, iC(1))

    setOption("environment", parseSetEnv)

    # Autoscaling options
    setOption("provisioner")
    setOption("nodeType")
    setOption("nodeOptions")
    setOption("minNodes", int)
    setOption("maxNodes", int)
    setOption("preemptableNodeType")
    setOption("preemptableNodeOptions")
    setOption("minPreemptableNodes", int)
    setOption("maxPreemptableNodes", int)
    setOption("alphaPacking", float)
    setOption("betaInertia", float)
    setOption("scaleInterval", float)

    setOption("preemptableCompensation", float)
    require(0.0 <= self.preemptableCompensation <= 1.0,
            '--preemptableCompensation (%f) must be >= 0.0 and <= 1.0',
            self.preemptableCompensation)

    # Parameters to limit service jobs / detect deadlocks
    setOption("maxServiceJobs", int)
    setOption("maxPreemptableServiceJobs", int)
    setOption("deadlockWait", int)

    # Resource requirements
    setOption("defaultMemory", h2b, iC(1))
    setOption("defaultCores", float, fC(1.0))
    setOption("defaultDisk", h2b, iC(1))
    setOption("readGlobalFileMutableByDefault")
    setOption("maxCores", int, iC(1))
    setOption("maxMemory", h2b, iC(1))
    setOption("maxDisk", h2b, iC(1))
    setOption("defaultPreemptable")

    # Retrying/rescuing jobs
    setOption("retryCount", int, iC(0))
    setOption("maxJobDuration", int, iC(1))
    setOption("rescueJobsFrequency", int, iC(1))

    # Misc
    setOption("disableCaching")
    setOption("maxLogFileSize", h2b, iC(1))

    def checkSse(sseKey):
        # The first line of the key file must be exactly 32 characters long once trailing
        # whitespace is removed. Returned as a bool so the check also holds under `python -O`.
        with open(sseKey) as f:
            return len(f.readline().rstrip()) == 32

    setOption("sseKey", checkFn=checkSse)
    setOption("cseKey", checkFn=checkSse)
    setOption("servicePollingInterval", float, fC(0.0))

    # Debug options
    setOption("badWorker", float, fC(0.0, 1.0))
    setOption("badWorkerFailInterval", float, fC(0.0))
def _docker(job, tool, parameters=None, workDir=None, dockerParameters=None, outfile=None,
            checkOutput=False, defer=None):
    """
    Builds a `docker run` command line for ``tool``, registers deferred container cleanup and
    permission fixing on ``job``, and runs the command through subprocess.

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is
           /data. Defaults to the current working directory.
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
           `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the
           destination convention. These defaults are removed if dockerParameters is passed, so
           be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle. Do not combine with
           checkOutput: subprocess.check_output does not accept an explicit stdout.
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    :return: The result of subprocess.check_output when checkOutput is True, otherwise the
             result of subprocess.check_call.
    :raises RuntimeError: If a `--name` option is present but its value cannot be parsed.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            containerName = _getContainerName(job)
            # The generated name must be handed to docker as well; otherwise the deferred
            # _dockerKill would refer to a name the container was never given.
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        # '--name' only occurred as a substring of some other argument
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: "
                           + str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    # Compare by value: identity comparison of ints is an implementation detail
    if '--rm' in baseDockerCall and defer != RM:
        _logger.warn('--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: '
                     + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle
    # unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and isinstance(parameters[0], list):
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(pipes.quote(arg) for arg in command) for command in parameters]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the
        # chain; without it only the exit status of the last command is reported.
        call = baseDockerCall + ['--entrypoint', '/bin/bash', tool, '-c',
                                 'set -eo pipefail && {}'.format(' | '.join(chain_params))]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --samples sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \\          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consoliate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # NOTE: the docstring above is shown verbatim as the --help description (see
    # description=main.__doc__ below), so it is user-facing text rather than API documentation.
    # This entry point parses the command line, locates the mirror-mounted work directory of
    # the running container, validates the input paths and hands over to call_pipeline().

    # Define argument parser
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--samples', nargs='+', required=True,
                        help='Absolute path(s) to sample tarballs.')
    parser.add_argument('--star', type=str, required=True,
                        help='Absolute path to STAR index tarball.')
    parser.add_argument('--rsem', type=str, required=True,
                        help='Absolute path to rsem reference tarball.')
    parser.add_argument('--kallisto', type=str, required=True,
                        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument('--disable-cutadapt', action='store_true', default=False,
                        help='Cutadapt fails if samples are improperly paired. Use this flag to disable cutadapt.')
    # The default must be the boolean False: the string 'false' is truthy and would switch
    # these outputs on even when the flag is absent.
    parser.add_argument('--save-bam', action='store_true', default=False,
                        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument('--save-wiggle', action='store_true', default=False,
                        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument('--no-clean', action='store_true',
                        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument('--resume', type=str, default=None,
                        help='Pass the working directory that contains a job store to be resumed.')
    parser.add_argument('--cores', type=int, default=None,
                        help='Will set a cap on number of cores to use, default is all available cores.')

    # If no arguments provided, print full help menu. This has to happen before parse_args(),
    # which exits with a short usage error as soon as a required argument is missing.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        raise RuntimeError('No container detected, ensure Docker is being run with: '
                           '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'.format(e))

    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']

    # Ensure docker.sock is mounted correctly
    sock_mount = [x['Source'] == x['Destination'] for x in mounts if 'docker.sock' in x['Source']]
    require(len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
                                  'docker run -v /var/run/docker.sock:/var/run/docker.sock')

    # Ensure formatting of command for 2 mount points
    if len(mounts) == 2:
        require(all(x['Source'] == x['Destination'] for x in mounts),
                'Docker Src/Dst mount points, invoked with the -v argument, '
                'must be the same if only using one mount point aside from the docker socket.')
        work_mount = [x['Source'] for x in mounts if 'docker.sock' not in x['Source']]
    else:
        # Ensure only one mirror mount exists aside from docker.sock
        mirror_mounts = [x['Source'] for x in mounts if x['Source'] == x['Destination']]
        work_mount = [x for x in mirror_mounts if 'docker.sock' not in x]
        require(len(work_mount) == 1, 'Wrong number of mirror mounts provided, see documentation.')

    # If sample is given as relative path, assume it's in the work directory.
    # Absolute samples in a mixed list are kept as they are rather than dropped.
    if not all(x.startswith('/') for x in args.samples):
        args.samples = [x if x.startswith('/') else os.path.join(work_mount[0], x)
                        for x in args.samples]
        log.info('\nSample given as relative path, assuming sample is in work directory: {}'.format(work_mount[0]))

    # Enforce file input standards
    require(all(x.startswith('/') for x in args.samples),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/sample1.tar'. You provided %s", str(args.samples))
    require(all(x.startswith('/') for x in [args.star, args.kallisto, args.rsem]),
            "Sample inputs must point to a file's full path, "
            "e.g. '/full/path/to/kallisto_hg38.idx'.")

    # Output log information
    log.info('The work mount is: {}'.format(work_mount[0]))
    log.info('Samples to run: {}'.format('\t'.join(args.samples)))
    log.info('Pipeline input locations: \n{}\n{}\n{}'.format(args.star, args.rsem, args.kallisto))
    call_pipeline(work_mount[0], args)
# If no arguments provided, print full help menu if len(sys.argv) == 1: parser.print_help() sys.exit(1) # Get name of most recent running container. If socket is mounted, should be this one. try: name = subprocess.check_output(['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0] except subprocess.CalledProcessError as e: raise RuntimeError('No container detected, ensure Docker is being run with: ' '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'.format(e.message)) # Get name of mounted volume blob = json.loads(subprocess.check_output(['docker', 'inspect', name])) mounts = blob[0]['Mounts'] # Ensure docker.sock is mounted correctly sock_mount = [x['Source'] == x['Destination'] for x in mounts if 'docker.sock' in x['Source']] require(len(sock_mount) == 1, 'Missing socket mount. Requires the following: ' 'docker run -v /var/run/docker.sock:/var/run/docker.sock') work_mount = args.work_mount for samples in [args.sample_tar, args.sample_paired, args.sample_single]: if not samples: continue # If sample is given as relative path, assume it's in the work directory if not all(x.startswith('/') for x in samples): samples = [os.path.join(work_mount, x) for x in samples if not x.startswith('/')] log.info('\nSample given as relative path, assuming sample is in work directory: {}'.format(work_mount[0])) # Enforce file input standards require(all(x.startswith('/') for x in samples), "Sample inputs must point to a file's full path, " "e.g. '/full/path/to/sample1.tar'. You provided %s", str(samples)) if samples == args.sample_tar: log.info('TARs to run: {}'.format('\t'.join(args.sample_tar))) if samples == args.sample_paired:
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Dockerized Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General Usage:
    docker run -v $(pwd):$(pwd) -v /var/run/docker.sock:/var/run/docker.sock \
    quay.io/ucsc_cgl/rnaseq-cgl-pipeline --samples sample1.tar

    Please see the complete documentation located at:
    https://github.com/BD2KGenomics/cgl-docker-lib/tree/master/rnaseq-cgl-pipeline
    or inside the container at: /opt/rnaseq-pipeline/README.md


    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 8
                 \\          |
                  7 ---------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = FastQC
    8 = Consoliate output and upload to S3
    =======================================
    Dependencies
    Docker
    """
    # NOTE: the docstring above is shown verbatim as the --help description (see
    # description=main.__doc__ below), so it is user-facing text rather than API documentation.
    # This entry point parses the command line, checks that the docker socket is mounted,
    # validates the input paths and hands over to call_pipeline() with the current working
    # directory as the work directory.

    # Define argument parser
    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '--samples',
        nargs='+',
        required=True,
        help='Absolute path(s) to sample tarballs.')
    parser.add_argument(
        '--star',
        type=str,
        required=True,
        help='Absolute path to STAR index tarball.')
    parser.add_argument(
        '--rsem',
        type=str,
        required=True,
        help='Absolute path to rsem reference tarball.')
    parser.add_argument(
        '--kallisto',
        type=str,
        required=True,
        help='Absolute path to kallisto index (.idx) file.')
    parser.add_argument(
        '--disable-cutadapt',
        action='store_true',
        default=False,
        help='Cutadapt fails if samples are improperly paired. Use this flag to disable cutadapt.')
    # The default must be the boolean False: the string 'false' is truthy and would switch
    # these outputs on even when the flag is absent.
    parser.add_argument(
        '--save-bam',
        action='store_true',
        default=False,
        help='If this flag is used, genome-aligned bam is written to output.')
    parser.add_argument(
        '--save-wiggle',
        action='store_true',
        default=False,
        help='If this flag is used, wiggle files (.bg) are written to output.')
    parser.add_argument(
        '--no-clean',
        action='store_true',
        help='If this flag is used, temporary work directory is not cleaned.')
    parser.add_argument(
        '--resume',
        type=str,
        default=None,
        help='Pass the working directory that contains a job store to be resumed.')
    parser.add_argument(
        '--cores',
        type=int,
        default=None,
        help='Will set a cap on number of cores to use, default is all available cores.')

    # If no arguments provided, print full help menu. This has to happen before parse_args(),
    # which exits with a short usage error as soon as a required argument is missing.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Get name of most recent running container. If socket is mounted, should be this one.
    try:
        name = subprocess.check_output(
            ['docker', 'ps', '--format', '{{.Names}}']).split('\n')[0]
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            'No container detected, ensure Docker is being run with: '
            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument. \n\n{}'
            .format(e))

    # Get name of mounted volume
    blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
    mounts = blob[0]['Mounts']

    # Ensure docker.sock is mounted correctly
    sock_mount = [
        x['Source'] == x['Destination'] for x in mounts
        if 'docker.sock' in x['Source']
    ]
    require(
        len(sock_mount) == 1, 'Missing socket mount. Requires the following: '
        'docker run -v /var/run/docker.sock:/var/run/docker.sock')

    # workdir is the cwd so CWL can collect the output
    work_dir = os.getcwd()

    # If sample is given as relative path, assume it's in the work directory.
    # Absolute samples in a mixed list are kept as they are rather than dropped.
    if not all(x.startswith('/') for x in args.samples):
        args.samples = [
            x if x.startswith('/') else os.path.join(work_dir, x)
            for x in args.samples
        ]
        log.info(
            '\nSample given as relative path, assuming sample is in work directory: {}'
            .format(work_dir))

    # Enforce file input standards
    require(
        all(x.startswith('/') for x in args.samples),
        "Sample inputs must point to a file's full path, "
        "e.g. '/full/path/to/sample1.tar'. You provided %s", str(args.samples))
    require(
        all(x.startswith('/') for x in [args.star, args.kallisto, args.rsem]),
        "Sample inputs must point to a file's full path, "
        "e.g. '/full/path/to/kallisto_hg38.idx'.")

    # Output log information
    log.info('The work mount is: {}'.format(work_dir))
    log.info('Samples to run: {}'.format('\t'.join(args.samples)))
    log.info('Pipeline input locations: \n{}\n{}\n{}'.format(
        args.star, args.rsem, args.kallisto))
    call_pipeline(work_dir, args)
def resume(self):
    """
    Checks that the job store location exists and is a directory, then delegates to the
    parent class' resume().

    :raises NoSuchJobStoreException: If self.jobStoreDir does not exist.
    """
    if not os.path.exists(self.jobStoreDir):
        raise NoSuchJobStoreException(self.jobStoreDir)
    # Call isdir on the path: passing the bare function object is always truthy, which
    # silently disabled this check.
    require(os.path.isdir(self.jobStoreDir),
            "'%s' is not a directory", self.jobStoreDir)
    super(FileJobStore, self).resume()
def _docker(job, tool, parameters=None, workDir=None, dockerParameters=None, outfile=None,
            checkOutput=False, defer=None):
    """
    Builds a `docker run` command line for ``tool``, registers deferred container cleanup and
    permission fixing on ``job``, and runs the command through subprocess.

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is
           /data. Defaults to the current working directory.
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
           `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the
           destination convention. These defaults are removed if dockerParameters is passed, so
           be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle. Do not combine with
           checkOutput: subprocess.check_output does not accept an explicit stdout.
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    :return: The result of subprocess.check_output when checkOutput is True, otherwise the
             result of subprocess.check_call.
    :raises RuntimeError: If a `--name` option is present but its value cannot be parsed.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            # No name supplied: generate one and pass it to docker so that the deferred
            # _dockerKill refers to the container that is actually started.
            containerName = _getContainerName(job)
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        # '--name' only occurred as a substring of some other argument
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: "
                           + str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    # Compare by value: identity comparison of ints is an implementation detail
    if '--rm' in baseDockerCall and defer != RM:
        _logger.warn('--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: '
                     + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle
    # unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and isinstance(parameters[0], list):
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(pipes.quote(arg) for arg in command) for command in parameters]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the chain
        call = baseDockerCall + ['--entrypoint', '/bin/bash', tool, '-c',
                                 'set -eo pipefail && {}'.format(' | '.join(chain_params))]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out