def star_index(job, args):
    # Download the reference and annotation needed to build the STAR index
    download_url(url=args.ref, name='ref.fa', work_dir=job.tempDir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=job.tempDir)
    # Run STAR to generate index
    star_dir = os.path.join(job.tempDir, args.star_name)
    os.mkdir(star_dir)
    parameters = ['--runThreadN', str(args.cores),
                  '--runMode', 'genomeGenerate',
                  '--genomeDir', '/data/' + args.star_name,
                  '--genomeFastaFiles', 'ref.fa',
                  '--sjdbGTFfile', 'annotation.gtf']
    dockerCall(job, tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
               workDir=job.tempDir, parameters=parameters)
    # Compress starIndex into a tarball
    star_tar = '{}.tar.gz'.format(args.star_name)
    tarball_files(star_tar, file_paths=[star_dir], output_dir=job.tempDir)
    # Move to output dir or return the tarball's FileStoreID
    tar_path = os.path.join(job.tempDir, star_tar)
    if _move_instead_of_return:
        move_files([tar_path], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(tar_path)
def convert_bam_to_fastq(job, bam_path, check_paired=True, ignore_validation_errors=True):
    """
    Converts BAM to a pair of FASTQ files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_path: Path to BAM
    :param bool check_paired: If True, checks whether BAM is paired-end
    :param bool ignore_validation_errors: If True, ignores validation errors from picardTools
    :return: FileStoreIDs for R1 and R2
    :rtype: tuple
    """
    if check_paired:
        assert_bam_is_paired_end(job, bam_path)

    work_dir = os.path.dirname(os.path.abspath(bam_path))
    parameters = ['SamToFastq',
                  'I={}'.format(docker_path(bam_path)),
                  'F=/data/R1.fq',
                  'F2=/data/R2.fq']
    if ignore_validation_errors:
        parameters.append('VALIDATION_STRINGENCY=SILENT')
    dockerCall(job=job, workDir=work_dir, parameters=parameters, tool=picardtools_version)
    r1 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fq'))
    r2 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2.fq'))
    return r1, r2
def _testDockerCleanFn(job, workDir, detached=None, rm=None, defer=None, containerName=None):
    """
    Test function for test docker_clean. Runs a container with given flags and then dies,
    leaving behind a zombie container.

    :param toil.job.Job job: job
    :param workDir: See `work_dir=` in :func:`dockerCall`
    :param bool rm: See `rm=` in :func:`dockerCall`
    :param bool detached: See `detached=` in :func:`dockerCall`
    :param int defer: See `defer=` in :func:`dockerCall`
    :param str containerName: See `container_name=` in :func:`dockerCall`
    :return:
    """
    dockerParameters = ['--log-driver=none', '-v',
                        os.path.abspath(workDir) + ':/data',
                        '--name', containerName]
    if detached:
        dockerParameters.append('-d')
    if rm:
        dockerParameters.append('--rm')

    def killSelf():
        test_file = os.path.join(workDir, 'test.txt')
        # This will kill the worker once we are sure the docker container started
        while not os.path.exists(test_file):
            _log.debug('Waiting on the file created by spooky_container.')
            time.sleep(1)
        # By the time we reach here, we are sure the container is running.
        os.kill(os.getpid(), signal.SIGKILL)  # signal.SIGINT)

    t = Thread(target=killSelf)
    # Make it a daemon thread so that thread failure doesn't hang tests.
    t.daemon = True
    t.start()
    dockerCall(job, tool='quay.io/ucsc_cgl/spooky_test', workDir=workDir,
               defer=defer, dockerParameters=dockerParameters)
def rsem_index(job, args):
    # Download the reference and annotation needed to build the RSEM reference
    download_url(url=args.ref, name='ref.fa', work_dir=job.tempDir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=job.tempDir)
    # Run RSEM to generate reference
    rsem_dir = os.path.join(job.tempDir, args.rsem_name)
    os.mkdir(rsem_dir)
    docker_parameters = ['--entrypoint', 'rsem-prepare-reference',
                         '-v', '{}:/data'.format(job.tempDir),
                         '--rm', '--log-driver=none']
    parameters = ['-p', str(args.cores),
                  '--gtf', '/data/annotation.gtf',
                  '/data/ref.fa',
                  os.path.join('/data', args.rsem_name, args.rsem_name)]
    dockerCall(job, tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
               parameters=parameters, dockerParameters=docker_parameters)
    # Compress rsemRef into a tarball
    rsem_tar = '{}.tar.gz'.format(args.rsem_name)
    tarball_files(rsem_tar, file_paths=[rsem_dir], output_dir=job.tempDir)
    # Move to output dir or return the tarball's FileStoreID
    tar_path = os.path.join(job.tempDir, rsem_tar)
    if _move_instead_of_return:
        move_files([tar_path], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(tar_path)
def run_sambamba_sort(job, bam, sort_by_name=False):
    """
    Sorts BAM file using Sambamba sort

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param boolean sort_by_name: If true, sorts by read name instead of coordinate
    :return: FileStoreID for sorted BAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    command = ['/usr/local/bin/sambamba',
               'sort',
               '-t', str(int(job.cores)),
               '-m', str(job.memory),
               '-o', '/data/output.bam',
               '/data/input.bam']
    if sort_by_name:
        command.append('-n')
    start_time = time.time()
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/biocontainers/sambamba:0.6.6--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "sambamba sort")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))
def sort_and_save_bam(job, config, bam_id, skip_sort=True):
    """
    Sorts STAR's output BAM using samtools

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID bam_id: FileID for STAR's genome-aligned BAM
    :param bool skip_sort: If True, skips the sort step and uploads the BAM as-is
    """
    bam_path = os.path.join(job.tempDir, 'aligned.bam')
    sorted_bam = os.path.join(job.tempDir, '{}.sorted.bam'.format(config.uuid))
    job.fileStore.readGlobalFile(bam_id, bam_path)

    parameters = ['sort',
                  '-o', '/data/{}.sorted.bam'.format(config.uuid),
                  '-O', 'bam',
                  '-T', 'temp',
                  '-@', str(job.cores),
                  '/data/aligned.bam']

    if skip_sort:
        job.log('Skipping samtools sort as STAR already sorted the BAM')
        os.rename(bam_path, sorted_bam)
    else:
        dockerCall(job, tool=samtools_version, parameters=parameters, workDir=job.tempDir)

    move_or_upload(config, files=[sorted_bam])
def run_bamqc(job, aligned_bam_id, config, save_bam=False):
    """
    Run BAMQC as specified by Treehouse (UCSC)
    https://github.com/UCSC-Treehouse/bam-umend-qc

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str aligned_bam_id: FileStoreID of the aligned BAM from STAR
    :param Expando config: Contains sample information
    :param bool save_bam: Option to save the duplicate-marked BAM from BAMQC
    :return: FileStoreID for output tar
    :rtype: str
    """
    job.fileStore.readGlobalFile(aligned_bam_id, os.path.join(job.tempDir, 'input.bam'))
    dockerCall(job, tool=bamqc_version, workDir=job.tempDir, parameters=['/data/input.bam', '/data'])

    # Tar output files
    output_names = ['readDist.txt', 'bam_umend_qc.tsv', 'bam_umend_qc.json']
    output_files = [os.path.join(job.tempDir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=job.tempDir)
    tar_path = os.path.join(job.tempDir, 'bam_qc.tar.gz')

    # Save output BAM - this step is done here instead of in its own job for efficiency
    if save_bam:
        # Tag bam with sample UUID, upload, and delete the local copy
        bam_path = os.path.join(job.tempDir, 'sortedByCoord.md.bam')
        new_bam = os.path.join(job.tempDir, config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam)
        move_or_upload(config, [new_bam])
        os.remove(new_bam)

    # Delete intermediates
    job.fileStore.deleteGlobalFile(aligned_bam_id)

    return job.fileStore.writeGlobalFile(tar_path)
def star_index(job, args):
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=args.ref, name='ref.fa', work_dir=work_dir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=work_dir)
    # Run STAR to generate index
    star_dir = os.path.join(work_dir, args.star_name)
    os.mkdir(star_dir)
    parameters = ['--runThreadN', str(args.cores),
                  '--runMode', 'genomeGenerate',
                  '--genomeDir', '/data/' + args.star_name,
                  '--genomeFastaFiles', 'ref.fa',
                  '--sjdbGTFfile', 'annotation.gtf']
    dockerCall(job, tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
               workDir=work_dir, parameters=parameters)
    # Compress starIndex into a tarball
    subprocess.check_call(['tar', '-zcvf', star_dir + '.tar.gz', star_dir])
    # Move to output dir or return the tarball's FileStoreID
    if _move_instead_of_return:
        move_files([star_dir + '.tar.gz'], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(star_dir + '.tar.gz')
def run_fastqc(job, r1_id, r2_id):
    """
    Run FastQC on the input reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2
    :return: FileStoreID of FastQC output (tarball)
    :rtype: str
    """
    # Read in files and set parameters
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = ['/data/R1.fastq']
    output_names = ['R1_fastqc.html', 'R1_fastqc.zip']
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend(['-t', '2', '/data/R2.fastq'])
        output_names.extend(['R2_fastqc.html', 'R2_fastqc.zip'])

    # Call FastQC
    dockerCall(job=job, tool=fastqc_version, workDir=job.tempDir, parameters=parameters)

    # Package output files and return FileStoreID
    output_files = [os.path.join(job.tempDir, x) for x in output_names]
    tarball_files(tar_name='fastqc.tar.gz', file_paths=output_files, output_dir=job.tempDir)
    return job.fileStore.writeGlobalFile(os.path.join(job.tempDir, 'fastqc.tar.gz'))
def download_bam_from_gdc(job, work_dir, url, token):
    """
    Downloads a BAM file from the GDC using a URL (format: "gdc://<GDC ID>") and a GDC access token

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str work_dir: Directory being mounted into Docker
    :param str work_dir: Directory being mounted into Docker
    :param str url: gdc URL to be downloaded
    :param str token: Full path to token
    :return: Path to BAM
    :rtype: str
    """
    assert token, 'gdc_token is missing which is required for downloading. Check config.'
    copy_files([os.path.abspath(token)], work_dir)
    parsed_url = urlparse(url)
    parameters = ['download',
                  '-d', '/data',
                  '-t', '/data/{}'.format(os.path.basename(token)),
                  parsed_url.netloc]
    dockerCall(job, tool=gdc_version, parameters=parameters, workDir=work_dir)
    files = [x for x in os.listdir(os.path.join(work_dir, parsed_url.netloc)) if x.lower().endswith('.bam')]
    assert len(files) == 1, 'Expected exactly one BAM from GDC URL, found: {}'.format(files)
    bam_path = os.path.join(work_dir, parsed_url.netloc, files[0])
    return bam_path
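# A minimal usage sketch (not part of the original module), assuming the helpers above:
# download a BAM from the GDC inside a single Toil job, then convert it to a FASTQ pair
# with Picard. The gdc_url and token_path arguments are illustrative placeholders.
def example_gdc_bam_to_fastq(job, gdc_url, token_path):
    work_dir = job.fileStore.getLocalTempDir()
    # Fetch the BAM into the job's local temp dir using the GDC client container
    bam_path = download_bam_from_gdc(job, work_dir, url=gdc_url, token=token_path)
    # Convert to paired FASTQs; returns FileStoreIDs for R1 and R2
    return convert_bam_to_fastq(job, bam_path, check_paired=True)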
def kallisto_index(job, args):
    # Obtain the transcriptome: download it if provided, otherwise build it from the reference and GTF
    if args.transcriptome:
        download_url(url=args.transcriptome, name='transcriptome.fa', work_dir=job.tempDir)
    else:
        _create_transcriptome(job, args, job.tempDir)
    # Run Kallisto index
    parameters = ['index', 'transcriptome.fa', '-i', '/data/{}.index'.format(args.kallisto_name)]
    dockerCall(job, tool='quay.io/ucsc_cgl/kallisto:0.43.1--355c19b1fb6fbb85f7f8293e95fb8a1e9d0da163',
               workDir=job.tempDir, parameters=parameters)
    # Move to output dir or return the index's FileStoreID
    output_path = os.path.join(job.tempDir, args.kallisto_name + '.index')
    if _move_instead_of_return:
        move_files([output_path], args.output_dir)
    else:
        return job.fileStore.writeGlobalFile(output_path)
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str aligned_bam_id: FileStoreID of aligned bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: Boolean flag (True if QC failed) and FileStoreID for the output tar
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(aligned_bam_id, os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    dockerCall(job, tool='hbeale/treehouse_bam_qc:1.0', workDir=work_dir,
               parameters=['runQC.sh', str(job.cores)])

    # Tar output files
    output_names = ['readDist.txt',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.txt']
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=work_dir)

    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir, config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)

    # Delete intermediates
    job.fileStore.deleteGlobalFile(aligned_bam_id)

    return fail_flag, job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bam_qc.tar.gz'))
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired-end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    # Retrieve RSEM reference
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=job.tempDir)
    subprocess.check_call(['tar', '-xvf', os.path.join(job.tempDir, 'rsem_ref.tar.gz'), '-C', job.tempDir])
    os.remove(os.path.join(job.tempDir, 'rsem_ref.tar.gz'))

    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix
    rsem_files = []
    for root, directories, files in os.walk(job.tempDir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [os.path.basename(os.path.splitext(x)[0]) for x in rsem_files if 'grp' in x][0]
    ref_folder = os.path.join('/data', os.listdir(job.tempDir)[0]) if len(os.listdir(job.tempDir)) == 1 else '/data'

    # Read bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(job.tempDir, 'transcriptome.bam'))

    # Call: RSEM
    output_prefix = 'rsem'
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(job.cores),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', '/data/transcriptome.bam',
                  os.path.join(ref_folder, ref_prefix),
                  output_prefix]
    if paired:
        parameters = ['--paired-end'] + parameters
    dockerCall(job, parameters=parameters, workDir=job.tempDir, tool=rsem_version)

    # Store output in fileStore and return
    gene_id = job.fileStore.writeGlobalFile(os.path.join(job.tempDir, output_prefix + '.genes.results'))
    isoform_id = job.fileStore.writeGlobalFile(os.path.join(job.tempDir, output_prefix + '.isoforms.results'))
    return gene_id, isoform_id
def _testSubprocessDockerPermissions(job):
    # Files created inside the container under the mounted /data directory
    # should be owned by the calling user, not root
    testDir = job.fileStore.getLocalTempDir()
    dockerCall(job, tool='ubuntu', workDir=testDir, parameters=[['touch', '/data/test.txt']])
    outFile = os.path.join(testDir, 'test.txt')
    assert os.path.exists(outFile)
    assert not ownerName(outFile) == "root"
def run_base_recalibration(job, bam, bai, ref, ref_dict, fai, dbsnp, mills, unsafe=False):
    """
    Creates recalibration table for Base Quality Score Recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :param str fai: FileStoreID for reference genome fasta index file
    :param str dbsnp: FileStoreID for dbSNP VCF file
    :param str mills: FileStoreID for Mills VCF file
    :param bool unsafe: If True, runs GATK in UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreID for the recalibration table file
    :rtype: str
    """
    inputs = {'ref.fasta': ref,
              'ref.fasta.fai': fai,
              'ref.dict': ref_dict,
              'input.bam': bam,
              'input.bai': bai,
              'dbsnp.vcf': dbsnp,
              'mills.vcf': mills}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))

    # Call: GATK -- BaseRecalibrator
    parameters = ['-T', 'BaseRecalibrator',
                  '-nct', str(int(job.cores)),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/input.bam',
                  # Recommended known sites:
                  # https://software.broadinstitute.org/gatk/guide/article?id=1247
                  '-knownSites', '/data/dbsnp.vcf',
                  '-knownSites', '/data/mills.vcf',
                  '-o', '/data/recal_data.table']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])

    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_parameters = ['--rm', '--log-driver', 'none',
                         '-e', 'JAVA_OPTS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]
    start_time = time.time()
    dockerCall(job=job,
               tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
               workDir=work_dir, parameters=parameters, dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "GATK3 BaseRecalibrator")

    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'recal_data.table'))
def apply_bqsr_recalibration(job, table, bam, bai, ref, ref_dict, fai, unsafe=False):
    """
    Creates BAM file with recalibrated base quality scores

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str table: FileStoreID for BQSR recalibration table file
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :param str fai: FileStoreID for reference genome fasta index file
    :param bool unsafe: If True, runs GATK in UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for recalibrated BAM and BAI files
    :rtype: tuple(str, str)
    """
    inputs = {'ref.fasta': ref,
              'ref.fasta.fai': fai,
              'ref.dict': ref_dict,
              'recal.table': table,
              'input.bam': bam,
              'input.bai': bai}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))

    # Call: GATK -- PrintReads
    parameters = ['-T', 'PrintReads',
                  '-nct', str(int(job.cores)),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/input.bam',
                  '-BQSR', '/data/recal.table',
                  '-o', '/data/bqsr.bam']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])

    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_parameters = ['--rm', '--log-driver', 'none',
                         '-e', 'JAVA_OPTS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]
    start_time = time.time()
    dockerCall(job=job,
               tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
               workDir=work_dir, parameters=parameters, dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "GATK3 BQSR PrintReads")

    output_bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bqsr.bam'))
    output_bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bqsr.bai'))
    return output_bam, output_bai
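# A minimal wiring sketch (not part of the original module) for the two-step BQSR
# protocol above: build the recalibration table, then apply it with PrintReads.
# Inputs are FileStoreIDs; the resource requirements are illustrative placeholders.
def example_bqsr(job, bam, bai, ref, ref_dict, fai, dbsnp, mills):
    recal_job = job.addChildJobFn(run_base_recalibration, bam, bai, ref, ref_dict, fai,
                                  dbsnp, mills, cores=4, memory='10G', disk='20G')
    apply_job = recal_job.addFollowOnJobFn(apply_bqsr_recalibration, recal_job.rv(), bam, bai,
                                           ref, ref_dict, fai, cores=4, memory='10G', disk='20G')
    # Returns a promise for the (recalibrated BAM, BAI) FileStoreID tuple
    return apply_job.rv()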
def _create_transcriptome(job, args, work_dir):
    # Download files to generate transcriptome
    download_url(url=args.ref, name='ref.fa', work_dir=work_dir)
    download_url(url=args.gtf, name='annotation.gtf', work_dir=work_dir)
    # Extract transcript sequences from the reference using the GTF annotation
    parameters = ['gtf_to_fasta',
                  '/data/annotation.gtf',
                  '/data/ref.fa',
                  '/data/transcriptome.fa']
    dockerCall(job, tool='limesbonn/tophat2', workDir=work_dir, parameters=parameters)
def index_bam(job, bam_path):
    """
    Creates a BAM index (.bai) in the same directory as the BAM
    Indexing is necessary for viewing slices of the BAM

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_path: Path to BAM
    """
    work_dir = os.path.dirname(os.path.abspath(bam_path))
    parameters = ['index', docker_path(bam_path)]
    dockerCall(job, workDir=work_dir, parameters=parameters, tool=samtools_version)
def run_samtools_faidx(job, ref_id):
    """
    Use Samtools to create reference index file

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreID for reference index
    :rtype: str
    """
    job.fileStore.logToMaster('Created reference index')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['faidx', '/data/ref.fasta']
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.fasta.fai'))
def run_picard_create_sequence_dictionary(job, ref_id):
    """
    Uses Picard to create reference sequence dictionary

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome fasta file
    :return: FileStoreID for sequence dictionary file
    :rtype: str
    """
    job.fileStore.logToMaster('Created reference dictionary')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['CreateSequenceDictionary', 'R=ref.fasta', 'O=ref.dict']
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.dict'))
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    # Retrieve files and define parameters
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=job.tempDir)
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(job.cores),
                  '-o', '/data/',
                  '-b', '100',
                  '--fusion']

    # If R2 fastq is present...
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend(['/data/R1.fastq', '/data/R2.fastq'])
    else:
        parameters.extend(['--single', '-l', '200', '-s', '15', '/data/R1.fastq'])

    # Call: Kallisto
    dockerCall(job, workDir=job.tempDir, parameters=parameters, tool=kallisto_version)

    # Tar output files together, store in fileStore, and return
    output_names = ['run_info.json', 'abundance.tsv', 'abundance.h5', 'fusion.txt']
    output_files = [os.path.join(job.tempDir, x) for x in output_names]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=job.tempDir)
    return job.fileStore.writeGlobalFile(os.path.join(job.tempDir, 'kallisto.tar.gz'))
def run_samtools_index(job, bam):
    """
    Runs SAMtools index to create a BAM index file

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID of the BAM file
    :return: FileStoreID for BAM index file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sample.bam'))
    # Call: index the bam
    parameters = ['index', '/data/sample.bam']
    dockerCall(job=job, workDir=work_dir, parameters=parameters,
               tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    # Write to fileStore
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bam.bai'))
def picard_mark_duplicates(job, bam, bai, validation_stringency='LENIENT'):
    """
    Runs Picard MarkDuplicates on a BAM file. Requires that the BAM file be coordinate sorted.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str validation_stringency: BAM file validation stringency, default is LENIENT
    :return: FileStoreIDs for BAM and BAI files
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()

    # Retrieve file path
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sorted.bam'))
    job.fileStore.readGlobalFile(bai, os.path.join(work_dir, 'sorted.bai'))

    # Call: picardtools
    command = ['MarkDuplicates',
               'INPUT=sorted.bam',
               'OUTPUT=mkdups.bam',
               'METRICS_FILE=metrics.txt',
               'ASSUME_SORTED=true',
               'CREATE_INDEX=true',
               'VALIDATION_STRINGENCY=%s' % validation_stringency.upper()]

    # picard-tools container doesn't have JAVA_OPTS variable
    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_parameters = ['--rm', '--log-driver', 'none',
                         '-e', 'JAVA_OPTIONS=-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory),
                         '-v', '{}:/data'.format(work_dir)]
    start_time = time.time()
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
               dockerParameters=docker_parameters)
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "Picard MarkDuplicates")

    bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bam'))
    bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bai'))
    return bam, bai
def run_bwa_index(job, ref_id):
    """
    Use BWA to create reference index files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreIDs for BWA index files
    :rtype: tuple(str, str, str, str, str)
    """
    job.fileStore.logToMaster('Created BWA index files')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
    command = ['index', '/data/ref.fa']
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/bwa:0.7.12--256539928ea162949d8a65ca5c79a72ef557ce7c')
    ids = {}
    for output in ['ref.fa.amb', 'ref.fa.ann', 'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa']:
        ids[output.split('.')[-1]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))
    return ids['amb'], ids['ann'], ids['bwt'], ids['pac'], ids['sa']
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple(str, str)
    """
    # Retrieve files and define parameters
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']

    # If R2 fastq is present...
    if r2_id:
        require(rev_3pr_adapter, "Paired end data requires a reverse 3' adapter sequence.")
        job.fileStore.readGlobalFile(r2_id, os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend(['-A', rev_3pr_adapter,
                           '-o', '/data/R1_cutadapt.fastq',
                           '-p', '/data/R2_cutadapt.fastq',
                           '/data/R1.fastq', '/data/R2.fastq'])
    else:
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])

    # Call: CutAdapt
    dockerCall(job=job, tool=cutadapt_version, workDir=job.tempDir, parameters=parameters)

    # Write to fileStore
    r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(job.tempDir, 'R1_cutadapt.fastq'))
    r2_cut_id = job.fileStore.writeGlobalFile(os.path.join(job.tempDir, 'R2_cutadapt.fastq')) if r2_id else None
    return r1_cut_id, r2_cut_id
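# A minimal wiring sketch (not part of the original module) chaining run_cutadapt and
# run_kallisto (defined above) as Toil child/follow-on jobs. The adapter sequences,
# index URL, and resource requirements are illustrative placeholders.
def example_trim_then_quantify(job, r1_id, r2_id, kallisto_index_url):
    cutadapt = job.addChildJobFn(run_cutadapt, r1_id, r2_id,
                                 fwd_3pr_adapter='AGATCGGAAGAG',
                                 rev_3pr_adapter='AGATCGGAAGAG',
                                 disk='20G')
    # run_cutadapt returns (r1_cut_id, r2_cut_id); pass the indexed promises to Kallisto
    kallisto = cutadapt.addFollowOnJobFn(run_kallisto, cutadapt.rv(0), cutadapt.rv(1),
                                         kallisto_index_url, cores=4, disk='20G')
    return kallisto.rv()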
def call_conductor(job, master_ip, src, dst, memory=None, override_parameters=None):
    """
    Invokes the Conductor container to copy files between S3 and HDFS.
    Find Conductor at https://github.com/BD2KGenomics/conductor.

    :param toil.Job.job job: The Toil Job calling this function
    :param master_ip: The Spark leader IP address.
    :param src: URL of the file to copy.
    :param dst: URL of the location to copy the file to.
    :param memory: Gigabytes of memory to provision for Spark driver/worker.
    :param override_parameters: Parameters passed by the user that override our defaults.

    :type master_ip: MasterAddress
    :type src: string
    :type dst: string
    :type memory: int or None
    :type override_parameters: list of string or None
    """
    arguments = ["-C", src, dst]

    docker_parameters = ['--log-driver', 'none',
                         master_ip.docker_parameters(["--net=host"])]
    dockerCall(job=job,
               tool="quay.io/ucsc_cgl/conductor",
               parameters=_make_parameters(master_ip,
                                           [],  # no conductor specific spark configuration
                                           memory,
                                           arguments,
                                           override_parameters),
               dockerParameters=docker_parameters)
def run_samtools_rmdup(job, bam):
    """
    Removes PCR duplicates using SAMtools rmdup

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :return: FileStoreID for the deduplicated BAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'input.bam'))
    command = ['rmdup', '/data/input.bam', '/data/output.bam']
    start_time = time.time()
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "samtools rmdup")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.bam'))
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    if r2_id:
        require(rev_3pr_adapter, "Paired end data requires a reverse 3' adapter sequence.")

    # Retrieve files
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-A', rev_3pr_adapter,
                           '-o', '/data/R1_cutadapt.fastq',
                           '-p', '/data/R2_cutadapt.fastq',
                           '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])

    # Call: CutAdapt
    dockerCall(job=job,
               tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
               workDir=work_dir, parameters=parameters)

    # Write to fileStore
    if r1_id and r2_id:
        r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2_cutadapt.fastq'))
    else:
        r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = None
    return r1_cut_id, r2_cut_id
def run_rsem_gene_mapping(job, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM output files to map ENSEMBL IDs to Gencode HUGO gene names

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs for the RSEM and HUGO-mapped output tarballs
    :rtype: tuple(str, str)
    """
    # Retrieve input files
    genes = job.fileStore.readGlobalFile(rsem_gene_id, os.path.join(job.tempDir, 'rsem_genes.results'))
    iso = job.fileStore.readGlobalFile(rsem_isoform_id, os.path.join(job.tempDir, 'rsem_isoforms.results'))

    # Perform HUGO gene / isoform name mapping
    command = ['-g', 'rsem_genes.results', '-i', 'rsem_isoforms.results']
    dockerCall(job, parameters=command, workDir=job.tempDir, tool=rsemgenemapping_version)
    hugo_files = [os.path.join(job.tempDir, x)
                  for x in ['rsem_genes.hugo.results', 'rsem_isoforms.hugo.results']]

    # Create tarballs for output, store in fileStore, and return
    tarball_files('rsem.tar.gz', file_paths=[genes, iso], output_dir=job.tempDir)
    tarball_files('rsem_hugo.tar.gz', file_paths=hugo_files, output_dir=job.tempDir)
    rsem_id = job.fileStore.writeGlobalFile(os.path.join(job.tempDir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(os.path.join(job.tempDir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
def run_samblaster(job, sam):
    """
    Marks reads as PCR duplicates using SAMBLASTER

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str sam: FileStoreID for SAM file
    :return: FileStoreID for deduped SAM file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(sam, os.path.join(work_dir, 'input.sam'))
    command = ['/usr/local/bin/samblaster',
               '-i', '/data/input.sam',
               '-o', '/data/output.sam',
               '--ignoreUnmated']
    start_time = time.time()
    dockerCall(job=job, workDir=work_dir, parameters=command,
               tool='quay.io/biocontainers/samblaster:0.1.24--0')
    end_time = time.time()
    _log_runtime(job, start_time, end_time, "SAMBLASTER")
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.sam'))
def _testDockerPermissions(job):
    testDir = job.fileStore.getLocalTempDir()
    dockerCall(job, tool='ubuntu', workDir=testDir, parameters=[['touch', '/data/test.txt']])
    outFile = os.path.join(testDir, 'test.txt')
    assert os.path.exists(outFile)
    assert not ownerName(outFile) == "root"
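# A minimal sketch (not part of the original module) of launching one of these job
# functions as a standalone Toil workflow. run_samtools_faidx is defined above; the
# job store and file paths are illustrative placeholders.
def example_faidx_workflow():
    from argparse import ArgumentParser
    from toil.common import Toil
    from toil.job import Job

    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args(['./example-jobstore'])

    with Toil(options) as toil:
        # Import the reference into the file store, index it, then export the .fai
        ref_id = toil.importFile('file:///path/to/ref.fasta')
        root = Job.wrapJobFn(run_samtools_faidx, ref_id, cores=1, memory='2G', disk='4G')
        fai_id = toil.start(root)
        toil.exportFile(fai_id, 'file:///path/to/ref.fasta.fai')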