def chunk_vg(xg_path, path_name, out_dir, chunks, chunk_i, overwrite):
    """ use vg find to make one chunk of the graph """
    chunk = chunks[chunk_i]
    vg_chunk_path = chunk_base_name(chunk[0], out_dir, chunk_i, ".vg")
    if overwrite or not os.path.isfile(vg_chunk_path):
        first_node = xg_path_node_id(xg_path, chunk[0], int(chunk[1]), out_dir)
        # xg_path query takes 0-based inclusive coordinates, so we
        # subtract 1 below to convert from the BED chunk (0-based, end-exclusive)
        last_node = xg_path_node_id(xg_path, chunk[0], chunk[2] - 1, out_dir)
        assert first_node > 0 and last_node >= first_node
        # todo: would be cleaner to not have to pad context here
        with open(vg_chunk_path, "w") as vg_chunk_path_stream:
            command = ['find', '-x', os.path.basename(xg_path), '-r',
                       str(first_node) + ':' + str(last_node), '-c', '1']
            docker_call(work_dir=out_dir, parameters=command,
                        tool='quay.io/ucsc_cgl/vg:latest',
                        outfile=vg_chunk_path_stream)
        # but because we got a context, manually go in and make sure
        # our path starts at first_node by deleting everything before
        left_path_padding = xg_path_predecessors(xg_path, path_name, first_node,
                                                 out_dir, context=1)
        for destroy_id in left_path_padding:
            # destroy should take a node list
            destroy_list = vg_chunk_path + ".destroy"
            with open(destroy_list, "w") as destroy_list_stream:
                command = ['vg mod -y {} {}'.format(str(destroy_id),
                                                    os.path.basename(vg_chunk_path)),
                           'vg mod -o -']
                docker_call(work_dir=out_dir, parameters=command,
                            tool='quay.io/ucsc_cgl/vg:latest',
                            outfile=destroy_list_stream)
            run("mv {} {}".format(vg_chunk_path + ".destroy", vg_chunk_path))
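# A minimal usage sketch for chunk_vg, assuming the helpers above
# (chunk_base_name, xg_path_node_id, xg_path_predecessors, docker_call, run)
# are in scope. All values are hypothetical: chunks are BED-style
# (contig, start, end) tuples with 0-based, end-exclusive coordinates,
# selected by chunk_i.
def _example_chunk_vg_usage():
    chunks = [('chr20', 0, 1000000), ('chr20', 1000000, 2000000)]
    # writes chunk 0 of the graph into out_dir (file name comes from chunk_base_name)
    chunk_vg('/tmp/work/graph.xg', 'chr20', '/tmp/work', chunks, 0, True)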
def call_conductor(master_ip, src, dst, memory=None, override_parameters=None):
    """
    Invokes the Conductor container to copy files between S3 and HDFS and vice versa.
    Find Conductor at https://github.com/BD2KGenomics/conductor.

    :param master_ip: The Spark leader IP address.
    :param src: URL of the file to copy.
    :param dst: URL of the location to copy the file to.
    :param memory: Gigabytes of memory to provision for the Spark driver/worker.
    :param override_parameters: Parameters passed by the user that override our defaults.

    :type master_ip: MasterAddress
    :type src: string
    :type dst: string
    :type memory: int or None
    :type override_parameters: list of string or None
    """
    arguments = ["-C", src, dst]
    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/conductor",
                docker_parameters=master_ip.docker_parameters(["--net=host"]),
                parameters=_make_parameters(master_ip,
                                            [],  # no Conductor-specific Spark configuration
                                            memory,
                                            arguments,
                                            override_parameters),
                mock=False)
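# Hedged usage sketch (hypothetical URLs; master_ip is the MasterAddress the
# docstring above describes): copy a BAM from S3 into HDFS on the Spark leader.
def _example_call_conductor(master_ip):
    call_conductor(master_ip,
                   src='s3://my-bucket/sample.bam',
                   dst='hdfs://spark-leader:8020/sample.bam',
                   memory=4)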
def run_fastqc(job, r1_id, r2_id):
    """
    Run FastQC on the input reads

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2
    :return: FileStoreID of FastQC output (tarball)
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
    parameters = ['/data/R1.fastq']
    output_names = ['R1_fastqc.html', 'R1_fastqc.zip']
    if r2_id:
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-t', '2', '/data/R2.fastq'])
        output_names.extend(['R2_fastqc.html', 'R2_fastqc.zip'])
    docker_call(tool='quay.io/ucsc_cgl/fastqc:0.11.5--be13567d00cd4c586edf8ae47d991815c8c72a49',
                work_dir=work_dir, parameters=parameters)
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='fastqc.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'fastqc.tar.gz'))
def run_bwa_index(job, ref_id):
    """
    Use BWA to create reference index files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreIDs for BWA index files
    :rtype: tuple(str, str, str, str, str)
    """
    job.fileStore.logToMaster('Creating BWA index files')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
    command = ['index', '/data/ref.fa']
    docker_call(job=job, work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/bwa:0.7.12--256539928ea162949d8a65ca5c79a72ef557ce7c')
    ids = {}
    for output in ['ref.fa.amb', 'ref.fa.ann', 'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa']:
        ids[output.split('.')[-1]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))
    return ids['amb'], ids['ann'], ids['bwt'], ids['pac'], ids['sa']
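# Sketch of wiring run_bwa_index into a Toil workflow; the parent job and the
# disk requirement are hypothetical. The promise resolves to the
# (amb, ann, bwt, pac, sa) FileStoreIDs returned above.
def _example_bwa_index_wiring(job, ref_id):
    return job.addChildJobFn(run_bwa_index, ref_id, disk='10G').rv()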
def run_print_reads(job, table, indel_bam, indel_bai, ref, ref_dict, fai, mem, unsafe=False):
    """
    Creates a BAM that has had the base quality scores recalibrated

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str table: Recalibration table FileStoreID
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for the processed bam and its index
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, table, indel_bam, indel_bai]
    inputs = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.recal.table',
              'sample.indel.bam', 'sample.indel.bai']
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- PrintReads
    parameters = ['-T', 'PrintReads',
                  '-nct', str(job.cores),
                  '-R', '/data/ref.fasta',
                  '--emit_original_quals',
                  '-I', '/data/sample.indel.bam',
                  '-BQSR', '/data/sample.recal.table',
                  '-o', '/data/sample.bqsr.bam']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs,
                outputs={'sample.bqsr.bam': None, 'sample.bqsr.bai': None},
                work_dir=work_dir, parameters=parameters,
                env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to the file store
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bqsr.bam'))
    bai_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bqsr.bai'))
    return bam_id, bai_id
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(job=job, url=url, work_dir=work_dir, name=fname)
    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR',
                       '-o', '/data/output.vcf.gz']
    docker_call(job=job, work_dir=work_dir, parameters=variant_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(job=job, work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    download_url(job=job, url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(job=job, url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call: SplAdder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(job=job, work_dir=work_dir, parameters=command, sudo=inputs.sudo,
                tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def run_rsem(job, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired-end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'rsem_ref.tar.gz'),
                           '-C', work_dir])
    os.remove(os.path.join(work_dir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and RSEM reference prefix
    rsem_files = []
    for root, directories, files in os.walk(work_dir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [os.path.basename(os.path.splitext(x)[0]) for x in rsem_files if 'grp' in x][0]
    ref_folder = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(job.cores),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', '/data/transcriptome.bam',
                  os.path.join(ref_folder, ref_prefix),
                  output_prefix]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
                parameters=parameters, work_dir=work_dir)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'),
              os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'),
              os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
def run_star(job, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Performs alignment of fastqs to bam via STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: URL of STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs of the transcriptome bam, sorted bam, and (if requested) wiggle file
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'),
                           '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - the STAR index files are either in a subdir or in the tarball root
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(job.cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1']
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, sorted_id, wiggle_id
    else:
        return transcriptome_id, sorted_id
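# Hedged sketch of chaining STAR into RSEM with Toil promises (resource values
# are placeholders): the transcriptome bam from run_star feeds run_rsem.
def _example_star_rsem_wiring(job, r1_id, r2_id, star_index_url, rsem_ref_url):
    star = job.addChildJobFn(run_star, r1_id, r2_id, star_index_url, disk='40G')
    # rv(0) is the transcriptome bam, the first element of run_star's tuple
    star.addFollowOnJobFn(run_rsem, star.rv(0), rsem_ref_url, paired=bool(r2_id))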
def run_rsem_postprocess(job, uuid, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce the separate .tab files (TPM, FPKM, counts) for both gene
    and isoform. These are two-column files: Genes and Quantifications. HUGO files are also
    provided that have been mapped from Gencode/Ensembl names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: UUID to mark the samples with
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs of the RSEM and RSEM HUGO post-process tarballs
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    job.fileStore.readGlobalFile(rsem_gene_id, os.path.join(work_dir, 'rsem_gene.tab'),
                                 mutable=True)
    job.fileStore.readGlobalFile(rsem_isoform_id, os.path.join(work_dir, 'rsem_isoform.tab'),
                                 mutable=True)
    # Convert RSEM files into individual .tab files.
    docker_call(tool='jvivian/rsem_postprocess', parameters=[uuid], work_dir=work_dir)
    os.rename(os.path.join(work_dir, 'rsem_gene.tab'),
              os.path.join(work_dir, 'rsem_genes.results'))
    os.rename(os.path.join(work_dir, 'rsem_isoform.tab'),
              os.path.join(work_dir, 'rsem_isoforms.results'))
    output_files = ['rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab',
                    'rsem.isoform.norm_counts.tab', 'rsem.isoform.raw_counts.tab',
                    'rsem_genes.results', 'rsem_isoforms.results']
    # Perform HUGO gene / isoform name mapping
    genes = [x for x in output_files if 'rsem.genes' in x]
    isoforms = [x for x in output_files if 'rsem.isoform' in x]
    command = ['-g'] + genes + ['-i'] + isoforms
    docker_call(tool='jvivian/gencode_hugo_mapping', parameters=command, work_dir=work_dir)
    hugo_files = [os.path.splitext(x)[0] + '.hugo' + os.path.splitext(x)[1]
                  for x in genes + isoforms]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz', file_paths=[os.path.join(work_dir, x) for x in output_files],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz', [os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
def run_realigner_target_creator(job, bam, bai, ref, ref_dict, fai, phase, mills, mem, unsafe=False):
    """
    Creates the intervals file needed for indel realignment

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: Sample BAM FileStoreID
    :param str bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str phase: Phase VCF FileStoreID
    :param str mills: Mills VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreID for the intervals file
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, bam, bai, phase, mills]
    inputs = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.bam',
              'sample.bam.bai', 'phase.vcf', 'mills.vcf']
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- RealignerTargetCreator
    parameters = ['-T', 'RealignerTargetCreator',
                  '-nt', str(job.cores),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/sample.bam',
                  '-known', '/data/phase.vcf',
                  '-known', '/data/mills.vcf',
                  '--downsampling_type', 'NONE',
                  '-o', '/data/sample.intervals']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs,
                outputs={'sample.intervals': None},
                work_dir=work_dir, parameters=parameters,
                env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write to fileStore
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.intervals'))
def run_bam_qc(job, aligned_bam_id, config):
    """
    Run BAM QC as specified by California Kids Cancer Comparison (CKCC)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str aligned_bam_id: FileStoreID of sorted bam from STAR
    :param Namespace config: Argparse Namespace object containing argument inputs
        Must contain:
            config.uuid str: UUID of input sample
            config.save_bam bool: True/False depending on whether to save bam
            config.output_dir str: Path to save bam
            config.ssec str: Path to encryption key for secure upload to S3
    :return: flag indicating whether the sample failed QC, and FileStoreID for the output tarball
    :rtype: tuple(bool, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(aligned_bam_id,
                                 os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    docker_call(tool='hbeale/treehouse_bam_qc:1.0', work_dir=work_dir,
                parameters=['runQC.sh', str(job.cores)])
    # Tar output files
    output_names = ['readDist.txt',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.curves.pdf',
                    'rnaAligned.out.md.sorted.geneBodyCoverage.txt']
    if os.path.exists(os.path.join(work_dir, 'readDist.txt_PASS_qc.txt')):
        output_names.append('readDist.txt_PASS_qc.txt')
        fail_flag = False
    else:
        output_names.append('readDist.txt_FAIL_qc.txt')
        fail_flag = True
    output_files = [os.path.join(work_dir, x) for x in output_names]
    tarball_files(tar_name='bam_qc.tar.gz', file_paths=output_files, output_dir=work_dir)
    # Save output BAM
    if config.save_bam:
        bam_path = os.path.join(work_dir, 'rnaAligned.sortedByCoord.md.bam')
        new_bam_path = os.path.join(work_dir, config.uuid + '.sortedByCoord.md.bam')
        os.rename(bam_path, new_bam_path)
        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
            s3am_upload(fpath=new_bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
        elif urlparse(config.output_dir).scheme != 's3':
            copy_files(file_paths=[new_bam_path], output_dir=config.output_dir)
    return fail_flag, job.fileStore.writeGlobalFile(os.path.join(work_dir, 'bam_qc.tar.gz'))
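# The config attributes run_bam_qc actually touches, collected as a
# hypothetical argparse.Namespace for reference (values are placeholders).
def _example_bam_qc_config():
    import argparse
    return argparse.Namespace(uuid='sample-1234',
                              save_bam=False,
                              output_dir='s3://my-bucket/out',
                              ssec='/path/to/master.key')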
def run_base_recalibration(job, indel_bam, indel_bai, ref, ref_dict, fai, dbsnp, mem, unsafe=False):
    """
    Creates the recal table used in Base Quality Score Recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreID for the recalibration table
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, indel_bam, indel_bai, dbsnp]
    inputs = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.indel.bam',
              'sample.indel.bai', 'dbsnp.vcf']
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- BaseRecalibrator
    parameters = ['-T', 'BaseRecalibrator',
                  '-nct', str(job.cores),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/sample.indel.bam',
                  '-knownSites', '/data/dbsnp.vcf',
                  '-o', '/data/sample.recal.table']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs,
                outputs={'sample.recal.table': None},
                work_dir=work_dir, parameters=parameters,
                env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.recal.table'))
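# Hedged sketch of how the BQSR steps above compose in Toil: the recalibration
# table from run_base_recalibration feeds run_print_reads. Assumes indel
# realignment has already produced indel_bam/indel_bai; resource requirements
# are omitted for brevity.
def _example_bqsr_wiring(job, indel_bam, indel_bai, ref, ref_dict, fai, dbsnp, mem):
    recal = job.addChildJobFn(run_base_recalibration, indel_bam, indel_bai,
                              ref, ref_dict, fai, dbsnp, mem)
    return recal.addFollowOnJobFn(run_print_reads, recal.rv(), indel_bam, indel_bai,
                                  ref, ref_dict, fai, mem).rv()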
def download_bam(job, gdc_id, disk='40G'):
    """
    Downloads a BAM from the GDC by analysis ID, then schedules conversion and upload

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str gdc_id: GDC analysis ID of the BAM to download
    :param str disk: Disk resource requirement
    """
    work_dir = job.fileStore.getLocalTempDir()
    output_dir = os.path.join(work_dir, gdc_id)
    job.fileStore.logToMaster('Downloading: ' + gdc_id)
    parameters = ['download', '-d', '/data', gdc_id]
    docker_call(tool='jvivian/gdc-client', work_dir=work_dir, parameters=parameters)
    sample = glob(os.path.join(output_dir, '*.bam'))[0]
    bam_id = job.fileStore.writeGlobalFile(sample)
    job.addChildJobFn(process_bam_and_upload, bam_id, gdc_id, disk='80G')
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt, os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'),
                           '-C', work_dir])
    # Parameters
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', '/data/starIndex',
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    # Call: STAR Map
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Call: Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id,
                                cores=2, disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call: SplAdder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo,
                tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    print os.listdir(work_dir)
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def gatk_combine_variants(job, vcfs, ref_fasta, ref_fai, ref_dict, merge_option='UNIQUIFY'):
    """
    Merges VCF files using GATK CombineVariants

    :param JobFunctionWrappingJob job: Toil Job instance
    :param dict vcfs: Dictionary of VCF FileStoreIDs {sample identifier: FileStoreID}
    :param str ref_fasta: FileStoreID for reference genome fasta
    :param str ref_fai: FileStoreID for reference genome index file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :param str merge_option: Value for --genotypemergeoption flag (Default: 'UNIQUIFY')
                             'UNIQUIFY': Multiple variants at a single site are merged into a
                             single variant record.
                             'UNSORTED': Used to merge VCFs from the same sample
    :return: FileStoreID for merged VCF file
    :rtype: str
    """
    job.fileStore.logToMaster('Running GATK CombineVariants')
    inputs = {'genome.fa': ref_fasta,
              'genome.fa.fai': ref_fai,
              'genome.dict': ref_dict}
    inputs.update(vcfs)
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    command = ['-T', 'CombineVariants',
               '-R', '/data/genome.fa',
               '-o', '/data/merged.vcf',
               '--genotypemergeoption', merge_option]
    for uuid, vcf_id in vcfs.iteritems():
        command.extend(['--variant', os.path.join('/data', uuid)])
    docker_call(work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs={'merged.vcf': None})
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'merged.vcf'))
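# Note the convention gatk_combine_variants depends on: the keys of `vcfs`
# double as file names in the work directory and as the --variant paths under
# /data, so they must be valid file names. A hypothetical call:
def _example_combine_variants(job, snp_vcf_id, indel_vcf_id, ref_fasta, ref_fai, ref_dict):
    vcfs = {'snps.vcf': snp_vcf_id, 'indels.vcf': indel_vcf_id}
    return job.addChildJobFn(gatk_combine_variants, vcfs, ref_fasta, ref_fai,
                             ref_dict, merge_option='UNSORTED').rv()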
def run_merge_vcf(job, options, index_dir_id, vcf_file_key_list):
    """ merge the per-path vcf files into a single vcf """
    RealTimeLogger.get().info("Completed gam merging and gam path variant calling.")
    RealTimeLogger.get().info("Starting merging of vcf files.")
    # Set up the IO stores each time, since we can't unpickle them on Azure for
    # some reason.
    input_store = IOStore.get(options.input_store)
    out_store = IOStore.get(options.out_store)

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Download local input files from the remote storage container
    graph_dir = work_dir
    read_global_directory(job.fileStore, index_dir_id, graph_dir)

    vcf_merging_file_key_list = []
    for vcf_file_key in vcf_file_key_list:
        vcf_file = "{}/{}.gz".format(work_dir, vcf_file_key)
        vcf_file_idx = "{}.tbi".format(vcf_file)
        out_store.read_input_file(vcf_file_key + ".gz", vcf_file)
        out_store.read_input_file(vcf_file_key + ".gz" + ".tbi", vcf_file_idx)
        vcf_merging_file_key_list.append(os.path.basename(vcf_file))

    vcf_merged_file_key = ""
    if len(vcf_merging_file_key_list) > 1:
        # merge vcf files; each input file is passed as its own argument
        vcf_merged_file_key = "{}.vcf.gz".format(options.sample_name)
        command = ['bcftools', 'concat', '-O', 'z', '-o',
                   os.path.basename(vcf_merged_file_key)] + vcf_merging_file_key_list
        docker_call(work_dir=work_dir, parameters=command,
                    tool='quay.io/cmarkello/bcftools')
        command = ['bcftools', 'tabix', '-f', '-p', 'vcf',
                   os.path.basename(vcf_merged_file_key)]
        docker_call(work_dir=work_dir, parameters=command,
                    tool='quay.io/cmarkello/bcftools')
    else:
        vcf_merged_file_key = vcf_merging_file_key_list[0]

    # save variant calling results to the output store
    vcf_file = "{}/{}".format(work_dir, vcf_merged_file_key)
    vcf_file_idx = "{}/{}.tbi".format(work_dir, vcf_merged_file_key)
    out_store.write_output_file(vcf_file, vcf_merged_file_key)
    out_store.write_output_file(vcf_file_idx, vcf_merged_file_key + ".tbi")

    # Run downloader to download output IO store files to local output directory.
    vcf_file_id = job.fileStore.writeGlobalFile(vcf_file)
    vcf_file_idx_id = job.fileStore.writeGlobalFile(vcf_file_idx)
    downloadList = [[vcf_file_id, vcf_merged_file_key],
                    [vcf_file_idx_id, vcf_merged_file_key + ".tbi"]]

    return downloadList
def merge_vcf_chunks(job, options, index_dir_id, path_name, path_size, chunks, overwrite):
    """ merge a bunch of clipped vcfs created above, taking care to
    fix up the headers.  everything expected to be sorted already """
    # Set up the IO stores each time, since we can't unpickle them on Azure for
    # some reason.
    input_store = IOStore.get(options.input_store)
    out_store = IOStore.get(options.out_store)

    # Define work directory for docker calls
    out_dir = job.fileStore.getLocalTempDir()

    # Download local input files from the remote storage container
    read_global_directory(job.fileStore, index_dir_id, out_dir)

    vcf_path = os.path.join(out_dir, path_name + ".vcf")

    if overwrite or not os.path.isfile(vcf_path):
        first = True
        for chunk_i, chunk in enumerate(chunks):
            clip_path = chunk_base_name(path_name, out_dir, chunk_i, "_clip.vcf")
            # Download clip.vcf file
            out_store.read_input_file(os.path.basename(clip_path), clip_path)
            if os.path.isfile(clip_path):
                if first is True:
                    # copy everything including the header
                    run("cat {} > {}".format(clip_path, vcf_path))
                    first = False
                else:
                    # add on everything but the header
                    run("grep -v \"^#\" {} >> {}".format(clip_path, vcf_path), check=False)

    # add a compressed indexed version
    if overwrite or not os.path.isfile(vcf_path + ".gz"):
        vcf_gz_file = vcf_path + ".gz"
        with open(vcf_gz_file, "w") as vcf_gz_file_stream:
            command = ['bgzip', '-c', '{}'.format(os.path.basename(vcf_path))]
            docker_call(work_dir=out_dir, parameters=command,
                        tool='quay.io/cmarkello/htslib:latest',
                        outfile=vcf_gz_file_stream)
        command = ['bcftools', 'tabix', '-f', '-p', 'vcf',
                   '{}'.format(os.path.basename(vcf_path + ".gz"))]
        docker_call(work_dir=out_dir, parameters=command,
                    tool='quay.io/cmarkello/bcftools:latest')

    # Save merged vcf files to the output store
    out_store.write_output_file(vcf_path, os.path.basename(vcf_path))
    out_store.write_output_file(vcf_path + ".gz", os.path.basename(vcf_path + ".gz"))
    out_store.write_output_file(vcf_path + ".gz.tbi", os.path.basename(vcf_path + ".gz.tbi"))

    return os.path.basename(vcf_path)
def test_docker_call(tmpdir):
    from toil_lib.programs import docker_call
    work_dir = str(tmpdir)
    parameter = ['--help']
    tool = 'quay.io/ucsc_cgl/samtools'
    docker_call(work_dir=work_dir, parameters=parameter, tool=tool)
    # Test outfile
    fpath = os.path.join(work_dir, 'test')
    with open(fpath, 'w') as f:
        docker_call(tool='ubuntu', env=dict(foo='bar'),
                    parameters=['printenv', 'foo'], outfile=f)
    assert open(fpath).read() == 'bar\n'
def run_kallisto(job, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)
    # Retrieve files
    parameters = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(job.cores),
                  '-o', '/data/',
                  '-b', '100']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])
    # Call: Kallisto
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
                work_dir=work_dir, parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [os.path.join(work_dir, x)
                    for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto.tar.gz'))
def gatk_variant_filtration(job, vcf_id, filter_name, filter_expression, ref_fasta, ref_fai, ref_dict):
    """
    Filters VCF file using GATK VariantFiltration. Fixes extra pair of quotation marks in VCF
    header that may interfere with other VCF tools.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str vcf_id: FileStoreID for input VCF file
    :param str filter_name: Name of filter for VCF header
    :param str filter_expression: JEXL filter expression
    :param str ref_fasta: FileStoreID for reference genome fasta
    :param str ref_fai: FileStoreID for reference genome index file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :return: FileStoreID for filtered VCF file
    :rtype: str
    """
    inputs = {'genome.fa': ref_fasta,
              'genome.fa.fai': ref_fai,
              'genome.dict': ref_dict,
              'input.vcf': vcf_id}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    command = ['-T', 'VariantFiltration',
               '-R', 'genome.fa',
               '-V', 'input.vcf',
               '--filterName', filter_name,  # Documents filter name in header
               '--filterExpression', filter_expression,
               '-o', 'filtered_variants.vcf']
    job.fileStore.logToMaster('Running GATK VariantFiltration using {name}: '
                              '{expression}'.format(name=filter_name,
                                                    expression=filter_expression))
    docker_call(job=job, work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs={'filtered_variants.vcf': None})
    # Remove extra quotation marks around filter expression.
    malformed_header = os.path.join(work_dir, 'filtered_variants.vcf')
    fixed_header = os.path.join(work_dir, 'fixed_header.vcf')
    filter_regex = re.escape('"%s"' % filter_expression)
    with open(malformed_header, 'r') as f, open(fixed_header, 'w') as g:
        for line in f:
            g.write(re.sub(filter_regex, filter_expression, line))
    return job.fileStore.writeGlobalFile(fixed_header)
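# Self-contained demo of the header fix above (example strings only): GATK
# writes the filter expression back into the ##FILTER header wrapped in an
# extra pair of quotes, which the re.sub strips.
def _example_strip_filter_quotes():
    import re
    expression = 'QD < 2.0 || FS > 60.0'
    line = '##FILTER=<ID=snp_filter,Description=""%s"">' % expression
    return re.sub(re.escape('"%s"' % expression), expression, line)
    # -> '##FILTER=<ID=snp_filter,Description="QD < 2.0 || FS > 60.0">'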
def run_pindel(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai):
    """
    Calls Pindel to compute indels / deletions

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str fai: Reference index FileStoreID
    :return: Pindel output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, fai]
    file_names = ['normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai',
                  'ref.fasta', 'ref.fasta.fai']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Create Pindel config
    with open(os.path.join(work_dir, 'pindel-config.txt'), 'w') as f:
        for bam in ['normal', 'tumor']:
            f.write('/data/{} {} {}\n'.format(bam + '.bam',
                                              get_mean_insert_size(work_dir, bam + '.bam'),
                                              bam))
    # Call: Pindel
    parameters = ['-f', '/data/ref.fasta',
                  '-i', '/data/pindel-config.txt',
                  '--number_of_threads', str(job.cores),
                  '--minimum_support_for_event', '3',
                  '--report_long_insertions', 'true',
                  '--report_breakpoints', 'true',
                  '-o', 'pindel']
    docker_call(tool='quay.io/ucsc_cgl/pindel:0.2.5b6--4e8d1b31d4028f464b3409c6558fb9dfcad73f88',
                work_dir=work_dir, parameters=parameters)
    # Collect output files and write to file store
    output_files = glob(os.path.join(work_dir, 'pindel*'))
    tarball_files('pindel.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'pindel.tar.gz'))
def download_and_transfer_sample(job, sample, inputs):
    """
    Downloads a sample from CGHub via GeneTorrent, then uses S3AM to transfer it to S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list sample: Sample information; the first element is the CGHub analysis ID
    :param dict inputs: Dictionary of input arguments
    """
    analysis_id = sample[0]
    work_dir = job.fileStore.getLocalTempDir()
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    # Acquire genetorrent key and download sample
    shutil.copy(inputs['genetorrent_key'], os.path.join(work_dir, 'cghub.key'))
    parameters = ['-vv', '-c', 'cghub.key', '-d', analysis_id]
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    try:
        sample = glob.glob(os.path.join(folder_path, '*tar*'))[0]
    except IndexError as e:
        print 'No tarfile found inside of folder: {}'.format(e)
        raise
    # Upload sample to S3AM
    key_path = inputs['ssec']
    if sample.endswith('gz'):
        sample_name = analysis_id + '.tar.gz'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    else:
        sample_name = analysis_id + '.tar'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    # Parse s3_dir to get bucket and s3 path
    s3_dir = inputs['s3_dir']
    bucket_name = s3_dir.lstrip('/').split('/')[0]
    base_url = 'https://s3-us-west-2.amazonaws.com/'
    url = os.path.join(base_url, bucket_name, sample_name)
    # Generate keyfile for upload
    with open(os.path.join(work_dir, 'temp.key'), 'wb') as f_out:
        f_out.write(generate_unique_key(key_path, url))
    # Upload to S3 via S3AM
    s3am_command = ['s3am', 'upload',
                    '--sse-key-file', os.path.join(work_dir, 'temp.key'),
                    'file://{}'.format(os.path.join(work_dir, sample_name)),
                    's3://' + bucket_name + '/']
    subprocess.check_call(s3am_command)
def run_muse(job, normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, dbsnp):
    """
    Calls MuSE to find variants

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str normal_bam: Normal BAM FileStoreID
    :param str normal_bai: Normal BAM index FileStoreID
    :param str tumor_bam: Tumor BAM FileStoreID
    :param str tumor_bai: Tumor BAM Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference genome dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :return: MuSE output (tarball) FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [normal_bam, normal_bai, tumor_bam, tumor_bai, ref, ref_dict, fai, dbsnp]
    file_names = ['normal.bam', 'normal.bai', 'tumor.bam', 'tumor.bai',
                  'ref.fasta', 'ref.dict', 'ref.fasta.fai', 'dbsnp.vcf']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: MuSE
    parameters = ['--mode', 'wxs',
                  '--dbsnp', '/data/dbsnp.vcf',
                  '--fafile', '/data/ref.fasta',
                  '--tumor-bam', '/data/tumor.bam',
                  '--tumor-bam-index', '/data/tumor.bai',
                  '--normal-bam', '/data/normal.bam',
                  '--normal-bam-index', '/data/normal.bai',
                  '--outfile', '/data/muse.vcf',
                  '--cpus', str(job.cores)]
    docker_call(tool='quay.io/ucsc_cgl/muse:1.0--6add9b0a1662d44fd13bbc1f32eac49326e48562',
                work_dir=work_dir, parameters=parameters)
    # Return fileStore ID
    tarball_files('muse.tar.gz', file_paths=[os.path.join(work_dir, 'muse.vcf')],
                  output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'muse.tar.gz'))
def xg_path_predecessors(xg_path, path_name, node_id, out_dir, context=1):
    """ get nodes before given node in a path. """
    command = ['vg find -x {} -n {} -c {}'.format(os.path.basename(xg_path),
                                                  str(node_id), str(context)),
               'vg view -j -']
    stdout = docker_call(work_dir=out_dir, parameters=command,
                         tool='quay.io/ucsc_cgl/vg:latest',
                         check_output=True)
    # get our json graph
    j = json.loads(stdout)
    paths = j["path"]
    path = [x for x in paths if x["name"] == path_name][0]
    mappings = path["mapping"]
    assert len(mappings) > 0
    # check that we have a node_mapping
    assert len([x for x in mappings if x["position"]["node_id"] == node_id]) == 1
    # collect mappings that come before
    out_ids = []
    for mapping in mappings:
        if mapping["position"]["node_id"] == node_id:
            break
        out_ids.append(mapping["position"]["node_id"])
    return out_ids
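# The vg view -j output parsed above has roughly this shape; a tiny
# self-contained check of the predecessor scan against a made-up graph
# (node ids are hypothetical):
def _example_predecessor_scan():
    j = {"path": [{"name": "chr20",
                   "mapping": [{"position": {"node_id": 5}},
                               {"position": {"node_id": 6}},
                               {"position": {"node_id": 7}}]}]}
    mappings = [x for x in j["path"] if x["name"] == "chr20"][0]["mapping"]
    out_ids = []
    for mapping in mappings:
        if mapping["position"]["node_id"] == 7:
            break
        out_ids.append(mapping["position"]["node_id"])
    return out_ids  # [5, 6]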
def process_bam_and_upload(job, bam_id, gdc_id, disk='80G'):
    """
    Converts a BAM to a pair of gzipped fastqs, tars them, and uploads the tarball to S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_id: FileStoreID of the input BAM
    :param str gdc_id: GDC analysis ID, used to name the output tarball
    :param str disk: Disk resource requirement
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'input.bam'))
    parameters = ['fastq', '-1', '/data/R1.fastq', '-2', '/data/R2.fastq', '/data/input.bam']
    docker_call(tool='quay.io/ucsc_cgl/samtools', work_dir=work_dir, parameters=parameters)
    subprocess.check_call(['gzip', os.path.join(work_dir, 'R1.fastq')])
    subprocess.check_call(['gzip', os.path.join(work_dir, 'R2.fastq')])
    out_tar = os.path.join(work_dir, gdc_id + '.tar.gz')
    with tarfile.open(out_tar, 'w:gz') as tar:
        for name in [os.path.join(work_dir, x) for x in ['R1.fastq.gz', 'R2.fastq.gz']]:
            tar.add(name, arcname=os.path.basename(name))
    s3am_upload(out_tar, s3_dir='s3://cgl-ccle-data/')
def run_oncotator(job, vcf_id, oncotator_db):
    """
    Uses Oncotator to add cancer relevant variant annotations to a VCF file. Oncotator can
    accept other genome builds, but the output VCF is based on hg19.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str vcf_id: FileStoreID for VCF file
    :param str oncotator_db: FileStoreID for Oncotator database
    :return: Annotated VCF FileStoreID
    :rtype: str
    """
    job.fileStore.logToMaster('Running Oncotator')
    inputs = {'input.vcf': vcf_id, 'oncotator_db': oncotator_db}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        inputs[name] = job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # The Oncotator database may be tar/gzipped
    if tarfile.is_tarfile(inputs['oncotator_db']):
        tar = tarfile.open(inputs['oncotator_db'])
        tar.extractall(path=work_dir)
        # Get the extracted database directory name
        inputs['oncotator_db'] = tar.getmembers()[0].name
        tar.close()
    command = ['-i', 'VCF',
               '-o', 'VCF',
               '--db-dir', inputs['oncotator_db'],
               'input.vcf',
               'annotated.vcf',
               'hg19']  # Oncotator annotations are based on hg19
    docker_call(job=job, work_dir=work_dir,
                env={'_JAVA_OPTIONS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='jpfeil/oncotator:1.9--8fffc356981862d50cfacd711b753700b886b605',
                inputs=inputs.keys(),
                outputs={'annotated.vcf': None})
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'annotated.vcf'))
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (second fastq pair)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    if r2_id:
        require(rev_3pr_adapter, "Paired end data requires a reverse 3' adapter sequence.")
    # Retrieve files
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['-A', rev_3pr_adapter,
                           '-o', '/data/R1_cutadapt.fastq',
                           '-p', '/data/R2_cutadapt.fastq',
                           '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
    # Call: CutAdapt
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    if r1_id and r2_id:
        r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2_cutadapt.fastq'))
    else:
        r1_cut_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1_cutadapt.fastq'))
        r2_cut_id = None
    return r1_cut_id, r2_cut_id
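# Hedged wiring sketch: trim adapters with run_cutadapt, then align the trimmed
# reads with run_star (the adapter sequences shown are the common Illumina
# TruSeq ones, given here only as placeholders).
def _example_cutadapt_star_wiring(job, r1_id, r2_id, star_index_url):
    trimmed = job.addChildJobFn(run_cutadapt, r1_id, r2_id,
                                fwd_3pr_adapter='AGATCGGAAGAG',
                                rev_3pr_adapter='AGATCGGAAGAG')
    trimmed.addFollowOnJobFn(run_star, trimmed.rv(0), trimmed.rv(1), star_index_url)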
def xg_path_node_id(xg_path, path_name, offset, out_dir):
    """ use vg find to get the node containing a given path position """
    # NOTE: vg find -p range offsets are 0-based inclusive.
    tmp_out_filename = "{}/tmp_out_{}".format(out_dir, uuid4())
    with open(tmp_out_filename, "w") as tmp_out_file:
        command = ['vg find -x {} -p {}:{}-{}'.format(os.path.basename(xg_path),
                                                      str(path_name), str(offset), str(offset)),
                   'vg mod -o -',
                   'vg view -j -']
        docker_call(work_dir=out_dir, parameters=command,
                    tool='quay.io/ucsc_cgl/vg:latest',
                    outfile=tmp_out_file)
    command = ['cat /data/{}'.format(os.path.basename(tmp_out_filename)),
               'jq .node[0].id -']
    stdout = docker_call(work_dir=out_dir, parameters=command,
                         tool='devorbitus/ubuntu-bash-jq-curl',
                         check_output=True)
    return int(stdout)
def _download_with_genetorrent(url, file_path, cghub_key_path):
    parsed_url = urlparse(url)
    analysis_id = parsed_url.path[1:]
    assert parsed_url.scheme == 'gnos', \
        'Improper format. gnos://cghub/ID. User supplied: {}'.format(parsed_url)
    work_dir = os.path.dirname(file_path)
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    parameters = ['-vv', '-c', cghub_key_path, '-d', analysis_id]
    docker_call(tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    sample = glob.glob(os.path.join(folder_path, '*tar*'))
    assert len(sample) == 1, \
        'More than one sample tar in CGHub download: {}'.format(analysis_id)
def variant_calling_and_qc(job, inputs, bam_id, bai_id):
    """
    Perform variant calling with samtools and QC with CheckBias

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of qc tarball
    :rtype: str
    """
    job.fileStore.logToMaster('Variant calling and QC: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    input_info = [(inputs.genome, 'genome.fa'), (inputs.positions, 'positions.tsv'),
                  (inputs.genome_index, 'genome.fa.fai'), (inputs.gtf, 'annotation.gtf'),
                  (inputs.gtf_m53, 'annotation.m53')]
    for url, fname in input_info:
        download_url(url, work_dir=work_dir, name=fname)
    # Part 1: Variant Calling
    variant_command = ['mpileup',
                       '-f', 'genome.fa',
                       '-l', 'positions.tsv',
                       '-v', 'alignment.bam',
                       '-t', 'DP,SP,INFO/AD,INFO/ADF,INFO/ADR,INFO/DPR',
                       '-o', '/data/output.vcf.gz']
    docker_call(work_dir=work_dir, parameters=variant_command, sudo=inputs.sudo,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # Part 2: QC
    qc_command = ['-o', 'qc',
                  '-n', 'alignment.bam',
                  '-a', 'annotation.gtf',
                  '-m', 'annotation.m53']
    docker_call(work_dir=work_dir, parameters=qc_command,
                tool='jvivian/checkbias:612f129--b08a1fb6526a620bbb0304b08356f2ae7c3c0ec3')
    # Write output to fileStore and return ids
    output_tsv = glob(os.path.join(work_dir, '*counts.tsv*'))[0]
    output_vcf = os.path.join(work_dir, 'output.vcf.gz')
    tarball_files('vcqc.tar.gz', file_paths=[output_tsv, output_vcf], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'vcqc.tar.gz'))
def chunk_gam(gam_path, xg_path, path_name, out_dir, chunks, filter_opts, overwrite):
    """ use vg filter to chunk up the gam """
    RealTimeLogger.get().info("Starting chunk_gam")
    # make bed chunks
    chunk_path = os.path.join(out_dir, path_name + "_chunks.bed")
    with open(chunk_path, "w") as f:
        for chunk in chunks:
            f.write("{}\t{}\t{}\n".format(chunk[0], chunk[1], chunk[2]))
    # run vg filter on the gam
    if overwrite or not any(
            os.path.isfile(chunk_base_name(path_name, out_dir, i, ".gam"))
            for i in range(len(chunks))):
        out_file = os.path.join(out_dir, path_name + "-chunk")
        command = ['filter', os.path.basename(gam_path),
                   '-x', os.path.basename(xg_path),
                   '-R', os.path.basename(chunk_path),
                   '-B', os.path.basename(out_file)] + filter_opts.split(" ")
        docker_call(work_dir=out_dir, parameters=command,
                    tool='quay.io/ucsc_cgl/vg:latest')
def download_and_transfer_sample(job, sample, inputs):
    """
    Downloads a sample from CGHub via GeneTorrent, then uses S3AM to transfer it to S3

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list sample: Sample information; the first element is the CGHub analysis ID
    :param dict inputs: Dictionary of input arguments
    """
    analysis_id = sample[0]
    work_dir = job.fileStore.getLocalTempDir()
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    # Acquire genetorrent key and download sample
    shutil.copy(inputs['genetorrent_key'], os.path.join(work_dir, 'cghub.key'))
    parameters = ['-vv', '-c', 'cghub.key', '-d', analysis_id]
    docker_call(job=job,
                tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    try:
        sample = glob.glob(os.path.join(folder_path, '*tar*'))[0]
    except IndexError as e:
        print 'No tarfile found inside of folder: {}'.format(e)
        raise
    # Upload sample to S3AM
    key_path = inputs['ssec']
    if sample.endswith('gz'):
        sample_name = analysis_id + '.tar.gz'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    else:
        sample_name = analysis_id + '.tar'
        shutil.move(sample, os.path.join(work_dir, sample_name))
    # Parse s3_dir to get bucket and s3 path
    s3_dir = inputs['s3_dir']
    bucket_name = s3_dir.lstrip('/').split('/')[0]
    base_url = 'https://s3-us-west-2.amazonaws.com/'
    url = os.path.join(base_url, bucket_name, sample_name)
    # Generate keyfile for upload
    with open(os.path.join(work_dir, 'temp.key'), 'wb') as f_out:
        f_out.write(generate_unique_key(key_path, url))
    # Upload to S3 via S3AM
    s3am_command = ['s3am', 'upload',
                    '--sse-key-file', os.path.join(work_dir, 'temp.key'),
                    'file://{}'.format(os.path.join(work_dir, sample_name)),
                    's3://' + bucket_name + '/']
    subprocess.check_call(s3am_command)
def picard_mark_duplicates(job, bam, bai, validation_stringency='LENIENT'):
    """
    Runs Picard MarkDuplicates on a BAM file. Requires that the BAM file be coordinate sorted.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str validation_stringency: BAM file validation stringency, default is LENIENT
    :return: FileStoreIDs for BAM and BAI files
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    job.fileStore.readGlobalFile(bam, os.path.join(work_dir, 'sorted.bam'))
    job.fileStore.readGlobalFile(bai, os.path.join(work_dir, 'sorted.bai'))
    # Call: picardtools
    command = ['MarkDuplicates',
               'INPUT=sorted.bam',
               'OUTPUT=mkdups.bam',
               'METRICS_FILE=metrics.txt',
               'ASSUME_SORTED=true',
               'CREATE_INDEX=true',
               'VALIDATION_STRINGENCY=%s' % validation_stringency.upper()]
    # picard-tools container doesn't have the JAVA_OPTS variable
    # Set TMPDIR to /data to prevent writing temporary files to /tmp
    docker_call(job=job, work_dir=work_dir, parameters=command,
                env={'_JAVA_OPTIONS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                outputs={'mkdups.bam': None, 'mkdups.bai': None})
    bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bam'))
    bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'mkdups.bai'))
    return bam, bai
def gatk_select_variants(job, mode, vcf_id, ref_fasta, ref_fai, ref_dict):
    """
    Isolates a particular variant type from a VCF file using GATK SelectVariants

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str mode: variant type (i.e. SNP or INDEL)
    :param str vcf_id: FileStoreID for input VCF file
    :param str ref_fasta: FileStoreID for reference genome fasta
    :param str ref_fai: FileStoreID for reference genome index file
    :param str ref_dict: FileStoreID for reference genome sequence dictionary file
    :return: FileStoreID for filtered VCF
    :rtype: str
    """
    job.fileStore.logToMaster('Running GATK SelectVariants to select %ss' % mode)
    inputs = {'genome.fa': ref_fasta,
              'genome.fa.fai': ref_fai,
              'genome.dict': ref_dict,
              'input.vcf': vcf_id}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    command = ['-T', 'SelectVariants',
               '-R', 'genome.fa',
               '-V', 'input.vcf',
               '-o', 'output.vcf',
               '-selectType', mode]
    docker_call(work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs={'output.vcf': None})
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.vcf'))
def gatk_haplotype_caller(job, bam, bai, ref, fai, ref_dict,
                          annotations=None, emit_threshold=10.0, call_threshold=30.0,
                          unsafe_mode=False, hc_output=None):
    """
    Uses GATK HaplotypeCaller to identify SNPs and INDELs. Outputs variants in a Genomic VCF file.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam: FileStoreID for BAM file
    :param str bai: FileStoreID for BAM index file
    :param str ref: FileStoreID for reference genome fasta file
    :param str fai: FileStoreID for reference fasta index file
    :param str ref_dict: FileStoreID for reference sequence dictionary file
    :param list[str] annotations: List of GATK variant annotations, default is None
    :param float emit_threshold: Minimum phred-scale confidence threshold for a variant to be emitted, default is 10.0
    :param float call_threshold: Minimum phred-scale confidence threshold for a variant to be called, default is 30.0
    :param bool unsafe_mode: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :param str hc_output: URL or local path to a pre-cooked VCF file, default is None
    :return: FileStoreID for GVCF file
    :rtype: str
    """
    job.fileStore.logToMaster('Running GATK HaplotypeCaller')
    inputs = {'genome.fa': ref,
              'genome.fa.fai': fai,
              'genome.dict': ref_dict,
              'input.bam': bam,
              'input.bam.bai': bai}
    work_dir = job.fileStore.getLocalTempDir()
    for name, file_store_id in inputs.iteritems():
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call GATK -- HaplotypeCaller with parameters to produce a genomic VCF file:
    # https://software.broadinstitute.org/gatk/documentation/article?id=2803
    command = ['-T', 'HaplotypeCaller',
               '-nct', str(job.cores),
               '-R', 'genome.fa',
               '-I', 'input.bam',
               '-o', 'output.g.vcf',
               '-stand_call_conf', str(call_threshold),
               '-stand_emit_conf', str(emit_threshold),
               '-variant_index_type', 'LINEAR',
               '-variant_index_parameter', '128000',
               '--genotyping_mode', 'Discovery',
               '--emitRefConfidence', 'GVCF']
    if unsafe_mode:
        command = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'] + command
    if annotations:
        for annotation in annotations:
            command.extend(['-A', annotation])
    # Uses docker_call mock mode to replace output with the hc_output file
    outputs = {'output.g.vcf': hc_output}
    docker_call(work_dir=work_dir,
                env={'JAVA_OPTS': '-Djava.io.tmpdir=/data/ -Xmx{}'.format(job.memory)},
                parameters=command,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs.keys(),
                outputs=outputs,
                mock=True if outputs['output.g.vcf'] else False)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'output.g.vcf'))
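# Hedged sketch of a hard-filtering fan-out built from the helpers in this
# section: select SNPs and INDELs from a raw VCF, filter each with an
# illustrative JEXL expression, then merge the filtered VCFs. The filter
# names, expressions, and dict keys are placeholders.
def _example_hard_filter_wiring(job, vcf_id, ref_fasta, ref_fai, ref_dict):
    snps = job.addChildJobFn(gatk_select_variants, 'SNP', vcf_id,
                             ref_fasta, ref_fai, ref_dict)
    snp_filter = snps.addFollowOnJobFn(gatk_variant_filtration, snps.rv(),
                                       'snp_filter', 'QD < 2.0 || FS > 60.0',
                                       ref_fasta, ref_fai, ref_dict)
    indels = job.addChildJobFn(gatk_select_variants, 'INDEL', vcf_id,
                               ref_fasta, ref_fai, ref_dict)
    indel_filter = indels.addFollowOnJobFn(gatk_variant_filtration, indels.rv(),
                                           'indel_filter', 'QD < 2.0 || FS > 200.0',
                                           ref_fasta, ref_fai, ref_dict)
    return job.addFollowOnJobFn(gatk_combine_variants,
                                {'snps.vcf': snp_filter.rv(),
                                 'indels.vcf': indel_filter.rv()},
                                ref_fasta, ref_fai, ref_dict,
                                merge_option='UNSORTED').rv()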