def create_reference_index(job, ref_id):
    """
    Uses Samtools to create reference index file (.fasta.fai)

    ref_id: str     The fileStore ID of the reference
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path to reference
    try:
        job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
    except Exception:
        sys.stderr.write("Failed when reading global file %s to %s. Retrying with dict index.\n"
                         % (ref_id, os.path.join(work_dir, 'ref.fa')))
        try:
            job.fileStore.readGlobalFile(ref_id['ref.fa'], os.path.join(work_dir, 'ref.fa'))
        except Exception:
            sys.stderr.write("Reading %s on retry failed.\n" % ref_id['ref.fa'])
            raise
    # Call: Samtools
    command = ['faidx', 'ref.fa']
    docker_call(work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                inputs=['ref.fa'],
                outputs={'ref.fa.fai': None})
    output = os.path.join(work_dir, 'ref.fa.fai')
    assert os.path.exists(output)
    # Write to fileStore
    return job.fileStore.writeGlobalFile(output)
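# A minimal usage sketch for the job above, assuming a recent Toil with the
# context-manager API: import the reference FASTA, then index it. The jobstore
# path and file URL are placeholders, not part of the pipeline.
from toil.common import Toil
from toil.job import Job

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')
    with Toil(options) as toil:
        ref_id = toil.importFile('file:///data/ref.fa')  # hypothetical local reference
        toil.start(Job.wrapJobFn(create_reference_index, ref_id))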
def start(self, fileStore):
    """
    Start the Spark and HDFS master containers

    fileStore: Unused
    """
    self.IP = check_output(["hostname", "-f"])[:-1]

    _log.info("Starting Spark master container.")
    self.sparkContainerID = docker_call(tool="quay.io/ucsc_cgl/apache-spark-master:1.5.2",
                                        docker_parameters=["--net=host",
                                                           "-d",
                                                           "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                                                           "-e", "SPARK_MASTER_IP=" + self.IP,
                                                           "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                                                           "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"],
                                        rm=False,
                                        sudo=self.sudo,
                                        check_output=True,
                                        mock=False)[:-1]
    _log.info("Starting HDFS master container.")
    self.hdfsContainerID = docker_call(tool="quay.io/ucsc_cgl/apache-hadoop-master:2.6.2",
                                       docker_parameters=["--net=host", "-d"],
                                       parameters=[self.IP],
                                       rm=False,
                                       sudo=self.sudo,
                                       check_output=True,
                                       mock=False)[:-1]
    return self.IP
def index(job, shared_ids, input_args):
    """
    Indexes the sample bam using samtools, then calls the HaplotypeCaller step.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    # FIXME: unused variable
    bam_path = return_input_paths(job, work_dir, shared_ids, 'toil.bam')
    output_path = os.path.join(work_dir, 'toil.bam.bai')
    # Call: index the sample bam
    parameters = ['index', 'toil.bam']
    inputs = ['toil.bam']
    outputs = {'toil.bam.bai': None}
    docker_call(work_dir=work_dir, parameters=parameters,
                tool='quay.io/ucsc_cgl/samtools',
                inputs=inputs,
                outputs=outputs,
                sudo=input_args['sudo'])
    # Update FileStore and call child
    shared_ids['toil.bam.bai'] = job.fileStore.writeGlobalFile(output_path)
    job.addChildJobFn(haplotype_caller, shared_ids, input_args, cores=input_args['cpu_count'])
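# A minimal sketch of the `return_input_paths` helper used throughout this
# module, assuming it only reads each named promise out of the FileStore into
# work_dir and returns the local path(s); the real helper lives in the
# pipeline's shared library and may differ.
def return_input_paths(job, work_dir, shared_ids, *filenames):
    """Read each named file from the FileStore and return its local path."""
    paths = []
    for name in filenames:
        path = os.path.join(work_dir, name)
        job.fileStore.readGlobalFile(shared_ids[name], path)
        paths.append(path)
    # Callers that request a single file expect a bare path, not a list
    return paths[0] if len(paths) == 1 else paths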
def print_reads(job, cores, table, indel_bam, indel_bai, ref, ref_dict, fai, mem):
    """
    Creates BAM that has had the base quality scores recalibrated

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Maximum number of cores on host node
    :param str table: Recalibration table FileStoreID
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :return: FileStoreID for the processed bam
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, table, indel_bam, indel_bai]
    file_names = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.recal.table',
                  'sample.indel.bam', 'sample.indel.bai']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- PrintReads
    parameters = ['-T', 'PrintReads',
                  '-nct', str(cores),
                  '-R', '/data/ref.fasta',
                  '--emit_original_quals',
                  '-I', '/data/sample.indel.bam',
                  '-BQSR', '/data/sample.recal.table',
                  '-o', '/data/sample.bqsr.bam']
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.4--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                work_dir=work_dir, parameters=parameters,
                env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bqsr.bam'))
    return bam_id
def base_recalibration(job, shared_ids, input_args):
    """
    Creates the recal table used to perform Base Quality Score Recalibration

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: Namespace of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve input file paths
    return_input_paths(job, work_dir, shared_ids, 'ref.fa', 'sample.indel.bam', 'dbsnp.vcf',
                       'ref.fa.fai', 'ref.dict', 'sample.indel.bam.bai')
    # Output file path
    output = os.path.join(work_dir, 'sample.recal.table')
    # Call: GATK -- BaseRecalibrator
    parameters = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY',  # RISKY! (?) See #189
                  '-T', 'BaseRecalibrator',
                  '-nct', str(input_args.cpu_count),
                  '-R', 'ref.fa',
                  '-I', 'sample.indel.bam',
                  '-knownSites', 'dbsnp.vcf',
                  '-o', 'sample.recal.table']
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                work_dir=work_dir, parameters=parameters,
                inputs=['ref.fa', 'sample.indel.bam', 'dbsnp.vcf',
                        'ref.fa.fai', 'ref.dict', 'sample.indel.bam.bai'],
                outputs={'sample.recal.table': None},
                env={'JAVA_OPTS': '-Xmx%sg' % input_args.memory})
    # Write to fileStore
    shared_ids['sample.recal.table'] = job.fileStore.writeGlobalFile(output)
    job.addChildJobFn(print_reads, shared_ids, input_args, cores=input_args.cpu_count)
def mark_dups_sample(job, shared_ids, input_args):
    """
    Uses picardtools MarkDuplicates
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    read_from_filestore(job, work_dir, shared_ids, 'sample.sorted.bam')
    outpath = os.path.join(work_dir, 'sample.mkdups.bam')
    # Call: picardtools
    command = ['MarkDuplicates',
               'INPUT=sample.sorted.bam',
               'OUTPUT=sample.mkdups.bam',
               'METRICS_FILE=metrics.txt',
               'ASSUME_SORTED=true',
               'CREATE_INDEX=true']
    docker_call(work_dir=work_dir, parameters=command,
                env={'JAVA_OPTS': '-Xmx%sg' % input_args.memory},
                tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                inputs=['sample.sorted.bam'],
                outputs={'sample.mkdups.bam': None, 'sample.mkdups.bai': None})
    shared_ids['sample.mkdups.bam'] = job.fileStore.writeGlobalFile(outpath)

    # picard writes the index for file.bam at file.bai, not file.bam.bai
    _move_bai(outpath)
    shared_ids['sample.mkdups.bam.bai'] = job.fileStore.writeGlobalFile(outpath + ".bai")
    job.addChildJobFn(realigner_target_creator, shared_ids, input_args, cores=input_args.cpu_count)
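# A minimal sketch of the `_move_bai` helper called above, assuming it only
# renames Picard's `file.bai` to the samtools-style `file.bam.bai` that the
# rest of the pipeline expects; the real helper may handle more cases.
def _move_bai(outpath):
    """Rename the index Picard wrote at file.bai to file.bam.bai."""
    # e.g. '/tmp/work/sample.mkdups.bai' -> '/tmp/work/sample.mkdups.bam.bai'
    os.rename(os.path.splitext(outpath)[0] + '.bai', outpath + '.bai')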
def run_rsem_postprocess(job, uuid, rsem_gene_id, rsem_isoform_id):
    """
    Parses RSEM's output to produce the separate .tab files (TPM, FPKM, counts) for both gene
    and isoform. These are two-column files: Genes and Quantifications.
    HUGO files are also provided that have been mapped from Gencode/Ensembl names.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str uuid: UUID to mark the samples with
    :param str rsem_gene_id: FileStoreID of rsem_gene_ids
    :param str rsem_isoform_id: FileStoreID of rsem_isoform_ids
    :return: FileStoreIDs for the RSEM post-process tarball and the HUGO tarball
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    job.fileStore.readGlobalFile(rsem_gene_id, os.path.join(work_dir, 'rsem_gene.tab'))
    job.fileStore.readGlobalFile(rsem_isoform_id, os.path.join(work_dir, 'rsem_isoform.tab'))
    # Convert RSEM files into individual .tab files.
    docker_call(tool='jvivian/rsem_postprocess', parameters=[uuid], work_dir=work_dir)
    os.rename(os.path.join(work_dir, 'rsem_gene.tab'), os.path.join(work_dir, 'rsem_genes.results'))
    os.rename(os.path.join(work_dir, 'rsem_isoform.tab'), os.path.join(work_dir, 'rsem_isoforms.results'))
    output_files = ['rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab',
                    'rsem.isoform.norm_counts.tab', 'rsem.isoform.raw_counts.tab',
                    'rsem_genes.results', 'rsem_isoforms.results']
    # Perform HUGO gene / isoform name mapping
    genes = [x for x in output_files if 'rsem.genes' in x]
    isoforms = [x for x in output_files if 'rsem.isoform' in x]
    command = ['-g'] + genes + ['-i'] + isoforms
    docker_call(tool='jvivian/gencode_hugo_mapping', parameters=command, work_dir=work_dir)
    hugo_files = [os.path.splitext(x)[0] + '.hugo' + os.path.splitext(x)[1] for x in genes + isoforms]
    # Create tarballs for outputs
    tarball_files('rsem.tar.gz', file_paths=[os.path.join(work_dir, x) for x in output_files],
                  output_dir=work_dir)
    tarball_files('rsem_hugo.tar.gz', [os.path.join(work_dir, x) for x in hugo_files],
                  output_dir=work_dir)
    rsem_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem.tar.gz'))
    hugo_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_hugo.tar.gz'))
    return rsem_id, hugo_id
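# A minimal sketch of the `tarball_files` helper used above and elsewhere in
# this module, assuming it simply gzips the named files into output_dir under
# tar_name; the real helper (likely in the pipeline's shared library) may add
# validation and error handling.
import tarfile

def tarball_files(tar_name, file_paths, output_dir):
    """Create output_dir/tar_name containing each file in file_paths."""
    with tarfile.open(os.path.join(output_dir, tar_name), 'w:gz') as tar:
        for path in file_paths:
            tar.add(path, arcname=os.path.basename(path))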
def realigner_target_creator(job, shared_ids, input_args):
    """
    Creates the <type>.intervals file needed for indel realignment

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: Namespace of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve input file paths
    read_from_filestore(job, work_dir, shared_ids, 'ref.fa',
                        'sample.mkdups.bam', 'ref.fa.fai', 'ref.dict',
                        'sample.mkdups.bam.bai', 'phase.vcf', 'mills.vcf')
    # Output file path
    output = os.path.join(work_dir, 'sample.intervals')
    # Call: GATK -- RealignerTargetCreator
    parameters = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY',  # RISKY! (?) See #189
                  '-T', 'RealignerTargetCreator',
                  '-nt', str(input_args.cpu_count),
                  '-R', 'ref.fa',
                  '-I', 'sample.mkdups.bam',
                  '-known', 'phase.vcf',
                  '-known', 'mills.vcf',
                  '--downsampling_type', 'NONE',
                  '-o', 'sample.intervals']
    docker_call(work_dir=work_dir, parameters=parameters,
                tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=['ref.fa', 'sample.mkdups.bam', 'ref.fa.fai', 'ref.dict',
                        'sample.mkdups.bam.bai', 'phase.vcf', 'mills.vcf'],
                outputs={'sample.intervals': None},
                env={'JAVA_OPTS': '-Xmx%sg' % input_args.memory})
    shared_ids['sample.intervals'] = job.fileStore.writeGlobalFile(output)
    job.addChildJobFn(indel_realignment, shared_ids, input_args)
def call_adam(master_ip, arguments, memory=None, override_parameters=None):
    """
    Invokes the ADAM container. Find ADAM at https://github.com/bigdatagenomics/adam.

    :param master_ip: The Spark leader IP address.
    :param arguments: Arguments to pass to ADAM.
    :param memory: Gigabytes of memory to provision for Spark driver/worker.
    :param override_parameters: Parameters passed by the user, that override our defaults.

    :type master_ip: MasterAddress
    :type arguments: list of string
    :type memory: int or None
    :type override_parameters: list of string or None
    """
    default_params = ["--conf", "spark.driver.maxResultSize=0"]  # set max result size to unlimited, see #177

    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                docker_parameters=master_ip.docker_parameters(["--net=host"]),
                parameters=_make_parameters(master_ip,
                                            default_params,
                                            memory,
                                            arguments,
                                            override_parameters),
                mock=False)
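# A hypothetical sketch of the `_make_parameters` helper shared by call_adam
# and call_conductor, inferred from the older call sites further below that
# spell the spark-submit arguments out by hand. It assumes a module-level
# SPARK_MASTER_PORT constant; the real helper may differ.
def _make_parameters(master_ip, default_parameters, memory, arguments, override_parameters):
    """Assemble the spark-submit argument list for a containerized Spark app."""
    parameters = ["--master", "spark://%s:%s" % (master_ip, SPARK_MASTER_PORT)]
    parameters.extend(default_parameters)
    if memory is not None:
        parameters.extend(["--conf", "spark.driver.memory=%dg" % memory,
                           "--conf", "spark.executor.memory=%dg" % memory])
    if override_parameters:
        parameters.extend(override_parameters)
    parameters.extend(arguments)
    return parameters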
def run_kallisto(job, cores, r1_id, r2_id, kallisto_index_url):
    """
    RNA quantification via Kallisto

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Number of cores to run Kallisto with
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, otherwise pass None for single-end)
    :param str kallisto_index_url: URL of the Kallisto index file
    :return: FileStoreID from Kallisto output
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=kallisto_index_url, name='kallisto_hg38.idx', work_dir=work_dir)
    # Retrieve files
    parameters = ['quant',
                  '-i', '/data/kallisto_hg38.idx',
                  '-t', str(cores),
                  '-o', '/data/',
                  '-b', '100']
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2_cutadapt.fastq'))
        parameters.extend(['/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
        parameters.extend(['--single', '-l', '200', '-s', '15', '/data/R1_cutadapt.fastq'])
    # Call: Kallisto
    docker_call(tool='quay.io/ucsc_cgl/kallisto:0.42.4--35ac87df5b21a8e8e8d159f26864ac1e1db8cf86',
                work_dir=work_dir, parameters=parameters)
    # Tar output files together and store in fileStore
    output_files = [os.path.join(work_dir, x) for x in ['run_info.json', 'abundance.tsv', 'abundance.h5']]
    tarball_files(tar_name='kallisto.tar.gz', file_paths=output_files, output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'kallisto.tar.gz'))
def base_recalibration(job, cores, indel_bam, indel_bai, ref, ref_dict, fai, dbsnp, mem):
    """
    Creates the recal table used in Base Quality Score Recalibration

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Maximum number of cores on a worker node
    :param str indel_bam: Indel interval FileStoreID
    :param str indel_bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str dbsnp: DBSNP VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :return: FileStoreID for the processed bam
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, indel_bam, indel_bai, dbsnp]
    file_names = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.indel.bam',
                  'sample.indel.bai', 'dbsnp.vcf']
    for file_store_id, name in zip(file_ids, file_names):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- BaseRecalibrator
    parameters = ['-T', 'BaseRecalibrator',
                  '-nct', str(cores),
                  '-R', '/data/ref.fasta',
                  '-I', '/data/sample.indel.bam',
                  '-knownSites', '/data/dbsnp.vcf',
                  '-o', '/data/sample.recal.table']
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.4--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                work_dir=work_dir, parameters=parameters,
                env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write output to file store
    table = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.recal.table'))
    return job.addChildJobFn(print_reads, cores, table, indel_bam, indel_bai, ref, ref_dict, fai, mem,
                             cores=cores, memory=mem, disk='25G').rv()
def call_conductor(master_ip, src, dst, memory=None, override_parameters=None):
    """
    Invokes the Conductor container to copy files between S3 and HDFS and vice versa.
    Find Conductor at https://github.com/BD2KGenomics/conductor.

    :param master_ip: The Spark leader IP address.
    :param src: URL of file to copy.
    :param dst: URL of location to copy file to.
    :param memory: Gigabytes of memory to provision for Spark driver/worker.
    :param override_parameters: Parameters passed by the user, that override our defaults.

    :type master_ip: MasterAddress
    :type src: string
    :type dst: string
    :type memory: int or None
    :type override_parameters: list of string or None
    """
    arguments = ["--", "-C", src, dst]

    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/conductor",
                docker_parameters=master_ip.docker_parameters(["--net=host"]),
                parameters=_make_parameters(master_ip,
                                            [],  # no conductor specific spark configuration
                                            memory,
                                            arguments,
                                            override_parameters),
                mock=False)
def create_reference_dict_hc(job, shared_ids, input_args):
    """
    Uses Picardtools to create a sequence dictionary for the reference genome.
    Calls next step in pipeline - spawn batch jobs

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    # FIXME: unused variable
    ref_path = return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    # Call: picardtools
    picard_output = os.path.join(work_dir, 'ref.dict')
    command = ['CreateSequenceDictionary', 'R=ref.fa', 'O=ref.dict']
    inputs = ['ref.fa']
    outputs = {'ref.dict': None}
    docker_call(work_dir=work_dir,
                env={'JAVA_OPTS': '-Xmx%sg' % input_args.memory},
                parameters=command,
                tool='quay.io/ucsc_cgl/picardtools',
                inputs=inputs,
                outputs=outputs)
    # Update fileStore for output
    shared_ids['ref.dict'] = job.fileStore.writeGlobalFile(picard_output)
    job.addChildJobFn(spawn_batch_variant_calling, shared_ids, input_args)
def create_reference_index_hc(job, shared_ids, input_args):
    """
    Uses samtools to create the reference index file in the working directory,
    spawns next job in pipeline - create reference dictionary

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Retrieve file path
    # FIXME: unused variable
    ref_path = return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    faidx_output = os.path.join(work_dir, 'ref.fa.fai')
    # Call: Samtools
    faidx_command = ['faidx', 'ref.fa']
    inputs = ['ref.fa']
    outputs = {'ref.fa.fai': None}
    docker_call(work_dir=work_dir,
                parameters=faidx_command,
                tool='quay.io/ucsc_cgl/samtools',
                inputs=inputs,
                outputs=outputs)
    # Update fileStore for output
    shared_ids['ref.fa.fai'] = job.fileStore.writeGlobalFile(faidx_output)
    job.addChildJobFn(create_reference_dict_hc, shared_ids, input_args)
def run_star(job, cores, r1_id, r2_id, star_index_url, wiggle=False):
    """
    Performs alignment of fastqs to bam via STAR

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param int cores: Number of cores to run star with
    :param str r1_id: FileStoreID of fastq (pair 1)
    :param str r2_id: FileStoreID of fastq (pair 2 if applicable, else pass None)
    :param str star_index_url: STAR index tarball
    :param bool wiggle: If True, will output a wiggle file and return it
    :return: FileStoreIDs for the transcriptome bam, the sorted bam, and the wiggle file if requested
    :rtype: tuple
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=star_index_url, name='starIndex.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'starIndex.tar.gz'))
    # Determine tarball structure - the star index files are either in a subdirectory or at the tarball's top level
    star_index = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # Parameter handling for paired / single-end data
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', star_index,
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1']
    if wiggle:
        parameters.extend(['--outWigType', 'bedGraph',
                           '--outWigStrand', 'Unstranded',
                           '--outWigReferencesPrefix', 'chr'])
    if r1_id and r2_id:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
    else:
        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
        parameters.extend(['--readFilesIn', '/data/R1.fastq'])
    # Call: STAR Mapping
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Write to fileStore
    transcriptome_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.toTranscriptome.out.bam'))
    sorted_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    if wiggle:
        wiggle_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaSignal.UniqueMultiple.str1.out.bg'))
        return transcriptome_id, sorted_id, wiggle_id
    else:
        return transcriptome_id, sorted_id
def test_docker_call(tmpdir):
    from toil_scripts.lib.programs import docker_call
    work_dir = str(tmpdir)
    parameter = ['--help']
    tool = 'quay.io/ucsc_cgl/samtools'
    docker_call(work_dir=work_dir, parameters=parameter, tool=tool)
    # Test outfile
    fpath = os.path.join(work_dir, 'test')
    with open(fpath, 'w') as f:
        docker_call(tool='ubuntu', env=dict(foo='bar'), parameters=['printenv', 'foo'], outfile=f)
    assert open(fpath).read() == 'bar\n'
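# A rough sketch of what `docker_call` does, assuming the behaviour implied by
# its call sites and the test above: mount work_dir at /data, pass env entries
# as -e flags, and either stream stdout to outfile or discard it. The real
# implementation in toil_scripts.lib.programs also handles inputs/outputs
# checking, sudo, mock mode, and container cleanup; this is illustration only.
import subprocess

def docker_call_sketch(tool, parameters, work_dir, env=None, outfile=None):
    base = ['docker', 'run', '--rm', '-v', '{}:/data'.format(work_dir)]
    for key, value in (env or {}).items():
        base.extend(['-e', '{}={}'.format(key, value)])
    command = base + [tool] + parameters
    if outfile is not None:
        subprocess.check_call(command, stdout=outfile)
    else:
        subprocess.check_call(command)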
def _download_with_genetorrent(url, file_path, cghub_key_path):
    parsed_url = urlparse(url)
    analysis_id = parsed_url.path[1:]
    assert parsed_url.scheme == 'gnos', 'Improper format. gnos://cghub/ID. User supplied: {}'.format(parsed_url)
    work_dir = os.path.dirname(file_path)
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    parameters = ['-vv', '-c', cghub_key_path, '-d', analysis_id]
    docker_call(tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    sample = glob.glob(os.path.join(folder_path, '*tar*'))
    assert len(sample) == 1, 'Expected exactly one sample tar in CGHub download: {}'.format(analysis_id)
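# Hypothetical usage sketch for the downloader above; the analysis ID, download
# path, and CGHub key path are placeholders, not real values.
url = 'gnos://cghub/00000000-0000-0000-0000-000000000000'
_download_with_genetorrent(url, file_path='/mnt/downloads/sample.tar',
                           cghub_key_path='/home/ubuntu/cghub.key')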
def spladder(job, inputs, bam_id, bai_id):
    """
    Run SplAdder to detect and quantify alternative splicing events

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str bam_id: FileStore ID of bam
    :param str bai_id: FileStore ID of bam index file
    :return: FileStore ID of SplAdder tarball
    :rtype: str
    """
    job.fileStore.logToMaster('SplAdder: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    # Pull in alignment.bam from fileStore
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'alignment.bam'))
    job.fileStore.readGlobalFile(bai_id, os.path.join(work_dir, 'alignment.bam.bai'))
    # Download input files
    download_url(url=inputs.gtf, work_dir=work_dir, name='annotation.gtf')
    download_url(url=inputs.gtf_pickle, work_dir=work_dir, name='annotation.gtf.pickle')
    # Call Spladder
    command = ['--insert_ir=y',
               '--insert_es=y',
               '--insert_ni=y',
               '--remove_se=n',
               '--validate_sg=n',
               '-b', 'alignment.bam',
               '-o', '/data',
               '-a', 'annotation.gtf',
               '-v', 'y',
               '-c', '3',
               '-M', 'single',
               '-T', 'n',
               '-n', '50',
               '-P', 'y',
               '-p', 'n',
               '--sparse_bam', 'y']
    docker_call(work_dir=work_dir, parameters=command, sudo=inputs.sudo, tool='jvivian/spladder:1.0')
    # Write output to fileStore and return ids
    output_pickle = os.path.join(work_dir, 'spladder', 'genes_graph_conf3.alignment.pickle')
    if not os.path.exists(output_pickle):
        matches = []
        for root, dirnames, filenames in os.walk(work_dir):
            for filename in fnmatch.filter(filenames, '*genes_graph*'):
                matches.append(os.path.join(root, filename))
        if matches:
            output_pickle = matches[0]
        else:
            raise RuntimeError("Couldn't find genes file!")
    output_filt = os.path.join(work_dir, 'alignment.filt.hdf5')
    output = os.path.join(work_dir, 'alignment.hdf5')
    job.fileStore.logToMaster('SplAdder output files: {}'.format(os.listdir(work_dir)))
    tarball_files('spladder.tar.gz', file_paths=[output_pickle, output_filt, output], output_dir=work_dir)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'spladder.tar.gz'))
def haplotype_caller(job, shared_ids, input_args):
    """
    Uses GATK HaplotypeCaller to identify SNPs and Indels and writes a gVCF.
    Calls per-sample genotyper to genotype the gVCF.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'toil.bam', 'toil.bam.bai']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '%s.raw.BOTH%s.gvcf' % (input_args['uuid'], input_args['suffix'])

    # Call: GATK -- HaplotypeCaller
    command = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY',  # RISKY! (?) See #189
               '-nct', str(input_args['cpu_count']),
               '-R', 'ref.fa',
               '-T', 'HaplotypeCaller',
               '--genotyping_mode', 'Discovery',
               '--emitRefConfidence', 'GVCF',
               '-I', 'toil.bam',
               '-o', output,
               '-variant_index_type', 'LINEAR',
               '-variant_index_parameter', '128000',
               '--annotation', 'QualByDepth',
               '--annotation', 'DepthPerSampleHC',
               '--annotation', 'FisherStrand',
               '--annotation', 'ReadPosRankSumTest']
    try:
        inputs = input_files
        outputs = {output: None}
        docker_call(work_dir=work_dir,
                    env={'JAVA_OPTS': '-Xmx%sg' % input_args['memory']},
                    parameters=command,
                    tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                    inputs=inputs,
                    outputs=outputs,
                    sudo=input_args['sudo'])
    except Exception:
        sys.stderr.write("Running haplotype caller with %s in %s failed.\n" % (" ".join(command), work_dir))
        raise

    # Update fileStore and spawn child job
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # Upload gvcf
    upload_or_move_hc(work_dir, input_args, output)

    # Call variants prior to vqsr
    job.addChildJobFn(genotype_gvcf, shared_ids, input_args, cores=input_args['cpu_count'])
def star(job, inputs, r1_cutadapt, r2_cutadapt):
    """
    Performs alignment of fastqs to BAM via STAR

    :param JobFunctionWrappingJob job: passed by Toil automatically
    :param Namespace inputs: Stores input arguments (see main)
    :param str r1_cutadapt: FileStore ID of read 1 fastq
    :param str r2_cutadapt: FileStore ID of read 2 fastq
    """
    job.fileStore.logToMaster('Aligning with STAR: {}'.format(inputs.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    cores = min(inputs.cores, 16)
    # Retrieve files
    job.fileStore.readGlobalFile(r1_cutadapt, os.path.join(work_dir, 'R1_cutadapt.fastq'))
    job.fileStore.readGlobalFile(r2_cutadapt, os.path.join(work_dir, 'R2_cutadapt.fastq'))
    # Get starIndex
    download_url(inputs.star_index, work_dir, 'starIndex.tar.gz')
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'starIndex.tar.gz'), '-C', work_dir])
    # Parameters
    parameters = ['--runThreadN', str(cores),
                  '--genomeDir', '/data/starIndex',
                  '--outFileNamePrefix', 'rna',
                  '--outSAMtype', 'BAM', 'SortedByCoordinate',
                  '--outSAMunmapped', 'Within',
                  '--quantMode', 'TranscriptomeSAM',
                  '--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD',
                  '--outFilterType', 'BySJout',
                  '--outFilterMultimapNmax', '20',
                  '--outFilterMismatchNmax', '999',
                  '--outFilterMismatchNoverReadLmax', '0.04',
                  '--alignIntronMin', '20',
                  '--alignIntronMax', '1000000',
                  '--alignMatesGapMax', '1000000',
                  '--alignSJoverhangMin', '8',
                  '--alignSJDBoverhangMin', '1',
                  '--sjdbScore', '1',
                  '--readFilesIn', '/data/R1_cutadapt.fastq', '/data/R2_cutadapt.fastq']
    # Call: STAR Map
    docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
                work_dir=work_dir, parameters=parameters)
    # Call: Samtools Index
    index_command = ['index', '/data/rnaAligned.sortedByCoord.out.bam']
    docker_call(work_dir=work_dir, parameters=index_command,
                tool='quay.io/ucsc_cgl/samtools:1.3--256539928ea162949d8a65ca5c79a72ef557ce7c')
    # fileStore
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam'))
    bai_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rnaAligned.sortedByCoord.out.bam.bai'))
    job.fileStore.deleteGlobalFile(r1_cutadapt)
    job.fileStore.deleteGlobalFile(r2_cutadapt)
    # Launch children and follow-on
    vcqc_id = job.addChildJobFn(variant_calling_and_qc, inputs, bam_id, bai_id, cores=2, disk='30G').rv()
    spladder_id = job.addChildJobFn(spladder, inputs, bam_id, bai_id, disk='30G').rv()
    job.addFollowOnJobFn(consolidate_output_tarballs, inputs, vcqc_id, spladder_id, disk='30G')
def run_bwa(job, inputs, ids):
    """
    Aligns two fastqs into a BAM via BWA

    :param JobFunctionWrappingJob job: Passed by Toil automatically
    :param Namespace inputs: Input arguments (see main)
    :param list ids: list of FileStore IDs (R1, R2, reference inputs)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_names = ['r1.fq.gz', 'r2.fq.gz', 'ref.fa', 'ref.fa.amb', 'ref.fa.ann',
                  'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa', 'ref.fa.fai']
    if inputs.alt:
        file_names.append('ref.fa.alt')
    for fileStoreID, name in zip(ids, file_names):
        job.fileStore.readGlobalFile(fileStoreID, os.path.join(work_dir, name))
    # Add read group line
    rg = "@RG\\tID:{0}\\tLB:{1}\\tPL:{2}\\tPU:{3}\\tSM:{0}".format(inputs.uuid, inputs.library,
                                                                   inputs.platform, inputs.program_unit)
    # BWA Options
    opt_args = []
    if not inputs.skip_sort:
        opt_args.append('-s')
    if inputs.trim:
        opt_args.append('-a')
    # Call: bwakit
    parameters = (['-t', str(inputs.cores),
                   '-R', rg] +
                  opt_args +
                  ['-o', '/data/aligned',
                   '/data/ref.fa',
                   '/data/r1.fq.gz',
                   '/data/r2.fq.gz'])
    outputs = {'aligned.aln.bam': inputs.mock_bam}
    docker_call(tool='quay.io/ucsc_cgl/bwakit:0.7.12--528bb9bf73099a31e74a7f5e6e3f2e0a41da486e',
                parameters=parameters, inputs=file_names, outputs=outputs, work_dir=work_dir)
    # bwakit insists on adding an `.aln.bam` suffix, so rename the output file
    output_file = os.path.join(work_dir, '{}.bam'.format(inputs.uuid))
    os.rename(os.path.join(work_dir, 'aligned.aln.bam'), output_file)
    # Either write file to local output directory or upload to S3 cloud storage
    job.fileStore.logToMaster('Aligned sample: {}'.format(inputs.uuid))
    if inputs.output_dir:
        move_files([output_file], inputs.output_dir)
    if inputs.s3_dir:
        s3am_upload(output_file, inputs.s3_dir, s3_key_path=inputs.ssec)
def call_conductor(masterIP, inputs, src, dst):
    """
    Invokes the Conductor container to copy files between S3 and HDFS

    :type masterIP: MasterAddress
    """
    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/conductor",
                docker_parameters=masterIP.docker_parameters(["--net=host"]),
                parameters=["--master", "spark://" + masterIP + ":" + SPARK_MASTER_PORT,
                            "--conf", "spark.driver.memory=%sg" % inputs["driverMemory"],
                            "--conf", "spark.executor.memory=%sg" % inputs["executorMemory"],
                            "--", "-C", src, dst],
                mock=False)
def run_picard_create_sequence_dictionary(job, ref_id):
    """
    Use Picard-tools to create a reference dictionary

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreID for the reference dictionary
    :rtype: str
    """
    job.fileStore.logToMaster('Creating reference dictionary')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['CreateSequenceDictionary', 'R=ref.fasta', 'O=ref.dict']
    docker_call(work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.dict'))
def run_samtools_faidx(job, ref_id):
    """
    Use Samtools to create a reference index file

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str ref_id: FileStoreID for the reference genome
    :return: FileStoreID for the reference index
    :rtype: str
    """
    job.fileStore.logToMaster('Creating reference index')
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
    command = ['faidx', 'ref.fasta']
    docker_call(work_dir=work_dir, parameters=command,
                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'ref.fasta.fai'))
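# A minimal sketch of wiring the two reference-preparation jobs above into a
# parent Toil job; `prepare_reference` is illustrative and not part of this
# module. The children run in parallel, and their .rv() promises resolve for
# downstream consumers once they finish.
def prepare_reference(job, ref_id):
    fai = job.addChildJobFn(run_samtools_faidx, ref_id).rv()
    ref_dict = job.addChildJobFn(run_picard_create_sequence_dictionary, ref_id).rv()
    return fai, ref_dict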
def genotype_gvcf(job, shared_ids, input_args):
    """
    Genotypes the gVCF generated by the HaplotypeCaller.
    Calls variant quality score recalibration functions.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    gvcf = '%s.raw.BOTH%s.gvcf' % (input_args['uuid'], input_args['suffix'])
    input_files = [gvcf, 'ref.fa', 'ref.fa.fai', 'ref.dict']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = 'unified.raw.BOTH.gatk.vcf'

    command = ['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY',  # RISKY! (?) See #189
               '-nt', str(input_args['cpu_count']),
               '-R', 'ref.fa',
               '-T', 'GenotypeGVCFs',
               '--variant', gvcf,
               '--out', output,
               '-stand_emit_conf', '10.0',
               '-stand_call_conf', '30.0']
    try:
        inputs = input_files
        outputs = {output: None}
        docker_call(work_dir=work_dir,
                    env={'JAVA_OPTS': '-Xmx%sg' % input_args['memory']},
                    parameters=command,
                    tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                    inputs=inputs,
                    outputs=outputs,
                    sudo=input_args['sudo'])
    except Exception:
        sys.stderr.write("Running GenotypeGVCFs with %s in %s failed.\n" % (" ".join(command), work_dir))
        raise

    # Update fileStore and spawn child jobs to run vqsr
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))
    job.addChildJobFn(vqsr_snp, shared_ids, input_args, cores=input_args['cpu_count'])
    job.addChildJobFn(vqsr_indel, shared_ids, input_args, cores=input_args['cpu_count'])
def call_adam(masterIP, inputs, arguments):
    """
    Invokes the ADAM container

    :type masterIP: MasterAddress
    """
    default_params = ["--master", ("spark://%s:%s" % (masterIP, SPARK_MASTER_PORT)),
                      "--conf", ("spark.driver.memory=%sg" % inputs["driverMemory"]),
                      "--conf", ("spark.executor.memory=%sg" % inputs["executorMemory"]),
                      "--conf", ("spark.hadoop.fs.default.name=hdfs://%s:%s" % (masterIP, HDFS_MASTER_PORT)),
                      "--conf", "spark.driver.maxResultSize=0",  # set max result size to unlimited, see #177
                      "--"]

    docker_call(rm=False,
                tool="quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                docker_parameters=masterIP.docker_parameters(["--net=host"]),
                parameters=default_params + arguments,
                mock=False)
def run_merge_vcf(job, options, index_dir_id, work_dir, vcf_file_key_list):
    """
    Merges the per-path VCFs into a single sample-level VCF and saves it to the output store.
    """
    RealTimeLogger.get().info("Completed gam merging and gam path variant calling.")
    RealTimeLogger.get().info("Starting vcf merging...")

    # Set up the IO stores each time, since we can't unpickle them on Azure for
    # some reason.
    input_store = IOStore.get(options.input_store)
    out_store = IOStore.get(options.out_store)

    # Download local input files from the remote storage container
    graph_dir = work_dir
    read_global_directory(job.fileStore, index_dir_id, graph_dir)

    vcf_merging_file_key_list = []
    for vcf_file_key in vcf_file_key_list:
        vcf_file = "{}/{}".format(work_dir, vcf_file_key)
        vcf_file_idx = "{}.tbi".format(vcf_file)
        out_store.read_input_file(vcf_file_key, vcf_file)
        out_store.read_input_file(vcf_file_key + ".tbi", vcf_file_idx)
        vcf_merging_file_key_list.append(vcf_file)

    vcf_merged_file_key = ""
    if len(vcf_file_key_list) > 1:
        # Merge vcf files; each input file must be a separate argument
        # (equivalent shell: bcftools concat -O z -o <merged> <inputs...>)
        vcf_merged_file_key = "{}.vcf.gz".format(options.sample_name)
        command = ['concat', '-O', 'z',
                   '-o', '{}/{}'.format(work_dir, vcf_merged_file_key)] + vcf_merging_file_key_list
        docker_call(work_dir=work_dir, parameters=command, tool='biodckr/bcftools')
        run("tabix -f -p vcf {}/{}".format(work_dir, vcf_merged_file_key))
    else:
        vcf_merged_file_key = vcf_file_key_list[0]

    # Save variant calling results to the output store
    vcf_file = "{}/{}".format(work_dir, vcf_merged_file_key)
    vcf_file_idx = "{}/{}.tbi".format(work_dir, vcf_merged_file_key)
    out_store.write_output_file(vcf_file, vcf_merged_file_key)
    out_store.write_output_file(vcf_file_idx, vcf_merged_file_key + ".tbi")

    # Run downloader to download output IO store files to local output directory.
    vcf_file_id = job.fileStore.writeGlobalFile(vcf_file)
    vcf_file_idx_id = job.fileStore.writeGlobalFile(vcf_file_idx)
    downloadList = [[vcf_file_id, vcf_merged_file_key], [vcf_file_idx_id, vcf_merged_file_key + ".tbi"]]

    return downloadList
def run_rsem(job, cores, bam_id, rsem_ref_url, paired=True):
    """
    RNA quantification with RSEM

    :param JobFunctionWrappingJob job: Passed automatically by Toil
    :param int cores: Number of cores to run RSEM with
    :param str bam_id: FileStoreID of transcriptome bam for quantification
    :param str rsem_ref_url: URL of RSEM reference (tarball)
    :param bool paired: If True, uses parameters for paired end data
    :return: FileStoreIDs for RSEM's gene and isoform output
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    download_url(url=rsem_ref_url, name='rsem_ref.tar.gz', work_dir=work_dir)
    subprocess.check_call(['tar', '-xvf', os.path.join(work_dir, 'rsem_ref.tar.gz'), '-C', work_dir])
    os.remove(os.path.join(work_dir, 'rsem_ref.tar.gz'))
    # Determine tarball structure - based on it, ascertain folder name and rsem reference prefix
    rsem_files = []
    for root, directories, files in os.walk(work_dir):
        rsem_files.extend([os.path.join(root, x) for x in files])
    # "grp" is a required RSEM extension that should exist in the RSEM reference
    ref_prefix = [os.path.basename(os.path.splitext(x)[0]) for x in rsem_files if 'grp' in x][0]
    ref_folder = os.path.join('/data', os.listdir(work_dir)[0]) if len(os.listdir(work_dir)) == 1 else '/data'
    # I/O
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'transcriptome.bam'))
    output_prefix = 'rsem'
    # Call: RSEM
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(cores),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', '/data/transcriptome.bam',
                  os.path.join(ref_folder, ref_prefix),
                  output_prefix]
    if paired:
        parameters = ['--paired-end'] + parameters
    docker_call(tool='quay.io/ucsc_cgl/rsem:1.2.25--d4275175cc8df36967db460b06337a14f40d2f21',
                parameters=parameters, work_dir=work_dir)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'),
              os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'),
              os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    gene_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_gene.tab'))
    isoform_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_isoform.tab'))
    return gene_id, isoform_id
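# A minimal sketch, assuming a parent job that chains RSEM quantification into
# the post-processing step defined earlier; the wrapping job function and disk
# values are illustrative.
def rsem_quantification(job, uuid, cores, bam_id, rsem_ref_url):
    rsem = job.addChildJobFn(run_rsem, cores, bam_id, rsem_ref_url, paired=True,
                             cores=cores, disk='20G')
    # .rv(0) / .rv(1) index into run_rsem's (gene_id, isoform_id) return value
    post = rsem.addChildJobFn(run_rsem_postprocess, uuid, rsem.rv(0), rsem.rv(1), disk='10G')
    return post.rv()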
def run_samtools_index(job, bam_id):
    """
    Runs samtools index to create (.bai) files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str bam_id: FileStoreID of the bam file
    :return: BAM index FileStoreID
    :rtype: str
    """
    work_dir = job.fileStore.getLocalTempDir()
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, 'sample.bam'))
    # Call: index the bam
    parameters = ['index', '/data/sample.bam']
    docker_call(work_dir=work_dir, parameters=parameters,
                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
    # Write to fileStore
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.bam.bai'))
def run_indel_realignment(job, intervals, bam, bai, ref, ref_dict, fai, phase, mills, mem, unsafe=False):
    """
    Creates realigned bams using the intervals file from the previous step

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str intervals: Indel interval FileStoreID
    :param str bam: Sample BAM FileStoreID
    :param str bai: Bam Index FileStoreID
    :param str ref: Reference genome FileStoreID
    :param str ref_dict: Reference dictionary FileStoreID
    :param str fai: Reference index FileStoreID
    :param str phase: Phase VCF FileStoreID
    :param str mills: Mills VCF FileStoreID
    :param str mem: Memory value to be passed to children. Needed for CI tests
    :param bool unsafe: If True, runs gatk UNSAFE mode: "-U ALLOW_SEQ_DICT_INCOMPATIBILITY"
    :return: FileStoreIDs for the realigned bam and its index
    :rtype: tuple(str, str)
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_ids = [ref, fai, ref_dict, intervals, bam, bai, phase, mills]
    inputs = ['ref.fasta', 'ref.fasta.fai', 'ref.dict', 'sample.intervals',
              'sample.bam', 'sample.bam.bai', 'phase.vcf', 'mills.vcf']
    for file_store_id, name in zip(file_ids, inputs):
        job.fileStore.readGlobalFile(file_store_id, os.path.join(work_dir, name))
    # Call: GATK -- IndelRealigner
    parameters = ['-T', 'IndelRealigner',
                  '-R', '/data/ref.fasta',
                  '-I', '/data/sample.bam',
                  '-known', '/data/phase.vcf',
                  '-known', '/data/mills.vcf',
                  '-targetIntervals', '/data/sample.intervals',
                  '--downsampling_type', 'NONE',
                  '-maxReads', str(720000),  # Taken from MC3 pipeline
                  '-maxInMemory', str(5400000),  # Taken from MC3 pipeline
                  '-o', '/data/sample.indel.bam']
    if unsafe:
        parameters.extend(['-U', 'ALLOW_SEQ_DICT_INCOMPATIBILITY'])
    docker_call(tool='quay.io/ucsc_cgl/gatk:3.5--dba6dae49156168a909c43330350c6161dc7ecc2',
                inputs=inputs,
                outputs={'sample.indel.bam': None, 'sample.indel.bai': None},
                work_dir=work_dir, parameters=parameters,
                env=dict(JAVA_OPTS='-Xmx{}'.format(mem)))
    # Write to fileStore
    indel_bam = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.indel.bam'))
    indel_bai = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sample.indel.bai'))
    return indel_bam, indel_bai
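# A minimal sketch of chaining the promise-style jobs above: index the bam,
# then realign around the supplied intervals. The wrapping job function, the
# provenance of `intervals`, and the resource values are illustrative.
def realign_sample(job, cores, bam_id, intervals, ref, ref_dict, fai, phase, mills, mem):
    bai = job.addChildJobFn(run_samtools_index, bam_id, disk='10G').rv()
    # The follow-on runs after the child completes, so the bai promise is resolved
    return job.addFollowOnJobFn(run_indel_realignment, intervals, bam_id, bai,
                                ref, ref_dict, fai, phase, mills, mem,
                                cores=cores, memory=mem, disk='25G').rv()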