def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "tophat", "star"], \
        "Disambiguation only supported for bwa, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                            os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            Args = collections.namedtuple("Args", "A B output_dir intermediate_dir "
                                          "no_sort prefix aligner")
            args = Args(work_bam_a, work_bam_b, tx_out_dir, tx_out_dir, True, "", aligner)
            disambiguate_main(args)
    data_a["disambiguate"] = \
        {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
         "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
         "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
         "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
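# Every example in this listing leans on the same bam.sort helper. The sketch
# below shows the interface these call sites assume: a samtools wrapper that
# returns the path to a (possibly already cached) sorted BAM. This is an
# illustrative reconstruction under those assumptions, not bcbio's actual
# implementation, which also threads core/memory resources through `config`.
import os
import subprocess

def sort_bam_sketch(in_bam, config, order="coordinate", out_dir=None):
    """Minimal sketch: sort a BAM by coordinate or queryname with samtools."""
    suffix = ".nsorted.bam" if order == "queryname" else ".sorted.bam"
    base_path = in_bam if out_dir is None else os.path.join(out_dir, os.path.basename(in_bam))
    out_bam = os.path.splitext(base_path)[0] + suffix
    if not os.path.exists(out_bam):  # reuse existing output, like utils.file_exists
        cmd = ["samtools", "sort", "-o", out_bam, in_bam]
        if order == "queryname":
            cmd.insert(2, "-n")  # samtools sort -n sorts by read name
        subprocess.check_call(cmd)
    return out_bam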
def run_cplusplus(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "tophat", "star"], \
        "Disambiguation only supported for bwa, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                            os.pardir, os.pardir, "disambiguate"))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            raise NotImplementedError("Still need to test and support C++ version")
            cmd = ""
            do.run(cmd.format(**locals()), "Disambiguation", data_a)
    data_a["disambiguate"] = \
        {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
         "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
         "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
         "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        # kraken doesn't need a BAM
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):
            logger.info("QC: %s %s" % (dd.get_sample_name(data),
                                       ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    shutil.move(fixed_count_file, count_file)
    return count_file
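# The featureCounts examples call _paired_flag and _strand_flag helpers that
# are not shown in this listing. The versions below are hypothetical,
# simplified sketches of what they plausibly do: featureCounts takes
# -s 0/1/2 for unstranded, stranded and reverse-stranded libraries, and -p to
# count paired-end fragments. The real helpers take the bcbio `bam`/`data`
# objects rather than these plain arguments.
def _strand_flag_sketch(strandedness):
    """Map a library strandedness label to a featureCounts -s value."""
    # dUTP "firststrand" protocols read the antisense strand, hence -s 2
    return {"unstranded": "0", "firststrand": "2", "secondstrand": "1"}.get(strandedness, "0")

def _paired_flag_sketch(is_paired):
    """Return the featureCounts flag enabling paired-end fragment counting."""
    return "-p" if is_paired else ""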
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    input_file = datadict.get_work_bam(data)
    input_file = bam.sort(input_file, datadict.get_config(data), order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup', sample_name)
    output_dir = utils.safe_makedir(output_dir)
    input_file_name, input_file_extension = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(output_dir,
                               f'{input_file_name}.deduplicated{input_file_extension}')
    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]
    deduplicate_bismark = config_utils.get_program('deduplicate_bismark', data['config'])
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'remove duplicate alignments')
    data = datadict.set_work_bam(data, output_file)
    return [[data]]
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    # complexity metrics only make sense for paired-end reads
    if not bam.is_paired(work_bam):
        return data
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (f"{bedtools} bamtobed -bedpe -i {work_bam} | "
               "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
               "sort | "
               "uniq -c | "
               "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
               "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
               f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
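# The mt/m0/m1/m2 counts written above (total fragment pairs, distinct
# fragment positions, positions seen once, positions seen twice) map directly
# onto the ENCODE library complexity measures. A small, assumption-labeled
# sketch of turning the CSV produced by calculate_complexity_metrics into
# NRF/PBC1/PBC2; the file name and column order follow the function above.
import csv

def encode_complexity_measures(metrics_file):
    """Compute NRF, PBC1 and PBC2 from a <sample>-atac-metrics.csv file."""
    with open(metrics_file) as in_handle:
        row = next(csv.DictReader(in_handle))
    mt, m0, m1, m2 = (int(row[k]) for k in ("mt", "m0", "m1", "m2"))
    return {"NRF": m0 / mt,   # non-redundant fraction: distinct / total pairs
            "PBC1": m1 / m0,  # PCR bottlenecking coefficient 1
            "PBC2": m1 / m2 if m2 else float("inf")}  # bottlenecking coefficient 2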
def extract_NF_regions(data):
    """
    extract the nucleosome free regions from the work_bam. These regions will
    be < 100 bases
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    if file_exists(out_file):
        data["NF_bam"] = out_file
        return data
    with file_transaction(out_file) as tx_out_file, \
         file_transaction(log_file) as tx_log_file:
        tx_unsorted_bam = tx_out_file + ".unsorted"
        cmd = (f"{sieve} --bam {work_bam} --outFile {tx_unsorted_bam} --ATACshift "
               f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
               f"--minMappingQuality 10 "
               f"--filterMetrics {tx_log_file} ")
        do.run(cmd, f"Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
        # sieving can leave the output unsorted; sort it into the
        # transactional path so the transaction commits the sorted BAM
        sorted_bam = bam.sort(tx_unsorted_bam, dd.get_config(data))
        shutil.move(sorted_bam, tx_out_file)
    data["NF_bam"] = out_file
    return data
def _get_files(data):
    in_file = bam.sort(data["work_bam"], data["config"], order="queryname")
    gtf_file = data["genome_resources"]["rnaseq"]["transcripts"]
    work_dir = data["dirs"].get("work", "work")
    out_dir = os.path.join(work_dir, "htseq-count")
    out_file = os.path.join(out_dir, data['rgnames']['sample']) + ".counts"
    stats_file = os.path.join(out_dir, data['rgnames']['sample']) + ".stats"
    return in_file, gtf_file, out_file, stats_file
def _prepare_bam_file(bam_file, tmp_dir, config):
    """
    Ensure the BAM file is sorted by query name, re-sorting it if it is
    currently sorted by coordinates.
    """
    sort_mode = _get_sort_order(bam_file, config)
    if sort_mode != "queryname":
        bam_file = sort(bam_file, config, "queryname")
    return bam_file
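# _prepare_bam_file relies on a _get_sort_order helper that is not part of
# this listing. A minimal sketch of what it plausibly does, assuming pysam is
# available: read the SO field of the BAM @HD header line.
import pysam

def _get_sort_order_sketch(bam_file, config):
    """Return the sort order recorded in the BAM header (e.g. 'coordinate',
    'queryname'), or an empty string if none is recorded.
    `config` is unused here; it is kept only to mirror the call site above."""
    with pysam.AlignmentFile(bam_file, "rb") as in_bam:
        return in_bam.header.to_dict().get("HD", {}).get("SO", "")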
def _get_files(data):
    mapped = bam.mapped(data["work_bam"], data["config"])
    in_file = bam.sort(mapped, data["config"], order="queryname")
    gtf_file = data["genome_resources"]["rnaseq"]["transcripts"]
    work_dir = data["dirs"].get("work", "work")
    out_dir = os.path.join(work_dir, "htseq-count")
    out_file = os.path.join(out_dir, data['rgnames']['sample']) + ".counts"
    stats_file = os.path.join(out_dir, data['rgnames']['sample']) + ".stats"
    return in_file, gtf_file, out_file, stats_file
def _get_files(data):
    mapped = bam.mapped(data["work_bam"], data["config"])
    in_file = bam.sort(mapped, data["config"], order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(out_dir, sample_name + ".counts")
    stats_file = os.path.join(out_dir, sample_name + ".stats")
    return in_file, gtf_file, out_file, stats_file
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(out_dir, "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    resources = config_utils.get_resources("featureCounts", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            # keys must be a sequence and the value the count file, not the
            # count function
            data = tz.assoc_in(data, ["peak_counts"], count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {sorted_bam}")
    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ["peak_counts"], count_file)
    return [[data]]
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "hisat2", "tophat", "star"], \
        "Disambiguation only supported for bwa, hisat2, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(
            os.path.join(os.path.dirname(work_bam_a), os.pardir, os.pardir,
                         "disambiguate_%s" % aligner)))
        out_dir = os.path.join(base_dir, "_".join([str(x) for x in data_a["align_split"].split("-")]))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            _run_cplusplus(work_bam_a, work_bam_b, tx_out_dir, aligner,
                           os.path.basename(base_name), items)
    data_a["disambiguate"] = \
        {data_b["genome_build"]: bam.sort("%s.disambiguatedSpeciesB.bam" % base_name, config),
         "%s-ambiguous" % data_a["genome_build"]: bam.sort("%s.ambiguousSpeciesA.bam" % base_name, config),
         "%s-ambiguous" % data_b["genome_build"]: bam.sort("%s.ambiguousSpeciesB.bam" % base_name, config),
         "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "tophat", "star"], \
        "Disambiguation only supported for bwa, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                            os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(out_dir) as tx_out_dir:
            Args = collections.namedtuple("Args", "A B output_dir intermediate_dir "
                                          "no_sort prefix aligner")
            args = Args(work_bam_a, work_bam_b, tx_out_dir, tx_out_dir, True, "", aligner)
            disambiguate_main(args)
    data_a["disambiguate"] = \
        {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
         "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
         "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
         "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
def run_cplusplus(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "hisat2", "tophat", "star"], \
        "Disambiguation only supported for bwa, hisat2, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                            os.pardir, os.pardir, "disambiguate"))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            raise NotImplementedError("Still need to test and support C++ version")
            cmd = ""
            do.run(cmd.format(**locals()), "Disambiguation", data_a)
    data_a["disambiguate"] = \
        {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
         "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
         "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
         "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
def _align_from_fastq(fastq1, fastq2, aligner, align_ref, sam_ref, names,
                      align_dir, data):
    """Align from fastq inputs, producing sorted BAM output.
    """
    config = data["config"]
    align_fn = TOOLS[aligner].align_fn
    out = align_fn(fastq1, fastq2, align_ref, names, align_dir, data)
    # handle align functions that update the main data dictionary in place
    if isinstance(out, dict):
        assert "work_bam" in out
        return out
    # handle output of raw SAM files that need to be converted to BAM
    else:
        work_bam = bam.sam_to_bam(out, config)
        data["work_bam"] = bam.sort(work_bam, config)
        return data
def _run_meth_extractor(bam_in, sample, workdir, index_dir, config):
    """Run bismark_methylation_extractor command"""
    bismark = config_utils.get_program("bismark_methylation_extractor", config)
    cores = config["algorithm"].get('cores', 1)
    memory = config["algorithm"].get('mem', 5)
    bam_in = bam.sort(bam_in, config, order="queryname")
    cmd = ("{bismark} --no_overlap --comprehensive --cytosine_report "
           "--genome_folder {index_dir} --merge_non_CpG --multicore {cores} "
           "--buffer_size {memory}G --bedGraph --gzip {bam_in}")
    out_dir = os.path.join(workdir, sample)
    mbias_file = os.path.join(out_dir, os.path.basename(splitext_plus(bam_in)[0]) + '.M-bias.txt')
    if not file_exists(mbias_file):
        with tx_tmpdir() as tx_dir:
            with chdir(tx_dir):
                do.run(cmd.format(**locals()), "bismark_methylation_extractor in %s" % bam_in)
                shutil.move(tx_dir, out_dir)
    assert os.path.exists(mbias_file), "mbias report doesn't exist: %s" % mbias_file
    return mbias_file
def _fix_unmapped(unmapped_file, config, names):
    """
    the unmapped.bam file from Tophat 2.0.9 is missing some things
    1) the RG tag is missing from the reads
    2) MAPQ is set to 255 instead of 0
    3) for reads where both are unmapped, the mate_is_unmapped flag is not set correctly
    """
    out_file = os.path.splitext(unmapped_file)[0] + "_fixed.bam"
    if file_exists(out_file):
        return out_file
    picard = broad.runner_from_config(config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    fixed = bam.sort(rg_fixed, config, "queryname")
    with closing(pysam.Samfile(fixed)) as work_sam:
        with file_transaction(out_file) as tx_out_file:
            tx_out = pysam.Samfile(tx_out_file, "wb", template=work_sam)
            for read1 in work_sam:
                if not read1.is_paired:
                    if read1.is_unmapped:
                        read1.mapq = 0
                    tx_out.write(read1)
                    continue
                read2 = work_sam.next()
                if read1.qname != read2.qname:
                    continue
                if read1.is_unmapped and not read2.is_unmapped:
                    read1.mapq = 0
                    read1.tid = read2.tid
                if not read1.is_unmapped and read2.is_unmapped:
                    read2.mapq = 0
                    read2.tid = read1.tid
                if read1.is_unmapped and read2.is_unmapped:
                    read1.mapq = 0
                    read2.mapq = 0
                    read1.mate_is_unmapped = True
                    read2.mate_is_unmapped = True
                tx_out.write(read1)
                tx_out.write(read2)
            tx_out.close()
    return out_file
def clean_ATAC(data):
    """
    extract the nucleosome free regions from the work_bam. These regions will
    be < 100 bases. This also shifts the alignments for ATAC-seq.
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    logger.info(f"Selecting nucleosome free regions from {work_bam} and saving as {out_file}.")
    if utils.file_exists(out_file):
        data["full_bam"] = work_bam
        data["work_bam"] = out_file
        return data
    unsorted_bam = os.path.splitext(out_file)[0] + ".unsorted.bam"
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file, \
             file_transaction(log_file) as tx_log_file:
            tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
            cmd = (f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} --ATACshift "
                   f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
                   f"--minMappingQuality 10 "
                   f"--filterMetrics {tx_log_file} ")
            do.run(cmd, f"Extract NF regions from {work_bam} to {tx_unsorted_file}.")
            # shifting can cause the file to become unsorted
            sorted_file = bam.sort(tx_unsorted_file, dd.get_config(data), force=True)
            shutil.move(sorted_file, tx_out_file)
    bam.index(out_file, dd.get_config(data))
    data["full_bam"] = work_bam
    data["work_bam"] = out_file
    return data
def _fix_unmapped(unmapped_file, config, names):
    """
    the unmapped.bam file from Tophat 2.0.9 is missing some things
    1) the RG tag is missing from the reads
    2) MAPQ is set to 255 instead of 0
    3) for reads where both are unmapped, the mate_is_unmapped flag is not set correctly
    """
    out_file = os.path.splitext(unmapped_file)[0] + "_fixed.bam"
    if file_exists(out_file):
        return out_file
    picard = broad.runner_from_config(config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    fixed = bam.sort(rg_fixed, config, "queryname")
    with closing(pysam.Samfile(fixed)) as work_sam:
        with file_transaction(config, out_file) as tx_out_file:
            tx_out = pysam.Samfile(tx_out_file, "wb", template=work_sam)
            for read1 in work_sam:
                if not read1.is_paired:
                    if read1.is_unmapped:
                        read1.mapq = 0
                    tx_out.write(read1)
                    continue
                read2 = work_sam.next()
                if read1.qname != read2.qname:
                    continue
                if read1.is_unmapped and not read2.is_unmapped:
                    read1.mapq = 0
                    read1.tid = read2.tid
                if not read1.is_unmapped and read2.is_unmapped:
                    read2.mapq = 0
                    read2.tid = read1.tid
                if read1.is_unmapped and read2.is_unmapped:
                    read1.mapq = 0
                    read2.mapq = 0
                    read1.mate_is_unmapped = True
                    read2.mate_is_unmapped = True
                tx_out.write(read1)
                tx_out.write(read2)
            tx_out.close()
    return out_file
def shift_ATAC(data):
    """
    shift the ATAC-seq alignments
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-shifted.bam"
    log_file = os.path.splitext(work_bam)[0] + "-shifted.log"
    if utils.file_exists(out_file):
        data["work_bam"] = out_file
        return data
    unsorted_bam = os.path.splitext(out_file)[0] + ".unsorted.bam"
    # shifting removes all reads if the BAM file is not paired
    shiftflag = "--ATACshift" if bam.is_paired(work_bam) else ""
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file, \
             file_transaction(log_file) as tx_log_file:
            tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
            cmd = (f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} "
                   f"{shiftflag} "
                   f"--numberOfProcessors {num_cores} --maxFragmentLength 0 "
                   f"--minFragmentLength 0 "
                   f"--minMappingQuality 10 "
                   f"--filterMetrics {tx_log_file} ")
            do.run(cmd, f"Shifting ATAC-seq alignments in {work_bam} to {tx_unsorted_file}.")
            # shifting can cause the file to become unsorted
            sorted_file = bam.sort(tx_unsorted_file, dd.get_config(data), force=True)
            shutil.move(sorted_file, tx_out_file)
    bam.index(out_file, dd.get_config(data))
    data["work_bam"] = out_file
    return data
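# Why shift at all: Tn5 binds as a dimer and duplicates 9 bp of target
# sequence, so by convention the actual insertion point sits +4 bp into a
# plus-strand read and -5 bp into a minus-strand read. A tiny helper
# expressing that arithmetic, as an illustration of the convention
# alignmentSieve's --ATACshift applies, not of its implementation:
def tn5_shifted_cut_site(five_prime_pos, is_reverse):
    """Return the estimated Tn5 insertion point for a read's 5' position."""
    return five_prime_pos - 5 if is_reverse else five_prime_pos + 4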
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)
    ref_file, runner = _determine_aligner_and_reference(ref_file, config)
    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")
    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")
    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
def _get_sam_file(data):
    in_file = data["work_bam"]
    config = data["config"]
    # avoid shadowing the built-in sorted()
    sorted_bam = bam.sort(in_file, config, "queryname")
    sam = bam.bam_to_sam(sorted_bam, config)
    return sam
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    # Construct name of sorted input files
    work_bam_a_nsorted = os.path.splitext(data_a["work_bam"])[0] + '.nsorted.bam'
    work_bam_b_nsorted = os.path.splitext(data_b["work_bam"])[0] + '.nsorted.bam'
    # logger.info('Disambiguate prep of input BAM {} and {}'.format(work_bam_a_nsorted, work_bam_b_nsorted))
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(
            os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir, os.pardir,
                         "disambiguate_%s" % aligner)))
        logger.info('Disambiguate prep of prepped work bam BAM {} with base dir {}'.format(
            work_bam_a_nsorted, base_dir))
        split_name = "_".join([str(x) for x in data_a["align_split"].split("-")])
        out_dir = os.path.join(base_dir, split_name)
        logger.info('Disambiguate prep of prepped work bam BAM {} with out dir {}'.format(
            work_bam_a_nsorted, out_dir))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a_nsorted),
                                                os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a_nsorted))[0])
    logger.info('Disambiguate prep of prepped work bam BAM {} with base name {}'.format(
        work_bam_a_nsorted, base_name))
    summary_file = "%s_summary.txt" % base_name
    explant_bam = "%s.explant.sorted.bam" % base_name
    ambiguous_bam = "%s.ambiguous.sorted.bam" % base_name
    work_bam = "%s.human.sorted.bam" % base_name
    logger.info('Disambiguate prep with work bam {}'.format(work_bam))
    logger.info('Deciding if disambiguation is required. Checking for existence of '
                '{}, {}, {} and {}'.format(summary_file, explant_bam, ambiguous_bam, work_bam))
    if (not utils.file_exists(summary_file) or not utils.file_exists(explant_bam)
            or not utils.file_exists(ambiguous_bam) or not utils.file_exists(work_bam)):
        logger.info('Disambiguating work bam a {} since outputs are not already existing'.format(
            work_bam_a_nsorted))
        work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
        work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
        logger.info('Disambiguate run with work bam a {}'.format(work_bam_a))
        logger.info('Disambiguate run with work bam b {}'.format(work_bam_b))
        with file_transaction(items[0], out_dir) as tx_out_dir:
            logger.info('Disambiguate run with sorted prep work bam a {} and tx out dir {}'.format(
                work_bam_a_nsorted, tx_out_dir))
            tmp_base_name = os.path.join(tx_out_dir, os.path.basename(base_name))
            logger.info('Disambiguate run with sorted prep work bam a {} and tmp_base_name {}'.format(
                work_bam_a_nsorted, tmp_base_name))
            pdx_filter = PDXFilter(work_bam_a, work_bam_b,
                                   "%s.human.bam" % tmp_base_name,  # Must be bam else it will not be merged
                                   "%s.explant.bam" % tmp_base_name,  # Must be bam else it will not be merged
                                   "%s.ambiguous.bam" % tmp_base_name,  # Must be bam else it will not be merged
                                   "%s_summary.txt" % tmp_base_name,
                                   hard_filter=True,
                                   debug=True)
            pdx_filter.run()
        # Perhaps this can be removed since it has been fixed in bcbio
        if data_a.get("align_split"):
            split_dir = os.path.join(out_dir, split_name)
            logger.info('Disambiguate post-run with sorted prep work bam a {} and split dir {}'.format(
                work_bam_a_nsorted, split_dir))
            if os.path.isdir(split_dir):
                for tmp_file in os.listdir(split_dir):
                    logger.info('Disambiguate post-run with sorted prep work bam a {} aiming to move file {}'.format(
                        work_bam_a_nsorted, tmp_file))
                    src = os.path.join(split_dir, tmp_file)
                    if os.path.isfile(src):
                        dest = os.path.join(out_dir, tmp_file)
                        logger.info('Disambiguate post-run with sorted prep work bam a {} moving file {} from {} to {}'.format(
                            work_bam_a_nsorted, tmp_file, src, dest))
                        shutil.move(src, dest)
                shutil.rmtree(split_dir)
        try:
            if work_bam_a != data_a["work_bam"]:
                os.remove(work_bam_a)
        except:
            pass
        try:
            if work_bam_b != data_b["work_bam"]:
                os.remove(work_bam_b)
        except:
            pass
    else:
        logger.info('Skipping disambiguation for work bam a {} since outputs are already existing'.format(
            work_bam_a_nsorted))
    explant_bam = os.path.isfile(explant_bam) and explant_bam or \
        bam.sort("%s.explant.bam" % base_name, config)
    ambiguous_bam = os.path.isfile(ambiguous_bam) and ambiguous_bam or \
        bam.sort("%s.ambiguous.bam" % base_name, config)
    work_bam = os.path.isfile(work_bam) and work_bam or \
        bam.sort("%s.human.bam" % base_name, config)
    # logger.info('Disambiguate run with post work_bam {}'.format(work_bam))
    data_a["disambiguate"] = {data_b["genome_build"]: explant_bam,
                              "%s-ambiguous" % data_a["genome_build"]: ambiguous_bam,
                              "summary": summary_file}
    data_a["work_bam"] = work_bam
    try:
        os.remove("%s.explant.bam" % base_name)
    except:
        pass
    try:
        os.remove("%s.human.bam" % base_name)
    except:
        pass
    try:
        os.remove("%s.ambiguous.bam" % base_name)
    except:
        pass
    return [[data_a]]
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, config)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)
    ref_file, runner = _determine_aligner_and_reference(ref_file, config)
    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")
    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")
    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "%s.sam" % out_base)
    if file_exists(final_out):
        return final_out
    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.iteritems():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = str(tophat_ready.bake(*files))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
            _fix_empty_readnames(out_file)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    fixed = bam.bam_to_sam(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, config)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)
    ref_file, runner = _determine_aligner_and_reference(ref_file, config)
    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")
    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")
    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.iteritems():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = str(tophat_ready.bake(*files))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
            _fix_empty_readnames(out_file)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_config(config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", out_file, data["sam_ref"],
                          os.path.splitext(out_file)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out