def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
         (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir, config)
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def _prep_inputs(align_bams, ref_file, items):
    """Ensure inputs to calling are indexed as expected.
    """
    broad_runner = broad.runner_from_path("picard", items[0]["config"])
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, items[0]["config"])
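# A minimal usage sketch (assumed example, not from bcbio itself) of how
# _prep_inputs might be invoked before variant calling; the paths and the
# stub bcbio-style config dict are placeholders.
def _example_prep_inputs_usage():
    items = [{"config": {"algorithm": {}, "resources": {}}}]
    align_bams = ["/path/to/sample1-sort.bam", "/path/to/sample2-sort.bam"]
    ref_file = "/path/to/genome.fa"
    # Builds the Picard sequence dictionary for the reference and a .bai
    # index for each BAM so downstream GATK/Picard steps can run.
    _prep_inputs(align_bams, ref_file, items)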
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file,
                                                     platform, dbsnp_file, intervals, data)
    return [[data]]
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_path("picard", base_config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    for x in align_bams:
        bam.index(x, base_config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items]))
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params
def collect_oxog_metrics(data):
    """Extract 8-oxoguanine (OxoG) artifact metrics from CollectSequencingArtifacts
    output so we don't have to run CollectOxoGMetrics.
    """
    input_base = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                              dd.get_sample_name(data), dd.get_sample_name(data))
    if not utils.file_exists(input_base + ".pre_adapter_detail_metrics"):
        return None
    OUT_SUFFIXES = [".oxog_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "oxog", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    ref_file = dd.get_ref_file(data)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("--INPUT_BASE", input_base),
                  ("--OUTPUT_BASE", out_base),
                  ("--REFERENCE_SEQUENCE", ref_file)]
        picard.run("ConvertSequencingArtifactToOxoG", params)
    return out_files
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-

    use picard wrapper rather than gatk - works for gatk4 and gatk3 projects
    refactor - move to broad/picardrun
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("-REFERENCE_SEQUENCE", ref_file),
                  ("-INPUT", bam_file),
                  ("-OUTPUT", out_base)]
        # picard runner sets VALIDATION_STRINGENCY
        picard.run("CollectSequencingArtifactMetrics", params)
    return out_files
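# A hedged ordering sketch (assumed, not from the original modules): the two
# metrics helpers above are meant to run in sequence, since
# collect_artifact_metrics writes the .pre_adapter_detail_metrics file that
# collect_oxog_metrics converts. `data` is assumed to be a fully populated
# bcbio sample dict with work_dir, work_bam, ref_file and sample name set.
def _example_artifact_then_oxog(data):
    artifact_files = collect_artifact_metrics(data)
    # Only attempt the OxoG conversion once the artifact metrics exist.
    oxog_files = collect_oxog_metrics(data) if artifact_files else None
    return artifact_files, oxog_files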
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if data["config"]["algorithm"].get("recalibrate", True) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(data["name"]))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file,
                                                     platform, dbsnp_file, intervals, data)
    return [[data]]
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
def combine_bam(in_files, out_file, config):
    """Parallel target to combine multiple BAM files.
    """
    runner = broad.runner_from_path("picard", config)
    runner.run_fn("picard_merge", in_files, out_file)
    for in_file in in_files:
        save_diskspace(in_file, "Merged into {0}".format(out_file), config)
    bam.index(out_file, config)
    return out_file
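# Illustrative usage (assumed example): merging per-lane BAMs with combine_bam.
# The lane BAM paths, output name and stub config below are placeholders.
def _example_combine_bam_usage():
    config = {"algorithm": {}, "resources": {}}
    lane_bams = ["lane1-sort.bam", "lane2-sort.bam", "lane3-sort.bam"]
    # Merges with Picard, indexes the result, and may reclaim space from the
    # inputs via save_diskspace, depending on configuration.
    return combine_bam(lane_bams, "sample-merged.bam", config)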
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data),
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data),
                                     data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
def get_ref_bedtool(ref_file, config, chrom=None):
    """Retrieve a pybedtool BedTool object with reference sizes from input reference.
    """
    broad_runner = broad.runner_from_path("picard", config)
    ref_dict = broad_runner.run_fn("picard_index_ref", ref_file)
    ref_lines = []
    with contextlib.closing(pysam.Samfile(ref_dict, "r")) as ref_sam:
        for sq in ref_sam.header["SQ"]:
            if not chrom or sq["SN"] == chrom:
                ref_lines.append("%s\t%s\t%s" % (sq["SN"], 0, sq["LN"]))
    return pybedtools.BedTool("\n".join(ref_lines), from_string=True)
def get_ref_bedtool(ref_file, config, chrom=None):
    """Retrieve a pybedtool BedTool object with reference sizes from input reference.
    """
    broad_runner = broad.runner_from_path("picard", config)
    ref_dict = broad_runner.run_fn("picard_index_ref", ref_file)
    ref_lines = []
    with pysam.Samfile(ref_dict, "r") as ref_sam:
        for sq in ref_sam.header["SQ"]:
            if not chrom or sq["SN"] == chrom:
                ref_lines.append("%s\t%s\t%s" % (sq["SN"], 0, sq["LN"]))
    return pybedtools.BedTool("\n".join(ref_lines), from_string=True)
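# Hypothetical usage of get_ref_bedtool (not from the original source):
# restrict the reference intervals to a single chromosome. The reference path
# and stub config are placeholders.
def _example_ref_bedtool_usage():
    config = {"algorithm": {}, "resources": {}}
    chr20_regions = get_ref_bedtool("/path/to/genome.fa", config, chrom="chr20")
    for interval in chr20_regions:
        # One full-length interval per matching contig: name, 0, contig length.
        print(interval.chrom, interval.start, interval.end)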
def picard_prep(in_bam, names, ref_file, dirs, data):
    """Prepare input BAM using Picard and GATK cleaning tools.

    - ReorderSam to reorder file to reference
    - AddOrReplaceReadGroups to add read group information and coordinate sort
    - PrintReads with filters to remove problem records:
      - filterMBQ to remove reads with mismatching bases and base qualities
    """
    runner = broad.runner_from_path("picard", data["config"])
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", names["sample"]))
    runner.run_fn("picard_index_ref", ref_file)
    reorder_bam = os.path.join(work_dir, "%s-reorder.bam" %
                               os.path.splitext(os.path.basename(in_bam))[0])
    reorder_bam = runner.run_fn("picard_reorder", in_bam, ref_file, reorder_bam)
    rg_bam = runner.run_fn("picard_fix_rgs", reorder_bam, names)
    return _filter_bad_reads(rg_bam, ref_file, data)
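# A minimal invocation sketch for picard_prep (assumed example; the names,
# dirs and data dicts below are placeholder stubs, not real pipeline state).
def _example_picard_prep_usage():
    names = {"sample": "NA12878", "rg": "rg0", "pl": "illumina",
             "pu": "lane1", "lb": "lib1"}
    dirs = {"work": "/path/to/work"}
    data = {"config": {"algorithm": {}, "resources": {}}}
    # Reorders to the reference, fixes read groups, then filters problem reads.
    return picard_prep("/path/to/input.bam", names, "/path/to/genome.fa", dirs, data)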
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        bam.index(x, config)
    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items]))
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def ref_file_from_bam(bam_file, data):
    """Subset a fasta input file to only a fraction of input contigs.
    """
    new_ref = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref")),
                           "%s-subset.fa" % dd.get_genome_build(data))
    if not utils.file_exists(new_ref):
        with file_transaction(data, new_ref) as tx_out_file:
            contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
            with open(contig_file, "w") as out_handle:
                for contig in [x.contig for x in idxstats(bam_file, data) if x.contig != "*"]:
                    out_handle.write("%s\n" % contig)
            cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data), contig_file, tx_out_file)
            do.run(cmd, "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    runner = broad.runner_from_path("picard", data["config"])
    runner.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def _fix_unmapped(unmapped_file, config, names):
    """The unmapped.bam file from Tophat 2.0.9 is missing some things:

    1) the RG tag is missing from the reads
    2) MAPQ is set to 255 instead of 0
    3) for reads where both are unmapped, the mate_is_unmapped flag is not set correctly
    """
    out_file = os.path.splitext(unmapped_file)[0] + "_fixed.bam"
    if file_exists(out_file):
        return out_file
    picard = broad.runner_from_path("picard", config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    fixed = bam.sort(rg_fixed, config, "queryname")
    with closing(pysam.Samfile(fixed)) as work_sam:
        with file_transaction(config, out_file) as tx_out_file:
            tx_out = pysam.Samfile(tx_out_file, "wb", template=work_sam)
            for read1 in work_sam:
                if not read1.is_paired:
                    if read1.is_unmapped:
                        read1.mapq = 0
                    tx_out.write(read1)
                    continue
                read2 = work_sam.next()
                if read1.qname != read2.qname:
                    continue
                if read1.is_unmapped and not read2.is_unmapped:
                    read1.mapq = 0
                    read1.tid = read2.tid
                if not read1.is_unmapped and read2.is_unmapped:
                    read2.mapq = 0
                    read2.tid = read1.tid
                if read1.is_unmapped and read2.is_unmapped:
                    read1.mapq = 0
                    read2.mapq = 0
                    read1.mate_is_unmapped = True
                    read2.mate_is_unmapped = True
                tx_out.write(read1)
                tx_out.write(read2)
            tx_out.close()
    return out_file
def _add_rg(unmapped_file, config, names):
    """Add the missing RG header."""
    picard = broad.runner_from_path("picard", config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    return rg_fixed
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """Run alignment using Tophat v2.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.iteritems():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = "%s %s" % (sys.executable, str(tophat_ready.bake(*files)))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", out_file, data["sam_ref"],
                          os.path.splitext(out_file)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s"
                                 % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data),
                                     data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:
        # kraken doesn't need a bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """Run alignment using Tophat v2.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out