def count_covariates(picard, dup_align_bam, ref_file, platform, snp_file):
    """Run GATK CountCovariates, step 1 of the recalibration process.

    The covariate table is written next to the input BAM, using the input
    path with its extension replaced by ``.recal``. When that file already
    exists the GATK run is skipped.

    Args:
        picard: Runner object providing ``run_gatk(params, tmp_dir)``.
        dup_align_bam: Path of the BAM file passed to GATK as ``-I``.
        ref_file: Path of the reference passed to GATK as ``-R``.
        platform: Value handed to ``--default_platform``.
        snp_file: Optional VCF path bound as ``dbsnp``; ignored when falsy.

    Returns:
        Path of the ``.recal`` covariate table.
    """
    recal_table = "%s.recal" % os.path.splitext(dup_align_bam)[0]

    covariates = ("ReadGroupCovariate", "QualityScoreCovariate",
                  "CycleCovariate", "DinucCovariate", "TileCovariate")
    gatk_args = ["-T", "CountCovariates"]
    for covariate in covariates:
        gatk_args.extend(["-cov", covariate])
    gatk_args.extend([
        "-recalFile", recal_table,
        "-I", dup_align_bam,
        "-R", ref_file,
        "-l", "INFO",
        "-U",
        "-OQ",
        "--default_platform", platform,
    ])
    if snp_file:
        gatk_args.extend(["-B", "dbsnp,VCF,%s" % snp_file])

    # Re-use a table left behind by a previous run.
    if os.path.exists(recal_table):
        return recal_table
    with curdir_tmpdir() as tmp_dir:
        picard.run_gatk(gatk_args, tmp_dir)
    return recal_table
def picard_sort(picard, align_bam):
    """Coordinate-sort an alignment file with Picard's SortSam.

    The output path is the input path with ``-sort`` inserted before the
    extension. Sorting is skipped when that file already exists.

    Args:
        picard: Runner object providing ``run(command, options)``.
        align_bam: Path of the alignment file to sort.

    Returns:
        Path of the sorted file.
    """
    root, suffix = os.path.splitext(align_bam)
    sorted_file = "%s-sort%s" % (root, suffix)
    if os.path.exists(sorted_file):
        return sorted_file
    with curdir_tmpdir() as tmp_dir:
        sort_opts = [
            ("INPUT", align_bam),
            ("OUTPUT", sorted_file),
            ("TMP_DIR", tmp_dir),
            ("SORT_ORDER", "coordinate"),
        ]
        picard.run("SortSam", sort_opts)
    return sorted_file
def picard_fixmate(picard, align_bam):
    """Run Picard's FixMateInformation, writing a coordinate-sorted file.

    The output path is the input path with ``-sort`` inserted before the
    extension. The tool is not run when that file already exists.

    Args:
        picard: Runner object providing ``run(command, options)``.
        align_bam: Path of the alignment file to process.

    Returns:
        Path of the output file.
    """
    root, suffix = os.path.splitext(align_bam)
    fixed_file = "%s-sort%s" % (root, suffix)
    if os.path.exists(fixed_file):
        return fixed_file
    with curdir_tmpdir() as tmp_dir:
        fixmate_opts = [
            ("INPUT", align_bam),
            ("OUTPUT", fixed_file),
            ("TMP_DIR", tmp_dir),
            ("SORT_ORDER", "coordinate"),
        ]
        picard.run("FixMateInformation", fixmate_opts)
    return fixed_file
def mark_duplicates(picard, align_bam):
    """Run Picard's MarkDuplicates on an alignment file.

    Output names derive from the input path without its extension, with
    every ``.`` in that path replaced by ``-``:

    * ``<base>-dup<ext>`` -- the BAM written by MarkDuplicates
    * ``<base>-dup.dup_metrics`` -- the metrics file

    The tool is not run when the output BAM already exists.

    Args:
        picard: Runner object providing ``run(command, options)``.
        align_bam: Path of the alignment file to process.

    Returns:
        Path of the duplicate-marked BAM (the metrics path is not returned).
    """
    root, suffix = os.path.splitext(align_bam)
    # The replacement applies to the whole path, directories included.
    root = root.replace(".", "-")
    marked_bam = "%s-dup%s" % (root, suffix)
    metrics_file = "%s-dup.dup_metrics" % root
    if os.path.exists(marked_bam):
        return marked_bam
    with curdir_tmpdir() as tmp_dir:
        dup_opts = [
            ("INPUT", align_bam),
            ("OUTPUT", marked_bam),
            ("TMP_DIR", tmp_dir),
            ("METRICS_FILE", metrics_file),
        ]
        picard.run("MarkDuplicates", dup_opts)
    return marked_bam
def merge_bam_files(bam_files, work_dir, config):
    """Merge several BAM files from one sample into a single BAM.

    The merged file is placed in ``work_dir`` and named after the first
    input file. If a file with that name already exists in ``work_dir``
    it is returned as-is and no merge is performed.

    Args:
        bam_files: Sequence of BAM paths; the first one names the output.
        work_dir: Directory that receives the merged file.
        config: Mapping read at ``config["program"]["picard"]`` to build
            the ``PicardRunner``.

    Returns:
        Path of the merged BAM file.
    """
    merged_name = os.path.basename(bam_files[0])
    merged_file = os.path.join(work_dir, merged_name)
    if os.path.exists(merged_file):
        return merged_file

    runner = PicardRunner(config["program"]["picard"])
    with utils.curdir_tmpdir() as tmp_dir:
        merge_opts = [
            ("OUTPUT", merged_file),
            ("SORT_ORDER", "coordinate"),
            ("TMP_DIR", tmp_dir),
        ]
        # One INPUT entry per source file, in the order given.
        merge_opts += [("INPUT", bam_path) for bam_path in bam_files]
        runner.run("MergeSamFiles", merge_opts)
    return merged_file
def fastq_to_bam(picard, sample_name, quality_format, read1, read2):
    """Convert fastq reads to an unaligned BAM with Picard's FastqToSam.

    The output is named after the basename of ``read1`` with its extension
    replaced by ``.bam``. No directory is prepended, so the path is
    relative to the current working directory. Conversion is skipped when
    that file already exists.

    Args:
        picard: Runner object providing ``run(command, options)``.
        sample_name: Value passed as ``SAMPLE_NAME``.
        quality_format: Value passed as ``QUALITY_FORMAT``.
        read1: Path of the first fastq file.
        read2: Path of the second fastq file; omitted when falsy.

    Returns:
        Name of the BAM file.
    """
    stem = os.path.splitext(os.path.basename(read1))[0]
    bam_file = "%s.bam" % stem
    if os.path.exists(bam_file):
        return bam_file
    with curdir_tmpdir() as tmp_dir:
        convert_opts = [
            ("FASTQ", read1),
            ("TMP_DIR", tmp_dir),
            ("QUALITY_FORMAT", quality_format),
            ("SAMPLE_NAME", sample_name),
            ("OUTPUT", bam_file),
        ]
        if read2:
            convert_opts.append(("FASTQ2", read2))
        picard.run("FastqToSam", convert_opts)
    return bam_file
def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None,
         sample_name=""):
    """Drive conversion of an alignment plus its fastq input into a sorted BAM.

    Steps, in order: load the YAML configuration, index the reference,
    convert the fastq file(s) to BAM, merge that BAM with the alignment,
    and sort the result. The helpers ``index_ref_file``,
    ``picard_fastq_to_bam``, ``picard_merge_bam`` and ``picard_sort`` are
    defined outside this block; what each one does is inferred from its
    name.

    Args:
        config_file: Path of a YAML file providing ``program.picard`` and
            ``algorithm.quality_format``.
        align_sam: Path of the alignment file; its directory is passed to
            the fastq conversion helper as the base directory.
        ref_file: Path of the reference file.
        fastq_one: Path of the first fastq file.
        fastq_pair: Optional path of the paired fastq file. Whether it is
            ``None`` is passed to the merge helper as a flag.
        sample_name: Sample name forwarded to the fastq conversion helper.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain data types. A bare yaml.load() can
        # instantiate arbitrary objects and, without an explicit Loader,
        # raises TypeError on PyYAML >= 6.
        config = yaml.safe_load(in_handle)
    picard = PicardRunner(config["program"]["picard"])
    index_ref_file(picard, ref_file)
    base_dir = os.path.split(align_sam)[0]
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(
            picard, fastq_one, fastq_pair, base_dir,
            config["algorithm"]["quality_format"], sample_name, tmp_dir)
        out_bam = picard_merge_bam(
            picard, align_sam, out_fastq_bam, ref_file, tmp_dir,
            fastq_pair is not None)
        picard_sort(picard, out_bam, tmp_dir)
def indel_realignment(picard, align_bam, ref_file, intervals):
    """Realign a BAM file in the given regions using GATK IndelRealigner.

    The output path is the input path with its extension replaced by
    ``-realign.bam``. GATK is run unless that file already exists and is
    non-empty, so a zero-byte output triggers a re-run.

    Args:
        picard: Runner object providing ``run_gatk(params, tmp_dir)``.
        align_bam: Path of the BAM file passed to GATK as ``-I``.
        ref_file: Path of the reference passed to GATK as ``-R``.
        intervals: Value passed to ``-targetIntervals``.

    Returns:
        Path of the realigned BAM file.
    """
    realigned_bam = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    gatk_args = [
        "-T", "IndelRealigner",
        "-I", align_bam,
        "-R", ref_file,
        "-targetIntervals", intervals,
        "-o", realigned_bam,
        "-l", "INFO",
    ]
    # A missing or zero-byte output means there is nothing usable yet.
    needs_run = (not os.path.exists(realigned_bam)
                 or not os.path.getsize(realigned_bam) > 0)
    if needs_run:
        with curdir_tmpdir() as tmp_dir:
            picard.run_gatk(gatk_args, tmp_dir)
    return realigned_bam
def gatk_recalibrate(picard, dup_align_bam, ref_file, recal_file, platform):
    """Run GATK TableRecalibration, step 2 of the recalibration process.

    Uses a previously computed covariate table to write a recalibrated
    BAM. The output path is the input path with its extension replaced by
    ``-gatkrecal.bam``. The GATK run is skipped when that file already
    exists.

    Args:
        picard: Runner object providing ``run_gatk(params, tmp_dir)``.
        dup_align_bam: Path of the BAM file passed to GATK as ``-I``.
        ref_file: Path of the reference passed to GATK as ``-R``.
        recal_file: Path of the covariate table passed as ``-recalFile``.
        platform: Value handed to ``--default_platform``.

    Returns:
        Path of the recalibrated BAM file.
    """
    recal_bam = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    gatk_args = [
        "-T", "TableRecalibration",
        "-recalFile", recal_file,
        "-R", ref_file,
        "-I", dup_align_bam,
        "-outputBam", recal_bam,
        "-l", "INFO",
        "-U",
        "-OQ",
        "--default_platform", platform,
    ]
    if os.path.exists(recal_bam):
        return recal_bam
    with curdir_tmpdir() as tmp_dir:
        picard.run_gatk(gatk_args, tmp_dir)
    return recal_bam
def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None,
         sample_name="", rg_name="", pu_name=""):
    """Drive conversion of an alignment plus its fastq input into a sorted BAM.

    Steps, in order: load the YAML configuration, derive the quality
    format from the configured platform, index the reference, convert the
    fastq file(s) to BAM, merge that BAM with the alignment, and sort the
    result. The helpers ``index_ref_file``, ``picard_fastq_to_bam``,
    ``picard_merge_bam`` and ``picard_sort`` are defined outside this
    block; what each one does is inferred from its name.

    Args:
        config_file: Path of a YAML file providing ``program.picard`` and
            ``algorithm.platform``.
        align_sam: Path of the alignment file; its directory is passed to
            the fastq conversion helper as the base directory.
        ref_file: Path of the reference file.
        fastq_one: Path of the first fastq file.
        fastq_pair: Optional path of the paired fastq file. Whether it is
            ``None`` is passed to the merge helper as a flag.
        sample_name: Sample name forwarded to the fastq conversion helper.
        rg_name: Forwarded to the fastq conversion helper (presumably the
            read group name).
        pu_name: Forwarded to the fastq conversion helper (presumably the
            platform unit name).

    Raises:
        ValueError: If the configured platform is anything other than
            "illumina" (compared case-insensitively).
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain data types. A bare yaml.load() can
        # instantiate arbitrary objects and, without an explicit Loader,
        # raises TypeError on PyYAML >= 6.
        config = yaml.safe_load(in_handle)
    picard = PicardRunner(config["program"]["picard"])
    platform = config["algorithm"]["platform"]
    # Only Illumina has a known quality format mapping here.
    if platform.lower() == "illumina":
        qual_format = "Illumina"
    else:
        raise ValueError("Need to specify quality format for %s" % platform)
    index_ref_file(picard, ref_file)
    base_dir = os.path.split(align_sam)[0]
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(
            picard, fastq_one, fastq_pair, base_dir, platform, qual_format,
            sample_name, rg_name, pu_name, tmp_dir)
        out_bam = picard_merge_bam(
            picard, align_sam, out_fastq_bam, ref_file, tmp_dir,
            fastq_pair is not None)
        picard_sort(picard, out_bam, tmp_dir)
def picard_run_maq(picard, maq_cmd, input_bam, ref_file, barcode, lane,
                   out_base, stringency, is_paired=False, limit=None, ext=""):
    """Align reads with Maq in three resumable stages and return a BAM path.

    All work happens in the analysis directory ``<out_base>-maq<ext>``,
    which is created when missing. Each stage is skipped when its outputs
    are already present:

    1. Picard ``RunMaq`` with ``PREPARE`` -- skipped if any
       ``<barcode>.<lane>*bfq`` file exists in the analysis directory.
    2. ``run_maq`` (defined outside this block) -- skipped if any
       ``<barcode>.<lane>*out*.map`` file exists.
    3. Picard ``RunMaq`` with ``BAM_OUTPUT`` -- skipped if the final BAM
       exists.

    Args:
        picard: Runner object providing ``run(command, options)``.
        maq_cmd: Passed through to ``run_maq``.
        input_bam: Path of the input BAM handed to RunMaq as ``INPUT``.
        ref_file: Path of the reference file.
        barcode: Flowcell barcode; also used in the file globs.
        lane: Lane identifier; also used in the file globs.
        out_base: Prefix for the analysis directory and output BAM.
        stringency: Passed through to ``run_maq``.
        is_paired: Whether to set ``PAIRED_RUN`` to "true".
        limit: Optional read limit; when truthy it is converted with
            ``int`` and passed as ``READS_TO_ALIGN``.
        ext: Extra suffix appended to the analysis directory name.

    Returns:
        Path of the output BAM, ``<out_base>-maq<ext>.bam``.
    """
    analysis_dir = "%s-maq%s" % (out_base, ext)
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)
    bam_out_file = "%s.bam" % (analysis_dir)

    with curdir_tmpdir() as tmp_dir:
        paired_flag = "true" if is_paired else "false"
        shared_opts = [
            ("INPUT", input_bam),
            ("ANALYSIS_DIR", analysis_dir),
            ("FLOWCELL_BARCODE", barcode),
            ("LANE", lane),
            ("REFERENCE_SEQUENCE", ref_file),
            ("TMP_DIR", tmp_dir),
            ("PAIRED_RUN", paired_flag),
        ]

        # Stage 1: convert the input to Maq-ready bfq files.
        bfq_pattern = os.path.join(analysis_dir,
                                   "%s.%s*bfq" % (barcode, lane))
        if not glob.glob(bfq_pattern):
            prepare_opts = shared_opts + [("PREPARE", "true")]
            if limit:
                prepare_opts.append(("READS_TO_ALIGN", int(limit)))
            picard.run("RunMaq", prepare_opts)

        # Stage 2: run Maq itself. This goes through run_maq rather than
        # Picard's RunMaq ALIGN mode, which was failing with the same
        # parameters.
        map_pattern = os.path.join(analysis_dir,
                                   "%s.%s*out*.map" % (barcode, lane))
        if not glob.glob(map_pattern):
            run_maq(maq_cmd, stringency, analysis_dir, ref_file, barcode,
                    lane)

        # Stage 3: convert the Maq output into an aligned BAM.
        if not os.path.exists(bam_out_file):
            bam_opts = shared_opts + [
                ("OUTPUT", bam_out_file),
                ("BAM_OUTPUT", "true"),
            ]
            # Called for its side effect only; the return value is unused.
            index_ref_file(picard, ref_file)
            picard.run("RunMaq", bam_opts)
    return bam_out_file