def ref_genome_info(info, config, dirs):
    """Look up the reference genome for a sample from its configuration.

    Args:
        info: Mapping of sample details; the optional ``"genome_build"`` key
            names the genome (``None`` is used when it is absent).
        config: Configuration mapping; ``config["algorithm"]["aligner"]`` is
            forwarded to ``get_genome_ref``.
        dirs: Mapping of directories; ``dirs["galaxy"]`` is forwarded to
            ``get_genome_ref``.

    Returns:
        Tuple ``(genome_build, sam_ref)`` where ``sam_ref`` is the second of
        the two items returned by ``get_genome_ref``.
    """
    build = info.get("genome_build", None)
    aligner = config["algorithm"]["aligner"]
    # Only the second element of the returned pair is needed here.
    _, reference = get_genome_ref(build, aligner, dirs["galaxy"])
    return build, reference
def convert_bam_to_fastq(in_file, work_dir, item, dirs, config):
    """Convert BAM input file into FASTQ files.

    Args:
        in_file: Path to the input BAM file.
        work_dir: Directory under which a ``fastq_convert`` output directory
            is created.
        item: Mapping providing ``"genome_build"``; only read when quality
            binning is requested.
        dirs: Mapping providing ``"galaxy"``; only read when quality binning
            is requested.
        config: Configuration mapping; ``config["algorithm"]["quality_bin"]``
            selects optional pre-alignment quality binning.

    Returns:
        Two-item list ``[out1, out2]`` of FASTQ paths. ``out2`` is ``None``
        for unpaired input or when the second FASTQ file is empty.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    wants_prealign_bin = (
        qual_bin_method == "prealignment"
        or (isinstance(qual_bin_method, list)
            and "prealignment" in qual_bin_method))
    if wants_prealign_bin:
        _, sam_ref = alignment.get_genome_ref(item["genome_build"], None,
                                              dirs["galaxy"])
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        # The binned file replaces the input for every step that follows.
        in_file = cram.illumina_qual_bin(in_file, sam_ref, out_bindir, config)

    base_name = os.path.splitext(os.path.basename(in_file))[0]
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(base_name, x))
                 for x in ["1", "2"]]
    if _is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    # out2 is None for unpaired input; os.path.getsize(None) would raise
    # TypeError, so only inspect the size when a second file was requested.
    # NOTE(review): source was flattened; this check is assumed to sit at
    # function level rather than inside the conversion branch -- confirm.
    if out2 is not None and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Build the bcbio.variation configuration mapping used for validation.

    Args:
        vrn_file: Variant file to evaluate (the ``eval`` call).
        rm_file: Reference material file (the ``ref`` call).
        rm_interval_file: Optional intervals attached to the ``ref`` call.
        rm_genome: Optional genome name for the reference material; when
            truthy it is resolved through ``alignment.get_genome_ref``.
        base_dir: Value stored as the ``base`` directory of the result.
        data: Sample mapping; reads ``"sam_ref"``, ``"name"``, ``"dirs"`` and
            the optional ``"callable_bam"``.

    Returns:
        Dict with a ``"dir"`` section and a single-item ``"experiments"`` list.
    """
    if rm_genome:
        rm_genome = alignment.get_genome_ref(rm_genome, None,
                                             data["dirs"]["galaxy"])[-1]
        # Only flag a separate evaluation genome when the two differ.
        eval_genome = data["sam_ref"] if rm_genome != data["sam_ref"] else None
    else:
        eval_genome = None
        rm_genome = data["sam_ref"]

    ref_call = {"file": rm_file, "name": "ref", "type": "grading-ref",
                "preclean": True, "prep": True, "remove-refcalls": True}
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file

    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    # NOTE(review): source was flattened; "preclean"/"prep" are assumed to be
    # set only together with "ref" -- confirm against the original layout.
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True

    exp = {"sample": data["name"][-1],
           "ref": rm_genome,
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    # NOTE(review): the intervals lookup is assumed to be nested under the
    # callable_bam check -- confirm against the original layout.
    if data.get("callable_bam"):
        exp["align"] = data["callable_bam"]
        intervals = ensemble.get_analysis_intervals(data)
        if intervals:
            exp["intervals"] = os.path.abspath(intervals)

    dir_section = {"base": base_dir, "out": "work", "prep": "work/prep"}
    return {"dir": dir_section, "experiments": [exp]}
def convert_bam_to_fastq(in_file, work_dir, item, dirs, config):
    """Convert BAM input file into FASTQ files.

    Args:
        in_file: Path to the input BAM file.
        work_dir: Directory under which a ``fastq_convert`` output directory
            is created.
        item: Mapping providing ``"genome_build"``; only read when quality
            binning is requested.
        dirs: Mapping providing ``"galaxy"``; only read when quality binning
            is requested.
        config: Configuration mapping; ``config["algorithm"]["quality_bin"]``
            selects optional pre-alignment quality binning.

    Returns:
        Two-item list ``[out1, out2]`` of FASTQ paths. ``out2`` is ``None``
        for unpaired input or when the second FASTQ file is empty.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
            (isinstance(qual_bin_method, list) and
             "prealignment" in qual_bin_method)):
        _, sam_ref = alignment.get_genome_ref(item["genome_build"], None,
                                              dirs["galaxy"])
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        # From here on the quality-binned file is the conversion input.
        in_file = cram.illumina_qual_bin(in_file, sam_ref, out_bindir, config)

    stem = os.path.splitext(os.path.basename(in_file))[0]
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(stem, x))
                 for x in ["1", "2"]]
    if _is_paired(in_file):
        out1, out2 = out_files
    else:
        out1, out2 = out_files[0], None

    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)

    # Unpaired input leaves out2 as None, and os.path.getsize(None) raises
    # TypeError; skip the empty-file check in that case.
    # NOTE(review): source was flattened; this check is assumed to sit at
    # function level rather than inside the conversion branch -- confirm.
    if out2 is not None and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def process_sample(sample_name, fastq_files, info, bam_files, dirs, config,
                   config_file):
    """Run the per-sample finishing steps on a (possibly multiplexed) sample.

    Merges the BAM files, then runs the steps enabled under
    ``config["algorithm"]`` (``recalibrate``, ``snpcall``,
    ``transcript_assemble``), writes summaries when a reference is available
    and produces a wig file.

    Returns:
        List ``[sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
        effects_file]``; the last three are empty strings for steps that did
        not run.
    """
    config = _update_config_w_custom(config, info)
    algorithm = config["algorithm"]
    genome_build = info.get("genome_build", None)
    _, sam_ref = get_genome_ref(genome_build, algorithm["aligner"],
                                dirs["galaxy"])
    fastq1, fastq2 = combine_fastq_files(fastq_files, dirs["work"])
    log.info("Combining and preparing wig file %s" % str(sample_name))
    sort_bam = merge_bam_files(bam_files, dirs["work"], config)

    gatk_bam = ""
    vrn_file = ""
    effects_file = ""
    # NOTE(review): source was flattened; genotyping is assumed to be nested
    # under recalibration (it consumes gatk_bam) -- confirm.
    if algorithm["recalibrate"]:
        log.info("Recalibrating %s with GATK" % str(sample_name))
        gatk_bam = recalibrate_quality(sort_bam, fastq1, fastq2, sam_ref,
                                       dirs, config)
        if algorithm["snpcall"]:
            log.info("SNP genotyping %s with GATK" % str(sample_name))
            vrn_file = run_genotyper(gatk_bam, sam_ref, config)
            log.info("Calculating variation effects for %s" % str(sample_name))
            effects_file = variation_effects(vrn_file, genome_build, config)

    if algorithm.get("transcript_assemble", False):
        # The result is not used below; the call is made for its effects.
        assemble_transcripts(sort_bam, sam_ref, config)

    if sam_ref is not None:
        log.info("Generating summary files: %s" % str(sample_name))
        is_paired = fastq2 is not None
        generate_align_summary(sort_bam, is_paired, sam_ref, sample_name,
                               config, dirs)
    bam_to_wig(sort_bam, config, config_file)
    return [sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
            effects_file]
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc, dirs, config,
                    config_file):
    """Align reads and post-process the full BAM, or wrap pre-called variants.

    When ``fastq1`` is None and ``info`` carries a ``"vrn_file"``, no
    alignment is run: variant calling is switched off in ``config`` and a
    data mapping is built around the supplied variant file. Otherwise the
    reads are aligned, samples merged, and callable regions computed.

    Returns:
        Single-item list holding the resulting data mapping.
    """
    has_variants_only = fastq1 is None and "vrn_file" in info
    if has_variants_only:
        _, ref_file = get_genome_ref(info["genome_build"], None,
                                     dirs["galaxy"])
        # Mutates the caller's config: no variant caller for pre-called input.
        config["algorithm"]["variantcaller"] = ""
        data = {"info": info,
                "sam_ref": ref_file,
                "work_bam": None,
                "genome_build": info["genome_build"],
                "name": ("", lane_desc),
                "vrn_file": info["vrn_file"],
                "dirs": copy.deepcopy(dirs),
                "config": config}
        return [data]

    # NOTE(review): source was flattened; everything below is assumed to
    # belong to the alignment branch only (work_bam is None above) -- confirm.
    align_out = process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                                  dirs, config)[0]
    data = _organize_merge_samples(align_out, dirs, config_file)
    callable_region_bed, analysis_regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], config)
    data["regions"] = analysis_regions
    # Callable regions become the variant regions unless already configured.
    if os.path.exists(callable_region_bed):
        if not data["config"]["algorithm"].get("variant_regions"):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
    data["callable_bam"] = data["work_bam"]
    data = _recal_no_markduplicates(data)
    return [data]
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc, dirs, config,
                    config_file):
    """Align reads and post-process the full BAM, or wrap pre-called variants.

    When ``fastq1`` is None and ``info`` carries a ``"vrn_file"``, no
    alignment is run: variant calling is switched off in ``config`` and a
    data mapping is built around the supplied variant file. Otherwise the
    reads are aligned, samples merged, and callable regions computed.

    Returns:
        Single-item list holding the resulting data mapping.
    """
    if fastq1 is None and "vrn_file" in info:
        _, ref_file = get_genome_ref(info["genome_build"], None,
                                     dirs["galaxy"])
        # Mutates the caller's config: no variant caller for pre-called input.
        config["algorithm"]["variantcaller"] = ""
        data = {
            "info": info,
            "sam_ref": ref_file,
            "work_bam": None,
            "genome_build": info["genome_build"],
            "name": ("", lane_desc),
            "vrn_file": info["vrn_file"],
            "dirs": copy.deepcopy(dirs),
            "config": config,
        }
    else:
        # NOTE(review): source was flattened; the region/recalibration steps
        # are assumed to belong to this branch only -- confirm.
        aligned = process_alignment(fastq1, fastq2, info, lane_name,
                                    lane_desc, dirs, config)
        data = _organize_merge_samples(aligned[0], dirs, config_file)
        region_bed, analysis_regions = callable.block_regions(
            data["work_bam"], data["sam_ref"], config)
        data["regions"] = analysis_regions
        # Callable regions become the variant regions unless already set.
        if (os.path.exists(region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = region_bed
        data["callable_bam"] = data["work_bam"]
        data = _recal_no_markduplicates(data)
    return [data]
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs,
                      config):
    """Align FASTQ input, or prepare an existing BAM, into a working BAM.

    With an aligner configured and ``fastq1`` present, the reads are aligned.
    Otherwise, when ``fastq1`` is an existing ``.bam`` file, it is sorted
    (``bam_sort``), cleaned with Picard (``bam_clean``) or linked into the
    work directory.

    Returns:
        Single-item list with a dict of ``fastq``, ``work_bam``, ``info``,
        ``sam_ref`` and ``config``.

    Raises:
        ValueError: If no BAM was produced and ``fastq1`` does not exist.
    """
    algorithm = config["algorithm"]
    aligner = algorithm.get("aligner", None)
    out_bam = ""
    names = rg_names(lane_name, lane_desc, config)
    _, ref_file = get_genome_ref(info["genome_build"], None, dirs["galaxy"])

    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        # The aligner-specific reference replaces the generic one from above.
        out_bam, ref_file = align_to_sort_bam(fastq1, fastq2, names,
                                              info["genome_build"], aligner,
                                              dirs, config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = algorithm.get("bam_sort")
        bamclean = algorithm.get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            base_name = os.path.splitext(os.path.basename(fastq1))[0]
            out_file = os.path.join(dirs["work"],
                                    "{}-sort.bam".format(base_name))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, names, ref_file, dirs,
                                           config)
        else:
            link_dir = os.path.join(dirs["work"], "prealign", names["sample"])
            out_bam = link_bam_file(fastq1, link_dir)

    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    result = {"fastq": [fastq1, fastq2],
              "work_bam": out_bam,
              "info": info,
              "sam_ref": ref_file,
              "config": config}
    return [result]
def process_alignment(data):
    """Align FASTQ input, or prepare an existing BAM, into a working BAM.

    Reads ``files``, ``config``, ``rgnames``, ``genome_build``, ``dirs`` and
    (for BAM input) ``sam_ref`` from ``data``; stores ``sam_ref`` and
    ``work_bam`` back into it.

    Returns:
        ``[[data]]`` -- the updated mapping wrapped in two lists.

    Raises:
        ValueError: If no BAM was produced and the first input file is
            missing.
    """
    fastq1, fastq2 = data["files"]
    config = data["config"]
    algorithm = config["algorithm"]
    aligner = algorithm.get("aligner", None)
    out_bam = ""

    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner"
                    % (data["rgnames"]["lane"], aligner))
        # The second returned value is not used; sam_ref is set further down.
        out_bam, _ = align_to_sort_bam(fastq1, fastq2, data["rgnames"],
                                       data["genome_build"], aligner,
                                       data["dirs"], data["config"])
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = algorithm.get("bam_sort")
        bamclean = algorithm.get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            base_name = os.path.splitext(os.path.basename(fastq1))[0]
            out_file = os.path.join(data["dirs"]["work"],
                                    "{}-sort.bam".format(base_name))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                           data["sam_ref"], data["dirs"],
                                           config)
        else:
            link_dir = os.path.join(data["dirs"]["work"], "prealign",
                                    data["rgnames"]["sample"])
            out_bam = link_bam_file(fastq1, link_dir)
        # NOTE(review): source was flattened; this check is assumed to run
        # for every BAM input, not only the linked case -- confirm.
        _check_prealigned_bam(fastq1, data["sam_ref"], config)

    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    genome_refs = get_genome_ref(data["genome_build"], None,
                                 data["dirs"]["galaxy"])
    data["sam_ref"] = genome_refs[-1]
    data["work_bam"] = out_bam
    return [[data]]
def split_read_files(fastq1, fastq2, item, split_size, out_dir, dirs, config):
    """Split input reads for parallel processing, choosing by input type.

    A lone ``.bam`` input (``fastq2`` is None) is split as a BAM file,
    optionally after pre-alignment quality binning; anything else is split
    as FASTQ.

    Returns:
        Whatever ``split_bam_file`` or ``split_fastq_files`` returns.
    """
    # Guard clause: anything that is not a single BAM goes the FASTQ route.
    if not fastq1.endswith(".bam") or fastq2 is not None:
        return split_fastq_files(fastq1, fastq2, split_size, out_dir, config)

    qual_bin_method = config["algorithm"].get("quality_bin")
    bin_before_align = (
        qual_bin_method == "prealignment"
        or (isinstance(qual_bin_method, list)
            and "prealignment" in qual_bin_method))
    if bin_before_align:
        _, sam_ref = alignment.get_genome_ref(item["genome_build"], None,
                                              dirs["galaxy"])
        out_bindir = utils.safe_makedir(os.path.join(out_dir, "qualbin"))
        # The binned BAM replaces the original as the file to split.
        fastq1 = cram.illumina_qual_bin(fastq1, sam_ref, out_bindir, config)
    return split_bam_file(fastq1, split_size, out_dir, config)
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Build the bcbio.variation configuration mapping used for validation.

    Args:
        vrn_file: Variant file to evaluate (the ``eval`` call).
        rm_file: Reference material file (the ``ref`` call).
        rm_interval_file: Optional intervals attached to the ``ref`` call.
        rm_genome: Optional genome name for the reference material; when
            truthy it is resolved through ``alignment.get_genome_ref``.
        base_dir: Value stored as the ``base`` directory of the result.
        data: Sample mapping; reads ``"sam_ref"``, ``"name"``, ``"dirs"`` and
            the optional ``"callable_bam"``.

    Returns:
        Dict with a ``"dir"`` section and a single-item ``"experiments"`` list.
    """
    if rm_genome:
        galaxy_dir = data["dirs"]["galaxy"]
        rm_genome = alignment.get_genome_ref(rm_genome, None, galaxy_dir)[-1]
        # A distinct evaluation genome is recorded only when the two differ.
        eval_genome = data["sam_ref"] if rm_genome != data["sam_ref"] else None
    else:
        eval_genome = None
        rm_genome = data["sam_ref"]

    ref_call = {
        "file": rm_file,
        "name": "ref",
        "type": "grading-ref",
        "preclean": True,
        "prep": True,
        "remove-refcalls": True,
    }
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file

    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    # NOTE(review): source was flattened; "preclean"/"prep" are assumed to be
    # set only together with "ref" -- confirm against the original layout.
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True

    exp = {
        "sample": data["name"][-1],
        "ref": rm_genome,
        "approach": "grade",
        "calls": [ref_call, eval_call],
    }
    # NOTE(review): the intervals lookup is assumed to be nested under the
    # callable_bam check -- confirm against the original layout.
    if data.get("callable_bam"):
        exp["align"] = data["callable_bam"]
        intervals = ensemble.get_analysis_intervals(data)
        if intervals:
            exp["intervals"] = os.path.abspath(intervals)

    return {
        "dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
        "experiments": [exp],
    }
def align_prep_full(data, config_file):
    """Align reads and post-process the full BAM, or pass through variants.

    Stores the genome reference in ``data["sam_ref"]``. When the first input
    file is None and ``data`` carries a ``"vrn_file"``, variant calling is
    switched off and ``work_bam`` is set to None; otherwise the reads are
    aligned and callable regions are computed.

    Args:
        data: Sample mapping; reads ``genome_build``, ``dirs``, ``files`` and
            ``config``.
        config_file: Not used in this function body.

    Returns:
        Single-item list holding the updated data mapping.
    """
    data["sam_ref"] = get_genome_ref(data["genome_build"], None,
                                     data["dirs"]["galaxy"])[1]

    if data["files"][0] is None and "vrn_file" in data:
        # Pre-called variants: nothing to align and no caller to run.
        data["config"]["algorithm"]["variantcaller"] = ""
        data["work_bam"] = None
        return [data]

    # NOTE(review): source was flattened; everything below is assumed to
    # belong to the alignment branch only (work_bam is None above) -- confirm.
    data = process_alignment(data)[0][0]
    region_bed, nblock_bed = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock_bed}
    # Callable regions become the variant regions unless already configured.
    if (os.path.exists(region_bed) and
            not data["config"]["algorithm"].get("variant_regions")):
        data["config"]["algorithm"]["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    data = _recal_no_markduplicates(data)
    return [data]
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs,
                      config):
    """Align FASTQ input, or prepare an existing BAM, into a working BAM.

    With an aligner configured and ``fastq1`` present, the reads are aligned.
    Otherwise, when ``fastq1`` is an existing ``.bam`` file, it is sorted
    (``bam_sort``), cleaned with Picard (``bam_clean``) or linked into the
    work directory.

    Returns:
        Single-item list with a dict of ``fastq``, ``work_bam``, ``info``,
        ``sam_ref`` and ``config``.

    Raises:
        ValueError: If no BAM was produced and ``fastq1`` does not exist.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    names = rg_names(lane_name, lane_desc, config)
    genome_build = info["genome_build"]
    _, ref_file = get_genome_ref(genome_build, None, dirs["galaxy"])

    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        # The aligner-specific reference replaces the generic one from above.
        out_bam, ref_file = align_to_sort_bam(
            fastq1, fastq2, names, info["genome_build"], aligner, dirs,
            config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            stem = os.path.splitext(os.path.basename(fastq1))[0]
            out_file = os.path.join(dirs["work"], "{}-sort.bam".format(stem))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, names, ref_file, dirs,
                                           config)
        else:
            prealign_dir = os.path.join(dirs["work"], "prealign",
                                        names["sample"])
            out_bam = link_bam_file(fastq1, prealign_dir)

    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    return [{
        "fastq": [fastq1, fastq2],
        "work_bam": out_bam,
        "info": info,
        "sam_ref": ref_file,
        "config": config,
    }]
def main(input_path, genome, filter_file, read1, read2, filtered_reads,
         aligner, slurm_parameters):
    """Submit one read-filtering job per FASTQ input (single or paired).

    When ``filter_file`` is None it is taken from the first item returned by
    ``get_genome_ref(genome, aligner, REFERENCE_DIR)``. When ``read1`` is
    None the inputs are discovered from ``input_path``:

    * a directory is searched for ``*barcode/*_1_fastq.txt`` files;
    * a ``*_1_fastq.txt`` or ``*_2_fastq.txt`` file is paired with its mate
      by swapping the suffix;
    * any other file is treated as a single unpaired read file.

    A second-read path that does not exist on disk is replaced by None.
    Otherwise ``read1``/``read2`` are used as given. The job id returned by
    ``filter_files_job`` is printed for every submitted pair.

    Raises:
        AssertionError: If ``input_path`` is a ``*_2_fastq.txt`` file whose
            first-read mate does not exist.
    """
    if filter_file is None:
        filter_file, _ = get_genome_ref(genome, aligner,
                                        os.path.normpath(REFERENCE_DIR))

    infiles = []
    if read1 is None:
        if os.path.isdir(input_path):
            pat = os.path.join(input_path, "*barcode", "*_1_fastq.txt")
            for read1 in glob.glob(pat):
                read2 = read1.replace("_1_fastq.txt", "_2_fastq.txt")
                if not os.path.exists(read2):
                    read2 = None
                infiles.append([read1, read2])
        elif os.path.isfile(input_path):
            if input_path.endswith("_1_fastq.txt"):
                read1 = input_path
                read2 = read1.replace("_1_fastq.txt", "_2_fastq.txt")
                if not os.path.exists(read2):
                    read2 = None
            elif input_path.endswith("_2_fastq.txt"):
                read2 = input_path
                # read1 is still None on this path, so the mate name has to
                # be derived from read2 (calling read1.replace would raise
                # AttributeError).
                read1 = read2.replace("_2_fastq.txt", "_1_fastq.txt")
                assert os.path.exists(read1), "ERROR: Could not find the first read file (expected %s)" % read1
            else:
                read1 = input_path
                read2 = None
            infiles.append([read1, read2])
    else:
        infiles.append([read1, read2])

    for read1, read2 in infiles:
        jobid = filter_files_job(read1, read2, filtered_reads, filter_file,
                                 aligner, slurm_parameters)
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("Your job was submitted with jobid %s" % jobid)