def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Prepare the runner and common command line arguments for GATK calling.

    Indexes the reference (via the ``picard_index_ref`` runner function) and
    every input BAM, then assembles the argument list: reference, optional
    downsampling, optional confidence thresholds, annotations, input BAMs,
    optional dbSNP file and optional calling intervals.

    The configuration is taken from the first entry in ``items``.

    Returns a ``(broad_runner, params)`` tuple.
    """
    first_item = items[0]
    config = first_item["config"]

    broad_runner = broad.runner_from_config(config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for bam_file in align_bams:
        bam.index(bam_file, config)

    params = ["-R", ref_file]

    if dd.is_set_coverage_depth_max(first_item):
        # GATK can only downsample to a minimum of 200
        downsample_depth = max(200, dd.get_coverage_depth_max(first_item))
        params.extend(["--downsample_to_coverage", str(downsample_depth),
                       "--downsampling_type", "BY_SAMPLE"])

    min_depth = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if min_depth and min_depth < 4:
        # Same threshold is used for both calling and emitting.
        threshold = "4.0"
        params.extend(["--standard_min_confidence_threshold_for_calling", threshold,
                       "--standard_min_confidence_threshold_for_emitting", threshold])

    for annotation_name in annotation.get_gatk_annotations(config):
        params.extend(["--annotation", annotation_name])
    for bam_file in align_bams:
        params.extend(["-I", bam_file])
    if dbsnp:
        params.extend(["--dbsnp", dbsnp])

    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    call_region = subset_variant_regions(variant_regions, region, out_file, items)
    if call_region:
        params.extend(["-L", bamprep.region_to_gatk(call_region),
                       "--interval_set_rule", "INTERSECTION"])
    return broad_runner, params
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for an input BAM in the given region.

    Args:
        data: Sample dictionary. The ``work_bam`` and ``config`` keys are read
            directly, and the reference is looked up under
            ``reference -> fasta -> base``.
        region: Optional region, passed through to the coverage helpers.
        out_file: Output BED path. Defaults to the ``work_bam`` path with its
            extension replaced by ``-callable.bed``.

    Returns:
        A single-item list holding a dictionary with the ``callable_bed``,
        ``config`` and ``work_bam`` keys.

    The output is only computed when ``out_file`` does not already exist.
    """
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]

    max_depth = dd.get_coverage_depth_max(data)
    # sys.maxint was removed in Python 3; fall back to sys.maxsize there while
    # keeping the exact Python 2 value where sys.maxint is still available.
    unbounded_depth = getattr(sys, "maxint", sys.maxsize) - 1
    depth = {"max": max_depth * 7 if max_depth > 0 else unbounded_depth,
             "min": dd.get_coverage_depth_min(data)}

    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_file = tz.get_in(["reference", "fasta", "base"], data)
            region_file, calc_callable = _regions_for_coverage(data, region, ref_file, tx_out_file)
            if calc_callable:
                coverage_file = _get_coverage_file(data["work_bam"], ref_file, region, region_file,
                                                   depth, tx_out_file, data)
                _group_by_ctype(coverage_file, depth, region_file, tx_out_file, data)
            # special case, do not calculate if we are in a chromosome not covered by BED file
            else:
                os.rename(region_file, tx_out_file)
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Set up the Broad runner and the arguments common to GATK variant callers.

    Uses the configuration of the first item in ``items``. The reference is
    indexed through the runner's ``picard_index_ref`` function and each BAM in
    ``align_bams`` is indexed before the argument list is built.

    Returns a two-item tuple: the runner and the list of GATK parameters.
    """
    sample = items[0]
    config = sample["config"]
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index_ref", ref_file)
    for in_bam in align_bams:
        bam.index(in_bam, config)

    gatk_args = ["-R", ref_file]

    # Optional downsampling, only when a maximum coverage depth is set.
    if dd.is_set_coverage_depth_max(sample):
        depth_cap = dd.get_coverage_depth_max(sample)
        # GATK can only downsample to a minimum of 200
        if depth_cap < 200:
            depth_cap = 200
        gatk_args += ["--downsample_to_coverage", str(depth_cap)]
        gatk_args += ["--downsampling_type", "BY_SAMPLE"]

    # Fixed confidence thresholds when the configured minimum depth is below 4.
    depth_floor = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if depth_floor and depth_floor < 4:
        gatk_args += ["--standard_min_confidence_threshold_for_calling", "4.0"]
        gatk_args += ["--standard_min_confidence_threshold_for_emitting", "4.0"]

    gatk_args += [arg
                  for name in annotation.get_gatk_annotations(config)
                  for arg in ("--annotation", name)]
    gatk_args += [arg for in_bam in align_bams for arg in ("-I", in_bam)]
    if dbsnp:
        gatk_args += ["--dbsnp", dbsnp]

    # Restrict calling to the subset of variant regions, when there is one.
    configured_regions = tz.get_in(["algorithm", "variant_regions"], config)
    target = subset_variant_regions(configured_regions, region, out_file, items)
    if target:
        gatk_args += ["-L", bamprep.region_to_gatk(target)]
        gatk_args += ["--interval_set_rule", "INTERSECTION"]
    return runner, gatk_args