def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    """Get the --bed option for Scalpel, writing the current region to a
    temporary BED file when no pre-computed target file is available.
    """
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = "Region must be a tuple - something odd just happened"
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        if any(dd.get_coverage_interval(x) == "genome" for x in items):
            target_bed = shared.remove_highdepth_regions(target_bed, items)
            target_bed = shared.remove_lcr_regions(target_bed, items)
        return ["--bed", target_bed]
    else:
        return []
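# Illustrative sketch (not part of the original module): the fallback branch
# above serializes a (chrom, start, end) region tuple as a single tab-separated
# BED line. The function name, file names and example region are hypothetical.
import os
import tempfile

def _region_to_bed_sketch(region, tmp_path):
    """Write a region tuple to tmp_path/tmp.bed, mirroring the logic above."""
    if not isinstance(region, (list, tuple)):
        raise ValueError("Region must be a tuple - something odd just happened")
    chrom, start, end = region
    target_bed = os.path.join(tmp_path, "tmp.bed")
    with open(target_bed, "w") as out_handle:
        # same record the print() call above emits
        out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return target_bed

# Example: _region_to_bed_sketch(("chr1", 100000, 250000), tempfile.mkdtemp())
# produces a file containing the line "chr1<TAB>100000<TAB>250000".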
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    """Prepare Scalpel command line options from the sample configuration."""
    opts = []
    # output vcf, report only variants within bed regions
    opts += ["--format", "vcf", "--intarget", "--covthr 3", "--lowcov 1"]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(config, target_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = "Region must be a tuple - something odd just happened"
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        opts += ["--bed", remove_lcr_regions(target_bed, items)]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
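# Illustrative sketch (not in the original source): the --outratio value above
# converts bcbio's percentage-based min_allele_fraction setting (default 10)
# into the fractional allele frequency passed to Scalpel. Values are hypothetical.
min_allele_fraction = 10                       # percent, as stored in the algorithm config
outratio = float(min_allele_fraction) / 100.0  # fraction expected by Scalpel
assert str(outratio) == "0.1"                  # appended as "--outratio 0.1"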
def _pindel_options(items, config, out_file, region, tmp_path):
    """Parse pindel options and add the analysis region to the command.

    :param items: (dict) information from yaml
    :param config: (dict) information from yaml (items[0]['config'])
    :param region: (str or tuple) region to analyze
    :param tmp_path: (str) temporary folder

    :returns: (str) options for pindel
    """
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    opts = ""
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(config, target_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = "Region must be a tuple - something odd just happened"
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        opts = "-j " + remove_lcr_regions(target_bed, items)
    return opts
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    if rm_genome:
        rm_genome = utils.get_in(data, ("reference", "alt", rm_genome, "base"))
    if rm_genome and rm_genome != utils.get_in(data, ("reference", "fasta", "base")):
        eval_genome = utils.get_in(data, ("reference", "fasta", "base"))
    else:
        rm_genome = utils.get_in(data, ("reference", "fasta", "base"))
        eval_genome = None
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "preclean": True, "prep": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True
    if a_intervals and eval_genome:
        eval_call["intervals"] = os.path.abspath(a_intervals)
    exp = {"sample": data["name"][-1],
           "ref": rm_genome,
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals and not eval_genome:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("callable_bam") and not eval_genome:
        exp["align"] = data["callable_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
def _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(base_dir, "%s-%s-%s-wrm.bed" %
                                           (utils.splitext_plus(os.path.basename(final_intervals))[0],
                                            sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        try:
                            shutil.copyfile(final_intervals, a)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(final_intervals, a)
                        try:
                            shutil.copyfile(rm_interval_file, b)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(rm_interval_file, b)
                        cmd = "bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with for %s" % vrn_file
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    if rm_genome:
        rm_genome = utils.get_in(data, ("reference", "alt", rm_genome, "base"))
    if rm_genome and rm_genome != utils.get_in(data, ("reference", "fasta", "base")):
        eval_genome = utils.get_in(data, ("reference", "fasta", "base"))
    else:
        rm_genome = utils.get_in(data, ("reference", "fasta", "base"))
        eval_genome = None
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "preclean": True, "prep": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True
    if a_intervals and eval_genome:
        eval_call["intervals"] = os.path.abspath(a_intervals)
    exp = {"sample": data["name"][-1],
           "ref": rm_genome,
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals and not eval_genome:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("align_bam") and not eval_genome:
        exp["align"] = data["align_bam"]
    elif data.get("work_bam") and not eval_genome:
        exp["align"] = data["work_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
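# Illustrative sketch (not in the original source): the mapping returned above
# is a bcbio.variation grading configuration. Dumping a hypothetical result
# shows its shape; paths, sample names and file names below are placeholders,
# and whether the caller serializes it to YAML is outside this snippet.
import yaml

example_config = {
    "dir": {"base": "/path/to/validate", "out": "work", "prep": "work/prep"},
    "experiments": [{
        "sample": "Test1",
        "ref": "/path/to/genome.fa",
        "approach": "grade",
        "calls": [
            {"file": "reference-calls.vcf", "name": "ref", "type": "grading-ref",
             "preclean": True, "prep": True, "remove-refcalls": True},
            {"file": "sample-calls.vcf", "name": "eval", "remove-refcalls": True},
        ],
    }],
}
print(yaml.safe_dump(example_config, default_flow_style=False))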
def _get_merged_intervals(rm_interval_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(base_dir, "%s-%s-%s-wrm.bed" %
                                           (utils.splitext_plus(os.path.basename(final_intervals))[0],
                                            sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        try:
                            shutil.copyfile(final_intervals, a)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(final_intervals, a)
                        try:
                            shutil.copyfile(rm_interval_file, b)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(rm_interval_file, b)
                        cmd = "bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with"
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals
def _vardict_options_from_config(items, config, out_file, target=None):
    """Prepare VarDict command line options; the target BED file must be the last option."""
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]
    assert _is_bed_file(target)
    if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
           for x in items):
        target = shared.remove_highdepth_regions(target, items)
        target = shared.remove_lcr_regions(target, items)
    target = _enforce_max_region_size(target, items[0])
    opts += [target]  # this must be the last option
    return opts
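# Illustrative sketch (not in the original source): the whole-genome check above
# uses toolz.get_in to walk nested sample dictionaries, falling back to "" when
# any key is missing. The sample dictionaries below are hypothetical.
import toolz as tz

items = [{"config": {"algorithm": {"coverage_interval": "genome"}}},
         {"config": {"algorithm": {}}}]
is_wgs = any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
             for x in items)
assert is_wgs  # at least one sample is flagged as a whole-genome run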
def _get_merged_intervals(rm_interval_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            final_intervals = os.path.join(base_dir, "%s-wrm.bed" %
                                           utils.splitext_plus(os.path.basename(a_intervals))[0])
            if not utils.file_uptodate(final_intervals, a_intervals):
                with file_transaction(data, final_intervals) as tx_out_file:
                    pybedtools.BedTool(a_intervals).intersect(rm_interval_file).saveas(tx_out_file)
    else:
        assert rm_interval_file, "No intervals to subset analysis with"
        final_intervals = rm_interval_file
    return final_intervals
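# Illustrative sketch (not in the original source): the intersection above,
# reduced to plain pybedtools calls without bcbio's file_transaction and
# up-to-date handling. The interval contents and output name are hypothetical.
import pybedtools

analysis_bed = pybedtools.BedTool("chr1\t100\t5000\nchr2\t200\t900\n", from_string=True)
reference_bed = pybedtools.BedTool("chr1\t1000\t2000\n", from_string=True)
analysis_bed.intersect(reference_bed).saveas("analysis-callable-wrm.bed")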
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    """Get the --bed option for Scalpel, writing the current region to a
    temporary BED file when no pre-computed target file is available.
    """
    variant_regions = bedutils.population_variant_regions(items)
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = "Region must be a tuple - something odd just happened"
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        return ["--bed", remove_lcr_regions(target_bed, items)]
    else:
        return []
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    """Get the --bed option for Scalpel from configured variant regions,
    writing the current region to a temporary BED file when needed.
    """
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = "Region must be a tuple - something odd just happened"
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        return ["--bed", remove_lcr_regions(target_bed, items)]
    else:
        return []
def _vardict_options_from_config(items, config, out_file, region=None, do_merge=False):
    """Prepare VarDict command line options, passing the target as a BED file
    or as a one-based, end-inclusive -R region.
    """
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = shared.subset_variant_regions(variant_regions, region, out_file, do_merge=do_merge)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                target = shared.remove_lcr_regions(target, items)
            target = _enforce_max_region_size(target, items[0])
            opts += [target]  # this must be the last option
        else:
            # one-based, end-inclusive coordinates as for Gatk
            opts += ["-R", bamprep.region_to_gatk(target)]
    return opts
def _create_validate_config(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "fix-sample-header": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    exp = {"sample": data["name"][-1],
           "ref": dd.get_ref_file(data),
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("align_bam"):
        exp["align"] = data["align_bam"]
    elif data.get("work_bam"):
        exp["align"] = data["work_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
def _create_validate_config(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "fix-sample-header": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    exp = {"sample": data["name"][-1],
           "ref": dd.get_ref_file(data),
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("align_bam"):
        exp["align"] = data["align_bam"]
    elif data.get("work_bam"):
        exp["align"] = data["work_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
def _vardict_options_from_config(items, config, out_file, target=None):
    """Prepare VarDict and var2vcf command line options as space-joined strings."""
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    # remove low mapping quality reads
    opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
               for x in items):
            target = shared.remove_highdepth_regions(target, items)
            target = shared.remove_lcr_regions(target, items)
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)