Пример #1
0
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = (
                            "Region must be a tuple - something odd just happened"
                        )
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end),
                              file=out_handle)
        if any(dd.get_coverage_interval(x) == "genome" for x in items):
            target_bed = shared.remove_highdepth_regions(target_bed, items)
            target_bed = shared.remove_lcr_regions(target_bed, items)
        return ["--bed", target_bed]
    else:
        return []
Пример #2
0
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    opts = []
    # output vcf, report only variants within bed regions
    opts += ["--format", "vcf", "--intarget", "--covthr 3", "--lowcov 1"]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(config, target_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = (
                        "Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        opts += ["--bed", remove_lcr_regions(target_bed, items)]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(
            utils.get_in(config,
                         ("algorithm", "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
Пример #3
0
def _pindel_options(items, config, out_file, region, tmp_path):
    """parse pindel options. Add region to cmd.
    :param items: (dict) information from yaml
    :param config: (dict) information from yaml (items[0]['config'])
    :param region: (str or tupple) region to analyze
    :param tmp_path: (str) temporal folder
    :returns: (list) options for pindel
    """
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    opts = ""
    if target:
        if isinstance(target, six.string_types) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(config, target_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = ("Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        opts = "-j " + remove_lcr_regions(target_bed, items)
    return opts
Пример #4
0
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    opts = []
    # output vcf, report only variants within bed regions
    opts += ["--format", "vcf", "--intarget", "--covthr 3", "--lowcov 1"]
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(config, target_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = ("Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        opts += ["--bed", remove_lcr_regions(target_bed, items)]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(utils.get_in(config, ("algorithm",
                                             "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
Пример #5
0
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    if rm_genome:
        rm_genome = utils.get_in(data, ("reference", "alt", rm_genome, "base"))
    if rm_genome and rm_genome != utils.get_in(data, ("reference", "fasta", "base")):
        eval_genome = utils.get_in(data, ("reference", "fasta", "base"))
    else:
        rm_genome = utils.get_in(data, ("reference", "fasta", "base"))
        eval_genome = None
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "preclean": True, "prep": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True
    if a_intervals and eval_genome:
        eval_call["intervals"] = os.path.abspath(a_intervals)
    exp = {"sample": data["name"][-1],
           "ref": rm_genome,
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals and not eval_genome:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("callable_bam") and not eval_genome:
        exp["align"] = data["callable_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
Пример #6
0
def _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(
                base_dir, "%s-%s-%s-wrm.bed" % (utils.splitext_plus(
                    os.path.basename(final_intervals))[0], sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        try:
                            shutil.copyfile(final_intervals, a)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(final_intervals, a)
                        try:
                            shutil.copyfile(rm_interval_file, b)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(rm_interval_file, b)
                        cmd = (
                            "bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}"
                        )
                        do.run(cmd.format(**locals()),
                               "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with for %s" % vrn_file
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals
Пример #7
0
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    if rm_genome:
        rm_genome = utils.get_in(data, ("reference", "alt", rm_genome, "base"))
    if rm_genome and rm_genome != utils.get_in(data,
                                               ("reference", "fasta", "base")):
        eval_genome = utils.get_in(data, ("reference", "fasta", "base"))
    else:
        rm_genome = utils.get_in(data, ("reference", "fasta", "base"))
        eval_genome = None
    ref_call = {
        "file": str(rm_file),
        "name": "ref",
        "type": "grading-ref",
        "preclean": True,
        "prep": True,
        "remove-refcalls": True
    }
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True
    if a_intervals and eval_genome:
        eval_call["intervals"] = os.path.abspath(a_intervals)
    exp = {
        "sample": data["name"][-1],
        "ref": rm_genome,
        "approach": "grade",
        "calls": [ref_call, eval_call]
    }
    if a_intervals and not eval_genome:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("align_bam") and not eval_genome:
        exp["align"] = data["align_bam"]
    elif data.get("work_bam") and not eval_genome:
        exp["align"] = data["work_bam"]
    return {
        "dir": {
            "base": base_dir,
            "out": "work",
            "prep": "work/prep"
        },
        "experiments": [exp]
    }
Пример #8
0
def _get_merged_intervals(rm_interval_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(base_dir, "%s-%s-%s-wrm.bed" %
                                           (utils.splitext_plus(os.path.basename(final_intervals))[0],
                                            sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        try:
                            shutil.copyfile(final_intervals, a)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(final_intervals, a)
                        try:
                            shutil.copyfile(rm_interval_file, b)
                        except IOError:
                            time.sleep(60)
                            shutil.copyfile(rm_interval_file, b)
                        cmd = ("bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}")
                        do.run(cmd.format(**locals()), "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with"
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals
Пример #9
0
def _vardict_options_from_config(items, config, out_file, target=None):
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]

    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]
    assert _is_bed_file(target)
    if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome" for x in items):
        target = shared.remove_highdepth_regions(target, items)
        target = shared.remove_lcr_regions(target, items)
    target = _enforce_max_region_size(target, items[0])
    opts += [target]  # this must be the last option
    return opts
Пример #10
0
def _vardict_options_from_config(items, config, out_file, target=None):
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]

    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]
    assert _is_bed_file(target)
    if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
            for x in items):
        target = shared.remove_highdepth_regions(target, items)
        target = shared.remove_lcr_regions(target, items)
    target = _enforce_max_region_size(target, items[0])
    opts += [target]  # this must be the last option
    return opts
Пример #11
0
def _get_merged_intervals(rm_interval_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            final_intervals = os.path.join(base_dir, "%s-wrm.bed" %
                                           utils.splitext_plus(os.path.basename(a_intervals))[0])
            if not utils.file_uptodate(final_intervals, a_intervals):
                with file_transaction(data, final_intervals) as tx_out_file:
                    pybedtools.BedTool(a_intervals).intersect(rm_interval_file).saveas(tx_out_file)
    else:
        assert rm_interval_file, "No intervals to subset analysis with"
        final_intervals = rm_interval_file
    return final_intervals
Пример #12
0
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = bedutils.population_variant_regions(items)
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = ("Region must be a tuple - something odd just happened")
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        return ["--bed", remove_lcr_regions(target_bed, items)]
    else:
        return []
Пример #13
0
def _scalpel_bed_file_opts(items, config, out_file, region, tmp_path):
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            target_bed = target
        else:
            target_bed = os.path.join(tmp_path, "tmp.bed")
            if not utils.file_exists(target_bed):
                with file_transaction(config, target_bed) as tx_tmp_bed:
                    if not isinstance(region, (list, tuple)):
                        message = ("Region must be a tuple - something odd just happened")
                        raise ValueError(message)
                    chrom, start, end = region
                    with open(tx_tmp_bed, "w") as out_handle:
                        print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
        return ["--bed", remove_lcr_regions(target_bed, items)]
    else:
        return []
Пример #14
0
def _vardict_options_from_config(items, config, out_file, region=None, do_merge=False):
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]

    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += resources["options"]

    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = shared.subset_variant_regions(variant_regions, region, out_file, do_merge=do_merge)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                target = shared.remove_lcr_regions(target, items)
            target = _enforce_max_region_size(target, items[0])
            opts += [target]  # this must be the last option
        else:
            # one-based, end-inclusive coordinates as for Gatk
            opts += ["-R", bamprep.region_to_gatk(target)]
    return opts
Пример #15
0
def _create_validate_config(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "fix-sample-header": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    exp = {"sample": data["name"][-1],
           "ref": dd.get_ref_file(data),
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("align_bam"):
        exp["align"] = data["align_bam"]
    elif data.get("work_bam"):
        exp["align"] = data["work_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
Пример #16
0
def _create_validate_config(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref",
                "fix-sample-header": True, "remove-refcalls": True}
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        a_intervals = shared.remove_lcr_regions(a_intervals, [data])
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    exp = {"sample": data["name"][-1],
           "ref": dd.get_ref_file(data),
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if a_intervals:
        exp["intervals"] = os.path.abspath(a_intervals)
    if data.get("align_bam"):
        exp["align"] = data["align_bam"]
    elif data.get("work_bam"):
        exp["align"] = data["work_bam"]
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
Пример #17
0
def _vardict_options_from_config(items, config, out_file, target=None):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    # remove low mapping quality reads
    opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items):
            target = shared.remove_highdepth_regions(target, items)
            target = shared.remove_lcr_regions(target, items)
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)