Пример #1
0
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from snpEff-annotated VCF inputs.

    The first entry of ``samples`` supplies the configuration and the work
    directory. ``call_info`` unpacks to ``(name, caller, is_batch)``.

    Returns ``[[(name, caller), info]]`` where ``info`` maps "db" to the
    database path (``None`` when that file does not exist) and "vcf" to the
    multisample VCF (``None`` unless ``is_batch`` is truthy).
    """
    def _any_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # First pass uses check_gemini=False; the default do_db_build check is
    # only evaluated when the database file is still missing.
    quick_ok = do_db_build(samples, check_gemini=False) and _any_variants()
    needs_build = (not utils.file_exists(gemini_db) and quick_ok
                   and do_db_build(samples) and _any_variants())
    if needs_build:
        with file_transaction(gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            load_opts = ""
            # Versions after 0.6.2.1 (or an unknown version) get --passonly
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                load_opts += " --passonly"
            # Versions after 0.6.4: skip gene tables for small inputs and
            # flag runs located under the automated test output directory
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    load_opts += " --skip-gene-tables"
                if "/test_automated_output/" in gemini_vcf:
                    load_opts += " --test-mode"
            num_cores = config["algorithm"].get("num_cores", 1)
            cmd = ("{gemini} load {load_opts} -v {gemini_vcf} -t snpEff "
                   "--cores {num_cores} {tx_gemini_db}").format(
                       gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                       num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    db_file = gemini_db if utils.file_exists(gemini_db) else None
    vcf_file = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": db_file, "vcf": vcf_file}]]
Пример #2
0
def compare_to_rm(data):
    """Validate final variant calls against a reference material call set.

    List or tuple inputs are first passed through ``_normalize_cwl_inputs``.
    When ``_get_validate`` yields nothing to validate, the input is returned
    unchanged. Otherwise the evaluation method is chosen from the
    ``validate_method`` configuration value (default "rtg") and the result is
    stored under ``data["validate"]``.

    Returns ``[[data]]``.

    Raises NotImplementedError when the sample to validate carries a list or
    tuple of variant files.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    if not toval_data:
        return [[data]]

    def _resolve_synonyms(path):
        return naming.handle_synonyms(path, dd.get_ref_file(data),
                                      data.get("genome_build"), base_dir, data)

    caller = _get_caller(toval_data)
    sample = dd.get_sample_name(toval_data)
    validate_dir = os.path.join(toval_data["dirs"]["work"], "validate", sample, caller)
    base_dir = utils.safe_makedir(validate_dir)

    vrn_input = toval_data["vrn_file"]
    if isinstance(vrn_input, (list, tuple)):
        raise NotImplementedError(
            "Multiple input files for validation: %s" % vrn_input)
    vrn_file = os.path.abspath(vrn_input)

    algorithm = toval_data["config"]["algorithm"]
    rm_file = normalize_input_path(algorithm["validate"], toval_data)
    regions_input = normalize_input_path(algorithm.get("validate_regions"), toval_data)
    rm_interval_file = _gunzip(regions_input, toval_data)
    rm_interval_file = bedutils.clean_file(
        rm_interval_file, toval_data,
        bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
    rm_file = _resolve_synonyms(rm_file)
    if rm_interval_file:
        rm_interval_file = _resolve_synonyms(rm_interval_file)
    else:
        rm_interval_file = None
    vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")

    # RTG can fail on totally empty files, so inputs without variants are
    # skipped entirely since there is nothing to compare.
    if vcfutils.vcf_has_variants(vrn_file):
        if not vcfutils.vcf_has_variants(rm_file):
            # empty validation file, every call is a false positive
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file,
                                       base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file,
                                               base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file,
                                                    base_dir, sample, caller, toval_data)
    return [[data]]
Пример #3
0
def prep_gemini_db(fnames, call_info, samples, extras):
    """Build a gemini database from annotated multisample VCF input.

    ``call_info`` unpacks to three values, of which the name and caller are
    used. ``extras`` are appended to ``samples`` when creating the PED file.

    Returns ``[[(name, caller), info]]`` where ``info`` holds "db" (database
    path, or ``None`` when the file does not exist), "vcf" (the vcfanno
    output when available, otherwise the multisample VCF) and "decomposed"
    (the ``use_gemini`` decision).
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(
        vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, _ = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        # Load only passing variants unless a sample enables gemini_allvariants
        wants_all = any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=not wants_all)
    ann_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    should_create = (vcfutils.vcf_has_variants(gemini_vcf)
                     and caller not in NO_DB_CALLERS
                     and not utils.file_exists(gemini_db)
                     and use_gemini)
    if should_create:
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        use_orig = support_gemini_orig(data) and not any(
            dd.get_vcfanno(d) for d in samples)
        if use_orig:
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db, ped_file)
        elif ann_vcf:
            gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)
    info = {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": ann_vcf or gemini_vcf,
        "decomposed": use_gemini,
    }
    return [[(name, caller), info]]
Пример #4
0
def prep_gemini_db(fnames, call_id, samples, data):
    """Load snpEff-annotated VCF calls into a gemini database.

    With more than one file in ``fnames`` the inputs are combined through
    ``get_multisample_vcf`` (``call_id`` then unpacks to ``(name, caller)``);
    otherwise the single input file is loaded directly.

    Returns ``[[call_id, info]]`` where ``info`` maps "db" to the database
    path (``None`` when the file does not exist) and "vcf" to the combined
    VCF (``None`` for a single input).
    """
    def _any_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    is_population = len(fnames) > 1
    if not is_population:
        gemini_vcf = fnames[0]
    else:
        name, caller = call_id
        gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # First pass uses check_gemini=False; the default do_db_build check is
    # only evaluated when the database file is still missing.
    quick_ok = do_db_build(samples, check_gemini=False) and _any_variants()
    needs_build = (not utils.file_exists(gemini_db) and quick_ok
                   and do_db_build(samples) and _any_variants())
    if needs_build:
        with file_transaction(gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            # Versions after 0.6.2.1 (or an unknown version) get --passonly
            pass_supported = (not gemini_ver or
                              LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"))
            load_opts = "--passonly" if pass_supported else ""
            num_cores = config["algorithm"].get("num_cores", 1)
            cmd = ("{gemini} load {load_opts} -v {gemini_vcf} -t snpEff "
                   "--cores {num_cores} {tx_gemini_db}").format(
                       gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                       num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    db_file = gemini_db if utils.file_exists(gemini_db) else None
    vcf_file = gemini_vcf if is_population else None
    return [[call_id, {"db": db_file, "vcf": vcf_file}]]
Пример #5
0
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from effect-annotated VCF inputs.

    The first entry of ``samples`` supplies the configuration and the work
    directory. ``call_info`` unpacks to ``(name, caller, is_batch)``. The
    annotation type passed to gemini is "snpEff" when the configured
    ``effects`` value is "snpeff" (the default) and "VEP" otherwise.

    Returns ``[[(name, caller), info]]`` where ``info`` maps "db" to the
    database path (``None`` when that file does not exist) and "vcf" to the
    multisample VCF (``None`` unless ``is_batch`` is truthy).
    """
    def _any_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # First pass uses check_gemini=False; the default do_db_build check is
    # only evaluated when the database file is still missing.
    quick_ok = do_db_build(samples, check_gemini=False) and _any_variants()
    needs_build = (not utils.file_exists(gemini_db) and quick_ok
                   and do_db_build(samples) and _any_variants())
    if needs_build:
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            load_opts = ""
            # Versions after 0.6.2.1 (or an unknown version) get --passonly
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                load_opts += " --passonly"
            # Versions after 0.6.4: skip gene tables for small inputs and
            # flag runs located under the automated test output directory
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    load_opts += " --skip-gene-tables"
                if "/test_automated_output/" in gemini_vcf:
                    load_opts += " --test-mode"
            # From 0.7.0 on, skip CADD when its data file is not installed
            if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                cadd_file = os.path.join(install.get_gemini_dir(),
                                         "whole_genome_SNVs.tsv.compressed.gz")
                if not os.path.exists(cadd_file):
                    load_opts += " --skip-cadd"
            # skip gerp-bp which slows down loading
            load_opts += " --skip-gerp-bp "
            num_cores = config["algorithm"].get("num_cores", 1)
            effects_tool = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            eanns = "snpEff" if effects_tool == "snpeff" else "VEP"
            cmd = ("{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} "
                   "--cores {num_cores} {tx_gemini_db}").format(
                       gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                       eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    db_file = gemini_db if utils.file_exists(gemini_db) else None
    vcf_file = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": db_file, "vcf": vcf_file}]]
Пример #6
0
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from effect-annotated VCF inputs.

    The first entry of ``samples`` supplies the configuration and the work
    directory. ``call_info`` unpacks to ``(name, caller, is_batch)``. The
    annotation type passed to gemini is "snpEff" when the configured
    ``effects`` value is "snpeff" (the default) and "VEP" otherwise.

    Returns ``[[(name, caller), info]]`` where ``info`` maps "db" to the
    database path (``None`` when that file does not exist) and "vcf" to the
    multisample VCF (``None`` unless ``is_batch`` is truthy).
    """
    def _any_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # First pass uses check_gemini=False; the default do_db_build check is
    # only evaluated when the database file is still missing.
    quick_ok = do_db_build(samples, check_gemini=False) and _any_variants()
    needs_build = (not utils.file_exists(gemini_db) and quick_ok
                   and do_db_build(samples) and _any_variants())
    if needs_build:
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = None
            if "program_versions" in config.get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=config)
            load_opts = ""
            # Versions after 0.6.2.1 (or an unknown version) get --passonly
            if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                load_opts += " --passonly"
            # Versions after 0.6.4: skip gene tables for small inputs and
            # flag runs located under the automated test output directory
            if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    load_opts += " --skip-gene-tables"
                if "/test_automated_output/" in gemini_vcf:
                    load_opts += " --test-mode"
            # From 0.7.0 on, add a skip flag for each optional data file
            # that is absent from the gemini data directory
            if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                optional_resources = (
                    ("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),
                    ("--skip-gerp-bp", "hg19.gerp.bw"),
                )
                missing = [flag for flag, resource in optional_resources
                           if not os.path.exists(os.path.join(gemini_dir, resource))]
                load_opts += "".join(" %s" % flag for flag in missing)
            num_cores = config["algorithm"].get("num_cores", 1)
            effects_tool = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            eanns = "snpEff" if effects_tool == "snpeff" else "VEP"
            cmd = ("{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} "
                   "--cores {num_cores} {tx_gemini_db}").format(
                       gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                       eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    db_file = gemini_db if utils.file_exists(gemini_db) else None
    vcf_file = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": db_file, "vcf": vcf_file}]]
Пример #7
0
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to INFO field, instead of leaving in FILTER.

    Returns ``data`` unchanged when ``should_filter(items)`` is false or the
    input has no variants. Otherwise ``data["vrn_file"]`` is replaced with the
    filtered output and ``data["damage_plots"]`` lists the bias plot files
    that exist on disk.
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data

    raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
    plot_base = utils.splitext_plus(raw_file)[0]
    out_plot_files = [plot_base + ext
                      for ext in ("_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf")]
    # Either the plain or the gzipped output being current counts as done
    already_run = (utils.file_uptodate(raw_file, vrn_file) or
                   utils.file_uptodate(raw_file + ".gz", vrn_file))
    if not already_run:
        with file_transaction(items[0], raw_file) as tx_out_file:
            # Does not apply --qcSummary plotting due to slow runtimes
            dkfzbiasfilter = utils.which(config_utils.get_program("dkfzbiasfilter.py", data))
            cmd = [dkfzbiasfilter, "--filterCycles", "1", "--passOnly",
                   "--tempFolder", os.path.dirname(tx_out_file),
                   vrn_file, align_bam, ref_file, tx_out_file]
            do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
            # Move any plots produced next to the transactional output
            tx_plot_dir = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0],
                                       "plots")
            for out_plot in out_plot_files:
                tx_plot_file = os.path.join(tx_plot_dir, os.path.basename(out_plot))
                if utils.file_exists(tx_plot_file):
                    shutil.move(tx_plot_file, out_plot)
    raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
    data["vrn_file"] = _filter_to_info(raw_file, items[0])
    data["damage_plots"] = [x for x in out_plot_files if utils.file_exists(x)]
    return data
Пример #8
0
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from snpEff-annotated VCF inputs.

    The first entry of ``samples`` supplies the configuration and the work
    directory. ``call_info`` unpacks to ``(name, caller, is_batch)``.

    Returns ``[[(name, caller), info]]`` where ``info`` maps "db" to the
    database path (``None`` when that file does not exist) and "vcf" to the
    multisample VCF (``None`` unless ``is_batch`` is truthy).
    """
    def _any_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # First pass uses check_gemini=False; the default do_db_build check is
    # only evaluated when the database file is still missing.
    quick_ok = do_db_build(samples, check_gemini=False) and _any_variants()
    if (not utils.file_exists(gemini_db) and quick_ok
            and do_db_build(samples) and _any_variants()):
        gemini_db = create_gemini_db(gemini_vcf, data, gemini_db)
    db_file = gemini_db if utils.file_exists(gemini_db) else None
    vcf_file = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": db_file, "vcf": vcf_file}]]
Пример #9
0
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    The output is named after ``in_file`` with a ``-wgts.vcf.gz`` suffix and
    is only regenerated when not up to date relative to ``in_file``. Inputs
    without variants are copied through without running svtyper. Regions in
    ``exclude_file`` are removed only when that file is given and exists.

    Returns the result of ``vcfutils.sort_by_ref`` on the output file.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Write to the transaction path, as the svtyper branch does,
                # instead of directly to the final output: an interrupted
                # copy must not leave a partial file at out_file.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            # Header lines come first; stop at the first record
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                cmd = cmd.format(in_file=in_file, regions_to_rm=regions_to_rm,
                                 python=python, svtyper=svtyper, full_bam=full_bam,
                                 header_file=header_file, tx_out_file=tx_out_file)
                do.run(cmd, "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Пример #10
0
def snpeff_effects(vcf_in, data):
    """Annotate an input VCF with variant effects from snpEff.

    Returns whatever ``_run_snpeff`` produces for inputs that contain
    variants, and ``(None, None)`` for inputs without variants.
    """
    if not vcfutils.vcf_has_variants(vcf_in):
        return None, None
    return _run_snpeff(vcf_in, "vcf", data)
Пример #11
0
def combine_calls(batch_id, samples, data):
    """Merge the callsets of a batch into one ensemble set of calls.

    Works on a deep copy of ``data``. When none of the input files contain
    variants, an empty VCF is written and returned. Otherwise calls are
    combined by intersection, or by the classifier-based ensemble when
    "classifiers" is present in the ensemble configuration, and the result
    is validated with ``validate.compare_to_rm``.

    Returns ``[[batch_id, callinfo]]``.
    """
    caller_list = ",".join(x["variantcaller"] for x in samples[0]["variants"])
    logger.info("Ensemble consensus calls for {0}: {1}".format(batch_id, caller_list))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)

    if not any(vcfutils.vcf_has_variants(f) for f in vrn_files):
        # Nothing to combine: hand back an empty ensemble VCF
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        empty_info = {"variantcaller": "ensemble",
                      "vrn_file": out_vcf_file,
                      "bed_file": None}
        return [[batch_id, empty_info]]

    if "classifiers" in edata["config"]["algorithm"]["ensemble"]:
        config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
        callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                 edata["sam_ref"], edata)
    else:
        callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
    edata["config"]["algorithm"]["variantcaller"] = "ensemble"
    edata["vrn_file"] = callinfo["vrn_file"]
    edata["ensemble_bed"] = callinfo["bed_file"]
    validated = validate.compare_to_rm(edata)
    callinfo["validate"] = validated[0][0].get("validate")
    return [[batch_id, callinfo]]
Пример #12
0
def snpeff_effects(vcf_in, data):
    """Annotate an input VCF with variant effects from snpEff.

    Returns whatever ``_run_snpeff`` produces for inputs that contain
    variants, and ``(None, None)`` for inputs without variants.
    """
    has_calls = vcfutils.vcf_has_variants(vcf_in)
    return _run_snpeff(vcf_in, "vcf", data) if has_calls else (None, None)
Пример #13
0
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalize variants and rerun effects annotation on the resulting VCF.

    The output keeps the input name with a ``-nomultiallelic`` label inserted
    before the extension (``-noeff-nomultiallelic`` with
    ``remove_oldeffects``) and is placed in ``work_dir`` when one is given.
    An existing output is reused. Inputs without variants are linked through
    unchanged.

    Returns the result of ``vcfutils.bgzip_and_index`` on the output file.
    """
    base, ext = utils.splitext_plus(in_file)
    label = "-noeff-nomultiallelic" if remove_oldeffects else "-nomultiallelic"
    out_file = "%s%s%s" % (base, label, ext)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        link_source = in_file
        if vcfutils.vcf_has_variants(in_file):
            link_source = _normalize(in_file, data, passonly=passonly,
                                     normalize_indels=normalize_indels,
                                     split_biallelic=split_biallelic,
                                     remove_oldeffects=remove_oldeffects,
                                     nonrefonly=nonrefonly,
                                     work_dir=work_dir)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(link_source, data)
                # Keep the normalized file when no annotated file comes back
                if ann_ma_file:
                    link_source = ann_ma_file
        utils.symlink_plus(link_source, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Пример #14
0
def snpeff_effects(data):
    """Annotate the sample's variant file with effects from snpEff.

    Reads the input from ``data["vrn_file"]``. Returns the ``_run_snpeff``
    result, or ``None`` when the input has no variants.
    """
    vcf_in = data["vrn_file"]
    if not vcfutils.vcf_has_variants(vcf_in):
        return None
    return _run_snpeff(vcf_in, "vcf", data)
Пример #15
0
def snpeff_effects(data):
    """Annotate the sample's variant file with effects from snpEff.

    Reads the input from ``data["vrn_file"]``. Returns the ``_run_snpeff``
    result, or ``None`` when the input has no variants.
    """
    vcf_in = data["vrn_file"]
    has_calls = vcfutils.vcf_has_variants(vcf_in)
    return _run_snpeff(vcf_in, "vcf", data) if has_calls else None
Пример #16
0
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                      extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.

    The output is named ``<base>-filter<filterext><ext>`` next to
    ``vcf_file`` and an existing output is reused. ``name`` is the soft-filter
    label and ``extra_cmd`` is appended to the bcftools command. With
    ``limit_regions`` set to "variant_regions", filtering is restricted to the
    configured variant regions when there are any. Inputs without variants
    are copied through unfiltered.

    Returns the output file, passed through ``vcfutils.bgzip_and_index`` when
    it ends with ``.vcf.gz``.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                cmd = cmd.format(bcftools=bcftools, intervals=intervals, name=name,
                                 expression=expression, vcf_file=vcf_file,
                                 extra_cmd=extra_cmd, bgzip_cmd=bgzip_cmd,
                                 tx_out_file=tx_out_file)
                do.run(cmd, "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                # Write to the transaction path, as the bcftools branch does,
                # instead of directly to the final output: an interrupted
                # copy must not leave a partial file at out_file.
                shutil.copy(vcf_file, tx_out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #17
0
def _has_variant_calls(data):
    if data.get("align_bam"):
        for vrn in data["variants"]:
            if vrn.get("vrn_file") and vcfutils.vcf_has_variants(
                    vrn["vrn_file"]):
                return True
    return False
Пример #18
0
def _freebayes_cutoff(in_file, data):
    """Perform filtering of FreeBayes results, flagging low confidence calls.

    Filters using cutoffs on low depth based on Meynert et al's work modeling sensitivity
    of homozygote and heterozygote calling on depth:

    http://www.ncbi.nlm.nih.gov/pubmed/23773188

    and high depth heterozygote SNP filtering based on Heng Li's work
    evaluating variant calling artifacts:

    http://arxiv.org/abs/1404.0929

    Tuned based on NA12878 call comparisons to Genome in a Bottle reference genome.

    Inputs without variants are copied to the ``-filter`` output name without
    running any filter. Otherwise the assembled expression is applied through
    ``cutoff_w_expression`` with the filter name "FBQualDepth".
    """
    if not vcfutils.vcf_has_variants(in_file):
        base, ext = utils.splitext_plus(in_file)
        out_file = "{base}-filter{ext}".format(base=base, ext=ext)
        if not utils.file_exists(out_file):
            shutil.copy(in_file, out_file)
        if out_file.endswith(".vcf.gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file

    depth_thresh = qual_thresh = None
    if _do_high_depth_filter(data):
        avg_depth = _calc_vcf_stats(in_file)["avg_depth"]
        if avg_depth > 0:
            # Depth cutoff: average depth plus three times its square root
            depth_thresh = int(math.ceil(avg_depth + 3 * math.pow(avg_depth, 0.5)))
            qual_thresh = depth_thresh * 2.0  # Multiplier from default GATK QD cutoff filter
    clauses = [
        '(AF[0] <= 0.5 && (max(FORMAT/DP) < 4 || (max(FORMAT/DP) < 13 && %QUAL < 10)))',
        '(AF[0] > 0.5 && (max(FORMAT/DP) < 4 && %QUAL < 50))',
    ]
    if depth_thresh:
        clauses.append(
            '(%QUAL < {qual_thresh} && max(FORMAT/DP) > {depth_thresh} && AF[0] <= 0.5)'.format(
                qual_thresh=qual_thresh, depth_thresh=depth_thresh))
    filters = " || ".join(clauses)
    return cutoff_w_expression(in_file, filters, data, name="FBQualDepth")
Пример #19
0
def subset_by_supported(input_file, get_coords, calls_by_name, work_dir, data,
                        headers=("#",)):
    """Limit CNVkit input to calls with support from another caller.

    get_coords is a function that return chrom, start, end from a line of the
    input_file, allowing handling of multiple input file types.

    Lines starting with any of ``headers`` are always kept. Other lines are
    kept only when ``get_coords(line)`` matches a region reported by the
    bedtools intersection against the supporting callers. Returns
    ``input_file`` unchanged when no supporting caller has variants, and
    otherwise the path of the ``-havesupport`` file in ``work_dir``.
    """
    candidates = [(c, tz.get_in([c, "vrn_file"], calls_by_name))
                  for c in ensemble.SUBSET_BY_SUPPORT["cnvkit"]]
    support_files = [(c, f) for (c, f) in candidates
                     if f and vcfutils.vcf_has_variants(f)]
    if not support_files:
        return input_file

    out_name = "%s-havesupport%s" % utils.splitext_plus(os.path.basename(input_file))
    out_file = os.path.join(work_dir, out_name)
    if utils.file_uptodate(out_file, input_file):
        return out_file

    input_bed = _input_to_bed(input_file, work_dir, get_coords, headers)
    with file_transaction(data, out_file) as tx_out_file:
        support_beds = " ".join([_sv_vcf_to_bed(f, c, out_file) for c, f in support_files])
        tmp_cmp_bed = "%s-intersectwith.bed" % utils.splitext_plus(tx_out_file)[0]
        cmd = "bedtools intersect -wa -f 0.5 -r -a {input_bed} -b {support_beds} > {tmp_cmp_bed}"
        cmd = cmd.format(input_bed=input_bed, support_beds=support_beds,
                         tmp_cmp_bed=tmp_cmp_bed)
        do.run(cmd, "Intersect CNVs with support files")
        # Coordinates are stored as strings to compare with get_coords output
        pass_coords = {(str(r.chrom), str(r.start), str(r.stop))
                       for r in pybedtools.BedTool(tmp_cmp_bed)}
        with open(input_file) as in_handle:
            with open(tx_out_file, "w") as out_handle:
                for line in in_handle:
                    if line.startswith(headers) or get_coords(line) in pass_coords:
                        out_handle.write(line)
    return out_file
Пример #20
0
def _pick_best_quality_score(vrn_file):
    """Flexible quality score selection, picking the best available.

    Implementation based on discussion:

    https://github.com/chapmanb/bcbio-nextgen/commit/a538cecd86c0000d17d3f9d4f8ac9d2da04f9884#commitcomment-14539249

    (RTG=AVR/GATK=VQSLOD/MuTect=t_lod_fstar, otherwise GQ, otherwise QUAL, otherwise DP.)

    For MuTect, it's not clear how to get t_lod_fstar, the right quality score, into VCF cleanly.

    Inspects the leading records of the file (indexes 0 through 25) and
    returns the first of AVR, INFO=VQSLOD, GQ, QUAL, DP seen on any of them.
    Files without variants return "DP". Raises ValueError when the file
    cannot be parsed or none of the scores is present.
    """
    # pysam fails on checking reference contigs if input is empty
    if not vcfutils.vcf_has_variants(vrn_file):
        return "DP"
    last_index = 25
    sample_keys = ("AVR", "GQ", "DP")
    priority = ("AVR", "INFO=VQSLOD", "GQ", "QUAL", "DP")
    scores = collections.defaultdict(int)
    try:
        in_handle = VariantFile(vrn_file)
    except ValueError:
        raise ValueError("Failed to parse input file in preparation for validation: %s" % vrn_file)
    with contextlib.closing(in_handle) as records:
        for rec_index, rec in enumerate(records):
            if rec_index > last_index:
                break
            if rec.info.get("VQSLOD") is not None:
                scores["INFO=VQSLOD"] += 1
            # Per-sample scores are read from the first sample only
            first_sample = rec.samples[0]
            for skey in sample_keys:
                if first_sample.get(skey) is not None:
                    scores[skey] += 1
            if rec.qual:
                scores["QUAL"] += 1
    best = next((key for key in priority if scores[key] > 0), None)
    if best is None:
        raise ValueError("Did not find quality score for validation from %s" % vrn_file)
    return best
Пример #21
0
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                      extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.

    The output is named ``<base>-filter<filterext><ext>`` next to
    ``vcf_file`` and an existing output is reused. ``name`` is the soft-filter
    label and ``extra_cmd`` is appended to the bcftools command. With
    ``limit_regions`` set to "variant_regions", filtering is restricted to the
    configured variant regions when there are any. Inputs without variants
    are copied through unfiltered.

    Returns the output file, passed through ``vcfutils.bgzip_and_index`` when
    it ends with ``.vcf.gz``.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                cmd = cmd.format(bcftools=bcftools, intervals=intervals, name=name,
                                 expression=expression, vcf_file=vcf_file,
                                 extra_cmd=extra_cmd, bgzip_cmd=bgzip_cmd,
                                 tx_out_file=tx_out_file)
                do.run(cmd, "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                # Write to the transaction path, as the bcftools branch does,
                # instead of directly to the final output: an interrupted
                # copy must not leave a partial file at out_file.
                shutil.copy(vcf_file, tx_out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #22
0
def _freebayes_cutoff(in_file, data):
    """Flag low confidence FreeBayes calls with a depth/quality soft filter.

    Low depth cutoffs follow Meynert et al's modeling of homozygote and
    heterozygote calling sensitivity as a function of depth:

    http://www.ncbi.nlm.nih.gov/pubmed/23773188

    The optional high depth heterozygote filter follows Heng Li's evaluation
    of variant calling artifacts:

    http://arxiv.org/abs/1404.0929

    Tuned based on NA12878 call comparisons to Genome in a Bottle reference genome.

    Returns the path of the ``-filter`` output file.
    """
    if not vcfutils.vcf_has_variants(in_file):
        # No calls to filter: provide the expected output name as a plain copy.
        base, ext = utils.splitext_plus(in_file)
        out_file = "%s-filter%s" % (base, ext)
        if not utils.file_exists(out_file):
            shutil.copy(in_file, out_file)
        if out_file.endswith(".vcf.gz"):
            return vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file

    depth_thresh = qual_thresh = None
    if _do_high_depth_filter(data):
        avg_depth = _calc_vcf_stats(in_file)["avg_depth"]
        if avg_depth > 0:
            # Average depth plus three times its square root, rounded up
            depth_thresh = int(math.ceil(avg_depth + 3 * math.pow(avg_depth, 0.5)))
            # Multiplier from default GATK QD cutoff filter
            qual_thresh = depth_thresh * 2.0

    clauses = [
        '(AF[0] <= 0.5 && (max(FORMAT/DP) < 4 || (max(FORMAT/DP) < 13 && %QUAL < 10)))',
        '(AF[0] > 0.5 && (max(FORMAT/DP) < 4 && %QUAL < 50))',
    ]
    if depth_thresh:
        clauses.append('(%QUAL < {0} && max(FORMAT/DP) > {1} && AF[0] <= 0.5)'.format(
            qual_thresh, depth_thresh))
    return cutoff_w_expression(in_file, ' || '.join(clauses), data, name="FBQualDepth")
Пример #23
0
def hard_w_expression(vcf_file, expression, data, name="+", filterext=""):
    """Filter a VCF using a bcftools expression like %QUAL < 20 || DP < 4.

    Despite the function name, the command uses ``bcftools filter
    --soft-filter``: matching records are labelled, not removed.

    Args:
        vcf_file: Input VCF path. The output is written next to it as
            ``<base>-filter<filterext><ext>``.
        expression: bcftools exclusion expression, passed to ``-e``.
        data: Sample dictionary; reads ``data["config"]`` and the
            ``config -> algorithm -> variant_regions`` setting.
        name: Label handed to ``--soft-filter``.
        filterext: Extra text inserted into the output file name.

    Returns:
        Path of the filtered file. Outputs ending in ``.vcf.gz`` are passed
        through ``vcfutils.bgzip_and_index`` before being returned.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                # Let bcftools write compressed output directly for .gz targets
                output_type = "z" if out_file.endswith(".gz") else "v"
                variant_regions = utils.get_in(
                    data, ("config", "algorithm", "variant_regions"))
                if variant_regions:
                    intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                else:
                    intervals = ""
                cmd = (
                    "{bcftools} filter -O {output_type} {intervals} --soft-filter '{name}' "
                    "-e '{expression}' -m '+' {vcf_file} > {tx_out_file}")
                cmd = cmd.format(bcftools=bcftools, output_type=output_type, intervals=intervals,
                                 name=name, expression=expression, vcf_file=vcf_file,
                                 tx_out_file=tx_out_file)
                do.run(cmd, "Hard filtering %s with %s" % (vcf_file, expression), data)
            else:
                # Nothing to filter: copy into the transactional path so the final
                # file only appears once the copy has completed.
                shutil.copy(vcf_file, tx_out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #24
0
def _get_build_type(fnames, samples, caller):
    """Determine which gemini database types should be built.

    A build is considered only when at least one input file has variants and
    the caller is not in NO_DB_CALLERS. Each sample then needs one of the
    gemini-related entries in tools_on and gemini annotation inputs available.

    Returns a set holding "gemini" and/or "gemini_orig"; empty when no
    database should be built.
    """
    build_type = set()
    if not any(vcfutils.vcf_has_variants(f) for f in fnames) or caller in NO_DB_CALLERS:
        sample_names = [dd.get_sample_name(d) for d in samples]
        logger.info("Not running gemini, no samples with variants found: %s" %
                    (", ".join(sample_names)))
        return build_type

    gemini_tools = ("gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand")
    for data in samples:
        tools_on = dd.get_tools_on(data)
        if not any(tool in tools_on for tool in gemini_tools):
            logger.info(
                "Not running gemini, not configured in tools_on: %s" %
                dd.get_sample_name(data))
            continue
        if not vcfanno.annotate_gemini(data):
            logger.info(
                "Not running gemini, input data not found: %s" %
                dd.get_sample_name(data))
            continue
        if "gemini_orig" in tools_on:
            build_type.add("gemini_orig")
        else:
            build_type.add("gemini")
    return build_type
Пример #25
0
def compare_to_rm(data):
    """Validate final variant calls against a reference material callset.

    When validation inputs are found for ``data``, runs the configured
    validation method ("rtg" by default, or "bcbio.variation") and stores the
    result under ``data["validate"]``. Call files without variants are skipped.

    Returns ``[[data]]``.
    Raises NotImplementedError if the variant input is a list or tuple.
    """
    toval_data = _get_validate(data)
    if not toval_data:
        return [[data]]

    if isinstance(toval_data["vrn_file"], (list, tuple)):
        raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
    vrn_file = os.path.abspath(toval_data["vrn_file"])

    algorithm = toval_data["config"]["algorithm"]
    rm_file = normalize_input_path(algorithm["validate"], toval_data)
    region_input = normalize_input_path(algorithm.get("validate_regions"), toval_data)
    rm_interval_file = _gunzip(region_input, toval_data)

    caller = _get_caller(toval_data)
    sample = dd.get_sample_name(toval_data)
    work_dir = os.path.join(toval_data["dirs"]["work"], "validate", sample, caller)
    base_dir = utils.safe_makedir(work_dir)

    rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data["genome_build"], base_dir, data)
    if rm_interval_file:
        rm_interval_file = naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                  data["genome_build"], base_dir, data)
    else:
        rm_interval_file = None

    vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
    # RTG can fail on totally empty files. Skip these since we have nothing.
    if vcfutils.vcf_has_variants(vrn_file):
        if vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
Пример #26
0
def combine_calls(batch_id, samples, data):
    """Merge calls from multiple variant callers into one ensemble callset.

    With no variants in any input, writes an empty ensemble VCF. Otherwise
    runs the intersection approach when no "classifiers" are configured for
    the ensemble, or the configuration-file based ensemble run when they are,
    then validates the result.

    Returns ``[[batch_id, callinfo]]``.
    """
    caller_list = ",".join(x["variantcaller"] for x in samples[0]["variants"])
    logger.info("Ensemble consensus calls for {0}: {1}".format(batch_id, caller_list))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, _ = _organize_variants(samples, batch_id)

    if not any(vcfutils.vcf_has_variants(f) for f in vrn_files):
        # Nothing called by any caller: emit an empty placeholder callset
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        empty_info = {"variantcaller": "ensemble",
                      "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                      "bed_file": None}
        return [[batch_id, empty_info]]

    if "classifiers" in edata["config"]["algorithm"]["ensemble"]:
        config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
        callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                 edata["sam_ref"], edata)
        callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
    else:
        callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
    # Validate the merged calls as if they came from an "ensemble" caller
    edata["config"]["algorithm"]["variantcaller"] = "ensemble"
    edata["vrn_file"] = callinfo["vrn_file"]
    edata["ensemble_bed"] = callinfo["bed_file"]
    validated = validate.compare_to_rm(edata)[0][0]
    callinfo["validate"] = validated.get("validate")
    return [[batch_id, callinfo]]
Пример #27
0
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Annotate somatic VCFs for damage/bias artifacts using dkfzbiasfilter.py.

    The damage estimation ends up in the INFO field rather than FILTER.
    ``data`` is returned untouched when filtering is not requested for
    ``items`` or the input has no variants; otherwise ``data["vrn_file"]`` and
    ``data["damage_plots"]`` are updated before returning it.
    """
    if not (should_filter(items) and vcfutils.vcf_has_variants(vrn_file)):
        return data

    raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
    plot_prefix = utils.splitext_plus(raw_file)[0]
    out_plot_files = [plot_prefix + suffix
                      for suffix in ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]]
    # Either the plain or the bgzipped output being current avoids a re-run
    up_to_date = (utils.file_uptodate(raw_file, vrn_file)
                  or utils.file_uptodate(raw_file + ".gz", vrn_file))
    if not up_to_date:
        with file_transaction(items[0], raw_file) as tx_out_file:
            # Does not apply --qcSummary plotting due to slow runtimes
            cmd = ["dkfzbiasfilter.py", "--filterCycles", "1", "--passOnly",
                   "--tempFolder", os.path.dirname(tx_out_file),
                   vrn_file, align_bam, ref_file, tx_out_file]
            do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
            tx_plot_dir = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0], "plots")
            for out_plot in out_plot_files:
                tx_plot_file = os.path.join(tx_plot_dir, os.path.basename(out_plot))
                if utils.file_exists(tx_plot_file):
                    shutil.move(tx_plot_file, out_plot)
    raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
    data["vrn_file"] = _filter_to_info(raw_file, items[0])
    # Only report plots that were actually produced
    data["damage_plots"] = [plot for plot in out_plot_files if utils.file_exists(plot)]
    return data
Пример #28
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database through vcfanno and vcf2db.

    Args:
        gemini_vcf: Input VCF to annotate and load.
        data: Sample dictionary used for configuration lookups.
        gemini_db: Output database path; defaults to the VCF base name with a
            ``.db`` extension.
        ped_file: Placed in the vcf2db.py command between the annotated VCF
            and the database path.
            NOTE(review): the default of None is put into the command list
            as-is -- confirm callers always supply a PED file.

    Returns:
        The database path, or None when the VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    # Fall back to the "gemini" vcfanno configuration when none is specified
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        cmd = [vcf2db, ann_file, ped_file, tx_gemini_db]
        if "vcf2db_expand" in dd.get_tools_on(data):
            cmd += ["--expand", "gt_types", "--expand", "gt_ref_depths",
                    "--expand", "gt_alt_depths"]
        do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
Пример #29
0
def normalize(in_file, data, passonly=False, normalize_indels=True,
              split_biallelic=True, rerun_effects=True, remove_oldeffects=False,
              nonrefonly=False, work_dir=None):
    """Normalize variants and optionally re-annotate effects on the result.

    The output name is the input name with ``-nomultiallelic`` (or
    ``-noeff-nomultiallelic`` when ``remove_oldeffects`` is set) inserted
    before the extension, placed in ``work_dir`` if given. Inputs without
    variants are linked to the output name unchanged.

    Returns the result of ``vcfutils.bgzip_and_index`` on the output file.
    """
    base, ext = utils.splitext_plus(in_file)
    tag = "-noeff-nomultiallelic" if remove_oldeffects else "-nomultiallelic"
    out_file = "%s%s%s" % (base, tag, ext)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        link_target = in_file
        if vcfutils.vcf_has_variants(in_file):
            link_target = _normalize(in_file, data, passonly=passonly,
                                     normalize_indels=normalize_indels,
                                     split_biallelic=split_biallelic,
                                     remove_oldeffects=remove_oldeffects,
                                     nonrefonly=nonrefonly, work_dir=work_dir)
            if rerun_effects:
                # Prefer the re-annotated file when effect prediction produced one
                ann_ma_file, _ = effects.add_to_vcf(link_target, data)
                if ann_ma_file:
                    link_target = ann_ma_file
        utils.symlink_plus(link_target, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Пример #30
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl variant effect predictor.

    Writes ``<in_file stem>-vepeffects`` next to the input. Returns the output
    path when it exists after the run, and None when the input has no variants
    or no output was produced (for example when no VEP cache directory is
    available).
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                if hgvs_compatible:
                    config_args = ["--fasta", compressed_ref]
                else:
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins.extend(["maxentscan", "spliceregion"])
                    for plugin in plugins:
                        config_args.extend(plugin_fns[plugin](data))
                    config_args.extend(["--sift", "b", "--polyphen", "b"])
                    if hgvs_compatible:
                        config_args.extend(["--hgvs", "--shift_hgvs", "1"])
                wants_canonical = dd.get_effects_transcripts(data).startswith("canonical")
                if wants_canonical or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
                    config_args.append("--pick_allele")
                if ensembl_name.endswith("_merged"):
                    config_args.append("--merged")
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                input_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                cache_args = ["--species", ensembl_name, "--no_stats", "--cache",
                              "--offline", "--dir", vep_dir]
                annotation_args = [
                    "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                    "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                    "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp",
                    "--af_gnomad", "--pubmed", "--variant_class", "--allele_number"]
                cmd = input_args + fork_args + extra_args + cache_args + annotation_args + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #31
0
def compare_to_rm(data):
    """Validate final variant calls against a reference material callset.

    Accepts either a sample dictionary or, for CWL runs, a list/tuple of
    inputs that is normalized first. When validation inputs are found, the
    result of the selected method is stored under ``data["validate"]``:

    - no variants in the calls: every truth variant counts as a false negative
    - no variants in the truth set: every call counts as a false positive
    - otherwise "rtg" (default), "rtg-squash-ploidy", "hap.py" or
      "bcbio.variation", chosen by ``validate_method``

    Returns ``[[data]]``.
    Raises NotImplementedError if the variant input is a list or tuple.
    """
    if isinstance(data, (list, tuple)) and cwlutils.is_cwl_run(utils.to_single_data(data[0])):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if not toval_data:
        return [[data]]

    caller = _get_caller(toval_data)
    sample = dd.get_sample_name(toval_data)
    base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))

    if isinstance(toval_data["vrn_file"], (list, tuple)):
        raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
    vrn_file = os.path.abspath(toval_data["vrn_file"])

    algorithm = toval_data["config"]["algorithm"]
    rm_file = normalize_input_path(algorithm["validate"], toval_data)
    region_input = normalize_input_path(algorithm.get("validate_regions"), toval_data)
    rm_interval_file = _gunzip(region_input, toval_data)
    bedprep_dir = utils.safe_makedir(os.path.join(base_dir, "bedprep"))
    rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data, prefix="validateregions-",
                                           bedprep_dir=bedprep_dir)
    rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data), data.get("genome_build"),
                                     base_dir, data)
    if rm_interval_file:
        rm_interval_file = naming.handle_synonyms(rm_interval_file, dd.get_ref_file(toval_data),
                                                  data.get("genome_build"), base_dir, data)
    else:
        rm_interval_file = None

    vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
    if not vcfutils.vcf_has_variants(vrn_file):
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn")
        data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
    elif not vcfutils.vcf_has_variants(rm_file):
        # Empty validation file, every call is a false positive
        eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp")
        data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
    elif vmethod in ["rtg", "rtg-squash-ploidy"]:
        rtg_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod)
        eval_files = _annotate_validations(rtg_files, toval_data)
        data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
    elif vmethod == "hap.py":
        data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
    elif vmethod == "bcbio.variation":
        data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                sample, caller, toval_data)
    return [[data]]
Пример #32
0
def run_vep(in_file, data):
    """Run the Ensembl variant effect predictor on a bgzipped input VCF.

    Output goes to ``<in_file stem>-vepeffects``. Returns the result of
    ``vcfutils.bgzip_and_index`` on that file when it exists, and None when
    the input has no variants or no output was produced.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            fasta_base = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], fasta_base)
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                else:
                    fork_args = []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                ref_compressed = dd.get_ref_file_compressed(data)
                hgvs_compatible = True if ref_compressed else False
                fasta_file = ref_compressed if hgvs_compatible else dd.get_ref_file(data)
                config_args = ["--fasta", fasta_file]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    # "genesplicer" too unstable so currently removed
                    splice_plugins = ["maxentscan", "spliceregion"]
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        plugins = plugins + splice_plugins
                    for plugin_name in plugins:
                        config_args += plugin_fns[plugin_name](data)
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                vep_cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                vep_cmd += fork_args
                vep_cmd += extra_args
                vep_cmd += ["--species", ensembl_name,
                            "--no_stats", "--cache",
                            "--offline", "--dir", vep_dir,
                            "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                            "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                            "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg",
                            "--af_esp", "--af_gnomad",
                            "--pubmed", "--variant_class", "--allele_number"]
                vep_cmd += config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    return vcfutils.bgzip_and_index(out_file, data["config"])
Пример #33
0
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Args:
        fnames: Input VCF files that are combined into a multisample VCF.
        call_info: Tuple of ``(name, caller, is_batch)``.
        samples: Sample dictionaries; the first supplies the work directory.
        extras: Additional samples included when creating the PED file.

    Returns:
        ``[[(name, caller), {"db": ..., "vcf": ...}]]`` where ``db`` is the
        database path or None when no database exists, and ``vcf`` is the
        multisample VCF for batches or None otherwise.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    multisample_vcf = get_multisample_vcf(fnames, name, caller, data)
    gemini_vcf = multiallelic.to_single(multisample_vcf, data)
    # Evaluated once: the build check and the variant scan do not change
    # between deciding to build and building.
    use_gemini = (do_db_build(samples) and
                  any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini:
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        gemini_db = create_gemini_db(gemini_vcf, data, gemini_db, ped_file)
    # create_gemini_db can hand back None instead of a path, so check the
    # value before asking the filesystem about it.
    has_db = bool(gemini_db) and utils.file_exists(gemini_db)
    return [[(name, caller), {"db": gemini_db if has_db else None,
                              "vcf": multisample_vcf if is_batch else None}]]
Пример #34
0
def _has_variant_calls(data):
    for vrn in data["variants"]:
        if (
            vrn.get("vrn_file")
            and vcfutils.vcf_has_variants(vrn["vrn_file"])
            and (_has_precalled(data) or data.get("align_bam"))
        ):
            return True
    return False
Пример #35
0
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` is a tuple of ``(name, caller, is_batch)``. The database is
    loaded with ``gemini load`` when it does not exist yet and the build
    checks pass.

    Returns ``[[(name, caller), {"db": ..., "vcf": ...}]]`` where ``db`` is
    the database path if it exists (else None) and ``vcf`` is the multisample
    VCF for batches (else None).
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # Quick pre-check without the gemini check, before the full one below
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        if do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames):
            with file_transaction(gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                gemini_ver = None
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                opts = []
                # Recent versions of gemini allow loading only passing variants
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    opts.append(" --passonly")
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        opts.append(" --skip-gene-tables")
                    if "/test_automated_output/" in gemini_vcf:
                        opts.append(" --test-mode")
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(gemini=gemini, load_opts="".join(opts), gemini_vcf=gemini_vcf,
                                 num_cores=num_cores, tx_gemini_db=tx_gemini_db)
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    db_out = gemini_db if utils.file_exists(gemini_db) else None
    vcf_out = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": db_out, "vcf": vcf_out}]]
Пример #36
0
def combine_calls(*args):
    """Merge calls from multiple variant callers into one ensemble callset.

    Called either with exactly three arguments ``(batch_id, samples, data)``
    or, for CWL runs, with one argument per sample.

    Returns ``[[batch_id, callinfo]]`` for the three-argument form and
    ``[{"ensemble": callinfo}]`` for CWL runs.
    """
    is_cwl = len(args) != 3
    if is_cwl:
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    else:
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any(vcfutils.vcf_has_variants(f) for f in vrn_files):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        normalized = []
        for caller_name, vrn_file in zip(caller_names, vrn_files):
            caller_dir = utils.safe_makedir(os.path.join(base_dir, caller_name))
            normalized.append(normalize.normalize(vrn_file, data, passonly=passonly,
                                                  rerun_effects=False, remove_oldeffects=True,
                                                  nonrefonly=True, work_dir=caller_dir))
        vrn_files = normalized
        if "classifiers" in (dd.get_ensemble(edata) or {}):
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        else:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        # Validate the merged calls as if they came from an "ensemble" caller
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        # Nothing called by any caller: emit an empty placeholder callset
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        sample_names = [dd.get_sample_name(d) for d in samples]
        vcfutils.write_empty_vcf(out_vcf_file, samples=sample_names)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if not is_cwl:
        return [[batch_id, callinfo]]
    callinfo["batch_samples"] = data["batch_samples"]
    callinfo["batch_id"] = batch_id
    return [{"ensemble": callinfo}]
Пример #37
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl variant effect predictor (VEP).

    The annotated file is named after the input with a ``-vepeffects`` stem
    suffix and is only generated when it does not already exist.

    Args:
        in_file: path to the input VCF; must end in ``.gz``.
        data: sample dictionary; ``genome_build``, ``config`` and the
            ``reference -> fasta -> base`` entry are read from it.

    Returns:
        The annotated VCF path, or None when the input has no variants or
        no output file exists after the run (e.g. ``prep_vep_cache`` gave
        no cache directory).
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]

                # Assemble the VEP argument vector piece by piece.
                input_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                cache_args = ["--species", ensembl_name,
                              "--no_stats",
                              "--cache", "--offline", "--dir", vep_dir]
                annotation_args = ["--sift", "b", "--polyphen", "b", "--symbol", "--numbers",
                                   "--biotype", "--total_length", "--canonical", "--ccds"]
                field_args = ["--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)]
                vep_args = (input_args + fork_args + extra_args + cache_args + annotation_args
                            + field_args + dbnsfp_args + loftee_args)

                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # Clinical reporting wants a single consequence per
                    # variant; VEP's --pick selects one line of consequence
                    # data using canonical status, biotype, transcript
                    # length and consequence ranking.
                    # TODO investigate hgvs reporting (requires an indexed reference):
                    # --hgvs --shift-hgvs --fasta <reference>
                    vep_args = vep_args + ["--pick"]
                perllib = "export PERL5LIB=%s:$PERL5LIB" % _get_perllib()
                # The sed step collapses empty fields (';;') on non-header
                # lines, which can cause parsing errors downstream.
                shell_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perllib, " ".join(vep_args), tx_out_file)
                do.run(shell_cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #38
0
def run_vep(in_file, data):
    """Run the Ensembl variant effect predictor over a bgzipped VCF.

    Writes ``<in_file stem>-vepeffects`` next to the input unless that file
    already exists.

    Args:
        in_file: input VCF path, required to end with ``.gz``.
        data: sample dictionary providing ``genome_build``, ``config`` and
            the reference fasta base used to locate the VEP cache.

    Returns:
        Path of the annotated VCF, or None if the input holds no variants
        or no annotated file is present after the attempt.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    needs_run = not utils.file_exists(out_file)
    if needs_run:
        with file_transaction(data, out_file) as tx_out_file:
            ref_base = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], ref_base)
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(num_cores)] if num_cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                vep_resources = config_utils.get_resources("vep", data["config"])
                user_args = [str(x) for x in vep_resources.get("options", [])]
                all_fields = ",".join(std_fields + dbnsfp_fields + loftee_fields)
                fixed_args = ["--species", ensembl_name,
                              "--no_stats",
                              "--cache", "--offline", "--dir", vep_dir,
                              "--sift", "b", "--polyphen", "b", "--symbol", "--numbers",
                              "--biotype", "--total_length",
                              "--canonical", "--ccds",
                              "--fields", all_fields]
                parts = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + user_args
                parts = parts + fixed_args + dbnsfp_args + loftee_args

                clinical = tz.get_in(("config", "algorithm", "clinical_reporting"), data, False)
                if clinical:
                    # For clinical reporting keep exactly one consequence
                    # per variant: --pick chooses it by canonical status,
                    # biotype, transcript length and consequence rank.
                    # TODO investigate hgvs reporting; it needs the
                    # reference file to be indexed.
                    parts = parts + ["--pick"]
                # sed removes empty fields (';;') from record lines since
                # they can cause parsing errors downstream.
                pipeline = "%s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (" ".join(parts), tx_out_file)
                do.run(pipeline, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
    return None
Пример #39
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with Ensembl VEP, with human-only extras.

    dbNSFP, LOFTEE and SIFT/PolyPhen options are only added when
    ``genome_resources -> aliases -> human`` is set; HGVS output and
    ``--pick`` are only added when ``clinical_reporting`` is enabled.

    Args:
        in_file: input VCF path, required to end with ``.gz``.
        data: sample dictionary with configuration and genome details.

    Returns:
        Path of the ``-vepeffects`` VCF, or None when the input has no
        variants or the annotated file does not exist after the run.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])

                # Start from the non-human defaults (nothing extra) and
                # fill in the human-specific annotations when applicable.
                dbnsfp_args, dbnsfp_fields = [], []
                loftee_args, loftee_fields = [], []
                prediction_args, prediction_fields = [], []
                if tz.get_in(["genome_resources", "aliases", "human"], data, False):
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]

                clinical_args, clinical_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # Clinical reporting needs a single consequence per variant:
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # HGVS output is enabled as well, which requires an
                    # indexed reference file.
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1",
                                     "--fasta", dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]

                std_fields = (["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON"]
                              + prediction_fields
                              + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"])
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                field_list = ",".join(std_fields + dbnsfp_fields + loftee_fields + clinical_fields)
                core_args = ["--species", ensembl_name,
                             "--no_stats",
                             "--cache", "--offline", "--dir", vep_dir,
                             "--symbol", "--numbers", "--biotype", "--total_length",
                             "--canonical", "--gene_phenotype", "--ccds",
                             "--fields", field_list]
                vep_args = ([vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args
                            + core_args + prediction_args + dbnsfp_args + loftee_args + clinical_args)

                perl_exports = utils.get_perl_exports()
                # sed strips empty fields (';;') from record lines; these
                # can cause parsing errors downstream.
                shell_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_args), tx_out_file)
                do.run(shell_cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #40
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load a VCF into a GEMINI database via ``gemini load``.

    Args:
        gemini_vcf: path of the VCF to load.
        data: sample dictionary; ``data["config"]`` supplies program
            locations, resources and ``num_cores``.
        gemini_db: target database path; defaults to the VCF path with its
            extension replaced by ``.db``.
        ped_file: optional PED file added with ``gemini amend --sample``.

    Returns:
        The database path, or None when the database is missing and the
        VCF contains no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        gemini_ver = None
        if "program_versions" in data["config"].get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=data["config"])
        version = LooseVersion(gemini_ver) if gemini_ver else None

        opt_parts = []
        # Recent gemini versions can restrict loading to passing variants;
        # an unknown version is treated as recent.
        if version is None or version > LooseVersion("0.6.2.1"):
            opt_parts.append(" --passonly")
        # Gene table loading is slow, so skip it for small test files.
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                opt_parts.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                opt_parts.append(" --test-mode")
        # Skip optional annotations whose data files are not installed.
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
            for skip_flag, required_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, required_file)):
                    opt_parts.append(" %s" % skip_flag)
        # gerp-bp slows down loading, always skip it.
        opt_parts.append(" --skip-gerp-bp ")
        load_opts = "".join(opt_parts)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Custom resource options allow e.g. an alternative annotation_dir.
        resources = config_utils.get_resources("gemini", data["config"])
        gemini_opts = " ".join(str(opt) for opt in (resources.get("options") or []))
        load_cmd = (
            "{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
            "--tempdir {tmpdir} {tx_gemini_db}").format(
                gemini=gemini, gemini_opts=gemini_opts, load_opts=load_opts,
                gemini_vcf=gemini_vcf, eanns=eanns, num_cores=num_cores,
                tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
Пример #41
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with Ensembl VEP using configurable plugins.

    For human genomes the plugins listed under
    ``config -> resources -> vep -> plugins`` (default ``dbnsfp`` and
    ``loftee``) contribute arguments and fields, and SIFT/PolyPhen plus
    HGVS output are enabled. ``--pick`` is added when the effects
    transcripts setting starts with ``canonical`` or clinical reporting is
    on.

    Args:
        in_file: input VCF path, required to end with ``.gz``.
        data: sample dictionary with configuration and genome details.

    Returns:
        Path of the ``-vepeffects`` VCF, or None when the input has no
        variants or no annotated file exists after the run.

    Raises:
        KeyError: if a configured plugin name is not one of the known
            plugin handlers.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args = []
                config_fields = []
                prediction_fields = []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee, "dbscsnv": _get_dbscsnv,
                                  "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer}
                    enabled = tz.get_in(("config", "resources", "vep", "plugins"), data,
                                        ["dbnsfp", "loftee"])
                    for plugin_name in enabled:
                        extra_args_for_plugin, extra_fields_for_plugin = plugin_fns[plugin_name](data)
                        config_args += extra_args_for_plugin
                        config_fields += extra_fields_for_plugin
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # HGVS is on by default; it requires the reference
                    # genome to be indexed.
                    config_args += ["--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                wants_pick = (dd.get_effects_transcripts(data).startswith("canonical")
                              or tz.get_in(("config", "algorithm", "clinical_reporting"), data))
                if wants_pick:
                    config_args += ["--pick"]
                std_fields = (["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON"]
                              + prediction_fields
                              + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"])
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                core_args = ["--species", ensembl_name,
                             "--no_stats",
                             "--cache", "--offline", "--dir", vep_dir,
                             "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                             "--gene_phenotype", "--ccds",
                             "--fields", ",".join(std_fields + config_fields)]
                vep_args = ([vep, "--vcf", "-o", "stdout", "-i", in_file]
                            + fork_args + extra_args + core_args + config_args)
                perl_exports = utils.get_perl_exports()
                # sed strips empty fields (';;') from record lines; these
                # can cause parsing errors downstream.
                shell_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_args), tx_out_file)
                do.run(shell_cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Пример #42
0
def run(bam_file, data, out_dir):
    """Summarize damage filtering for the sample's active variant calls.

    Runs ``dkfzbiasfilter_summarize.py`` on the active variant file and
    writes ``<sample>-damage.yaml`` into ``out_dir`` if it is not there yet.

    Args:
        bam_file: unused by this function.
        data: sample dictionary.
        out_dir: directory for the summary file; created if needed.

    Returns:
        ``{"base": <yaml path>}`` when the summary file exists, otherwise
        an empty dict (no active calls, no variants, or no output).
    """
    vcinfo = variant.get_active_vcinfo(data)
    if not vcinfo or not vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        return {}
    work_dir = utils.safe_makedir(out_dir)
    out_file = os.path.join(work_dir, "%s-damage.yaml" % (dd.get_sample_name(data)))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            summarize_cmd = [
                "dkfzbiasfilter_summarize.py",
                "--sample=%s" % dd.get_sample_name(data),
                "--outfile=%s" % tx_out_file,
                vcinfo["vrn_file"],
            ]
            do.run(summarize_cmd, "Summarize damage filtering")
    return {"base": out_file} if utils.file_exists(out_file) else {}
Пример #43
0
def to_single(in_file, data, passonly=False):
    """Split multi-allelic records of a VCF into single-allele records.

    Args:
        in_file: input VCF path.
        data: sample dictionary; ``data["config"]`` is used for indexing.
        passonly: forwarded to ``_decompose``.

    Returns:
        Result of ``vcfutils.bgzip_and_index`` on the prepared file. When
        the ``-nomultiallelic`` file is missing and the input has variants,
        that is the decomposed (and, if available, effect-annotated) file;
        without variants it is a symlink to the input.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
    if not vcfutils.vcf_has_variants(in_file):
        # Nothing to decompose: expose the input under the output name.
        utils.symlink_plus(in_file, out_file)
    else:
        decomposed = _decompose(in_file, data, passonly=passonly)
        annotated, _ = effects.add_to_vcf(decomposed, data)
        # Prefer the re-annotated file when effects annotation produced one.
        out_file = annotated or decomposed
    return vcfutils.bgzip_and_index(out_file, data["config"])
Пример #44
0
def snpeff_effects(data):
    """Annotate the sample's VCF with effects computed by snpEff.

    Args:
        data: sample dictionary; ``vrn_file`` is the input VCF and
            ``config -> algorithm -> variant_regions`` an optional
            interval file.

    Returns:
        The value returned by ``_run_snpeff``, or None when the input VCF
        has no variants.
    """
    vcf_in = data["vrn_file"]
    interval_file = data["config"]["algorithm"].get("variant_regions", None)
    if not vcfutils.vcf_has_variants(vcf_in):
        return None
    se_interval = None
    if interval_file:
        se_interval = _convert_to_snpeff_interval(interval_file, vcf_in)
    try:
        return _run_snpeff(vcf_in, se_interval, "vcf", data)
    finally:
        # The converted interval file is temporary; remove it even when
        # snpEff fails.
        if se_interval and os.path.exists(se_interval):
            os.remove(se_interval)
Пример #45
0
def _get_build_type(fnames, samples, caller):
    """Decide which gemini database build types apply to these samples.

    A build is only considered when at least one input VCF has variants
    and the caller is not listed in ``NO_DB_CALLERS``. Each sample must
    have one of the gemini-related entries in its ``tools_on`` setting and
    pass ``vcfanno.annoated_gemini``.

    Args:
        fnames: VCF file paths to check for variants.
        samples: sample dictionaries.
        caller: variant caller name.

    Returns:
        Set containing ``"gemini_orig"`` and/or ``"gemini"``; empty when no
        database should be built.
    """
    build_type = set()
    if not any(vcfutils.vcf_has_variants(f) for f in fnames) or caller in NO_DB_CALLERS:
        return build_type
    db_tools = ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]
    for data in samples:
        tools_on = dd.get_tools_on(data)
        if not any(tool in tools_on for tool in db_tools):
            continue
        if not vcfanno.annoated_gemini(data):
            continue
        if "gemini_orig" in tools_on:
            build_type.add("gemini_orig")
        else:
            build_type.add("gemini")
    return build_type
Пример #46
0
def to_single(in_file, data, passonly=False):
    """Convert multi-allelic VCF records into single-allele records.

    Args:
        in_file: input VCF path.
        data: sample dictionary; ``data["config"]`` is passed to indexing.
        passonly: forwarded to ``_decompose``.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for the final file:
        the existing ``-nomultiallelic`` file, the decomposed/annotated
        file, or a symlink to the input when it has no variants.
    """
    target = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    already_done = utils.file_exists(target)
    if not already_done and vcfutils.vcf_has_variants(in_file):
        single_file = _decompose(in_file, data, passonly=passonly)
        with_effects, _ = effects.add_to_vcf(single_file, data)
        # Use the effects-annotated version when one was produced.
        target = with_effects if with_effects else single_file
    elif not already_done:
        # No variants: link the input to the expected output name.
        utils.symlink_plus(in_file, target)
    return vcfutils.bgzip_and_index(target, data["config"])
Пример #47
0
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a multi-sample VCF and, when enabled, a gemini database.

    Args:
        fnames: per-sample VCF paths to combine.
        call_info: ``(name, caller, is_batch)`` tuple; the third element is
            not used here.
        samples: sample dictionaries; the first supplies directories.
        extras: additional sample dictionaries included in the PED file.

    Returns:
        ``[[(name, caller), info]]`` where ``info`` holds ``db`` (database
        path or None if it does not exist), ``vcf`` (the prepared VCF) and
        ``decomposed`` (whether the gemini build path was taken).
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, _ = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        # Only keep passing variants unless some sample asks for all of them.
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(gemini_vcf, data, passonly=passonly)
    gemini_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if vcfutils.vcf_has_variants(gemini_vcf) and not utils.file_exists(gemini_db) and use_gemini:
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        use_orig = support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples)
        loader = create_gemini_db_orig if use_orig else create_gemini_db
        gemini_db = loader(gemini_vcf, data, gemini_db, ped_file)
    db_result = gemini_db if utils.file_exists(gemini_db) else None
    info = {"db": db_result,
            "vcf": gemini_vcf,
            "decomposed": use_gemini}
    return [[(name, caller), info]]
Пример #48
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Create a GEMINI database through the vcfanno + vcf2db workflow.

    Args:
        gemini_vcf: VCF to annotate and load.
        data: sample dictionary.
        gemini_db: target database path; defaults to the VCF path with its
            extension replaced by ``.db``.
        ped_file: PED file handed to ``vcf2db.py``.

    Returns:
        The database path, or None when the VCF has no variants.
    """
    gemini_db = gemini_db or "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db
    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    ann_file = vcfanno.run_vcfanno(gemini_vcf, "gemini", data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        # NOTE(review): ped_file defaults to None but is placed directly in
        # the command; confirm callers always supply one.
        load_cmd = [vcf2db, ann_file, ped_file, tx_gemini_db]
        do.run(load_cmd, "GEMINI: create database with vcf2db")
    return gemini_db
Пример #49
0
def run(bam_file, data, out_dir):
    """Summarize damage filtering for the sample's non-ensemble calls.

    Args:
        bam_file: unused by this function.
        data: sample dictionary.
        out_dir: directory for ``<sample>-damage.yaml``; created if needed.

    Returns:
        ``{"base": <yaml path>}`` if the summary file exists, else ``{}``.
    """
    vcinfo = variant.get_active_vcinfo(data, use_ensemble=False)
    summarizer = config_utils.get_program("dkfzbiasfilter_summarize.py", data)
    if not (vcinfo and vcfutils.vcf_has_variants(vcinfo["vrn_file"])):
        return {}
    yaml_name = "%s-damage.yaml" % (dd.get_sample_name(data))
    out_file = os.path.join(utils.safe_makedir(out_dir), yaml_name)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            sample_arg = "--sample=%s" % dd.get_sample_name(data)
            outfile_arg = "--outfile=%s" % tx_out_file
            do.run([summarizer, sample_arg, outfile_arg, vcinfo["vrn_file"]],
                   "Summarize damage filtering")
    result = {}
    if utils.file_exists(out_file):
        result["base"] = out_file
    return result
Пример #50
0
def to_single(in_file, data):
    """Convert multi-allelic VCF records into single-allele records.

    The input is split into bi-allelic and multi-allelic parts; the
    multi-allelic part is decomposed, re-annotated with effects when
    possible, and merged back with the bi-allelic part.

    Args:
        in_file: input VCF path.
        data: sample dictionary; ``data["config"]`` is passed to indexing.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for the final file.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
    ba_file, ma_file = _split_mulitallelic(in_file, data)
    if not vcfutils.vcf_has_variants(ma_file):
        # No multi-allelic records: the input already is the answer.
        utils.symlink_plus(in_file, out_file)
    else:
        decomposed = _decompose(ma_file, data)
        annotated = effects.add_to_vcf(decomposed, data)
        single_file = annotated if annotated else decomposed
        out_file = vcfutils.merge_sorted([single_file, ba_file], out_file, data)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Пример #51
0
def _add_vcf_header_sample_cl(in_file, items, base_file):
    """Build a shell snippet that adds phenotype lines to a VCF header.

    Tumor/normal relationships are encoded as ``##SAMPLE`` and
    ``##PEDIGREE`` header lines. More complicated pedigree information
    could eventually be handled here as well.

    Args:
        in_file: VCF whose header is updated.
        items: sample dictionaries used to find the tumor/normal pair.
        base_file: passed through to ``_update_header``.

    Returns:
        A ``bcftools reheader ... | bcftools view `` command fragment, or
        None when the samples are not paired or the VCF has no variants.
    """
    paired = vcfutils.get_paired(items)
    if not paired:
        return None
    header_lines = ["##SAMPLE=<ID=%s,Genomes=Tumor>" % paired.tumor_name]
    if paired.normal_name:
        header_lines += [
            "##SAMPLE=<ID=%s,Genomes=Germline>" % paired.normal_name,
            "##PEDIGREE=<Derived=%s,Original=%s>" % (paired.tumor_name, paired.normal_name),
        ]
    # The header file is prepared regardless of whether variants exist.
    new_header = _update_header(in_file, base_file, header_lines, _fix_generic_tn_names(paired))
    if not vcfutils.vcf_has_variants(in_file):
        return None
    return "bcftools reheader -h {new_header} | bcftools view ".format(new_header=new_header)
Пример #52
0
def _add_vcf_header_sample_cl(in_file, items, base_file):
    """Return a command fragment that re-headers a VCF with phenotype info.

    Encodes the tumor sample, and the germline sample plus their pedigree
    relation when a normal is present, as extra VCF header lines.

    Args:
        in_file: VCF whose header is updated.
        items: sample dictionaries used to find the tumor/normal pair.
        base_file: passed through to ``_update_header``.

    Returns:
        Shell fragment piping ``bcftools reheader`` into ``bcftools view``,
        or None if there is no tumor/normal pairing or no variants.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        tumor = paired.tumor_name
        normal = paired.normal_name
        toadd = ["##SAMPLE=<ID=%s,Genomes=Tumor>" % tumor]
        if normal:
            toadd.append("##SAMPLE=<ID=%s,Genomes=Germline>" % normal)
            toadd.append("##PEDIGREE=<Derived=%s,Original=%s>" % (tumor, normal))
        name_fixes = _fix_generic_tn_names(paired)
        header_file = _update_header(in_file, base_file, toadd, name_fixes)
        if vcfutils.vcf_has_variants(in_file):
            template = "bcftools reheader -h {new_header} | bcftools view "
            return template.format(new_header=header_file)
    return None
Пример #53
0
def snpeff_effects(data):
    """Annotate the sample's VCF with snpEff effect predictions.

    Args:
        data: sample dictionary; ``vrn_file`` is the input VCF and
            ``config -> algorithm -> hybrid_target`` an optional interval
            file.

    Returns:
        The value returned by ``_run_snpeff``, or None when the input VCF
        has no variants.
    """
    vcf_in = data["vrn_file"]
    interval_file = data["config"]["algorithm"].get("hybrid_target", None)
    if not vcfutils.vcf_has_variants(vcf_in):
        return None
    if interval_file:
        se_interval = _convert_to_snpeff_interval(interval_file, vcf_in)
    else:
        se_interval = None
    try:
        annotated = _run_snpeff(vcf_in, _get_snpeff_genome(data),
                                se_interval, "vcf", data)
    finally:
        # Clean up the temporary snpEff interval file on success or failure.
        temp_files = [f for f in [se_interval] if f and os.path.exists(f)]
        for temp_file in temp_files:
            os.remove(temp_file)
    return annotated
Пример #54
0
def _run_svtyper(in_file, full_bam, sr_bam, data):
    """Genotype structural variant calls with SVtyper.

    Args:
        in_file: bgzipped VCF of structural variant calls.
        full_bam: BAM passed to svtyper with ``-B``.
        sr_bam: split-read BAM passed to svtyper with ``-S``.
        data: sample dictionary.

    Returns:
        Result of ``vcfutils.sort_by_ref`` on the ``-wgts.vcf.gz`` output.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Nothing to genotype: pass the input through. Write to the
                # transactional path so the final file only appears once the
                # copy has completed, like the svtyper branch below.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                # svtyper is expected alongside the running interpreter.
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                cmd = ("gunzip -c {in_file} | "
                       "{python} {svtyper} -B {full_bam} -S {sr_bam} | "
                       "bgzip -c > {tx_out_file}")
                cmd = cmd.format(in_file=in_file, python=python, svtyper=svtyper,
                                 full_bam=full_bam, sr_bam=sr_bam,
                                 tx_out_file=tx_out_file)
                do.run(cmd, "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Пример #55
0
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.

    Args:
        gemini_vcf: path of the VCF to load.
        data: sample dictionary; ``data["config"]`` supplies program
            locations, resources and ``num_cores``.
        gemini_db: target database path; defaults to the VCF path with its
            extension replaced by ``.db``.
        ped_file: optional PED file added with ``gemini amend --sample``.

    Returns:
        The database path, or None when the database is missing and the
        VCF contains no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        opt_parts = []
        # Load only passing variants unless all variants were requested.
        if "gemini_allvariants" not in dd.get_tools_on(data):
            opt_parts.append(" --passonly")
        # Gene table loading is slow, so skip it for small test files.
        if _is_small_vcf(gemini_vcf):
            opt_parts.append(" --skip-gene-tables")
        if "/test_automated_output/" in gemini_vcf:
            opt_parts.append(" --test-mode")
        # Skip optional annotations whose data files are not installed.
        gemini_dir = install.get_gemini_dir(data)
        optional_data = [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]
        for skip_flag, required_file in optional_data:
            if not os.path.exists(os.path.join(gemini_dir, required_file)):
                opt_parts.append(" %s" % skip_flag)
        # gerp-bp slows down loading, always skip it.
        opt_parts.append(" --skip-gerp-bp ")
        load_opts = "".join(opt_parts)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Custom resource options allow e.g. an alternative annotation_dir.
        resources = config_utils.get_resources("gemini", data["config"])
        gemini_opts = " ".join(str(opt) for opt in (resources.get("options") or []))
        exports = utils.local_path_export()
        load_cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                    "-v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}").format(
                        exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                        load_opts=load_opts, gemini_vcf=gemini_vcf, eanns=eanns,
                        num_cores=num_cores, tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
Пример #56
0
def subset_by_supported(input_file,
                        get_coords,
                        calls_by_name,
                        work_dir,
                        data,
                        headers=("#", )):
    """Limit CNVkit input to calls with support from another caller.

    Args:
        input_file: file with one call per line.
        get_coords: function returning ``(chrom, start, end)`` for a line
            of ``input_file``, allowing multiple input file types. Its
            result is compared against string triples built from the
            bedtools intersection.
        calls_by_name: mapping of caller name to call information; the
            ``vrn_file`` entry of each supporting caller is used.
        work_dir: directory for the filtered output.
        data: sample dictionary.
        headers: line prefixes that are always kept.

    Returns:
        ``input_file`` unchanged when no supporting caller has variants,
        otherwise the path of the ``-havesupport`` filtered file.
    """
    support_files = []
    for caller in convert.SUBSET_BY_SUPPORT["cnvkit"]:
        vrn_file = tz.get_in([caller, "vrn_file"], calls_by_name)
        if vrn_file and vcfutils.vcf_has_variants(vrn_file):
            support_files.append((caller, vrn_file))
    if not support_files:
        return input_file

    out_name = "%s-havesupport%s" % utils.splitext_plus(os.path.basename(input_file))
    out_file = os.path.join(work_dir, out_name)
    if utils.file_uptodate(out_file, input_file):
        return out_file

    input_bed = _input_to_bed(input_file, work_dir, get_coords, headers)
    with file_transaction(data, out_file) as tx_out_file:
        support_beds = " ".join(
            _sv_vcf_to_bed(vrn_file, caller, out_file) for caller, vrn_file in support_files)
        tmp_cmp_bed = "%s-intersectwith.bed" % utils.splitext_plus(tx_out_file)[0]
        cmd = "bedtools intersect -wa -f 0.5 -r -a {input_bed} -b {support_beds} > {tmp_cmp_bed}"
        do.run(cmd.format(input_bed=input_bed, support_beds=support_beds,
                          tmp_cmp_bed=tmp_cmp_bed),
               "Intersect CNVs with support files")
        # Coordinates of input calls that overlap a supporting call.
        pass_coords = {(str(rec.chrom), str(rec.start), str(rec.stop))
                       for rec in pybedtools.BedTool(tmp_cmp_bed)}
        with open(input_file) as in_handle, open(tx_out_file, "w") as out_handle:
            for line in in_handle:
                # Header lines always pass; calls need a supported coordinate.
                if line.startswith(headers) or get_coords(line) in pass_coords:
                    out_handle.write(line)
    return out_file