Exemplo n.º 1
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Exemplo n.º 2
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])