Пример #1
0
def run_vcfanno(vcf_file, data, decomposed=False):
    """Run vcfanno, providing annotations from external databases.

    Configured vcfanno files are grouped by the base path they need: entries
    containing "gemini" on a human build 37 genome use the GEMINI data
    directory, everything else uses None. Without configured files the
    groups come from _default_conf_files. Each group is run in turn, with the
    output of a successful run becoming the input of the next one.

    Returns the last annotated file produced, or None if nothing was produced.
    """
    configured = dd.get_vcfanno(data)
    if not configured:
        grouped = _default_conf_files(data)
    else:
        by_basepath = collections.defaultdict(list)
        if not isinstance(configured, (list, tuple)):
            configured = [configured]
        for conf in configured:
            use_gemini_dir = "gemini" in conf and is_human(data, builds=["37"])
            basepath = install.get_gemini_dir(data) if use_gemini_dir else None
            by_basepath[basepath].append(conf)
        grouped = by_basepath.items()
    latest = None
    if not grouped:
        return latest
    for basepath, group in grouped:
        annotated = vcfanno.run_vcfanno(vcf_file, group, data,
                                        data_basepath=basepath,
                                        decomposed=decomposed)
        if annotated:
            # Chain annotations: the next group starts from this output
            latest = vcf_file = annotated
    return latest
Пример #2
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.

    The database name defaults to the VCF name with a ".db" extension.
    Returns None when the VCF has no variants, otherwise the database path;
    an existing database is reused without re-running annotation or loading.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db
    if support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    else:
        basepath = None
    # Fall back to the "gemini" configuration when none is set for the sample
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    annotated = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        expand_args = []
        if "vcf2db_expand" in dd.get_tools_on(data):
            for column in ("gt_types", "gt_ref_depths", "gt_alt_depths"):
                expand_args += ["--expand", column]
        cmd = [vcf2db, annotated, ped_file, tx_gemini_db] + expand_args
        do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
Пример #3
0
def _has_gemini(data):
    """Check for a non-empty GEMINI data directory with a sibling gemini-config.yaml.
    """
    from bcbio import install
    gemini_dir = install.get_gemini_dir(data)
    if not os.path.exists(gemini_dir):
        return False
    if len(os.listdir(gemini_dir)) == 0:
        return False
    config_file = os.path.join(os.path.dirname(gemini_dir), "gemini-config.yaml")
    return os.path.exists(config_file)
Пример #4
0
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    genome_build -- build name, used as the directory below out_dir and in the
      S3 location template.
    data -- sample dictionary; the work directory, "genome_build" and "files"
      are read, and "genome_resources" is set when an index is built in place.
    name -- target to prepare, expanded through REMAP_NAMES when listed there.
    need_remap -- when true (or when name is "samtools") the FASTA file in the
      "seq" directory is returned instead of the target's index base name.
    out_dir -- base directory for genomes; defaults to inputs/data/genomes
      inside the work directory.

    Returns the path to the reference FASTA, or the common prefix of the files
    in the final target directory.
    Raises ValueError when objectstore.connect fails for the genome archive.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                # Catch Exception rather than everything so that KeyboardInterrupt
                # and SystemExit are not reported as a missing genome
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    # Local variable names are part of the interface here: the
                    # command template is filled in from locals()
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    # ref_dir is the directory of the last target handled by the loop above
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    # The directory keeps any "-test" suffix; the FASTA file name does not
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        # Drop trailing dots left over from the shared prefix of index files
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Пример #5
0
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    genome_build -- build name, used as the directory below out_dir and in the
      S3 location template.
    data -- sample dictionary; the work directory, "genome_build" and "files"
      are read, and "genome_resources" is set when an index is built in place.
    name -- target to prepare, expanded through REMAP_NAMES when listed there.
    need_remap -- when true (or when name is "samtools") the FASTA file in the
      "seq" directory is returned instead of the target's index base name.
    out_dir -- base directory for genomes; defaults to inputs/data/genomes
      inside the work directory.

    Returns the path to the reference FASTA, or the common prefix of the files
    in the final target directory.
    Raises ValueError when objectstore.connect fails for the genome archive.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                # Catch Exception rather than everything so that KeyboardInterrupt
                # and SystemExit are not reported as a missing genome
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    # Local variable names are part of the interface here: the
                    # command template is filled in from locals()
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    # ref_dir is the directory of the last target handled by the loop above
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
              and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    # The directory keeps any "-test" suffix; the FASTA file name does not
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        # Drop trailing dots left over from the shared prefix of index files
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Пример #6
0
def _default_conf_files(data):
    """Group default vcfanno configurations by the base path they are run with.

    "gemini" is added when _has_gemini_data is true, keyed by the GEMINI data
    directory for human build 37 and by None otherwise. "somatic" is added under
    None when _annotate_somatic is true. Returns (basepath, names) pairs.
    """
    by_basepath = collections.defaultdict(list)
    if _has_gemini_data(data):
        if is_human(data, builds=["37"]):
            basepath = install.get_gemini_dir(data)
        else:
            basepath = None
        by_basepath[basepath].append("gemini")
    if _annotate_somatic(data):
        by_basepath[None].append("somatic")
    return by_basepath.items()
Пример #7
0
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database is written to <work>/gemini/<name>-<caller>.db with
    `gemini load`. It is only built when the file is missing, both
    do_db_build checks pass and at least one input VCF has variants.

    Returns a single-item list pairing (name, caller) with a dictionary of the
    database path ("db", None when no database file exists) and the multisample
    VCF ("vcf", None unless is_batch is set).
    """
    data = samples[0]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(work_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # First pass uses check_gemini=False; the default do_db_build check is only
    # made once the database is known to be missing
    quick_ok = (do_db_build(samples, check_gemini=False)
                and any(vcfutils.vcf_has_variants(f) for f in fnames))
    should_build = (not utils.file_exists(gemini_db) and quick_ok
                    and do_db_build(samples)
                    and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if should_build:
        with file_transaction(data, gemini_db) as tx_gemini_db:
            config = data["config"]
            gemini = config_utils.get_program("gemini", config)
            gemini_ver = (programs.get_version("gemini", config=config)
                          if "program_versions" in config.get("resources", {})
                          else None)
            version = LooseVersion(gemini_ver) if gemini_ver else None
            opts = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                opts.append(" --passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    opts.append(" --skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    opts.append(" --test-mode")
            # Skip CADD when its data file is not installed
            if version is not None and version >= LooseVersion("0.7.0"):
                cadd_file = os.path.join(install.get_gemini_dir(),
                                         "whole_genome_SNVs.tsv.compressed.gz")
                if not os.path.exists(cadd_file):
                    opts.append(" --skip-cadd")
            # skip gerp-bp which slows down loading
            opts.append(" --skip-gerp-bp ")
            num_cores = config["algorithm"].get("num_cores", 1)
            effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            eanns = "snpEff" if effects == "snpeff" else "VEP"
            cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, load_opts="".join(opts),
                             gemini_vcf=gemini_vcf, eanns=eanns,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    outputs = {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": gemini_vcf if is_batch else None,
    }
    return [[(name, caller), outputs]]
Пример #8
0
def _run_vcfanno(gemini_vcf, data, use_gemini=False):
    """Annotate a VCF with the configured vcfanno files.

    Falls back to the "gemini" configuration when nothing is configured and
    use_gemini is set. Returns the input VCF unchanged when there is nothing
    to run, otherwise the result of vcfanno.run_vcfanno.
    """
    if support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    else:
        basepath = None
    conf_files = dd.get_vcfanno(data)
    if use_gemini and not conf_files:
        conf_files = ["gemini"]
    if not conf_files:
        return gemini_vcf
    return vcfanno.run_vcfanno(gemini_vcf, conf_files, data, basepath)
Пример #9
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load a VCF into a GEMINI database with `gemini load`.

    The database name defaults to the VCF name with a ".db" extension. An
    existing database is returned as is; None is returned when the database is
    missing and the VCF has no variants. When ped_file is given it is attached
    to the new database with `gemini amend --sample`.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        gemini_ver = (programs.get_version("gemini", config=config)
                      if "program_versions" in config.get("resources", {})
                      else None)
        version = LooseVersion(gemini_ver) if gemini_ver else None
        opts = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            opts.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                opts.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                opts.append(" --test-mode")
        # Skip CADD when its data file is not installed
        if version is not None and version >= LooseVersion("0.7.0"):
            cadd_file = os.path.join(install.get_gemini_dir(data),
                                     "whole_genome_SNVs.tsv.compressed.gz")
            if not os.path.exists(cadd_file):
                opts.append(" --skip-cadd")
        # skip gerp-bp which slows down loading
        opts.append(" --skip-gerp-bp ")
        num_cores = config["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", config)
        gemini_opts = " ".join(str(x) for x in (resources.get("options") or []))
        cmd = (
            "{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
            "--tempdir {tmpdir} {tx_gemini_db}")
        cmd = cmd.format(gemini=gemini, gemini_opts=gemini_opts,
                         load_opts="".join(opts), gemini_vcf=gemini_vcf,
                         eanns=eanns, num_cores=num_cores, tmpdir=tmpdir,
                         tx_gemini_db=tx_gemini_db)
        do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            do.run([gemini, "amend", "--sample", ped_file, tx_gemini_db],
                   "Add PED file to gemini database", data)
    return gemini_db
Пример #10
0
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.

    Returns the original VCF when prioritization is not requested, GEMINI data
    is not available or annotation produces no file; otherwise the result of
    applying the priority filter.
    """
    if not _do_prioritize(orig_items):
        return vcf_file
    if not population.has_gemini_data(data):
        # No GEMINI database for filtering, return original file
        return vcf_file
    basepath = None
    if population.support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    ann_vcf = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, basepath)
    if not ann_vcf:
        return vcf_file
    priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
    return _apply_priority_filter(vcf_file, priority_file, data)
Пример #11
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.

    The VCF is annotated with the "gemini" vcfanno configuration and loaded with
    vcf2db.py. Returns None when the VCF has no variants, otherwise the database
    path; an existing database is reused without re-running anything.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db
    basepath = None
    if support_gemini_orig(data):
        basepath = install.get_gemini_dir(data)
    annotated = vcfanno.run_vcfanno(gemini_vcf, "gemini", data, basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        do.run([vcf2db, annotated, ped_file, tx_gemini_db],
               "GEMINI: create database with vcf2db")
    return gemini_db
Пример #12
0
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Builds <work>/gemini/<name>-<caller>.db with `gemini load` when the file is
    missing, both do_db_build checks pass and some input VCF has variants.

    Returns [[(name, caller), {"db": ..., "vcf": ...}]] where "db" is None if no
    database file exists and "vcf" is None unless is_batch is set.
    """
    def _inputs_have_variants():
        return any(vcfutils.vcf_has_variants(f) for f in fnames)

    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = do_db_build(samples, check_gemini=False) and _inputs_have_variants()
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        if do_db_build(samples) and _inputs_have_variants():
            with file_transaction(data, gemini_db) as tx_gemini_db:
                fields = {"gemini_vcf": gemini_vcf, "tx_gemini_db": tx_gemini_db}
                fields["gemini"] = config_utils.get_program("gemini", data["config"])
                gemini_ver = None
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                load_opts = ""
                # Recent versions of gemini allow loading only passing variants
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                # Skip CADD or gerp-bp when their data files are not installed
                if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
                    gemini_dir = install.get_gemini_dir()
                    optional_data = (
                        ("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),
                        ("--skip-gerp-bp", "hg19.gerp.bw"),
                    )
                    missing = [flag for flag, check_file in optional_data
                               if not os.path.exists(os.path.join(gemini_dir, check_file))]
                    load_opts += "".join(" %s" % flag for flag in missing)
                fields["load_opts"] = load_opts
                fields["num_cores"] = data["config"]["algorithm"].get("num_cores", 1)
                effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
                fields["eanns"] = "snpEff" if effects == "snpeff" else "VEP"
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
                do.run(cmd.format(**fields), "Create gemini database for %s %s" % (name, caller), data)
    final_db = gemini_db if utils.file_exists(gemini_db) else None
    final_vcf = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": final_db, "vcf": final_vcf}]]
Пример #13
0
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.

    The input is annotated with the "gemini" vcfanno configuration and passed
    through the priority filter. The original file is returned when
    prioritization is off, GEMINI data is missing or annotation yields nothing.
    """
    if _do_prioritize(orig_items) and population.has_gemini_data(data):
        if population.support_gemini_orig(data):
            gemini_basepath = install.get_gemini_dir(data)
        else:
            gemini_basepath = None
        annotated = vcfanno.run_vcfanno(vcf_file, ["gemini"], data,
                                        gemini_basepath)
        if annotated:
            return _apply_priority_filter(
                vcf_file, _prep_priority_filter_vcfanno(annotated, data), data)
    # Nothing to filter with, return original file
    return vcf_file
Пример #14
0
def _back_compatible_gemini(conf_files, data):
    """Provide old install directory for configuration with GEMINI supplied tidy VCFs.

    Handles new style (bcbio installed) and old style (GEMINI installed)
    configuration and data locations.

    Only human build 37 data is considered. Existing files named gemini.conf
    are scanned for lines starting with "file"; the GEMINI directory is
    returned as soon as one of them names a file with ".tidy." after its first
    character. Returns None otherwise.
    """
    if not vcfanno.is_human(data, builds=["37"]):
        return None
    for conf in conf_files:
        is_gemini_conf = (conf and os.path.basename(conf) == "gemini.conf"
                          and os.path.exists(conf))
        if not is_gemini_conf:
            continue
        with open(conf) as in_handle:
            for line in in_handle:
                if not line.startswith("file"):
                    continue
                # Value after the last "=", without quotes or padding
                fname = line.strip().rpartition("=")[2].replace('"', '').strip()
                if fname.find(".tidy.") > 0:
                    return install.get_gemini_dir(data)
    return None
Пример #15
0
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.

    The database name defaults to the VCF name with a ".db" extension. An
    existing database is returned as is; None is returned when the database is
    missing and the VCF has no variants. When ped_file is given it is attached
    to the new database with `gemini amend --sample`.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        flags = []
        # Load only passing variants unless gemini_allvariants is turned on
        if "gemini_allvariants" not in dd.get_tools_on(data):
            flags.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if _is_small_vcf(gemini_vcf):
            flags.append(" --skip-gene-tables")
        if "/test_automated_output/" in gemini_vcf:
            flags.append(" --test-mode")
        # Skip CADD when its data file is not installed
        cadd_file = os.path.join(install.get_gemini_dir(data),
                                 "whole_genome_SNVs.tsv.compressed.gz")
        if not os.path.exists(cadd_file):
            flags.append(" --skip-cadd")
        # skip gerp-bp which slows down loading
        flags.append(" --skip-gerp-bp ")
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", data["config"])
        gemini_opts = " ".join(str(x) for x in (resources.get("options") or []))
        exports = utils.local_path_export()
        cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
               "-v {gemini_vcf} {eanns} --cores {num_cores} "
               "--tempdir {tmpdir} {tx_gemini_db}")
        cmd = cmd.format(exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                         load_opts="".join(flags), gemini_vcf=gemini_vcf,
                         eanns=eanns, num_cores=num_cores, tmpdir=tmpdir,
                         tx_gemini_db=tx_gemini_db)
        do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            do.run([gemini, "amend", "--sample", ped_file, tx_gemini_db],
                   "Add PED file to gemini database", data)
    return gemini_db
Пример #16
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Create a GEMINI database from a VCF using `gemini load`.

    Uses <vcf base name>.db when gemini_db is not given. Nothing is run when the
    database already exists. Returns None if the database is missing and the VCF
    has no variants, otherwise the database path. A ped_file, when supplied, is
    added through `gemini amend --sample`.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        gemini_ver = None
        if "program_versions" in data["config"].get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=data["config"])
        load_opts = ""
        # Recent versions of gemini allow loading only passing variants
        if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
            load_opts += " --passonly"
        # For small test files, skip gene table loading which takes a long time
        if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                load_opts += " --skip-gene-tables"
            if "/test_automated_output/" in gemini_vcf:
                load_opts += " --test-mode"
        # Skip CADD when its data file is not installed
        if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            optional_data = (("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),)
            for flag, required_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, required_file)):
                    load_opts += " %s" % flag
        # skip gerp-bp which slows down loading
        load_opts += " --skip-gerp-bp "
        fields = {"gemini": gemini, "load_opts": load_opts,
                  "gemini_vcf": gemini_vcf, "tx_gemini_db": tx_gemini_db}
        fields["num_cores"] = data["config"]["algorithm"].get("num_cores", 1)
        fields["tmpdir"] = os.path.dirname(tx_gemini_db)
        fields["eanns"] = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", data["config"])
        if resources.get("options"):
            fields["gemini_opts"] = " ".join([str(x) for x in resources["options"]])
        else:
            fields["gemini_opts"] = ""
        cmd = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
               "--tempdir {tmpdir} {tx_gemini_db}")
        do.run(cmd.format(**fields), "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
Пример #17
0
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Load a VCF into a GEMINI database with `gemini load`.

    The database name defaults to the VCF name with a ".db" extension. An
    existing database is returned as is; None is returned when the database is
    missing and the VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini = config_utils.get_program("gemini", config)
        gemini_ver = (programs.get_version("gemini", config=config)
                      if "program_versions" in config.get("resources", {})
                      else None)
        version = LooseVersion(gemini_ver) if gemini_ver else None
        opts = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            opts.append(" --passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                opts.append(" --skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                opts.append(" --test-mode")
        # Skip CADD when its data file is not installed
        if version is not None and version >= LooseVersion("0.7.0"):
            cadd_file = os.path.join(install.get_gemini_dir(),
                                     "whole_genome_SNVs.tsv.compressed.gz")
            if not os.path.exists(cadd_file):
                opts.append(" --skip-cadd")
        # skip gerp-bp which slows down loading
        opts.append(" --skip-gerp-bp ")
        num_cores = config["algorithm"].get("num_cores", 1)
        effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        eanns = "snpEff" if effects == "snpeff" else "VEP"
        cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        cmd = cmd.format(gemini=gemini, load_opts="".join(opts),
                         gemini_vcf=gemini_vcf, eanns=eanns,
                         num_cores=num_cores, tx_gemini_db=tx_gemini_db)
        do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db
Пример #18
0
def create_gemini_db_orig(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Original GEMINI specific data loader, only works with hg19/GRCh37.

    Uses <vcf base name>.db when gemini_db is not given. Nothing is run when the
    database already exists. Returns None if the database is missing and the VCF
    has no variants, otherwise the database path. A ped_file, when supplied, is
    added through `gemini amend --sample`.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        # Load only passing variants unless gemini_allvariants is turned on
        load_opts = "" if "gemini_allvariants" in dd.get_tools_on(data) else " --passonly"
        # For small test files, skip gene table loading which takes a long time
        if _is_small_vcf(gemini_vcf):
            load_opts += " --skip-gene-tables"
        if "/test_automated_output/" in gemini_vcf:
            load_opts += " --test-mode"
        # Skip CADD when its data file is not installed
        gemini_dir = install.get_gemini_dir(data)
        optional_data = (("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),)
        for flag, required_file in optional_data:
            if not os.path.exists(os.path.join(gemini_dir, required_file)):
                load_opts += " %s" % flag
        # skip gerp-bp which slows down loading
        load_opts += " --skip-gerp-bp "
        fields = {"gemini": gemini, "load_opts": load_opts,
                  "gemini_vcf": gemini_vcf, "tx_gemini_db": tx_gemini_db}
        fields["num_cores"] = data["config"]["algorithm"].get("num_cores", 1)
        fields["tmpdir"] = os.path.dirname(tx_gemini_db)
        fields["eanns"] = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", data["config"])
        if resources.get("options"):
            fields["gemini_opts"] = " ".join([str(x) for x in resources["options"]])
        else:
            fields["gemini_opts"] = ""
        fields["exports"] = utils.local_path_export()
        cmd = (
            "{exports} {gemini} {gemini_opts} load {load_opts} "
            "-v {gemini_vcf} {eanns} --cores {num_cores} "
            "--tempdir {tmpdir} {tx_gemini_db}"
        )
        do.run(cmd.format(**fields), "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
Пример #19
0
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Create a GEMINI database from a VCF using `gemini load`.

    Uses <vcf base name>.db when gemini_db is not given. Nothing is run when the
    database already exists. Returns None if the database is missing and the VCF
    has no variants, otherwise the database path.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        gemini_ver = None
        if "program_versions" in data["config"].get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=data["config"])
        load_opts = ""
        # Recent versions of gemini allow loading only passing variants
        if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
            load_opts += " --passonly"
        # For small test files, skip gene table loading which takes a long time
        if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                load_opts += " --skip-gene-tables"
            if "/test_automated_output/" in gemini_vcf:
                load_opts += " --test-mode"
        # Skip CADD when its data file is not installed
        if gemini_ver and LooseVersion(gemini_ver) >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            optional_data = (("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),)
            for flag, required_file in optional_data:
                if not os.path.exists(os.path.join(gemini_dir, required_file)):
                    load_opts += " %s" % flag
        # skip gerp-bp which slows down loading
        load_opts += " --skip-gerp-bp "
        fields = {"gemini": gemini, "load_opts": load_opts,
                  "gemini_vcf": gemini_vcf, "tx_gemini_db": tx_gemini_db}
        fields["num_cores"] = data["config"]["algorithm"].get("num_cores", 1)
        if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff":
            fields["eanns"] = "snpEff"
        else:
            fields["eanns"] = "VEP"
        cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        do.run(cmd.format(**fields), "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db
Пример #20
0
def _has_gemini(data):
    """Report whether GEMINI data is installed.

    True only when the GEMINI directory exists, contains at least one entry and
    gemini-config.yaml is present in its parent directory.
    """
    from bcbio import install
    gemini_dir = install.get_gemini_dir(data)
    has_data = os.path.exists(gemini_dir) and len(os.listdir(gemini_dir)) > 0
    if has_data:
        return os.path.exists(os.path.join(os.path.dirname(gemini_dir), "gemini-config.yaml"))
    return False