Example #1
def add_reference_resources(data):
    """Add genome reference information to the item to process.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    data["reference"] = genome.get_refs(data["genome_build"], aligner,
                                        data["dirs"]["galaxy"])
    # backwards-compatible `sam_ref` target
    data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base"))
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"),
                           utils.get_in(data, ("reference", "fasta", "base")))
    data["genome_resources"] = genome.get_resources(data["genome_build"],
                                                    ref_loc)
    data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    alt_genome = utils.get_in(data,
                              ("config", "algorithm", "validate_genome_build"))
    if alt_genome:
        data["reference"]["alt"] = {
            alt_genome:
            genome.get_refs(alt_genome, None, data["dirs"]["galaxy"])["fasta"]
        }
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], check_gemini=False, need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
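
The function above only reads a handful of nested keys from `data`. As a rough, hypothetical sketch (placeholder paths and build name, not real bcbio defaults), a minimal input for this variant could be shaped as follows; actually running it requires a full bcbio data installation.

# Hypothetical minimal input for the add_reference_resources variant above.
# Key layout mirrors the lookups in the function; values are placeholders.
data = {
    "genome_build": "GRCh37",
    "dirs": {"galaxy": "/path/to/galaxy"},
    "config": {
        "algorithm": {"aligner": "bwa"},
        "resources": {},
    },
}
# data = add_reference_resources(data)  # needs bcbio genome indices on disk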
Example #2
def add_reference_resources(data, remote_retriever=None):
    """Add genome reference information to the item to process.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    if remote_retriever:
        data["reference"] = remote_retriever.get_refs(data["genome_build"],
                                                      aligner, data["config"])
    else:
        data["reference"] = genome.get_refs(data["genome_build"], aligner,
                                            data["dirs"]["galaxy"], data)
        _check_ref_files(data["reference"], data)
    # backwards-compatible `sam_ref` target
    data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base"))
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"),
                           utils.get_in(data, ("reference", "fasta", "base")))
    if remote_retriever:
        data = remote_retriever.get_resources(data["genome_build"], ref_loc,
                                              data)
    else:
        data["genome_resources"] = genome.get_resources(
            data["genome_build"], ref_loc, data)
    if effects.get_type(data) == "snpeff" and "snpeff" not in data["reference"]:
        data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    data = _fill_validation_targets(data)
    data = _fill_prioritization_targets(data)
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
Example #3
def add_reference_resources(data, remote_retriever=None):
    """Add genome reference information to the item to process.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    if remote_retriever:
        data["reference"] = remote_retriever.get_refs(data["genome_build"], aligner, data["config"])
    else:
        data["reference"] = genome.get_refs(data["genome_build"], aligner, data["dirs"]["galaxy"], data)
        _check_ref_files(data["reference"], data)
    # backwards-compatible `sam_ref` target
    data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base"))
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"),
                           utils.get_in(data, ("reference", "fasta", "base")))
    if remote_retriever:
        data = remote_retriever.get_resources(data["genome_build"], ref_loc, data)
    else:
        data["genome_resources"] = genome.get_resources(data["genome_build"], ref_loc, data)
    if effects.get_type(data) == "snpeff" and "snpeff" not in data["reference"]:
        data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    data = _fill_validation_targets(data)
    data = _fill_prioritization_targets(data)
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
Example #4
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data,
                               ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data),
                                                orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                           data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}), data,
            orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data,
                                                    orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
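
As a side note, the variant_filtration call above pulls nested variation resources with toolz's `get_in`, falling back to an empty dict when the path is missing. A small standalone illustration with placeholder values:

import toolz as tz

# Placeholder resource layout; real bcbio entries point at reference files.
data = {"genome_resources": {"variation": {"dbsnp": "dbsnp.vcf.gz"}}}
print(tz.get_in(("genome_resources", "variation"), data, {}))  # {'dbsnp': 'dbsnp.vcf.gz'}
print(tz.get_in(("genome_resources", "missing"), data, {}))    # {}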
Example #5
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)
    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                       data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                          population.do_db_build([data]))
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]
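
Note that `dd.get_variantcaller` may return either a single caller name or a list of names; the guard at the top of the function exits when more than one caller is configured for RNA-seq. A hypothetical helper (not part of bcbio) capturing just that normalization:

# Hypothetical helper mirroring the single-caller guard above.
def single_rnaseq_caller(variantcaller):
    """Return the lone configured caller (or None); reject multiple callers."""
    if isinstance(variantcaller, list):
        if len(variantcaller) > 1:
            raise ValueError("Only one variantcaller can be run for RNA-seq")
        return variantcaller[0] if variantcaller else None
    return variantcaller

assert single_rnaseq_caller("gatk-haplotype") == "gatk-haplotype"
assert single_rnaseq_caller(["gatk-haplotype"]) == "gatk-haplotype"
assert single_rnaseq_caller(None) is None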
Example #6
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
              and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
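
The final branch above derives the index base name from the common filename prefix inside the index directory, trimming any trailing dots. A standalone illustration with hypothetical bwa index file names:

import os

# Hypothetical bwa index directory listing; the shared prefix is "GRCh37.fa."
index_files = ["GRCh37.fa.amb", "GRCh37.fa.ann", "GRCh37.fa.bwt",
               "GRCh37.fa.pac", "GRCh37.fa.sa"]
base_name = os.path.commonprefix(index_files)
while base_name.endswith("."):
    base_name = base_name[:-1]
print(base_name)  # GRCh37.fa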
Example #7
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Example #8
def handle_vcf_calls(vcf_file, data):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.
    """
    if not _do_prioritize(data):
        return vcf_file
    else:
        if population.do_db_build([data]):
            gemini_db = population.create_gemini_db(vcf_file, data)
            if gemini_db:
                priority_file = _prep_priority_filter(gemini_db, data)
                return _apply_priority_filter(vcf_file, priority_file, data)
        # No GEMINI database for filtering, return original file
        return vcf_file
Example #9
def handle_vcf_calls(vcf_file, data):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.
    """
    if not _do_prioritize(data):
        return vcf_file
    else:
        if population.do_db_build([data]):
            gemini_db = population.create_gemini_db(vcf_file, data)
            if gemini_db:
                priority_file = _prep_priority_filter(gemini_db, data)
                return _apply_priority_filter(vcf_file, priority_file, data)
        # No GEMINI database for filtering, return original file
        return vcf_file
Example #10
def _download_prepped_genome(genome_build, data, name, need_remap):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    out_dir = utils.safe_makedir(
        os.path.join(tz.get_in(["dirs", "work"], data), "inputs", "data",
                     "genomes"))
    ref_dir = os.path.join(out_dir, genome_build, REMAP_NAMES.get(name, name))
    if not os.path.exists(ref_dir):
        target = REMAP_NAMES.get(name, name)
        if target in INPLACE_INDEX:
            ref_file = glob.glob(
                os.path.normpath(
                    os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
            INPLACE_INDEX[target](ref_file, ref_dir, data)
        else:
            with utils.chdir(out_dir):
                bucket = S3_INFO["bucket"]
                key = S3_INFO["key"].format(build=genome_build,
                                            target=REMAP_NAMES.get(name, name))
                cmd = (
                    "gof3r get --no-md5 -k {key} -b {bucket} | pigz -d -c | tar -xvp"
                )
                do.run(cmd.format(**locals()),
                       "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(
        os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    gresources = get_resources(data["genome_build"], ref_file)
    if data.get("files") and population.do_db_build(
        [data], need_bam=False, gresources=gresources):
        cmd = [
            os.path.join(os.path.dirname(sys.executable), "gemini"), "update",
            "--dataonly"
        ]
        do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, name))
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Example #11
def add_reference_resources(data):
    """Add genome reference information to the item to process.
    """
    aligner = data["config"]["algorithm"].get("aligner", None)
    data["reference"] = genome.get_refs(data["genome_build"], aligner, data["dirs"]["galaxy"])
    # backwards-compatible `sam_ref` target
    data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base"))
    ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"),
                           utils.get_in(data, ("reference", "fasta", "base")))
    data["genome_resources"] = genome.get_resources(data["genome_build"], ref_loc)
    data["reference"]["snpeff"] = effects.get_snpeff_files(data)
    alt_genome = utils.get_in(data, ("config", "algorithm", "validate_genome_build"))
    if alt_genome:
        data["reference"]["alt"] = {alt_genome:
                                    genome.get_refs(alt_genome, None, data["dirs"]["galaxy"])["fasta"]}
    # Re-enable when we have ability to re-define gemini configuration directory
    if False:
        if population.do_db_build([data], check_gemini=False, need_bam=False):
            data["reference"]["gemini"] = population.get_gemini_files(data)
    return data
Example #12
def _download_prepped_genome(genome_build, data, name, need_remap):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "inputs", "data", "genomes"))
    ref_dir = os.path.join(out_dir, genome_build, REMAP_NAMES.get(name, name))
    if not os.path.exists(ref_dir):
        target = REMAP_NAMES.get(name, name)
        if target in INPLACE_INDEX:
            ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
            INPLACE_INDEX[target](ref_file, ref_dir, data)
        else:
            with utils.chdir(out_dir):
                bucket = S3_INFO["bucket"]
                key = S3_INFO["key"].format(build=genome_build, target=REMAP_NAMES.get(name, name))
                cmd = ("gof3r get --no-md5 -k {key} -b {bucket} | pigz -d -c | tar -xvp")
                do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    gresources = get_resources(data["genome_build"], ref_file)
    if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
        cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
        do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, name))
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)