Example #1
def _get_build_type(fnames, samples, caller):
    """Confirm we should build a gemini database: need gemini in tools_on.

    Checks for valid conditions for running a database and for gemini or
    gemini_orig configured in tools_on.
    """
    build_type = set()
    if any(vcfutils.vcf_has_variants(f)
           for f in fnames) and caller not in NO_DB_CALLERS:
        for data in samples:
            if any([
                    x in dd.get_tools_on(data) for x in [
                        "gemini", "gemini_orig", "gemini_allvariants",
                        "vcf2db_expand"
                    ]
            ]):
                if vcfanno.annotate_gemini(data):
                    build_type.add("gemini_orig" if "gemini_orig" in
                                   dd.get_tools_on(data) else "gemini")
                else:
                    logger.info(
                        "Not running gemini, input data not found: %s" %
                        dd.get_sample_name(data))
            else:
                logger.info(
                    "Not running gemini, not configured in tools_on: %s" %
                    dd.get_sample_name(data))
    else:
        logger.info("Not running gemini, no samples with variants found: %s" %
                    (", ".join([dd.get_sample_name(d) for d in samples])))
    return build_type
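A minimal sketch of the tools_on decision above, using a mock data dict. It assumes dd.get_tools_on(data) reads config["algorithm"]["tools_on"], as the rest of these examples suggest; the sample values are hypothetical.

# Mock data dict standing in for a bcbio sample; values are hypothetical.
data = {"config": {"algorithm": {"tools_on": ["gemini_orig"]}}}
tools_on = data["config"]["algorithm"]["tools_on"]
gemini_flags = ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]
if any(flag in tools_on for flag in gemini_flags):
    # Same precedence as the function above: gemini_orig wins when present.
    print("gemini_orig" if "gemini_orig" in tools_on else "gemini")  # -> gemini_orig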
Example #2
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {
        "table_columns_visible": {
            "SnpEff": {
                "Change_rate": False,
                "Ts_Tv_ratio": False,
                "Number_of_variants_before_filter": False
            },
            "samtools": {
                "error_rate": False
            }
        },
        "module_order": [
            "bcbio", "samtools", "goleft_indexcov", "bcftools", "picard",
            "qualimap", "snpeff", "fastqc"
        ]
    }
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d)
            or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out,
                       out_handle,
                       default_flow_style=False,
                       allow_unicode=False)
    return out_file
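For reference, a trimmed sketch of what the yaml.safe_dump call writes. The dict here is a hypothetical two-key subset of the config assembled above.

import yaml

out = {"table_columns_visible": {"samtools": {"error_rate": False}},
       "module_order": ["bcbio", "samtools", "fastqc"]}
print(yaml.safe_dump(out, default_flow_style=False, allow_unicode=False))
# module_order:
# - bcbio
# - samtools
# - fastqc
# table_columns_visible:
#   samtools:
#     error_rate: false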
Example #3
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR, we need to split the file into SNPs and indels before applying.
    Hard filters can run on the original file, filtering by variant type with
    bcftools.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        assert "gvcf" not in dd.get_tools_on(data), \
            ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                                              vfilter.gatk_snp_hard)
        indel_filter_file = _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_hard)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
        return _filter_nonref(combined_file, data)
    else:
        snp_filter = vfilter.gatk_snp_hard(call_file, data)
        indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
        if "gvcf" not in dd.get_tools_on(data):
            return _filter_nonref(indel_filter, data)
        else:
            return indel_filter
Example #4
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #5
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        assert "gvcf" not in dd.get_tools_on(data), \
            ("Cannot force gVCF output and use VQSR. Try using cutoff-based soft filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(
            call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files,
                                              data, "SNP",
                                              vfilter.gatk_snp_cutoff)
        indel_filter_file = _variant_filtration(indel_file, ref_file,
                                                vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_cutoff)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file,
                                                       ref_file,
                                                       data["config"])
        return _filter_nonref(combined_file, data)
    else:
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        indel_filter = vfilter.gatk_indel_cutoff(snp_filter, data)
        if "gvcf" not in dd.get_tools_on(data):
            return _filter_nonref(indel_filter, data)
        else:
            return indel_filter
Example #6
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #7
def _get_build_type(fnames, samples, caller):
    """Confirm we should build a gemini database: need gemini in tools_on.

    Checks for valid conditions for running a database and for gemini or
    gemini_orig configured in tools_on.
    """
    build_type = set()
    if any(vcfutils.vcf_has_variants(f) for f in fnames) and caller not in NO_DB_CALLERS:
        for data in samples:
            if any([x in dd.get_tools_on(data)
                    for x in ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]]):
                if vcfanno.annotate_gemini(data):
                    build_type.add("gemini_orig" if "gemini_orig" in dd.get_tools_on(data) else "gemini")
    return build_type
Example #8
def gatk_snp_cutoff(in_file, data):
    """Perform cutoff-based soft filtering on GATK SNPs using best-practice recommendations.

    We have a more lenient mapping quality (MQ) filter compared to GATK defaults.
    The recommended filter (MQ < 40) is too stringent, so we adjust to 30: 
    http://imgur.com/a/oHRVB

    QD and FS are not calculated when generating gVCF output:
    https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300

    The extra command removes escaped quotes in the VCF output which
    pyVCF fails on.

    Does not use the GATK best-practice recommended SOR filter (SOR > 3.0) as it
    has a negative impact on sensitivity relative to precision:

    https://github.com/bcbio/bcbio_validations/tree/master/gatk4#na12878-hg38
    """
    filters = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    if "gvcf" not in dd.get_tools_on(data):
        filters += ["QD < 2.0", "FS > 60.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    if variantcaller not in ["gatk-haplotype", "haplotyper"]:
        filters.append("HaplotypeScore > 13.0")
    return cutoff_w_expression(in_file, 'TYPE="snp" && (%s)' % " || ".join(filters), data, "GATKCutoffSNP", "SNP",
                               extra_cmd=r"""| sed 's/\\"//g'""")
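To make the filter concrete, here is a sketch of the expression string assembled above for non-gVCF input from a caller other than gatk-haplotype or haplotyper:

filters = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0",
           "QD < 2.0", "FS > 60.0", "HaplotypeScore > 13.0"]
print('TYPE="snp" && (%s)' % " || ".join(filters))
# TYPE="snp" && (MQ < 30.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 ||
#                QD < 2.0 || FS > 60.0 || HaplotypeScore > 13.0)
# (output wrapped here for readability)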
Example #9
def _bcftools_stats(data, out_dir, vcf_file_key=None, germline=False):
    """Run bcftools stats.
    """
    vcinfo = get_active_vcinfo(data)
    if vcinfo:
        out_dir = utils.safe_makedir(out_dir)
        vcf_file = vcinfo[vcf_file_key or "vrn_file"]
        if dd.get_jointcaller(data) or "gvcf" in dd.get_tools_on(data):
            opts = ""
        else:
            opts = "-f PASS,."
        name = dd.get_sample_name(data)
        out_file = os.path.join(out_dir, "%s_bcftools_stats%s.txt" % (name, ("_germline" if germline else "")))
        bcftools = config_utils.get_program("bcftools", data["config"])
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                orig_out_file = os.path.join(os.path.dirname(tx_out_file), "orig_%s" % os.path.basename(tx_out_file))
                cmd = ("{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}")
                do.run(cmd.format(**locals()), "bcftools stats %s" % name)
                with open(orig_out_file) as in_handle:
                    with open(tx_out_file, "w") as out_handle:
                        for line in in_handle:
                            if line.startswith("ID\t"):
                                parts = line.split("\t")
                                parts[-1] = "%s\n" % name
                                line = "\t".join(parts)
                            out_handle.write(line)
        return out_file
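The loop above rewrites the ID line that bcftools stats emits, replacing the trailing VCF path with the sample name. A standalone sketch of that rewrite with hypothetical values:

line = "ID\t0\t/work/gatk/Test1-gatk.vcf.gz\n"  # hypothetical stats line
if line.startswith("ID\t"):
    parts = line.split("\t")
    parts[-1] = "%s\n" % "Test1"  # substitute the sample name
    line = "\t".join(parts)
print(line)  # -> ID<TAB>0<TAB>Test1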
Example #10
def create_combined_tx2gene(data):
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_transcriptome_gtf(odata)
        if not gtf_file:
            gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir,
                                dd.get_genome_build(odata) + "-tx2gene.csv")
        tools_on = dd.get_tools_on(odata)
        if tools_on and "keep_gene_version" in tools_on:
            k_version = True
        else:
            k_version = False
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            out_file = gtf.tx2genefile(gtf_file,
                                       out_file,
                                       tsv=False,
                                       keep_version=k_version)
            tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file

    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
Example #11
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Example #12
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add additional nblock regions based on configured regions to call.
    Identifies user-defined regions which we should not be analyzing.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if input_regions_bed:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            str_regions = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" % (str_regions, str_regions),
                                               from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        if input_nblock == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions)
    else:
        all_intervals = nblock_regions
    if "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data):
        from bcbio.heterogeneity import chromhacks
        remove_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions)
    return all_intervals.merge()
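The duplication step near the top works around a pybedtools problem with subtracting a single-interval BED (per the in-code comment). A sketch of just that step with a hypothetical region:

import pybedtools

input_regions = pybedtools.BedTool("chr1\t100\t200", from_string=True)
if len(input_regions) == 1:
    str_regions = str(input_regions[0]).strip()
    # Duplicate the lone interval; the final .merge() collapses it again.
    input_regions = pybedtools.BedTool("%s\n%s" % (str_regions, str_regions),
                                       from_string=True)
print(len(input_regions))  # -> 2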
Example #13
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None,
                                   data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #14
def _associate_cnvkit_out(ckouts, items, is_somatic=False):
    """Associate cnvkit output with individual items.
    """
    assert len(ckouts) == len(items)
    out = []
    upload_counts = collections.defaultdict(int)
    for ckout, data in zip(ckouts, items):
        ckout = copy.deepcopy(ckout)
        ckout["variantcaller"] = "cnvkit"
        if utils.file_exists(ckout["cns"]) and _cna_has_values(ckout["cns"]):
            ckout = _add_seg_to_output(ckout, data)
            ckout = _add_gainloss_to_output(ckout, data)
            ckout = _add_segmetrics_to_output(ckout, data)
            ckout = _add_variantcalls_to_output(ckout, data, items, is_somatic)
            # ckout = _add_coverage_bedgraph_to_output(ckout, data)
            ckout = _add_cnr_bedgraph_and_bed_to_output(ckout, data)
            if "svplots" in dd.get_tools_on(data):
                ckout = _add_plots_to_output(ckout, data)
            ckout["do_upload"] = upload_counts[ckout.get("vrn_file")] == 0
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append(ckout)
        if ckout.get("vrn_file"):
            upload_counts[ckout["vrn_file"]] += 1
        out.append(data)
    return out
Example #15
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(
            hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None,
                                   data["config"])
        if utils.file_exists(hsmetric_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #16
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    hsinsert_file = os.path.join(out_dir, "%s-sort.insert_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    do.run("sed -i 's/-sort.bam//g' %s" % hsinsert_file, "")
    return hsmetric_file
Example #17
def make_bcbiornaseq_object(data):
    """
    Load the initial bcb.rda object using bcbioRNASeq.
    """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    organism = dd.get_bcbiornaseq(data).get("organism", None)
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    loadstring = create_load_string(upload_dir, groups, organism, "gene")
    r_file = os.path.join(report_dir, "load_bcbioRNAseq.R")
    with file_transaction(r_file) as tmp_file:
        memoize_write_file(loadstring, tmp_file)
    rcmd = Rscript_cmd()
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", r_file], "Loading bcbioRNASeq object.")
        write_counts(os.path.join(report_dir, "data", "bcb.rda"), "gene")
    loadstring = create_load_string(upload_dir, groups, organism, "transcript")
    r_file = os.path.join(report_dir, "load_transcript_bcbioRNAseq.R")
    with file_transaction(r_file) as tmp_file:
        memoize_write_file(loadstring, tmp_file)
    rcmd = Rscript_cmd()
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", r_file],
               "Loading transcript-level bcbioRNASeq object.")
        write_counts(os.path.join(report_dir, "data-transcript", "bcb.rda"),
                     "transcript")
    make_quality_report(data)
    return data
Example #18
def _bedpes_from_cnv_caller(data, work_dir):
    """Retrieve BEDPEs deletion and duplications from CNV callers.

    Currently integrates with CNVkit.
    """
    supported = set(["cnvkit"])
    cns_file = None
    for sv in data.get("sv", []):
        if sv["variantcaller"] in supported and "cns" in sv and "lumpy_usecnv" in dd.get_tools_on(data):
            cns_file = sv["cns"]
            break
    if not cns_file:
        return None, None
    else:
        out_base = os.path.join(work_dir, utils.splitext_plus(os.path.basename(cns_file))[0])
        out_dels = out_base + "-dels.bedpe"
        out_dups = out_base + "-dups.bedpe"
        if not os.path.exists(out_dels) or not os.path.exists(out_dups):
            with file_transaction(data, out_dels, out_dups) as (tx_out_dels, tx_out_dups):
                try:
                    cnvanator_path = config_utils.get_program("cnvanator_to_bedpes.py", data)
                except config_utils.CmdNotFound:
                    return None, None
                cmd = [cnvanator_path, "-c", cns_file, "--cnvkit",
                        "--del_o=%s" % tx_out_dels, "--dup_o=%s" % tx_out_dups,
                        "-b", "250"]  # XXX Uses default piece size for CNVkit. Right approach?
                do.run(cmd, "Prepare CNVkit as input for lumpy", data)
        return out_dels, out_dups
Example #19
def _apply_priority_filter(in_file, priority_file, data):
    """Annotate variants with priority information and use to apply filters.
    """
    out_file = "%s-priority%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            header = (
                '##INFO=<ID=EPR,Number=.,Type=String,'
                'Description="Somatic prioritization based on external annotations, '
                'identify as likely germline">')
            header_file = "%s-repeatheader.txt" % utils.splitext_plus(
                tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                out_handle.write(header)
            if "tumoronly_germline_filter" in dd.get_tools_on(data):
                filter_cmd = ("bcftools filter -m '+' -s 'LowPriority' "
                              """-e "EPR[0] != 'pass'" |""")
            else:
                filter_cmd = ""
            cmd = ("bcftools annotate -a {priority_file} -h {header_file} "
                   "-c CHROM,FROM,TO,REF,ALT,INFO/EPR {in_file} | "
                   "{filter_cmd} bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "Run external annotation based prioritization filtering")
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
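Expanded with hypothetical file names, and with tumoronly_germline_filter enabled, the shell pipeline assembled above is equivalent to:

# Paths are illustrative only; this just renders the cmd template.
filter_cmd = ("bcftools filter -m '+' -s 'LowPriority' "
              """-e "EPR[0] != 'pass'" |""")
cmd = ("bcftools annotate -a {priority_file} -h {header_file} "
       "-c CHROM,FROM,TO,REF,ALT,INFO/EPR {in_file} | "
       "{filter_cmd} bgzip -c > {tx_out_file}")
print(cmd.format(priority_file="calls-priority.tsv.gz",
                 header_file="calls-priority-repeatheader.txt",
                 in_file="calls.vcf.gz", filter_cmd=filter_cmd,
                 tx_out_file="calls-priority.vcf.gz"))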
Example #20
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full, non-open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if num_cores > 1 and gatk_type == "gatk4":
                params += ["-T", "HaplotypeCallerSpark", "--spark-master", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if gatk_type != "gatk4" and _supports_avx():
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                if gatk_type == "gatk4":
                    params += ["--emit-ref-confidence", "GVCF"]
                else:
                    params += ["--emitRefConfidence", "GVCF"]
                    params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                if not is_joint and gatk_type == "gatk4":
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(num_cores > 1 and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
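One detail worth isolating is the GQ banding loop, which emits a paired -GQB flag per boundary; a standalone sketch:

params = []
for boundary in [10, 20, 30, 40, 60, 80]:
    params += ["-GQB", str(boundary)]
print(" ".join(params))
# -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 60 -GQB 80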
Example #21
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        if fastq_file.endswith(".sdf"):
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" not in dd.get_tools_on(data) and
              ("bwa-mem" in dd.get_tools_off(data) or not _can_use_mem(fastq_file, data, min_size))):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
Example #22
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)
    elif config_utils.use_vqsr(algs, call_file):
        if vcfutils.is_gvcf_file(call_file):
            raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                             "Try using cutoff-based soft filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                                              vfilter.gatk_snp_cutoff)
        indel_filter_file = _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_cutoff)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
        return combined_file
    else:
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        indel_filter = vfilter.gatk_indel_cutoff(snp_filter, data)
        return indel_filter
Example #23
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if not utils.file_exists(gemini_db):
        data_basepath = install.get_gemini_dir(data) if support_gemini_orig(
            data) else None
        conf_files = dd.get_vcfanno(data)
        if not conf_files:
            conf_files = ["gemini"]
        ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data,
                                       data_basepath)
        with file_transaction(data, gemini_db) as tx_gemini_db:
            vcf2db = config_utils.get_program("vcf2db.py", data)
            if "vcf2db_expand" in dd.get_tools_on(data):
                vcf2db_args = [
                    "--expand", "gt_types", "--expand", "gt_ref_depths",
                    "--expand", "gt_alt_depths"
                ]
            else:
                vcf2db_args = []
            cmd = [vcf2db, ann_file, ped_file, tx_gemini_db] + vcf2db_args
            do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
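With vcf2db_expand enabled and assuming a PED file is supplied, the vcf2db command assembled above comes out roughly as follows (all paths hypothetical):

vcf2db_args = ["--expand", "gt_types", "--expand", "gt_ref_depths",
               "--expand", "gt_alt_depths"]
cmd = ["vcf2db.py", "calls-annotated.vcf.gz", "family.ped", "calls.db"] + vcf2db_args
print(" ".join(cmd))
# vcf2db.py calls-annotated.vcf.gz family.ped calls.db --expand gt_types ...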
Example #24
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(
                    tz.get_in(["config", "algorithm", "coverage_interval"], x,
                              "").lower() == "genome" for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
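As a concrete sketch, a diploid gVCF run restricted to a merged target BED would yield options along these lines (the BED name is hypothetical):

opts = ["--genotype-qualities", "--strict-vcf",
        "--ploidy", "2",                    # from ploidy.get_ploidy
        "--gvcf", "--gvcf-chunk", "50000",  # gvcf present in tools_on
        "--targets", "batch1-variant_regions-merged.bed"]
print(" ".join(opts))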
Example #25
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)
    elif config_utils.use_vqsr(algs, call_file):
        if vcfutils.is_gvcf_file(call_file):
            raise ValueError(
                "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                "Try using cutoff-based soft filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(
            call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files,
                                              data, "SNP",
                                              vfilter.gatk_snp_cutoff)
        indel_filter_file = _variant_filtration(indel_file, ref_file,
                                                vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_cutoff)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file,
                                                       ref_file,
                                                       data["config"])
        return combined_file
    else:
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        indel_filter = vfilter.gatk_indel_cutoff(snp_filter, data)
        return indel_filter
Example #26
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo",
                                     "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(
        data) or "altcontigs" in dd.get_exclude_regions(data)

    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling
                                         or chromhacks.is_nonalt(r.chrom))

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    if not utils.file_uptodate(out_file, input_regions_bed):
                        input_regions = pybedtools.BedTool(input_regions_bed)
                        filter_regions.intersect(
                            input_regions,
                            nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Example #27
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
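Tracing the branches above for a hypothetical variant2 sample with qualimap in tools_on and no kraken configured:

# fastqc (not in tools_off), qualimap (in tools_on), then the
# non-smallrna and variant-analysis additions, in branch order:
expected = ["fastqc", "qualimap", "samtools", "gemini",
            "qsignature", "coverage", "variants", "picard"]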
Example #28
def _associate_cnvkit_out(ckouts, items, is_somatic=False):
    """Associate cnvkit output with individual items.
    """
    assert len(ckouts) == len(items)
    out = []
    upload_counts = collections.defaultdict(int)
    for ckout, data in zip(ckouts, items):
        ckout = copy.deepcopy(ckout)
        ckout["variantcaller"] = "cnvkit"
        if utils.file_exists(ckout["cns"]) and _cna_has_values(ckout["cns"]):
            ckout = _add_seg_to_output(ckout, data)
            ckout = _add_gainloss_to_output(ckout, data)
            ckout = _add_segmetrics_to_output(ckout, data)
            ckout = _add_variantcalls_to_output(ckout, data, items, is_somatic)
            # ckout = _add_coverage_bedgraph_to_output(ckout, data)
            ckout = _add_cnr_bedgraph_and_bed_to_output(ckout, data)
            if "svplots" in dd.get_tools_on(data):
                ckout = _add_plots_to_output(ckout, data)
            ckout["do_upload"] = upload_counts[ckout.get("vrn_file")] == 0
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append(ckout)
        if ckout.get("vrn_file"):
            upload_counts[ckout["vrn_file"]] += 1
        out.append(data)
    return out
Example #29
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add additional nblock regions based on configured regions to call.
    Identifies user-defined regions which we should not be analyzing.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if input_regions_bed:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            str_regions = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" %
                                               (str_regions, str_regions),
                                               from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        if input_nblock == ref_regions:
            raise ValueError(
                "Input variant_region file (%s) "
                "excludes all genomic regions. Do the chromosome names "
                "in the BED file match your genome (chr1 vs 1)?" %
                input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions],
                                         ref_regions)
    else:
        all_intervals = nblock_regions
    if "noalt_calling" in dd.get_tools_on(
            data) or "altcontigs" in dd.get_exclude_regions(data):
        from bcbio.heterogeneity import chromhacks
        remove_intervals = ref_regions.filter(
            lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, remove_intervals],
                                         ref_regions)
    return all_intervals.merge()
Example #30
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(
        variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        if paired and paired.normal_bam and "break-point-inspector" in dd.get_tools_on(
                data):
            variant_file = _run_break_point_inspector(data, variant_file,
                                                      paired)
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        data["sv"].append({"variantcaller": "manta", "vrn_file": final_vcf})
        out.append(data)
    return out
Example #31
def haplotype_caller(align_bams,
                     items,
                     ref_file,
                     assoc_files,
                     region=None,
                     out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full, non-open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items,
                                   ref_file, assoc_files.get("dbsnp"),
                                   region, out_file)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            params += [
                "-T", "HaplotypeCaller", "--annotation", "ClippingRankSumTest",
                "--annotation", "DepthPerSampleHC"
            ]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if gatk_type != "gatk4":
                    params += [
                        "--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"
                    ]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(
                    broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # Prepare gVCFs if doing joint calling
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d)
                                            for d in items):
                params += [
                    "--emitRefConfidence", "GVCF", "--variant_index_type",
                    "LINEAR", "--variant_index_parameter", "128000"
                ]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            resources = config_utils.get_resources("gatk-haplotype",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
Example #32
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
Example #33
def _bedpes_from_cnv_caller(data, work_dir):
    """Retrieve BEDPEs deletion and duplications from CNV callers.

    Currently integrates with CNVkit.
    """
    supported = set(["cnvkit"])
    cns_file = None
    for sv in data.get("sv", []):
        if sv["variantcaller"] in supported and "cns" in sv and "lumpy_usecnv" in dd.get_tools_on(
                data):
            cns_file = sv["cns"]
            break
    if not cns_file:
        return None, None
    else:
        out_base = os.path.join(
            work_dir,
            utils.splitext_plus(os.path.basename(cns_file))[0])
        out_dels = out_base + "-dels.bedpe"
        out_dups = out_base + "-dups.bedpe"
        if not os.path.exists(out_dels) or not os.path.exists(out_dups):
            with file_transaction(data, out_dels,
                                  out_dups) as (tx_out_dels, tx_out_dups):
                try:
                    cnvanator_path = config_utils.get_program(
                        "cnvanator_to_bedpes.py", data)
                except config_utils.CmdNotFound:
                    return None, None
                cmd = [
                    cnvanator_path, "-c", cns_file, "--cnvkit",
                    "--del_o=%s" % tx_out_dels,
                    "--dup_o=%s" % tx_out_dups, "-b", "250"
                ]  # XXX Uses default piece size for CNVkit. Right approach?
                do.run(cmd, "Prepare CNVkit as input for lumpy", data)
        return out_dels, out_dups
Example #34
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_callable_regions(data):
        return dd.get_callable_regions(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        return callable.sample_callable_bed(data["work_bam_callable"],
                                            dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #35
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    variant_file = shared.annotate_with_depth(variant_file, items)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "break-point-inspector" in dd.get_tools_on(data):
            if paired and paired.normal_bam and paired.tumor_name == dd.get_sample_name(data):
                variant_file = _run_break_point_inspector(data, variant_file, paired, work_dir)
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        vc = {"variantcaller": "manta",
              "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
              "vrn_file": final_vcf}
        evidence_bam = _get_evidence_bam(work_dir, data)
        if evidence_bam:
            vc["read_evidence"] = evidence_bam
        data["sv"].append(vc)
        upload_counts[final_vcf] += 1
        out.append(data)
    return out
Example #36
def gatk_indel_hard(in_file, data):
    """Perform hard filtering on GATK indels using best-practice recommendations.
    """
    filters = ["ReadPosRankSum < -20.0"]
    if "gvcf" not in dd.get_tools_on(data):
        filters += ["QD < 2.0", "FS > 200.0"]
    return hard_w_expression(in_file, 'TYPE="indel" && (%s)' % " || ".join(filters), data, "GATKHardIndel", "INDEL")
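As with the SNP version, for non-gVCF input the expression expands to:

filters = ["ReadPosRankSum < -20.0", "QD < 2.0", "FS > 200.0"]
print('TYPE="indel" && (%s)' % " || ".join(filters))
# TYPE="indel" && (ReadPosRankSum < -20.0 || QD < 2.0 || FS > 200.0)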
Example #37
def _bcftools_stats(data, out_dir, vcf_file_key=None, germline=False):
    """Run bcftools stats.
    """
    vcinfo = get_active_vcinfo(data)
    if vcinfo:
        out_dir = utils.safe_makedir(out_dir)
        vcf_file = vcinfo[vcf_file_key or "vrn_file"]
        if dd.get_jointcaller(data) or "gvcf" in dd.get_tools_on(data):
            opts = ""
        else:
            opts = "-f PASS,."
        name = dd.get_sample_name(data)
        out_file = os.path.join(
            out_dir,
            "%s_bcftools_stats%s.txt" % (name,
                                         ("_germline" if germline else "")))
        bcftools = config_utils.get_program("bcftools", data["config"])
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                orig_out_file = os.path.join(
                    os.path.dirname(tx_out_file),
                    "orig_%s" % os.path.basename(tx_out_file))
                cmd = (
                    "{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}"
                )
                do.run(cmd.format(**locals()), "bcftools stats %s" % name)
                with open(orig_out_file) as in_handle:
                    with open(tx_out_file, "w") as out_handle:
                        for line in in_handle:
                            if line.startswith("ID\t"):
                                parts = line.split("\t")
                                parts[-1] = "%s\n" % name
                                line = "\t".join(parts)
                            out_handle.write(line)
        return out_file
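The post-processing loop rewrites only the stats line starting with "ID\t", swapping the trailing VCF path for the sample name so downstream reports group by sample; a standalone sketch of that transformation (the input line is an invented example of bcftools stats output):

def rename_id_line(line, name):
    # bcftools stats identifies inputs with lines like "ID\t0\t/path/in.vcf.gz";
    # replace the final column with the sample name, leaving other lines alone.
    if line.startswith("ID\t"):
        parts = line.split("\t")
        parts[-1] = "%s\n" % name
        line = "\t".join(parts)
    return line

print(rename_id_line("ID\t0\t/work/NA12878-gatk.vcf.gz\n", "NA12878"), end="")
# ID	0	NA12878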
Example #38
    def get_gatk_version(self):
        """Retrieve GATK version, handling locally and config cached versions.
        Calling version can be expensive due to all the startup and shutdown
        of JVMs, so we prefer cached version information.
        """
        if self._gatk_version is None:
            self._set_default_versions(self._config)

        if "gatk4" in dd.get_tools_on({"config": self._config}):
            # Handle cases where we don't have manifest versions; it is not
            # possible to get the version from the command line with GATK4 alpha versions
            if self._gatk4_version is None:
                self._gatk4_version = "4.0"
            return self._gatk4_version
        elif self._gatk_version is not None:
            return self._gatk_version
        else:
            if self._has_gatk_conda_wrapper():
                gatk_jar = None
            else:
                gatk_jar = self._get_jar("GenomeAnalysisTK",
                                         ["GenomeAnalysisTKLite"],
                                         allow_missing=True)
            self._gatk_version = get_gatk_version(gatk_jar,
                                                  config=self._config)
            return self._gatk_version
Example #39
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
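Each split region tuple feeds the per-region file name; a tiny sketch of that naming scheme with invented values:

cur_region = ("chr1", 0, 5000000)
str_region = "_".join([str(x) for x in cur_region])
print(str_region)  # chr1_0_5000000
print("%s-%s-gatk-haplotype.vcf.gz" % ("NA12878", str_region))
# NA12878-chr1_0_5000000-gatk-haplotype.vcf.gz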
Example #40
def gatk_snp_hard(in_file, data):
    """Perform hard filtering on GATK SNPs using best-practice recommendations.

    We have a more lenient mapping quality (MQ) filter compared to GATK defaults.
    The recommended filter (MQ < 40) is too stringent, so we adjust to 30: 
    http://imgur.com/a/oHRVB

    QD and FS are not calculated when generating gVCF output:
    https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300

    The extra command removes escaped quotes in the VCF output which
    pyVCF fails on.
    """
    filters = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    if "gvcf" not in dd.get_tools_on(data):
        filters += ["QD < 2.0", "FS > 60.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    variantcaller = utils.get_in(data,
                                 ("config", "algorithm", "variantcaller"))
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    return hard_w_expression(in_file,
                             'TYPE="snp" && (%s)' % " || ".join(filters),
                             data,
                             "GATKHardSNP",
                             "SNP",
                             extra_cmd=r"""| sed 's/\\"//g'""")
Example #41
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    name, caller, is_batch = call_info
    build_type = _get_build_type(fnames, samples, caller)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # If we're building a gemini database, normalize the inputs
    if build_type:
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d)
                       for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)
        decomposed = True
    else:
        decomposed = False
    ann_vcf = run_vcfanno(gemini_vcf, data, decomposed)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if ann_vcf and build_type and not utils.file_exists(gemini_db):
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Original approach for hg19/GRCh37
        if vcfanno.is_human(data, builds=["37"]) and "gemini_orig" in build_type:
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db,
                                              ped_file)
        else:
            gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)
    # only pass along gemini_vcf_downstream if uniquely created here
    if os.path.islink(gemini_vcf):
        gemini_vcf = None
    return [[(name, caller), {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": ann_vcf or gemini_vcf,
        "decomposed": decomposed
    }]]
Example #42
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
Example #43
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities"]
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]

    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
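A standalone sketch of how the option list accumulates for a diploid run with gVCF requested and a merged BED target (paths and values are invented stand-ins):

gvcf_requested = True  # stands in for: any("gvcf" in dd.get_tools_on(d) for d in items)
opts = ["--genotype-qualities", "--ploidy", str(2)]
if gvcf_requested:
    # Chunked gVCF output keeps reference blocks to a bounded size
    opts += ["--gvcf", "--gvcf-chunk", "50000"]
opts += ["--targets", "variant_regions-merged.bed"]
print(" ".join(opts))
# --genotype-qualities --ploidy 2 --gvcf --gvcf-chunk 50000 --targets variant_regions-merged.bed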
Example #44
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(
        vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, is_batch = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    multisample_vcf = get_multisample_vcf(fnames, name, caller, data)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if not utils.file_exists(gemini_db) and use_gemini:
        passonly = all("gemini_allvariants" not in dd.get_tools_on(d)
                       for d in samples)
        gemini_vcf = multiallelic.to_single(multisample_vcf,
                                            data,
                                            passonly=passonly)
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        if support_gemini_orig(data) and not any(
                dd.get_vcfanno(d) for d in samples):
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db,
                                              ped_file)
        else:
            gemini_db = create_gemini_db(gemini_vcf, data, gemini_db, ped_file)
    return [[(name, caller), {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": multisample_vcf if is_batch else None
    }]]
Example #46
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
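The trailing sed '/^#/! s/;;/;/g' collapses empty INFO fields on non-header lines before bgzip; a pure-Python sketch of the same single-pass substitution (the VCF line is invented):

def collapse_empty_info(line):
    # Skip header lines, mirroring the sed address /^#/!; elsewhere apply a
    # single left-to-right pass of ';;' -> ';', matching sed's s///g.
    if line.startswith("#"):
        return line
    return line.replace(";;", ";")

print(collapse_empty_info("1\t100\t.\tA\tG\t50\t.\tDP=10;;CSQ=G|missense\n"), end="")
# 1	100	.	A	G	50	.	DP=10;CSQ=G|missense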
Example #47
def gatk_indel_cutoff(in_file, data):
    """Perform cutoff-based soft filtering on GATK indels using best-practice recommendations.
    """
    filters = ["ReadPosRankSum < -20.0"]
    if "gvcf" not in dd.get_tools_on(data):
        filters += ["QD < 2.0", "FS > 200.0", "SOR > 10.0"]
    return cutoff_w_expression(in_file, 'TYPE="indel" && (%s)' % " || ".join(filters), data, "GATKCutoffIndel",
                               "INDEL", extra_cmd=r"""| sed 's/\\"//g'""")
Example #48
def gatk_indel_cutoff(in_file, data):
    """Perform cutoff-based soft filtering on GATK indels using best-practice recommendations.
    """
    filters = ["ReadPosRankSum < -20.0"]
    if "gvcf" not in dd.get_tools_on(data):
        filters += ["QD < 2.0", "FS > 200.0"]
    return cutoff_w_expression(in_file, 'TYPE="indel" && (%s)' % " || ".join(filters), data, "GATKCutoffIndel",
                               "INDEL", extra_cmd=r"""| sed 's/\\"//g'""")
Example #49
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full, non-open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if num_cores > 1 and gatk_type == "gatk4":
                params += ["-T", "HaplotypeCallerSpark", "--sparkMaster", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if not gatk_type == "gatk4" and _supports_avx():
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                params += ["--emitRefConfidence", "GVCF"]
                if not gatk_type == "gatk4":
                    params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                if not is_joint and gatk_type == "gatk4":
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(num_cores > 1 and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
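The 3.1/3.3 gates above use distutils LooseVersion comparisons (available through Python 3.11); a small sketch of how they behave across GATK major versions:

from distutils.version import LooseVersion

for version in ["2.8", "3.1", "3.7", "4.0"]:
    pair_hmm = LooseVersion(version) >= LooseVersion("3.1")     # hardware HMM optimization
    non_diploid = LooseVersion(version) >= LooseVersion("3.3")  # ploidy option
    print(version, pair_hmm, non_diploid)
# 2.8 False False
# 3.1 True False
# 3.7 True True
# 4.0 True True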
Example #50
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)) for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    return checkpoints
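A quick illustration of the resulting flags for two hypothetical samples, one gVCF-based; the plain dicts and keys below stand in for full bcbio sample objects and their dd accessors:

samples = [{"variantcaller": "gatk-haplotype", "svcaller": [], "tools_on": ["gvcf"]},
           {"variantcaller": "vardict", "svcaller": ["lumpy"], "tools_on": []}]
checkpoints = {"vc": any(d["variantcaller"] for d in samples),
               "sv": any(d["svcaller"] for d in samples),
               "jointvc": any("gvcf" in d["tools_on"] for d in samples)}
print(checkpoints)  # {'vc': True, 'sv': True, 'jointvc': True}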
Example #51
 def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     if self._has_gatk_conda_wrapper():
         gatk_jar = None
     else:
         gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
         if not gatk_jar:
             raise ValueError("GATK processing requested but gatk or older jar install not found: "
                              "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                              "installation.html#gatk-and-mutect-mutect2")
     is_gatk4 = "gatk4" in dd.get_tools_on({"config": self._config})
     cores = self._config["algorithm"].get("num_cores", 1)
     config = self._config
     atype_index = params.index("-T") if params.count("-T") > 0 \
                     else params.index("--analysis_type")
     prog = params[atype_index + 1]
     # For GATK4 specify command first, so swap params to accomplish
     if is_gatk4:
         params = params[:]
         del params[atype_index + 1]
         del params[atype_index]
         params = [prog] + params
     if cores and int(cores) > 1:
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             memscale = config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                                "magnitude": max(1, int(cores) // 2)}
     # Filters and unsafe specifications not in GATK4
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
         if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     if memscale:
         jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False,
                                  parallel_gc=parallel_gc)
     else:
         # Decrease memory slightly from configuration to avoid memory allocation errors
         jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                             {"algorithm": {"memory_adjust":
                                                            {"magnitude": 1.1, "direction": "decrease"}}})
         jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
     if gatk_jar:
         return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
     else:
         cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
         if cmd:
             return cmd
         else:
             raise ValueError("GATK processing requested but gatk or older jar install not found: "
                              "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                              "installation.html#gatk-and-mutect-mutect2")
Example #52
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #53
 def _has_gatk_conda_wrapper(self):
     cmd = gatk_cmd("gatk", [], ["--version"], config=self._config)
     if cmd:
         if "gatk4" in dd.get_tools_on({"config": self._config}):
             return True
         else:
             try:
                 stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
                 return stdout.find("GATK jar file not found") == -1
             except subprocess.CalledProcessError:
                 return False
Example #54
def make_quality_report(data):
    """
    create and render the bcbioRNASeq quality report
    """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    quality_rmd = os.path.join(report_dir, "quality_control.Rmd")
    quality_html = os.path.join(report_dir, "quality_control.html")
    quality_rmd = rmarkdown_draft(quality_rmd, "quality_control", "bcbioRNASeq")
    # Return the sample data unchanged, matching the early exit above
    return data
Example #55
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full, non-open-source version of GATK.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = _shared_gatk_call_prep(
            align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file
        )
        assert broad_runner.gatk_type() == "restricted", "Require full version of GATK 2.4+ for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            params += [
                "-T",
                "HaplotypeCaller",
                "-o",
                tx_out_file,
                "--annotation",
                "ClippingRankSumTest",
                "--annotation",
                "DepthPerSampleHC",
            ]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # Prepare gVCFs if doing joint calling
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                params += [
                    "--emitRefConfidence",
                    "GVCF",
                    "--variant_index_type",
                    "LINEAR",
                    "--variant_index_parameter",
                    "128000",
                ]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params)
    return out_file
Example #56
def platypus(in_file, data):
    """Filter Platypus calls, removing Q20 hard filter and replacing with depth and quality based filter.

    Platypus uses its own VCF nomenclature: TC == DP, FR == AF

    Platypus gVCF output appears to have a 0/1 index problem, so the reference block
    regions are 1 base outside regions of interest. We avoid limiting regions during
    filtering when using it.
    """
    filters = ('(FR[0] <= 0.5 && TC < 4 && %QUAL < 20) || '
               '(TC < 13 && %QUAL < 10) || '
               '(FR[0] > 0.5 && TC < 4 && %QUAL < 50)')
    limit_regions = "variant_regions" if "gvcf" not in dd.get_tools_on(data) else None
    return hard_w_expression(in_file, filters, data, name="PlatQualDepth",
                             extra_cmd="| sed 's/\\tQ20\\t/\\tPASS\\t/'", limit_regions=limit_regions)
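The extra_cmd sed rewrites Platypus's blanket Q20 filter column to PASS so only the expression above marks failures; the same substitution on an invented VCF line:

line = "1\t100\t.\tA\tG\t18\tQ20\tTC=20;FR=0.5\n"
print(line.replace("\tQ20\t", "\tPASS\t"), end="")
# 1	100	.	A	G	18	PASS	TC=20;FR=0.5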
Example #57
def finalize_sv(samples, config, initial_only=False):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Handles ensemble calling and plotting of results.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():

        def orig_svcaller_order(x):
            return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])

        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x], key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            if not initial_only:
                for caller in (c for c in _get_svcallers(final) if c in _ENSEMBLE_CALLERS):
                    final_calls = _ENSEMBLE_CALLERS[caller](final_calls, final)
                final_calls = ensemble.summarize(final_calls, final, grouped_calls)
                final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
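The try/except KeyError grouping at the top is an ordered bucket-by-key pattern; the same grouping written with setdefault on an invented sample list:

import collections

samples = [{"align_bam": "a.bam", "name": "s1"},
           {"align_bam": "a.bam", "name": "s2"},
           {"align_bam": "b.bam", "name": "s3"}]
by_bam = collections.OrderedDict()
for x in samples:
    by_bam.setdefault(x["align_bam"], []).append(x)
print([(k, [v["name"] for v in vs]) for k, vs in by_bam.items()])
# [('a.bam', ['s1', 's2']), ('b.bam', ['s3'])]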
Example #58
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if not utils.file_exists(gemini_db):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            vcf2db = config_utils.get_program("vcf2db.py", data)
            if "vcf2db_expand" in dd.get_tools_on(data):
                vcf2db_args = ["--expand", "gt_types", "--expand", "gt_ref_depths", "--expand", "gt_alt_depths"]
            else:
                vcf2db_args = []
            cmd = [vcf2db, gemini_vcf, ped_file, tx_gemini_db] + vcf2db_args
            do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
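With expansion enabled, the assembled vcf2db command looks like this (file names invented for illustration):

vcf2db_args = ["--expand", "gt_types", "--expand", "gt_ref_depths",
               "--expand", "gt_alt_depths"]
cmd = ["vcf2db.py", "batch1.vcf.gz", "batch1.ped", "batch1.db"] + vcf2db_args
print(" ".join(cmd))
# vcf2db.py batch1.vcf.gz batch1.ped batch1.db --expand gt_types --expand gt_ref_depths --expand gt_alt_depths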
Example #59
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses the explicitly configured QC tools when set, otherwise builds defaults.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["contamination", "peddy"]
        if vcfutils.get_paired_phenotype(data):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
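The closing filter and sort are easy to miss; a tiny sketch showing how tools_off trims the accumulated list before the sorted result is returned (tool names invented):

to_run = ["fastqc", "qualimap", "samtools", "coverage", "picard",
          "qsignature", "variants"]
tools_off = ["picard"]
to_run = [tool for tool in to_run if tool not in tools_off]
to_run.sort()
print(to_run)
# ['coverage', 'fastqc', 'qsignature', 'qualimap', 'samtools', 'variants']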