Exemplo n.º 1
0
def variants(data, out_dir):
    """Compute variant QC metrics for one sample.

    Writes the result of ``_run_bcftools`` as YAML to
    ``<sample>_bcbio_variants_stats.txt`` inside ``out_dir``. When the sample
    has a ``vrn_file`` and a coverage BED, and a BAM plus GATK are available,
    it also:

    1. annotates the VCF with GC content and coverage (GATK VariantAnnotator),
    2. extracts GC/depth/sample columns with ``bcftools query`` into
       ``<sample>_gc-depth-parse.tsv``,
    3. summarizes them via ``_summary_variants`` into
       ``<sample>_bcbio_variants.txt``.

    Args:
        data: sample dictionary; must contain ``"variants"`` for any work to
            happen, and ``"config"`` when the GC/depth step is reached.
        out_dir: directory where all outputs are written (created if needed).

    Returns:
        The QC file name (relative to ``out_dir``) when it already existed
        before this call; ``None`` in every other case.
    """
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    # Bare file names: they resolve against work_dir because of chdir below.
    bcf_out = sample + "_bcbio_variants_stats.txt"
    cg_file = sample + "_with-gc.vcf.gz"
    parse_file = sample + "_gc-depth-parse.tsv"
    qc_file = sample + "_bcbio_variants.txt"
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats,
                               out_handle,
                               default_flow_style=False,
                               allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # NOTE(review): GATK is restricted to the cleaned coverage BED while
        # bcftools query uses the variant regions BED -- confirm this is
        # intended, and that variant regions are always set at this point.
        regions_bed = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        cleaned_bed, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

                with file_transaction(parse_file) as out_tx:
                    # Write the header first; bcftools appends rows with '>>'.
                    # write() is used instead of the Python 2 only
                    # 'print >>handle' statement so this runs on Python 2 and 3.
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(sample=sample, bed_file=regions_bed,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s', sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                # The annotated VCF is only an intermediate; drop it once both
                # derived files exist.
                remove_plus(cg_file)
    # NOTE(review): a freshly generated qc_file is not returned (only a
    # pre-existing one is) -- kept as-is for caller compatibility; confirm.
    return None
Exemplo n.º 2
0
def variants(data, out_dir):
    """Variants QC metrics.

    Dumps the ``_run_bcftools`` result as YAML into
    ``<sample>_bcbio_variants_stats.txt`` under ``out_dir``. If the sample has
    a ``vrn_file`` and a coverage BED, and both a BAM and GATK are available,
    the VCF is annotated with GC content and coverage, the GC/depth values are
    extracted with ``bcftools query`` and summarized by ``_summary_variants``
    into ``<sample>_bcbio_variants.txt``.

    Args:
        data: sample dictionary; nothing is done unless it has a
            ``"variants"`` key.
        out_dir: output directory (created if missing); all files are written
            relative to it.

    Returns:
        ``qc_file`` (name relative to ``out_dir``) if it already existed when
        the function was called, otherwise ``None``.
    """
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    # Relative names; the chdir() below makes them land in work_dir.
    bcf_out = sample + "_bcbio_variants_stats.txt"
    cg_file = sample + "_with-gc.vcf.gz"
    parse_file = sample + "_gc-depth-parse.tsv"
    qc_file = sample + "_bcbio_variants.txt"
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # NOTE(review): the bcftools step is limited to the variant regions
        # BED, whereas GATK uses the cleaned coverage BED -- verify intent.
        regions_bed = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    # Header line; the shell command below appends the rows.
                    # Uses write() rather than the Python 2 'print >>' form so
                    # the code behaves the same under Python 2 and Python 3.
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    # Explicit arguments instead of **locals() so the command's
                    # inputs are visible and survive local renames.
                    cmd = cmd.format(sample=sample, bed_file=regions_bed,
                                     cg_file=cg_file, out_tx=out_tx)
                    do.run(cmd, "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s', sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                # Intermediate annotated VCF is no longer needed.
                remove_plus(cg_file)
    # NOTE(review): nothing is returned after a fresh computation; only a
    # pre-existing qc_file is returned above. Preserved for compatibility.
    return None
Exemplo n.º 3
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    """Run the germline workflow and return the bgzipped, indexed VCF.

    If ``out_file`` does not exist yet, the workflow is configured and run
    inside a transactional copy of ``work_dir``; the resulting VCF (gVCF when
    ``joint.want_gvcf(items)`` is true, plain variants otherwise) is copied to
    ``out_file``. ``assoc_files`` is accepted but not used here.

    Returns:
        Whatever ``vcfutils.bgzip_and_index`` returns for ``out_file``.
    """
    if utils.file_exists(out_file):
        # Already produced by an earlier run: only ensure compression/index.
        return vcfutils.bgzip_and_index(out_file, items[0]["config"])

    with file_transaction(items[0], work_dir) as tx_work_dir:
        workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
        _run_workflow(items[0], workflow_file, tx_work_dir)

    variants_dir = os.path.join(work_dir, "results", "variants")
    if joint.want_gvcf(items):
        vcf_name = "genome.vcf.gz"
    else:
        vcf_name = "variants.vcf.gz"
    utils.copy_plus(os.path.join(variants_dir, vcf_name), out_file)
    # Remove files with relative symlinks
    utils.remove_plus(os.path.join(variants_dir, "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Exemplo n.º 4
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    """Produce ``out_file`` from the germline workflow, then bgzip and index it.

    The workflow only runs when ``out_file`` is missing. It executes in a
    transactional work directory; afterwards the raw VCF from
    ``<work_dir>/results/variants`` is copied to ``out_file`` and the
    ``genome.vcf.gz`` entry there is removed. ``assoc_files`` is unused.

    Returns:
        The result of ``vcfutils.bgzip_and_index(out_file, ...)``.
    """
    already_done = utils.file_exists(out_file)
    if not already_done:
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(
                align_bams, items, ref_file, region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        results_dir = os.path.join(work_dir, "results", "variants")
        gvcf_path = os.path.join(results_dir, "genome.vcf.gz")
        # gVCF output is used when requested, the plain variant calls otherwise.
        if joint.want_gvcf(items):
            raw_file = gvcf_path
        else:
            raw_file = os.path.join(results_dir, "variants.vcf.gz")
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(gvcf_path)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Exemplo n.º 5
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file,
                  work_dir):
    """Run the germline workflow, falling back to an empty VCF.

    When ``out_file`` is missing, the workflow is configured in a
    transactional work directory. If ``_configure_germline`` yields a workflow
    file it is run and the raw VCF (gVCF when ``joint.want_gvcf(items)``) is
    copied to ``out_file``; otherwise an empty VCF listing every sample name
    is written to ``out_file`` instead. ``assoc_files`` is unused.

    Returns:
        The result of ``vcfutils.bgzip_and_index(out_file, ...)``.
    """
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            has_variants = bool(workflow_file)
            if has_variants:
                _run_workflow(items[0], workflow_file, tx_work_dir)
            else:
                # No workflow was configured: emit a header-only VCF.
                sample_names = [dd.get_sample_name(d) for d in items]
                vcfutils.write_empty_vcf(out_file, items[0]["config"],
                                         sample_names)
        if has_variants:
            variants_dir = os.path.join(work_dir, "results", "variants")
            if joint.want_gvcf(items):
                vcf_name = "genome.vcf.gz"
            else:
                vcf_name = "variants.vcf.gz"
            raw_file = os.path.join(variants_dir, vcf_name)
            utils.copy_plus(raw_file, out_file)
            # Remove files with relative symlinks
            utils.remove_plus(os.path.join(variants_dir, "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])