Пример #1
0
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    """Produce ``out_file`` for a tumor/normal pair, reusing it if present.

    The workflow is configured inside a transactional work directory. When
    configuration yields a workflow file, the workflow is run and the
    post-processed SNV and indel calls found under
    ``work_dir/results/variants`` are combined into ``out_file``. Otherwise
    an empty VCF naming the tumor and normal samples is written instead.
    ``assoc_files`` is accepted but not used here.

    Returns ``out_file``.
    """
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(paired.tumor_data, work_dir) as tx_work_dir:
        workflow_file = _configure_somatic(paired, ref_file, region,
                                           out_file, tx_work_dir)
        has_variants = bool(workflow_file)
        if has_variants:
            _run_workflow(paired.tumor_data, workflow_file, tx_work_dir)
        else:
            # Nothing to run: emit a placeholder VCF listing both samples.
            vcfutils.write_empty_vcf(
                out_file, paired.tumor_data["config"],
                [dd.get_sample_name(sample)
                 for sample in (paired.tumor_data, paired.normal_data)])
    if not has_variants:
        return out_file
    var_dir = os.path.join(work_dir, "results", "variants")
    processed = []
    for vcf_name in ("somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"):
        processed.append(
            _postprocess_somatic(os.path.join(var_dir, vcf_name), paired))
    vcfutils.combine_variant_files(processed, out_file, ref_file,
                                   paired.tumor_data["config"],
                                   region=region)
    return out_file
Пример #2
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect produces the SNV calls. Indel calls are added with Scalpel when it
    is installed and merged with the MuTect calls into ``out_file``; without
    Scalpel the MuTect output is linked to ``out_file``. The
    SomaticIndelDetector branch is kept but is currently unreachable because
    ``disable_SID`` is hard-coded to True.

    Args:
        align_bams: aligned BAM files; the first one names the default output.
        items: sample dictionaries; ``items[0]`` supplies "config" and
            "sam_ref".
        ref_file: reference passed through to the callers.
        assoc_files: associated resource files passed through to the callers.
        region: optional region to restrict calling to.
        out_file: output VCF path; derived from ``align_bams[0]`` when None.

    Returns:
        Path of the final VCF. This includes the cases where an existing file
        is reused and where an empty VCF is written because not every BAM has
        reads aligned in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        # This runner is replaced by the one _mutect_call_prep returns below;
        # the call is kept in case it validates the configuration.
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        # Both indel branches write to the same derived path.
        out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                           if "vcf" in out_file else out_file + "-somaticIndels.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit so callers never get None.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True  # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            if scalpel.is_installed(config):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=config,
                                                          region=region)
            else:
                # No indel caller available: expose the MuTect calls directly.
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=config,
                                                      region=region)
    return out_file
Пример #3
0
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    """Run the somatic workflow for a tumor/normal pair unless ``out_file`` exists.

    The workflow is configured and executed inside a transactional work
    directory. The post-processed SNV and indel calls under
    ``work_dir/results/variants`` are then combined into ``out_file``.
    ``assoc_files`` is accepted but not used here.

    Returns ``out_file``.
    """
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(paired.tumor_data, work_dir) as tx_work_dir:
        workflow_file = _configure_somatic(paired, ref_file, region,
                                           out_file, tx_work_dir)
        _run_workflow(paired.tumor_data, workflow_file, tx_work_dir)
    var_dir = os.path.join(work_dir, "results", "variants")
    processed = []
    for vcf_name in ("somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"):
        processed.append(
            _postprocess_somatic(os.path.join(var_dir, vcf_name), paired))
    vcfutils.combine_variant_files(processed, out_file, ref_file,
                                   paired.tumor_data["config"],
                                   region=region)
    return out_file
Пример #4
0
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    """Build ``out_file`` from the somatic workflow's SNV and indel calls.

    Skips all work when ``out_file`` is already present. Otherwise the
    workflow is configured and run in a transactional directory, and both
    result VCFs are post-processed and merged into ``out_file``.
    ``assoc_files`` is part of the signature but is not read.

    Returns ``out_file``.
    """
    already_done = utils.file_exists(out_file)
    if not already_done:
        with file_transaction(paired.tumor_data, work_dir) as tx_work_dir:
            workflow_file = _configure_somatic(
                paired, ref_file, region, out_file, tx_work_dir)
            _run_workflow(paired.tumor_data, workflow_file, tx_work_dir)
        variant_dir = os.path.join(work_dir, "results", "variants")
        result_names = ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"]
        to_combine = [
            _postprocess_somatic(os.path.join(variant_dir, result_name), paired)
            for result_name in result_names
        ]
        vcfutils.combine_variant_files(
            to_combine, out_file, ref_file, paired.tumor_data["config"],
            region=region)
    return out_file
Пример #5
0
 def test_3_vcf_split_combine(self):
     """Round-trip a VCF: split it into SNP and indel files, then merge them.

     The split and merged files are removed at the end of the test.
     """
     with make_workdir() as workdir:
         yaml_file = get_post_process_yaml(self.data_dir, workdir)
         config = load_config(yaml_file)
         config["algorithm"] = {}
     ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
     fname = os.path.join(self.var_dir, "S1-variants.vcf")
     split_files = vcfutils.split_snps_indels(fname, ref_file, config)
     snp_file, indel_file = split_files
     merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
     vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                    ref_file, config)
     for created in (snp_file, indel_file, merge_file):
         self._remove_vcf(created)
Пример #6
0
 def test_3_vcf_split_combine(self):
     """Split a VCF into SNP and indel files and combine them again.

     Configuration is loaded from the YAML prepared for ``automated_dir``;
     all files produced by the test are removed afterwards.
     """
     with make_workdir() as workdir:
         yaml_file = get_post_process_yaml(self.automated_dir, workdir)
         config = load_config(yaml_file)
         config["algorithm"] = {}
     ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
     fname = os.path.join(self.var_dir, "S1-variants.vcf")
     split_files = vcfutils.split_snps_indels(fname, ref_file, config)
     snp_file, indel_file = split_files
     merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
     vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                    ref_file, config)
     for created in (snp_file, indel_file, merge_file):
         self._remove_vcf(created)
Пример #7
0
 def test_3_vcf_split_combine(self, global_config):
     """Split a VCF into SNP and indel files and combine them again.

     ``global_config`` is handed to ``load_config``; the files produced by
     the test are removed afterwards.
     """
     from bcbio.variation import vcfutils
     config = load_config(global_config)
     config["algorithm"] = {}
     fname = os.path.join(self.var_dir, "S1-variants.vcf")
     split_files = vcfutils.split_snps_indels(fname, self.ref_file, config)
     snp_file, indel_file = split_files
     merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
     vcfutils.combine_variant_files(
         [snp_file, indel_file], merge_file, self.ref_file, config)
     for created in (snp_file, indel_file, merge_file):
         self._remove_vcf(created)
Пример #8
0
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Filter variant calls, choosing the strategy from the configured caller.

    - "gatk-haplotype" (unless ``_no_vqsr`` says otherwise): SNPs and indels
      are filtered together.
    - "freebayes": handled by ``filter_freebayes``.
    - "samtools" / "varscan": the input is returned unchanged.
    - anything else: SNPs and indels are split, filtered separately and
      combined into ``<common prefix>combined.vcf``.

    Returns the path of the filtered call file.
    """
    broad_runner = broad.runner_from_config(config)
    caller = config["algorithm"].get("variantcaller")
    if caller == "gatk-haplotype" and not _no_vqsr(config):
        return _variant_filtration_both(broad_runner, call_file, ref_file,
                                        vrn_files, config)
    if caller == "freebayes":
        return filter_freebayes(broad_runner, call_file, ref_file, vrn_files,
                                config)
    # no additional filtration for callers that filter as part of call process
    if caller in ("samtools", "varscan"):
        return call_file
    snp_file, indel_file = vcfutils.split_snps_indels(
        broad_runner, call_file, ref_file)
    filtered = [
        _variant_filtration_snp(broad_runner, snp_file, ref_file, vrn_files,
                                config),
        _variant_filtration_indel(broad_runner, indel_file, ref_file,
                                  vrn_files, config),
    ]
    shared_prefix = os.path.commonprefix(filtered)
    out_file = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(filtered, out_file, ref_file,
                                          config)
Пример #9
0
def run(call_file, ref_file, vrn_files, data):
    """Filter ``call_file``, treating SNPs and indels separately.

    With VQSR enabled the calls are split by type, each part is filtered and
    the results are recombined. Without VQSR the hard filters are applied in
    sequence to the original file (SNPs first, then indels).

    The non-reference filter is applied to the result, except for hard-filtered
    output when "gvcf" is among the enabled tools.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        snp_filter = vfilter.gatk_snp_hard(call_file, data)
        indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
        if "gvcf" in dd.get_tools_on(data):
            return indel_filter
        return _filter_nonref(indel_filter, data)
    assert "gvcf" not in dd.get_tools_on(data), \
        ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_hard),
        _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_hard),
    ]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    combined_file = vcfutils.combine_variant_files(filtered, out_file, ref_file, data["config"])
    return _filter_nonref(combined_file, data)
Пример #10
0
def get_multisample_vcf(fnames, name, caller, data):
    """Combine ``fnames`` into ``<work>/gemini/<name>-<caller>.vcf``.

    The "gemini" directory is created under ``data["dirs"]["work"]``.
    Returns whatever ``vcfutils.combine_variant_files`` returns.
    """
    gemini_dir = os.path.join(data["dirs"]["work"], "gemini")
    out_dir = utils.safe_makedir(gemini_dir)
    vcf_name = "%s-%s.vcf" % (name, caller)
    gemini_vcf = os.path.join(out_dir, vcf_name)
    return vcfutils.combine_variant_files(
        fnames, gemini_vcf, data["sam_ref"], data["config"])
Пример #11
0
def prep_gemini_db(fnames, call_id, samples, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Args:
        fnames: input VCF files; more than one triggers a combined VCF.
        call_id: sequence of strings joined with "-" to name the database.
        samples: passed to ``_do_db_build`` to decide whether a database is
            built at all.
        data: sample dictionary supplying "dirs", "sam_ref" and "config".

    Returns:
        ``[[call_id, {"db": ..., "vcf": ...}]]`` where "db" is None when
        database building is disabled and "vcf" is None for single-file
        input.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    use_gemini = _do_db_build(samples)
    is_population = len(fnames) > 1
    if is_population:
        gemini_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
        gemini_vcf = vcfutils.combine_variant_files(fnames, gemini_vcf,
                                                    data["sam_ref"],
                                                    data["config"])
    else:
        gemini_vcf = fnames[0]
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, gemini_vcf=gemini_vcf,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            # Run the load once only: a second subprocess call would repeat
            # the same command against the already-populated database.
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[
        call_id, {
            "db": gemini_db if use_gemini else None,
            "vcf": gemini_vcf if is_population else None
        }
    ]]
Пример #12
0
def variant_filtration(call_file, ref_file, vrn_files, data):
    """Filter variant calls, choosing the strategy from the configured caller.

    The calls are first passed through ``ploidy.filter_vcf_by_sex``. Then:

    - "freebayes": handled by ``vfilter.freebayes``.
    - "samtools" / "varscan" / "mutect": returned without further filtering.
    - anything else: SNPs and indels are split, filtered separately and
      combined into ``<common prefix>combined.vcf``.

    Returns the path of the filtered call file.
    """
    caller = data["config"]["algorithm"].get("variantcaller")
    call_file = ploidy.filter_vcf_by_sex(call_file, data)
    if caller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, data)
    # no additional filtration for callers that filter as part of call process
    if caller in ("samtools", "varscan", "mutect"):
        return call_file
    config = data["config"]
    snp_file, indel_file = vcfutils.split_snps_indels(
        call_file, ref_file, config)
    filtered = [
        _variant_filtration_snp(snp_file, ref_file, vrn_files, data),
        _variant_filtration_indel(indel_file, ref_file, vrn_files, data),
    ]
    shared_prefix = os.path.commonprefix(filtered)
    out_file = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(filtered, out_file, ref_file,
                                          config)
Пример #13
0
def run_qsnp(align_bams,
             items,
             ref_file,
             assoc_files,
             region=None,
             out_file=None):
    """Call variants with qSNP on a paired tumor/normal sample.

    An existing ``out_file`` is returned untouched. When cleaned regions are
    available, each one is called into its own file and the results are
    combined into ``out_file``; when no region is in effect, a single call
    writes ``out_file`` directly.

    Raises ValueError when the pairing has no normal BAM.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("qSNP only works on paired samples")
    regions = _clean_regions(items, region)
    if regions:
        per_region_vcfs = []
        # NOTE(review): the loop rebinds ``region``, so the ``if not region``
        # check below sees the last region processed, not the argument.
        for region in regions:
            region_out = out_file.replace(".vcf.gz",
                                          _to_str(region) + ".vcf.gz")
            per_region_vcfs.append(
                _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                                 region, region_out))
        out_file = combine_variant_files(per_region_vcfs, out_file, ref_file,
                                         items[0]["config"])
    if not region:
        out_file = _run_qsnp_paired(align_bams, items, ref_file,
                                    assoc_files, region, out_file)
    return out_file
Пример #14
0
 def test_3_vcf_split_combine(self):
     """Split a VCF into SNP and indel files and combine them again.

     Configuration is loaded from the YAML prepared for ``automated_dir``;
     the files produced by the test are removed afterwards.
     """
     from bcbio.variation import vcfutils
     with make_workdir() as workdir:
         yaml_file = get_post_process_yaml(self.automated_dir, workdir)
         config = load_config(yaml_file)
         config["algorithm"] = {}
     fname = os.path.join(self.var_dir, "S1-variants.vcf")
     split_files = vcfutils.split_snps_indels(fname, self.ref_file, config)
     snp_file, indel_file = split_files
     merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
     vcfutils.combine_variant_files(
         [snp_file, indel_file], merge_file, self.ref_file, config)
     for created in (snp_file, indel_file, merge_file):
         self._remove_vcf(created)
Пример #15
0
def run(items):
    """Perform detection of structural variations with delly.

    ``_run_delly`` is dispatched once per structural variant type
    (DEL, DUP, INV, TRA) over all sample BAMs. The per-type VCFs are combined
    into one file, which is stored under ``data["sv"]["delly"]`` for every
    item.

    Returns the list of updated items.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural",
                     first["name"][-1], "delly"))
    work_bams = [item["align_bam"] for item in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    delly_args = [(work_bams, sv_type, ref_file, work_dir, items)
                  for sv_type in ("DEL", "DUP", "INV", "TRA")]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file,
                                               ref_file, first["config"])
    out = []
    for data in items:
        data.setdefault("sv", {})["delly"] = delly_vcf
        out.append(data)
    return out
Пример #16
0
def run(call_file, ref_file, vrn_files, data):
    """Filter ``call_file``, treating SNPs and indels separately.

    With VQSR enabled the calls are split by type, each part is filtered and
    the results are recombined. Without VQSR the cutoff filters are applied
    in sequence to the original file (SNPs first, then indels).

    Raises ValueError when VQSR is requested together with gVCF output and no
    joint caller is configured.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filter, data)
    if "gvcf" in dd.get_tools_on(data) and not dd.get_jointcaller(data):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(
        call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, out_file, ref_file,
                                          data["config"])
Пример #17
0
def run(call_file, ref_file, vrn_files, data):
    """Filter ``call_file``, treating SNPs and indels separately.

    Variants with missing alts are removed first when ``includes_missingalt``
    reports them. The filter is then chosen in this order: the CNN filter
    when "gatkcnn" is among the enabled tools, VQSR (split, filter per type,
    recombine) when enabled, otherwise the SNP and indel cutoff filters in
    sequence.

    Raises ValueError when VQSR is selected for a gVCF input.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)
    if not config_utils.use_vqsr(algs, call_file):
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filter, data)
    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, out_file, ref_file, data["config"])
Пример #18
0
def run(items):
    """Perform detection of structural variations with delly.

    ``_run_delly`` is dispatched for every combination of reference sequence
    (read from the first BAM's header) and structural variant type
    (DEL, DUP, INV). The resulting VCFs are combined into one file, which is
    stored under ``data["sv"]["delly"]`` for every item.

    Returns the list of updated items.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural",
                     first["name"][-1], "delly"))
    work_bams = [item["align_bam"] for item in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        delly_args = [
            (work_bams, chrom, sv_type, ref_file, work_dir, items)
            for chrom, sv_type
            in itertools.product(pysam_work_bam.references, sv_types)
        ]
        bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file,
                                               ref_file, first["config"])
    out = []
    for data in items:
        data.setdefault("sv", {})["delly"] = delly_vcf
        out.append(data)
    return out
Пример #19
0
def run(call_file, ref_file, vrn_files, data):
    """Apply SNP and indel filtering to ``call_file``.

    VQSR needs the calls split by type, so with VQSR enabled the file is
    split, each part is filtered and the parts are merged again. Otherwise
    the SNP and indel hard filters run one after the other on the original
    file.

    The non-reference filter is applied to the result, except for hard-filtered
    output when "gvcf" is among the enabled tools.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    vqsr_enabled = config_utils.use_vqsr(algs)
    if vqsr_enabled:
        assert "gvcf" not in dd.get_tools_on(data), \
            ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        split_files = vcfutils.split_snps_indels(
            call_file, ref_file, data["config"])
        snp_file, indel_file = split_files
        snp_filtered = _variant_filtration(
            snp_file, ref_file, vrn_files, data, "SNP",
            vfilter.gatk_snp_hard)
        indel_filtered = _variant_filtration(
            indel_file, ref_file, vrn_files, data, "INDEL",
            vfilter.gatk_indel_hard)
        to_merge = [snp_filtered, indel_filtered]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(to_merge)
        merged = vcfutils.combine_variant_files(
            to_merge, out_file, ref_file, data["config"])
        return _filter_nonref(merged, data)
    after_snps = vfilter.gatk_snp_hard(call_file, data)
    after_indels = vfilter.gatk_indel_hard(after_snps, data)
    if "gvcf" in dd.get_tools_on(data):
        return after_indels
    return _filter_nonref(after_indels, data)
Пример #20
0
def run(items):
    """Perform detection of structural variations with delly.

    ``_run_delly`` is dispatched once per structural variant type
    (DEL, DUP, INV) over all sample BAMs. The per-type VCFs are combined into
    one file, which is stored under ``data["sv"]["delly"]`` for every item.

    Returns the list of updated items.
    """
    first = items[0]
    delly_dir = os.path.join(first["dirs"]["work"], "structural",
                             first["name"][-1], "delly")
    work_dir = utils.safe_makedir(delly_dir)
    work_bams = [item["align_bam"] for item in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    delly_args = []
    for sv_type in sv_types:
        delly_args.append((work_bams, sv_type, ref_file, work_dir, items))
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file,
                                               ref_file, first["config"])
    out = []
    for data in items:
        data.setdefault("sv", {})["delly"] = delly_vcf
        out.append(data)
    return out
Пример #21
0
def get_multisample_vcf(fnames, name, caller, data):
    """Return the combined VCF for ``fnames`` in the work "gemini" directory.

    The output is named ``<name>-<caller>.vcf`` and placed in a "gemini"
    directory created under ``data["dirs"]["work"]``.
    """
    work_root = data["dirs"]["work"]
    out_dir = utils.safe_makedir(os.path.join(work_root, "gemini"))
    target_vcf = os.path.join(out_dir, "%s-%s.vcf" % (name, caller))
    combined = vcfutils.combine_variant_files(
        fnames, target_vcf, data["sam_ref"], data["config"])
    return combined
Пример #22
0
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: sample dictionaries; ``items[0]`` supplies the work directory,
            name, reference and configuration shared by the whole batch.

    Returns:
        The input items, each with a ``{"variantcaller": "delly",
        "vrn_file": ...}`` entry appended to its "sv" list. "vrn_file" is
        the filtered delly VCF restricted to that item's sample.
    """
    work_dir = utils.safe_makedir(
        os.path.join(items[0]["dirs"]["work"], "structural",
                     items[0]["name"][-1], "delly"))
    work_bams = [item["align_bam"] for item in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"]
    }
    sv_types = [
        "DEL", "DUP", "INV"
    ]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(
            _run_delly,
            [(work_bams, chrom, sv_type, ref_file, work_dir, items)
             for (chrom, sv_type
                  ) in itertools.product(pysam_work_bam.references, sv_types)],
            config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    # The filter used to receive a ``data`` name that only existed because
    # Python 2 list comprehensions leak their loop variable (a NameError on
    # Python 3). Pass the last item explicitly, which is the value that leak
    # provided.
    filter_data = items[-1]
    delly_vcf = vfilter.genotype_filter(combo_vcf,
                                        'DV / (DV + DR) > 0.35 && DV > 4',
                                        filter_data, "DVSupport")
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(delly_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = "%s-%s%s" % (base, sample, ext)
        data["sv"].append({
            "variantcaller":
            "delly",
            "vrn_file":
            vcfutils.select_sample(delly_vcf, sample, delly_sample_vcf,
                                   data["config"])
        })
        out.append(data)
    return out
Пример #23
0
def run(items):
    """Perform detection of structural variations with delly.

    BAMs are first prepared with ``_prep_subsampled_bams``. ``_run_delly`` is
    then dispatched for every combination of chromosome (from
    ``sshared.get_sv_chroms``) and structural variant type (DEL, DUP). The
    combined VCF is restricted to each item's sample and passed through
    ``_delly_count_evidence_filter``.

    Returns the input items, each with a delly entry (variant caller name,
    filtered VCF and exclude file) appended to its "sv" list.
    """
    first = items[0]
    delly_dir = os.path.join(first["dirs"]["work"], "structural",
                             first["name"][-1], "delly")
    work_dir = utils.safe_makedir(delly_dir)
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    prep_args = [(item, work_dir) for item in items]
    work_bams = run_multicore(_prep_subsampled_bams, prep_args, config,
                              parallel)
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    delly_args = [
        (work_bams, chrom, sv_type, ref_file, work_dir, items)
        for chrom, sv_type in itertools.product(
            sshared.get_sv_chroms(items, exclude_file), sv_types)
    ]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file,
                                               ref_file, config)
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  sample_out, data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        sv_entry = {
            "variantcaller": "delly",
            "vrn_file": delly_vcf,
            "exclude": exclude_file,
        }
        data["sv"].append(sv_entry)
        out.append(data)
    return out
Пример #24
0
def run(items):
    """Detect structural variants with lumpy on a batch of samples.

    lumpy is run once over all ``items``. Each sample's calls are then
    selected from the joint VCF, split by ``_split_breakends``, genotyped
    with ``_run_svtyper``, merged back together and passed through
    ``_filter_by_support``. When a tumor/normal pair is available the
    results also go through ``_filter_by_background``.

    Returns the input items, each with a ``lumpy`` entry appended to its
    ``"sv"`` list.

    Raises ValueError when any sample is configured with an aligner other
    than bwa (``False``/``None`` are accepted).
    """
    accepted_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in accepted_aligners
           for data in items):
        raise ValueError(
            "Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Prefer the tumor sample for naming the work directory when there is one
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])

    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(
            data, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams,
                                         work_dir, items)

    # Per-sample genotyping of the joint lumpy calls
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_out = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_out,
                                            data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file,
                                  data)
        combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
        gt_vcf = vcfutils.combine_variant_files(orig_files=[std_gt_vcf, bnd_vcf],
                                                out_file=combined_out,
                                                ref_file=dd.get_ref_file(data),
                                                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)

    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name],
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        lumpy_call = {
            "variantcaller": "lumpy",
            # fall back to the unannotated file when no effects VCF came back
            "vrn_file": effects_vcf or vcf_file,
            "exclude_file": exclude_file,
        }
        data["sv"].append(lumpy_call)
        out.append(data)
    return out
Пример #25
0
def run(call_file, ref_file, vrn_files, data):
    """Hard-filter a call file, treating SNPs and indels as separate passes.

    The input is split with ``vcfutils.split_snps_indels``, each half goes
    through ``_variant_filtration`` with its matching GATK hard filter, and
    the two results are merged into ``<common prefix>combined.vcf.gz``.

    Returns the value of ``vcfutils.combine_variant_files`` for that merge.
    """
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file,
                                                      data["config"])
    filter_specs = [(snp_file, "SNP", vfilter.gatk_snp_hard),
                    (indel_file, "INDEL", vfilter.gatk_indel_hard)]
    filtered_files = [
        _variant_filtration(in_file, ref_file, vrn_files, data, vtype, filter_fn)
        for in_file, vtype, filter_fn in filter_specs
    ]
    combined_name = os.path.commonprefix(filtered_files) + "combined.vcf.gz"
    return vcfutils.combine_variant_files(filtered_files, combined_name,
                                          ref_file, data["config"])
Пример #26
0
def run(call_file, ref_file, vrn_files, data):
    """Filter SNPs and indels separately, merge them, then apply ``_filter_nonref``.

    SNP and indel records are split out of ``call_file``, each set is run
    through ``_variant_filtration`` with its own GATK hard filter, and the
    filtered files are combined into ``<common prefix>combined.vcf.gz``.

    Returns the result of ``_filter_nonref`` on the combined file.
    """
    split_files = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    snp_file, indel_file = split_files
    filtered_snps = _variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard)
    filtered_indels = _variant_filtration(indel_file, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_hard)
    to_merge = [filtered_snps, filtered_indels]
    merged_name = os.path.commonprefix(to_merge) + "combined.vcf.gz"
    merged = vcfutils.combine_variant_files(to_merge, merged_name, ref_file,
                                            data["config"])
    return _filter_nonref(merged, data)
Пример #27
0
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: sample dictionaries of one batch; the first one supplies the
            work directory, the configuration and the reference file.

    Returns:
        The input items in order. Each has an ``"sv"`` list, which gains a
        delly entry whenever ``sshared.finalize_sv`` yields a file for it.
    """
    work_dir = utils.safe_makedir(
        os.path.join(items[0]["dirs"]["work"], "structural",
                     dd.get_sample_name(items[0]), "delly"))
    # Add core request for delly. This works on a deep copy so the
    # single-core setting stays out of the samples' own configuration.
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"]
    }
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(items[0])
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    # One delly job per chromosome reported by get_sv_chroms
    bytype_vcfs = run_multicore(
        _run_delly, [(work_bams, chrom, ref_file, work_dir, items)
                     for chrom in sshared.get_sv_chroms(items, exclude_file)],
        config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               config)
    out = []
    # Counts how often each finalized VCF was attached to a sample
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            data["sv"].append({
                "variantcaller": "delly",
                "vrn_file": delly_vcf,
                # only upload a single file per batch
                "do_upload": upload_counts[final_vcf] == 0,
                "exclude": exclude_file
            })
            upload_counts[final_vcf] += 1
        out.append(data)
    return out
Пример #28
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Args:
        align_bams: input BAM files; the first one names the default output.
        items: sample dictionaries; the first one supplies the configuration.
        ref_file: reference handed to the MuTect/indel preparation helpers.
        assoc_files: associated resource files, forwarded to those helpers.
        region: optional region restriction. When it is not a list/tuple it
            is also used to check that every BAM has aligned reads.
        out_file: output VCF; defaults to
            ``<first BAM without extension>-paired-variants.vcf.gz``.

    Returns:
        The path of the output VCF on every path, including the one where
        only an empty VCF is written because a BAM has no reads in ``region``.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            # This version also produces an indel file below, so the MuTect
            # calls go to their own file and are merged at the end.
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf") if "vcf"
                               in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit, not an implicit None
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file,
                                           assoc_files, region,
                                           out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
Пример #29
0
def run(items):
    """Call structural variants with delly, one pass per SV type.

    delly is run over the batch's ``work_bam`` files for DEL, DUP, INV and
    TRA; the per-type VCFs are combined into one file named after their
    common prefix. That file is stored under ``data["sv"]["delly"]`` for
    every item, and the items are returned.
    """
    first = items[0]
    delly_dir = os.path.join(first["dirs"]["work"], "structural",
                             first["name"][-1], "delly")
    work_dir = utils.safe_makedir(delly_dir)
    work_bams = [data["work_bam"] for data in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    bytype_vcfs = []
    for sv_type in ["DEL", "DUP", "INV", "TRA"]:
        bytype_vcfs.append(_run_delly(work_bams, sv_type, ref_file, work_dir))
    combined_name = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, combined_name,
                                               ref_file, first["config"])
    out = []
    for data in items:
        data.setdefault("sv", {})
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
Пример #30
0
def run(items):
    """Call deletions and duplications with delly on prepared BAMs.

    BAMs come from ``_prep_subsampled_bams``. delly is run for every
    (chromosome, SV type) combination and the results are merged into one
    VCF. For each sample, its calls are selected from the merged file and
    passed through ``_delly_count_evidence_filter``.

    Returns the input items, each with a delly entry appended to ``"sv"``.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_resources = utils.get_in(config, ("resources", "delly"), {})
    delly_resources["cores"] = 1
    config["resources"]["delly"] = delly_resources
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    prep_args = [(data, work_dir) for data in items]
    work_bams = run_multicore(_prep_subsampled_bams, prep_args, config, parallel)
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    # Chromosome-major ordering, every SV type for each chromosome
    delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for chrom in sshared.get_sv_chroms(items, exclude_file)
                  for sv_type in sv_types]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    merged_name = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, merged_name, ref_file, config)
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_out,
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        delly_call = {"variantcaller": "delly", "vrn_file": delly_vcf,
                      "exclude": exclude_file}
        data["sv"].append(delly_call)
        out.append(data)
    return out
Пример #31
0
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Filter variant calls according to the configured variant caller.

    freebayes calls are delegated to ``vfilter.freebayes``; samtools and
    varscan calls are returned untouched. For any other caller, SNPs and
    indels are split, filtered separately and recombined into
    ``<common prefix>combined.vcf``.
    """
    caller = config["algorithm"].get("variantcaller")
    if caller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, config)
    if caller in ("samtools", "varscan"):
        # no additional filtration for callers that filter as part of call process
        return call_file
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration_snp(snp_file, ref_file, vrn_files, config)
    filtered_indels = _variant_filtration_indel(indel_file, ref_file, vrn_files, config)
    to_merge = [filtered_snps, filtered_indels]
    merged_name = os.path.commonprefix(to_merge) + "combined.vcf"
    return vcfutils.combine_variant_files(to_merge, merged_name, ref_file, config)
Пример #32
0
def run(items):
    """Call DEL, DUP and INV events with delly across all BAM references.

    delly is run for every (reference sequence, SV type) combination, with
    reference names read from the first sample's BAM header. The results
    are merged into one VCF. Each sample's calls are then selected and
    hard-filtered on the DV/DR format fields under the name ``DVSupport``.

    Returns the input items, each with a delly entry appended to ``"sv"``.
    """
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_resources = utils.get_in(config, ("resources", "delly"), {})
    delly_resources["cores"] = len(items)
    config["resources"]["delly"] = delly_resources
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"],
    }
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                      for chrom in pysam_work_bam.references
                      for sv_type in sv_types]
        bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    merged_name = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, merged_name, ref_file,
                                               first["config"])
    support_filter = "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2"
    out = []
    for data in items:
        data.setdefault("sv", [])
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_out,
                                                  data["config"])
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf, support_filter,
                                              data, name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf})
        out.append(data)
    return out
Пример #33
0
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Args:
        items: sample dictionaries of one batch; the first one supplies the
            work directory, the configuration and the reference file.

    Returns:
        The input items in order. Each has an ``"sv"`` list, which gains a
        delly entry whenever ``sshared.finalize_sv`` yields a file for it.
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               dd.get_sample_name(items[0]), "delly"))
    # Add core request for delly. This works on a deep copy so the
    # single-core setting stays out of the samples' own configuration.
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(items[0])
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    # One delly job per chromosome reported by get_sv_chroms
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, ref_file, work_dir, items)
                                 for chrom in sshared.get_sv_chroms(items, exclude_file)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    # Counts how often each finalized VCF was attached to a sample
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                               "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
                               "exclude": exclude_file})
            upload_counts[final_vcf] += 1
        out.append(data)
    return out
Пример #34
0
def run_qsnp(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run qSNP calling on paired tumor/normal.

    When ``_clean_regions`` yields regions, qSNP is run once per region and
    the per-region files are combined into ``out_file``. When it yields none
    and no ``region`` was requested, qSNP is run once without a region.

    Args:
        align_bams: BAM files, resolved into tumor/normal by ``get_paired_bams``.
        items: sample dictionaries; the first one supplies the merge config.
        ref_file: reference forwarded to qSNP and to the merge.
        assoc_files: associated resource files, forwarded to qSNP.
        region: optional region restriction.
        out_file: output VCF path; returned unchanged if it already exists.

    Returns:
        The output VCF path.

    Raises:
        ValueError: the batch has no normal BAM.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("qSNP only works on paired samples")
    regions = _clean_regions(items, region)
    if regions:
        region_files = []
        # A separate loop name keeps the ``region`` argument intact; the
        # whole-input fallback below must never depend on the last loop value.
        for cur_region in regions:
            out_region_file = out_file.replace(".vcf.gz", _to_str(cur_region) + ".vcf.gz")
            region_files.append(_run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                                                 cur_region, out_region_file))
        out_file = combine_variant_files(region_files, out_file, ref_file, items[0]["config"])
    elif not region:
        out_file = _run_qsnp_paired(align_bams, items, ref_file, assoc_files, region, out_file)
    return out_file
Пример #35
0
def run(items):
    """Detect structural variants with lumpy on a batch of samples.

    lumpy is run once over all ``items``. Each sample's calls are then
    selected from the joint VCF, split by ``_split_breakends``, genotyped
    with ``_run_svtyper``, merged back together and passed through
    ``_filter_by_support``. When a tumor/normal pair is available the
    results also go through ``_filter_by_background``.

    Returns the input items, each with a ``lumpy`` entry appended to its
    ``"sv"`` list.

    Raises ValueError when any sample is configured with an aligner other
    than bwa (``False``/``None`` are accepted).
    """
    accepted_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in accepted_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Prefer the tumor sample for naming the work directory when there is one
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])

    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    # Per-sample genotyping of the joint lumpy calls
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_out = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_out, data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
        gt_vcf = vcfutils.combine_variant_files(orig_files=[std_gt_vcf, bnd_vcf],
                                                out_file=combined_out,
                                                ref_file=dd.get_ref_file(data),
                                                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)

    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        lumpy_call = {
            "variantcaller": "lumpy",
            # fall back to the unannotated file when no effects VCF came back
            "vrn_file": effects_vcf or vcf_file,
            "exclude_file": exclude_file,
        }
        data["sv"].append(lumpy_call)
        out.append(data)
    return out
Пример #36
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Args:
        align_bams: input BAM files; the first one names the default output.
        items: sample dictionaries; the first one supplies the configuration.
        ref_file: reference handed to the MuTect/indel preparation helpers.
        assoc_files: associated resource files, forwarded to those helpers.
        region: optional region restriction. When it is not a list/tuple it
            is also used to check that every BAM has aligned reads.
        out_file: output VCF; defaults to
            ``<first BAM without extension>-paired-variants.vcf.gz``.

    Returns:
        The path of the output VCF on every path, including the one where
        only an empty VCF is written because a BAM has no reads in ``region``.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            # This version also produces an indel file below, so the MuTect
            # calls go to their own file and are merged at the end.
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                               if "vcf" in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit, not an implicit None
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
Пример #37
0
def prep_gemini_db(fnames, call_id, samples, data):
    """Build a gemini database from one or more snpEff-annotated VCFs.

    Several input files are first combined into a single VCF next to the
    database; a single input is used directly. The database is loaded only
    when ``_do_db_build`` approves it and the database file is not already
    present.

    Returns ``[[call_id, {"db": ..., "vcf": ...}]]``. ``db`` is None when
    no database is built for these samples, and ``vcf`` is None unless
    several inputs were combined.
    """
    gemini_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(gemini_dir, "-".join(call_id) + ".db")
    use_gemini = _do_db_build(samples)
    is_population = len(fnames) > 1
    if not is_population:
        gemini_vcf = fnames[0]
    else:
        combined_name = "%s.vcf" % os.path.splitext(gemini_db)[0]
        gemini_vcf = vcfutils.combine_variant_files(fnames, combined_name,
                                                    data["sam_ref"], data["config"])
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            load_template = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = load_template.format(gemini=gemini, gemini_vcf=gemini_vcf,
                                            num_cores=num_cores,
                                            tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s" % str(call_id), data)
    outputs = {"db": gemini_db if use_gemini else None,
               "vcf": gemini_vcf if is_population else None}
    return [[call_id, outputs]]
Пример #38
0
def combine_variant_files(*args):
    """Forward all positional arguments to ``vcfutils.combine_variant_files``."""
    combined = vcfutils.combine_variant_files(*args)
    return combined
Пример #39
0
def combine_variant_files(*args):
    """Thin pass-through to ``vcfutils.combine_variant_files``; returns its result."""
    result = vcfutils.combine_variant_files(*args)
    return result
Пример #40
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Builds one mpileup per BAM (normal first, then tumor), runs
    ``VarScan somatic`` on the pair, fixes up the resulting SNP and indel
    VCFs and merges them into ``out_file``. Nothing is done when
    ``out_file`` already exists.

    Args:
        align_bams: BAM files of the batch, resolved into tumor/normal
            through ``get_paired_bams``.
        ref_file: reference passed to mpileup and to the final merge.
        items: sample dictionaries; the first one supplies the config.
        target_regions: regions handed to mpileup and to the merge.
        out_file: final VCF path; for a ``.vcf.gz`` name the intermediate
            ``.vcf`` is bgzipped and indexed at the end.

    Raises:
        IOError: the installed VarScan is older than 2.3.6.
        ValueError: the batch has no normal BAM.

    No value is returned on any path; the result is the file written at
    ``out_file``.
    """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not file_exists(out_file):
        orig_out_file = out_file
        # Work on an uncompressed VCF; compression happens at the very end
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, _ = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(config, mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname],
                                                ref_file,
                                                config,
                                                max_read_depth,
                                                target_regions=target_regions,
                                                want_bcf=False)
                # The template is filled from the local names above
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup", None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(config, indel_file,
                              snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
                tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
                varscan_cmd = (
                    "java {jvm_opts} -jar {varscan_jar} somatic"
                    " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                    "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                    " --output-vcf --min-coverage 5 --p-value 0.98 "
                    "--strand-filter 1 ")
                # add minimum AF
                if "--min-var-freq" not in varscan_cmd:
                    # min_allele_fraction is a percentage (default 10);
                    # it is passed to VarScan as a fraction
                    min_af = float(
                        utils.get_in(paired.tumor_config,
                                     ("algorithm", "min_allele_fraction"),
                                     10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)
                for orig_fname, fname in [(tx_snp_in, tx_snp),
                                          (tx_indel_in, tx_indel)]:
                    cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                    do.run(cmd.format(**locals()), "Varscan paired fix")

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        to_combine = []
        if do.file_exists(snp_file):
            to_combine.append(snp_file)
            _fix_varscan_vcf(snp_file, paired.normal_name, paired.tumor_name,
                             config)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)
            _fix_varscan_vcf(indel_file, paired.normal_name, paired.tumor_name,
                             config)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        # Merge exactly the files that passed the checks above
        out_file = combine_variant_files(to_combine,
                                         out_file,
                                         ref_file,
                                         config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            for suffix in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + suffix):
                    os.remove(extra_file + suffix)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
Пример #41
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect provides the base calls. When the configured indel caller is
    scalpel, pindel or SomaticIndelDetector, that caller is run as well and
    its VCF is combined with the MuTect one; otherwise the MuTect VCF is
    linked to ``out_file`` as is.

    Args:
        align_bams: BAM files to call on; the first one names the default
            output file.
        items: sample dictionaries; ``items[0]["config"]`` drives the run.
        ref_file: reference file forwarded to the callers.
        assoc_files: associated files forwarded to the callers.
        region: region to restrict calling to, or None.
        out_file: destination VCF; derived from ``align_bams[0]`` when None.

    Returns:
        The path of the final VCF. Nothing is recomputed when that file
        already exists.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    # Intermediate file names are derived from the final output name.
    if "vcf" in out_file:
        out_file_mutect = out_file.replace(".vcf", "-mutect.vcf")
        out_file_indels = out_file.replace(".vcf", "-somaticIndels.vcf")
    else:
        out_file_mutect = out_file + "-mutect.vcf"
        out_file_indels = out_file + "-somaticIndels.vcf"
    broad_runner, params = _mutect_call_prep(align_bams, items, ref_file,
                                             assoc_files, region,
                                             out_file_mutect)
    if (not isinstance(region, (list, tuple))
            and not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Hand back the path like every other exit, so callers never
        # receive None for a file that was just written.
        return out_file

    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    if not file_exists(out_file_orig):
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
    is_paired = "-I:normal" in params
    if not utils.file_uptodate(out_file_mutect, out_file_orig):
        out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                             out_file_mutect, is_paired)

    indelcaller = vcfutils.get_indelcaller(base_config).lower()
    if ("scalpel" in indelcaller and region
            and isinstance(region, (tuple, list))
            and chromhacks.is_autosomal_or_sex(region[0])):
        # Scalpel InDels
        if scalpel.is_installed(items[0]["config"]):
            if not is_paired:
                vcfutils.check_paired_problems(items)
                scalpel._run_scalpel_caller(align_bams,
                                            items,
                                            ref_file,
                                            assoc_files,
                                            region=region,
                                            out_file=out_file_indels)
            else:
                scalpel._run_scalpel_paired(align_bams,
                                            items,
                                            ref_file,
                                            assoc_files,
                                            region=region,
                                            out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
        else:
            # Scalpel requested but unavailable: fall back to MuTect only.
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller:
        from bcbio.structural import pindel
        if pindel.is_installed(items[0]["config"]):
            pindel._run_tumor_pindel_caller(align_bams,
                                            items,
                                            ref_file,
                                            assoc_files,
                                            region=region,
                                            out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=items[0]["config"],
                region=region)
        else:
            # pindel requested but unavailable: fall back to MuTect only.
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        params_indels = _SID_call_prep(align_bams, items, ref_file,
                                       assoc_files, region,
                                       out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=items[0]["config"],
            region=region)
    else:
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Пример #42
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample, runs ``VarScan somatic`` on the
    normal/tumor pair, fixes the resulting SNP and indel VCFs and combines
    them into ``out_file``. Nothing is done when ``out_file`` already exists.
    An empty VCF is written instead when an mpileup file is empty or when
    there is no VarScan output to combine.

    Args:
        align_bams: BAM files handed to ``get_paired_bams``.
        ref_file: reference file used for mpileup and for combining VCFs.
        items: sample dictionaries; ``items[0]["config"]`` drives the run.
        target_regions: regions forwarded to mpileup and to the VCF merge.
        out_file: destination VCF.

    Raises:
        IOError: if the installed VarScan is older than v2.3.6.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if file_exists(out_file):
        return

    base = os.path.splitext(out_file)[0]

    # Normal first, tumor second: the order used on the VarScan command line.
    mpileup_files = []
    for bam_file, label in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, label)
        mpileup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([bam_file], ref_file,
                                            max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(fname).st_size == 0 for fname in mpileup_files):
        write_empty_vcf(out_file)
        return

    normal_tmp_mpileup, tumor_tmp_mpileup = mpileup_files

    jvm_opts = _get_varscan_opts(config)
    varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                   " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                   " --output-vcf --min-coverage 5 --p-value 0.98").format(
                       jvm_opts=jvm_opts,
                       varscan_jar=varscan_jar,
                       normal_tmp_mpileup=normal_tmp_mpileup,
                       tumor_tmp_mpileup=tumor_tmp_mpileup,
                       base=base)

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files = mpileup_files + [indel_file, snp_file]

    # The command receives {base} as output prefix rather than the
    # transactional paths, so those are intentionally unused here.
    with file_transaction(indel_file, snp_file) as (_tx_indel, _tx_snp):
        do.run(varscan_cmd, "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is also passed to do.run as a check
    # above; if it returns a callable rather than a bool these tests are
    # always true -- confirm against the do module.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, normal_name, tumor_name)

    if not to_combine:
        write_empty_vcf(out_file)
        return

    # Only merge the files that were actually found and fixed.
    out_file = combine_variant_files(to_combine,
                                     out_file, ref_file, config,
                                     region=target_regions)

    # Remove intermediate files; one of the VarScan outputs may be absent.
    for extra_file in cleanup_files:
        if os.path.exists(extra_file):
            os.remove(extra_file)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Пример #43
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample, runs ``VarScan somatic`` (with the
    strand filter enabled) on the normal/tumor pair, fixes the resulting SNP
    and indel VCFs and combines them into ``out_file``. Nothing is done when
    ``out_file`` already exists. An empty VCF is written instead when an
    mpileup file is empty or when there is no VarScan output to combine.

    Args:
        align_bams: BAM files handed to ``get_paired_bams``.
        ref_file: reference file used for mpileup and for combining VCFs.
        items: sample dictionaries; ``items[0]["config"]`` drives the run.
        target_regions: regions forwarded to mpileup and to the VCF merge.
        out_file: destination VCF.

    Raises:
        IOError: if the installed VarScan is older than v2.3.6.
        ValueError: if the pairing has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if file_exists(out_file):
        return

    base = os.path.splitext(out_file)[0]

    # Normal first, tumor second: the order used on the VarScan command line.
    mpileup_files = []
    for bam_file, label in [(paired.normal_bam, "normal"),
                            (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, label)
        mpileup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([bam_file],
                                            ref_file,
                                            max_read_depth,
                                            config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(fname).st_size == 0 for fname in mpileup_files):
        write_empty_vcf(out_file)
        return

    normal_tmp_mpileup, tumor_tmp_mpileup = mpileup_files

    jvm_opts = _get_varscan_opts(config)
    varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                   " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                   " --output-vcf --min-coverage 5 --p-value 0.98 "
                   "--strand-filter 1 ").format(
                       jvm_opts=jvm_opts,
                       varscan_jar=varscan_jar,
                       normal_tmp_mpileup=normal_tmp_mpileup,
                       tumor_tmp_mpileup=tumor_tmp_mpileup,
                       base=base)

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files = mpileup_files + [indel_file, snp_file]

    # The command receives {base} as output prefix rather than the
    # transactional paths, so those are intentionally unused here.
    with file_transaction(indel_file, snp_file) as (_tx_indel, _tx_snp):
        do.run(varscan_cmd, "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is also passed to do.run as a check
    # above; if it returns a callable rather than a bool these tests are
    # always true -- confirm against the do module.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

    if not to_combine:
        write_empty_vcf(out_file)
        return

    # Only merge the files that were actually found and fixed.
    out_file = combine_variant_files(to_combine,
                                     out_file,
                                     ref_file,
                                     config,
                                     region=target_regions)

    # Remove intermediate files; one of the VarScan outputs may be absent.
    for extra_file in cleanup_files:
        if os.path.exists(extra_file):
            os.remove(extra_file)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
Пример #44
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect provides the base calls. When the configured indel caller is
    scalpel, pindel or SomaticIndelDetector, that caller is run as well and
    its VCF is combined with the MuTect one; otherwise the MuTect VCF is
    linked to ``out_file`` as is.

    Args:
        align_bams: BAM files to call on; the first one names the default
            output file.
        items: sample dictionaries; ``items[0]["config"]`` drives the run.
        ref_file: reference file forwarded to the callers.
        assoc_files: associated files forwarded to the callers.
        region: region to restrict calling to, or None.
        out_file: destination VCF; derived from ``align_bams[0]`` when None.

    Returns:
        The path of the final VCF. Nothing is recomputed when that file
        already exists.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    base_config = items[0]["config"]
    broad_runner = broad.runner_from_config(base_config, "mutect")
    # Intermediate file names are derived from the final output name.
    if "vcf" in out_file:
        out_file_mutect = out_file.replace(".vcf", "-mutect.vcf")
        out_file_indels = out_file.replace(".vcf", "-somaticIndels.vcf")
    else:
        out_file_mutect = out_file + "-mutect.vcf"
        out_file_indels = out_file + "-somaticIndels.vcf"
    broad_runner, params = _mutect_call_prep(align_bams, items, ref_file,
                                             assoc_files, region,
                                             out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Hand back the path like every other exit, so callers never
        # receive None for a file that was just written.
        return out_file

    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    with file_transaction(config, out_file_orig) as tx_out_file:
        # Rationale: MuTect writes another table to stdout, which we don't need
        params += ["--vcf", tx_out_file, "-o", os.devnull]
        broad_runner.run_mutect(params)
    is_paired = "-I:normal" in params
    out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                         out_file_mutect, is_paired)

    indelcaller = vcfutils.get_indelcaller(base_config).lower()
    if "scalpel" in indelcaller:
        # Scalpel InDels
        if scalpel.is_installed(items[0]["config"]):
            with file_transaction(config, out_file_indels) as tx_out_file2:
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file,
                                                assoc_files, region=region,
                                                out_file=tx_out_file2)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file,
                                                assoc_files, region=region,
                                                out_file=tx_out_file2)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
        else:
            # Scalpel requested but unavailable: fall back to MuTect only.
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller:
        if pindel.is_installed(items[0]["config"]):
            pindel._run_tumor_pindel_caller(align_bams, items, ref_file,
                                            assoc_files, region=region,
                                            out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=items[0]["config"],
                region=region)
        else:
            # pindel requested but unavailable: fall back to MuTect only.
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=items[0]["config"],
            region=region)
    else:
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Пример #45
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup file per sample, runs ``VarScan somatic`` on the
    normal/tumor pair, fixes the resulting SNP and indel VCFs, combines them
    and flags rejected calls. The work is done on an uncompressed ``.vcf``;
    the result is bgzipped and indexed when ``out_file`` ends in ``.gz``.
    Nothing is done when ``out_file`` already exists. An empty VCF is written
    instead when an mpileup file is empty or when there is no VarScan output
    to combine.

    Args:
        align_bams: BAM files handed to ``get_paired_bams``.
        ref_file: reference file used for mpileup and for combining VCFs.
        items: sample dictionaries; ``items[0]["config"]`` drives the run.
        target_regions: regions forwarded to mpileup and to the VCF merge.
        out_file: destination VCF.

    Raises:
        IOError: if the installed VarScan is older than v2.3.6.
        ValueError: if the pairing has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if file_exists(out_file):
        return

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base = utils.splitext_plus(out_file)[0]

    # Normal first, tumor second: the order used on the VarScan command line.
    mpileup_files = []
    for bam_file, label in [(paired.normal_bam, "normal"),
                            (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, label)
        mpileup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([bam_file], ref_file,
                                            max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None,
                   [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(fname).st_size == 0 for fname in mpileup_files):
        write_empty_vcf(orig_out_file, config)
        return

    normal_tmp_mpileup, tumor_tmp_mpileup = mpileup_files

    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files = mpileup_files + [indel_file, snp_file]

    # The command receives {base} as output prefix rather than the
    # transactional paths, so those are intentionally unused here.
    with file_transaction(indel_file, snp_file) as (_tx_indel, _tx_snp):
        with utils.curdir_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            # min_allele_fraction is read as a percentage (default 10) and
            # handed to VarScan as a fraction.
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"),
                                        10)) / 100.0
            varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                           " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 "
                           "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts,
                                      varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      base=base,
                                      min_af=min_af),
                   "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is also passed to do.run as a check
    # above; if it returns a callable rather than a bool these tests are
    # always true -- confirm against the do module.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if do.file_exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        return

    # Only merge the files that were actually found and fixed.
    out_file = combine_variant_files(to_combine,
                                     out_file, ref_file, config,
                                     region=target_regions)

    # Remove intermediate files together with any compressed/indexed copies.
    for extra_file in cleanup_files:
        for suffix in ["", ".gz", ".gz.tbi"]:
            if os.path.exists(extra_file + suffix):
                os.remove(extra_file + suffix)

    if os.path.getsize(out_file) == 0:
        # Pass config like the other empty-VCF exits of this function.
        write_empty_vcf(out_file, config)

    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)

    _add_reject_flag(out_file, config)
Пример #46
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):

    """Run a paired VarScan analysis, also known as "somatic".

    Streams normal and tumor mpileup output straight into ``varscan somatic``,
    post-processes the SNP and indel VCFs it writes through a shell pipeline,
    and combines the fixed files into ``out_file``. The calling steps are
    skipped when ``out_file`` already exists.

    Raises ValueError when the pairing has no normal BAM.

    NOTE(review): this block ends at the edge of the reviewed chunk; no
    ``return`` is visible here, so confirm whether the function continues.
    """

    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        # Name the batch in the error so the offending samples can be found.
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not utils.file_exists(out_file):
        assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"
        # Command lines only: they are embedded in the varscan command below
        # via <(...) instead of being run into intermediate files.
        normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file,
                                                  config, max_read_depth,
                                                  target_regions=target_regions,
                                                  want_bcf=False)
        tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file,
                                                 config, max_read_depth,
                                                 target_regions=target_regions,
                                                 want_bcf=False)
        base, ext = utils.splitext_plus(out_file)
        indel_file = base + "-indel.vcf"
        snp_file = base + "-snp.vcf"
        with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                # Drops mpileup lines matching the tab/0/tab/tab pattern
                # before they reach VarScan.
                remove_zerocoverage = r"ifne grep -v -P '\t0\t\t$'"
                varscan_cmd = ("varscan {jvm_opts} somatic "
                               " <({normal_mpileup_cl} | {remove_zerocoverage}) "
                               "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                               "--output-snp {tx_snp} --output-indel {tx_indel} "
                               " --output-vcf --min-coverage 5 --p-value 0.98 "
                               "--strand-filter 1 ")
                # add minimum AF
                # min_allele_fraction is read as a percentage (default 10)
                # and converted to a fraction for --min-var-freq.
                # NOTE(review): the literal above never contains
                # "--min-var-freq", so this condition is always true.
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                      "min_allele_fraction"), 10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                # Placeholders are filled from the local variables above.
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)

        # Post-process whichever VarScan outputs exist into bgzipped
        # "-fix.vcf.gz" files; only those are combined afterwards.
        to_combine = []
        for fname in [snp_file, indel_file]:
            if utils.file_exists(fname):
                fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0])
                with file_transaction(config, fix_file) as tx_fix_file:
                    fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                    fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                    # "py" executable located next to the running interpreter.
                    py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                    normal_name = paired.normal_name
                    tumor_name = paired.tumor_name
                    # Pipeline: fix VarScan output with sample names, apply the
                    # two ambiguous-base fixes, vcfuniqalleles, mark records
                    # whose SS is neither '.' nor '2' with the REJECT filter,
                    # apply spv_freq_filter, then bgzip.
                    cmd = ("cat {fname} | "
                           "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                            """ "{normal_name}", "{tumor_name}")' | """
                           "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                           """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                           "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | "
                           "bgzip -c > {tx_fix_file}")
                    do.run(cmd.format(**locals()), "Varscan paired fix")
                to_combine.append(fix_file)

        if not to_combine:
            out_file = write_empty_vcf(out_file, config)
        else:
            out_file = combine_variant_files(to_combine,
                                             out_file, ref_file, config,
                                             region=target_regions)
        # NOTE(review): this call omits config, unlike the write_empty_vcf
        # call a few lines above -- confirm that is intended.
        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
        if out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)