Exemplo n.º 1
0
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add additional nblock regions based on configured regions to call.
    Identifies user defined regions which we should not be analyzing.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if input_regions_bed:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            str_regions = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" %
                                               (str_regions, str_regions),
                                               from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        if input_nblock == ref_regions:
            raise ValueError(
                "Input variant_region file (%s) "
                "excludes all genomic regions. Do the chromosome names "
                "in the BED file match your genome (chr1 vs 1)?" %
                input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions],
                                         ref_regions)
    else:
        all_intervals = nblock_regions
    if "noalt_calling" in dd.get_tools_on(
            data) or "altcontigs" in dd.get_exclude_regions(data):
        from bcbio.heterogeneity import chromhacks
        remove_intervals = ref_regions.filter(
            lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, remove_intervals],
                                         ref_regions)
    return all_intervals.merge()
Exemplo n.º 2
0
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add additional nblock regions based on configured regions to call.
    Identifies user defined regions which we should not be analyzing.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if input_regions_bed:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            str_regions = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" % (str_regions, str_regions),
                                               from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        if input_nblock == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions)
    else:
        all_intervals = nblock_regions
    if "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data):
        from bcbio.heterogeneity import chromhacks
        remove_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions)
    return all_intervals.merge()
Exemplo n.º 3
0
def to_standardonly(in_file, ref_file, data):
    """Subset a VCF input file to standard chromosomes (1-22,X,Y,MT).
    """
    from bcbio.heterogeneity import chromhacks
    out_file = "%s-stdchrs.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file):
        stds = []
        for c in ref.file_contigs(ref_file):
            if chromhacks.is_nonalt(c.name):
                stds.append(c.name)
        if stds:
            with file_transaction(data, out_file) as tx_out_file:
                stds = ",".join(stds)
                in_file = bgzip_and_index(in_file, data["config"])
                cmd = "bcftools view -o {tx_out_file} -O z {in_file} {stds}"
                do.run(cmd.format(**locals()), "Subset to standard chromosomes")
    return bgzip_and_index(out_file, data["config"]) if utils.file_exists(out_file) else in_file
Exemplo n.º 4
0
def to_standardonly(in_file, ref_file, data):
    """Subset a VCF input file to standard chromosomes (1-22,X,Y,MT).
    """
    from bcbio.heterogeneity import chromhacks
    out_file = "%s-stdchrs.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file):
        stds = []
        for c in ref.file_contigs(ref_file):
            if chromhacks.is_nonalt(c.name):
                stds.append(c.name)
        if stds:
            with file_transaction(data, out_file) as tx_out_file:
                stds = ",".join(stds)
                in_file = bgzip_and_index(in_file, data["config"])
                cmd = "bcftools view -o {tx_out_file} -O z {in_file} {stds}"
                do.run(cmd.format(**locals()), "Subset to standard chromosomes")
    return bgzip_and_index(out_file, data["config"]) if utils.file_exists(out_file) else in_file
Exemplo n.º 5
0
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.
    """
    std_chroms = []
    prob_chroms = []
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(contig.name)):
            prob_chroms.append(contig.name)
        else:
            std_chroms.append(contig.name)
    if len(prob_chroms) > 0:
        return std_chroms
    else:
        return []
Exemplo n.º 6
0
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.
    """
    std_chroms = []
    prob_chroms = []
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(contig.name)):
            prob_chroms.append(contig.name)
        else:
            std_chroms.append(contig.name)
    if len(prob_chroms) > 0:
        return std_chroms
    else:
        return []
Exemplo n.º 7
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
Exemplo n.º 8
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(msg):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
Exemplo n.º 9
0
def main(ref_file):
    ref_bedtool = get_ref_bedtool(ref_file, {})

    mappable_file = os.path.basename(URL)
    r = requests.get(URL, stream=True)
    with open(mappable_file, "wb") as f:
        shutil.copyfileobj(r.raw, f)

    ref_bedtool.subtract(mappable_file,
                         nonamecheck=True).saveas(OUT_FILE + ".tmp")
    with open(OUT_FILE + ".tmp") as in_handle:
        with open(OUT_FILE, "w") as out_handle:
            for line in in_handle:
                if chromhacks.is_nonalt(line.split()[0]):
                    out_handle.write("%s\tumap_k100_mappability\n" %
                                     line.strip())
    os.remove(OUT_FILE + ".tmp")
    vcfutils.bgzip_and_index(OUT_FILE)
    os.remove(mappable_file)
Exemplo n.º 10
0
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve
    speeds.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = set([c.name for c in ref.file_contigs(dd.get_ref_file(data))])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
                            "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs:
                            if chromhacks.is_nonalt(r.chrom):
                                out_handle.write(str(r))
    data = dd.set_variant_regions(data, out_file)
    return data
Exemplo n.º 11
0
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve
    speeds.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = set([c.name for c in ref.file_contigs(dd.get_ref_file(data))])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
                            "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs:
                            if chromhacks.is_nonalt(r.chrom):
                                out_handle.write(str(r))
    data = dd.set_variant_regions(data, out_file)
    return data
Exemplo n.º 12
0
 def callable_chrom_filter(r):
     """Filter to callable region, potentially limiting by chromosomes.
     """
     return r.name == "CALLABLE" and (not noalt_calling
                                      or chromhacks.is_nonalt(r.chrom))
Exemplo n.º 13
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
              not all(has_aligned_reads(x, region) for x in align_bams)):
                vcfutils.write_empty_vcf(out_file)
                return
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(base_config)
        if ("scalpel" in indelcaller.lower() and region and isinstance(region, (tuple, list))
              and chromhacks.is_nonalt(region[0])):
            # Scalpel InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller.lower():
            from bcbio.structural import pindel
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if pindel.is_installed(items[0]["config"]):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files, region=region,
                                          out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Exemplo n.º 14
0
 def callable_chrom_filter(r):
     """Filter to callable region, potentially limiting by chromosomes.
     """
     return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))