Пример #1
0
def plot_model_segments(seg_files, work_dir, data):
    """Create the PlotModeledSegments diagnostic plot for a sample.

    A copy of the reference sequence dictionary, limited to contigs accepted
    by ``chromhacks.is_autosomal_or_sex``, is written next to the transactional
    output and handed to the plotting tool. Nothing is run when the expected
    plot already exists.

    Returns a dictionary with the plot path stored under the "seg" key.
    """
    from bcbio.heterogeneity import chromhacks
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s.modeled.png" % sample_name)
    if utils.file_exists(out_file):
        return {"seg": out_file}
    with file_transaction(data, out_file) as tx_out_file:
        tx_dir = os.path.dirname(tx_out_file)
        dict_file = utils.splitext_plus(dd.get_ref_file(data))[0] + ".dict"
        plot_dict = os.path.join(tx_dir, os.path.basename(dict_file))
        with open(dict_file) as in_handle, open(plot_dict, "w") as out_handle:
            for line in in_handle:
                if line.startswith("@SQ"):
                    # Sequence lines are kept only for standard chromosomes;
                    # every other header line is copied through untouched.
                    sn_fields = [field for field in line.split("\t") if field.startswith("SN:")]
                    contig = sn_fields[0].split(":", 1)[1].strip()
                    if not chromhacks.is_autosomal_or_sex(contig):
                        continue
                out_handle.write(line)
        params = ["-T", "PlotModeledSegments",
                  "--denoised-copy-ratios", tz.get_in(["depth", "bins", "normalized"], data),
                  "--segments", seg_files["final_seg"],
                  "--allelic-counts", seg_files["tumor_hets"],
                  "--sequence-dictionary", plot_dict,
                  "--minimum-contig-length", "10",
                  "--output-prefix", sample_name,
                  "-O", tx_dir]
        _run_with_memory_scaling(params, tx_out_file, data)
    return {"seg": out_file}
Пример #2
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    Extra contigs can be arranged in different ways between files, which
    causes incompatibility issues with GATK and other tools. The read group
    header is rewritten in the same pass, as in fixrg.

    Returns the path to the cleaned BAM, which is only regenerated when it is
    not up to date relative to ``in_bam``.
    """
    base_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(base_dir, "bamclean", sample_name))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = []
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if chromhacks.is_autosomal_or_sex(contig.name):
                target_chroms.append(contig.name)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        # Restrict reads to the standard contigs, rewrite the header through
        # cleanbam.fix_header, then replace the read group.
        cmd = ("samtools view -h {in_bam} {str_chroms} | "
               """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
               """cleanbam.fix_header("{comma_chroms}")' | """
               "samtools view -u - | "
               "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        full_cmd = cmd.format(in_bam=in_bam,
                              str_chroms=" ".join(target_chroms),
                              bcbio_py=sys.executable,
                              comma_chroms=",".join(target_chroms),
                              rg_info=rg_info,
                              tx_out_file=tx_out_file)
        do.run(full_cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Пример #3
0
def _target_chroms_and_header(bam_file, data):
    """Collect the standard contigs of a BAM header that are also in the reference.

    Reference contigs are limited to those accepted by
    ``chromhacks.is_autosomal_or_sex``. Alternative spellings (chrM <-> MT,
    chr1 <-> 1) are worked out for each BAM contig, but only exact name matches
    are kept: remapping is currently disabled due to speed issues.

    Returns the matching contig names in BAM header order; an assertion fails
    when no contig matches.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    ref_contigs = ref.file_contigs(dd.get_ref_file(data))
    target_chroms = {contig.name: idx for idx, contig in enumerate(ref_contigs)
                     if chromhacks.is_autosomal_or_sex(contig.name)}
    out_chroms = []
    with pysam.Samfile(bam_file, "rb") as bamfile:
        for sq_record in bamfile.header["SQ"]:
            bam_contig = sq_record["SN"]
            if bam_contig in target_chroms:
                candidate = bam_contig
            elif bam_contig in special_remaps and special_remaps[bam_contig] in target_chroms:
                candidate = special_remaps[bam_contig]
            elif bam_contig.startswith("chr") and bam_contig.replace("chr", "") in target_chroms:
                candidate = bam_contig.replace("chr", "")
            elif "chr%s" % bam_contig in target_chroms:
                candidate = "chr%s" % bam_contig
            else:
                candidate = None
            # Requiring an exact name match keeps chr1 -> 1 style remapping off.
            # Contig order is not checked: it is not required when only the
            # SAM header is being fixed.
            if not candidate or candidate != bam_contig:
                continue
            out_chroms.append(candidate)
    assert out_chroms, ("remove_extracontigs: Did not find any chromosomes in reference file: %s %s" %
                        (bam_file, target_chroms))
    return out_chroms
Пример #4
0
def plot_model_segments(seg_files, work_dir, data):
    """Create the PlotModeledSegments diagnostic plot for a sample.

    A copy of the reference sequence dictionary, limited to contigs accepted
    by ``chromhacks.is_autosomal_or_sex``, is written next to the transactional
    output and handed to the plotting tool. Nothing is run when the expected
    plot already exists.

    Returns a dictionary with the plot path stored under the "seg" key.
    """
    from bcbio.heterogeneity import chromhacks
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s.modeled.png" % sample_name)
    if utils.file_exists(out_file):
        return {"seg": out_file}
    with file_transaction(data, out_file) as tx_out_file:
        tx_dir = os.path.dirname(tx_out_file)
        dict_file = utils.splitext_plus(dd.get_ref_file(data))[0] + ".dict"
        plot_dict = os.path.join(tx_dir, os.path.basename(dict_file))
        with open(dict_file) as in_handle, open(plot_dict, "w") as out_handle:
            for line in in_handle:
                if line.startswith("@SQ"):
                    # Sequence lines are kept only for standard chromosomes;
                    # every other header line is copied through untouched.
                    sn_fields = [field for field in line.split("\t") if field.startswith("SN:")]
                    contig = sn_fields[0].split(":", 1)[1].strip()
                    if not chromhacks.is_autosomal_or_sex(contig):
                        continue
                out_handle.write(line)
        params = ["-T", "PlotModeledSegments",
                  "--denoised-copy-ratios", tz.get_in(["depth", "bins", "normalized"], data),
                  "--segments", seg_files["final_seg"],
                  "--allelic-counts", seg_files["tumor_hets"],
                  "--sequence-dictionary", plot_dict,
                  "--minimum-contig-length", "10",
                  "--output-prefix", sample_name,
                  "-O", tx_dir]
        _run_with_memory_scaling(params, tx_out_file, data)
    return {"seg": out_file}
Пример #5
0
def main():
    """Build a single hg38 VCF from the ESP6500 GRCh38-liftover release.

    Downloads and unpacks the per-chromosome VCFs, then concatenates
    chromosomes 1-22, X and Y into one raw VCF. Header lines are taken from the
    first chromosome only, with ``##contig`` lines inserted ahead of the
    ``#CHROM`` line. Each record is moved to the coordinate stored in its
    trailing ``GRCh38_POSITION`` annotation; records without a lifted position
    ("-1") or landing on a non-standard contig are dropped. The result is
    sorted, decomposed and normalized with vt, then bgzipped and indexed.

    Raises:
        AssertionError: if a chromosome does not have exactly one input file or
            a record does not end with a ``GRCh38_POSITION`` annotation.
        subprocess.CalledProcessError: if any of the shell commands fails.
    """
    url = "http://evs.gs.washington.edu/evs_bulk_data/ESP6500SI-V2-SSA137.GRCh38-liftover.snps_indels.vcf.tar.gz"
    ref_file = "../seq/hg38.fa"
    subprocess.check_call(
        "wget -c -O esp-orig.tar.gz {url}".format(url=url), shell=True)
    subprocess.check_call("tar -xzvpf esp-orig.tar.gz", shell=True)
    raw_file = "esp-raw.vcf"
    # range() is not a list on Python 3, so it is materialized before
    # concatenation; the upper bound of 23 makes sure chromosome 22 is included.
    chroms = list(range(1, 23)) + ["X", "Y"]
    with open(raw_file, "w") as out_handle:
        for i, chrom in enumerate(chroms):
            fnames = glob.glob("*chr%s.snps_indels.vcf" % chrom)
            assert len(fnames) == 1, (chrom, fnames)
            with open(fnames[0]) as in_handle:
                for line in in_handle:
                    if line.startswith("#"):
                        # Only the first file contributes header lines.
                        if i == 0:
                            if line.startswith("#CHROM"):
                                _add_contigs(out_handle, ref_file)
                            out_handle.write(line)
                    else:
                        parts = line.strip().split("\t")
                        # The lifted position is the last entry of the last column.
                        key, val = parts[-1].split(";")[-1].split("=")
                        assert key == "GRCh38_POSITION"
                        if val != "-1":
                            new_chrom, new_pos = val.split(":")
                            if chromhacks.is_autosomal_or_sex(new_chrom):
                                parts[0] = "chr%s" % new_chrom
                                parts[1] = new_pos
                                out_handle.write("\t".join(parts) + "\n")
    out_file = "ESP6500SI-V2-hg38.vcf.gz"
    subprocess.check_call(
        ("vt sort {raw_file} | vt decompose -s - | "
         "vt normalize -n -r {ref_file} - | bgzip -c > {out_file}").format(
             raw_file=raw_file, ref_file=ref_file, out_file=out_file),
        shell=True)
    vcfutils.bgzip_and_index(out_file)
Пример #6
0
def _target_chroms_and_header(bam_file, data):
    """Collect the standard contigs of a BAM header that are also in the reference.

    Reference contigs are limited to those accepted by
    ``chromhacks.is_autosomal_or_sex``. Alternative spellings (chrM <-> MT,
    chr1 <-> 1) are worked out for each BAM contig, but only exact name matches
    are kept: remapping is currently disabled due to speed issues.

    Returns the matching contig names in BAM header order; an assertion fails
    when no contig matches.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    ref_contigs = ref.file_contigs(dd.get_ref_file(data))
    target_chroms = {contig.name: idx for idx, contig in enumerate(ref_contigs)
                     if chromhacks.is_autosomal_or_sex(contig.name)}
    out_chroms = []
    with pysam.Samfile(bam_file, "rb") as bamfile:
        for sq_record in bamfile.header["SQ"]:
            bam_contig = sq_record["SN"]
            if bam_contig in target_chroms:
                candidate = bam_contig
            elif bam_contig in special_remaps and special_remaps[bam_contig] in target_chroms:
                candidate = special_remaps[bam_contig]
            elif bam_contig.startswith("chr") and bam_contig.replace("chr", "") in target_chroms:
                candidate = bam_contig.replace("chr", "")
            elif "chr%s" % bam_contig in target_chroms:
                candidate = "chr%s" % bam_contig
            else:
                candidate = None
            # Requiring an exact name match keeps chr1 -> 1 style remapping off.
            # Contig order is not checked: it is not required when only the
            # SAM header is being fixed.
            if not candidate or candidate != bam_contig:
                continue
            out_chroms.append(candidate)
    assert out_chroms, ("remove_extracontigs: Did not find any chromosomes in reference file: %s %s" %
                        (bam_file, target_chroms))
    return out_chroms
Пример #7
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    Extra contigs can be arranged in different ways between files, which
    causes incompatibility issues with GATK and other tools. The read group
    header is rewritten in the same pass, as in fixrg.

    Returns the path to the cleaned BAM, which is only regenerated when it is
    not up to date relative to ``in_bam``.
    """
    base_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(base_dir, "bamclean", sample_name))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = []
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if chromhacks.is_autosomal_or_sex(contig.name):
                target_chroms.append(contig.name)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        # Restrict reads to the standard contigs, rewrite the header through
        # cleanbam.fix_header, then replace the read group.
        cmd = ("samtools view -h {in_bam} {str_chroms} | "
               """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
               """cleanbam.fix_header("{comma_chroms}")' | """
               "samtools view -u - | "
               "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        full_cmd = cmd.format(in_bam=in_bam,
                              str_chroms=" ".join(target_chroms),
                              bcbio_py=sys.executable,
                              comma_chroms=",".join(target_chroms),
                              rg_info=rg_info,
                              tx_out_file=tx_out_file)
        do.run(full_cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Пример #8
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect calls are written to a "-mutect" VCF derived from ``out_file``.
    Depending on the configured indel caller, indel calls from Scalpel, Pindel
    or SomaticIndelDetector are combined with the MuTect calls into
    ``out_file``; in every other case ``out_file`` is linked to the MuTect
    calls. No work is done when ``out_file`` already exists.

    Args:
        align_bams: BAM files to call on; the first one names the default output.
        items: sample dictionaries; the first one supplies "config" and "sam_ref".
        ref_file: reference file handed to the preparation helpers and callers.
        assoc_files: associated files handed to the preparation helpers and callers.
        region: region to restrict calling to. Scalpel is only used when this
            is a tuple/list whose first element is a standard chromosome.
        out_file: final output VCF; defaults to a name based on ``align_bams[0]``.

    Returns:
        Path to the final VCF, including when an empty VCF is written because
        the BAMs have no aligned reads in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                                 region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the empty VCF so callers receive a path on every code
            # path instead of None.
            return out_file
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(config).lower()
        # Every indel caller writes to the same derived file name.
        out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                           if "vcf" in out_file else out_file + "-somaticIndels.vcf")
        if ("scalpel" in indelcaller and region and isinstance(region, (tuple, list))
                and chromhacks.is_autosomal_or_sex(region[0])):
            # Scalpel InDels
            if scalpel.is_installed(config):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller:
            from bcbio.structural import pindel
            if pindel.is_installed(config):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=config,
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Пример #9
0
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect calls are written to a "-mutect" VCF derived from ``out_file``.
    Depending on the configured indel caller, indel calls from Scalpel, Pindel
    or SomaticIndelDetector are combined with the MuTect calls into
    ``out_file``; in every other case ``out_file`` is linked to the MuTect
    calls. No work is done when ``out_file`` already exists.

    Args:
        align_bams: BAM files to call on; the first one names the default output.
        items: sample dictionaries; the first one supplies "config" and "sam_ref".
        ref_file: reference file handed to the preparation helpers and callers.
        assoc_files: associated files handed to the preparation helpers and callers.
        region: region to restrict calling to. Scalpel is only used when this
            is a tuple/list whose first element is a standard chromosome.
        out_file: final output VCF; defaults to a name based on ``align_bams[0]``.

    Returns:
        Path to the final VCF, including when an empty VCF is written because
        the BAMs have no aligned reads in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                                 region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Hand back the empty VCF so callers receive a path on every code
            # path instead of None.
            return out_file
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(config).lower()
        # Every indel caller writes to the same derived file name.
        out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                           if "vcf" in out_file else out_file + "-somaticIndels.vcf")
        if ("scalpel" in indelcaller and region and isinstance(region, (tuple, list))
                and chromhacks.is_autosomal_or_sex(region[0])):
            # Scalpel InDels
            if scalpel.is_installed(config):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller:
            from bcbio.structural import pindel
            if pindel.is_installed(config):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=config,
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
Пример #10
0
def _add_contigs(out_handle, ref_file):
    """Write a ``##contig`` header line for each standard contig in ``ref_file``.

    Contigs are those returned by ``ref.file_contigs`` and accepted by
    ``chromhacks.is_autosomal_or_sex``; each line records the contig name and
    size.
    """
    standard_contigs = (c for c in ref.file_contigs(ref_file)
                        if chromhacks.is_autosomal_or_sex(c.name))
    for c in standard_contigs:
        header_line = "##contig=<ID=%s,length=%s>\n" % (c.name, c.size)
        out_handle.write(header_line)