Exemplo n.º 1
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    for i, data in enumerate(items):
        if "sv" not in data:
            data["sv"] = []
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": filter_vcf,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Exemplo n.º 2
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(
            utils.get_in(data, ("config", "algorithm",
                                "aligner")) in ["bwa", False, None]
            for data in items):
        raise ValueError(
            "Require bwa-mem alignment input for lumpy structural variation detection"
        )
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(
            data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(
            lumpy_vcf, sample, utils.append_stem(lumpy_vcf, "-%s" % sample),
            data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file,
                                  data)
        gt_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            ref_file=dd.get_ref_file(data),
            config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name],
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({
            "variantcaller": "lumpy",
            "vrn_file": effects_vcf or vcf_file,
            "exclude_file": exclude_file
        })
        out.append(data)
    return out
Exemplo n.º 3
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Exemplo n.º 4
0
def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq defaults and
    evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently not used as new versions of delly do not get good sensitivity
    with downsampled BAMs.
    """
    sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    ds_bam = bam.downsample(
        dd.get_align_bam(data),
        data,
        1e8,
        read_filter="-F 'not secondary_alignment and proper_pair'",
        always_run=True,
        work_dir=work_dir)
    out_bam = "%s-final%s" % utils.splitext_plus(ds_bam)
    if not utils.file_exists(out_bam):
        bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]
Exemplo n.º 5
0
def _prep_subsampled_bams(data, work_dir):
    """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Subsamples correctly aligned reads to 100 million based on speedseq defaults and
    evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently does not downsample as new versions do not get good sensitivity with
    downsampled BAMs.
    """
    full_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
    return [full_bam]

    ds_bam = bam.downsample(full_bam, data, 1e8, read_filter="-F 'not secondary_alignment and proper_pair'",
                            always_run=True, work_dir=work_dir)
    out_bam = "%s-final%s" % utils.splitext_plus(ds_bam)
    if not utils.file_exists(out_bam):
        bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"])
    bam.index(out_bam, data["config"])
    return [out_bam]
Exemplo n.º 6
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1],
                                               "lumpy"))
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    pebed_file, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    sample_config_file = _write_samples_to_ids(pebed_file, items)
    lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items)
    for i, data in enumerate(items):
        if "sv" not in data:
            data["sv"] = []
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_bedpe = _filter_by_support(_subset_to_sample(pebed_file, i, data), i, data)
        if lumpy_vcf:
            sample_vcf = utils.append_stem(lumpy_vcf, "-%s" % sample)
            sample_vcf = _filter_by_bedpe(vcfutils.select_sample(lumpy_vcf, sample, sample_vcf, data["config"]),
                                          sample_bedpe, data)
        else:
            sample_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": sample_vcf,
                           "exclude_file": exclude_file,
                           "bedpe_file": sample_bedpe,
                           "sample_bed": sample_config_file})
        out.append(data)
    return out
Exemplo n.º 7
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Exemplo n.º 8
0
 def _run(in_file, work_dir, data):
     dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
     return _run_svtyper(in_file, dedup_bam, sr_bam,
                         call.get("exclude_file"), data)
Exemplo n.º 9
0
 def _run(in_file, work_dir, data):
     dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
     return _run_svtyper(in_file, dedup_bam, sr_bam, call.get("exclude_file"), data)