Пример #1
0
def is_over_threshold(bam, variant_id, regions, max_reads):
    """Tell whether either breakpoint region holds more than max_reads reads.

    Args:
        bam: alignment handle passed through to count_reads_in_region.
        variant_id: identifier used only in the log message.
        regions: pair (regionA, regionB); each is indexed 0..4 as
            (sample, chrom, center, leftflank, rightflank) in the log message.
        max_reads: read-count ceiling; a region is over when count > max_reads.

    Returns:
        True if at least one region exceeds the ceiling (a SKIPPING message
        is logged in that case), otherwise False.
    """
    region_a, region_b = regions
    reads_a = count_reads_in_region(region_a, bam)
    reads_b = count_reads_in_region(region_b, bam)

    # Guard clause: nothing to report when both regions are within the limit.
    if not (reads_a > max_reads or reads_b > max_reads):
        return False

    template = (
        "SKIPPING -- Variant '{}' has a region with too many reads (> {})\n"
        "\t\t A: (sample={} chrom={} center={} leftflank={} rightflank={}) : {}\n"
        "\t\t B: (sample={} chrom={} center={} leftflank={} rightflank={}) : {}"
    )
    values = [variant_id, max_reads]
    values.extend(region_a[idx] for idx in range(5))
    values.append(reads_a)
    values.extend(region_b[idx] for idx in range(5))
    values.append(reads_b)
    logit(template.format(*values))
    return True
Пример #2
0
def dump_library_metrics(lib_info_path, sample):
    """Write the sample's library metrics to lib_info_path if it does not exist.

    Nothing happens when lib_info_path is None or already exists on disk.

    Args:
        lib_info_path: destination path, or None to disable the dump.
        sample: sample object; it is wrapped in a one-element list and handed
            to write_sample_json.
    """
    if lib_info_path is None or os.path.exists(lib_info_path):
        return

    logit('Writing library metrics to %s...' % lib_info_path)
    # The context manager closes the file even if serialization raises.
    with open(lib_info_path, 'w') as lib_info_file:
        write_sample_json([sample], lib_info_file)
    logit('Finished writing library metrics')
Пример #3
0
def dump_library_metrics(lib_info_path, sample):
    """Persist library metrics for `sample` unless the target already exists.

    Args:
        lib_info_path: file to create; None (or an existing path) makes this
            call a no-op.
        sample: sample object passed, as a single-element list, to
            write_sample_json.
    """
    sample_list = [sample]
    if (lib_info_path is not None) and (not os.path.exists(lib_info_path)):
        logit('Writing library metrics to %s...' % lib_info_path)
        # `with` guarantees the handle is released if write_sample_json fails.
        with open(lib_info_path, 'w') as lib_info_file:
            write_sample_json(sample_list, lib_info_file)
        logit('Finished writing library metrics')
Пример #4
0
def parallel_calculate_genotype(alignment_file, reference_fasta, library_data,
                                active_libs, sample_name, split_slop,
                                min_aligned, split_weight, disc_weight,
                                max_reads, debug, batch_breakpoints,
                                batch_regions, batch_number):
    """Genotype one batch of breakpoints using its own alignment-file handle.

    Args:
        alignment_file, reference_fasta: passed to open_alignment_file.
        library_data, active_libs, max_reads: forwarded to gather_reads.
        sample_name: recorded in every result entry.
        split_slop, min_aligned: forwarded to tally_variant_read_fragments.
        split_weight, disc_weight: forwarded to bayesian_genotype.
        debug: forwarded to the tally and genotype helpers.
        batch_breakpoints, batch_regions: parallel sequences, zipped pairwise.
        batch_number: used only in log messages.

    Returns:
        dict with 'genotypes' (one entry per breakpoint), 'skip-count'
        (breakpoints flagged by gather_reads as having too many reads) and
        'no-read-count' (breakpoints for which no reads were gathered).
    """
    logit("Starting batch: {}".format(batch_number))
    bam = open_alignment_file(alignment_file, reference_fasta)

    genotype_results = []
    (skip_count, no_read_count) = (0, 0)
    # try/finally so the alignment handle is released even when a helper
    # raises part-way through the batch.
    try:
        t0 = time.time()
        for breakpoint, regions in zip(batch_breakpoints, batch_regions):
            (read_batches, many) = gather_reads(bam, breakpoint['id'],
                                                regions, library_data,
                                                active_libs, max_reads)

            # if there are too many reads around the breakpoint
            if many is True:
                skip_count += 1
                genotype_results.append(
                    make_empty_genotype_result(breakpoint['id'], sample_name))
                continue

            # if there are no reads around the breakpoint
            if not read_batches:
                no_read_count += 1
                genotype_results.append(
                    make_detailed_empty_genotype_result(breakpoint['id'],
                                                        sample_name))
                continue

            counts = tally_variant_read_fragments(split_slop, min_aligned,
                                                  breakpoint, read_batches,
                                                  debug)

            # no usable evidence of any kind -> detailed empty result
            if sum(counts.values()) == 0:
                genotype_results.append(
                    make_detailed_empty_genotype_result(breakpoint['id'],
                                                        sample_name))
                continue

            result = bayesian_genotype(breakpoint, counts, split_weight,
                                       disc_weight, debug)
            genotype_results.append({
                'variant.id': breakpoint['id'],
                'sample.name': sample_name,
                'genotype': result
            })

        t1 = time.time()
        logit("Batch {} Processing Elapsed Time: {:.4f} secs".format(
            batch_number, t1 - t0))
    finally:
        bam.close()

    return {
        'genotypes': genotype_results,
        'skip-count': skip_count,
        'no-read-count': no_read_count
    }
Пример #5
0
def tally_variant_read_fragments(split_slop, min_aligned, breakpoint,
                                 sam_fragments, debug):
    """Sum split-read and paired-end evidence over all fragments.

    Fragments are visited in sorted query-name order. After summing, reference
    support of a given evidence type is zeroed when that type has no
    alternate support while the other type does, and clip-only alternate
    support is discarded.

    Args:
        split_slop, min_aligned: forwarded to gather_split_read_evidence.
        breakpoint: forwarded to both evidence helpers; breakpoint['id'] is
            used in the debug log.
        sam_fragments: mapping of query name -> fragment.
        debug: when truthy, the final counts are logged.

    Returns:
        dict with keys 'ref_seq', 'alt_seq', 'ref_span', 'alt_span',
        'alt_clip'.
    """
    # initialize counts to zero
    ref_span, alt_span = 0, 0
    ref_seq, alt_seq = 0, 0
    alt_clip = 0

    for query_name in sorted(sam_fragments.keys()):
        fragment = sam_fragments[query_name]

        (ref_seq_calc, alt_seq_calc, alt_clip_calc) = \
            gather_split_read_evidence(
                fragment, breakpoint, split_slop, min_aligned)

        ref_seq += ref_seq_calc
        alt_seq += alt_seq_calc
        alt_clip += alt_clip_calc

        # The helper also returns two confidence-interval tallies; they were
        # previously summed into locals nobody read, so they are dropped here.
        (ref_span_calc, alt_span_calc, _ref_ciA_calc, _ref_ciB_calc) = \
            gather_paired_end_evidence(fragment, breakpoint, min_aligned)

        ref_span += ref_span_calc
        alt_span += alt_span_calc

    # in the absence of evidence for a particular type, ignore the reference
    # support for that type as well
    if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
        alt_seq = 0
        alt_clip = 0
        ref_seq = 0
    if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
        alt_span = 0
        ref_span = 0
    if alt_span + alt_seq == 0 and alt_clip > 0:
        # discount any SV that's only supported by clips.
        alt_clip = 0

    counts = {
        'ref_seq': ref_seq,
        'alt_seq': alt_seq,
        'ref_span': ref_span,
        'alt_span': alt_span,
        'alt_clip': alt_clip
    }

    if debug:
        items = ('ref_span', 'alt_span', 'ref_seq', 'alt_seq', 'alt_clip')
        cmsg = "\n".join(['{}: {}'.format(i, counts[i]) for i in items])
        logit("{} -- read fragment tally counts:\n{}".format(
            breakpoint['id'], cmsg))

    return counts
Пример #6
0
def dump_piped_vcf_to_file(stdin, basedir):
    """Copy every line of `stdin` into <basedir>/input.vcf.

    Args:
        stdin: iterable of text lines (written without an added newline).
        basedir: directory in which 'input.vcf' is created.

    Returns:
        Path of the file that was written.
    """
    target = os.path.join(basedir, 'input.vcf')
    logit('dumping vcf inputs into a temporary file: {}'.format(target))

    copied = 0
    with open(target, 'w') as sink:
        # enumerate from 1 so `copied` ends up as the number of lines seen
        for copied, text in enumerate(stdin, 1):
            print(text, end='', file=sink)

    logit('finished temporary vcf dump -- {} lines'.format(copied))
    return target
Пример #7
0
def dump_piped_vcf_to_file(stdin, basedir):
    """Spool the piped VCF lines into 'input.vcf' under `basedir`.

    Each line from `stdin` is written as-is (no extra line terminator). The
    start and end of the dump are logged, the latter with the line count.

    Returns:
        The path of the spooled file.
    """
    spool_path = os.path.join(basedir, 'input.vcf')
    start_msg = 'dumping vcf inputs into a temporary file: {}'
    logit(start_msg.format(spool_path))

    with open(spool_path, 'w') as out:
        n_lines = 0
        for record in stdin:
            n_lines += 1
            print(record, end='', file=out)

    done_msg = 'finished temporary vcf dump -- {} lines'
    logit(done_msg.format(n_lines))
    return spool_path
Пример #8
0
def init_vcf(vcffile, sample, scratchdir):
    """Build a Vcf object for `vcffile` and register the sample on it.

    Headers are loaded from the file and the custom svtyper headers are
    added. If the sample name is not among the file's samples a note is
    logged; the file is reported as '<stdin>' when its path contains
    `scratchdir`. The sample is then added in every case.

    Returns:
        The populated Vcf object.
    """
    vcf = Vcf()
    vcf.filename = vcffile
    vcf.add_header(list(vcf_headers(vcffile)))
    vcf.add_custom_svtyper_headers()

    known_samples = vcf_samples(vcffile)
    if sample.name not in known_samples:
        # a file living in the scratch directory was spooled from stdin
        if scratchdir in vcf.filename:
            shown_name = '<stdin>'
        else:
            shown_name = vcf.filename
        note = ("Note: Did not find sample name : '{}' "
                "in input vcf: '{}' -- adding")
        logit(note.format(sample.name, shown_name))

    vcf.add_sample(sample.name)
    return vcf
Пример #9
0
def init_vcf(vcffile, sample, scratchdir):
    """Create the Vcf wrapper for the input file and attach `sample`.

    Args:
        vcffile: path of the VCF to read headers and sample names from.
        sample: object whose `.name` is added to the Vcf.
        scratchdir: scratch directory; when it occurs in the Vcf filename the
            log message names the input '<stdin>' instead of the path.

    Returns:
        The Vcf object with headers, custom svtyper headers and the sample.
    """
    result = Vcf()
    result.filename = vcffile

    header_lines = list(vcf_headers(vcffile))
    result.add_header(header_lines)
    result.add_custom_svtyper_headers()

    sample_is_new = sample.name not in vcf_samples(vcffile)
    if sample_is_new:
        origin = '<stdin>' if scratchdir in result.filename else result.filename
        logit("Note: Did not find sample name : '{}' "
              "in input vcf: '{}' -- adding".format(sample.name, origin))

    result.add_sample(sample.name)
    return result
Пример #10
0
def tally_variant_read_fragments(split_slop, min_aligned, breakpoint, sam_fragments, debug):
    """Accumulate reference/alternate evidence counts for one breakpoint.

    Every fragment in `sam_fragments` (visited in sorted query-name order)
    contributes split-read evidence (gather_split_read_evidence) and
    paired-end evidence (gather_paired_end_evidence). The totals are then
    pruned: an evidence type without alternate support loses its reference
    support when the other type has alternate support, and alternate support
    consisting only of clips is zeroed.

    Returns:
        dict with keys 'ref_seq', 'alt_seq', 'ref_span', 'alt_span',
        'alt_clip'. When `debug` is truthy the counts are also logged.
    """
    # initialize counts to zero
    ref_span, alt_span = 0, 0
    ref_seq, alt_seq = 0, 0
    alt_clip = 0

    for query_name in sorted(sam_fragments.keys()):
        fragment = sam_fragments[query_name]

        (ref_seq_calc, alt_seq_calc, alt_clip_calc) = \
                gather_split_read_evidence(fragment, breakpoint, split_slop, min_aligned)

        ref_seq += ref_seq_calc
        alt_seq += alt_seq_calc
        alt_clip += alt_clip_calc

        # Only the first two returned values are used; the remaining two used
        # to be summed into locals that were never read afterwards.
        (ref_span_calc, alt_span_calc, _unused_ciA, _unused_ciB) = \
                gather_paired_end_evidence(fragment, breakpoint, min_aligned)

        ref_span += ref_span_calc
        alt_span += alt_span_calc

    # in the absence of evidence for a particular type, ignore the reference
    # support for that type as well
    if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
        alt_seq = 0
        alt_clip = 0
        ref_seq = 0
    if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
        alt_span = 0
        ref_span = 0
    if alt_span + alt_seq == 0 and alt_clip > 0:
        # discount any SV that's only supported by clips.
        alt_clip = 0

    counts = {
        'ref_seq': ref_seq,
        'alt_seq': alt_seq,
        'ref_span': ref_span,
        'alt_span': alt_span,
        'alt_clip': alt_clip,
    }

    if debug:
        items = ('ref_span', 'alt_span', 'ref_seq', 'alt_seq', 'alt_clip')
        cmsg = "\n".join(['{}: {}'.format(i, counts[i]) for i in items])
        logit("{} -- read fragment tally counts:\n{}".format(breakpoint['id'], cmsg))

    return counts
Пример #11
0
def is_over_threshold(bam, variant_id, regions, max_reads):
    """Check both breakpoint regions against the max_reads ceiling.

    Reads are counted in regionA and then regionB via count_reads_in_region.
    If either count is greater than max_reads, a SKIPPING message listing
    both regions (elements 0..4) and their counts is logged.

    Returns:
        True when a region is over the ceiling, False otherwise.
    """
    first, second = regions
    n_first = count_reads_in_region(first, bam)
    n_second = count_reads_in_region(second, bam)

    if n_first > max_reads or n_second > max_reads:
        header = "SKIPPING -- Variant '{}' has a region with too many reads (> {})\n"
        line_a = "\t\t A: (sample={} chrom={} center={} leftflank={} rightflank={}) : {}\n"
        line_b = "\t\t B: (sample={} chrom={} center={} leftflank={} rightflank={}) : {}"
        logit((header + line_a + line_b).format(
            variant_id, max_reads,
            first[0], first[1], first[2], first[3], first[4], n_first,
            second[0], second[1], second[2], second[3], second[4], n_second,
        ))
        return True

    return False
Пример #12
0
def parallel_calculate_genotype(alignment_file, reference_fasta, library_data, active_libs, sample_name, split_slop, min_aligned, split_weight, disc_weight, max_reads, debug, batch_breakpoints, batch_regions, batch_number):
    """Compute genotype results for a single batch of breakpoints.

    The alignment file is opened at the start of the call (via
    open_alignment_file) and closed before returning, including when an
    exception escapes.

    For each (breakpoint, regions) pair:
      * gather_reads flags too many reads -> empty result, counted as skipped;
      * no reads were gathered            -> detailed empty result, counted;
      * tallied evidence sums to zero     -> detailed empty result;
      * otherwise                         -> bayesian_genotype result.

    Returns:
        dict with keys 'genotypes' (list of per-breakpoint results),
        'skip-count' and 'no-read-count'.
    """
    logit("Starting batch: {}".format(batch_number))
    bam = open_alignment_file(alignment_file, reference_fasta)

    genotype_results = []
    (skip_count, no_read_count) = (0, 0)
    try:
        t0 = time.time()
        for breakpoint, regions in zip(batch_breakpoints, batch_regions):
            (read_batches, many) = gather_reads(bam, breakpoint['id'], regions, library_data, active_libs, max_reads)

            # if there are too many reads around the breakpoint
            if many is True:
                skip_count += 1
                genotype_results.append(make_empty_genotype_result(breakpoint['id'], sample_name))
                continue

            # if there are no reads around the breakpoint
            if not read_batches:
                no_read_count += 1
                genotype_results.append(make_detailed_empty_genotype_result(breakpoint['id'], sample_name))
                continue

            counts = tally_variant_read_fragments(
                split_slop,
                min_aligned,
                breakpoint,
                read_batches,
                debug
            )

            # every evidence count is zero: nothing to genotype from
            if sum(counts.values()) == 0:
                genotype_results.append(make_detailed_empty_genotype_result(breakpoint['id'], sample_name))
                continue

            result = bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug)
            genotype_results.append({'variant.id': breakpoint['id'], 'sample.name': sample_name, 'genotype': result})

        t1 = time.time()
        logit("Batch {} Processing Elapsed Time: {:.4f} secs".format(batch_number, t1 - t0))
    finally:
        # release the alignment handle even if a helper raised mid-batch
        bam.close()

    return {'genotypes': genotype_results, 'skip-count': skip_count, 'no-read-count': no_read_count}
Пример #13
0
def setup_sample(bam, lib_info_path, reference_fasta, sampling_number, min_aligned):
    """Create the Sample for an alignment file and set its expected depths.

    Library metrics come from the JSON file at `lib_info_path` when that path
    is given and exists; otherwise they are calculated from the alignment
    file using `sampling_number`.

    Returns:
        The Sample, after set_exp_seq_depth and set_exp_spanning_depth have
        been called with `min_aligned`.
    """
    alignment = open_alignment_file(bam, reference_fasta)

    # only consider libraries that constitute at least this fraction of the BAM
    min_lib_prevalence = 1e-3

    metrics_on_disk = (lib_info_path is not None) and os.path.exists(lib_info_path)
    if not metrics_on_disk:
        # calculate and include library metrics from bam/cram
        sample = Sample.from_bam(alignment, sampling_number, min_lib_prevalence)
    else:
        # use pre-calculated library metrics
        logit('Reading library metrics from %s...' % lib_info_path)
        with open(lib_info_path, 'r') as handle:
            metrics = json.load(handle)
            sample = Sample.from_lib_info(alignment, metrics, min_lib_prevalence)

    sample.set_exp_seq_depth(min_aligned)
    sample.set_exp_spanning_depth(min_aligned)
    return sample
Пример #14
0
def setup_sample(bam, lib_info_path, reference_fasta, sampling_number,
                 min_aligned):
    """Open the alignment file and build a Sample with library metrics.

    Args:
        bam: alignment file path handed to open_alignment_file.
        lib_info_path: optional JSON file of pre-calculated library metrics.
        reference_fasta: handed to open_alignment_file.
        sampling_number: used by Sample.from_bam when metrics are calculated.
        min_aligned: passed to both expected-depth setters on the Sample.

    Returns:
        The configured Sample.
    """
    fd = open_alignment_file(bam, reference_fasta)

    # only consider libraries that constitute at least this fraction of the BAM
    prevalence_floor = 1e-3

    use_precalculated = False
    if lib_info_path is not None:
        use_precalculated = os.path.exists(lib_info_path)

    if use_precalculated:
        # use pre-calculated library metrics
        logit('Reading library metrics from %s...' % lib_info_path)
        with open(lib_info_path, 'r') as metrics_file:
            lib_info = json.load(metrics_file)
            built = Sample.from_lib_info(fd, lib_info, prevalence_floor)
    else:
        # calculate and include library metrics from bam/cram
        built = Sample.from_bam(fd, sampling_number, prevalence_floor)

    for configure in (built.set_exp_seq_depth, built.set_exp_spanning_depth):
        configure(min_aligned)

    return built
Пример #15
0
def apply_genotypes_to_vcf(src_vcf, out_vcf, genotypes, sample, sum_quals):
    """Write the VCF to out_vcf with precomputed genotypes applied.

    The header is written first. Each variant line from src_vcf.filename is
    then handled as follows:
      * missing or unsupported SVTYPE: a warning is logged and the variant is
        written unchanged (apart from the qual reset below);
      * BND: the first record of a mate pair is cached; when the record whose
        MATEID matches a cached id arrives, the cached record is genotyped
        and written, and its qual, active formats and genotype are copied to
        the second record, which is written right after;
      * anything else: genotyped and written.

    Args:
        src_vcf: Vcf object providing the header and `filename`.
        out_vcf: output stream passed to the write methods.
        genotypes: mapping of variant id -> genotype result.
        sample: passed to assign_genotype_to_variant.
        sum_quals: when falsy, each variant's qual is reset to 0 first.

    Raises:
        RuntimeError: if `genotypes` holds no result for a variant that
            should be genotyped.
    """
    # initializations
    bnd_cache = {}
    src_vcf.write_header(out_vcf)

    for vline in vcf_variants(src_vcf.filename):
        v = vline.rstrip().split('\t')
        variant = Variant(v, src_vcf)
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            msg = ('Warning: SVTYPE missing '
                   'at variant %s. '
                   'Skipping.\n') % (variant.var_id)
            logit(msg)
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            msg = ('Warning: Unsupported SVTYPE '
                   'at variant %s (%s). '
                   'Skipping.\n') % (variant.var_id, variant.get_svtype())
            logit(msg)
            variant.write(out_vcf)
            continue

        # special BND processing
        if variant.get_svtype() == 'BND':
            if variant.info['MATEID'] in bnd_cache:
                variant2 = variant
                variant = bnd_cache[variant.info['MATEID']]
                del bnd_cache[variant.var_id]
            else:
                bnd_cache[variant.var_id] = variant
                continue

        # .get() so that a missing id reaches the descriptive error below
        # instead of surfacing as a bare KeyError.
        result = genotypes.get(variant.var_id)

        if result is None:
            msg = ("Found no genotype results for variant "
                   "'{}' ({})").format(variant.var_id, variant.get_svtype())
            logit(msg)
            raise RuntimeError(msg)

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)
Пример #16
0
def apply_genotypes_to_vcf(src_vcf, out_vcf, genotypes, sample, sum_quals):
    """Second pass over the input VCF: attach genotype results and write.

    Args:
        src_vcf: Vcf object; its header is written and its `filename` is
            re-read line by line.
        out_vcf: destination handed to every write call.
        genotypes: mapping keyed by variant id holding the genotype results.
        sample: forwarded to assign_genotype_to_variant.
        sum_quals: if falsy, variant.qual is set to 0 before anything else.

    Variants with a missing or unsupported SVTYPE are logged and written
    through. BND records are held in a cache until the record whose MATEID
    names them shows up; the cached record is then genotyped and written, and
    its qual / active_formats / genotype are copied onto the later record
    before that one is written too.

    Raises:
        RuntimeError: when no genotype result exists for a variant.
    """
    # initializations
    bnd_cache = {}
    src_vcf.write_header(out_vcf)
    # (The original also counted all variants up front, costing an extra full
    # pass over the file for a value that was never used.)

    for vline in vcf_variants(src_vcf.filename):
        variant = Variant(vline.rstrip().split('\t'), src_vcf)
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            msg = ('Warning: SVTYPE missing '
                   'at variant %s. '
                   'Skipping.\n') % (variant.var_id)
            logit(msg)
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            msg = ('Warning: Unsupported SVTYPE '
                   'at variant %s (%s). '
                   'Skipping.\n') % (variant.var_id, variant.get_svtype())
            logit(msg)
            variant.write(out_vcf)
            continue

        # special BND processing
        if variant.get_svtype() == 'BND':
            if variant.info['MATEID'] in bnd_cache:
                variant2 = variant
                variant = bnd_cache[variant.info['MATEID']]
                del bnd_cache[variant.var_id]
            else:
                bnd_cache[variant.var_id] = variant
                continue

        # A plain subscript would raise KeyError and make the check below
        # unreachable for ids that are absent from the mapping.
        result = genotypes.get(variant.var_id)

        if result is None:
            msg = ("Found no genotype results for variant "
                   "'{}' ({})").format(variant.var_id, variant.get_svtype())
            logit(msg)
            raise RuntimeError(msg)

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)
Пример #17
0
def sso_genotype(bam_string,
                 vcf_in,
                 vcf_out,
                 min_aligned,
                 split_weight,
                 disc_weight,
                 num_samp,
                 lib_info_path,
                 debug,
                 ref_fasta,
                 sum_quals,
                 max_reads,
                 max_ci_dist,
                 cores,
                 batch_size):
    """Genotype the variants of one input VCF against one alignment file.

    Returns immediately (None) when `vcf_in` is None. Otherwise the sample is
    set up, its library metrics are dumped, and the VCF is genotyped in
    serial mode (`cores` is None) or parallel mode, writing to `vcf_out`.
    The sample is closed on the way out, also when genotyping raises.

    Args:
        bam_string: path of the alignment file.
        vcf_in: input VCF file object (its `.name` is used), or None.
        vcf_out: output stream for the genotyped VCF.
        num_samp: sampling number forwarded to setup_sample.
        lib_info_path: optional library-metrics JSON path.
        cores: worker count for parallel mode; None selects serial mode.
        batch_size: breakpoints per batch in parallel mode.
        Remaining arguments are forwarded unchanged to the genotyping
        routines.
    """
    # quit early if input VCF is absent
    if vcf_in is None:
        return

    invcf = os.path.abspath(vcf_in.name)
    full_bam_path = os.path.abspath(bam_string)
    ensure_valid_alignment_file(full_bam_path)

    sample = setup_sample(full_bam_path, lib_info_path, ref_fasta, num_samp, min_aligned)
    # From here on the sample must be closed no matter how we leave.
    try:
        dump_library_metrics(lib_info_path, sample)

        # set variables for genotyping
        z = 3
        split_slop = 3  # amount of slop around breakpoint to count splitters

        with tempdir() as scratchdir:
            logit("Temporary scratch directory: {}".format(scratchdir))

            # dump the vcf file into the tmp directory, if we're reading from stdin
            src_vcf_file = setup_src_vcf_file(vcf_in, invcf, scratchdir)

            # create the vcf object
            src_vcf = init_vcf(src_vcf_file, sample, scratchdir)

            if cores is None:
                logit("Genotyping Input VCF (Serial Mode)")
                # pass through input vcf -- perform actual genotyping
                genotype_serial(src_vcf, vcf_out, sample, z, split_slop, min_aligned, sum_quals, split_weight, disc_weight, max_reads, max_ci_dist, debug)
            else:
                logit("Genotyping Input VCF (Parallel Mode)")

                genotype_parallel(src_vcf, vcf_out, sample, z, split_slop, min_aligned, sum_quals, split_weight, disc_weight, max_reads, max_ci_dist, debug, cores, batch_size, ref_fasta)
    finally:
        sample.close()
Пример #18
0
def sso_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                 disc_weight, num_samp, lib_info_path, debug, ref_fasta,
                 sum_quals, max_reads, max_ci_dist, cores, batch_size):
    """Run single-sample genotyping of `vcf_in` and write it to `vcf_out`.

    Does nothing when `vcf_in` is None. Otherwise it validates the alignment
    file, builds the sample, dumps its library metrics, and genotypes the VCF
    in serial mode when `cores` is None or in parallel mode (with
    `batch_size` breakpoints per batch) otherwise. The sample is closed in
    all cases once it has been created.
    """
    # quit early if input VCF is absent
    if vcf_in is None:
        return

    invcf = os.path.abspath(vcf_in.name)
    # NOTE(review): the path is encoded to UTF-8 here, which yields a bytes
    # object on Python 3 -- confirm the downstream helpers expect that.
    full_bam_path = os.path.abspath(bam_string).encode('UTF-8')
    ensure_valid_alignment_file(full_bam_path)

    sample = setup_sample(full_bam_path, lib_info_path, ref_fasta, num_samp,
                          min_aligned)
    # try/finally: do not leak the sample when genotyping fails.
    try:
        dump_library_metrics(lib_info_path, sample)

        # set variables for genotyping
        z = 3
        split_slop = 3  # amount of slop around breakpoint to count splitters

        with tempdir() as scratchdir:
            logit("Temporary scratch directory: {}".format(scratchdir))

            # dump the vcf file into the tmp directory, if we're reading
            # from stdin
            src_vcf_file = setup_src_vcf_file(vcf_in, invcf, scratchdir)

            # create the vcf object
            src_vcf = init_vcf(src_vcf_file, sample, scratchdir)

            if cores is None:
                logit("Genotyping Input VCF (Serial Mode)")
                # pass through input vcf -- perform actual genotyping
                genotype_serial(src_vcf, vcf_out, sample, z, split_slop,
                                min_aligned, sum_quals, split_weight,
                                disc_weight, max_reads, max_ci_dist, debug)
            else:
                logit("Genotyping Input VCF (Parallel Mode)")

                genotype_parallel(src_vcf, vcf_out, sample, z, split_slop,
                                  min_aligned, sum_quals, split_weight,
                                  disc_weight, max_reads, max_ci_dist, debug,
                                  cores, batch_size, ref_fasta)
    finally:
        sample.close()
Пример #19
0
def bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug):
    """Turn evidence counts into a genotype result via bayes_gt.

    Args:
        breakpoint: mapping with at least 'svtype' and 'id'.
        counts: mapping with keys 'ref_seq', 'alt_seq', 'alt_clip',
            'ref_span', 'alt_span'.
        split_weight: weight applied to split-read (seq/clip) counts.
        disc_weight: weight applied to spanning (paired-end) counts.
        debug: when truthy, the genotype log probabilities are logged.

    Returns:
        The structure from blank_genotype_result() with its 'formats' entries
        (GL, DP, RO, AO, QR, QA, RS, AS, ASC, RP, AP, AB, GQ, SQ, GT) filled
        in and the sample quality added to 'qual'. GQ and SQ are '.' and GT
        is './.' when the summed genotype probabilities are not positive.
    """
    is_dup = breakpoint['svtype'] == 'DUP'

    elems = ('ref_seq', 'alt_seq', 'alt_clip', 'ref_span', 'alt_span')
    (ref_seq, alt_seq, alt_clip, ref_span, alt_span) = \
        [counts[i] for i in elems]

    # pre-calculations
    alt_splitters = alt_seq + alt_clip
    QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
    QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)

    # the actual bayesian calculation and decision
    gt_lplist = bayes_gt(QR, QA, is_dup)
    # Rank (index, log-probability) pairs from most to least likely.
    # `lambda(x): ...` was Python-2-only syntax (a SyntaxError on Python 3).
    best, second_best = sorted(enumerate(gt_lplist),
                               key=lambda pair: pair[1],
                               reverse=True)[0:2]
    gt_idx = best[0]

    # print log probabilities of homref, het, homalt
    if debug:
        msg = ("{} -- "
               "log probabilities (homref, het, homalt) : "
               "{}").format(breakpoint['id'], gt_lplist)
        logit(msg)

    result = blank_genotype_result()
    result['formats']['GL'] = ','.join(['%.0f' % x for x in gt_lplist])
    result['formats']['DP'] = int(ref_seq + alt_seq + alt_clip + ref_span + alt_span)
    result['formats']['RO'] = int(ref_seq + ref_span)
    result['formats']['AO'] = int(alt_seq + alt_clip + alt_span)
    result['formats']['QR'] = QR
    result['formats']['QA'] = QA
    # if detailed:
    result['formats']['RS'] = int(ref_seq)
    result['formats']['AS'] = int(alt_seq)
    result['formats']['ASC'] = int(alt_clip)
    result['formats']['RP'] = int(ref_span)
    result['formats']['AP'] = int(alt_span)
    try:
        result['formats']['AB'] = '%.2g' % (QA / float(QR + QA))
    except ZeroDivisionError:
        # no weighted evidence at all -> allele balance is undefined
        result['formats']['AB'] = '.'

    # assign genotypes
    gt_sum = 0
    for gt in gt_lplist:
        try:
            gt_sum += 10**gt
        except OverflowError:
            gt_sum += 0
    if gt_sum > 0:
        gt_sum_log = math.log(gt_sum, 10)
        # phred-scaled probability site is non-reference in this sample
        sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log))
        # genotype quality from the gap between the two best calls, capped at 200
        phred_gq = min(-10 * (second_best[1] - best[1]), 200)
        result['formats']['GQ'] = int(phred_gq)
        result['formats']['SQ'] = sample_qual
        result['qual'] += sample_qual
        if gt_idx == 1:
            result['formats']['GT'] = '0/1'
        elif gt_idx == 2:
            result['formats']['GT'] = '1/1'
        elif gt_idx == 0:
            result['formats']['GT'] = '0/0'
    else:
        result['formats']['GQ'] = '.'
        result['formats']['SQ'] = '.'
        result['formats']['GT'] = './.'

    return result
Пример #20
0
def genotype_parallel(src_vcf, out_vcf, sample, z, split_slop, min_aligned,
                      sum_quals, split_weight, disc_weight, max_reads,
                      max_ci_dist, debug, cores, breakpoint_batch_size,
                      ref_fasta):
    """Genotype the VCF in two passes using a pool of worker processes.

    First pass: collect breakpoints and their regions, split both into
    batches of `breakpoint_batch_size`, and run parallel_calculate_genotype
    on every batch in a multiprocessing pool of `cores` processes.
    Second pass: merge the per-batch results by variant id and apply them to
    the VCF with apply_genotypes_to_vcf, writing to `out_vcf`.

    Raises:
        RuntimeError: if the number of breakpoint batches differs from the
            number of region batches.
    """
    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    # 1st pass through input vcf -- collect all the relevant breakpoints
    logit("Collecting breakpoints")
    breakpoints = collect_breakpoints(src_vcf, max_ci_dist)
    logit("Number of breakpoints/SVs to process: {}".format(len(breakpoints)))
    logit("Collecting regions")
    regions = [get_breakpoint_regions(b, sample, z) for b in breakpoints]
    logit("Batch breakpoints into groups of {}".format(breakpoint_batch_size))
    breakpoints_batches = list(
        partition_all(breakpoint_batch_size, breakpoints))
    logit("Batch regions into groups of {}".format(breakpoint_batch_size))
    regions_batches = list(partition_all(breakpoint_batch_size, regions))

    if len(breakpoints_batches) != len(regions_batches):
        # Report the batch counts; the message used to embed the batch lists
        # themselves, which is unreadable and not what the text describes.
        raise RuntimeError(
            "Batch error: breakpoint batches ({}) != region batches ({})".
            format(len(breakpoints_batches), len(regions_batches)))

    logit("Number of batches to parallel process: {}".format(
        len(breakpoints_batches)))

    std_args = (sample.bam.filename, ref_fasta, sample.rg_to_lib,
                sample.active_libs, sample.name, split_slop, min_aligned,
                split_weight, disc_weight, max_reads, debug)

    pool = mp.Pool(processes=cores)
    try:
        pending = [
            pool.apply_async(parallel_calculate_genotype,
                             args=std_args + (b, r, i))
            for i, (b, r) in enumerate(
                zip(breakpoints_batches, regions_batches))
        ]
        results = [p.get() for p in pending]
    except BaseException:
        # a batch failed (or we were interrupted): stop remaining workers
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # reap the worker processes instead of leaving the pool dangling
        pool.join()

    logit("Finished parallel breakpoint processing")
    logit("Merging genotype results")
    merged_genotypes = {
        g['variant.id']: g
        for batch in results for g in batch['genotypes']
    }

    total_variants_skipped = sum(batch['skip-count'] for batch in results)
    total_variants_with_no_reads = sum(
        batch['no-read-count'] for batch in results)

    logit("Number of variants skipped (surpassed max-reads threshold): {}".
          format(total_variants_skipped))
    logit("Number of variants with no reads: {}".format(
        total_variants_with_no_reads))

    # 2nd pass through input vcf -- apply the calculated genotypes to the variants
    logit("Applying genotype results to vcf")
    apply_genotypes_to_vcf(src_vcf, out_vcf, merged_genotypes, sample,
                           sum_quals)
    logit("All Done!")
Пример #21
0
def genotype_serial(src_vcf, out_vcf, sample, z, split_slop, min_aligned, sum_quals, split_weight, disc_weight, max_reads, max_ci_dist, debug):
    """Genotype every variant of the input VCF one at a time.

    The header is written, library attributes are cleaned up, and each
    variant line of src_vcf.filename is processed in order:
      * progress is logged every 1000 variants;
      * qual is reset to 0 unless `sum_quals` is truthy;
      * variants with a missing or unsupported SVTYPE are logged and written
        through;
      * breakpoints are looked up for the record being read, before any BND
        swap; a BND record is cached until the record whose MATEID names it
        arrives, at which point the cached record becomes the one that is
        genotyped;
      * a variant without breakpoints is logged and not written;
      * otherwise the genotype is calculated, assigned and written, and for
        BND the qual / active_formats / genotype are copied onto the mate
        record, which is written as well.
    """
    # initializations
    pending_bnd = {}
    src_vcf.write_header(out_vcf)
    total_variants = len(list(vcf_variants(src_vcf.filename)))

    # cleanup unused library attributes
    for read_group in sample.rg_to_lib:
        sample.rg_to_lib[read_group].cleanup()

    for index, vline in enumerate(vcf_variants(src_vcf.filename)):
        columns = vline.rstrip().split('\t')
        variant = Variant(columns, src_vcf)

        if index % 1000 == 0:
            progress = "[ {} | {} ] Processing variant {}"
            logit(progress.format(index, total_variants, variant.var_id))
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            logit('Warning: SVTYPE missing at variant %s. Skipping.\n'
                  % (variant.var_id))
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            logit('Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n'
                  % (variant.var_id, variant.get_svtype()))
            variant.write(out_vcf)
            continue

        breakpoints = src_vcf.get_variant_breakpoints(variant, max_ci_dist)

        # special BND processing
        if variant.get_svtype() == 'BND':
            mate_id = variant.info['MATEID']
            if mate_id not in pending_bnd:
                # first record of the pair: park it until its mate shows up
                pending_bnd[variant.var_id] = variant
                continue
            variant2 = variant
            variant = pending_bnd[mate_id]
            del pending_bnd[variant.var_id]

        if breakpoints is None:
            logit("Found no breakpoints for variant '{}' ({})".format(
                variant.var_id, variant.get_svtype()))
            continue

        regions = get_breakpoint_regions(breakpoints, sample, z)
        result = serial_calculate_genotype(
            sample.bam, regions, sample.rg_to_lib, sample.active_libs,
            sample.name, split_slop, min_aligned, split_weight, disc_weight,
            breakpoints, max_reads, debug)

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)
Пример #22
0
def genotype_serial(src_vcf, out_vcf, sample, z, split_slop, min_aligned,
                    sum_quals, split_weight, disc_weight, max_reads,
                    max_ci_dist, debug):
    """Single-process genotyping pass over the input VCF.

    Args:
        src_vcf: Vcf object supplying the header, `filename` and
            get_variant_breakpoints.
        out_vcf: destination for the header and every written variant.
        sample: provides bam, rg_to_lib, active_libs and name.
        z: forwarded to get_breakpoint_regions.
        sum_quals: when falsy, each variant's qual is reset to 0.
        max_ci_dist: forwarded to get_variant_breakpoints.
        Remaining arguments are forwarded to serial_calculate_genotype.

    Variants lacking a (supported) SVTYPE are logged and written through.
    BND records are paired through MATEID: the first one seen is cached and
    the pair is written once its mate arrives. Variants for which no
    breakpoints are found are logged and not written.
    """
    def _pass_through(record, message):
        # log why the record is not genotyped, then emit it
        logit(message)
        record.write(out_vcf)

    # initializations
    bnd_cache = {}
    src_vcf.write_header(out_vcf)
    total_variants = len(list(vcf_variants(src_vcf.filename)))

    # cleanup unused library attributes
    for rg_key in sample.rg_to_lib:
        sample.rg_to_lib[rg_key].cleanup()

    for position, raw_line in enumerate(vcf_variants(src_vcf.filename)):
        variant = Variant(raw_line.rstrip().split('\t'), src_vcf)
        if position % 1000 == 0:
            logit("[ {} | {} ] Processing variant {}".format(
                position, total_variants, variant.var_id))
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            _pass_through(
                variant,
                'Warning: SVTYPE missing at variant %s. Skipping.\n'
                % (variant.var_id))
            continue

        if not variant.is_valid_svtype():
            _pass_through(
                variant,
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n'
                % (variant.var_id, variant.get_svtype()))
            continue

        breakpoints = src_vcf.get_variant_breakpoints(variant, max_ci_dist)

        # special BND processing
        is_bnd = variant.get_svtype() == 'BND'
        if is_bnd and variant.info['MATEID'] not in bnd_cache:
            bnd_cache[variant.var_id] = variant
            continue
        if is_bnd:
            # mate already cached: genotype the cached record, keep this one
            variant2 = variant
            variant = bnd_cache[variant.info['MATEID']]
            del bnd_cache[variant.var_id]

        if breakpoints is None:
            no_bp = "Found no breakpoints for variant '{}' ({})"
            logit(no_bp.format(variant.var_id, variant.get_svtype()))
            continue

        result = serial_calculate_genotype(
            sample.bam,
            get_breakpoint_regions(breakpoints, sample, z),
            sample.rg_to_lib,
            sample.active_libs,
            sample.name,
            split_slop,
            min_aligned,
            split_weight,
            disc_weight,
            breakpoints,
            max_reads,
            debug,
        )

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)
Пример #23
0
def genotype_parallel(src_vcf, out_vcf, sample, z, split_slop, min_aligned, sum_quals, split_weight, disc_weight, max_reads, max_ci_dist, debug, cores, breakpoint_batch_size, ref_fasta):
    """Genotype the breakpoints of ``src_vcf`` with a worker pool and write ``out_vcf``.

    The work is done in two passes over the input VCF:

    1. Breakpoints are collected, converted to regions, and both lists are
       split into batches of ``breakpoint_batch_size``. Each
       (breakpoints, regions) batch pair is handed to
       ``parallel_calculate_genotype`` in a pool of ``cores`` processes.
    2. The per-batch genotypes are merged into one dict keyed by
       ``'variant.id'`` and applied to the VCF via ``apply_genotypes_to_vcf``.

    Args:
        src_vcf: input VCF object, passed to ``collect_breakpoints`` and
            ``apply_genotypes_to_vcf``.
        out_vcf: output destination, passed to ``apply_genotypes_to_vcf``.
        sample: sample object; its ``rg_to_lib``, ``active_libs``, ``name``
            and ``bam.filename`` attributes are read here.
        z: forwarded to ``get_breakpoint_regions``.
        split_slop, min_aligned, split_weight, disc_weight, max_reads, debug,
        ref_fasta: forwarded unchanged to every worker call.
        sum_quals: forwarded to ``apply_genotypes_to_vcf``.
        max_ci_dist: forwarded to ``collect_breakpoints``.
        cores: number of worker processes in the pool.
        breakpoint_batch_size: number of breakpoints (and regions) per batch.

    Raises:
        RuntimeError: if the number of breakpoint batches differs from the
            number of region batches.
    """

    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    # 1st pass through input vcf -- collect all the relevant breakpoints
    logit("Collecting breakpoints")
    breakpoints = collect_breakpoints(src_vcf, max_ci_dist)
    logit("Number of breakpoints/SVs to process: {}".format(len(breakpoints)))
    logit("Collecting regions")
    regions = [get_breakpoint_regions(b, sample, z) for b in breakpoints]
    logit("Batch breakpoints into groups of {}".format(breakpoint_batch_size))
    breakpoints_batches = list(partition_all(breakpoint_batch_size, breakpoints))
    logit("Batch regions into groups of {}".format(breakpoint_batch_size))
    regions_batches = list(partition_all(breakpoint_batch_size, regions))

    if len(breakpoints_batches) != len(regions_batches):
        # Report the batch *counts*; dumping the batches themselves would
        # produce an unreadable (and potentially enormous) message.
        raise RuntimeError(
            "Batch error: breakpoint batches ({}) != region batches ({})".format(
                len(breakpoints_batches), len(regions_batches)))

    logit("Number of batches to parallel process: {}".format(len(breakpoints_batches)))

    # Arguments shared by every worker call; the batch-specific ones
    # (breakpoints, regions, batch index) are appended per task below.
    std_args = (
        sample.bam.filename,
        ref_fasta,
        sample.rg_to_lib,
        sample.active_libs,
        sample.name,
        split_slop,
        min_aligned,
        split_weight,
        disc_weight,
        max_reads,
        debug
    )

    pool = mp.Pool(processes=cores)
    try:
        pending = [
            pool.apply_async(parallel_calculate_genotype, args=std_args + (b, r, i))
            for i, (b, r) in enumerate(zip(breakpoints_batches, regions_batches))
        ]
        results = [p.get() for p in pending]
    except BaseException:
        # A worker failed (or we were interrupted): stop the remaining
        # workers right away instead of leaving them running.
        pool.terminate()
        raise
    else:
        # All results are in; let the workers exit normally.
        pool.close()
    finally:
        # Reap the worker processes so none are leaked.
        pool.join()
    logit("Finished parallel breakpoint processing")
    logit("Merging genotype results")
    merged_genotypes = {g['variant.id']: g for batch in results for g in batch['genotypes']}

    total_variants_skipped = sum([batch['skip-count'] for batch in results])
    total_variants_with_no_reads = sum([batch['no-read-count'] for batch in results])

    logit("Number of variants skipped (surpassed max-reads threshold): {}".format(total_variants_skipped))
    logit("Number of variants with no reads: {}".format(total_variants_with_no_reads))

    # 2nd pass through input vcf -- apply the calculated genotypes to the variants
    logit("Applying genotype results to vcf")
    apply_genotypes_to_vcf(src_vcf, out_vcf, merged_genotypes, sample, sum_quals)
    logit("All Done!")
Пример #24
0
def bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug):
    """Call a genotype for one breakpoint from its read-evidence counts.

    ``counts`` must provide the keys ``ref_seq``, ``alt_seq``, ``alt_clip``,
    ``ref_span`` and ``alt_span``. Split-read evidence is scaled by
    ``split_weight`` and spanning evidence by ``disc_weight`` to form the
    reference (QR) and alternate (QA) totals, which are handed to
    ``bayes_gt`` together with a flag telling whether the breakpoint's
    ``svtype`` is ``'DUP'``.

    Returns the dict produced by ``blank_genotype_result()`` with its
    ``'formats'`` entries (GL, DP, RO, AO, QR, QA, RS, AS, ASC, RP, AP, AB,
    GQ, SQ, GT) filled in and ``'qual'`` increased by the sample quality.
    When the summed genotype likelihoods are not positive, GQ and SQ are
    set to ``'.'`` and GT to ``'./.'``. When ``debug`` is truthy the log
    probabilities are logged through ``logit``.
    """
    dup_call = breakpoint['svtype'] == 'DUP'

    ref_seq = counts['ref_seq']
    alt_seq = counts['alt_seq']
    alt_clip = counts['alt_clip']
    ref_span = counts['ref_span']
    alt_span = counts['alt_span']

    # weighted evidence totals for the reference and alternate alleles
    alt_splitters = alt_seq + alt_clip
    QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
    QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)

    # the actual bayesian calculation; rank genotypes by log probability
    # (the sort is stable, so ties keep the lower genotype index first)
    gt_lplist = bayes_gt(QR, QA, dup_call)
    ranked = sorted(enumerate(gt_lplist), key=lambda pair: pair[1],
                    reverse=True)
    (gt_idx, top_lp), (_, runner_up_lp) = ranked[:2]

    # print log probabilities of homref, het, homalt
    if debug:
        logit(("{} -- "
               "log probabilities (homref, het, homalt) : "
               "{}").format(breakpoint['id'], gt_lplist))

    result = blank_genotype_result()
    formats = result['formats']
    formats['GL'] = ','.join('%.0f' % lp for lp in gt_lplist)
    formats['DP'] = int(ref_seq + alt_seq + alt_clip + ref_span + alt_span)
    formats['RO'] = int(ref_seq + ref_span)
    formats['AO'] = int(alt_splitters + alt_span)
    formats['QR'] = QR
    formats['QA'] = QA
    # detailed per-evidence-type counts
    formats['RS'] = int(ref_seq)
    formats['AS'] = int(alt_seq)
    formats['ASC'] = int(alt_clip)
    formats['RP'] = int(ref_span)
    formats['AP'] = int(alt_span)

    # allele balance is undefined when there is no weighted evidence at all
    evidence_total = QR + QA
    if evidence_total:
        formats['AB'] = '%.2g' % (QA / float(evidence_total))
    else:
        formats['AB'] = '.'

    # total likelihood over all genotypes; terms too large to represent
    # are left out of the sum
    likelihood_total = 0
    for lp in gt_lplist:
        try:
            likelihood_total += 10**lp
        except OverflowError:
            continue

    if not likelihood_total > 0:
        # nothing to normalise against: emit a no-call
        formats['GQ'] = '.'
        formats['SQ'] = '.'
        formats['GT'] = './.'
        return result

    log_total = math.log(likelihood_total, 10)
    # phred-scaled probability site is non-reference in this sample
    sample_qual = abs(-10 * (gt_lplist[0] - log_total))
    # genotype quality: gap between the two best genotypes, capped at 200
    formats['GQ'] = int(min(-10 * (runner_up_lp - top_lp), 200))
    formats['SQ'] = sample_qual
    result['qual'] += sample_qual

    gt_strings = {0: '0/0', 1: '0/1', 2: '1/1'}
    if gt_idx in gt_strings:
        formats['GT'] = gt_strings[gt_idx]

    return result