def get_phased_counts_variant(record, LR_bam, reference_pyfasta):
    """
    Count reads supporting a variant, split into unphased / hap 1 / hap 2.

    The chromosome, position, ref allele and alt alleles are read from
    `record` and handed to tk_bam.get_phased_allele_read_info, which does the
    realignment against `LR_bam` and `reference_pyfasta`.

    Returns a 3-tuple (unphased, hap_1, hap_2). Each element is the pair
    (counts[i][1], sum(counts[i])) for rows 0, 1 and 2 of the counts table
    returned by the helper -- presumably (reads supporting the first alt,
    all counted reads) for that group; confirm against tk_bam.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # The BAM is considered "chr"-less when its first reference name lacks the
    # prefix. Only strip the prefix when the record's chromosome really has
    # it; otherwise a name such as "1" would be cut down to an empty string.
    if LR_bam.references[0][0:3] != "chr" and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    # NOTE(review): the positional values 30, 0, 0, 0 are passed through
    # unchanged; their meaning is defined by tk_bam -- confirm there.
    counts, _, _, _, _, _ = tk_bam.get_phased_allele_read_info(
        chrom, pos, ref, alt_alleles, 30, 0, 0, 0, LR_bam, reference_pyfasta,
        match=1, mismatch=-3, gap_open=-1, gap_extend=-4)

    unphased = (counts[0][1], sum(counts[0]))
    hap_1 = (counts[1][1], sum(counts[1]))
    hap_2 = (counts[2][1], sum(counts[2]))
    return (unphased, hap_1, hap_2)
def get_phase_set(record, bam):
    """
    Return the 'PS' tag of the first read near the variant that carries one.

    Reads are taken from bam.fetch(chrom, pos - 1, pos + 1), where chrom and
    pos come from `record`. Returns None when no fetched read has a non-None
    'PS' tag.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    for read in bam.fetch(chrom, pos - 1, pos + 1):
        phase_set = dict(read.tags).get('PS')
        if phase_set is not None:
            return phase_set
    return None
def validate_variant(record, validation_bam, reference_pyfasta):
    """
    Count reads in a validation BAM that support a variant.

    The chromosome, position, ref allele and alt alleles are read from
    `record` and handed to tk_bam.get_allele_read_info, which does the
    realignment against `validation_bam` and `reference_pyfasta`.

    Returns (validation_ao, validation_cov): counts[1] and sum(counts) of the
    per-allele counts returned by the helper -- presumably the first-alt
    observation count and the total coverage; confirm against tk_bam.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # The BAM is considered "chr"-less when its first reference name lacks the
    # prefix. Only strip the prefix when the record's chromosome really has
    # it; otherwise a name such as "1" would be cut down to an empty string.
    if validation_bam.references[0][0:3] != "chr" and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    # NOTE(review): the positional values 30, 0, 0, 0 are passed through
    # unchanged; their meaning is defined by tk_bam -- confirm there.
    counts, _, _, _, _, _ = tk_bam.get_allele_read_info(
        chrom, pos, ref, alt_alleles, 30, 0, 0, 0, validation_bam,
        reference_pyfasta, match=1, mismatch=-3, gap_open=-1, gap_extend=-4)

    validation_cov = sum(counts)
    validation_ao = counts[1]
    return (validation_ao, validation_cov)
def lockstep_variant_iterator(vfr_left, vfr_right, shared_locus):
    """
    Traverse two copies of the same variants in lockstep, making sure we
    never get out of sync.

    Yields (var_left, var_right) pairs drawn from
    get_variant_iterator(vfr_left, shared_locus) and
    get_variant_iterator(vfr_right, shared_locus).

    Raises Exception when the two records of a pair sit at different
    positions, or when one stream runs out of records before the other.
    """
    iter_left = iter(get_variant_iterator(vfr_left, shared_locus))
    iter_right = iter(get_variant_iterator(vfr_right, shared_locus))

    # Pull one record at a time instead of zip(): on Python 2 zip() would
    # build the complete list of pairs up front, and on any version it would
    # silently drop the tail of the longer stream.
    exhausted = object()
    while True:
        var_left = next(iter_left, exhausted)
        var_right = next(iter_right, exhausted)

        if var_left is exhausted and var_right is exhausted:
            return
        if var_left is exhausted or var_right is exhausted:
            raise Exception(
                "Variant streams are out of sync on {0}: the {1} stream "
                "ended before the other".format(
                    shared_locus.chrom,
                    "left" if var_left is exhausted else "right"))

        # keep these 1-indexed since they're only used for the error message
        pos_left = tk_io.get_record_pos(var_left)
        pos_right = tk_io.get_record_pos(var_right)
        if pos_left != pos_right:
            raise Exception(
                "Variant positions are out of sync: {0}:{1}, {0}:{2}".format(
                    shared_locus.chrom, pos_left, pos_right))
        yield (var_left, var_right)
def get_closest_variant_pos(variants, target_pos, direction, het_only=False):
    """
    Find the variant nearest to `target_pos` on one side of it.

    `variants` is walked with a step of `direction` (-1 = look before the
    target, 1 = look after it) and the first acceptable record wins. A record
    is acceptable when its position minus one lies on the requested side of
    `target_pos` (inclusive) and, if `het_only` is set, gt_is_het() is true
    for it. Assumes `variants` is ordered by position -- TODO confirm.

    Returns that position minus one, or None if nothing qualifies.
    """
    # Linear scan; a binary search was never implemented.
    for candidate in variants[::direction]:
        pos = tk_io.get_record_pos(candidate) - 1
        if direction < 0:
            on_requested_side = pos <= target_pos
        else:
            on_requested_side = pos >= target_pos
        genotype_ok = gt_is_het(candidate) or not het_only
        if genotype_ok and on_requested_side:
            return pos
    # Edge case: no variant lies in that direction.
    return None
def filter_variant(var, bam, reference_pyfasta):
    """
    Attach a filter to `var` when it fails the quality or allele checks.

    * record quality below 50 -> filters set to ['10X_QUAL_FILTER'], and no
      read counting is done;
    * otherwise allele counts are taken from tk_bam.get_allele_read_info, and
      if counts[1] is below 2 or counts[1] / (counts[0] + counts[1]) is below
      0.15 -> filters set to ['10X_ALLELE_FRACTION_FILTER'].

    The record is modified in place; nothing is returned.
    """
    if tk_io.get_record_qual(var) < 50:
        tk_io.set_record_filters(var, ['10X_QUAL_FILTER'])
        return

    chrom = tk_io.get_record_chrom(var)
    pos = tk_io.get_record_pos(var)
    ref = tk_io.get_record_ref(var)
    alts = tk_io.get_record_alt_alleles(var)
    # NOTE(review): 30, 30, 30, 45 are positional thresholds interpreted by
    # tk_bam.get_allele_read_info -- confirm their meaning there.
    read_info = tk_bam.get_allele_read_info(
        chrom, pos, ref, alts, 30, 30, 30, 45, bam, reference_pyfasta)
    (counts, _, _, _, _, _) = read_info

    alt_reads = float(counts[1])
    # The fraction is only evaluated once there are at least two alt reads,
    # so the denominator is never zero for non-negative counts.
    if alt_reads < 2 or alt_reads / float(counts[0] + counts[1]) < 0.15:
        tk_io.set_record_filters(var, ['10X_ALLELE_FRACTION_FILTER'])
def pair_iter(i1, i2):
    """
    Walk two record streams together, pairing records by position.

    Each step compares tk_io.get_record_pos() of the pending record from each
    stream and yields:

    * (v1, v2)   when both positions are equal,
    * (v1, None) when the left position is smaller,
    * (None, v2) when the right position is smaller.

    Once one stream is exhausted, the rest of the other is yielded with None
    in the empty slot. None is the "nothing pending" marker, so the streams
    must not contain None themselves. Only positions are compared; this
    assumes both streams are in ascending position order on a single
    chromosome -- TODO confirm with callers.

    `i1` and `i2` may be any iterables.
    """
    # iter() is a no-op on iterators and lets plain sequences be passed too.
    i1 = iter(i1)
    i2 = iter(i2)

    v1 = None
    v2 = None
    while True:
        if v1 is None:
            try:
                # builtin next() works on both Python 2 and Python 3 iterators
                v1 = next(i1)
            except StopIteration:
                # Left side is done: flush whatever remains on the right.
                if v2 is not None:
                    yield (None, v2)
                for x2 in i2:
                    yield (None, x2)
                break

        if v2 is None:
            try:
                v2 = next(i2)
            except StopIteration:
                # Right side is done: flush whatever remains on the left.
                if v1 is not None:
                    yield (v1, None)
                for x1 in i1:
                    yield (x1, None)
                break

        k1 = tk_io.get_record_pos(v1)
        k2 = tk_io.get_record_pos(v2)
        if k1 == k2:
            yield (v1, v2)
            v1 = None
            v2 = None
        elif k1 < k2:
            yield (v1, None)
            v1 = None
        else:
            yield (None, v2)
            v2 = None
def split_variant_iterator(vfr_left, vfr_right, new_locus_left, new_locus_right):
    """
    Yield the records of the left locus followed by those of the right locus.

    Left records come from get_variant_iterator(vfr_left, new_locus_left) and
    are passed through untouched. For the right side, the position (minus
    one) of its first record is remembered; any later record whose phase set
    is positive but smaller than that value is re-assigned to it through
    adjust_phasing(..., flip=False) before being yielded.

    The two loci must not overlap (checked with an assert).
    """
    # assert no overlap
    assert new_locus_left.end <= new_locus_right.start

    for left_record in get_variant_iterator(vfr_left, new_locus_left):
        yield left_record

    right_start = None
    for right_record in get_variant_iterator(vfr_right, new_locus_right):
        if right_start is None:
            right_start = tk_io.get_record_pos(right_record) - 1

        # if we see a real phase set that's less than the new one,
        # then the block was truncated and should be updated
        phase_set = tk_io.get_record_phase_set(right_record)
        block_truncated = phase_set > 0 and phase_set < right_start
        if block_truncated:
            adjust_phasing(right_record, right_start, flip=False)

        yield right_record
def populate_fields(record, bam, reference_pyfasta, args):
    """
    Annotate `record` in place with repeat-context and read-support fields.

    For every alt allele whose tk_io.get_allele_length() is non-zero,
    populate_repeat_info() is called with unit lengths 1, 2 and 3; the
    resulting counts/bases are stored in INFO as POSTHPC/POSTHPB,
    POSTDNC/POSTDNB and POSTTNC/POSTTNB. Those keys are only written when at
    least one such allele exists.

    Read support comes from tk_bam.get_allele_read_info() and fills:
      * the record barcodes (tk_io.set_record_barcodes),
      * MMD        -- mean of molecule_differences[1], or -1 when that is NaN,
      * MUMAP_REF / MUMAP_ALT -- mean_mapqs[0] and mean_mapqs[1:],
      * RO / AO    -- counts[0] and counts[1:],
      * RESCUED / NOT_RESCUED -- totals of z and of (1 - z) over every entry
        of `rescue` (presumably 0/1 flags per read; confirm against tk_bam).

    `args` must provide `min_mapq_attach_bc` and `default_indel_qual`.
    Returns None.
    """
    alleles = tk_io.get_record_alt_alleles(record)
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)

    post_homopolymer_counts = []
    post_homopolymer_bases = []
    post_dinucleotide_counts = []
    post_dinucleotide_bases = []
    post_trinucleotide_counts = []
    post_trinucleotide_bases = []

    for allele in alleles:
        variant_length = tk_io.get_allele_length(ref, allele)
        if variant_length != 0:
            post_hp_c, post_hp_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 1)
            post_dn_c, post_dn_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 2)
            post_tn_c, post_tn_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 3)
            post_homopolymer_counts.append(post_hp_c)
            post_homopolymer_bases.append(post_hp_b)
            post_dinucleotide_counts.append(post_dn_c)
            post_dinucleotide_bases.append(post_dn_b)
            post_trinucleotide_counts.append(post_tn_c)
            post_trinucleotide_bases.append(post_tn_b)

    # All six lists grow together, so checking one of them is enough.
    if post_homopolymer_counts:
        record.INFO['POSTHPC'] = post_homopolymer_counts
        record.INFO['POSTHPB'] = post_homopolymer_bases
        record.INFO['POSTDNC'] = post_dinucleotide_counts
        record.INFO['POSTDNB'] = post_dinucleotide_bases
        record.INFO['POSTTNC'] = post_trinucleotide_counts
        record.INFO['POSTTNB'] = post_trinucleotide_bases

    # NOTE(review): 30 and -1 are positional thresholds interpreted by
    # tk_bam.get_allele_read_info -- confirm their meaning there. The fifth
    # returned value is not used here.
    (counts, mean_mapqs, bc_qual_string, molecule_differences, _, rescue) = \
        tk_bam.get_allele_read_info(
            chrom, pos, ref, alleles, 30, -1, args.min_mapq_attach_bc,
            args.default_indel_qual, bam, reference_pyfasta)

    tk_io.set_record_barcodes(record, bc_qual_string)

    # numpy.mean of an empty sequence is NaN; report that case as -1.
    record.INFO['MMD'] = numpy.mean(molecule_differences[1])
    if math.isnan(record.INFO['MMD']):
        record.INFO['MMD'] = -1

    record.INFO['MUMAP_REF'] = mean_mapqs[0]
    record.INFO['MUMAP_ALT'] = mean_mapqs[1:]
    record.INFO['RO'] = counts[0]
    record.INFO['AO'] = counts[1:]

    # numpy.sum() over a generator is deprecated (it used to fall back to the
    # builtin sum); call the builtin directly to get the same value.
    record.INFO['RESCUED'] = sum(numpy.sum(x) for x in rescue)
    record.INFO['NOT_RESCUED'] = numpy.sum(
        [numpy.sum([1 - z for z in x]) for x in rescue])
def load_variant_barcode_phasing_info(record, fragment_barcode_info):
    """
    Collect per-barcode values for the fragments overlapping a variant.

    Rows are fetched from `fragment_barcode_info` via
    tk_tabix.tabix_safe_fetch over [pos, pos + max record length + 1). Each
    row is tab-separated: column 3 is read as the fragment's phase set,
    column 6 as the barcode, and columns 7-9 as three floats. Rows whose
    phase set differs from the record's own "PS" value are skipped, unless
    the record has no phase set (-1), in which case every row is kept.

    Returns a dict mapping barcode -> (float, float, float). A barcode
    appearing twice among the kept rows trips an assert.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    end = pos + tk_io.get_record_max_length(record)

    first_sample = record.samples[0]
    phase_set = int(get_data(first_sample.data, "PS", -1))

    barcode_info = {}
    rows = tk_tabix.tabix_safe_fetch(fragment_barcode_info, chrom, pos, end + 1)
    for row in rows:
        fields = row.strip("\n").split("\t")
        barcode = fields[6]
        frag_phase_set = int(fields[3])
        # -1 means the record is unphased, so no row is excluded.
        if phase_set != -1 and frag_phase_set != phase_set:
            continue
        assert barcode not in barcode_info
        barcode_info[barcode] = (
            float(fields[7]), float(fields[8]), float(fields[9]))
    return barcode_info
def test_basic(self):
    """
    Run the stage and spot-check barcodes, allele counts and POSTHPC for a
    few known positions in the resulting default.vcf.
    """
    self.run_stage(self.args)

    # Load the output file
    vcf_path = os.path.join(job_dir, "default.vcf")
    vfr = tk_io.VariantFileReader(vcf_path)
    for r in vfr.record_getter():
        pos = tk_io.get_record_pos(r)
        barcodes = tk_io.get_record_barcodes(r)

        if pos in (26357747, 26357748):
            print(barcodes)
            assert barcodes[1][0] == '1-ATAGGAGTTCAGGG_63'
            print(tk_io.get_record_alt_allele_counts(r))
            first_alt_count = int(tk_io.get_record_alt_allele_counts(r)[0])
            assert first_alt_count in [31, 32]
            assert int(tk_io.get_record_ref_allele_count(r)) == 0

        if pos == 26501280:
            print(barcodes)
            assert barcodes[0][0] == '1-TGAAGACATAACCC_61_61'
            assert int(r.INFO['POSTHPC'][0]) == 10
def test_call_haps(self):
    """
    Call haplotypes into OUTPUT_VCF / OUTPUT_TSV, then read the VCF back and
    check that every record is phased and that the genotypes at
    ('chr1', 2) and ('chr1', 3) form one of the two expected phasings.
    """
    # Context managers close all three handles (including the template,
    # which was previously never closed) even if call_haps raises.
    with open(OUTPUT_VCF, 'w') as out_vcf, \
            open(SNP_INPUT_VCF, 'r') as template_vcf, \
            open(OUTPUT_TSV, 'w') as out_bc_haps:
        vfw = VariantFileWriter(out_vcf, template_file=template_vcf)
        self.p.call_haps(vfw, out_bc_haps)

    vfr = VariantFileReader(OUTPUT_VCF)
    hap_calls = {}
    for record in vfr.record_getter():
        chrom = tk_io.get_record_chrom(record)
        # Keys use the record position minus one.
        pos = tk_io.get_record_pos(record) - 1
        genotype, phased = tk_io.get_record_genotype_phased(record)
        hap_calls[(chrom, pos)] = genotype
        self.assertTrue(phased)

    print(hap_calls)
    # Either orientation of the two haplotypes is acceptable.
    forward = (hap_calls[('chr1', 2)] == [1, 2] and
               hap_calls[('chr1', 3)] == [1, 0])
    self.assertTrue(forward or
                    (hap_calls[('chr1', 2)] == [2, 1] and
                     hap_calls[('chr1', 3)] == [0, 1]))
def __init__(self, current_phase_set, record):
    """
    Capture the fields of a variant record needed for phasing.

    Stores chrom, pos, the (chrom, pos) key, the ref allele, the result of
    tk_io.get_record_passes_filters, the phased flag, the given phase set,
    the pair of allele strings selected by the genotype, and the record
    itself.
    """
    self.chrom = tk_io.get_record_chrom(record)
    self.pos = tk_io.get_record_pos(record)
    self.key = (self.chrom, self.pos)
    self.ref = tk_io.get_record_ref(record)
    self.filters = tk_io.get_record_passes_filters(record)

    # Index 0 is the reference allele, followed by the alts in record order.
    allele_table = [self.ref] + tk_io.get_record_alt_alleles(record)

    genotype, is_phased = tk_io.get_record_genotype_phased(record)
    first_idx, second_idx = genotype[0], genotype[1]
    # always set homozygotes as phased
    if first_idx == second_idx:
        is_phased = True
    self.phased = is_phased

    self.phase_set = current_phase_set
    # note -- if there are two alts, this will just pick one.
    self.hap = (allele_table[first_idx], allele_table[second_idx])
    self.record = record
def populate_repeat_info(record, bam, variant_length, reference_pyfasta, length):
    """
    Measure the run of identical `length`-sized units following a variant.

    Up to 30 reference bases are taken, starting at index pos + 1 of the
    record's chromosome (fewer near the chromosome end), and upper-cased. The
    first `length` bases form the repeat unit; the count is the number of
    consecutive copies of that unit from the start of the window.

    Returns (count, unit), or (0, None) when the window is empty.
    `bam` and `variant_length` are accepted but not used.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)

    # Never read past the end of the chromosome sequence.
    gap = min(30, len(reference_pyfasta[chrom]) - pos - 1)
    # from the base after the indel to the end of the gap
    window = reference_pyfasta[chrom][(pos + 1):(pos + gap + 1)].upper()

    post_poly_count = 0
    post_poly_base = None
    for offset in range(0, gap, length):
        unit = window[offset:offset + length]
        if post_poly_base is None:
            # The first unit defines what is being repeated.
            post_poly_base = unit
            post_poly_count = 1
        elif unit == post_poly_base:
            post_poly_count += 1
        else:
            break
    return post_poly_count, post_poly_base
def get_record_data(record):
    """
    Return (phase_set, chrom, pos - 1) for `record`, each obtained through
    the matching tk_io getter.
    """
    phase_set = tk_io.get_record_phase_set(record)
    chrom = tk_io.get_record_chrom(record)
    shifted_pos = tk_io.get_record_pos(record) - 1
    return (phase_set, chrom, shifted_pos)