def dprimecalc(record1, record2, snpcheck = True):
    '''Calculates Lewontin's D' statistic (D/Dmax) between two VCF records.

    Will check that records are biallelic (single-ALT) SNPs unless
    snpcheck = False.

    D is obtained from dcalc and divided by a bound built from the allele
    frequencies returned by freqsgetter:
        D >= 0: min(p1 * q2, p2 * q1)
        D <  0: max(-p1 * q1, -p2 * q2)
    The bound carries the same sign as D, so the ratio is D's magnitude
    relative to that bound. Returns 0 when the bound is 0.
    '''
    if snpcheck:
        snpchecker(record1, record2)

    # The check above has already run (or was declined), so the helpers
    # are told not to repeat it.
    values = freqsgetter(record1, record2, snpcheck = False)[1] # get allele frequencies
    d = dcalc(record1, record2, snpcheck = False)

    # d >= 0 and d < 0 are exhaustive for any real d, so a two-way split is
    # enough; a third branch after them could never execute.
    if d >= 0:
        bound = min(values['p1'] * values['q2'], values['p2'] * values['q1'])
    else:
        bound = max(-1 * values['p1'] * values['q1'], -1 * values['p2'] * values['q2'])

    # A zero bound would make the ratio undefined; report 0 instead.
    if bound == 0:
        return 0
    return d/bound
def quick_dcalc(record1, record2):
    '''Calculates the D statistic between two VCF records without running
    the SNP check first.

    Uses the third element of freqsgetter's return value as a lookup keyed
    by 'AB', 'ab', 'Ab' and 'aB', and returns
        AB * ab - Ab * aB
    A product whose keys are not both present counts as 0.
    '''
    hap_freqs = freqsgetter(record1, record2, snpcheck = False)[2]

    def _product(first, second):
        # A missing key means that haplotype is absent from the lookup, so
        # the whole product is treated as zero.
        try:
            return hap_freqs[first] * hap_freqs[second]
        except KeyError:
            return 0

    coupling = _product('AB', 'ab')
    repulsion = _product('Ab', 'aB')
    return coupling - repulsion
def freqscalc(record1, record2, snpcheck = True, aaf = False):
    '''Exploratory convenience function. Given two VCF records, prints
    their allele frequencies and the observed haplotype frequencies.

    Will check that records are biallelic (single-ALT) SNPs unless
    snpcheck = False.

    aaf = True takes the ALT frequency from each record's `aaf` attribute
    (first entry) and uses 1 minus that value for the REF allele, while
    aaf = False (default) makes freqscalc obtain the frequencies from
    freqsgetter instead (more accurate option).

    Haplotypes are scored only for samples whose GT at both records is
    '0' or '1'; any other call (including the missing call '.') is skipped.
    Each observed haplotype is printed with its share of the scored
    samples, rounded to 5 decimal places. Nothing is returned.
    '''
    from collections import Counter

    if snpcheck:
        snpchecker(record1, record2)

    # get allele frequencies
    if aaf:
        p2 = record1.aaf[0]
        q2 = record2.aaf[0]
        p = 1 - p2
        q = 1 - q2
    else:
        # The SNP check was already handled above, so it is not requested
        # again here -- the same convention the other calculators follow.
        values = freqsgetter(record1, record2, snpcheck = False)[1]
        p = values['p1']
        q = values['q1']
        p2 = values['p2']
        q2 = values['q2']

    print(record1.CHROM, record1.POS, '- ref', record1.REF, 'alt', record1.ALT[0])
    print(record2.CHROM, record2.POS, '- ref', record2.REF, 'alt', record2.ALT[0])
    print('p1 ', p, 'p2 ', p2)
    print('q1 ', q, 'q2 ', q2)

    # get samples + check for same samples b/w both records
    strainlist = straingetter(record1, record2)

    # Map each scorable GT code to the allele string it stands for.
    alleles1 = {'0': record1.REF, '1': str(record1.ALT[0])}
    alleles2 = {'0': record2.REF, '1': str(record2.ALT[0])}

    # score haplotypes
    hapcounts = Counter()
    for strain in strainlist:
        gt1 = record1.genotype(strain)['GT']
        gt2 = record2.genotype(strain)['GT']
        # Skip anything that is not a plain '0'/'1' call. Without this, an
        # unexpected GT would reuse the previous sample's haplotype (or
        # fail outright on the first sample).
        if gt1 not in alleles1 or gt2 not in alleles2:
            continue
        hapcounts[alleles1[gt1] + alleles2[gt2]] += 1

    # report observed haplotypes in first-seen order
    total = sum(hapcounts.values())
    for hap, count in hapcounts.items():
        print(hap, round(count/total, 5))
def r2calc(record1, record2, snpcheck = True):
    '''Calculates r^2 (correlation) between two VCF records.

    Will check that records are biallelic (single-ALT) SNPs unless
    snpcheck = False.

    Returns D squared (D from dcalc) divided by p1 * q1 * p2 * q2, where
    the four allele frequencies come from freqsgetter. Returns 0 if any
    of those four frequencies is 0.
    '''
    if snpcheck:
        snpchecker(record1, record2)

    freqs = freqsgetter(record1, record2, snpcheck = False)[1]

    # Any zero frequency would zero the denominator, so bail out early.
    if any(freqs[key] == 0 for key in ('p1', 'q1', 'p2', 'q2')):
        return 0

    d = dcalc(record1, record2, snpcheck = False)
    denominator = freqs['p1'] * freqs['q1'] * freqs['p2'] * freqs['q2']
    return d**2/denominator
def dcalc(record1, record2, snpcheck = True):
    '''Calculates D statistic between two VCF records.

    Will check that records are biallelic (single-ALT) SNPs unless
    snpcheck = False.

    D is computed as AB * ab - Ab * aB from the third element of
    freqsgetter's return value; a product with a missing key counts as 0.
    '''
    if snpcheck:
        snpchecker(record1, record2)

    hap_freqs = freqsgetter(record1, record2, snpcheck = False)[2]

    # First pair is the term added to D, second pair the term subtracted.
    terms = []
    for first, second in (('AB', 'ab'), ('Ab', 'aB')):
        try:
            terms.append(hap_freqs[first] * hap_freqs[second])
        except KeyError: # either hap missing
            terms.append(0)

    positive_term, negative_term = terms
    return positive_term - negative_term