def _smooth(self, snp_idx_to_smooth, snp_gts, samples):
    """Smooth the genotypes of one SNP using the other SNPs of its window.

    Every SNP in the window votes, per sample, for the genotype it carries.
    Each vote is weighted by ``2 * (0.5 - recomb_rate)``, where the
    recombination rate is estimated between the SNP being smoothed and the
    voting SNP; SNPs with no haplotype information or with a rate of 0.5 or
    more get a weight of 0.

    Args:
        snp_idx_to_smooth: position, inside ``snp_gts``, of the SNP whose
            genotypes are smoothed.
        snp_gts: sequence of ``(snp, genotypes)`` pairs for the window;
            ``genotypes`` is looked up with ``.get(sample, default)``.
        samples: samples to smooth; the result follows this order.

    Returns:
        A list with one entry per sample: the winning genotype tuple,
        ``('A', 'B')`` when the number of recombinations exceeds
        ``self.recomb_threshold``, or ``None`` when the winning genotype
        does not reach ``self.smooth_threhsold``.

    Side effects:
        Appends, per sample, the number of recombinations to
        ``self._recombs`` and the support index to ``self._smoothes``.
    """
    snps, gt_for_snps_in_win = zip(*snp_gts)

    # we need the recomb rates
    # TODO, this is already calculated in ab coding, we have to cache
    snp1 = snps[snp_idx_to_smooth]
    snp1_calls = [snp1.genotype(sample) for sample in samples]
    weights = []
    for snp2 in snps:
        snp2_calls = [snp2.genotype(sample) for sample in samples]
        haplos = _count_biallelic_haplotypes(snp1_calls, snp2_calls,
                                             return_alleles=True)
        if haplos is None:
            weight = 0
        else:
            # Only the haplotype counts are needed here, not the alleles.
            haplo_cnt, _ = haplos
            recomb_rate = (haplo_cnt.aB + haplo_cnt.Ab) / sum(haplo_cnt)
            weight = 2 * (0.5 - recomb_rate) if recomb_rate < 0.5 else 0
        weights.append(weight)

    # we have to transpose, we want the genotype for each indi not for
    # each snp
    # NOTE(review): the fallback for a sample missing from a genotype
    # mapping is (indi, None); sorting a str together with None raises
    # TypeError on Python 3 -- confirm the intended placeholder.
    indis_gts = {indi: [] for indi in samples}
    for indi in samples:
        for snp_gt in gt_for_snps_in_win:
            snp_indi_gt = snp_gt.get(indi, (indi, None))
            indis_gts[indi].append(tuple(sorted(snp_indi_gt)))

    # Now we can do the smoothing
    recomb_thres = self.recomb_threshold
    smooth_threhsold = self.smooth_threhsold
    smoothed_genos = []
    for indi in samples:
        indi_gt = indis_gts[indi]
        n_recombs = self._count_recombinations(indi_gt)

        counts = Counter()
        for weight, geno in zip(weights, indi_gt):
            counts[geno] += weight
        smoothed_geno, vote = counts.most_common(1)[0]

        # When every SNP in the window has weight 0 there is no support
        # for any genotype; avoid dividing by zero and report index 0.
        total_weight = sum(counts.values())
        has_support = total_weight > 0
        index = vote / total_weight if has_support else 0

        self._recombs.append(n_recombs)
        self._smoothes.append(index)

        if recomb_thres is not None and n_recombs > recomb_thres:
            # We're assuming diploid here
            geno = ('A', 'B')
        elif has_support and index > smooth_threhsold:
            geno = smoothed_geno
        else:
            geno = None
        smoothed_genos.append(geno)
    return smoothed_genos
def test_empy_snv(self):
    """Haplotype counting yields None when one SNP has only missing calls."""
    # NOTE(review): the separators of the '#CHROM' header line should be
    # confirmed against what the VCF parser expects.
    vcf_body = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t./.
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1'''
    fhand = StringIO(VCF_HEADER + vcf_body)
    parsed_snps = list(VCFReader(fhand).parse_snvs())

    # The first SNP is the all-missing one, the second has real calls.
    empty_calls = parsed_snps[0].record.samples
    real_calls = parsed_snps[1].record.samples

    haplo_counts = _count_biallelic_haplotypes(empty_calls, real_calls)
    assert haplo_counts is None
def _deduce_coding(self, snp_and_coding, snp1_calls, snp2_idxs):
    """Deduce the A/B coding of a SNP by a weighted vote of coded SNPs.

    Each SNP referenced by ``snp2_idxs`` that already has a coding proposes
    a coding for snp1, derived from the allele pairing reported by
    ``_count_biallelic_haplotypes``. A proposal is weighted by
    ``2 * (0.5 - recomb_rate)``, or 0 when the rate is 0.5 or more.

    Args:
        snp_and_coding: indexable collection of ``(snp, coding)`` pairs.
        snp1_calls: genotype calls of the SNP whose coding is deduced.
        snp2_idxs: indexes into ``snp_and_coding`` of the voting SNPs.

    Returns:
        A dict mapping the winning coding's ``A`` allele to ``'A'`` and its
        ``B`` allele to ``'B'``, or ``None`` when no coding is accepted.

    Side effects:
        Increments one of the ``self.log`` counters and, when a winner is
        evaluated, appends its support index to ``self.indexes``.
    """
    tally = Counter()
    progeny = self.offspring

    for idx in snp2_idxs:
        other_snp, other_coding = snp_and_coding[idx]
        if other_coding is None:
            continue

        other_calls = [other_snp.genotype(sample) for sample in progeny]
        result = _count_biallelic_haplotypes(snp1_calls, other_calls,
                                             return_alleles=True)
        if result is None:
            continue

        haplo_cnt, alleles = result
        # Maps each allele of snp2 to the snp1 allele it is paired with.
        snp2_to_snp1 = {alleles.b: alleles.a, alleles.B: alleles.A}
        # NOTE(review): this check runs after ``alleles`` has already been
        # dereferenced; confirm whether it can ever trigger.
        if haplo_cnt is None:
            continue

        # The offspring alleles in snp2 do not match the alleles
        # in the parents
        if not (other_coding.A in snp2_to_snp1
                and other_coding.B in snp2_to_snp1):
            continue

        # Alleles A and B of snp1, as implied by the coding of snp2
        candidate = AlleleCoding(snp2_to_snp1[other_coding.A],
                                 snp2_to_snp1[other_coding.B])

        recomb_rate = (haplo_cnt.aB + haplo_cnt.Ab) / sum(haplo_cnt)
        if recomb_rate < 0.5:
            weight = 2 * (0.5 - recomb_rate)
        else:
            weight = 0
        tally[candidate] += weight

    total_weight = sum(tally.values())
    if not tally or total_weight == 0:
        self.log[NO_INFO] += 1
        return None
    if len(tally) > 2:
        self.log[MORE_THAN_2_CODINGS] += 1
        return None

    winner = tally.most_common(1)[0][0]
    support = tally[winner] / total_weight
    self.indexes.append(support)
    if support < self.parent_index_threshold:
        self.log[NOT_ENOUGH_SUPPORT] += 1
        return None

    self.log[ENOUGH_SUPPORT] += 1
    return {winner.A: 'A', winner.B: 'B'}
def _deduce_coding(self, snp_and_coding, snp1_calls, snp2_idxs):
    """Pick the A/B coding for snp1 from the votes of already coded SNPs.

    Every SNP listed in ``snp2_idxs`` that has a coding casts a vote for
    the snp1 coding implied by the allele pairing that
    ``_count_biallelic_haplotypes`` reports. Votes are weighted by
    ``2 * (0.5 - recomb_rate)``; a rate of 0.5 or more gives weight 0.

    Args:
        snp_and_coding: indexable collection of ``(snp, coding)`` pairs.
        snp1_calls: genotype calls of the SNP being coded.
        snp2_idxs: indexes into ``snp_and_coding`` of the voting SNPs.

    Returns:
        ``{allele_A: 'A', allele_B: 'B'}`` for the winning coding, or
        ``None`` if there is no information, more than two codings were
        proposed, or the winner's support is below
        ``self.parent_index_threshold``.

    Side effects:
        Updates the ``self.log`` counters and appends the winner's support
        index to ``self.indexes`` when a winner is evaluated.
    """
    ballots = Counter()
    offspring_samples = self.offspring

    for voter_idx in snp2_idxs:
        voter_snp, voter_coding = snp_and_coding[voter_idx]
        if voter_coding is None:
            continue

        voter_calls = [voter_snp.genotype(sample)
                       for sample in offspring_samples]
        counted = _count_biallelic_haplotypes(snp1_calls, voter_calls,
                                              return_alleles=True)
        if counted is None:
            continue

        haplo_cnt, alleles = counted
        # snp2 allele -> snp1 allele found together with it
        paired_allele = {alleles.b: alleles.a, alleles.B: alleles.A}
        # NOTE(review): ``alleles`` is used before this check; confirm
        # whether ``haplo_cnt`` can actually be None at this point.
        if haplo_cnt is None:
            continue

        # The offspring alleles in snp2 do not match the alleles
        # in the parents
        if not (voter_coding.A in paired_allele
                and voter_coding.B in paired_allele):
            continue

        # snp1 alleles matching the A and B alleles of the voting SNP
        proposal = AlleleCoding(paired_allele[voter_coding.A],
                                paired_allele[voter_coding.B])

        recomb_rate = (haplo_cnt.aB + haplo_cnt.Ab) / sum(haplo_cnt)
        if recomb_rate < 0.5:
            weight = 2 * (0.5 - recomb_rate)
        else:
            weight = 0
        ballots[proposal] += weight

    ballot_sum = sum(ballots.values())
    if not ballots or ballot_sum == 0:
        self.log[NO_INFO] += 1
        return None
    if len(ballots) > 2:
        self.log[MORE_THAN_2_CODINGS] += 1
        return None

    best_coding = ballots.most_common(1)[0][0]
    index = ballots[best_coding] / ballot_sum
    self.indexes.append(index)
    if index < self.parent_index_threshold:
        self.log[NOT_ENOUGH_SUPPORT] += 1
        return None

    self.log[ENOUGH_SUPPORT] += 1
    return {best_coding.A: 'A', best_coding.B: 'B'}
def test_count_homo_haplotypes(self):
    """Check haplotype counts, r^2 and Fisher results on small SNP pairs."""

    def load_pair(vcf_body):
        # Parse a two-record VCF body and return the SNPs plus the sample
        # calls of the first and second record.
        fhand = StringIO(VCF_HEADER + vcf_body)
        parsed = list(VCFReader(fhand).parse_snvs())
        return parsed, parsed[0].record.samples, parsed[1].record.samples

    # NOTE(review): the separators of the '#CHROM' header lines should be
    # confirmed against what the VCF parser expects.

    # identical SNPs, six samples
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    assert counts.AB == 3
    assert counts.ab == 2

    # identical SNPs, eight samples
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1\t1/1\t0/0
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1\t1/1\t0/0'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    assert counts.AB == 4
    assert counts.ab == 3

    # one sample of the second SNP carries a third allele (2/2)
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1\t1/1\t0/0
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t2/2\t1/1\t0/0\t0/0\t0/1\t1/1\t0/0'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    assert counts.AB == 4
    assert counts.ab == 3
    r_sqr = calculate_r_sqr(snps[0], snps[1])
    self.assertAlmostEqual(r_sqr, 1)

    # NOTE(review): the two checks below are one-sided, they pass for any
    # r_sqr lower than 1.0001.
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1\t1/1\t0/0
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    r_sqr = calculate_r_sqr(snps[0], snps[1])
    assert r_sqr - 1.0 < 0.0001

    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1\t1/1\t0/0
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1\t1/1\t1/1'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    r_sqr = calculate_r_sqr(snps[0], snps[1])
    assert r_sqr - 1.0 < 0.0001

    # monomorphic
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0'''
    snps, calls1, calls2 = load_pair(vcf)
    assert _count_biallelic_haplotypes(calls1, calls2) is None
    r_sqr = calculate_r_sqr(snps[0], snps[1])
    assert r_sqr is None
    assert fisher_exact(snps[0], snps[1]) is None

    # Ab and aB
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t0/0\t
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t1/1\t1/1\t0/0\t1/1\t'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    assert counts.AB == 3
    assert counts.ab == 2
    assert counts.aB == 1
    assert counts.Ab == 1

    # different major allele names in snp1 (1, 2) and snp2 (2,3)
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t0/0\t
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t3/3\t3/3\t3/3\t2/2\t2/2\t3/3\t3/3\t'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    assert counts.AB == 4
    assert counts.ab == 2
    assert counts.aB == 1
    assert counts.Ab == 0

    # missing data
    vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t./.\t
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t3/3\t3/3\t3/3\t2/2\t2/2\t3/3\t3/3\t'''
    snps, calls1, calls2 = load_pair(vcf)
    counts = _count_biallelic_haplotypes(calls1, calls2)
    assert counts.AB == 3
    assert counts.ab == 2
    assert counts.aB == 1
    assert counts.Ab == 0