def test_logdet_tk_adjustment(self):
    """logdet distances change when the Tamura-Kumar adjustment is toggled"""
    calculator = LogDetPair(
        moltype=DNA,
        alignment=load_aligned_seqs("data/brca1_5.paml", moltype=DNA),
    )

    # Run the same calculator twice, adjusted first and then unadjusted,
    # collecting the distance matrix after each run.
    results = []
    for adjust in (True, False):
        calculator.run(use_tk_adjustment=adjust, show_progress=False)
        results.append(calculator.get_pairwise_distances())

    adjusted, unadjusted = results
    self.assertNotEqual(adjusted, unadjusted)
def test_logdet_variance(self):
    """logdet variance agrees with a hand calculation from the joint counts

    With the Tamura-Kumar adjustment the variance entry is expected to
    equal None; without it, the reported variance is compared (atol=1e-3)
    against a value derived here directly from the two sequences.
    """
    data = [
        (
            "seq1",
            "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        ),
        (
            "seq2",
            "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
        ),
    ]
    (_, first_seq), (_, second_seq) = data

    calculator = LogDetPair(
        moltype=DNA, alignment=make_aligned_seqs(data=data, moltype=DNA)
    )
    calculator.run(use_tk_adjustment=True, show_progress=False)
    self.assertEqual(calculator.variances[1, 1], None)

    # Joint count matrix: rows index the seq1 state, columns the seq2 state.
    state_pos = {base: pos for pos, base in enumerate("ACGT")}
    joint = numpy.zeros((4, 4))
    for base1, base2 in zip(first_seq, second_seq):
        joint[state_pos[base1], state_pos[base2]] += 1

    # Empty diagonal cells receive a 0.5 pseudocount before normalising.
    for k in range(4):
        if joint[k, k] == 0:
            joint[k, k] += 0.5
    joint /= joint.sum()

    inverse = numpy.linalg.inv(joint)
    # Summed in the same (i outer, j inner) order as a nested loop would.
    expected_var = sum(
        inverse[j, i] ** 2 * joint[i, j] - 1
        for i in range(4)
        for j in range(4)
    )
    expected_var /= 16 * len(first_seq)

    calculator.run(use_tk_adjustment=False, show_progress=False)
    # Distances are requested before reading the variances; the result
    # itself is not inspected.
    calculator.get_pairwise_distances()
    assert_allclose(calculator.variances[1, 1], expected_var, atol=1e-3)
def test_logdet_pair_aa(self):
    """logdet runs to completion on amino acid sequences

    Smoke test only: the DNA alignment is translated and distances are
    requested; passing means no exception was raised.
    """
    protein_aln = load_aligned_seqs(
        "data/brca1_5.paml", moltype=DNA
    ).get_translation()

    calculator = LogDetPair(moltype=PROTEIN, alignment=protein_aln)
    calculator.run(use_tk_adjustment=True, show_progress=False)
    # The returned distances are deliberately not checked.
    calculator.get_pairwise_distances()
def test_logdet_pair_dna(self):
    """logdet DNA distances match the stored reference values

    The reference values are described by the original test as matching
    MEGA. Every pair returned by the calculator must be present in the
    reference table.
    """
    calculator = LogDetPair(
        moltype=DNA,
        alignment=load_aligned_seqs("data/brca1_5.paml", moltype=DNA),
    )
    calculator.run(use_tk_adjustment=True, show_progress=False)
    observed = calculator.get_pairwise_distances().to_dict()

    # The reference distances are symmetric, so each is listed once and
    # then mirrored to cover both orderings of a pair.
    unordered = {
        ("Human", "NineBande"): 0.075336929999999996,
        ("NineBande", "DogFaced"): 0.0898575452,
        ("DogFaced", "Human"): 0.1061747919,
        ("HowlerMon", "DogFaced"): 0.0934480008,
        ("Mouse", "HowlerMon"): 0.26422862920000001,
        ("HowlerMon", "NineBande"): 0.062202897899999998,
        ("Mouse", "Human"): 0.26539976700000001,
        ("HowlerMon", "Human"): 0.036571181899999999,
        ("DogFaced", "Mouse"): 0.2652555144,
        ("NineBande", "Mouse"): 0.22754789210000001,
    }
    all_expected = {}
    for (name_a, name_b), distance in unordered.items():
        all_expected[(name_a, name_b)] = distance
        all_expected[(name_b, name_a)] = distance

    for pair, got in observed.items():
        assert_allclose(got, all_expected[pair])
def test_logdet_for_determinant_lte_zero(self):
    """distance is NaN if the determinant is <= 0, or raises when asked to

    Checked with and without the Tamura-Kumar adjustment; with
    invalid_raises=True the run must raise ArithmeticError instead.
    """
    data = {
        "seq1": "AGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        "seq2": "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
    }
    aln = make_aligned_seqs(data=data, moltype=DNA)

    lenient = LogDetPair(moltype=DNA, alignment=aln)
    for adjust in (True, False):
        lenient.run(use_tk_adjustment=adjust, show_progress=False)
        values = list(lenient.get_pairwise_distances().to_dict().values())
        self.assertTrue(numpy.isnan(values[0]))

    # but raises ArithmeticError if told to
    strict = LogDetPair(moltype=DNA, alignment=aln, invalid_raises=True)
    with self.assertRaises(ArithmeticError):
        strict.run(use_tk_adjustment=True, show_progress=False)
def test_logdet_missing_states(self):
    """should calculate logdet measurement with missing states

    seq2 contains an ambiguous 'N'; a distance entry must still be
    produced both with and without the Tamura-Kumar adjustment.
    """
    data = [
        (
            "seq1",
            "GGGGGGGGGGGCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGCGGTTTTTTTTTTTTTTTTTT",
        ),
        (
            "seq2",
            "TAAAAAAAAAAGGGGGGGGGGGGGGGGGGTTTTTNTTTTTTTTTTTTCCCCCCCCCCCCCCCCC",
        ),
    ]
    aln = make_aligned_seqs(data=data, moltype=DNA)
    logdet_calc = LogDetPair(moltype=DNA, alignment=aln)

    for use_tk in (True, False):
        logdet_calc.run(use_tk_adjustment=use_tk, show_progress=False)
        dists = logdet_calc.get_pairwise_distances().to_dict()
        # assertIsNotNone reports the offending value on failure, unlike
        # assertTrue(x is not None), which only says "False is not true".
        # NOTE(review): a NaN distance also satisfies this check; confirm
        # whether a finite value is actually the intended requirement.
        self.assertIsNotNone(list(dists.values())[0])