def imm_example():
    """
    Build a small example HMM over the symbols ``AC`` together with a DP.

    The model is the linear chain ``S -> M1 -> M2 -> E`` where ``S`` and ``E``
    are mute states and ``M1``/``M2`` are normal (emitting) states. Only ``S``
    has a non-zero start probability.

    Returns
    -------
    dict
        Mapping with the keys ``"hmm"``, ``"dp"`` (created with ``E`` as the
        end state) and ``"alphabet"``.
    """
    abc = Alphabet.create(b"AC", b"X")
    model = HMM.create(abc)

    start = MuteState.create(b"S", abc)
    model.add_state(start, log(1.0))

    end = MuteState.create(b"E", abc)
    model.add_state(end, lprob_zero())

    match1 = NormalState.create(b"M1", abc, [log(0.8), log(0.2)])
    model.add_state(match1, lprob_zero())

    match2 = NormalState.create(b"M2", abc, [log(0.4), log(0.6)])
    model.add_state(match2, lprob_zero())

    # E receives a self-loop before normalize() and loses it again right
    # after (presumably so that normalize() accepts E -- TODO confirm).
    chain = ((start, match1), (match1, match2), (match2, end), (end, end))
    for src, dst in chain:
        model.set_transition(src, dst, log(1.0))

    model.normalize()
    model.set_transition(end, end, lprob_zero())

    return {"hmm": model, "dp": model.create_dp(end), "alphabet": abc}
def test_hmm_viterbi_2():
    """
    Viterbi on the chain ``S -> M1 -> M2 -> E``, before and after adding a
    direct ``M1 -> E`` transition.
    """
    abc = Alphabet.create(b"AC", b"X")
    model = HMM.create(abc)

    start = MuteState.create(b"S", abc)
    model.add_state(start, log(1.0))

    end = MuteState.create(b"E", abc)
    model.add_state(end, lprob_zero())

    match1 = NormalState.create(b"M1", abc, [log(0.8), log(0.2)])
    model.add_state(match1, lprob_zero())

    match2 = NormalState.create(b"M2", abc, [log(0.4), log(0.6)])
    model.add_state(match2, lprob_zero())

    for src, dst in ((start, match1), (match1, match2), (match2, end), (end, end)):
        model.set_transition(src, dst, log(1.0))
    model.normalize()
    # The E self-loop is only present while normalising.
    model.set_transition(end, end, lprob_zero())

    def best_path_loglik(dp, task, text):
        # Decode `text` with Viterbi and score the returned path with the HMM.
        seq = Sequence.create(text, abc)
        task.setup(seq)
        found = dp.viterbi(task)
        return model.loglikelihood(seq, found.path)

    dp = model.create_dp(end)
    task = DPTask.create(dp)
    for text, prob in ((b"AC", 0.48), (b"AA", 0.32), (b"CA", 0.08), (b"CC", 0.12)):
        assert_allclose(best_path_loglik(dp, task, text), log(prob))

    # Add M1 -> E (without re-normalising) and rebuild the DP.
    model.set_transition(match1, end, log(1.0))
    dp = model.create_dp(end)
    task = DPTask.create(dp)
    for text, prob in ((b"AC", 0.48), (b"AA", 0.32)):
        assert_allclose(best_path_loglik(dp, task, text), log(prob))
def _null_amino_lprobs(symbols: str):
    """
    Look up amino-acid log-probabilities from a fixed frequency table.

    The frequencies are copied from HMMER3, where they were inferred from
    Swiss-Prot 50.8 (Oct 2006), counting over 85956127 (86.0M) residues.

    Parameters
    ----------
    symbols
        Amino-acid symbols, one character per amino acid.

    Returns
    -------
    list
        One value per character of `symbols`, in the same order: the log of
        the tabulated frequency, or ``lprob_zero()`` for a character that is
        not in the table.
    """
    frequencies = (
        ("A", 0.0787945),
        ("C", 0.0151600),
        ("D", 0.0535222),
        ("E", 0.0668298),
        ("F", 0.0397062),
        ("G", 0.0695071),
        ("H", 0.0229198),
        ("I", 0.0590092),
        ("K", 0.0594422),
        ("L", 0.0963728),
        ("M", 0.0237718),
        ("N", 0.0414386),
        ("P", 0.0482904),
        ("Q", 0.0395639),
        ("R", 0.0540978),
        ("S", 0.0683364),
        ("T", 0.0540687),
        ("V", 0.0673417),
        ("W", 0.0114135),
        ("Y", 0.0304133),
    )
    table = {amino: log(freq) for amino, freq in frequencies}
    return [table.get(sym, lprob_zero()) for sym in symbols]
def _create_base_table(codonp: CodonLprob):
    """
    Derive a per-base log-probability table from a codon table.

    For every codon produced by ``codon_iter`` over the codon table's
    alphabet, the codon's log-probability minus ``log(3)`` is accumulated
    (with ``lprob_add``) onto the base found at each of its three positions.

    Parameters
    ----------
    codonp
        Codon log-probabilities; its alphabet must have exactly four symbols.

    Returns
    -------
    The object created by ``BaseLprob.create`` from the alphabet and the four
    accumulated values, ordered like the alphabet's symbols.
    """
    base_abc = codonp.alphabet
    accum = {base: lprob_zero() for base in base_abc.symbols}
    log_three = log(3)

    for codon in codon_iter(base_abc):
        # Each codon position receives one third of the codon's probability.
        share = codonp.get_lprob(codon) - log_three
        triplet = codon.symbols
        for pos in range(3):
            base = triplet[pos]
            accum[base] = lprob_add(accum[base], share)

    assert len(accum) == 4
    bases = base_abc.symbols
    assert len(bases) == 4

    return BaseLprob.create(base_abc, tuple(accum[bases[i]] for i in range(4)))
def _get_target_length_model(self, target_length: int) -> SpecialTransitions:
    """
    Update the special transitions for a given target length and return them.

    With ``L = target_length`` and ``q`` equal to 0.5 when
    ``self._multiple_hits`` is set (0 otherwise), the values written are:

    - ``NN``, ``CC``, ``JJ``: ``log(L) - log(L + 2 + q / (1 - q))``
    - ``NB``, ``CT``, ``JB``: ``log(2 + q / (1 - q)) - log(L + 2 + q / (1 - q))``
    - ``RR``: ``log(L) - log(L + 1)``
    - ``EJ``: ``log(q)`` (``lprob_zero()`` when ``q`` is 0)
    - ``EC``: ``log(1 - q)``

    Parameters
    ----------
    target_length
        Length of the target sequence; must be positive.

    Returns
    -------
    SpecialTransitions
        ``self._special_trans``, modified in place.

    Raises
    ------
    ValueError
        If `target_length` is zero or negative.
    """
    L = target_length
    if L == 0:
        raise ValueError("Target length cannot be zero.")
    if L < 0:
        # log(L) below is undefined for negative lengths; fail with a clear
        # message instead of a domain error (or NaN) from the logarithm.
        raise ValueError("Target length cannot be negative.")

    if self._multiple_hits:
        q = 0.5
        log_q = log(0.5)
    else:
        q = 0.0
        log_q = lprob_zero()

    # q / (1 - q) and the shared denominator appear in both lp and l1p.
    odds = q / (1 - q)
    log_denom = log(L + 2 + odds)

    lp = log(L) - log_denom
    l1p = log(2 + odds) - log_denom
    lr = log(L) - log(L + 1)

    t = self._special_trans

    t.NN = t.CC = t.JJ = lp
    t.NB = t.CT = t.JB = l1p
    t.RR = lr
    t.EJ = log_q
    t.EC = log(1 - q)

    return t
def test_hmm_trans_prob():
    """
    Check transition getters/setters, their error cases, and normalisation on
    an HMM with the two mute states ``S`` and ``E``.
    """
    abc = Alphabet.create(b"ACGU", b"X")
    model = HMM.create(abc)

    def check_transition(src, dst, prob):
        # `prob` is a plain probability; None stands for lprob_zero().
        assert_allclose(
            model.transition(src, dst),
            lprob_zero() if prob is None else log(prob),
        )

    start = MuteState.create(b"S", abc)
    # S has not been added to the HMM yet.
    with pytest.raises(RuntimeError):
        model.set_start_lprob(start, log(0.4))
    model.add_state(start)

    end = MuteState.create(b"E", abc)
    # E has not been added yet: every operation involving it must fail.
    with pytest.raises(RuntimeError):
        model.transition(start, end)
    with pytest.raises(ValueError):
        model.set_transition(start, end, lprob_zero())
    with pytest.raises(ValueError):
        model.set_transition(end, start, lprob_zero())
    with pytest.raises(ValueError):
        model.del_state(end)
    model.add_state(end)

    with pytest.raises(RuntimeError):
        model.set_transition(end, start, lprob_invalid())
    with pytest.raises(ValueError):
        model.normalize()

    model.set_transition(start, end, log(0.5))
    check_transition(start, start, None)
    check_transition(start, end, 0.5)
    check_transition(end, start, None)
    check_transition(end, end, None)

    # Normalisation keeps failing, and does so repeatedly.
    for _ in range(2):
        with pytest.raises(ValueError):
            model.normalize()

    model.set_start_lprob(start, log(0.4))
    model.set_transition(end, end, log(0.1))
    model.normalize()

    check_transition(start, end, 1.0)
    check_transition(end, start, None)
    check_transition(start, start, None)
    check_transition(end, end, 1.0)
def _create_codon_prob(aminot: AminoLprob, gencode: CodonTable) -> CodonLprob:
    """
    Spread amino-acid log-probabilities over their codons.

    Each amino acid's log-probability is divided evenly among the codons that
    `gencode` lists for it; amino acids without codons are skipped. The
    resulting codon values are then normalised by their ``lprob_add`` total
    before being stored.

    Parameters
    ----------
    aminot
        Amino-acid log-probabilities.
    gencode
        Codon table providing ``base_alphabet`` and ``codons(aa)``.

    Returns
    -------
    CodonLprob
        Table created over ``gencode.base_alphabet`` with one entry set per
        codon encountered.
    """
    codonp = CodonLprob.create(gencode.base_alphabet)

    pending = []
    total = lprob_zero()

    amino_symbols = aminot.alphabet.symbols
    for start in range(len(amino_symbols)):
        # Slice rather than index so `aa` is a length-1 sequence of the same
        # type as the symbols (presumably bytes -- TODO confirm).
        aa = amino_symbols[start : start + 1]
        aa_lprob = aminot.lprob(aa)

        codons = gencode.codons(aa)
        if len(codons) == 0:
            continue

        share = aa_lprob - log(len(codons))
        for codon in codons:
            pending.append((codon, share))
            total = lprob_add(total, share)

    for codon, lprob in pending:
        codonp.set_lprob(codon, lprob - total)

    return codonp
def _sort(self, lprobs: Mapping[str, float]) -> List[float]:
    """
    Arrange log-probabilities in the order of the alphabet's symbols.

    Parameters
    ----------
    lprobs
        Mapping from a symbol (single-character string) to its
        log-probability.

    Returns
    -------
    List[float]
        One value per decoded symbol of ``self._alphabet``, in alphabet order;
        symbols missing from `lprobs` get ``lprob_zero()``.
    """
    ordered = []
    for sym in self._alphabet.symbols.decode():
        ordered.append(lprobs.get(sym, lprob_zero()))
    return ordered
def test_hmm_loglikelihood():
    """
    Score every two-symbol sequence along the fixed path ``S, M1, M2, E`` and
    check that a path through a state unknown to the HMM is rejected.
    """
    abc = Alphabet.create(b"ACGU", b"X")
    model = HMM.create(abc)

    start = MuteState.create(b"S", abc)
    model.add_state(start, log(1.0))

    end = MuteState.create(b"E", abc)
    model.add_state(end, lprob_zero())

    match1 = NormalState.create(
        b"M1",
        abc,
        [log(0.8), log(0.2), lprob_zero(), lprob_zero()],
    )
    model.add_state(match1, lprob_zero())

    match2 = NormalState.create(
        b"M2",
        abc,
        [log(0.4 / 1.6), log(0.6 / 1.6), lprob_zero(), log(0.6 / 1.6)],
    )
    model.add_state(match2, lprob_zero())

    for src, dst in ((start, match1), (match1, match2), (match2, end), (end, end)):
        model.set_transition(src, dst, log(1.0))
    model.normalize()

    def path_via(second_match):
        # S and E consume no symbols; each match state consumes one.
        return Path.create(
            [
                Step.create(start, 0),
                Step.create(match1, 1),
                Step.create(second_match, 1),
                Step.create(end, 0),
            ]
        )

    # (sequence, expected probability); None stands for lprob_zero().
    cases = (
        (b"AC", 0.3),
        (b"AA", 0.2),
        (b"AG", None),
        (b"AU", 0.3),
        (b"CC", 0.075),
        (b"CA", 0.05),
        (b"CG", None),
        (b"CG", None),
        (b"CU", 0.075),
        (b"GC", None),
        (b"GA", None),
        (b"GG", None),
        (b"GU", None),
        (b"UC", None),
        (b"UA", None),
        (b"UG", None),
        (b"UU", None),
    )
    for text, prob in cases:
        p = model.loglikelihood(Sequence.create(text, abc), path_via(match2))
        assert_allclose(p, lprob_zero() if prob is None else log(prob))

    # A state that was never added to the HMM (it reuses the name "M2").
    outsider = NormalState.create(
        b"M2",
        abc,
        [log(0.4), log(0.6), lprob_zero(), log(0.6)],
    )
    with pytest.raises(ValueError):
        model.loglikelihood(Sequence.create(b"UU", abc), path_via(outsider))
def test_hmm_viterbi_3():
    """
    Viterbi on an HMM with match states ``M1``/``M2`` and mute states
    ``D1``/``D2``, using different end states and after deleting ``E``.
    """
    abc = Alphabet.create(b"AC", b"X")
    model = HMM.create(abc)

    start = MuteState.create(b"S", abc)
    model.add_state(start, log(1.0))

    end = MuteState.create(b"E", abc)
    model.add_state(end, lprob_zero())

    match1 = NormalState.create(b"M1", abc, [log(0.8), log(0.2)])
    model.add_state(match1, lprob_zero())

    del1 = MuteState.create(b"D1", abc)
    model.add_state(del1, lprob_zero())

    match2 = NormalState.create(b"M2", abc, [log(0.4), log(0.6)])
    model.add_state(match2, lprob_zero())

    del2 = MuteState.create(b"D2", abc)
    model.add_state(del2, lprob_zero())

    transitions = (
        (start, match1, 0.8),
        (start, del1, 0.2),
        (match1, match2, 0.8),
        (match1, del2, 0.2),
        (del1, del2, 0.2),
        (del1, match2, 0.8),
        (del2, end, 1.0),
        (match2, end, 1.0),
        (end, end, 1.0),
    )
    for src, dst, prob in transitions:
        model.set_transition(src, dst, log(prob))
    model.normalize()
    # The E self-loop is only present while normalising.
    model.set_transition(end, end, lprob_zero())

    def decode(dp, task, text):
        # Run Viterbi on `text`; return the sequence and the DP result.
        seq = Sequence.create(text, abc)
        task.setup(seq)
        return seq, dp.viterbi(task)

    def best_path_loglik(dp, task, text):
        seq, found = decode(dp, task, text)
        return model.loglikelihood(seq, found.path)

    dp = model.create_dp(end)
    task = DPTask.create(dp)

    seq, result = decode(dp, task, b"AC")
    score = model.loglikelihood(seq, result.path)
    assert bytes(result.sequence) == b"AC"
    steps = list(result.path)
    for idx, expected_len in enumerate((0, 1, 1, 0)):
        assert steps[idx].seq_len == expected_len
    assert_allclose(score, log(0.3072))

    for text, prob in ((b"AA", 0.2048), (b"A", 0.128), (b"AC", 0.3072)):
        assert_allclose(best_path_loglik(dp, task, text), log(prob))

    # Use M2 as the end state instead of E.
    dp = model.create_dp(match2)
    task = DPTask.create(dp)
    assert_allclose(best_path_loglik(dp, task, b"AC"), log(0.3072))

    # Same again once E has been removed from the HMM.
    model.del_state(end)
    dp = model.create_dp(match2)
    task = DPTask.create(dp)
    assert_allclose(best_path_loglik(dp, task, b"AC"), log(0.3072))
def test_hmm_viterbi_1():
    """
    Viterbi on the chain ``S -> M1 -> M2 -> E`` over the alphabet ``ACGU``:
    the best path for ``AC`` must have probability 0.3.
    """
    abc = Alphabet.create(b"ACGU", b"X")
    model = HMM.create(abc)

    start = MuteState.create(b"S", abc)
    model.add_state(start, log(1.0))

    end = MuteState.create(b"E", abc)
    model.add_state(end, lprob_zero())

    match1 = NormalState.create(
        b"M1",
        abc,
        [log(0.8), log(0.2), lprob_zero(), lprob_zero()],
    )
    model.add_state(match1, lprob_zero())

    match2 = NormalState.create(
        b"M2",
        abc,
        [log(0.4 / 1.6), log(0.6 / 1.6), lprob_zero(), log(0.6 / 1.6)],
    )
    model.add_state(match2, lprob_zero())

    for src, dst in ((start, match1), (match1, match2), (match2, end), (end, end)):
        model.set_transition(src, dst, log(1.0))
    model.normalize()
    # The E self-loop is only present while normalising.
    model.set_transition(end, end, lprob_zero())

    # None of these transitions exists at this point.
    for src, dst in ((end, end), (start, start), (start, end), (end, start)):
        assert_allclose(model.transition(src, dst), lprob_zero())

    dp = model.create_dp(end)
    task = DPTask.create(dp)
    seq = Sequence.create(b"AC", abc)
    task.setup(seq)
    found = dp.viterbi(task)

    assert_allclose(model.loglikelihood(seq, found.path), log(0.3))