def test_random_mat():
    """Check the matrices produced by ``hmm.randomize`` over many draws.

    The model is re-randomized ``iters`` times with the symbols ``'b'`` and
    ``'a'`` and ``num_states`` states.  The transition, emission and initial
    matrices are accumulated, and the test then asserts that:

    * the per-row sums of the transition matrix average to ~1,
    * the per-state sums of the 'a' and 'b' emission rows average to ~1,
    * the sum of the initial matrix averages to ~1,
    * the matrices of the last draw have the expected shapes.
    """
    A = nlp.hmm()
    iters = 10000
    num_states = 5
    alphabet = ('b', 'a')

    sum_mat = np.matrix(np.zeros((num_states, num_states)))
    sum_mat2 = np.matrix(np.zeros((len(alphabet), num_states)))
    sum_mat3 = np.matrix(np.zeros(num_states))

    for _ in range(iters):
        # A fresh list is passed on every call, as in the original test.
        A.randomize(list(alphabet), num_states)
        sum_mat += A.transition_matrix
        sum_mat2[0] += A.emission_matrix['a']
        sum_mat2[1] += A.emission_matrix['b']
        sum_mat3 += A.initial_matrix

    # Dividing the accumulated sums by ``iters`` yields the mean per draw.
    assert np.isclose(np.average(np.sum(sum_mat, 1) / iters), 1.0, atol=0.01)
    assert np.isclose(np.average(np.sum(sum_mat2, 0) / iters), 1.0, atol=0.01)
    assert np.isclose(np.average(np.sum(sum_mat3, 1) / iters), 1.0, atol=0.01)

    assert A.transition_matrix.shape == (num_states, num_states)
    assert A.emission_matrix['a'].shape == (1, num_states)
    assert A.emission_matrix['b'].shape == (1, num_states)
    assert A.initial_matrix.shape == (1, num_states)
def get_test_model0():
    """Return an ``nlp.hmm`` populated with fixed, hand-written parameters.

    The model gets a two-entry initial matrix, a 2x2 transition matrix and
    one two-entry emission matrix for each of the symbols ``'a'`` and ``'b'``.
    """
    initial = np.matrix([0.4, 0.6])
    transition = np.matrix([[0.1, 0.9],
                            [0.5, 0.5]])
    emission = {
        'a': np.matrix([0.7, 0.4]),
        'b': np.matrix([0.3, 0.6]),
    }

    model = nlp.hmm()
    model.initial_matrix = initial
    model.transition_matrix = transition
    model.emission_matrix = emission
    return model
def test_forward_backward_alg():
    """Cross-check the forward and backward passes of a random 2-state model.

    For each of the four length-2 strings over ``{'a', 'b'}`` the backward and
    forward arrays are multiplied element-wise and summed along axis 1.  The
    four results added together must be close to 1 in every entry.
    (Presumably this is the total probability over all length-2 observation
    sequences -- confirm against the ``nlp.hmm`` implementation.)
    """
    model = nlp.hmm()
    model.randomize(['a', 'b'], 2)

    per_sequence = []
    for observed in ('ab', 'aa', 'bb', 'ba'):
        # Backward is evaluated before forward, each on its own fresh list.
        backward = model._backward_algorithm(list(observed))
        forward = model._forward_algorithm(list(observed))
        per_sequence.append(np.multiply(backward, forward).sum(1))

    r = per_sequence[0] + per_sequence[1] + per_sequence[2] + per_sequence[3]
    assert np.all(np.isclose(r, np.ones(r.shape), atol=0.001))
import pickle
from functools import reduce

import matplotlib.pylab as plb
import numpy as np

import nlp

# Per-tag accuracy of the Viterbi tagger stored in 'hmm2.dat', measured on the
# tagged sentences stored in 'brown_tags.dat', and drawn as a bar chart.

# NOTE(review): pickle can execute arbitrary code while loading; only run this
# against a trusted 'brown_tags.dat'.
with open('brown_tags.dat', 'rb') as tagged_file:
    tagged_sents = pickle.load(tagged_file)

h = nlp.hmm()
h.load_from_file('hmm2.dat')

# Flatten the sentences into one list of (word, tag) pairs in a single pass.
tagged_words = [pair for sent in tagged_sents for pair in sent]
tags = list({t for _, t in tagged_words})
# Constant-time tag -> column lookup instead of list.index() per word.
tag_position = {tag: pos for pos, tag in enumerate(tags)}

tag_hit_freqs = np.zeros(len(tags), dtype=np.float32)
tag_total_freqs = np.zeros(len(tags), dtype=np.float32)

for sent in tagged_sents:
    our_tags = h.viterbi([w for w, _ in sent])
    for i, (_, gold_tag) in enumerate(sent):
        column = tag_position[gold_tag]
        if gold_tag == our_tags[i]:
            tag_hit_freqs[column] += 1
        tag_total_freqs[column] += 1

# One bar per tag, followed by a final bar holding the mean of those ratios.
hit_ratio = tag_hit_freqs / tag_total_freqs
output_array = np.zeros(len(tag_total_freqs) + 1)
output_array[:-1] = hit_ratio
output_array[-1] = hit_ratio.mean()

bar_positions = np.arange(len(tags) + 1)
plb.bar(bar_positions, output_array)
plb.xticks(bar_positions, tags + ['average'], rotation='vertical')