def test_accumulative(self):
    """Check that the model's cumulative matrices end at 1 in every column.

    Loads the model from ``self._model_filename`` and, for each of the
    ``self._num_hidden`` column indices, asserts that the last row of both
    ``_accumulative_transition_matrix`` and
    ``_accumulative_observation_matrix`` is within 1e-6 of 1.0.
    """
    hmm = HMM.from_file(self._model_filename)
    # ``range`` rather than ``xrange`` so the test also runs on Python 3.
    for state in range(self._num_hidden):
        self.assertAlmostEqual(
            1.0, hmm._accumulative_transition_matrix[-1, state], delta=1e-6)
        self.assertAlmostEqual(
            1.0, hmm._accumulative_observation_matrix[-1, state], delta=1e-6)
def test_slfit(self):
    """Fit an ``SLHMM`` on the training sequences and print a comparison.

    For every training sequence, prints the value returned by the reference
    model's ``predict`` followed by the value returned by the fitted
    learner's ``predict``. Nothing is asserted; the output is for manual
    inspection.
    """
    training_sequences = io.load_sequences(self._train_filename)
    reference = HMM.from_file(self._model_filename)
    spectral = SLHMM(self._num_hidden, self._num_observ)
    spectral.fit(training_sequences, verbose=True)
    for seq in training_sequences:
        true_prob = reference.predict(seq)
        pprint("True probability: %f" % true_prob)
        inferred_prob = spectral.predict(seq)
        pprint("Infered probability: %f" % inferred_prob)
def test_predict(self):
    """Check that ``HMM.predict`` gives equal results on repeated calls.

    Each sequence is passed to ``predict`` twice and the two results are
    compared, so this verifies repeatability (and that ``predict`` does not
    raise), not agreement with an independently known value.
    """
    loaded = io.load_sequences(self._train_filename)
    model = HMM.from_file(self._model_filename)

    def assert_repeatable(batch):
        # Two calls on the same input must compare equal.
        for seq in batch:
            first = model.predict(seq)
            second = model.predict(seq)
            self.assertEqual(first, second, "HMM.prediction Error")

    assert_repeatable(loaded)
    assert_repeatable([[0, 1], [1, 2, 3, 0], [0, 0, 0, 1]])
def test_predict(self):
    """Run ``HMM.predict`` twice per sequence and require equal results.

    Covers the sequences loaded from ``self._train_filename`` and then three
    short hand-written sequences. Both sides of the assertion are calls on
    the same input, so only repeatability is checked.
    """
    file_sequences = io.load_sequences(self._train_filename)
    hmm = HMM.from_file(self._model_filename)
    fixed_sequences = [[0, 1], [1, 2, 3, 0], [0, 0, 0, 1]]
    for group in (file_sequences, fixed_sequences):
        for observed in group:
            self.assertEqual(
                hmm.predict(observed),
                hmm.predict(observed),
                "HMM.prediction Error",
            )
def __init__(self, training_filename, test_filename, model_filename,
             num_hidden, num_observ, num_em_restarts=20):
    """Load the training data, test data and reference model.

    Args:
        training_filename: Comma-delimited integer file read with
            ``np.loadtxt``; the resulting array is stored as the single
            element of ``self._training_data``.
        test_filename: CSV file read row by row; each row becomes one
            integer NumPy array in ``self._test_data``.
        model_filename: Path handed to ``HMM.from_file``.
        num_hidden: Stored as ``self._num_hidden``.
        num_observ: Stored as ``self._num_observ``.
        num_em_restarts: Stored as ``self._num_em_restarts``.
    """
    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide; the dtype is the same.
    self._training_data = [
        np.loadtxt(training_filename, dtype=int, delimiter=",")]
    # The test file is parsed row by row instead of with np.loadtxt
    # (presumably because rows may differ in length -- TODO confirm).
    # ``open`` replaces the Python-2-only ``file`` builtin, and the explicit
    # list replaces ``map``, which is lazy on Python 3 and would make
    # ``np.asarray`` return a 0-d object array.
    with open(test_filename, "r") as fin:
        self._test_data = [
            np.asarray([int(token) for token in row])
            for row in csv.reader(fin)]
    self._model = HMM.from_file(model_filename)
    self._num_hidden = num_hidden
    self._num_observ = num_observ
    self._num_em_restarts = num_em_restarts
def test_decode(self):
    """Check that ``HMM.decode`` returns one entry per observation.

    Runs ``decode`` on the sequences loaded from ``self._train_filename`` and
    on three short hand-written sequences, asserting that the decoded
    sequence has the same length as its input.
    """
    sequences = io.load_sequences(self._train_filename)
    hmm = HMM.from_file(self._model_filename)
    for sequence in sequences:
        decoded_sequence = hmm.decode(sequence)
        self.assertEqual(len(sequence), len(decoded_sequence),
                         "HMM.decode Error")
        # NOTE(review): this compares each input element with itself, so it
        # can only fail if indexing raises. The intended expected value is
        # not visible here -- confirm and replace with a real check.
        # ``range`` rather than ``xrange`` so the test runs on Python 3.
        for i in range(len(sequence)):
            self.assertEqual(sequence[i], sequence[i], "HMM.decode Error")
    sequences = [[0, 1], [1, 2], [0, 1, 2, 0]]
    for sequence in sequences:
        decoded_sequence = hmm.decode(sequence)
        self.assertEqual(len(sequence), len(decoded_sequence),
                         "HMM.decode Error")
        # NOTE(review): same self-comparison as above, here on the decoded
        # elements; it only verifies that they can be indexed.
        for i in range(len(sequence)):
            self.assertEqual(decoded_sequence[i], decoded_sequence[i],
                             "HMM.decode Error")
def __init__(self, training_filename, test_filename, model_filename,
             num_hidden, num_observ, num_em_restarts=20):
    """Read the data files and the reference model, and store the settings.

    Args:
        training_filename: Comma-delimited integer file loaded with
            ``np.loadtxt`` and stored, wrapped in a one-element list, as
            ``self._training_data``.
        test_filename: CSV file whose rows are each converted to an integer
            NumPy array and collected in ``self._test_data``.
        model_filename: Path passed to ``HMM.from_file``.
        num_hidden: Kept as ``self._num_hidden``.
        num_observ: Kept as ``self._num_observ``.
        num_em_restarts: Kept as ``self._num_em_restarts``.
    """
    # ``dtype=int`` is what the removed ``np.int`` alias stood for.
    self._training_data = [
        np.loadtxt(training_filename, dtype=int, delimiter=",")
    ]
    self._test_data = []
    # ``open`` works on Python 2 and 3; the ``file`` builtin is Python 2 only.
    with open(test_filename, "r") as fin:
        for row in csv.reader(fin):
            # Build a real list first: on Python 3 ``map`` is lazy and
            # ``np.asarray`` would wrap the iterator itself.
            values = [int(field) for field in row]
            self._test_data.append(np.asarray(values))
    self._model = HMM.from_file(model_filename)
    self._num_hidden = num_hidden
    self._num_observ = num_observ
    self._num_em_restarts = num_em_restarts
def test_emfit(self):
    """Fit an ``EMHMM`` once and print it next to the reference model.

    Prints, for every training sequence, the reference model's ``predict``
    value and the fitted learner's ``predict`` value, then the learner's
    transition matrix, observation matrix and initial distribution, a
    separator line, and the same three parameters of the reference model.
    Nothing is asserted; the output is for manual inspection.
    """
    training_sequences = io.load_sequences(self._train_filename)
    reference = HMM.from_file(self._model_filename)
    em_learner = EMHMM(self._num_hidden, self._num_observ)
    em_learner.fit(training_sequences, verbose=True, repeats=1)

    for seq in training_sequences:
        true_prob = reference.predict(seq)
        pprint("True probability: %f" % true_prob)
        inferred_prob = em_learner.predict(seq)
        pprint("Infered probability: %f" % inferred_prob)

    # Parameters estimated by EM.
    pprint("Learned parameter using EM algorithm:")
    pprint("Transition matrix: ")
    pprint(em_learner.transition_matrix)
    pprint("Observation matrix: ")
    pprint(em_learner.observation_matrix)
    pprint("Initial distribution: ")
    pprint(em_learner.initial_dist)

    separator = "*" * 50
    pprint(separator)

    # Parameters of the model loaded from file.
    pprint("True Transition matrix: ")
    pprint(reference.transition_matrix)
    pprint("True Observation matrix: ")
    pprint(reference.observation_matrix)
    pprint("True initial distribution: ")
    pprint(reference.initial_dist)
def _load_int_rows(path):
    """Read a CSV file and return one NumPy array of ints per row."""
    # ``open`` replaces the Python-2-only ``file`` builtin; the inner list
    # replaces ``map``, which is lazy on Python 3.
    with open(path, "r") as fin:
        return [np.asarray([int(token) for token in row])
                for row in csv.reader(fin)]


def model_selection(trainfile, testfile, modelpath, log_filename):
    """Evaluate a SpectralLearner for every rank from 1 to ``model.n``.

    For each rank ``m`` a fresh ``SpectralLearner`` is trained on the
    training data and evaluated on every test sequence against the reference
    model loaded from ``modelpath``. Three statistics are recorded per rank:

    * the sum of absolute differences between estimated and true
      probabilities, after both are divided by the sum of the true
      probabilities;
    * the number of negative estimated probabilities;
    * that number divided by the number of test sequences.

    Args:
        trainfile: Comma-delimited integer file read with ``np.loadtxt``.
        testfile: CSV file with one integer test sequence per row.
        modelpath: Path passed to ``HMM.from_file``.
        log_filename: Output path; one CSV row per rank, columns in the
            order listed above, formatted with ``%e``.
    """
    # Builtin ``int``/``float`` replace the removed ``np.int``/``np.float``
    # aliases; the resulting dtypes are the same.
    training_data = np.loadtxt(trainfile, dtype=int, delimiter=",")
    test_data = _load_int_rows(testfile)
    num_test = len(test_data)
    model = HMM.from_file(modelpath)
    num_observ = model.n
    variation_measure = np.zeros(num_observ, dtype=float)
    neg_num_measure = np.zeros(num_observ, dtype=int)
    neg_proportion_measure = np.zeros(num_observ, dtype=float)
    for m in range(1, num_observ + 1):
        slearner = SpectralLearner()
        slearner.train(training_data, m, num_observ)
        true_probs = np.zeros(num_test)
        sl_probs = np.zeros(num_test)
        for i, seq in enumerate(test_data):
            true_probs[i] = model.probability(seq)
            sl_probs[i] = slearner.predict(seq)
        neg_num_measure[m - 1] = np.count_nonzero(sl_probs < 0)
        neg_proportion_measure[m - 1] = (
            neg_num_measure[m - 1] / float(num_test))
        partition_function = np.sum(true_probs)
        # Normalizing joint probability distribution to get conditional
        # distribution. Both vectors are divided by the sum of the *true*
        # probabilities.
        true_probs /= partition_function
        sl_probs /= partition_function
        variation_measure[m - 1] = np.sum(np.abs(sl_probs - true_probs))
        pprint("Model Rank Hyperparameter: %d" % m)
        pprint("Sum of all true probabilities: %f" % np.sum(true_probs))
        pprint("Sum of all estimated probabilities: %f" % np.sum(sl_probs))
        pprint("*" * 50)
    # One row per rank, one column per statistic.
    statistics = np.array(
        [variation_measure, neg_num_measure, neg_proportion_measure]).T
    np.savetxt(log_filename, statistics, delimiter=",", fmt="%e")
def _load_int_rows(path):
    """Read a CSV file and return one NumPy array of ints per row."""
    # ``open`` replaces the Python-2-only ``file`` builtin; the inner list
    # replaces ``map``, which is lazy on Python 3.
    with open(path, "r") as fin:
        return [np.asarray([int(token) for token in row])
                for row in csv.reader(fin)]


def model_selection(trainfile, testfile, modelpath, log_filename):
    """Evaluate a SpectralLearner for every rank from 1 to ``model.n``.

    For each rank ``m`` a fresh ``SpectralLearner`` is trained on the
    training data and evaluated on every test sequence against the reference
    model loaded from ``modelpath``. Three statistics are recorded per rank:

    * the sum of absolute differences between estimated and true
      probabilities, after both are divided by the sum of the true
      probabilities;
    * the number of negative estimated probabilities;
    * that number divided by the number of test sequences.

    Args:
        trainfile: Comma-delimited integer file read with ``np.loadtxt``.
        testfile: CSV file with one integer test sequence per row.
        modelpath: Path passed to ``HMM.from_file``.
        log_filename: Output path; one CSV row per rank, columns in the
            order listed above, formatted with ``%e``.
    """
    # Builtin ``int``/``float`` replace the removed ``np.int``/``np.float``
    # aliases; the resulting dtypes are the same.
    training_data = np.loadtxt(trainfile, dtype=int, delimiter=",")
    test_data = _load_int_rows(testfile)
    num_test = len(test_data)
    model = HMM.from_file(modelpath)
    num_observ = model.n
    variation_measure = np.zeros(num_observ, dtype=float)
    neg_num_measure = np.zeros(num_observ, dtype=int)
    neg_proportion_measure = np.zeros(num_observ, dtype=float)
    for m in range(1, num_observ + 1):
        slearner = SpectralLearner()
        slearner.train(training_data, m, num_observ)
        true_probs = np.zeros(num_test)
        sl_probs = np.zeros(num_test)
        for i, seq in enumerate(test_data):
            true_probs[i] = model.probability(seq)
            sl_probs[i] = slearner.predict(seq)
        neg_num_measure[m - 1] = np.count_nonzero(sl_probs < 0)
        neg_proportion_measure[m - 1] = (
            neg_num_measure[m - 1] / float(num_test))
        partition_function = np.sum(true_probs)
        # Normalizing joint probability distribution to get conditional
        # distribution. Both vectors are divided by the sum of the *true*
        # probabilities.
        true_probs /= partition_function
        sl_probs /= partition_function
        variation_measure[m - 1] = np.sum(np.abs(sl_probs - true_probs))
        pprint("Model Rank Hyperparameter: %d" % m)
        pprint("Sum of all true probabilities: %f" % np.sum(true_probs))
        pprint("Sum of all estimated probabilities: %f" % np.sum(sl_probs))
        pprint("*" * 50)
    # One row per rank, one column per statistic.
    statistics = np.array(
        [variation_measure, neg_num_measure, neg_proportion_measure]).T
    np.savetxt(log_filename, statistics, delimiter=",", fmt="%e")
def test_loading(self):
    """Load the sequences and the model, then call ``predict`` on each.

    Every sequence is passed to ``predict`` twice and the two results are
    compared, so the test verifies that loading succeeds and that
    ``predict`` is repeatable, not that the value matches a known answer.
    """
    loaded_sequences = io.load_sequences(self._train_filename)
    model = HMM.from_file(self._model_filename)
    for seq in loaded_sequences:
        first = model.predict(seq)
        second = model.predict(seq)
        self.assertEqual(first, second, "Inferred probability is wrong")
def regenerate_training(hmm, training_filename, dsize):
    """Generate training data from ``hmm`` and save it to a file.

    Args:
        hmm: Object providing ``generate_train_data(dsize)``.
        training_filename: Destination path handed to ``np.savetxt``.
        dsize: Size argument forwarded to ``hmm.generate_train_data``.
    """
    training_seq = hmm.generate_train_data(dsize)
    # The sequence is wrapped in a list so that a flat sequence is written
    # as one comma-separated row rather than one value per line.
    np.savetxt(training_filename, [training_seq], delimiter=",", fmt="%d")


def _open_csv_for_writing(path):
    """Open ``path`` in the mode the ``csv`` module expects for writing."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline translation disabled.
        return open(path, "w", newline="")
    # Python 2: binary mode (the original ``file(path, "wb")``).
    return open(path, "wb")


def regenerate_test(hmm, test_filename, dsize, max_length=50):
    """Generate test sequences from ``hmm`` and write them as CSV rows.

    Args:
        hmm: Object providing
            ``generate_test_data(size, min_seq_len=..., max_seq_len=...)``.
        test_filename: Destination path; one sequence is written per row.
        dsize: Number of test sequences requested from ``hmm``. The previous
            code ignored this parameter and read the module-level ``tsize``,
            which only exists when the file is run as a script.
        max_length: Passed as ``min_seq_len``, with ``max_length + 1`` as
            ``max_seq_len``. The default of 50 reproduces the previously
            hard-coded 50/51 pair.
    """
    test_seqs = hmm.generate_test_data(
        dsize, min_seq_len=max_length, max_seq_len=max_length + 1)
    with _open_csv_for_writing(test_filename) as fout:
        csv.writer(fout).writerows(test_seqs)


if __name__ == '__main__':
    usage = '''./generator.py file m n test_set_size file is the filepath to store the data m is the size of states in HMM n is the size of observations in HMM test_set_size is the size of test_set '''
    if len(sys.argv) < 5:
        sys.stderr.write(usage)
        # A usage error should be reported with a non-zero exit status.
        sys.exit(1)
    m = int(sys.argv[2])
    n = int(sys.argv[3])
    dsizes = [10000, 50000, 100000, 500000, 1000000, 5000000]
    tsize = int(sys.argv[4])
    file_tager = sys.argv[1]
    #generate(m, n, dsizes, tsize, file_tager)
    hmm = HMM.from_file(sys.argv[1])
    #regenerate_training(hmm, sys.argv[2], int(sys.argv[3]))
    regenerate_test(hmm, "m4n8_50_len_test.data", tsize)
# NOTE(review): the statement below appears to be the tail of a definition
# whose header lies outside this view (presumably where ``training_filename``
# and ``training_seq`` are bound); it is kept verbatim.
np.savetxt(training_filename, [training_seq], delimiter=",", fmt="%d")


def _open_csv_for_writing(path):
    """Open ``path`` in the mode the ``csv`` module expects for writing."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline translation disabled.
        return open(path, "w", newline="")
    # Python 2: binary mode (the original ``file(path, "wb")``).
    return open(path, "wb")


def regenerate_test(hmm, test_filename, dsize, max_length=50):
    """Generate test sequences from ``hmm`` and write them as CSV rows.

    Args:
        hmm: Object providing
            ``generate_test_data(size, min_seq_len=..., max_seq_len=...)``.
        test_filename: Destination path; one sequence is written per row.
        dsize: Number of test sequences requested from ``hmm``. The previous
            code ignored this parameter and read the module-level ``tsize``,
            which only exists when the file is run as a script.
        max_length: Passed as ``min_seq_len``, with ``max_length + 1`` as
            ``max_seq_len``. The default of 50 reproduces the previously
            hard-coded 50/51 pair.
    """
    test_seqs = hmm.generate_test_data(
        dsize, min_seq_len=max_length, max_seq_len=max_length + 1)
    with _open_csv_for_writing(test_filename) as fout:
        csv.writer(fout).writerows(test_seqs)


if __name__ == '__main__':
    usage = '''./generator.py file m n test_set_size file is the filepath to store the data m is the size of states in HMM n is the size of observations in HMM test_set_size is the size of test_set '''
    if len(sys.argv) < 5:
        sys.stderr.write(usage)
        # A usage error should be reported with a non-zero exit status.
        sys.exit(1)
    m = int(sys.argv[2])
    n = int(sys.argv[3])
    dsizes = [10000, 50000, 100000, 500000, 1000000, 5000000]
    tsize = int(sys.argv[4])
    file_tager = sys.argv[1]
    #generate(m, n, dsizes, tsize, file_tager)
    hmm = HMM.from_file(sys.argv[1])
    #regenerate_training(hmm, sys.argv[2], int(sys.argv[3]))
    regenerate_test(hmm, "m4n8_50_len_test.data", tsize)