Example #1
 def test_accumulative(self):
     hmm = HMM.from_file(self._model_filename)
     for i in xrange(self._num_hidden):
         # The last row of each cumulative (running-sum) matrix should be 1,
         # i.e. every column of the underlying probability matrix sums to 1.
         self.assertAlmostEqual(1.0,
                                hmm._accumulative_transition_matrix[-1, i],
                                delta=1e-6)
         self.assertAlmostEqual(1.0,
                                hmm._accumulative_observation_matrix[-1, i],
                                delta=1e-6)
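The test methods in these examples reference attributes such as self._model_filename, self._train_filename, self._num_hidden, and self._num_observ that are created outside the snippets. A minimal sketch of a unittest setUp that could supply them is shown below; the class name, file paths, and sizes are placeholders, not values from the original project.

import unittest

class HMMTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixtures; the real project sets these from its own files.
        self._model_filename = "model.hmm"    # placeholder model file
        self._train_filename = "train.data"   # placeholder training sequences
        self._num_hidden = 4                  # assumed number of hidden states
        self._num_observ = 8                  # assumed number of observation symbols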
Example #2
 def test_slfit(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     # Fit a spectral learner and compare its predictions against the
     # true model's probabilities for each training sequence.
     learner = SLHMM(self._num_hidden, self._num_observ)
     learner.fit(sequences, verbose=True)
     for sequence in sequences:
         pprint("True probability: %f" % hmm.predict(sequence))
         pprint("Inferred probability: %f" % learner.predict(sequence))
Example #3
 def test_predict(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     # These assertions compare two calls to predict on the same sequence,
     # so they only check that prediction runs and is deterministic.
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                          "HMM.prediction Error")
     sequences = [[0, 1], [1, 2, 3, 0], [0, 0, 0, 1]]
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                          "HMM.prediction Error")
Example #4
 def __init__(self, training_filename, test_filename, model_filename, num_hidden,
              num_observ, num_em_restarts=20):
     # Training data: a single comma-separated sequence of integer observations.
     self._training_data = [np.loadtxt(training_filename, dtype=int, delimiter=",")]
     # Test data: one comma-separated sequence per CSV row, read row by row
     # since the rows need not all have the same length.
     # self._test_data = np.loadtxt(test_filename, dtype=np.int, delimiter=",")
     self._test_data = []
     with open(test_filename, "rb") as fin:
         reader = csv.reader(fin)
         for line in reader:
             self._test_data.append(np.asarray([int(x) for x in line]))
     self._model = HMM.from_file(model_filename)
     self._num_hidden = num_hidden
     self._num_observ = num_observ
     self._num_em_restarts = num_em_restarts
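The constructor above is Python 2 code (binary-mode CSV reading, xrange elsewhere, the deprecated np.int alias in the commented line). A minimal sketch of an equivalent test-sequence loader for Python 3, assuming the same comma-separated layout, is shown below; load_test_sequences is a placeholder name, not part of the original project.

import csv

import numpy as np


def load_test_sequences(test_filename):
    # Each CSV row holds one observation sequence of integers.
    test_data = []
    with open(test_filename, "r", newline="") as fin:
        reader = csv.reader(fin)
        for line in reader:
            test_data.append(np.asarray([int(x) for x in line], dtype=int))
    return test_data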
Example #5
 def test_decode(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     # Note: the element-wise assertions below compare a value with itself,
     # so beyond the length check this only verifies that decode runs.
     for sequence in sequences:
         decoded_sequence = hmm.decode(sequence)
         self.assertEqual(len(sequence), len(decoded_sequence),
                          "HMM.decode Error")
         for i in xrange(len(sequence)):
             self.assertEqual(decoded_sequence[i], decoded_sequence[i],
                              "HMM.decode Error")
     sequences = [[0, 1], [1, 2], [0, 1, 2, 0]]
     for sequence in sequences:
         decoded_sequence = hmm.decode(sequence)
         self.assertEqual(len(sequence), len(decoded_sequence),
                          "HMM.decode Error")
         for i in xrange(len(sequence)):
             self.assertEqual(decoded_sequence[i], decoded_sequence[i],
                              "HMM.decode Error")
Example #6
 def test_emfit(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     # Fit an EM learner, then print its predictions and parameters next to
     # those of the true model for manual inspection.
     learner = EMHMM(self._num_hidden, self._num_observ)
     learner.fit(sequences, verbose=True, repeats=1)
     for sequence in sequences:
         pprint("True probability: %f" % hmm.predict(sequence))
         pprint("Inferred probability: %f" % learner.predict(sequence))
     pprint("Learned parameters using the EM algorithm:")
     pprint("Transition matrix: ")
     pprint(learner.transition_matrix)
     pprint("Observation matrix: ")
     pprint(learner.observation_matrix)
     pprint("Initial distribution: ")
     pprint(learner.initial_dist)
     pprint("*" * 50)
     pprint("True transition matrix: ")
     pprint(hmm.transition_matrix)
     pprint("True observation matrix: ")
     pprint(hmm.observation_matrix)
     pprint("True initial distribution: ")
     pprint(hmm.initial_dist)
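For a rough numeric comparison instead of eyeballing the printed matrices, the test body could end with something like the sketch below. It assumes numpy is imported as np in the test module and that the transition and observation matrices are NumPy arrays of matching shape; note that EM recovers hidden states only up to a relabeling, so an element-wise gap is a diagnostic, not a pass/fail criterion.

     # Sketch: largest element-wise gap between learned and true parameters.
     # EM may permute the hidden states, so treat this as a rough diagnostic.
     pprint("Max transition gap: %f"
            % np.max(np.abs(learner.transition_matrix - hmm.transition_matrix)))
     pprint("Max observation gap: %f"
            % np.max(np.abs(learner.observation_matrix - hmm.observation_matrix)))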
Example #7
def model_selection(trainfile, testfile, modelpath, log_filename):
    # Load the training sequence and the variable-length test sequences.
    training_data = np.loadtxt(trainfile, dtype=int, delimiter=",")
    test_data = []
    with open(testfile, "rb") as fin:
        reader = csv.reader(fin)
        for line in reader:
            test_data.append(np.asarray([int(x) for x in line]))
    model = HMM.from_file(modelpath)
    num_hidden = model.m
    num_observ = model.n
    # For each candidate rank m, record the total-variation-style error and
    # how many estimated probabilities come out negative.
    variation_measure = np.zeros(num_observ, dtype=float)
    neg_num_measure = np.zeros(num_observ, dtype=int)
    neg_proportion_measure = np.zeros(num_observ, dtype=float)
    for m in xrange(1, num_observ + 1):
        slearner = SpectralLearner()
        slearner.train(training_data, m, num_observ)
        true_probs = np.zeros(len(test_data))
        sl_probs = np.zeros(len(test_data))
        for i, seq in enumerate(test_data):
            true_probs[i] = model.probability(seq)
            sl_probs[i] = slearner.predict(seq)
            # pprint("%e %e" % (true_probs[i], sl_probs[i]))
        neg_num_measure[m - 1] = np.sum(sl_probs < 0)
        neg_proportion_measure[m - 1] = neg_num_measure[m - 1] / float(len(test_data))
        # Normalize the joint probabilities to get a conditional distribution
        # over the test set.
        partition_function = np.sum(true_probs)
        true_probs /= partition_function
        sl_probs /= partition_function
        variation_measure[m - 1] = np.sum(np.abs(sl_probs - true_probs))
        pprint("Model rank hyperparameter: %d" % m)
        pprint("Sum of all true probabilities: %f" % np.sum(true_probs))
        pprint("Sum of all estimated probabilities: %f" % np.sum(sl_probs))
        pprint("*" * 50)
    # One row per candidate rank: variation measure, negative count, proportion.
    statistics = np.array([variation_measure, neg_num_measure,
                           neg_proportion_measure]).T
    np.savetxt(log_filename, statistics, delimiter=",", fmt="%e")
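The log file written above holds one row per candidate rank, with the variation measure in the first column. A minimal sketch for reading it back and picking the rank with the smallest variation measure might look like this; best_rank is a hypothetical helper, not part of the original code:

def best_rank(log_filename):
    # Column 0 holds the variation measure written by model_selection above;
    # ranks were swept starting from 1, so shift the argmin index by one.
    stats = np.loadtxt(log_filename, delimiter=",")
    return int(np.argmin(stats[:, 0])) + 1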
Example #8
 def test_loading(self):
     sequences = io.load_sequences(self._train_filename)
     hmm = HMM.from_file(self._model_filename)
     # As in test_predict, this compares two calls on the same sequence, so
     # it only checks that a loaded model predicts deterministically.
     for sequence in sequences:
         self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                          "Inferred probability is wrong")
Example #9
def regenerate_training(hmm, training_filename, dsize):
    # Write a single training sequence of dsize observations as one CSV row.
    training_seq = hmm.generate_train_data(dsize)
    np.savetxt(training_filename, [training_seq], delimiter=",", fmt="%d")


def regenerate_test(hmm, test_filename, dsize, max_length=50):
    # Generate dsize test sequences, one CSV row each, with lengths between
    # max_length and max_length + 1.
    test_seqs = hmm.generate_test_data(dsize, min_seq_len=max_length,
                                       max_seq_len=max_length + 1)
    with open(test_filename, "wb") as fout:
        writer = csv.writer(fout)
        for seq in test_seqs:
            writer.writerow(seq)


if __name__ == '__main__':
    usage = '''./generator.py file m n test_set_size
            file is the filepath to store the data
            m is the number of states in the HMM
            n is the number of observations in the HMM
            test_set_size is the size of the test set
            '''
    if len(sys.argv) < 5:
        sys.stderr.write(usage)
        exit()
    m = int(sys.argv[2])
    n = int(sys.argv[3])
    dsizes = [10000, 50000, 100000, 500000, 1000000, 5000000]
    tsize = int(sys.argv[4])
    file_tager = sys.argv[1]
    # generate(m, n, dsizes, tsize, file_tager)
    hmm = HMM.from_file(sys.argv[1])
    # regenerate_training(hmm, sys.argv[2], int(sys.argv[3]))
    regenerate_test(hmm, "m4n8_50_len_test.data", tsize)
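To sanity-check the generated file, one could read it back with the same csv module and report how many sequences were written and their length range. This is a minimal sketch; check_generated_test is a hypothetical helper, not part of the original script:

def check_generated_test(test_filename):
    # Read back the CSV written by regenerate_test and report the number of
    # sequences and the observed length range.
    lengths = []
    with open(test_filename, "rb") as fin:
        reader = csv.reader(fin)
        for line in reader:
            lengths.append(len(line))
    print("%d sequences, lengths %d..%d"
          % (len(lengths), min(lengths), max(lengths)))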