def setUp(self): self.states = ["Coin 1", "Coin 2", "Coin 3"] self.vocabulary = ["Heads", "Tails"] self.initial_probabilities = { "Coin 1": 0.4, "Coin 2": 0.3, "Coin 3": 0.3 } # The probability of moving from state a to state b # such that the sum of the all the probabilities = 1 self.transition_probabilities = { "Coin 1": {"Coin 1": 0.6, "Coin 2": 0.3, "Coin 3": 0.1}, "Coin 2": {"Coin 1": 0.2, "Coin 2": 0.5, "Coin 3": 0.3}, "Coin 3": {"Coin 1": 0.3, "Coin 2": 0.2, "Coin 3": 0.5} } # the probability of the observation O being generated from the state q self.emission_probabilities = { "Coin 1": {"Heads": 0.7, "Tails": 0.3}, "Coin 2": {"Heads": 0.3, "Tails": 0.7}, "Coin 3": {"Heads": 0.5, "Tails": 0.5} } self.hmm = HiddenMarkovModel( self.states, # all the possible hidden states self.vocabulary, # all possible observation types self.transition_probabilities, self.emission_probabilities, self.initial_probabilities )
def main(): log("Starting named entity recognition task") log("Splitting training set into training and validation sets") training, validation = Utils.create_training_validation_split(TRAINING_FILE_PATH) log("Writing training and validation sets to file") Utils.write_streams_to_file(training, "./input/training.txt") Utils.write_streams_to_file(validation, "./input/validation.txt") log("Reading training file") token_stream, pos_stream, tag_stream = Utils.read_training_file(TRAINING_FILE_PATH) log("Replacing low frequency tokens from training set") token_stream, closed_vocabulary = Unknown.replace_low_frequency_tokens(token_stream) log("Reading test file") test_stream = Utils.read_test_file(TEST_FILE_PATH) log("Replacing unknown tokens from test set") test_stream = Unknown.replace_unknown_tokens(test_stream, closed_vocabulary) log("Training most frequent class baseline") baseline = MostFrequentClassBaseline(token_stream, tag_stream) log("Predicting tags using baseline") baseline_predictions = baseline.classify_test_stream(test_stream) log("Writing predictions with baseline to file") Utils.write_results_to_file(baseline_predictions, "../output/baseline_output.txt") log("Training Hidden Markov Model") hmm = HiddenMarkovModel(token_stream, tag_stream) log("Predicting tags using HMM") hmm_predictions = hmm.classify_test_stream(test_stream) log("Writing predictions with HMM to file") Utils.write_results_to_file(hmm_predictions, "../output/hmm_output.txt")
def test_hmm(self):
    '''A = np.matrix([[0, 1, 0, 0],
                      [0.4, 0, 0.6, 0],
                      [0, 0.4, 0, 0.6],
                      [0, 0, 0.5, 0.5]])
    B = np.matrix([[0.5, 0.5],
                   [0.3, 0.7],
                   [0.6, 0.4],
                   [0.8, 0.2]])
    PI = np.matrix([0.25, 0.25, 0.25, 0.25]).transpose()
    # observation result
    O = [0, 0, 1, 1, 0]'''
    A = np.matrix([[0.5, 0.2, 0.3],
                   [0.3, 0.5, 0.2],
                   [0.2, 0.3, 0.5]])
    B = np.matrix([[0.5, 0.5],
                   [0.4, 0.6],
                   [0.7, 0.3]])
    PI = np.matrix([0.2, 0.4, 0.4]).transpose()
    O = [0, 1, 0]
    hmm = HiddenMarkovModel(A, B, PI, O)
    prob, path = hmm.viterbi()
    print("The probability for Observation States", str(O), " is ", hmm.forward_backword())
    assert hmm.forward_backword() == 0.130218
    print("The max probs for Observation States", str(O), " is ", prob,
          " and the hidden state path is ", '-'.join(['%s' % id for id in path]))
    assert prob == 0.014699999999999998
    assert path == [3, 3, 3]
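# The expected values asserted above can be reproduced by hand. Below is a minimal
# sketch, independent of the HiddenMarkovModel class under test and assuming only
# NumPy: it recomputes the forward likelihood (0.130218) and the Viterbi maximum
# (0.0147) for the same A, B, PI and O. States are reported 1-based in the test,
# hence the path [3, 3, 3].
import numpy as np

A = np.array([[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]])
B = np.array([[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]])
pi = np.array([0.2, 0.4, 0.4])
O = [0, 1, 0]

# Forward recursion: alpha[j] = P(o_1..o_t, q_t = j)
alpha = pi * B[:, O[0]]
for o in O[1:]:
    alpha = (alpha @ A) * B[:, o]
print(alpha.sum())   # 0.130218, the value asserted for forward_backword()

# Viterbi recursion: delta[j] = best path probability ending in state j
delta = pi * B[:, O[0]]
for o in O[1:]:
    delta = (delta[:, None] * A).max(axis=0) * B[:, o]
print(delta.max())   # ~0.0147, the asserted prob; argmax is state 3 at every step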
def test_larger_history(self):
    sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'))
    model = HiddenMarkovModel(label_history_size=3)
    model.train(sequence, fallback_model=None, use_linear_smoothing=False)

    self.assertEqual(len(model.transition), 7)
    self.assertEqual(len(model.transition['B::A::B']), 2)
    self.assertEqual(model.transition['<START>::<START>::<START>']['<START>::<START>::A'], log(0.5))
    self.assertEqual(model.transition['<START>::<START>::A']['<START>::A::B'], 0.0)
    self.assertEqual(model.transition['<START>::A::B']['A::B::A'], 0.0)
    self.assertEqual(model.transition['B::A::B']['A::B::<STOP>'], log(1.0 / 2.0))
    self.assertEqual(model.transition['A::B::A']['B::A::B'], 0.0)

    self.assertEqual(len(model.reverse_transition), 8)
    self.assertEqual(len(model.reverse_transition['A::B::A']), 2)
    self.assertEqual(model.reverse_transition['<START>::A::B']['<START>::<START>::A'], 0.0)
    self.assertEqual(model.reverse_transition['B::A::B']['A::B::A'], 0.0)
    self.assertEqual(model.reverse_transition['A::B::A']['B::A::B'], log(1.0 / 2.0))

    self.assertEqual(len(model.label_emissions), 4)
    self.assertEqual(len(model.label_emissions['A']), 2)
    self.assertEqual(model.label_emissions['A']['A::B::A'], log(2.0 / 3.0))

    self.assertEqual(len(model.emission), 8)
    assert all(len(values) == 1 for values in model.emission.itervalues())
    assert all(all(val == 0.0 for val in values.itervalues())
               for values in model.emission.itervalues())
def test_unk_emission(self):
    # print "Testing UNK emission with emission == label and self-biased transitions: ",
    model = HiddenMarkovModel(label_history_size=1)
    model.labels = ('A', 'B', START_LABEL, STOP_LABEL)
    self.set_defaults(model)
    self.identity_emissions(model)
    self.biased_transitions(model)

    emissions = ['A', 'C', 'A', 'B', 'B']
    labels = ['A', 'A', 'A', 'B', 'B']
    score = log(0.5) * 2 + log(0.25) + log(0.5) + log(0.25) + log(0.5) + log(0.25)
    self._test_label(model, emissions, score, labels=labels)

    emissions = ['A', 'C', 'C', 'B', 'B']
    labels = [['A', 'A', 'A', 'B', 'B'],
              ['A', 'A', 'B', 'B', 'B'],
              ['A', 'B', 'B', 'B', 'B']]
    score = None
    for label in labels:
        new_score = model.score(zip(label, emissions))
        if score:
            self.assertAlmostEqual(score, new_score, 5)  # , "score(%s) (%f) bad" % (label, new_score)
        score = new_score

    emissions = ['A', 'C', 'C', 'B', 'B']
    labels = [['A', 'A', 'A', 'B', 'B'],
              ['A', 'A', 'B', 'B', 'B'],
              ['A', 'B', 'B', 'B', 'B']]
    score = None
    for label in labels:
        new_score = model.score(zip(label, emissions))
        if score:
            self.assertAlmostEqual(score, new_score, 5)  # , "score(%s) (%f) bad" % (label, new_score)
        score = new_score
def test_alternating_sequence(self):
    sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'))
    model = HiddenMarkovModel(label_history_size=2)
    model.train(sequence, fallback_model=None, use_linear_smoothing=False)

    self.assertEqual(len(model.transition), 5)
    self.assertEqual(len(model.transition['A::B']), 2)
    self.assertEqual(model.transition['A::B']['B::A'], log(2.0 / 3.0))
    self.assertEqual(model.transition['B::A']['A::B'], 0.0)
    self.assertEqual(model.transition['<START>::<START>']['<START>::A'], log(0.5))

    self.assertEqual(len(model.reverse_transition), 6)
    self.assertEqual(len(model.reverse_transition['A::B']), 2)
    self.assertEqual(model.reverse_transition['A::B']['B::A'], 0.0)
    self.assertEqual(model.reverse_transition['B::A']['A::B'], log(2.0 / 3.0))

    self.assertEqual(len(model.label_emissions), 4)
    self.assertEqual(len(model.label_emissions['A']), 2)
    self.assertEqual(model.label_emissions['A']['B::A'], log(2.0 / 3.0))

    self.assertEqual(len(model.emission), 6)
    self.assertEqual(len(model.emission['B::A']), 1)
    self.assertEqual(model.emission['B::A']['A'], 0.0)
def test_three_history_single(self):
    sequence = zip(repeat('A', 6), repeat('A', 6))
    model = HiddenMarkovModel(label_history_size=3)
    model.train(sequence, fallback_model=None, use_linear_smoothing=False)
    self.assertEqual(model.label(list(repeat('A', 3))), list(repeat('A', 3)))
    self.assertEqual(model.label(list(repeat('A', 6))), list(repeat('A', 6)))
def test_three_history_alternating(self):
    alternating = lambda n: [(l, e) for l, e, _ in izip(cycle(('A', 'B')), cycle(('A', 'B')), xrange(n))]
    sequence = alternating(6)
    model = HiddenMarkovModel(label_history_size=3)
    model.train(sequence, fallback_model=None, use_linear_smoothing=False)
    self.assertEqual(model.label(alternating(4)), [label for label, _ in alternating(4)])
    self.assertEqual(model.label(alternating(6)), [label for label, _ in alternating(6)])
def test_identity_emission_uniform_transitions(self):
    # print "Testing emission == state w/ uniform transitions chain: ",
    model = HiddenMarkovModel(label_history_size=1)
    model.labels = ('A', 'B', START_LABEL, STOP_LABEL)
    self.set_defaults(model)
    self.uniform_transitions(model)
    self.identity_emissions(model)
    tests = [['A', 'A', 'A', 'A'],
             ['B', 'B', 'B', 'B'],
             ['A', 'A', 'B', 'B'],
             ['B', 'A', 'B', 'B']]
    for test in tests:
        self._test_label(model, test, log(1.0 / 2.0) + log(1.0 / 3.0) * 4)
def test_extend_labels_simple(self):
    stream = (('1', 1), ('2', 2), ('3', 3))
    two_extended = [('1', ('<START>',), 1), ('2', ('1',), 2), ('3', ('2',), 3)]
    self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)), two_extended)
def test_extend_labels_three_history(self):
    stream = (('1', 1), ('2', 2), ('3', 3))
    three_extended = [('1', ('<START>', '<START>::<START>'), 1),
                      ('2', ('1', '<START>::1'), 2),
                      ('3', ('2', '1::2'), 3)]
    self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 3)), three_extended)
def test_extend_labels_one_history(self):
    stream = (('1', 1), ('2', 2), ('3', 3))
    one_extended = [('1', tuple(), 1), ('2', tuple(), 2), ('3', tuple(), 3)]
    self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 1)), one_extended)
def test_even_larger_history(self):
    sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'))
    model = HiddenMarkovModel(label_history_size=4)
    model.train(sequence, fallback_model=None, use_linear_smoothing=False)

    self.assertEqual(len(model.transition), 9)
    self.assertEqual(len(model.transition['<START>::A::B::A']), 1)
    self.assertEqual(len(model.transition['A::B::A::B']), 2)
    self.assertEqual(model.transition['<START>::<START>::<START>::<START>']['<START>::<START>::<START>::A'], log(0.5))
    self.assertEqual(model.transition['<START>::<START>::<START>::A']['<START>::<START>::A::B'], 0.0)
    self.assertEqual(model.transition['<START>::<START>::A::B']['<START>::A::B::A'], 0.0)
    self.assertEqual(model.transition['<START>::A::B::A']['A::B::A::B'], 0.0)
    self.assertEqual(model.transition['A::B::A::B']['B::A::B::A'], log(0.5))
def test_identity_emissions_non_uniform_transitions(self):
    # print "Testing emissions == labels with non-uniform transitions chain: ",
    model = HiddenMarkovModel(label_history_size=1)
    model.labels = ('A', 'B', START_LABEL, STOP_LABEL)
    self.set_defaults(model)
    self.biased_transitions(model)
    self.identity_emissions(model)
    tests = [['A', 'A', 'A', 'A'],
             ['B', 'B', 'B', 'B'],
             ['A', 'A', 'B', 'B'],
             ['B', 'A', 'B', 'B']]
    scores = [log(0.5) * 4 + log(0.25),
              log(0.5) * 4 + log(0.25),
              log(0.5) * 3 + log(0.25) * 2,
              log(0.5) * 2 + log(0.25) * 3]
    scored_tests = zip(tests, scores)
    for test, score in scored_tests:
        self._test_label(model, test, score)
def test_biased_emissions_uniform_transitions(self):
    # print "Testing uniform transitions with self-biased emissions: ",
    model = HiddenMarkovModel(label_history_size=1)
    model.labels = ('A', 'B', START_LABEL, STOP_LABEL)
    self.set_defaults(model)
    self.uniform_transitions(model)
    self.biased_emissions(model)
    tests = [['A', 'A', 'A', 'A'],
             ['B', 'B', 'B', 'B'],
             ['A', 'A', 'B', 'B'],
             ['B', 'A', 'B', 'B']]
    scores = [log(0.5) + log(1.0 / 3.0) * 4.0 + 6.0 * log(2.0 / 3.0) for i in xrange(4)]
    scored_tests = zip(tests, scores)
    for test, score in scored_tests:
        self._test_label(model, test, score)
def test_biased_emissions_biased_transitions(self):
    # print "Testing self-biased transitions with self-biased emissions: ",
    model = HiddenMarkovModel(label_history_size=1)
    model.labels = ('A', 'B', START_LABEL, STOP_LABEL)
    self.set_defaults(model)
    self.biased_transitions(model)
    self.biased_emissions(model)
    tests = [['A', 'A', 'A', 'A'],
             ['B', 'B', 'B', 'B'],
             ['A', 'A', 'B', 'B'],
             ['B', 'A', 'B', 'B']]
    scores = [log(0.5) * 4 + log(0.25),
              log(0.5) * 4 + log(0.25),
              log(0.5) * 3 + log(0.25) * 2,
              log(0.5) * 2 + log(0.25) * 3]
    scores = [6.0 * log(2.0 / 3.0) + score for score in scores]
    scored_tests = zip(tests, scores)
    for test, score in scored_tests:
        self._test_label(model, test, score)
def simple_hmm():
    A = np.array([[0.9, 0.1], [0.4, 0.6]])
    B = np.array([[0.9, 0.1], [0.2, 0.8]])
    pi = np.array([0.8, 0.2])
    model = HiddenMarkovModel(A, B, pi)
    emissions = np.array([0, 1])
    return (model, emissions)
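# For intuition about what this fixture encodes, here is a minimal hand computation
# (a sketch independent of the HiddenMarkovModel class, assuming only NumPy): the
# forward recursion gives the likelihood of the fixture's emission sequence [0, 1].
import numpy as np

A = np.array([[0.9, 0.1], [0.4, 0.6]])
B = np.array([[0.9, 0.1], [0.2, 0.8]])
pi = np.array([0.8, 0.2])

alpha = pi * B[:, 0]           # [0.72, 0.04]
alpha = (alpha @ A) * B[:, 1]  # [0.0664, 0.0768]
print(alpha.sum())             # 0.1432 = P(emissions | model)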
def make_model(states=10, outputs=5):
    model = HiddenMarkovModel()
    for state in range(states):
        model.add_initial_state(state, 1 / states)
        for output in range(outputs):
            model.add_emission(state, output, 1 / outputs)
    for state in range(states):
        for to_state in range(states):
            model.add_transition(state, to_state, 1 / states)
    model.normalize()
    return model
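# A quick sanity check on the helper above (a hypothetical usage sketch, using only
# the call defined above; note it assumes Python 3, where 1 / states is true
# division rather than the integer division it would be under Python 2):
model = make_model(states=4, outputs=3)
# Every state starts with probability 1/4, emits each symbol with probability 1/3,
# and transitions to each state with probability 1/4, so normalize() should leave
# the distributions unchanged.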
def test_extend_labels_multiple_sentences(self):
    stream = (('1', 1), ('2', 2), ('<STOP>', '<STOP>'), ('<START>', '<START>'), ('1', 1))
    two_extended = [('1', ('<START>',), 1),
                    ('2', ('1',), 2),
                    ('<STOP>', ('2',), '<STOP>'),
                    ('<START>', ('<START>',), '<START>'),
                    ('1', ('<START>',), 1)]
    self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)), two_extended)
def setUp(self): self.states = ["rainy", "sunny"] self.vocabulary = ["walk", "shop", "clean"] self.initial_probabilities = { "rainy": 0.6, "sunny": 0.4, } # The probability of moving from state a to state b # such that the sum of the all the probabilities = 1 self.transition_probabilities = { "rainy": { "rainy": 0.7, "sunny": 0.3, }, "sunny": { "rainy": 0.4, "sunny": 0.6, }, } # the probability of the observation O being generated from the state q self.emission_probabilities = { "rainy": { "walk": 0.1, "shop": 0.4, "clean": 0.5, }, "sunny": { "walk": 0.6, "shop": 0.3, "clean": 0.1, } } self.hmm = HiddenMarkovModel( self.states, # all the possible hidden states self.vocabulary, # all possible observation types self.transition_probabilities, self.emission_probabilities, self.initial_probabilities )
def test_extend_labels_multiple_sentences(self): stream = (('1', 1), ('2', 2), ('<STOP>', '<STOP>'), ('<START>', '<START>'), ('1', 1)) two_extended = [('1', ('<START>', ), 1), ('2', ('1', ), 2), ('<STOP>', ('2', ), '<STOP>'), ('<START>', ('<START>', ), '<START>'), ('1', ('<START>', ), 1)] self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)), two_extended)
def test_unk_emission(self): # print "Testing UNK emission with emission == label and self-biased transitions: ", model = HiddenMarkovModel(label_history_size=1) model.labels = ('A', 'B', START_LABEL, STOP_LABEL) self.set_defaults(model) self.identity_emissions(model) self.biased_transitions(model) emissions = ['A', 'C', 'A', 'B', 'B'] labels = ['A', 'A', 'A', 'B', 'B'] score = log(0.5) * 2 + log(0.25) + log(0.5) + log(0.25) + log( 0.5) + log(0.25) self._test_label(model, emissions, score, labels=labels) emissions = ['A', 'C', 'C', 'B', 'B'] labels = [['A', 'A', 'A', 'B', 'B'], ['A', 'A', 'B', 'B', 'B'], ['A', 'B', 'B', 'B', 'B']] score = None for label in labels: new_score = model.score(zip(label, emissions)) if score: self.assertAlmostEqual( score, new_score, 5) #, "score(%s) (%f) bad" % (label, new_score) score = new_score emissions = ['A', 'C', 'C', 'B', 'B'] labels = [['A', 'A', 'A', 'B', 'B'], ['A', 'A', 'B', 'B', 'B'], ['A', 'B', 'B', 'B', 'B']] score = None for label in labels: new_score = model.score(zip(label, emissions)) if score: self.assertAlmostEqual( score, new_score, 5) #, "score(%s) (%f) bad" % (label, new_score) score = new_score
def test_extend_labels_longer_history_than_sentence(self):
    stream = (('1', 1), ('2', 2))
    five_extended = [('1', ('<START>', '<START>::<START>', '<START>::<START>::<START>',
                            '<START>::<START>::<START>::<START>'), 1),
                     ('2', ('1', '<START>::1', '<START>::<START>::1',
                            '<START>::<START>::<START>::1'), 2)]
    self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 5)), five_extended)
def fit(sc, data, model, stop_threshold=1E-9, min_iterations=0, max_iterations=1000):
    """
    Fits a HiddenMarkovModel (model) to a set of data (data) using Baum-Welch,
    in parallel via the SparkContext (sc), until either the improvement threshold
    (stop_threshold) is met or at least min_iterations have been run.
    max_iterations is used to stop the fitting process.

    :param sc: SparkContext
    :param data: data to be fitted
    :param model: HiddenMarkovModel instance
    :param stop_threshold: minimum log-likelihood improvement required to keep iterating
    :param min_iterations: minimum number of iterations
    :param max_iterations: maximum number of iterations
    :return: HiddenMarkovModel instance fitted to the data. Use the model to extract
        estimated model parameters.
    """
    def fit_worker(batch):
        bwelch = BaumWelchBatch(model.loga, model.logb, model.logpi)
        for d in batch:
            bwelch.fit_sequence(d)
        return [bwelch]

    p_data = sc.parallelize(data)
    log_prob_sum = p_data.map(model.observation_log_probability).reduce(lambda x, y: x + y)

    iteration = 0
    improvement = float('inf')
    new_model = None
    while improvement > stop_threshold or iteration < min_iterations + 1:
        s = time.time()
        batches = p_data.mapPartitions(fit_worker).collect()
        logger.info(f'got batches of size {len(batches)}')
        new_model = HiddenMarkovModel.from_batches(batches)
        new_log_prob_sum = p_data.map(new_model.observation_log_probability).reduce(lambda x, y: x + y)
        improvement = new_log_prob_sum - log_prob_sum
        e = time.time()
        logger.info(f'took {e - s}')
        logger.info(f'improvement = {improvement:.5f}')
        log_prob_sum = new_log_prob_sum
        if iteration >= max_iterations:
            break
        iteration += 1
    return new_model
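# For readers unfamiliar with the Spark calls above: the map/reduce over
# observation_log_probability is just the total log-likelihood of the dataset under
# the current parameters, and the convergence test compares this total before and
# after each Baum-Welch pass. A serial sketch of the same quantity (for intuition
# only, not part of the fitting code) would be:
log_prob_sum = sum(model.observation_log_probability(d) for d in data)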
def test_identity_emissions_non_uniform_transitions(self): # print "Testing emissions == labels with non-uniform transitions chain: ", model = HiddenMarkovModel(label_history_size=1) model.labels = ('A', 'B', START_LABEL, STOP_LABEL) self.set_defaults(model) self.biased_transitions(model) self.identity_emissions(model) tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'], ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']] scores = [ log(0.5) * 4 + log(0.25), log(0.5) * 4 + log(0.25), log(0.5) * 3 + log(0.25) * 2, log(0.5) * 2 + log(0.25) * 3 ] scored_tests = zip(tests, scores) for test, score in scored_tests: self._test_label(model, test, score)
def test_even_larger_history(self): sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B')) model = HiddenMarkovModel(label_history_size=4) model.train(sequence, fallback_model=None, use_linear_smoothing=False) self.assertEqual(len(model.transition), 9) self.assertEqual(len(model.transition['<START>::A::B::A']), 1) self.assertEqual(len(model.transition['A::B::A::B']), 2) self.assertEqual( model.transition['<START>::<START>::<START>::<START>'] ['<START>::<START>::<START>::A'], log(0.5)) self.assertEqual( model.transition['<START>::<START>::<START>::A'] ['<START>::<START>::A::B'], 0.0) self.assertEqual( model.transition['<START>::<START>::A::B']['<START>::A::B::A'], 0.0) self.assertEqual(model.transition['<START>::A::B::A']['A::B::A::B'], 0.0) self.assertEqual(model.transition['A::B::A::B']['B::A::B::A'], log(0.5))
def test_simple_sequence(self):
    sequence = (('A', 'A'), ('A', 'A'), ('A', 'A'), ('A', 'A'), ('A', 'A'), ('A', 'A'))
    model = HiddenMarkovModel(label_history_size=2)
    model.train(sequence, fallback_model=None, use_linear_smoothing=False)

    self.assertEqual(len(model.transition), 4)
    self.assertEqual(len(model.transition['A::A']), 2)
    self.assertEqual(model.transition['A::A']['A::A'], log(4.0 / 5.0))

    self.assertEqual(len(model.reverse_transition), 5)
    self.assertEqual(len(model.reverse_transition['A::A']), 2)
    self.assertEqual(model.reverse_transition['A::A']['A::A'], log(4.0 / 5.0))

    self.assertEqual(len(model.emission), 5)
    self.assertEqual(len(model.emission['A::A']), 1)
    self.assertEqual(model.emission['A::A']['A'], 0.0)

    self.assertEqual(len(model.label_emissions), 3)
    self.assertEqual(len(model.label_emissions['A']), 2)
    self.assertEqual(model.label_emissions['A']['A::A'], log(5.0 / 6.0))
def create_hidden_markov_model(self):
    if self.emission_mode == "projection":
        assert hasattr(self, "emission_probability")
        self.hidden_markov_model = HiddenMarkovModel(
            self.transition_probability,
            self.emission_probability,
            self.initial_probability,
            self.state_space.states,
            update_matrix=self.update_matrix,
            enable_warnings=True)
    elif self.emission_mode == "gaussian":
        assert hasattr(self, "mean") and hasattr(self, "covariance")
        self.hidden_markov_model = GaussianHiddenMarkovModel(
            self.transition_probability,
            self.initial_probability,
            self.state_space.states,
            self.mean,
            self.covariance,
            update_matrix=self.update_matrix,
            enable_warnings=True)
# use testing parameters in dev mode
if args.dev:
    alphabet = 'abdi#'
    words = ['babi#', 'dida#']

# for #3, we run the training program 20 times to look for local maxima
if args.mode == 'loop':
    print "Beginning iteration for Local Maxima question..."
    for i in xrange(1, 21):
        print "Local Maxima question: iteration %s" % i
        outfile = open('results/result%s.txt' % i, 'w')
        hmm = HiddenMarkovModel(numstates, alphabet)
        dump_state(outfile, hmm, 'Initial State')
        hmm.train(words, iterations=100, verbose=args.verbose)
        dump_state(outfile, hmm, 'Iteration Summary')
    print "Completed Local Maxima question."

# for #4 we train and then compute the viterbi path for each word
# we store each word-path pair in a dict and then pass that dict
# to a function that handles output
if args.mode == 'viterbi':
    hmm = HiddenMarkovModel(numstates, alphabet)
    dump_state(args.outfile, hmm, 'Initial State')
    hmm.train(words, iterations=100, verbose=args.verbose)
def main(): spark = (SparkSession.builder.master("local[*]").config( "spark.executor.memory", "30g").config("spark.driver.memory", "30g").config( "spark.driver.maxResultSize", "30g").config("spark.memory.offHeap.enabled", True).config( "spark.memory.offHeap.size", "16g").appName("sampleCodeForReference").getOrCreate()) sc = spark.sparkContext test_data = pickle.load( open('/Users/locojay/PycharmProjects/dthmm/tutorials/testdata2.p', 'rb')) A = [[0.17299125, 0.08781199, 0.24904337, 0.49015339], [0.65466035, 0.0058856, 0.24847472, 0.09097933], [0.43406668, 0.09507003, 0.24143807, 0.22942522], [0.00310297, 0.41726041, 0.27046179, 0.30917482]] B = [[ 0.0248371, 0.00647766, 0.02919312, 0.02010902, 0.01741969, 0.03026002, 0.01107451, 0.03090185, 0.02000882, 0.02946754, 0.0329583, 0.02810143, 0.00973118, 0.01286111, 0.03036823, 0.03451904, 0.01301527, 0.03176073, 0.02069127, 0.0391591, 0.03724013, 0.01681755, 0.02387927, 0.01267418, 0.01405466, 0.00182615, 0.00099688, 0.02921965, 0.02068266, 0.00459763, 0.03083269, 0.02294538, 0.00748594, 0.0318249, 0.01643839, 0.03030681, 0.00853397, 0.02212386, 0.02451805, 0.01147829, 0.01860806, 0.01689099, 0.01947854, 0.00456117, 0.01985139, 0.02348703, 0.02722838, 0.02259387, 0.00460825, 0.00130027 ], [ 0.00118511, 0.0364538, 0.00539255, 0.02931715, 0.00712114, 0.02613686, 0.02025734, 0.00856556, 0.01788003, 0.02696186, 0.03206167, 0.02082036, 0.02027708, 0.0363248, 0.01253547, 0.02536659, 0.0303423, 0.00161272, 0.02162873, 0.0211614, 0.01741675, 0.01470692, 0.032151, 0.03228765, 0.03237699, 0.0370071, 0.01195834, 0.02739508, 0.01974688, 0.01438907, 0.00741205, 0.02553209, 0.00501492, 0.02914962, 0.01528311, 0.02546899, 0.01965691, 0.00166134, 0.0146325, 0.03175253, 0.00425995, 0.02717155, 0.02544106, 0.03355649, 0.02468158, 0.00874545, 0.01172551, 0.02154314, 0.00843848, 0.01803447 ], [ 0.0296181, 0.0348821, 0.02564371, 0.02800763, 0.01551197, 0.02558589, 0.03501015, 0.01300263, 0.01266429, 0.03546458, 0.00678947, 0.01032237, 0.03453364, 0.02323215, 0.01534716, 0.03644205, 0.02687086, 0.02292363, 0.00105033, 0.0289615, 0.02795536, 0.03250376, 0.02837804, 0.01249522, 0.02217764, 0.02628832, 0.00928285, 0.00739886, 0.03279007, 0.00722151, 0.00053051, 0.01206393, 0.01819556, 0.00779652, 0.02419107, 0.00798948, 0.00664281, 0.02770423, 0.0339964, 0.01410592, 0.01401967, 0.03120296, 0.02565983, 0.01024386, 0.01415742, 0.00839726, 0.01779137, 0.02100865, 0.02521129, 0.01073536 ], [ 0.01471172, 0.02670568, 0.01813862, 0.03895738, 0.0074108, 0.00734445, 0.02980466, 0.0244879, 0.00582519, 0.0089145, 0.00959946, 0.02949902, 0.01730438, 0.00265082, 0.00898055, 0.00310906, 0.02095744, 0.02549341, 0.00517031, 0.01065439, 0.03255066, 0.03373455, 0.00429001, 0.0298808, 0.03904555, 0.00203563, 0.0188991, 0.02278372, 0.02672836, 0.01151306, 0.01512417, 0.03303694, 0.03390606, 0.02449836, 0.01443768, 0.0127056, 0.03821532, 0.01233168, 0.00493174, 0.03505321, 0.03774991, 0.03070529, 0.02777502, 0.00753259, 0.02052302, 0.02192132, 0.00473921, 0.03786516, 0.03214382, 0.01762273 ]] pi = [0.1785823, 0.20446237, 0.26092583, 0.3560295] model = HiddenMarkovModel(A, B, pi) fit(sc, test_data, model)
if __name__ == '__main__':
    data = read_csv("Shakespeare_data.csv")
    data.dropna(axis='columns', how='any', inplace=True)
    text = [_ for _ in data['PlayerLine']]
    corpus = [textCleaner(i) for i in text]

    '''
    Used to train model-
    hmm_model = HiddenMarkovModel(hiddenStates = 5)
    hmm_model.trainer(corpus,filName='model.pickle')
    print("Model complete")
    '''

    trans, ems, initials = HiddenMarkovModel.load('model.pickle')
    currentModel = HiddenMarkovModel(hiddenStates=5, transProbs=trans,
                                     emissionProbs=ems, initialProbs=initials)

    print('1- Generate text\n2- Predict text')
    inNum = 0
    inNum = input()
    if inNum == '1':
        print("Number of words to be generated: ")
        currentModel.generator(int(input()))
    elif inNum == '2':
        print("Input sequences of words to predict on: ")
        textIn = str(input())
class HmmCoinsTest(unittest.TestCase):

    def setUp(self):
        self.states = ["Coin 1", "Coin 2", "Coin 3"]
        self.vocabulary = ["Heads", "Tails"]
        self.initial_probabilities = {
            "Coin 1": 0.4,
            "Coin 2": 0.3,
            "Coin 3": 0.3
        }
        # The probability of moving from state a to state b,
        # such that the sum of all the probabilities = 1
        self.transition_probabilities = {
            "Coin 1": {"Coin 1": 0.6, "Coin 2": 0.3, "Coin 3": 0.1},
            "Coin 2": {"Coin 1": 0.2, "Coin 2": 0.5, "Coin 3": 0.3},
            "Coin 3": {"Coin 1": 0.3, "Coin 2": 0.2, "Coin 3": 0.5}
        }
        # the probability of the observation O being generated from the state q
        self.emission_probabilities = {
            "Coin 1": {"Heads": 0.7, "Tails": 0.3},
            "Coin 2": {"Heads": 0.3, "Tails": 0.7},
            "Coin 3": {"Heads": 0.5, "Tails": 0.5}
        }
        self.hmm = HiddenMarkovModel(
            self.states,                    # all the possible hidden states
            self.vocabulary,                # all possible observation types
            self.transition_probabilities,
            self.emission_probabilities,
            self.initial_probabilities
        )

    def test_forward(self):
        observations = ["Heads", "Heads", "Heads"]
        P, forwards = self.hmm.forward(observations)
        self.assertEqual(P, 0.14533999999999997)
        self.assertEqual(forwards[0], [0.27999999999999997, 0.16169999999999998, 0.08824199999999997])
        self.assertEqual(forwards[1], [0.09, 0.0477, 0.025607999999999995])

    def test_backward(self):
        observations = ["Heads", "Heads", "Heads"]
        P, backwards = self.hmm.backward(observations)
        self.assertEqual(P, 0.14534)
        self.assertEqual(backwards[0], [0.30080000000000007, 0.56, 1.0])
        self.assertEqual(backwards[1], [0.2224, 0.43999999999999995, 1.0])

    def test_viterbi(self):
        observations = ["Heads", "Heads", "Heads"]
        P, backpoints = self.hmm.viterbi(observations)
        self.assertEqual(P, 0.049391999999999985)
        self.assertEqual(backpoints, ['Coin 1', 'Coin 1', 'Coin 1'])

    def test_forward_backward(self):
        observations = ["Heads", "Heads", "Heads"]
        old_prediction = self.hmm.viterbi(observations)
        new_hmm = self.hmm.forward_backward(observations)
        prediction = new_hmm.viterbi(observations[:3])
        # we should see an increase in the probability output
        assert old_prediction[0] < prediction[0]
        self.assertEqual(prediction, (0.10346810696178374, ['Coin 1', 'Coin 1', 'Coin 1']))
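# Quick cross-check of the coins fixture above (a sketch independent of the
# HiddenMarkovModel class under test, assuming only NumPy): the forward recursion
# over ["Heads", "Heads", "Heads"] reproduces the probability asserted in
# test_forward / test_backward.
import numpy as np

pi = np.array([0.4, 0.3, 0.3])          # Coin 1, Coin 2, Coin 3
A = np.array([[0.6, 0.3, 0.1],
              [0.2, 0.5, 0.3],
              [0.3, 0.2, 0.5]])
b_heads = np.array([0.7, 0.3, 0.5])     # P(Heads | each coin)

alpha = pi * b_heads
for _ in range(2):                      # two more "Heads" observations
    alpha = (alpha @ A) * b_heads
print(alpha.sum())                      # ~0.14534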
# Script to get the word level accuracy of the HMM trained on pos_train.txt
from hmm import HiddenMarkovModel

h = HiddenMarkovModel(supervised=False)
h.train()
x = h.eval('./pos_test.txt')
print(x)
class HmmWeatherTest(unittest.TestCase):

    def setUp(self):
        self.states = ["rainy", "sunny"]
        self.vocabulary = ["walk", "shop", "clean"]
        self.initial_probabilities = {
            "rainy": 0.6,
            "sunny": 0.4,
        }
        # The probability of moving from state a to state b,
        # such that the sum of all the probabilities = 1
        self.transition_probabilities = {
            "rainy": {
                "rainy": 0.7,
                "sunny": 0.3,
            },
            "sunny": {
                "rainy": 0.4,
                "sunny": 0.6,
            },
        }
        # the probability of the observation O being generated from the state q
        self.emission_probabilities = {
            "rainy": {
                "walk": 0.1,
                "shop": 0.4,
                "clean": 0.5,
            },
            "sunny": {
                "walk": 0.6,
                "shop": 0.3,
                "clean": 0.1,
            }
        }
        self.hmm = HiddenMarkovModel(
            self.states,                    # all the possible hidden states
            self.vocabulary,                # all possible observation types
            self.transition_probabilities,
            self.emission_probabilities,
            self.initial_probabilities
        )

    def test_forward(self):
        observations = ["walk", "shop", "clean"]
        P, forwards = self.hmm.forward(observations)
        self.assertEqual(P, 0.033612)
        self.assertEqual(forwards[0], [0.06, 0.055200000000000006, 0.02904])
        self.assertEqual(forwards[1], [0.24, 0.04859999999999999, 0.004572])

    def test_backward(self):
        observations = ["walk", "shop", "clean"]
        P, backwards = self.hmm.backward(observations)
        self.assertEqual(P, 0.033612)
        self.assertEqual(backwards[0], [0.1298, 0.38, 1.0])
        self.assertEqual(backwards[1], [0.10760000000000002, 0.26, 1.0])

    def test_viterbi(self):
        observations = ["walk", "shop", "clean"]
        P, backpoints = self.hmm.viterbi(observations)
        self.assertEqual(P, 0.01344)
        self.assertEqual(backpoints, ['sunny', 'rainy', 'rainy'])

    def test_forward_backward(self):
        observations = ["walk", "shop", "clean"]
        new_hmm = self.hmm.forward_backward(observations)
        prediction = new_hmm.viterbi(observations[:3])
        self.assertEqual(prediction, (0.010994296643152459, ['sunny', 'rainy', 'rainy']))
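# Cross-check of the weather fixture above (a sketch independent of the class under
# test, assuming only NumPy): the forward likelihood and the Viterbi maximum for
# ["walk", "shop", "clean"] match the values asserted in test_forward and
# test_viterbi.
import numpy as np

pi = np.array([0.6, 0.4])                   # rainy, sunny
A = np.array([[0.7, 0.3],
              [0.4, 0.6]])
B = np.array([[0.1, 0.4, 0.5],              # rainy: walk, shop, clean
              [0.6, 0.3, 0.1]])             # sunny: walk, shop, clean
obs = [0, 1, 2]                             # walk, shop, clean

alpha = pi * B[:, obs[0]]
delta = alpha.copy()
for o in obs[1:]:
    alpha = (alpha @ A) * B[:, o]
    delta = (delta[:, None] * A).max(axis=0) * B[:, o]
print(alpha.sum())   # ~0.033612 (forward probability)
print(delta.max())   # ~0.01344, best path sunny -> rainy -> rainy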
# Script to get word level accuracy of the hidden markov model with the most probable
# tag counts
from hmm import HiddenMarkovModel

h = HiddenMarkovModel(supervised=True)
x = h.eval('./pos_test.txt')
print(x)
class TransitionalModel(): def __init__(self, trainpath, validpaths, max_trans=0, vnodes=False, hypotest=None, saveplots=False): assert isinstance(trainpath, str) assert len(trainpath) assert isinstance(validpaths, list) for path in validpaths: assert isinstance(path, str) assert len(path) assert isinstance(max_trans, int) assert max_trans >= 0 assert isinstance(vnodes, bool) assert isinstance(saveplots, bool) self._modname = 'base' if vnodes: self._modname = 'vnode' if hypotest: self._modname += '+hypo' self._savepathbase = None if saveplots: self._savepathbase = "./%s-plots-%s-maxtrans%d/" % (trainpath, self._modname, max_trans) os.mkdir(self._savepathbase) # load datasets self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, \ vnodes=vnodes) if hypotest: #statespace, transitions = self._trainset.admissible_statespace_transitions() #print "orig:\n", statespace, transitions statespace, transitions = self._trainset.hypotest_admissible_statespace_transitions(hypotest) #print "hypotest:\n", statespace, transitions self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, vnodes=vnodes, \ admissible_statespace=statespace, \ admissible_transitions=transitions) statespace, transitions = self._trainset.admissible_statespace_transitions() #self._trainset.dump() #print "trainset:\n", statespace, transitions if not self._trainset.outseq_len(): print "warning, TransitionModel.__init__, empty trainset at path %s" % trainpath _FILTER_TRANS = 1 self._validsets = [] for validpath in validpaths: if _FILTER_TRANS: # filter trans dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes, \ admissible_statespace=statespace, \ admissible_transitions=transitions) else: dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes) if not dataset.outseq_len(): print "TransitionModel, skipping empty validation dataset at path %s" % validpath continue self._validsets.append(dataset) # define the initial set of states and the set of output symbols self._init_states = sorted(self._trainset.states()) self._N = len(self._init_states) self._symbols = sorted(self._trainset.symbols()) self._M = len(self._symbols) self._symbols_ext = copy.copy(self._symbols) if not _FILTER_TRANS: symbols = set() for dataset in self._validsets: symbols.update(dataset.symbols()) print "symbols", symbols for sym in symbols: if sym not in self._symbols_ext: print "sym", sym print "TransitionalModel: appending sym=\"%s\" in valset but not in trainset" % sym self._symbols_ext.append(sym) self._M_ext = len(self._symbols_ext) print "self._symbols_ext", self._symbols_ext camsymbols = {} for sym in self._symbols_ext: print "sym = ", sym for _id in sym.split('-'): print " _id = ", _id __id = int(_id.strip('abcdef')) try: camsymbols[__id].add(sym) except: print "new cam id", _id camsymbols[__id] = set([sym]) for camid in camsymbols.keys(): symbols = camsymbols[camid] camsymbols[camid] = sorted(symbols) symbol2cam = {} for sym in self._symbols_ext: for _id in sym.split('-'): __id = int(_id.strip('abcdef')) try: symbol2cam[sym].append(__id) except: symbol2cam[sym] = [__id] symbol2cam_enc = {} for sym,camids in symbol2cam.iteritems(): symenc = [] symbol2cam_enc[self._symbol_enc(sym)] = camids print "camsymbols=",camsymbols print "symbol2cam=",symbol2cam print "symbol2cam_enc=",symbol2cam_enc print "symbols_ext", self._symbols_ext # define the initial A, B matrices A, B = self._build_initial_model() # compute the node to be split self._compute_node_splits() # 
encode datasets (bijective mapping symobols <-> integers) self._trainset_enc = [] for observ in self._trainset.outseq(): self._trainset_enc.append(self._symbol_enc(observ)) self._validsets_enc = [] for dataset in self._validsets: encoded = [] for observ in dataset.outseq(): encoded.append(self._symbol_enc(observ)) self._validsets_enc.append(encoded) # print some info self.print_info() # create the HMM model with node splitting self._hmm = HiddenMarkovModel(A, B, copy.copy(self._init_states), \ self._trainset_enc, validsets=self._validsets_enc, symnames=copy.copy(self._symbols_ext), symbol2cam=symbol2cam_enc) self._transition_graph_threshold = 0.01 self._hmm.plot_transition_graph(0., savepathbase=self._savepathbase, savetag='coverage-model') def optimize(self, nu=0.01): # plot coverage self._hmm.plot_transition_graph(self._transition_graph_threshold, \ savepathbase=self._savepathbase, savetag='coverage-model') # run BW before the first node-split while True: done = self._hmm.baumwelch(nu) self._hmm.plot_perf(savepathbase=self._savepathbase) self._hmm.plot_transition_graph(self._transition_graph_threshold, \ savepathbase=self._savepathbase) if done: break while True: keys = sorted(self._pending_splits.keys()) print "\n%d node-splits left" % len(keys) if not len(keys): break key = keys[0] splits = sorted(self._pending_splits[key], key=lambda split_data: split_data[0]) #print "splits for node ", key, " (", len(splits), "):", splits assert len(splits) split = splits.pop() if not len(splits): self._pending_splits.pop(key) else: self._pending_splits[key] = splits # per ogni nodo corrente # guarda alla backward star # guarda alla forward star # costruisci matrice rettangolare transizioni # identifica la riga piu' ortogonale a tutte le altre # # se lo trovi splitta il nodo ribalanciando le probabilita' #(inner, bsym, bstar_syms, fstar_syms, row) #print split _inner = split[0] _bsym = split[1] _bstar_sym = split[2] _fstar_sym = split[3] rates = split[4] assert numpy.isclose(rates.sum(), 1) #print " splitting node %s : inner=%.2f bsym=%s _bstar=%s _fstar=%s, rates=%s" % (key, _inner, _bsym, _bstar_sym, _fstar_sym, rates) state = self._state_enc(key) bstate = self._state_enc(_bsym) bstar = [] for i in _bstar_sym: bstar.append(self._state_enc(i)) fstar = [] for i in _fstar_sym: fstar.append(self._state_enc(i)) self._hmm.split_state(state, bstate, bstar, fstar, rates) self._hmm.plot_transition_graph(self._transition_graph_threshold, \ savepathbase=self._savepathbase, savetag='node-splitting') while True: done = self._hmm.baumwelch(nu) self._hmm.plot_perf(savepathbase=self._savepathbase) self._hmm.plot_transition_graph(self._transition_graph_threshold, \ savepathbase=self._savepathbase) if done: break def _symbol_enc(self, sym): #print sym #print self._symbols assert sym in self._symbols_ext return self._symbols_ext.index(sym) def _state_enc(self, state): assert state in self._init_states return self._init_states.index(state) def name(self): return self._modname def nr_training_transitions(self): return max(0, self._trainset.outseq_len() -1) def nr_validation_transitions(self, setid): assert isinstance(setid, int) assert setid >= 0 assert setid < self._validsets.outseq_len() return max(0, self._validsets[setid].outseq_len() -1) def _build_initial_model(self): dataset = self._trainset A = numpy.zeros((self._N, self._N)) for src in self._init_states: nr_from = dataset.count_transitions_from(src) #print "nr_from: %s = %d" % (src, nr_from) assert nr_from >= 0 if nr_from == 0: continue src_idx = 
self._state_enc(src) for dest in self._init_states: nr_to = dataset.count_transitions_from_to(src, dest) #print "nr_from_to: %s->%s = %d" % (src, dest, nr_from) assert nr_to >= 0 if nr_to == 0: continue dest_idx = self._state_enc(dest) A[src_idx,dest_idx] = nr_to/nr_from for i in xrange(self._N): # potremmo non avere transizioni uscenti da un certo # nodo ed in quel caso le righe di A non sommano tutte all'unita' # usa una distribuzione uniforme if not numpy.isclose(A[i,:].sum(), 1.): A[i,:] = numpy.ones(self._N)/self._N B = numpy.zeros((self._M,self._N)) for state in self._init_states: if state in self._symbols: B[self._symbol_enc(state), self._state_enc(state)] = 1. else: # this must be one of the vnull states assert '~0~' in state B[self._symbol_enc('0'), self._state_enc(state)] = 1. return A, B def _compute_node_splits(self): triplets = self._trainset.state_triplets_dict() fstar = self._trainset.state_fstar_dict() bstar = self._trainset.state_bstar_dict() #print "fstar: ", fstar #print "bstar: ", bstar splits = {} # computed splits keyed by the id of the splitted node # compute splits for each state states = tuple(set(self._trainset.states())) for state in states: if (state not in fstar.keys()) or (state not in bstar.keys()): # this can happen for symbols in the first and last triplet continue fstar_states = sorted(fstar[state]) bstar_states = sorted(bstar[state]) nr_fstar_states = len(fstar_states) nr_bstar_states = len(bstar_states) if nr_fstar_states < 2 or nr_bstar_states < 2: # the 'transition rate' table must have be at least 2x2 continue # build the table (using a matrix) mat = numpy.zeros((nr_bstar_states, nr_fstar_states)) for i in xrange(nr_bstar_states): for j in xrange(nr_fstar_states): bstate = bstar_states[i] fstate = fstar_states[j] try: mat[i,j] = triplets[(bstate,state,fstate)] except: mat[i,j] = 0. # we have just filled the i-th row, normalize it to unit norm mat[i,:] /= numpy.linalg.norm(mat[i,:]) # Evaluate the inner products matt = mat.transpose() for i in xrange(nr_bstar_states-1): bstate = bstar_states[i] row = mat[i,:] # We do all the inner products at once and then select just the smallest one # ! inners that have been evaluted already in past iterations are # ! skipped by setting them to 1. inners = row.dot(matt) inners[0:i] = 1. idx = numpy.argmin(inners) inner = inners[idx] if inner > 0.2: #print "splitting: discarding split with to high inner for node %s : from=%s inner=%.2f" % (state, bstate, inner) continue # add a new entry to the list of splits for the node with id 'sym' entry = (inner, bstate, bstar_states, fstar_states, row/row.sum()) try: #print "splitting: new split for node %s : %s" % (sym, entry) splits[state].append(entry) except: splits[state] = [entry] nr_splits = 0 print "\nNode splitting:" for state, _splits in splits.iteritems(): #print " splits for node ", key nr_splits += len(_splits) print " Nr. of splits for node %s: %d" % (state,len(_splits)) #for entry in item: #print key, entry print " Tot. nr. of splits %d" % nr_splits self._pending_splits = splits def print_info(self): print "transitional model dump: fill me" print "Symbols: " for i in xrange(len(self._symbols)): print " ", i, self._symbols[i] print "States: " for i in xrange(len(self._init_states)): print " ", i, self._init_states[i]
def TestHiddenMarkovModel(self): # fix random seed Rand.Restart(12347) # set hyperparameters (preenchido com os da geladeira) # a probab do utensilio assumir cada estado k e a mesma a priori ProbInitPriorObs = Dirichlet.Uniform(self.K) # a prob de transicao de estados tbm e uniforme a priori CPTTransPriorObs = System.Array.CreateInstance(Dirichlet,self.K) for i in range(0,self.K): CPTTransPriorObs[i] = Dirichlet.Uniform(self.K) EmitMeanPriorObs = System.Array.CreateInstance(Gaussian,self.K) #for i in range(0,self.K): EmitMeanPriorObs[0] = Gaussian.FromMeanAndPrecision(0, 0.01) #off EmitMeanPriorObs[1] = Gaussian.FromMeanAndPrecision(100, 0.00001) #on EmitPrecPriorObs = System.Array.CreateInstance(Gamma,self.K) #for i in range(0,self.K): EmitPrecPriorObs[0] = Gamma.FromShapeAndScale(0.2285, 0.0088) #off EmitPrecPriorObs[1] = Gamma.FromShapeAndScale(4, 0.01) #on # a escolha dos hiperparametros esta na tabela no fim do paper (analisar sensibilidade no futuro) # sample model parameters init = System.Array[float](ProbInitPriorObs.Sample().ToArray())# cria uma amostra na forma de um array de K posicoes obtida pela dirichlet #matrix de trans um array para cada linha trans0 = System.Array.CreateInstance(float, self.K) trans1 = System.Array.CreateInstance(float, self.K) trans0 = CPTTransPriorObs[0].Sample().ToArray() trans1 = CPTTransPriorObs[1].Sample().ToArray() emitMeans = System.Array.CreateInstance(float,self.K) for i in range(0,self.K): emitMeans[i] = EmitMeanPriorObs[i].Sample() emitPrecs = System.Array.CreateInstance(float,self.K) for i in range (0,self.K): emitPrecs[i] = EmitPrecPriorObs[i].Sample() # print parameters HiddenMarkovModel().HiddenMarkov() HiddenMarkovModel().DefineInferenceEngine() HiddenMarkovModel().SetParameters(init, trans0, trans1, emitMeans, emitPrecs) print "parameters:" HiddenMarkovModel().PrintParameters() # create distributions for sampling initDist = Discrete(init) transDist = System.Array.CreateInstance(Discrete,self.K) transDist[0] = Discrete(trans0) transDist[1] = Discrete(trans1) #print transDist[0] #print transDist[1] emitDist = System.Array.CreateInstance(Gaussian,self.K) for i in range (0,self.K): emitDist[i] = Gaussian.FromMeanAndPrecision(emitMeans[i], emitPrecs[i]) #sample data and emission data actualStates = System.Array.CreateInstance(int,self.T) emissions = System.Array.CreateInstance(float,self.T) actualStates[0] = initDist.Sample() emissions[0] = emitDist[actualStates[0]].Sample() for i in range(1,self.T): actualStates[i] = transDist[actualStates[i-1]].Sample() emissions[i] = emitDist[actualStates[i]].Sample() #print emissions[i] #print "sample data:" #print actualStates # infer model parameters, states and model evidence given priors and emission data HiddenMarkovModel().HiddenMarkov() HiddenMarkovModel().SetPriors(ProbInitPriorObs,CPTTransPriorObs, EmitMeanPriorObs,EmitPrecPriorObs) HiddenMarkovModel().ObserveData(emissions) HiddenMarkovModel().InferPosteriors() # print posterior distributions print "posteriors" HiddenMarkovModel().PrintPosteriors()
def pos_problem(arguments, fallback_model=None, fallback_training_limit=None): dataset_size = None if len(arguments) >= 2: dataset_size = int(arguments[1]) if len(arguments) >= 3: fallback_training_limit = int(arguments[2]) try: start = time() pickle_file = open("pos_hmm.pickle") request_size, training_stream, validation_stream, testing_sentences = pickle.load(pickle_file) pickle_file.close() if request_size != dataset_size: raise IOError() print "Unpickling: %f" % (time() - start) except (IOError, EOFError): # Load the dataset print "Loading dataset" start = time() if dataset_size: tagged_sentences = list(islice(PennTreebankReader.read_pos_tags_from_directory("data/wsj"), dataset_size)) else: tagged_sentences = list(PennTreebankReader.read_pos_tags_from_directory("data/wsj")) stop = time() print "Reading: %f" % (stop-start) print "Creating streams" start = time() training_sentences = tagged_sentences[0:len(tagged_sentences)*4/5] validation_sentences = tagged_sentences[len(tagged_sentences)*8/10+1:len(tagged_sentences)*9/10] testing_sentences = tagged_sentences[len(tagged_sentences)*9/10+1:] print "Training: %d" % len(training_sentences) print "Validation: %d" % len(validation_sentences) print "Testing: %d" % len(testing_sentences) training_stream, validation_stream = map(merge_stream, (training_sentences, validation_sentences)) stop = time() print "Streaming: %f" % (stop-start) serialized = (dataset_size, training_stream, validation_stream, testing_sentences) pickle_file = open("pos_hmm.pickle", "w") pickle.dump(serialized, pickle_file, protocol=pickle.HIGHEST_PROTOCOL) pickle_file.close() print "Training" start = time() pos_tagger = HiddenMarkovModel(label_history_size=2) pos_tagger.train(training_stream[1:-2], fallback_model=fallback_model, fallback_training_limit=fallback_training_limit) stop = time() print "Training: %f" % (stop-start) print "Testing on %d sentences" % len(testing_sentences) start = time() num_correct = 0 num_incorrect = 0 for correct_labels, emissions in testing_sentences: guessed_labels = pos_tagger.label(emissions, debug=False) # print "SENTENCE: %s" % emissions # print "CORRECT: %s" % correct_labels # print "GUESSED: %s" % guessed_labels for correct, guessed in izip(correct_labels, guessed_labels): if correct == START_LABEL or correct == STOP_LABEL: continue if correct == guessed: num_correct += 1 else: num_incorrect += 1 if correct_labels != guessed_labels: guessed_score = pos_tagger.score(zip(guessed_labels, emissions)) correct_score = pos_tagger.score(zip(correct_labels, emissions)) if guessed_score < correct_score: print "%d Guessed: %f, Correct: %f" % (len(emissions), guessed_score, correct_score) debug_label = lambda: pos_tagger.label(emissions, debug=True) debug_score = lambda labels: pos_tagger.score(zip(labels, emissions), debug=False) assert guessed_score >= correct_score or len(emissions) > 23, "Decoder sub-optimality (%f for guess, %f for correct)\n%s vs. %s" % (debug_score(guessed_labels), debug_score(correct_labels), debug_label(), correct_labels) stdout.write("%1.0f" % (sum(1 for guess, correct in zip(guessed_labels, correct_labels) if guess == correct) / len(correct_labels) * 10)) stdout.flush() else: stdout.write(".") stdout.flush() stdout.write("\n") stop = time() print "Testing: %f" % (stop-start) print "%d correct (%.3f%% of %d)" % (num_correct, 100.0 * float(num_correct) / float(num_correct + num_incorrect), num_correct + num_incorrect)