Example #1
    def setUp(self):
        self.states = ["Coin 1", "Coin 2", "Coin 3"]
        self.vocabulary = ["Heads", "Tails"]

        self.initial_probabilities = {
            "Coin 1": 0.4,
            "Coin 2": 0.3,
            "Coin 3": 0.3
        }

        # The probability of moving from state a to state b
        # such that the sum of all the probabilities = 1
        self.transition_probabilities = {
            "Coin 1": {"Coin 1": 0.6, "Coin 2": 0.3, "Coin 3": 0.1},
            "Coin 2": {"Coin 1": 0.2, "Coin 2": 0.5, "Coin 3": 0.3},
            "Coin 3": {"Coin 1": 0.3, "Coin 2": 0.2, "Coin 3": 0.5}
        }

        # the probability of the observation O being generated from the state q
        self.emission_probabilities = {
            "Coin 1": {"Heads": 0.7, "Tails": 0.3},
            "Coin 2": {"Heads": 0.3, "Tails": 0.7},
            "Coin 3": {"Heads": 0.5, "Tails": 0.5}
        }

        self.hmm = HiddenMarkovModel(
            self.states,  # all the possible hidden states
            self.vocabulary,  # all possible observation types
            self.transition_probabilities,
            self.emission_probabilities,
            self.initial_probabilities
        )
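The comments above require every row of the transition and emission tables to sum to 1; a small self-check can catch typos in hand-written tables. A minimal sketch (plain Python, independent of the HiddenMarkovModel API), shown here with the transition table from setUp:

from math import isclose

def check_rows_sum_to_one(table):
    # every conditional distribution in a nested probability table must sum to 1
    for state, row in table.items():
        assert isclose(sum(row.values()), 1.0), state

check_rows_sum_to_one({
    "Coin 1": {"Coin 1": 0.6, "Coin 2": 0.3, "Coin 3": 0.1},
    "Coin 2": {"Coin 1": 0.2, "Coin 2": 0.5, "Coin 3": 0.3},
    "Coin 3": {"Coin 1": 0.3, "Coin 2": 0.2, "Coin 3": 0.5},
})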
Example #2
def main():
    log("Starting named entity recognition task")

    log("Splitting training set into training and validation sets")
    training, validation = Utils.create_training_validation_split(TRAINING_FILE_PATH)
    log("Writing training and validation sets to file")
    Utils.write_streams_to_file(training, "./input/training.txt")
    Utils.write_streams_to_file(validation, "./input/validation.txt")

    log("Reading training file")
    token_stream, pos_stream, tag_stream = Utils.read_training_file(TRAINING_FILE_PATH)
    log("Replacing low frequency tokens from training set")
    token_stream, closed_vocabulary = Unknown.replace_low_frequency_tokens(token_stream)

    log("Reading test file")
    test_stream = Utils.read_test_file(TEST_FILE_PATH)
    log("Replacing unknown tokens from test set")
    test_stream = Unknown.replace_unknown_tokens(test_stream, closed_vocabulary)

    log("Training most frequent class baseline")
    baseline = MostFrequentClassBaseline(token_stream, tag_stream)
    log("Predicting tags using baseline")
    baseline_predictions = baseline.classify_test_stream(test_stream)
    log("Writing predictions with baseline to file")
    Utils.write_results_to_file(baseline_predictions, "../output/baseline_output.txt")

    log("Training Hidden Markov Model")
    hmm = HiddenMarkovModel(token_stream, tag_stream)
    log("Predicting tags using HMM")
    hmm_predictions = hmm.classify_test_stream(test_stream)
    log("Writing predictions with HMM to file")
    Utils.write_results_to_file(hmm_predictions, "../output/hmm_output.txt")
Example #3
    def test_hmm(self):
        '''A = np.matrix([[0,1,0,0],
                           [0.4,0,0.6,0],
                           [0,0.4,0,0.6],
                           [0,0,0.5,0.5]])
            B = np.matrix([[0.5,0.5],
                           [0.3,0.7],
                           [0.6,0.4],
                           [0.8,0.2]])
            PI = np.matrix([0.25,0.25,0.25,0.25]).transpose()

            #observation result
            O = [0,0,1,1,0]'''

        A = np.matrix([[0.5, 0.2, 0.3],
                       [0.3, 0.5, 0.2],
                       [0.2, 0.3, 0.5]])

        B = np.matrix([[0.5, 0.5],
                       [0.4, 0.6],
                       [0.7, 0.3]])
        PI = np.matrix([0.2, 0.4, 0.4]).transpose()
        O = [0, 1, 0]

        hmm = HiddenMarkovModel(A, B, PI, O)
        prob, path = hmm.viterbi()

        print("The probability for Observation States", str(O), " is ", hmm.forward_backword())
        assert hmm.forward_backword() == 0.130218

        print("The max probs for Observation States", str(O), " is ", prob, " and the hidden state path is ",
              '-'.join(['%s' % id for id in path]))
        assert prob == 0.014699999999999998
        assert path == [3, 3, 3]
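The expected value 0.130218 in the forward_backword assertion above can be reproduced directly; a minimal NumPy sketch of the forward algorithm with the same A, B, PI and O (written for this illustration, independent of the HiddenMarkovModel class under test):

import numpy as np

A = np.array([[0.5, 0.2, 0.3],
              [0.3, 0.5, 0.2],
              [0.2, 0.3, 0.5]])
B = np.array([[0.5, 0.5],
              [0.4, 0.6],
              [0.7, 0.3]])
PI = np.array([0.2, 0.4, 0.4])
O = [0, 1, 0]

alpha = PI * B[:, O[0]]            # initialization: pi_i * b_i(o_1)
for o in O[1:]:
    alpha = (alpha @ A) * B[:, o]  # induction: (sum_i alpha_i a_ij) * b_j(o)
print(alpha.sum())                 # 0.130218, the expected forward probability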
Example #4
	def test_larger_history(self):
		sequence = (('A', 'A'), ('B', 'B'),
					('A', 'A'), ('B', 'B'),
					('A', 'A'), ('B', 'B'))

		model = HiddenMarkovModel(label_history_size=3)
		model.train(sequence, fallback_model=None, use_linear_smoothing=False)

		self.assertEqual(len(model.transition), 7)
		self.assertEqual(len(model.transition['B::A::B']), 2)
		self.assertEqual(model.transition['<START>::<START>::<START>']['<START>::<START>::A'], log(0.5))
		self.assertEqual(model.transition['<START>::<START>::A']['<START>::A::B'], 0.0)
		self.assertEqual(model.transition['<START>::A::B']['A::B::A'], 0.0)
		self.assertEqual(model.transition['B::A::B']['A::B::<STOP>'], log(1.0 / 2.0))
		self.assertEqual(model.transition['A::B::A']['B::A::B'], 0.0)

		self.assertEqual(len(model.reverse_transition), 8)
		self.assertEqual(len(model.reverse_transition['A::B::A']), 2)
		self.assertEqual(model.reverse_transition['<START>::A::B']['<START>::<START>::A'], 0.0)
		self.assertEqual(model.reverse_transition['B::A::B']['A::B::A'], 0.0)
		self.assertEqual(model.reverse_transition['A::B::A']['B::A::B'], log(1.0 / 2.0))

		self.assertEqual(len(model.label_emissions), 4)
		self.assertEqual(len(model.label_emissions['A']), 2)
		self.assertEqual(model.label_emissions['A']['A::B::A'], log(2.0/3.0))

		self.assertEqual(len(model.emission), 8)
		assert all(len(values) == 1 for values in model.emission.itervalues())
		assert all(all(val == 0.0 for val in values.itervalues())
				   for values in model.emission.itervalues())
Example #5
	def test_unk_emission(self):
#		print "Testing UNK emission with emission == label and self-biased transitions: ",

		model = HiddenMarkovModel(label_history_size=1)
		model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

		self.set_defaults(model)
		self.identity_emissions(model)
		self.biased_transitions(model)

		emissions = ['A', 'C', 'A', 'B', 'B']
		labels = ['A', 'A', 'A', 'B', 'B']
		score = log(0.5) * 2 + log(0.25) + log(0.5) + log(0.25) + log(0.5) + log(0.25)

		self._test_label(model, emissions, score, labels=labels)

		emissions = ['A', 'C', 'C', 'B', 'B']
		labels = [['A', 'A', 'A', 'B', 'B'], ['A', 'A', 'B', 'B', 'B'], ['A', 'B', 'B', 'B', 'B']]

		score = None
		for label in labels:
			new_score = model.score(zip(label, emissions))
			if score: self.assertAlmostEqual(score, new_score, 5)#, "score(%s) (%f) bad" % (label, new_score)
			score = new_score

		emissions = ['A', 'C', 'C', 'B', 'B']
		labels = [['A', 'A', 'A', 'B', 'B'], ['A', 'A', 'B', 'B', 'B'], ['A', 'B', 'B', 'B', 'B']]

		score = None
		for label in labels:
			new_score = model.score(zip(label, emissions))
			if score: self.assertAlmostEqual(score, new_score, 5)#, "score(%s) (%f) bad" % (label, new_score)
			score = new_score
Example #6
	def test_alternating_sequence(self):
		sequence = (('A', 'A'), ('B', 'B'),
					('A', 'A'), ('B', 'B'),
					('A', 'A'), ('B', 'B'))

		model = HiddenMarkovModel(label_history_size=2)
		model.train(sequence, fallback_model=None, use_linear_smoothing=False)

		self.assertEqual(len(model.transition), 5)
		self.assertEqual(len(model.transition['A::B']), 2)
		self.assertEqual(model.transition['A::B']['B::A'], log(2.0 / 3.0))
		self.assertEqual(model.transition['B::A']['A::B'], 0.0)
		self.assertEqual(model.transition['<START>::<START>']['<START>::A'], log(0.5))

		self.assertEqual(len(model.reverse_transition), 6)
		self.assertEqual(len(model.reverse_transition['A::B']), 2)
		self.assertEqual(model.reverse_transition['A::B']['B::A'], 0.0)
		self.assertEqual(model.reverse_transition['B::A']['A::B'], log(2.0 / 3.0))

		self.assertEqual(len(model.label_emissions), 4)
		self.assertEqual(len(model.label_emissions['A']), 2)
		self.assertEqual(model.label_emissions['A']['B::A'], log(2.0 / 3.0))

		self.assertEqual(len(model.emission), 6)
		self.assertEqual(len(model.emission['B::A']), 1)
		self.assertEqual(model.emission['B::A']['A'], 0.0)
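The expected value log(2.0 / 3.0) above is pure counting: once the labels are padded with the <START>/<STOP> markers, the bigram history A::B occurs three times and is followed by B::A twice. A standalone sketch of that count (plain Python written for this illustration, not the model's own training code):

from collections import Counter
from math import log

# pad the alternating labels with the start/stop markers used by the model
labels = ['<START>', '<START>', 'A', 'B', 'A', 'B', 'A', 'B', '<STOP>']
histories = ['::'.join(labels[i:i + 2]) for i in range(len(labels) - 1)]
counts = Counter(zip(histories, histories[1:]))

total_from_ab = sum(n for (src, _), n in counts.items() if src == 'A::B')
print(log(counts[('A::B', 'B::A')] / total_from_ab))  # log(2/3)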
Example #7
    def test_alternating_sequence(self):
        sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'),
                    ('B', 'B'))

        model = HiddenMarkovModel(label_history_size=2)
        model.train(sequence, fallback_model=None, use_linear_smoothing=False)

        self.assertEqual(len(model.transition), 5)
        self.assertEqual(len(model.transition['A::B']), 2)
        self.assertEqual(model.transition['A::B']['B::A'], log(2.0 / 3.0))
        self.assertEqual(model.transition['B::A']['A::B'], 0.0)
        self.assertEqual(model.transition['<START>::<START>']['<START>::A'],
                         log(0.5))

        self.assertEqual(len(model.reverse_transition), 6)
        self.assertEqual(len(model.reverse_transition['A::B']), 2)
        self.assertEqual(model.reverse_transition['A::B']['B::A'], 0.0)
        self.assertEqual(model.reverse_transition['B::A']['A::B'],
                         log(2.0 / 3.0))

        self.assertEqual(len(model.label_emissions), 4)
        self.assertEqual(len(model.label_emissions['A']), 2)
        self.assertEqual(model.label_emissions['A']['B::A'], log(2.0 / 3.0))

        self.assertEqual(len(model.emission), 6)
        self.assertEqual(len(model.emission['B::A']), 1)
        self.assertEqual(model.emission['B::A']['A'], 0.0)
Example #8
	def test_three_history_single(self):
		sequence = zip(repeat('A', 6), repeat('A', 6))

		model = HiddenMarkovModel(label_history_size=3)
		model.train(sequence, fallback_model=None, use_linear_smoothing=False)

		self.assertEqual(model.label(list(repeat('A', 3))), list(repeat('A', 3)))
		self.assertEqual(model.label(list(repeat('A', 6))), list(repeat('A', 6)))
Example #9
	def test_three_history_alternating(self):
		alternating = lambda n: [(l, e) for l, e, _ in izip(cycle(('A', 'B')), cycle(('A', 'B')),
															xrange(n))]
		sequence = alternating(6)

		model = HiddenMarkovModel(label_history_size=3)
		model.train(sequence, fallback_model=None, use_linear_smoothing=False)

		self.assertEqual(model.label(alternating(4)), [label for label, _ in alternating(4)])
		self.assertEqual(model.label(alternating(6)), [label for label, _ in alternating(6)])
Example #10
	def test_identity_emission_uniform_transitions(self):
#		print "Testing emission == state w/ uniform transitions chain: ",

		model = HiddenMarkovModel(label_history_size=1)
		model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

		self.set_defaults(model)
		self.uniform_transitions(model)
		self.identity_emissions(model)

		tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'], ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]

		for test in tests:
			self._test_label(model, test, log(1.0 / 2.0) + log(1.0 / 3.0) * 4)
Example #11
    def test_identity_emission_uniform_transitions(self):
        #		print "Testing emission == state w/ uniform transitions chain: ",

        model = HiddenMarkovModel(label_history_size=1)
        model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

        self.set_defaults(model)
        self.uniform_transitions(model)
        self.identity_emissions(model)

        tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'],
                 ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]

        for test in tests:
            self._test_label(model, test, log(1.0 / 2.0) + log(1.0 / 3.0) * 4)
Example #12
    def test_extend_labels_simple(self):
        stream = (('1', 1), ('2', 2), ('3', 3))
        two_extended = [('1', ('<START>', ), 1), ('2', ('1', ), 2),
                        ('3', ('2', ), 3)]

        self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)),
                          two_extended)
Example #13
	def test_extend_labels_three_history(self):
		stream = (('1', 1), ('2', 2), ('3', 3))
		three_extended = [('1', ('<START>', '<START>::<START>'), 1),
						  ('2', ('1', '<START>::1'), 2),
						  ('3', ('2', '1::2'), 3)]

		self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 3)), three_extended)
Example #14
	def test_extend_labels_one_history(self):
		stream = (('1', 1), ('2', 2), ('3', 3))
		one_extended = [('1', tuple(), 1),
						('2', tuple(), 2),
						('3', tuple(), 3)]

		self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 1)), one_extended)
Example #15
    def test_extend_labels_one_history(self):
        stream = (('1', 1), ('2', 2), ('3', 3))
        one_extended = [('1', tuple(), 1), ('2', tuple(), 2),
                        ('3', tuple(), 3)]

        self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 1)),
                          one_extended)
Example #16
	def test_even_larger_history(self):
		sequence = (('A', 'A'), ('B', 'B'),
					('A', 'A'), ('B', 'B'),
					('A', 'A'), ('B', 'B'))

		model = HiddenMarkovModel(label_history_size=4)
		model.train(sequence, fallback_model=None, use_linear_smoothing=False)

		self.assertEqual(len(model.transition), 9)
		self.assertEqual(len(model.transition['<START>::A::B::A']), 1)
		self.assertEqual(len(model.transition['A::B::A::B']), 2)
		self.assertEqual(model.transition['<START>::<START>::<START>::<START>']['<START>::<START>::<START>::A'], log(0.5))
		self.assertEqual(model.transition['<START>::<START>::<START>::A']['<START>::<START>::A::B'], 0.0)
		self.assertEqual(model.transition['<START>::<START>::A::B']['<START>::A::B::A'], 0.0)
		self.assertEqual(model.transition['<START>::A::B::A']['A::B::A::B'], 0.0)
		self.assertEqual(model.transition['A::B::A::B']['B::A::B::A'], log(0.5))
Example #17
	def test_identity_emissions_non_uniform_transitions(self):
#		print "Testing emissions == labels with non-uniform transitions chain: ",

		model = HiddenMarkovModel(label_history_size=1)
		model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

		self.set_defaults(model)
		self.biased_transitions(model)
		self.identity_emissions(model)

		tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'], ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]
		scores = [log(0.5) * 4 + log(0.25), log(0.5) * 4 + log(0.25), log(0.5)*3 + log(0.25)*2, log(0.5)*2 + log(0.25)*3]
		scored_tests = zip(tests, scores)

		for test, score in scored_tests:
			self._test_label(model, test, score)
Example #18
	def test_extend_labels_simple(self):
		stream = (('1', 1), ('2', 2), ('3', 3))
		two_extended = [('1', ('<START>',), 1),
						('2', ('1',), 2),
						('3', ('2',), 3)]

		self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)), two_extended)
Example #19
	def test_biased_emissions_uniform_transitions(self):
#		print "Testing uniform transitions with self-biased emissions: ",

		model = HiddenMarkovModel(label_history_size=1)
		model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

		self.set_defaults(model)
		self.uniform_transitions(model)
		self.biased_emissions(model)

		tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'], ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]
		scores = [log(0.5) + log(1.0 / 3.0) * 4.0 + 6.0 * log(2.0 / 3.0) for i in xrange(4)]
		scored_tests = zip(tests, scores)

		for test, score in scored_tests:
			self._test_label(model, test, score)
Example #20
	def test_biased_emissions_biased_transitions(self):
#		print "Testing self-biased transitions with self-biased emissions: ",

		model = HiddenMarkovModel(label_history_size=1)
		model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

		self.set_defaults(model)
		self.biased_transitions(model)
		self.biased_emissions(model)

		tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'], ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]
		scores = [log(0.5) * 4 + log(0.25), log(0.5) * 4 + log(0.25), log(0.5)*3 + log(0.25)*2, log(0.5)*2 + log(0.25)*3]
		scores = [6.0 * log(2.0 / 3.0) + score for score in scores]
		scored_tests = zip(tests, scores)

		for test, score in scored_tests:
			self._test_label(model, test, score)
Example #21
def simple_hmm():
    A = np.array([[0.9, 0.1], [0.4, 0.6]])
    B = np.array([[0.9, 0.1], [0.2, 0.8]])
    pi = np.array([0.8, 0.2])

    model = HiddenMarkovModel(A, B, pi)
    emissions = np.array([0, 1])
    return (model, emissions)
Example #22
    def test_extend_labels_three_history(self):
        stream = (('1', 1), ('2', 2), ('3', 3))
        three_extended = [('1', ('<START>', '<START>::<START>'), 1),
                          ('2', ('1', '<START>::1'), 2),
                          ('3', ('2', '1::2'), 3)]

        self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 3)),
                          three_extended)
Example #23
def make_model(states = 10, outputs = 5):
    model = HiddenMarkovModel()

    for state in range(states):
        model.add_initial_state(state, 1 / states)
        for output in range(outputs):
            model.add_emission(state, output, 1 / outputs)

    for state in range(states):
        for to_state in range(states):
            model.add_transition(state, to_state, 1 / states)

    model.normalize()
    return model
Example #24
	def test_extend_labels_multiple_sentences(self):
		stream = (('1', 1), ('2', 2), ('<STOP>', '<STOP>'), ('<START>', '<START>'), ('1', 1))
		two_extended = [('1', ('<START>',), 1),
						('2', ('1',), 2),
						('<STOP>', ('2',), '<STOP>'),
						('<START>', ('<START>',), '<START>'),
						('1', ('<START>',), 1)]

		self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)), two_extended)
Example #25
    def setUp(self):
        self.states = ["rainy", "sunny"]
        self.vocabulary = ["walk", "shop", "clean"]

        self.initial_probabilities = {
            "rainy": 0.6,
            "sunny": 0.4,
        }

        # The probability of moving from state a to state b
        # such that the sum of all the probabilities = 1
        self.transition_probabilities = {
            "rainy": {
                "rainy": 0.7,
                "sunny": 0.3,
            },
            "sunny": {
                "rainy": 0.4,
                "sunny": 0.6,
            },
        }

        # the probability of the observation O being generated from the state q
        self.emission_probabilities = {
            "rainy": {
                "walk": 0.1,
                "shop": 0.4,
                "clean": 0.5,
            },
            "sunny": {
                "walk": 0.6,
                "shop": 0.3,
                "clean": 0.1,
            }
        }

        self.hmm = HiddenMarkovModel(
            self.states,  # all the possible hidden states
            self.vocabulary,  # all possible observation types
            self.transition_probabilities,
            self.emission_probabilities,
            self.initial_probabilities
        )
Example #26
    def test_extend_labels_multiple_sentences(self):
        stream = (('1', 1), ('2', 2), ('<STOP>', '<STOP>'),
                  ('<START>', '<START>'), ('1', 1))
        two_extended = [('1', ('<START>', ), 1), ('2', ('1', ), 2),
                        ('<STOP>', ('2', ), '<STOP>'),
                        ('<START>', ('<START>', ), '<START>'),
                        ('1', ('<START>', ), 1)]

        self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 2)),
                          two_extended)
Example #27
    def test_unk_emission(self):
        #		print "Testing UNK emission with emission == label and self-biased transitions: ",

        model = HiddenMarkovModel(label_history_size=1)
        model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

        self.set_defaults(model)
        self.identity_emissions(model)
        self.biased_transitions(model)

        emissions = ['A', 'C', 'A', 'B', 'B']
        labels = ['A', 'A', 'A', 'B', 'B']
        score = log(0.5) * 2 + log(0.25) + log(0.5) + log(0.25) + log(
            0.5) + log(0.25)

        self._test_label(model, emissions, score, labels=labels)

        emissions = ['A', 'C', 'C', 'B', 'B']
        labels = [['A', 'A', 'A', 'B', 'B'], ['A', 'A', 'B', 'B', 'B'],
                  ['A', 'B', 'B', 'B', 'B']]

        score = None
        for label in labels:
            new_score = model.score(zip(label, emissions))
            if score:
                self.assertAlmostEqual(
                    score, new_score,
                    5)  #, "score(%s) (%f) bad" % (label, new_score)
            score = new_score

        emissions = ['A', 'C', 'C', 'B', 'B']
        labels = [['A', 'A', 'A', 'B', 'B'], ['A', 'A', 'B', 'B', 'B'],
                  ['A', 'B', 'B', 'B', 'B']]

        score = None
        for label in labels:
            new_score = model.score(zip(label, emissions))
            if score:
                self.assertAlmostEqual(
                    score, new_score,
                    5)  #, "score(%s) (%f) bad" % (label, new_score)
            score = new_score
Example #28
	def test_extend_labels_longer_history_than_sentence(self):
		stream = (('1', 1), ('2', 2))
		five_extended = [('1', ('<START>',
								'<START>::<START>',
								'<START>::<START>::<START>',
								'<START>::<START>::<START>::<START>'), 1),
						 ('2', ('1', '<START>::1',
								'<START>::<START>::1',
								'<START>::<START>::<START>::1'), 2)]

		self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 5)), five_extended)
Example #29
    def test_extend_labels_longer_history_than_sentence(self):
        stream = (('1', 1), ('2', 2))
        five_extended = [
            ('1', ('<START>', '<START>::<START>', '<START>::<START>::<START>',
                   '<START>::<START>::<START>::<START>'), 1),
            ('2', ('1', '<START>::1', '<START>::<START>::1',
                   '<START>::<START>::<START>::1'), 2)
        ]

        self.assertEquals(list(HiddenMarkovModel._extend_labels(stream, 5)),
                          five_extended)
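Taken together, the _extend_labels tests above (Examples #12-#15, #22, #24, #26, #28, #29) pin down the method's contract: each (label, emission) pair gains a tuple of '::'-joined label histories of lengths 1 through history_size - 1, padded with <START> and reset whenever a <START> label is encountered. A standalone generator reproducing those expected outputs (a sketch inferred from the tests, not the library's actual implementation):

def extend_labels(stream, history_size):
    # keep the last history_size - 1 labels, seeded with <START> padding
    history = ['<START>'] * (history_size - 1)
    for label, emission in stream:
        if label == '<START>':  # sentence boundary: reset the history
            history = ['<START>'] * (history_size - 1)
        extensions = tuple('::'.join(history[len(history) - k:])
                           for k in range(1, history_size))
        yield (label, extensions, emission)
        if history:
            history = history[1:] + [label]

print(list(extend_labels((('1', 1), ('2', 2), ('3', 3)), 3)))
# [('1', ('<START>', '<START>::<START>'), 1), ('2', ('1', '<START>::1'), 2), ('3', ('2', '1::2'), 3)]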
Example #30
    def test_biased_emissions_uniform_transitions(self):
        #		print "Testing uniform transitions with self-biased emissions: ",

        model = HiddenMarkovModel(label_history_size=1)
        model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

        self.set_defaults(model)
        self.uniform_transitions(model)
        self.biased_emissions(model)

        tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'],
                 ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]
        scores = [
            log(0.5) + log(1.0 / 3.0) * 4.0 + 6.0 * log(2.0 / 3.0)
            for i in xrange(4)
        ]
        scored_tests = zip(tests, scores)

        for test, score in scored_tests:
            self._test_label(model, test, score)
Example #31
def fit(sc,
        data,
        model,
        stop_threshold=1E-9,
        min_iterations=0,
        max_iterations=1000):
    """

    fits HIddenMarkovModel (model) using baum welch to a set of data (data) in parrallel using SparkContext (sc)
    until either a threshold (stop_threshold) is met or at least min_iterations are run.
    max_iterations is used to stop the fitting process.


    :param sc: sparkContext
    :param data: data to be fitted
    :param model: HiddenMarkovModel instance
    :param stop_threshold:
    :param min_iterations: minimum number of iterations
    :param max_iterations: maximum number of iterations

    :return: HiddenMarkovModel instances fitted to the data. Use model to extract estimated model parameters
    """
    def fit_worker(batch):
        bwelch = BaumWelchBatch(model.loga, model.logb, model.logpi)
        for d in batch:
            bwelch.fit_sequence(d)
        return [bwelch]

    p_data = sc.parallelize(data)
    log_prob_sum = p_data.map(
        model.observation_log_probability).reduce(lambda x, y: x + y)
    iteration = 0
    improvement = float('inf')
    new_model = None

    while improvement > stop_threshold or iteration < min_iterations + 1:
        s = time.time()
        batches = p_data.mapPartitions(fit_worker).collect()
        logger.info(f'collected {len(batches)} batches')
        new_model = HiddenMarkovModel.from_batches(batches)
        new_log_prob_sum = p_data.map(
            new_model.observation_log_probability).reduce(lambda x, y: x + y)
        improvement = new_log_prob_sum - log_prob_sum
        e = time.time()
        logger.info(f'took {e - s}')
        logger.info(f'improvement = {improvement:.5f}')
        log_prob_sum = new_log_prob_sum
        if iteration >= max_iterations:
            break
        iteration += 1

    return new_model
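A minimal usage sketch under assumed names (a live SparkContext sc, a list of observation sequences data, and an already initialized HiddenMarkovModel); Example #42 further below shows a full end-to-end invocation:

# hypothetical setup: sc, data and model must already exist (see Example #42)
fitted = fit(sc, data, model, stop_threshold=1e-6, min_iterations=5, max_iterations=200)
# the fitted model carries the estimated log-parameters used by fit_worker
print(fitted.loga, fitted.logb, fitted.logpi)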
Example #32
    def test_identity_emissions_non_uniform_transitions(self):
        #		print "Testing emissions == labels with non-uniform transitions chain: ",

        model = HiddenMarkovModel(label_history_size=1)
        model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

        self.set_defaults(model)
        self.biased_transitions(model)
        self.identity_emissions(model)

        tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'],
                 ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]
        scores = [
            log(0.5) * 4 + log(0.25),
            log(0.5) * 4 + log(0.25),
            log(0.5) * 3 + log(0.25) * 2,
            log(0.5) * 2 + log(0.25) * 3
        ]
        scored_tests = zip(tests, scores)

        for test, score in scored_tests:
            self._test_label(model, test, score)
Example #33
    def test_even_larger_history(self):
        sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'),
                    ('B', 'B'))

        model = HiddenMarkovModel(label_history_size=4)
        model.train(sequence, fallback_model=None, use_linear_smoothing=False)

        self.assertEqual(len(model.transition), 9)
        self.assertEqual(len(model.transition['<START>::A::B::A']), 1)
        self.assertEqual(len(model.transition['A::B::A::B']), 2)
        self.assertEqual(
            model.transition['<START>::<START>::<START>::<START>']
            ['<START>::<START>::<START>::A'], log(0.5))
        self.assertEqual(
            model.transition['<START>::<START>::<START>::A']
            ['<START>::<START>::A::B'], 0.0)
        self.assertEqual(
            model.transition['<START>::<START>::A::B']['<START>::A::B::A'],
            0.0)
        self.assertEqual(model.transition['<START>::A::B::A']['A::B::A::B'],
                         0.0)
        self.assertEqual(model.transition['A::B::A::B']['B::A::B::A'],
                         log(0.5))
Example #34
	def test_simple_sequence(self):
		sequence = (('A', 'A'), ('A', 'A'),
					('A', 'A'), ('A', 'A'),
					('A', 'A'), ('A', 'A'))

		model = HiddenMarkovModel(label_history_size=2)
		model.train(sequence, fallback_model=None, use_linear_smoothing=False)

		self.assertEqual(len(model.transition), 4)
		self.assertEqual(len(model.transition['A::A']), 2)
		self.assertEqual(model.transition['A::A']['A::A'], log(4.0 / 5.0))

		self.assertEqual(len(model.reverse_transition), 5)
		self.assertEqual(len(model.reverse_transition['A::A']), 2)
		self.assertEqual(model.reverse_transition['A::A']['A::A'], log(4.0 / 5.0))

		self.assertEqual(len(model.emission), 5)
		self.assertEqual(len(model.emission['A::A']), 1)
		self.assertEqual(model.emission['A::A']['A'], 0.0)

		self.assertEqual(len(model.label_emissions), 3)
		self.assertEqual(len(model.label_emissions['A']), 2)
		self.assertEqual(model.label_emissions['A']['A::A'], log(5.0 / 6.0))
Example #35
    def test_simple_sequence(self):
        sequence = (('A', 'A'), ('A', 'A'), ('A', 'A'), ('A', 'A'), ('A', 'A'),
                    ('A', 'A'))

        model = HiddenMarkovModel(label_history_size=2)
        model.train(sequence, fallback_model=None, use_linear_smoothing=False)

        self.assertEqual(len(model.transition), 4)
        self.assertEqual(len(model.transition['A::A']), 2)
        self.assertEqual(model.transition['A::A']['A::A'], log(4.0 / 5.0))

        self.assertEqual(len(model.reverse_transition), 5)
        self.assertEqual(len(model.reverse_transition['A::A']), 2)
        self.assertEqual(model.reverse_transition['A::A']['A::A'],
                         log(4.0 / 5.0))

        self.assertEqual(len(model.emission), 5)
        self.assertEqual(len(model.emission['A::A']), 1)
        self.assertEqual(model.emission['A::A']['A'], 0.0)

        self.assertEqual(len(model.label_emissions), 3)
        self.assertEqual(len(model.label_emissions['A']), 2)
        self.assertEqual(model.label_emissions['A']['A::A'], log(5.0 / 6.0))
Example #36
    def test_biased_emissions_biased_transitions(self):
        #		print "Testing self-biased transitions with self-biased emissions: ",

        model = HiddenMarkovModel(label_history_size=1)
        model.labels = ('A', 'B', START_LABEL, STOP_LABEL)

        self.set_defaults(model)
        self.biased_transitions(model)
        self.biased_emissions(model)

        tests = [['A', 'A', 'A', 'A'], ['B', 'B', 'B', 'B'],
                 ['A', 'A', 'B', 'B'], ['B', 'A', 'B', 'B']]
        scores = [
            log(0.5) * 4 + log(0.25),
            log(0.5) * 4 + log(0.25),
            log(0.5) * 3 + log(0.25) * 2,
            log(0.5) * 2 + log(0.25) * 3
        ]
        scores = [6.0 * log(2.0 / 3.0) + score for score in scores]
        scored_tests = zip(tests, scores)

        for test, score in scored_tests:
            self._test_label(model, test, score)
Example #37
    def test_larger_history(self):
        sequence = (('A', 'A'), ('B', 'B'), ('A', 'A'), ('B', 'B'), ('A', 'A'),
                    ('B', 'B'))

        model = HiddenMarkovModel(label_history_size=3)
        model.train(sequence, fallback_model=None, use_linear_smoothing=False)

        self.assertEqual(len(model.transition), 7)
        self.assertEqual(len(model.transition['B::A::B']), 2)
        self.assertEqual(
            model.transition['<START>::<START>::<START>']
            ['<START>::<START>::A'], log(0.5))
        self.assertEqual(
            model.transition['<START>::<START>::A']['<START>::A::B'], 0.0)
        self.assertEqual(model.transition['<START>::A::B']['A::B::A'], 0.0)
        self.assertEqual(model.transition['B::A::B']['A::B::<STOP>'],
                         log(1.0 / 2.0))
        self.assertEqual(model.transition['A::B::A']['B::A::B'], 0.0)

        self.assertEqual(len(model.reverse_transition), 8)
        self.assertEqual(len(model.reverse_transition['A::B::A']), 2)
        self.assertEqual(
            model.reverse_transition['<START>::A::B']['<START>::<START>::A'],
            0.0)
        self.assertEqual(model.reverse_transition['B::A::B']['A::B::A'], 0.0)
        self.assertEqual(model.reverse_transition['A::B::A']['B::A::B'],
                         log(1.0 / 2.0))

        self.assertEqual(len(model.label_emissions), 4)
        self.assertEqual(len(model.label_emissions['A']), 2)
        self.assertEqual(model.label_emissions['A']['A::B::A'], log(2.0 / 3.0))

        self.assertEqual(len(model.emission), 8)
        assert all(len(values) == 1 for values in model.emission.itervalues())
        assert all(
            all(val == 0.0 for val in values.itervalues())
            for values in model.emission.itervalues())
Example #38
    def test_three_history_single(self):
        sequence = zip(repeat('A', 6), repeat('A', 6))

        model = HiddenMarkovModel(label_history_size=3)
        model.train(sequence, fallback_model=None, use_linear_smoothing=False)

        self.assertEqual(model.label(list(repeat('A', 3))),
                         list(repeat('A', 3)))
        self.assertEqual(model.label(list(repeat('A', 6))),
                         list(repeat('A', 6)))
Example #39
    def test_three_history_alternating(self):
        alternating = lambda n: [(l, e)
                                 for l, e, _ in izip(cycle(
                                     ('A', 'B')), cycle(
                                         ('A', 'B')), xrange(n))]
        sequence = alternating(6)

        model = HiddenMarkovModel(label_history_size=3)
        model.train(sequence, fallback_model=None, use_linear_smoothing=False)

        self.assertEqual(model.label(alternating(4)),
                         [label for label, _ in alternating(4)])
        self.assertEqual(model.label(alternating(6)),
                         [label for label, _ in alternating(6)])
Example #40
 def create_hidden_markov_model(self):
     if self.emission_mode == "projection":
         assert hasattr(self, "emission_probability")
         self.hidden_markov_model = HiddenMarkovModel(
             self.transition_probability,
             self.emission_probability,
             self.initial_probability,
             self.state_space.states,
             update_matrix=self.update_matrix,
             enable_warnings=True)
     elif self.emission_mode == "gaussian":
         assert hasattr(self, "mean") and hasattr(self, "covariance")
         self.hidden_markov_model = GaussianHiddenMarkovModel(
             self.transition_probability,
             self.initial_probability,
             self.state_space.states,
             self.mean,
             self.covariance,
             update_matrix=self.update_matrix,
             enable_warnings=True)
Example #41
    # use testing parameters in dev mode
    if args.dev:
        alphabet = 'abdi#'
        words = ['babi#', 'dida#']

    # for #3, we run the training program 20 times to look for local maxima
    if args.mode == 'loop':
        print "Beginning iteration for Local Maxima question..."

        for i in xrange(1, 21):
            print "Local Maxima question: iteration %s" % i

            outfile = open('results/result%s.txt' % i, 'w')

            hmm = HiddenMarkovModel(numstates, alphabet)
            dump_state(outfile, hmm, 'Initial State')

            hmm.train(words, iterations=100, verbose=args.verbose)
            dump_state(outfile, hmm, 'Iteration Summary')

        print "Completed Local Maxima question."

    # for #4 we train and then compute the viterbi path for each word
    # we store each word-path pair in a dict and then pass that dict
    # to a function that handles output
    if args.mode == 'viterbi':
        hmm = HiddenMarkovModel(numstates, alphabet)
        dump_state(args.outfile, hmm, 'Initial State')

        hmm.train(words, iterations=100, verbose=args.verbose)
Example #42
def main():
    spark = (SparkSession.builder.master("local[*]").config(
        "spark.executor.memory",
        "30g").config("spark.driver.memory", "30g").config(
            "spark.driver.maxResultSize",
            "30g").config("spark.memory.offHeap.enabled", True).config(
                "spark.memory.offHeap.size",
                "16g").appName("sampleCodeForReference").getOrCreate())

    sc = spark.sparkContext

    test_data = pickle.load(
        open('/Users/locojay/PycharmProjects/dthmm/tutorials/testdata2.p',
             'rb'))

    A = [[0.17299125, 0.08781199, 0.24904337, 0.49015339],
         [0.65466035, 0.0058856, 0.24847472, 0.09097933],
         [0.43406668, 0.09507003, 0.24143807, 0.22942522],
         [0.00310297, 0.41726041, 0.27046179, 0.30917482]]

    B = [[
        0.0248371, 0.00647766, 0.02919312, 0.02010902, 0.01741969, 0.03026002,
        0.01107451, 0.03090185, 0.02000882, 0.02946754, 0.0329583, 0.02810143,
        0.00973118, 0.01286111, 0.03036823, 0.03451904, 0.01301527, 0.03176073,
        0.02069127, 0.0391591, 0.03724013, 0.01681755, 0.02387927, 0.01267418,
        0.01405466, 0.00182615, 0.00099688, 0.02921965, 0.02068266, 0.00459763,
        0.03083269, 0.02294538, 0.00748594, 0.0318249, 0.01643839, 0.03030681,
        0.00853397, 0.02212386, 0.02451805, 0.01147829, 0.01860806, 0.01689099,
        0.01947854, 0.00456117, 0.01985139, 0.02348703, 0.02722838, 0.02259387,
        0.00460825, 0.00130027
    ],
         [
             0.00118511, 0.0364538, 0.00539255, 0.02931715, 0.00712114,
             0.02613686, 0.02025734, 0.00856556, 0.01788003, 0.02696186,
             0.03206167, 0.02082036, 0.02027708, 0.0363248, 0.01253547,
             0.02536659, 0.0303423, 0.00161272, 0.02162873, 0.0211614,
             0.01741675, 0.01470692, 0.032151, 0.03228765, 0.03237699,
             0.0370071, 0.01195834, 0.02739508, 0.01974688, 0.01438907,
             0.00741205, 0.02553209, 0.00501492, 0.02914962, 0.01528311,
             0.02546899, 0.01965691, 0.00166134, 0.0146325, 0.03175253,
             0.00425995, 0.02717155, 0.02544106, 0.03355649, 0.02468158,
             0.00874545, 0.01172551, 0.02154314, 0.00843848, 0.01803447
         ],
         [
             0.0296181, 0.0348821, 0.02564371, 0.02800763, 0.01551197,
             0.02558589, 0.03501015, 0.01300263, 0.01266429, 0.03546458,
             0.00678947, 0.01032237, 0.03453364, 0.02323215, 0.01534716,
             0.03644205, 0.02687086, 0.02292363, 0.00105033, 0.0289615,
             0.02795536, 0.03250376, 0.02837804, 0.01249522, 0.02217764,
             0.02628832, 0.00928285, 0.00739886, 0.03279007, 0.00722151,
             0.00053051, 0.01206393, 0.01819556, 0.00779652, 0.02419107,
             0.00798948, 0.00664281, 0.02770423, 0.0339964, 0.01410592,
             0.01401967, 0.03120296, 0.02565983, 0.01024386, 0.01415742,
             0.00839726, 0.01779137, 0.02100865, 0.02521129, 0.01073536
         ],
         [
             0.01471172, 0.02670568, 0.01813862, 0.03895738, 0.0074108,
             0.00734445, 0.02980466, 0.0244879, 0.00582519, 0.0089145,
             0.00959946, 0.02949902, 0.01730438, 0.00265082, 0.00898055,
             0.00310906, 0.02095744, 0.02549341, 0.00517031, 0.01065439,
             0.03255066, 0.03373455, 0.00429001, 0.0298808, 0.03904555,
             0.00203563, 0.0188991, 0.02278372, 0.02672836, 0.01151306,
             0.01512417, 0.03303694, 0.03390606, 0.02449836, 0.01443768,
             0.0127056, 0.03821532, 0.01233168, 0.00493174, 0.03505321,
             0.03774991, 0.03070529, 0.02777502, 0.00753259, 0.02052302,
             0.02192132, 0.00473921, 0.03786516, 0.03214382, 0.01762273
         ]]

    pi = [0.1785823, 0.20446237, 0.26092583, 0.3560295]
    model = HiddenMarkovModel(A, B, pi)
    fit(sc, test_data, model)
Example #43
if __name__ == '__main__':
    data = read_csv("Shakespeare_data.csv")

    data.dropna(axis = 'columns', how = 'any', inplace = True)

    text = [_ for _ in data['PlayerLine']]

    corpus = [textCleaner(i) for i in text]
    '''
    Used to train model-
    hmm_model = HiddenMarkovModel(hiddenStates = 5)
    hmm_model.trainer(corpus,filName='model.pickle')
    print("Model complete")
    '''

    trans, ems, initials = HiddenMarkovModel.load('model.pickle')

    currentModel = HiddenMarkovModel(hiddenStates=5, transProbs=trans, emissionProbs=ems, initialProbs=initials)

    print('1- Generate text\n2- Predict text')

    inNum = input()

    if inNum == '1':
        print("Number of words to be generated: ")
        currentModel.generator(int(input()))

    elif inNum == '2':
        print("Input sequences of words to predict on: ")
        textIn = str(input())
Example #44
class HmmCoinsTest(unittest.TestCase):
    def setUp(self):
        self.states = ["Coin 1", "Coin 2", "Coin 3"]
        self.vocabulary = ["Heads", "Tails"]

        self.initial_probabilities = {
            "Coin 1": 0.4,
            "Coin 2": 0.3,
            "Coin 3": 0.3
        }

        # The probability of moving from state a to state b
        # such that the sum of all the probabilities = 1
        self.transition_probabilities = {
            "Coin 1": {"Coin 1": 0.6, "Coin 2": 0.3, "Coin 3": 0.1},
            "Coin 2": {"Coin 1": 0.2, "Coin 2": 0.5, "Coin 3": 0.3},
            "Coin 3": {"Coin 1": 0.3, "Coin 2": 0.2, "Coin 3": 0.5}
        }

        # the probability of the observation O being generated from the state q
        self.emission_probabilities = {
            "Coin 1": {"Heads": 0.7, "Tails": 0.3},
            "Coin 2": {"Heads": 0.3, "Tails": 0.7},
            "Coin 3": {"Heads": 0.5, "Tails": 0.5}
        }

        self.hmm = HiddenMarkovModel(
            self.states,  # all the possible hidden states
            self.vocabulary,  # all possible observation types
            self.transition_probabilities,
            self.emission_probabilities,
            self.initial_probabilities
        )

    def test_forward(self):
        observations = ["Heads", "Heads", "Heads"]
        P, forwards = self.hmm.forward(observations)
        self.assertEqual(P, 0.14533999999999997)
        self.assertEqual(
            forwards[0], [0.27999999999999997, 0.16169999999999998, 0.08824199999999997])
        self.assertEqual(forwards[1], [0.09, 0.0477, 0.025607999999999995])

    def test_backward(self):
        observations = ["Heads", "Heads", "Heads"]
        P, backwards = self.hmm.backward(observations)
        self.assertEqual(P, 0.14534)
        self.assertEqual(backwards[0],  [0.30080000000000007, 0.56, 1.0])
        self.assertEqual(backwards[1], [0.2224, 0.43999999999999995, 1.0])

    def test_viterbi(self):
        observations = ["Heads", "Heads", "Heads"]
        P, backpoints = self.hmm.viterbi(observations)
        self.assertEqual(P, 0.049391999999999985)
        self.assertEqual(backpoints, ['Coin 1', 'Coin 1', 'Coin 1'])

    def test_forward_backward(self):
        observations = ["Heads", "Heads", "Heads"]
        old_prediction = self.hmm.viterbi(observations)
        new_hmm = self.hmm.forward_backward(observations)
        prediction = new_hmm.viterbi(observations[:3])

        # we should see an increase in the probability output
        assert old_prediction[0] < prediction[0]
        self.assertEqual(prediction, (0.10346810696178374,
                                      ['Coin 1', 'Coin 1', 'Coin 1']))
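The expected forward probability 0.14533999999999997 in test_forward above can be reproduced outside the class under test; a minimal NumPy sketch of the forward recursion with the same coin parameters (the matrix encoding, states ordered Coin 1..3, is an assumption of this illustration):

import numpy as np

A = np.array([[0.6, 0.3, 0.1],       # Coin 1 -> Coin 1/2/3
              [0.2, 0.5, 0.3],       # Coin 2 -> Coin 1/2/3
              [0.3, 0.2, 0.5]])      # Coin 3 -> Coin 1/2/3
b_heads = np.array([0.7, 0.3, 0.5])  # P(Heads | coin)
pi = np.array([0.4, 0.3, 0.3])

alpha = pi * b_heads                 # t = 1
for _ in range(2):                   # two more "Heads" observations
    alpha = (alpha @ A) * b_heads
print(alpha.sum())                   # ~0.14534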
Example #45
# Script to get the word level accuracy of the HMM trained on pos_train.txt
from hmm import HiddenMarkovModel
h = HiddenMarkovModel(supervised=False)
h.train()
x = h.eval('./pos_test.txt')
print(x)
Example #46
class HmmWeatherTest(unittest.TestCase):
    def setUp(self):
        self.states = ["rainy", "sunny"]
        self.vocabulary = ["walk", "shop", "clean"]

        self.initial_probabilities = {
            "rainy": 0.6,
            "sunny": 0.4,
        }

        # The probability of moving from state a to state b
        # such that the sum of all the probabilities = 1
        self.transition_probabilities = {
            "rainy": {
                "rainy": 0.7,
                "sunny": 0.3,
            },
            "sunny": {
                "rainy": 0.4,
                "sunny": 0.6,
            },
        }

        # the probability of the observation O being generated from the state q
        self.emission_probabilities = {
            "rainy": {
                "walk": 0.1,
                "shop": 0.4,
                "clean": 0.5,
            },
            "sunny": {
                "walk": 0.6,
                "shop": 0.3,
                "clean": 0.1,
            }
        }

        self.hmm = HiddenMarkovModel(
            self.states,  # all the possible hidden states
            self.vocabulary,  # all possible observation types
            self.transition_probabilities,
            self.emission_probabilities,
            self.initial_probabilities
        )

    def test_forward(self):
        observations = ["walk", "shop", "clean"]
        P, forwards = self.hmm.forward(observations)
        self.assertEqual(P, 0.033612)
        self.assertEqual(forwards[0], [0.06, 0.055200000000000006, 0.02904])
        self.assertEqual(forwards[1], [0.24, 0.04859999999999999, 0.004572])

    def test_backward(self):
        observations = ["walk", "shop", "clean"]
        P, backwards = self.hmm.backward(observations)
        self.assertEqual(P, 0.033612)
        self.assertEqual(backwards[0], [0.1298, 0.38, 1.0])
        self.assertEqual(backwards[1], [0.10760000000000002, 0.26, 1.0])

    def test_viterbi(self):
        observations = ["walk", "shop", "clean"]
        P, backpoints = self.hmm.viterbi(observations)
        self.assertEqual(P, 0.01344)
        self.assertEqual(backpoints, ['sunny', 'rainy', 'rainy'])

    def test_forward_backward(self):
        observations = ["walk", "shop", "clean"]
        new_hmm = self.hmm.forward_backward(observations)
        prediction = new_hmm.viterbi(observations[:3])
        self.assertEqual(prediction, (0.010994296643152459,
                                      ['sunny', 'rainy', 'rainy']))
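The Viterbi expectations above (P = 0.01344, path sunny-rainy-rainy) follow from the max-product recursion; a minimal NumPy sketch with the same weather parameters (encoding the observations as indices is an assumption of this illustration):

import numpy as np

A = np.array([[0.7, 0.3],        # rainy -> rainy/sunny
              [0.4, 0.6]])       # sunny -> rainy/sunny
B = np.array([[0.1, 0.4, 0.5],   # rainy: walk/shop/clean
              [0.6, 0.3, 0.1]])  # sunny: walk/shop/clean
pi = np.array([0.6, 0.4])
obs = [0, 1, 2]                  # walk, shop, clean

delta = pi * B[:, obs[0]]
back = []
for o in obs[1:]:
    trans = delta[:, None] * A           # delta_i * a_ij
    back.append(trans.argmax(axis=0))    # best predecessor per state
    delta = trans.max(axis=0) * B[:, o]

# backtrack from the best final state
state = int(delta.argmax())
path = [state]
for ptr in reversed(back):
    state = int(ptr[state])
    path.insert(0, state)
print(delta.max(), path)  # 0.01344, [1, 0, 0] == sunny, rainy, rainy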
Example #47
# Script to get word level accuracy of the hidden markov model with the most probable
# tag counts
from hmm import HiddenMarkovModel
h = HiddenMarkovModel(supervised=True)
x = h.eval('./pos_test.txt')
print(x)
Example #48
    def __init__(self, trainpath, validpaths, max_trans=0, vnodes=False, hypotest=None, saveplots=False):
        assert isinstance(trainpath, str)
        assert len(trainpath)
        assert isinstance(validpaths, list)
        for path in validpaths:
            assert isinstance(path, str)
            assert len(path)
        assert isinstance(max_trans, int)
        assert max_trans >= 0
        assert isinstance(vnodes, bool)
        assert isinstance(saveplots, bool)

        self._modname = 'base'
        if vnodes:
            self._modname = 'vnode'
        if hypotest:
            self._modname += '+hypo'
        
        self._savepathbase = None
        if saveplots:
            self._savepathbase = "./%s-plots-%s-maxtrans%d/" % (trainpath, self._modname, max_trans)
            os.mkdir(self._savepathbase)

        # load datasets
        self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, \
                                             vnodes=vnodes)

        if hypotest:
            #statespace, transitions = self._trainset.admissible_statespace_transitions()
            #print "orig:\n", statespace, transitions
            statespace, transitions = self._trainset.hypotest_admissible_statespace_transitions(hypotest)
            #print "hypotest:\n", statespace, transitions
            self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, vnodes=vnodes, \
                                             admissible_statespace=statespace, \
                                             admissible_transitions=transitions)

        statespace, transitions = self._trainset.admissible_statespace_transitions()
        #self._trainset.dump()
        #print "trainset:\n", statespace, transitions

        if not self._trainset.outseq_len():
            print "warning, TransitionModel.__init__, empty trainset at path %s" % trainpath


        _FILTER_TRANS = 1
        self._validsets = []
        for validpath in validpaths:
            if _FILTER_TRANS:
                # filter trans
                dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes, \
                                              admissible_statespace=statespace, \
                                              admissible_transitions=transitions)
            else:
                dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes)

            if not dataset.outseq_len():
                print "TransitionModel, skipping empty validation dataset at path %s" % validpath
                continue
            self._validsets.append(dataset)

        # define the initial set of states and the set of output symbols
        self._init_states = sorted(self._trainset.states())
        self._N = len(self._init_states)
        self._symbols = sorted(self._trainset.symbols())
        self._M = len(self._symbols)

        self._symbols_ext = copy.copy(self._symbols)        
        if not _FILTER_TRANS:

              symbols = set()
              for dataset in self._validsets:
                  symbols.update(dataset.symbols())
                  
              print "symbols", symbols
              for sym in symbols:
                  if sym not in self._symbols_ext:
                      print "sym", sym
                      print "TransitionalModel: appending sym=\"%s\" in valset but not in trainset" % sym
                      self._symbols_ext.append(sym)
        self._M_ext = len(self._symbols_ext)
        
        print "self._symbols_ext", self._symbols_ext
        camsymbols = {}
        for sym in self._symbols_ext:
            print "sym = ", sym
            for _id in sym.split('-'):
                print "  _id = ", _id

                __id = int(_id.strip('abcdef'))
                try:
                    camsymbols[__id].add(sym)
                except:
                    print "new cam id", _id
                    camsymbols[__id] = set([sym])
        for camid in camsymbols.keys():
            symbols = camsymbols[camid]
            camsymbols[camid] = sorted(symbols)

        symbol2cam = {}
        for sym in self._symbols_ext:
            for _id in sym.split('-'):
                __id = int(_id.strip('abcdef'))
                try:
                    symbol2cam[sym].append(__id)
                except:
                    symbol2cam[sym] = [__id]
      
        symbol2cam_enc = {}
        for sym,camids in symbol2cam.iteritems():
            symenc = []
            symbol2cam_enc[self._symbol_enc(sym)] = camids

        print "camsymbols=",camsymbols
        print "symbol2cam=",symbol2cam        
        print "symbol2cam_enc=",symbol2cam_enc
        print "symbols_ext", self._symbols_ext
                
        # define the initial A, B matrices
        A, B = self._build_initial_model()

        # compute the node to be split
        self._compute_node_splits()
       
        # encode datasets (bijective mapping symbols <-> integers)
        self._trainset_enc = []
        for observ in self._trainset.outseq():
            self._trainset_enc.append(self._symbol_enc(observ))

        self._validsets_enc = []
        for dataset in self._validsets:
            encoded = []
            for observ in dataset.outseq():
                encoded.append(self._symbol_enc(observ))
            self._validsets_enc.append(encoded)

        # print some info
        self.print_info()

        # create the HMM model with node splitting
        self._hmm = HiddenMarkovModel(A, B, copy.copy(self._init_states),	\
              			      self._trainset_enc, validsets=self._validsets_enc, symnames=copy.copy(self._symbols_ext), symbol2cam=symbol2cam_enc)

        self._transition_graph_threshold = 0.01
        self._hmm.plot_transition_graph(0., savepathbase=self._savepathbase, savetag='coverage-model')
Example #49
class TransitionalModel():
    def __init__(self, trainpath, validpaths, max_trans=0, vnodes=False, hypotest=None, saveplots=False):
        assert isinstance(trainpath, str)
        assert len(trainpath)
        assert isinstance(validpaths, list)
        for path in validpaths:
            assert isinstance(path, str)
            assert len(path)
        assert isinstance(max_trans, int)
        assert max_trans >= 0
        assert isinstance(vnodes, bool)
        assert isinstance(saveplots, bool)

        self._modname = 'base'
        if vnodes:
            self._modname = 'vnode'
        if hypotest:
            self._modname += '+hypo'
        
        self._savepathbase = None
        if saveplots:
            self._savepathbase = "./%s-plots-%s-maxtrans%d/" % (trainpath, self._modname, max_trans)
            os.mkdir(self._savepathbase)

        # load datasets
        self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, \
                                             vnodes=vnodes)

        if hypotest:
            #statespace, transitions = self._trainset.admissible_statespace_transitions()
            #print "orig:\n", statespace, transitions
            statespace, transitions = self._trainset.hypotest_admissible_statespace_transitions(hypotest)
            #print "hypotest:\n", statespace, transitions
            self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, vnodes=vnodes, \
                                             admissible_statespace=statespace, \
                                             admissible_transitions=transitions)

        statespace, transitions = self._trainset.admissible_statespace_transitions()
        #self._trainset.dump()
        #print "trainset:\n", statespace, transitions

        if not self._trainset.outseq_len():
            print "warning, TransitionModel.__init__, empty trainset at path %s" % trainpath


        _FILTER_TRANS = 1
        self._validsets = []
        for validpath in validpaths:
            if _FILTER_TRANS:
                # filter trans
                dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes, \
                                              admissible_statespace=statespace, \
                                              admissible_transitions=transitions)
            else:
                dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes)

            if not dataset.outseq_len():
                print "TransitionModel, skipping empty validation dataset at path %s" % validpath
                continue
            self._validsets.append(dataset)

        # define the initial set of states and the set of output symbols
        self._init_states = sorted(self._trainset.states())
        self._N = len(self._init_states)
        self._symbols = sorted(self._trainset.symbols())
        self._M = len(self._symbols)

        self._symbols_ext = copy.copy(self._symbols)        
        if not _FILTER_TRANS:

              symbols = set()
              for dataset in self._validsets:
                  symbols.update(dataset.symbols())
                  
              print "symbols", symbols
              for sym in symbols:
                  if sym not in self._symbols_ext:
                      print "sym", sym
                      print "TransitionalModel: appending sym=\"%s\" in valset but not in trainset" % sym
                      self._symbols_ext.append(sym)
        self._M_ext = len(self._symbols_ext)
        
        print "self._symbols_ext", self._symbols_ext
        camsymbols = {}
        for sym in self._symbols_ext:
            print "sym = ", sym
            for _id in sym.split('-'):
                print "  _id = ", _id

                __id = int(_id.strip('abcdef'))
                try:
                    camsymbols[__id].add(sym)
                except:
                    print "new cam id", _id
                    camsymbols[__id] = set([sym])
        for camid in camsymbols.keys():
            symbols = camsymbols[camid]
            camsymbols[camid] = sorted(symbols)

        symbol2cam = {}
        for sym in self._symbols_ext:
            for _id in sym.split('-'):
                __id = int(_id.strip('abcdef'))
                try:
                    symbol2cam[sym].append(__id)
                except:
                    symbol2cam[sym] = [__id]
      
        symbol2cam_enc = {}
        for sym,camids in symbol2cam.iteritems():
            symenc = []
            symbol2cam_enc[self._symbol_enc(sym)] = camids

        print "camsymbols=",camsymbols
        print "symbol2cam=",symbol2cam        
        print "symbol2cam_enc=",symbol2cam_enc
        print "symbols_ext", self._symbols_ext
                
        # define the initial A, B matrices
        A, B = self._build_initial_model()

        # compute the node to be split
        self._compute_node_splits()
       
        # encode datasets (bijective mapping symbols <-> integers)
        self._trainset_enc = []
        for observ in self._trainset.outseq():
            self._trainset_enc.append(self._symbol_enc(observ))

        self._validsets_enc = []
        for dataset in self._validsets:
            encoded = []
            for observ in dataset.outseq():
                encoded.append(self._symbol_enc(observ))
            self._validsets_enc.append(encoded)

        # print some info
        self.print_info()

        # create the HMM model with node splitting
        self._hmm = HiddenMarkovModel(A, B, copy.copy(self._init_states),	\
              			      self._trainset_enc, validsets=self._validsets_enc, symnames=copy.copy(self._symbols_ext), symbol2cam=symbol2cam_enc)

        self._transition_graph_threshold = 0.01
        self._hmm.plot_transition_graph(0., savepathbase=self._savepathbase, savetag='coverage-model')

    def optimize(self, nu=0.01):
        # plot coverage
        self._hmm.plot_transition_graph(self._transition_graph_threshold,	\
                			savepathbase=self._savepathbase, savetag='coverage-model')

        # run BW before the first node-split
        while True:
            done = self._hmm.baumwelch(nu)
            self._hmm.plot_perf(savepathbase=self._savepathbase)
            self._hmm.plot_transition_graph(self._transition_graph_threshold,	\
                				savepathbase=self._savepathbase)
            if done:
                break

        while True:

            keys = sorted(self._pending_splits.keys())
            print "\n%d node-splits left" % len(keys)
            if not len(keys):
                break

            key = keys[0]
            splits = sorted(self._pending_splits[key], key=lambda split_data: split_data[0])
            #print "splits for node ", key, " (", len(splits), "):", splits
            assert len(splits)

            split = splits.pop()

            if not len(splits):
                self._pending_splits.pop(key)
            else:
                self._pending_splits[key] = splits

            # For each current node:
            #   look at its backward star
            #   look at its forward star
            #   build the rectangular matrix of transition counts
            #   identify the row most orthogonal to all the others;
            #   if one is found, split the node, rebalancing the
            #   probabilities (a standalone toy sketch of this test
            #   follows _compute_node_splits below)
            #
            # Each pending split is a tuple:
            #   (inner, bsym, bstar_syms, fstar_syms, row)
            _inner, _bsym, _bstar_sym, _fstar_sym, rates = split
            assert numpy.isclose(rates.sum(), 1)

            #print "   splitting node %s : inner=%.2f bsym=%s _bstar=%s _fstar=%s, rates=%s" % (key, _inner, _bsym, _bstar_sym, _fstar_sym, rates)

            state = self._state_enc(key)
            bstate = self._state_enc(_bsym)
            bstar = [self._state_enc(i) for i in _bstar_sym]
            fstar = [self._state_enc(i) for i in _fstar_sym]

            self._hmm.split_state(state, bstate, bstar, fstar, rates)
            self._hmm.plot_transition_graph(self._transition_graph_threshold,
                                            savepathbase=self._savepathbase, savetag='node-splitting')

            while True:
                done = self._hmm.baumwelch(nu)
                self._hmm.plot_perf(savepathbase=self._savepathbase)
                self._hmm.plot_transition_graph(self._transition_graph_threshold,
                                                savepathbase=self._savepathbase)
                if done:
                    break

    def _symbol_enc(self, sym):
        #print sym
        #print self._symbols
        assert sym in self._symbols_ext
        return self._symbols_ext.index(sym)
        
    def _state_enc(self, state):
        assert state in self._init_states
        return self._init_states.index(state)

    def name(self):
        return self._modname

    def nr_training_transitions(self):
        return max(0, self._trainset.outseq_len() - 1)

    def nr_validation_transitions(self, setid):
        assert isinstance(setid, int)
        assert 0 <= setid < len(self._validsets)
        return max(0, self._validsets[setid].outseq_len() - 1)

    def _build_initial_model(self):
        dataset = self._trainset
        A = numpy.zeros((self._N, self._N))

        for src in self._init_states:
            nr_from = dataset.count_transitions_from(src)
            #print "nr_from: %s = %d" % (src, nr_from)
            assert nr_from >= 0
            if nr_from == 0:
                continue

            src_idx = self._state_enc(src)

            for dest in self._init_states:
                nr_to = dataset.count_transitions_from_to(src, dest)
                #print "nr_from_to: %s->%s = %d" % (src, dest, nr_to)
                assert nr_to >= 0
                if nr_to == 0:
                    continue

                dest_idx = self._state_enc(dest)
                # cast to float to avoid integer division under Python 2
                A[src_idx, dest_idx] = float(nr_to) / nr_from

        for i in xrange(self._N):
            # a node may have no outgoing transitions in the training set,
            # in which case its row of A does not sum to one;
            # fall back to a uniform distribution
            if not numpy.isclose(A[i,:].sum(), 1.):
                A[i,:] = numpy.ones(self._N) / self._N
       
        B = numpy.zeros((self._M,self._N))
        for state in self._init_states:
            if state in self._symbols:
                B[self._symbol_enc(state), self._state_enc(state)] = 1.
            else:
                # this must be one of the vnull states
                assert '~0~' in state
                B[self._symbol_enc('0'), self._state_enc(state)] = 1.

        return A, B
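
For reference, a minimal standalone sketch of the count-and-normalize estimate that _build_initial_model performs (the helper name and the toy sequence are invented for illustration):

import numpy

def initial_transition_matrix(seq, states):
    # estimate A by normalizing bigram counts; rows with no outgoing
    # transitions fall back to a uniform distribution, as above
    N = len(states)
    idx = dict((s, i) for i, s in enumerate(states))
    A = numpy.zeros((N, N))
    for src, dest in zip(seq[:-1], seq[1:]):
        A[idx[src], idx[dest]] += 1.
    for i in xrange(N):
        total = A[i, :].sum()
        A[i, :] = A[i, :] / total if total > 0 else numpy.ones(N) / N
    return A

# 'c' appears only at the end of the toy sequence, so its row is uniform
print initial_transition_matrix(['a', 'b', 'a', 'a', 'c'], ['a', 'b', 'c'])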

    def _compute_node_splits(self):
        triplets = self._trainset.state_triplets_dict()
        fstar = self._trainset.state_fstar_dict()
        bstar = self._trainset.state_bstar_dict()

        #print "fstar: ", fstar
        #print "bstar: ", bstar

        splits = {}     # computed splits keyed by the id of the split node

        # compute splits for each state
        states = tuple(set(self._trainset.states()))
        for state in states:
            if (state not in fstar) or (state not in bstar):
                # this can happen for symbols in the first and last triplet
                continue

            fstar_states = sorted(fstar[state])
            bstar_states = sorted(bstar[state])
            nr_fstar_states = len(fstar_states)
            nr_bstar_states = len(bstar_states)
            if nr_fstar_states < 2 or nr_bstar_states < 2:
                # the 'transition rate' table must be at least 2x2
                continue
            
            # build the table (using a matrix)
            mat = numpy.zeros((nr_bstar_states, nr_fstar_states))
            for i in xrange(nr_bstar_states):
                for j in xrange(nr_fstar_states):
                    bstate = bstar_states[i]
                    fstate = fstar_states[j]
                    mat[i,j] = triplets.get((bstate, state, fstate), 0.)

                # we have just filled the i-th row; normalize it to unit norm
                mat[i,:] /= numpy.linalg.norm(mat[i,:])
             
            # Evaluate the inner products
            matt = mat.transpose()
            for i in xrange(nr_bstar_states-1):
                bstate = bstar_states[i]
                row = mat[i,:]

                # We do all the inner products at once and then select just the smallest one.
                # ! Inners that have been evaluated already in past iterations are
                # ! skipped by setting them to 1.
                inners = row.dot(matt)
                inners[0:i] = 1.
                idx = numpy.argmin(inners)
                inner = inners[idx]
                if inner > 0.2:
                    #print "splitting: discarding split with too high inner for node %s : from=%s inner=%.2f" % (state, bstate, inner)
                    continue

                # add a new entry to the list of splits for node 'state'
                entry = (inner, bstate, bstar_states, fstar_states, row/row.sum())
                try:
                    #print "splitting: new split for node %s : %s" % (state, entry)
                    splits[state].append(entry)
                except KeyError:
                    splits[state] = [entry]

        nr_splits = 0
        print "\nNode splitting:"
        for state, _splits in splits.iteritems():
            nr_splits += len(_splits)
            print "  Nr. of splits for node %s: %d" % (state, len(_splits))

        print "  Tot. nr. of splits: %d" % nr_splits
        self._pending_splits = splits
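
Spelled out, the split test above row-normalizes the backward-star by forward-star count table and flags pairs of rows that are nearly orthogonal, i.e. predecessors whose successor usage barely overlaps. A self-contained toy run (the counts are invented):

import numpy

# toy 'transition rate' table for one node:
# rows = predecessors (bstar), columns = successors (fstar)
mat = numpy.array([[9., 1., 0.],
                   [8., 2., 0.],
                   [0., 0., 7.]])
for i in xrange(mat.shape[0]):
    mat[i, :] /= numpy.linalg.norm(mat[i, :])   # unit-norm rows

matt = mat.transpose()
for i in xrange(mat.shape[0] - 1):
    inners = mat[i, :].dot(matt)   # cosine between row i and every row
    inners[0:i] = 1.               # pairs already checked; the self-product is 1 anyway
    idx = numpy.argmin(inners)
    if inners[idx] <= 0.2:         # nearly orthogonal usage patterns
        print "split candidate: predecessors %d and %d (inner=%.2f)" % (i, idx, inners[idx])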

    def print_info(self):
        print "transitional model dump: fill me"
        print "Symbols: "
        for i in xrange(len(self._symbols)):
            print "  ", i, self._symbols[i]
        print "States: "
        for i in xrange(len(self._init_states)):
            print "  ", i, self._init_states[i]
Example #50
    def TestHiddenMarkovModel(self):
        # fix random seed
        Rand.Restart(12347)

        # set hyperparameters (filled in with the refrigerator's values)
        # a priori, the appliance is equally likely to be in each of the K states
        ProbInitPriorObs = Dirichlet.Uniform(self.K)
        # the state-transition probabilities are also uniform a priori
        CPTTransPriorObs = System.Array.CreateInstance(Dirichlet, self.K)
        for i in range(0, self.K):
            CPTTransPriorObs[i] = Dirichlet.Uniform(self.K)

        EmitMeanPriorObs = System.Array.CreateInstance(Gaussian, self.K)
        EmitMeanPriorObs[0] = Gaussian.FromMeanAndPrecision(0, 0.01)      # off
        EmitMeanPriorObs[1] = Gaussian.FromMeanAndPrecision(100, 0.00001) # on

        EmitPrecPriorObs = System.Array.CreateInstance(Gamma, self.K)
        EmitPrecPriorObs[0] = Gamma.FromShapeAndScale(0.2285, 0.0088) # off
        EmitPrecPriorObs[1] = Gamma.FromShapeAndScale(4, 0.01)        # on
        # the hyperparameter choices are in the table at the end of the paper
        # (sensitivity analysis left for future work)

        # sample model parameters:
        # a draw from the Dirichlet prior, as an array of K entries
        init = System.Array[float](ProbInitPriorObs.Sample().ToArray())
        # transition matrix: one sampled array per row
        trans0 = CPTTransPriorObs[0].Sample().ToArray()
        trans1 = CPTTransPriorObs[1].Sample().ToArray()

        emitMeans = System.Array.CreateInstance(float,self.K)
        for i in range(0,self.K):
            emitMeans[i] = EmitMeanPriorObs[i].Sample()

        emitPrecs = System.Array.CreateInstance(float,self.K)
        for i in range (0,self.K):
            emitPrecs[i] = EmitPrecPriorObs[i].Sample()

        # build the model and print the sampled parameters;
        # keep a single instance so the calls act on the same model
        hmm = HiddenMarkovModel()
        hmm.HiddenMarkov()
        hmm.DefineInferenceEngine()
        hmm.SetParameters(init, trans0, trans1, emitMeans, emitPrecs)
        print "parameters:"
        hmm.PrintParameters()
        
        # create distributions for sampling
        initDist = Discrete(init) 
        transDist = System.Array.CreateInstance(Discrete,self.K)
        transDist[0] = Discrete(trans0)
        transDist[1] = Discrete(trans1)
        #print transDist[0]
        #print transDist[1]
        emitDist = System.Array.CreateInstance(Gaussian,self.K) 
        for i in range (0,self.K):
            emitDist[i] = Gaussian.FromMeanAndPrecision(emitMeans[i], emitPrecs[i])
    

        # sample a hidden-state path and the corresponding emissions
        actualStates = System.Array.CreateInstance(int, self.T)
        emissions = System.Array.CreateInstance(float, self.T)
        actualStates[0] = initDist.Sample()
        emissions[0] = emitDist[actualStates[0]].Sample()
        for i in range(1, self.T):
            actualStates[i] = transDist[actualStates[i-1]].Sample()
            emissions[i] = emitDist[actualStates[i]].Sample()

        # infer model parameters, states and model evidence given priors and emission data
        hmm = HiddenMarkovModel()
        hmm.HiddenMarkov()
        hmm.SetPriors(ProbInitPriorObs, CPTTransPriorObs, EmitMeanPriorObs, EmitPrecPriorObs)
        hmm.ObserveData(emissions)
        hmm.InferPosteriors()

        # print posterior distributions
        print "posteriors:"
        hmm.PrintPosteriors()
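
Stripped of the Infer.NET wrapper types, the generative part of the test above is plain ancestral sampling. A numpy sketch with K=2 and toy emission parameters (all values invented for illustration):

import numpy

K, T = 2, 10
init = numpy.random.dirichlet(numpy.ones(K))           # uniform Dirichlet prior
trans = numpy.random.dirichlet(numpy.ones(K), size=K)  # one row per current state
means, precs = [0., 100.], [0.01, 4.]                  # off/on Gaussian emissions

states = numpy.zeros(T, dtype=int)
emissions = numpy.zeros(T)
states[0] = numpy.random.choice(K, p=init)
emissions[0] = numpy.random.normal(means[states[0]], 1. / numpy.sqrt(precs[states[0]]))
for t in xrange(1, T):
    states[t] = numpy.random.choice(K, p=trans[states[t-1]])
    emissions[t] = numpy.random.normal(means[states[t]], 1. / numpy.sqrt(precs[states[t]]))
print states
print emissions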
Example #51
def pos_problem(arguments, fallback_model=None, fallback_training_limit=None):
	dataset_size = None
	if len(arguments) >= 2: dataset_size = int(arguments[1])
	if len(arguments) >= 3: fallback_training_limit = int(arguments[2])

	try:
		start = time()
		pickle_file = open("pos_hmm.pickle", "rb")
		request_size, training_stream, validation_stream, testing_sentences = pickle.load(pickle_file)
		pickle_file.close()

		if request_size != dataset_size: raise IOError()

		print "Unpickling: %f" % (time() - start)
	except (IOError, EOFError):
		# Load the dataset
		print "Loading dataset"
		start = time()
		if dataset_size: tagged_sentences = list(islice(PennTreebankReader.read_pos_tags_from_directory("data/wsj"), dataset_size))
		else: tagged_sentences = list(PennTreebankReader.read_pos_tags_from_directory("data/wsj"))
		stop = time()
		print "Reading: %f" % (stop-start)

		print "Creating streams"
		start = time()
		training_sentences = tagged_sentences[0:len(tagged_sentences)*4/5]
		validation_sentences = tagged_sentences[len(tagged_sentences)*8/10+1:len(tagged_sentences)*9/10]
		testing_sentences = tagged_sentences[len(tagged_sentences)*9/10+1:]

		print "Training: %d" % len(training_sentences)
		print "Validation: %d" % len(validation_sentences)
		print "Testing: %d" % len(testing_sentences)

		training_stream, validation_stream = map(merge_stream, (training_sentences, validation_sentences))
		stop = time()
		print "Streaming: %f" % (stop-start)

		serialized = (dataset_size, training_stream, validation_stream, testing_sentences)
		pickle_file = open("pos_hmm.pickle", "w")
		pickle.dump(serialized, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
		pickle_file.close()

	print "Training"
	start = time()
	pos_tagger = HiddenMarkovModel(label_history_size=2)
	pos_tagger.train(training_stream[1:-2], fallback_model=fallback_model, fallback_training_limit=fallback_training_limit)
	stop = time()
	print "Training: %f" % (stop-start)

	print "Testing on %d sentences" % len(testing_sentences)
	start = time()

	num_correct = 0
	num_incorrect = 0

	for correct_labels, emissions in testing_sentences:
		guessed_labels = pos_tagger.label(emissions, debug=False)
#		print "SENTENCE: %s" % emissions
#		print "CORRECT: %s" % correct_labels
#		print "GUESSED: %s" % guessed_labels
		for correct, guessed in izip(correct_labels, guessed_labels):
			if correct == START_LABEL or correct == STOP_LABEL: continue
			if correct == guessed:
				num_correct += 1
			else:
				num_incorrect += 1

		if correct_labels != guessed_labels:
			guessed_score = pos_tagger.score(zip(guessed_labels, emissions))
			correct_score = pos_tagger.score(zip(correct_labels, emissions))

			if guessed_score < correct_score: print "%d Guessed: %f, Correct: %f" % (len(emissions), guessed_score, correct_score)

			debug_label = lambda: pos_tagger.label(emissions, debug=True)
			debug_score = lambda labels: pos_tagger.score(zip(labels, emissions), debug=False)
			assert guessed_score >= correct_score or len(emissions) > 23, "Decoder sub-optimality (%f for guess, %f for correct)\n%s vs. %s" % (debug_score(guessed_labels), debug_score(correct_labels), debug_label(), correct_labels)

			stdout.write("%1.0f" % (sum(1 for guess, correct in zip(guessed_labels, correct_labels) if guess == correct) / len(correct_labels) * 10))
			stdout.flush()
		else:
			stdout.write(".")
			stdout.flush()
	stdout.write("\n")

	stop = time()
	print "Testing: %f" % (stop-start)

	print "%d correct (%.3f%% of %d)" % (num_correct, 100.0 * float(num_correct) / float(num_correct + num_incorrect), num_correct + num_incorrect)