def train_hmm_tagger(data):
    """Build and bake a bigram hidden Markov model tagger.

    One state is added per tag found in the emission counts, and the
    probabilities are estimated from counts as follows:

    - emission:    P(w | t)     = C(t, w) / C(t)
    - start edge:  P(t | start) = C(start, t) / C(start), where C(start)
      is taken to be ``len(data.training_set.Y)``
    - end edge:    P(end | t)   = C(t, end) / C(t)
    - tag edge:    P(t2 | t1)   = C(t1, t2) / C(t1)

    The counts are read from the module-level tables ``tag_unigrams``,
    ``tag_bigrams``, ``tag_starts`` and ``tag_ends``; they must already be
    populated when this function is called.

    Args:
        data: Corpus object whose ``training_set`` exposes ``stream()``,
            ``Y`` and ``tagset``.

    Returns:
        The baked ``HiddenMarkovModel`` named ``"base-hmm-tagger"``.

    Raises:
        AssertionError: If a tag of the training tagset has no state of the
            same name, or the baked model does not have exactly 168 edges.
    """
    basic_model = HiddenMarkovModel(name="base-hmm-tagger")

    # The unzipped stream is reversed before being handed to pair_counts, and
    # the result is indexed as counts[tag][word] below -- so the stream
    # presumably yields (word, tag) pairs. TODO confirm against the dataset.
    emission_counts = pair_counts(*list(zip(*data.training_set.stream()))[::-1])

    # One state per tag, emitting words with probability C(t, w) / C(t).
    state_dict = {}
    for tag, word_counts in emission_counts.items():
        tag_count = tag_unigrams[tag]
        emission_p = DiscreteDistribution(
            {word: count / tag_count for word, count in word_counts.items()}
        )
        state = State(emission_p, name=tag)
        basic_model.add_state(state)
        state_dict[tag] = state

    # Start and end edges. Only tags present in tag_starts get them, exactly
    # as before; the denominator of the start probability is loop-invariant.
    sequence_count = len(data.training_set.Y)
    for tag, start_count in tag_starts.items():
        tag_state = state_dict[tag]
        basic_model.add_transition(
            basic_model.start, tag_state, start_count / sequence_count)
        basic_model.add_transition(
            tag_state, basic_model.end, tag_ends[tag] / tag_unigrams[tag])

    # Tag-to-tag edges for every observed bigram.
    for (tag1, tag2), bigram_count in tag_bigrams.items():
        basic_model.add_transition(
            state_dict[tag1], state_dict[tag2],
            bigram_count / tag_unigrams[tag1])

    # Finalize the model.
    basic_model.bake()

    # Topology sanity checks. The state-name set is built once instead of
    # once per tag.
    state_names = {state.name for state in basic_model.states}
    assert all(tag in state_names for tag in data.training_set.tagset), \
        "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
    # 168 = 12 start edges + 12 * 12 tag pairs + 12 end edges, i.e. the full
    # topology described in the message for a 12-tag tagset.
    assert basic_model.edge_count() == 168, (
        "Your network should have an edge from the start node to each state, one edge between every " +
        "pair of tags (states), and an edge from each state to the end node.")
    return basic_model
# Transition table for the example model, applied in order: the two edges
# leaving the start state, the sunny-day edges (estimates given by the
# problem statement), then the rainy-day edges from the transition table.
example_transitions = (
    (model.start, sunny_state, 0.5),
    (model.start, rainy_state, 0.5),
    (sunny_state, sunny_state, 0.8),  # 80% sunny->sunny
    (sunny_state, rainy_state, 0.2),  # 20% sunny->rainy
    (rainy_state, sunny_state, 0.4),  # 40% rainy->sunny
    (rainy_state, rainy_state, 0.6),  # 60% rainy->rainy
)
for source_state, target_state, probability in example_transitions:
    model.add_transition(source_state, target_state, probability)

# Finalize the model before inspecting it.
model.bake()

assert model.edge_count() == 6, "There should be two edges from model.start, two from Rainy, and two from Sunny"
assert model.node_count() == 4, "The states should include model.start, model.end, Rainy, and Sunny"
print("Great! You've finished the model.")

show_model(
    model,
    figsize=(5, 5),
    filename="example.png",
    overwrite=True,
    show_ends=False,
)

# Desired ordering, overriding the Pomegranate default order.
column_order = ["Example Model-start", "Sunny", "Rainy", "Example Model-end"]
column_names = [state.name for state in model.states]
# Position of each requested column within the model's own state ordering.
order_index = [column_names.index(name) for name in column_order]
# re-order the rows/columns to match the specified column order
# Start - occurrences of the tag in tag_starts over len(data.training_set.Y).
# End - number of sentences ending with the tag over the count of tag
# appearances.
# `s` is indexed by tag and its values are passed to add_transition, so it is
# presumably the tag -> State mapping built earlier -- TODO confirm.
sequence_count = len(data.training_set.Y)  # loop-invariant denominator
for tag in tag_starts:
    basic_model.add_transition(
        basic_model.start, s[tag], tag_starts[tag] / sequence_count)
    basic_model.add_transition(
        s[tag], basic_model.end, tag_ends[tag] / tag_unigrams[tag])

# Tag-to-tag transitions: C(tag1, tag2) / C(tag1) for every observed bigram.
for (tag1, tag2), bigram_count in tag_bigrams.items():
    basic_model.add_transition(
        s[tag1], s[tag2], bigram_count / tag_unigrams[tag1])

basic_model.bake()

# Topology checks; the set of state names is built once rather than per tag.
state_names = {state.name for state in basic_model.states}
assert all(tag in state_names for tag in data.training_set.tagset), \
    "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
assert basic_model.edge_count() == 168, \
    ("Your network should have an edge from the start node to each state, one edge between every " +
     "pair of tags (states), and an edge from each state to the end node.")

hmm_training_acc = accuracy(data.training_set.X, data.training_set.Y, basic_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

hmm_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, basic_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

assert hmm_training_acc > 0.97, "Uh oh. Your HMM accuracy on the training set doesn't look right."
# The second check must validate the *testing* accuracy; it previously
# re-tested hmm_training_acc, so a poor test score could never fail it.
assert hmm_testing_acc > 0.955, "Uh oh. Your HMM accuracy on the testing set doesn't look right."
# NOTE(review): `tag_state` is not bound anywhere in this excerpt, so this
# call presumably belongs to a loop over the tag states that starts above the
# visible code -- confirm its indentation when merging.
basic_model.add_transition(tag_state,basic_model.end,end_prob[tag_state.name])

# Transition probability of every observed bigram: its count divided by the
# count of its first tag.
transition_prob_pair = {
    pair: pair_count / tags_count[pair[0]]
    for pair, pair_count in tag_bigrams.items()
}

# Connect every ordered pair of tag states (self-pairs included).
for tag_state in to_pass_states:
    for next_tag_state in to_pass_states:
        name_pair = (tag_state.name, next_tag_state.name)
        basic_model.add_transition(
            tag_state, next_tag_state, transition_prob_pair[name_pair])

basic_model.bake()

# Topology checks.
state_names = {state.name for state in basic_model.states}
assert all(tag in state_names for tag in data.training_set.tagset), "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
assert basic_model.edge_count() == 168, ("Your network should have an edge from the start node to each state, one edge between every " +
                                         "pair of tags (states), and an edge from each state to the end node.")
HTML('<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>')

# Accuracy on the training and testing splits.
training_set = data.training_set
hmm_training_acc = accuracy(training_set.X, training_set.Y, basic_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

testing_set = data.testing_set
hmm_testing_acc = accuracy(testing_set.X, testing_set.Y, basic_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

assert hmm_training_acc > 0.97, "Uh oh. Your HMM accuracy on the training set doesn't look right."
assert hmm_testing_acc > 0.955, "Uh oh. Your HMM accuracy on the testing set doesn't look right."
HTML('<div class="alert alert-block alert-success">Your HMM tagger accuracy looks correct! Congratulations, you\'ve finished the project.</div>')

# Print the first three sentence keys of the testing set.
for key in data.testing_set.keys[:3]:
    print("Sentence Key: {}\n".format(key))
# NOTE(review): `bigram`, `tag1`, `state1`, `state2` and
# `sum_of_probabilities` are bound outside this excerpt; these three
# statements presumably form the tail of a loop over the tag bigrams --
# confirm their indentation when merging.
transition_probability = tag_bigrams[bigram] / tag_unigrams[tag1]
sum_of_probabilities += transition_probability
basic_model.add_transition(state1, state2, transition_probability)

#==============================================================
# finalize the model
#==============================================================
# NOTE: YOU SHOULD NOT NEED TO MODIFY ANYTHING BELOW THIS LINE
basic_model.bake()

print("Number of nodes or states: ", basic_model.node_count())
print("Number of edges: ", basic_model.edge_count())

# Every tag of the training tagset must appear as a state name.
state_names = {state.name for state in basic_model.states}
assert all(tag in state_names for tag in data.training_set.tagset), \
    "Every state in your network should use the name of the associated tag, which must be one of the training set tags."

# The baked network must have exactly 168 edges.
assert basic_model.edge_count() == 168, \
    ("Your network should have an edge from the start node to each state, one edge between every " +
     "pair of tags (states), and an edge from each state to the end node.")
HTML('<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>')

#==============================================================
# evaluate train / test data set metrics
#==============================================================
from pomegranate import HiddenMarkovModel, DiscreteDistribution, State
import numpy as np

# Two-state weather HMM; each state emits 'yes' or 'no'.
model = HiddenMarkovModel(name='weather')

sunny_emissions = DiscreteDistribution({'yes': 0.1, 'no': 0.9})
rainy_emissions = DiscreteDistribution({'yes': 0.8, 'no': 0.2})
sunny_state = State(sunny_emissions, name='sunny')
rainy_state = State(rainy_emissions, name='rainy')
model.add_states(sunny_state, rainy_state)

# (from, to, probability) rows, added in this order.
weather_edges = (
    (model.start, sunny_state, 0.5),
    (model.start, rainy_state, 0.5),
    (sunny_state, rainy_state, 0.2),
    (sunny_state, sunny_state, 0.8),
    (rainy_state, rainy_state, 0.6),
    (rainy_state, sunny_state, 0.4),
)
for origin, destination, probability in weather_edges:
    model.add_transition(origin, destination, probability)

# Finalize the model before querying its structure.
model.bake()

# Report the state names and the dense transition matrix.
states = np.array([state.name for state in model.states])
state_report = '{} states: {}'.format(model.node_count(), states)
print(state_report)

transitions = model.dense_transition_matrix()
transition_report = '{} transitions probabilities between states \n{}'.format(
    model.edge_count(), transitions)
print(transition_report)