def test_sample(self):
    train_ds = UDDataSet("data/en-ud-train.conllu")
    dev_ds = UDDataSet("data/en-ud-dev.conllu", train_ds)
    gibbs = Gibbs(HMM(train_ds))
    sample = gibbs.sample(dev_ds.sentences()[50], 10)
    print([train_ds.idx2pos(i) for i in sample])

def test_parser2(self):
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, self.plural_english_rule_set)
    nfa = grammar.get_nfa()

def test_from_simulation1(self):
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['ba', 'baFi', 'babF', 'badF', 'bbFa', 'bbFidbF',
                               'bdFibF', 'bi', 'bibF', 'bidBF', 'dFabFF', 'dFaddF'])})
    rule1 = Rule([{'voiceless': '-', 'labial': '+'}], [], [], [{'voiceless': '-', 'labial': '-'}], False)
    rule2 = Rule([{'voiceless': '-', 'high': '-'}], [{'labial': '-'}], [], [{'voiceless': '-'}], False)
    rule3 = Rule([{}], [], [{'cons': '+'}], [{'voiceless': '+', 'bound': '-'}], True)
    return self.get_energy(hmm, [rule1, rule2, rule3], "from_simulation1")

def test_np_chunk_pos(self):
    """predicting sequences using the POS feature"""
    train, test = self.split_np_chunk_corpus(POS)
    classifier = HMM()
    classifier.train(train)
    results = ConfusionMatrix(classifier, test)
    _, _, _, _ = results.print_out()
    self.assertGreater(accuracy(classifier, test), 0.55)

def get_random_hypothesis_randomized(cls, simulation, data, initial_hmm=None, initial_rules=None):
    if initial_rules:
        rule_set = RuleSet.load_from_flat_list(initial_rules)
    elif not configurations['EVOLVE_RULES']:
        rule_set = RuleSet.load_from_flat_list(deepcopy(simulation.target_tuple[1]))
    else:
        rule_set = RuleSet.get_random_rule_set()

    if initial_hmm:
        hmm = HMM(deepcopy(initial_hmm))
    elif not configurations['EVOLVE_HMM']:
        hmm = HMM(deepcopy(simulation.target_tuple[0]))
    else:
        hmm = HMM.get_random_hmm(data)

    grammar = Grammar(hmm, rule_set)
    return Hypothesis(grammar)

def test_np_chunk_baseline(self):
    """predicting sequences using the baseline feature"""
    train, test = self.split_np_chunk_corpus(Document)
    classifier = HMM()
    classifier.train(train)
    test_result = compute_cm(classifier, test)
    _, _, f1, accuracy = test_result.print_out()
    self.assertGreater(accuracy, 0.55)

def hmm_test():
    st_time = time.time()
    model_file = "hmm_model.json"

    # Load model parameters.
    with open(model_file, 'r') as f:
        data = json.load(f)
    A = np.array(data['A'])
    B = np.array(data['B'])
    pi = np.array(data['pi'])
    # Observation symbols.
    obs_dict = data['observations']
    # State symbols.
    states_symbols = dict()
    for idx, item in enumerate(data['states']):
        states_symbols[item] = idx
    Osequence = np.array(data['Osequence'])

    model = HMM(pi, A, B, obs_dict, states_symbols)

    delta = model.forward(Osequence)
    m_delta = np.array([[3.5000e-01, 1.3600e-01, 0.0000e+00, 0.0000e+00, 1.1136e-05, 1.1136e-05, 0.0000e+00],
                        [1.5000e-01, 3.2000e-02, 4.6400e-03, 2.7840e-04, 3.3408e-05, 1.1136e-05, 8.9088e-07]])
    print("Your forward function output:", delta)
    print("My forward function output:", m_delta)

    gamma = model.backward(Osequence)
    m_gamma = np.array([[1.6896e-06, 3.8400e-06, 6.4000e-05, 2.0000e-03, 1.4000e-02, 2.0000e-02, 1.0000e+00],
                        [1.9968e-06, 1.1520e-05, 1.9200e-04, 3.2000e-03, 2.2000e-02, 6.0000e-02, 1.0000e+00]])
    print("Your backward function output:", gamma)
    print("My backward function output:", m_gamma)

    prob1 = model.sequence_prob(Osequence)
    m_prob1 = 8.908800000000002e-07
    print("Your sequence_prob function output:", prob1)
    print("My sequence_prob function output:", m_prob1)

    prob2 = model.posterior_prob(Osequence)
    m_prob2 = np.array([[0.6637931, 0.5862069, 0., 0., 0.175, 0.25, 0.],
                        [0.3362069, 0.4137931, 1., 1., 0.825, 0.75, 1.]])
    print("Your posterior_prob function output:", prob2)
    print("My posterior_prob function output:", m_prob2)

    viterbi_path = model.viterbi(Osequence)
    m_viterbi_path = ['1', '1', '2', '2', '2', '2', '2']
    print('Your viterbi function output: ', viterbi_path)
    print('My viterbi function output: ', m_viterbi_path)

    en_time = time.time()
    print()
    print("hmm total time: ", en_time - st_time)

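# Hedged sketch: the loader in hmm_test() above expects a JSON file with the
# keys 'A', 'B', 'pi', 'observations', 'states', and 'Osequence'. The helper
# below writes a toy file in that layout; the numbers are illustrative
# placeholders, not the values the printed expectations above correspond to,
# and the symbol-to-index mapping for 'observations' is an assumption.
import json


def write_toy_hmm_model(path="hmm_model_toy.json"):
    toy_model = {
        "pi": [0.7, 0.3],                       # initial state distribution
        "A": [[0.8, 0.2], [0.4, 0.6]],          # transition matrix, row-stochastic
        "B": [[0.5, 0.5], [0.1, 0.9]],          # emission matrix (state x symbol)
        "observations": {"hot": 0, "cold": 1},  # symbol -> column index in B
        "states": ["1", "2"],                   # state names, enumerated to row indices
        "Osequence": ["hot", "cold", "hot"],    # observation sequence to decode
    }
    with open(path, "w") as f:
        json.dump(toy_model, f, indent=2)
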
def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an
      object of line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of HMM class initialized with parameters
      (pi, A, B, obs_dict, state_dict) you calculated based on train_data
    """
    default = 1e-06  # smoothing constant for unseen tag/word pairs
    words = []
    word_index_map = {}
    S = len(tags)
    L = len(train_data)
    state_dict = {tags[i]: i for i in range(S)}
    first_tag_counts = {tags[i]: 0 for i in range(S)}
    tag_counts = {tags[i]: 0 for i in range(S)}
    tag_tag_counts = {tags[i]: {tags[j]: 0 for j in range(S)} for i in range(S)}
    tag_word_counts = {tags[i]: {} for i in range(S)}

    for line in train_data:
        first_tag_counts[line.tags[0]] += 1
        for index in range(line.length):
            tag, word = line.tags[index], line.words[index]
            if word not in word_index_map:
                word_index_map[word] = len(words)
                words.append(word)
            tag_counts[tag] += 1
            tag_word_counts[tag].setdefault(word, 0)
            tag_word_counts[tag][word] += 1
            if index < line.length - 1:
                next_tag = line.tags[index + 1]
                tag_tag_counts[tag][next_tag] += 1

    pi = np.array([first_tag_counts[t] for t in tags]) / L
    tag_counts_array = np.array([[tag_counts[t]] for t in tags])
    A = np.array([[tag_tag_counts[s].get(ss, default) for ss in tags] for s in tags]) / tag_counts_array
    B = np.array([[tag_word_counts[s].get(w, default) for w in words] for s in tags]) / tag_counts_array

    model = HMM(pi, A, B, word_index_map, state_dict)
    return model

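# Hedged smoke test for model_training() above. It assumes only what the
# function itself uses: each sentence exposes .words, .tags, and .length.
# The SimpleNamespace stand-in and toy corpus are illustrative, not the
# project's real `line` class or data loader.
from types import SimpleNamespace


def smoke_test_model_training():
    def make_line(words, tags):
        return SimpleNamespace(words=words, tags=tags, length=len(words))

    toy_corpus = [
        make_line(["the", "dog", "barks"], ["DET", "NOUN", "VERB"]),
        make_line(["a", "cat", "sleeps"], ["DET", "NOUN", "VERB"]),
    ]
    model = model_training(toy_corpus, ["DET", "NOUN", "VERB"])
    # Both toy sentences start with DET, so pi should put all initial mass there.
    return model
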
def test_morpheme_boundary(self):
    configurations["MORPHEME_BOUNDARY_FLAG"] = True
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    morpheme_boundary_hmm_transducer = hmm.get_transducer()
    self.write_to_dot_to_file(morpheme_boundary_hmm_transducer, "morpheme_boundary_hmm_transducer")

def test_get_log_lines(self):
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'q3', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
               'q2': (['q3', 'qf'], ['zo', 'go', 'do']),
               'q3': (['qf'], ['as', 'ak', 'at'])})
    print(hmm)
    for line in hmm.get_log_lines():
        print(line)

def test_np_chunk_baseline(self):
    """Test NP chunking with word and postag feature"""
    train, test = self.split_np_chunk_corpus(Document)
    classifier = HMM()
    classifier.train(train)
    test_result = compute_cm(classifier, test)
    _, _, f1, accuracy = test_result.print_out()
    self.assertGreater(accuracy, 0.55)
    self.assertTrue(all(i >= .90 for i in f1), 'not all F1 scores are at least 90.0%')

def test_change_segment_in_emission(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    self.write_to_dot_to_file(hmm, "hmm")
    segments = SegmentTable().get_segments_symbols()
    hmm.change_segment_in_emission(segments)
    print(hmm.get_all_emissions())

def main(args):
    model = HMM(args.full_emissions,
                args.add_one,
                args.end_token,
                args.config_path,
                args.data_path,
                args.save_model)
    model.test_model(args.data_path, args.save_test)

def test_crossover_connected_components(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm_1 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2'], ['dogo', 'koko']),
                 'q2': (['q1', FINAL_STATE], ['z'])})
    hmm_2 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2'], ['dag', 'kat']),
                 'q2': (['q3', FINAL_STATE], ['k']),
                 'q3': (['q1'], ['z'])})
    offspring_1, offspring_2 = HMM.crossover(hmm_1, hmm_2)
    self.write_to_dot_to_file(hmm_1, 'component_parent_1')
    self.write_to_dot_to_file(hmm_2, 'component_parent_2')
    self.write_to_dot_to_file(offspring_1, 'component_offspring_1')
    self.write_to_dot_to_file(offspring_2, 'component_offspring_2')
    offspring_1.get_transducer()
    offspring_2.get_transducer()

def fit_sin(length=100, n_hidden_states=10, max_iter=10):
    # Fit an HMM with 101 discrete observation symbols to a sampled sine
    # wave, then sample a generated sequence of the same length from it.
    x, y = sin(length)
    hmm = HMM(n_hidden_states, 101)
    hmm.fit(np.reshape(y, (1, -1)), max_iter=max_iter)
    y_gen = hmm.generate(length)
    show_data(x, y)
    show_data(x, y_gen)

def test_morpheme_boundary(self):
    configurations["MORPHEME_BOUNDARY_FLAG"] = True
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, [])

def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an
      object of line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of HMM class initialized with parameters
      (pi, A, B, obs_dict, state_dict) you calculated based on train_data
    """
    N = len(train_data)
    S = len(tags)
    pi = np.zeros(S)
    A = np.zeros((S, S))
    state_dict = {}
    obs_dict = {}
    o = 0
    for t in range(S):
        state_dict[tags[t]] = t

    # Count initial tags and tag-to-tag transitions.
    for line in train_data:
        pi[state_dict[line.tags[0]]] += 1
        for w in range(line.length - 1):
            A[state_dict[line.tags[w]], state_dict[line.tags[w + 1]]] += 1

    # Build the observation vocabulary.
    for line in train_data:
        for w in range(line.length):
            if line.words[w] not in obs_dict:
                obs_dict[line.words[w]] = o
                o += 1

    pi = pi / N
    A = (A.T / np.sum(A, axis=1)).T

    # Count tag-to-word emissions.
    O = len(obs_dict)
    B = np.zeros((S, O))
    for line in train_data:
        for w in range(line.length):
            B[state_dict[line.tags[w]], obs_dict[line.words[w]]] += 1
    B = (B.T / np.sum(B, axis=1)).T

    # Rows for tags that never occur divide by zero above; zero out the NaNs.
    A[np.isnan(A)] = 0
    B[np.isnan(B)] = 0

    model = HMM(pi, A, B, obs_dict, state_dict)
    return model

def test_simple_hmm(self):
    """https://people.csail.mit.edu/rameshvs/content/hmms.pdf"""
    hidden_values = ['Area 1', 'Area 2', 'Area 3']
    prior = {'Area 1': 1 / 3, 'Area 2': 1 / 3, 'Area 3': 1 / 3}
    transition_table = {
        'Area 1': {'Area 1': 0.25, 'Area 2': 0.75, 'Area 3': 0.},
        'Area 2': {'Area 1': 0., 'Area 2': 0.25, 'Area 3': 0.75},
        'Area 3': {'Area 1': 0., 'Area 2': 0., 'Area 3': 1.},
    }
    obs_values = ['hot', 'cold']
    obs_table = {
        'Area 1': {'hot': 1.0, 'cold': 0.0},
        'Area 2': {'hot': 0.0, 'cold': 1.0},
        'Area 3': {'hot': 1.0, 'cold': 0.0},
    }
    hmm = HMM(hidden_values, prior, transition_table, obs_values, obs_table, n_steps=3)
    hmm.set_observed_values(['hot', 'cold', 'hot'])
    hmm.compute_alpha()
    result = hmm.filtered_posterior(index=2)
    for r, v in zip(result.values(), [0.0, 0.0, 1.0]):
        self.assertEqual(r, v)
    hmm.compute_beta()
    for r, v in zip(hmm.smoothed_posterior(index=2).values(), [0.0, 0.0, 1.0]):
        self.assertEqual(r, v)

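# Standalone check of the expected values in test_simple_hmm above, using
# plain numpy rather than the HMM class under test: the forward recursion
# alpha_t = (alpha_{t-1} @ T) * E[:, o_t], normalized at the last step,
# reproduces filtered_posterior(index=2) == [0, 0, 1].
import numpy as np


def forward_filter_check():
    prior = np.array([1 / 3, 1 / 3, 1 / 3])  # Areas 1-3
    T = np.array([[0.25, 0.75, 0.00],
                  [0.00, 0.25, 0.75],
                  [0.00, 0.00, 1.00]])       # T[i, j] = P(next = j | current = i)
    E = np.array([[1.0, 0.0],
                  [0.0, 1.0],
                  [1.0, 0.0]])               # E[i, k] = P(obs = k | state = i); hot=0, cold=1
    obs = [0, 1, 0]                          # 'hot', 'cold', 'hot'

    alpha = prior * E[:, obs[0]]
    for k in obs[1:]:
        alpha = (alpha @ T) * E[:, k]
    return alpha / alpha.sum()               # -> array([0., 0., 1.])
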
def prepare_seqs_en(self, decoding="viterbi"):
    params_fixed = (np.load("{}/ip.npy".format(self.path)),
                    np.load("{}/tp.npy".format(self.path)),
                    np.load("{}/fp.npy".format(self.path)),
                    np.load("{}/ep.npy".format(self.path)))
    h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)

    self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)
    train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
    test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
    muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test) if self.use_muc else None

    decoder = None
    type_decoder = None
    if decoding == "viterbi":
        decoder = h.viterbi_decode_corpus
    elif decoding == "max_emission":
        decoder = h.max_emission_decode_corpus
    elif decoding == "posterior":
        decoder = h.posterior_decode_corpus
    elif decoding == "posterior_cont":
        decoder = h.posterior_cont_decode_corpus
    elif decoding == "posterior_cont_type":
        type_decoder = h.posterior_cont_type_decode_corpus
    else:
        print("Decoder not defined correctly, using Viterbi.")
        decoder = h.viterbi_decode_corpus

    def run_decoder(seq):
        if type_decoder is not None:
            type_decoder(seq, self.dataset, self.logger)
        else:
            decoder(seq)

    print("Decoding word representations on train. This may take a while...")
    run_decoder(train_seq)
    print("Decoding word representations on dev.")
    run_decoder(dev_seq)
    print("Decoding word representations on test.")
    run_decoder(test_seq)
    if self.use_muc:
        print("Decoding word representations on MUC.")
        run_decoder(muc_seq)

    return train_seq, dev_seq, test_seq, muc_seq

def __get_best_pos_to_shoot(self):
    """Return the position at which shooting the enemy is most likely to succeed."""
    # Build an HMM from the current net probabilities and Viterbi-decode
    # the most likely state at time t.
    transition_probabilities = self.__get_net_probs()
    emission_probabilities = self.__get_net_probs()
    hmm = HMM(transition_probabilities, emission_probabilities)
    emissions = [2, 1, 0]
    initial = self.__get_net_probs()
    return self.net[self.viterbi(hmm, initial, emissions)[0]].id

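# A minimal Viterbi sketch consistent with the call above: given an initial
# distribution, a transition matrix, and an emission matrix, return the most
# likely state sequence for a list of emission indices. This is the generic
# textbook decoder, not necessarily the project's own viterbi() method.
import numpy as np


def viterbi_decode(initial, transition, emission, observations):
    transition = np.asarray(transition)
    emission = np.asarray(emission)
    n_states = len(initial)
    n_steps = len(observations)
    score = np.zeros((n_steps, n_states))
    back = np.zeros((n_steps, n_states), dtype=int)

    score[0] = np.asarray(initial) * emission[:, observations[0]]
    for t in range(1, n_steps):
        cand = score[t - 1][:, None] * transition  # cand[i, j]: path ending in j via i
        back[t] = np.argmax(cand, axis=0)
        score[t] = np.max(cand, axis=0) * emission[:, observations[t]]

    # Follow the back-pointers from the best final state.
    path = [int(np.argmax(score[-1]))]
    for t in range(n_steps - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return path[::-1]
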
def test_advance_emission(self):
    hmm = HMM({'q0': ['q1'],
               'q1': (['q1', 'qf'],
                      ['dag', 'kat', 'dot', 'kod', 'gas', 'toz'] + ['zo', 'go', 'do'] + ['at'])})
    self.write_to_dot_to_file(hmm, "pre_advance_emission_hmm")
    hmm.advance_emission()
    for line in hmm.get_log_lines():
        print(line)
    self.write_to_dot_to_file(hmm, "advance_emission_hmm")

def test_init(self):
    hmm = HMM(UDDataSet('data/en-ud-train.conllu'))
    self.assertEqual(17, hmm.num_state)
    self.assertEqual(17, hmm.bos_idx)
    self.assertEqual(18, hmm.eos_idx)
    for i in range(hmm.num_state):
        self.assertAlmostEqual(-12.228919653600784, hmm.emission_counter[(i, -1)])

def test_crossover_subgraph(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm_1 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q1', 'q2'], ['da']),
                 'q2': ([FINAL_STATE], ['s'])})
    hmm_2 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2'], ['ko']),
                 'q2': (['q3'], ['bo']),
                 'q3': (['q4'], ['go']),
                 'q4': ([FINAL_STATE], ['z'])})
    offspring_1, offspring_2 = HMM.crossover_subgraphs(hmm_1, hmm_2)
    self.write_to_dot_to_file(hmm_1, 'subgraph_parent_1')
    self.write_to_dot_to_file(hmm_2, 'subgraph_parent_2')
    self.write_to_dot_to_file(offspring_1, 'subgraph_offspring_1')
    self.write_to_dot_to_file(offspring_2, 'subgraph_offspring_2')
    offspring_1.get_transducer()
    offspring_2.get_transducer()

def test_split_then_merge_state(self):
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['koko', 'gogo'])})
    self.write_to_dot_to_file(hmm, "split_states_before")
    hmm.split_state()
    hmm.merge_states()
    for line in hmm.get_log_lines():
        print(line)
    self.write_to_dot_to_file(hmm, "split_states_after")

def test_merge_emissions(self):
    hmm = HMM({'q0': ['q1', 'q5'],
               'q1': (['qf'], ['koko']),
               'q5': (['qf'], ['dag', 'kat'])})
    self.write_to_dot_to_file(hmm, "merge_states_before")
    hmm.merge_emissions()
    for line in hmm.get_log_lines():
        print(line)
    self.write_to_dot_to_file(hmm, "merge_states_after")

def test_morpheme_boundary(self):
    self.configurations["MORPHEME_BOUNDARY_FLAG"] = True
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm)
    self.assertCountEqual(['dog', 'kat', 'dogz', 'katz'], grammar.get_all_outputs())

def test_remove_states(self):
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q4', FINAL_STATE], ['a']),
               'q4': (['q8', 'q8', FINAL_STATE], ['d']),
               'q7': (['q8'], ['d']),
               'q8': (['q2', FINAL_STATE], ['g']),
               'q2': (['q1', FINAL_STATE], ['z'])})
    hmm.remove_states(['q1'])
    print(hmm.inner_states)
    self.write_to_dot_to_file(hmm, 'after_remove')
    log_hmm(hmm)

def model_training(train_data, tags):
    # Lowercase all words so counts are case-insensitive.
    for data in train_data:
        for it in range(data.length):
            data.words[it] = data.words[it].lower()

    S = len(tags)
    pi = np.zeros(S)
    A = np.zeros([S, S])
    # Bc starts with zero columns; one column is appended per new word, so
    # the emission matrix ends up exactly S x O.
    Bc = np.zeros([S, 0])
    Ac = np.zeros([S, S])
    obs_dict = {}
    states_symbols = {}
    for i in range(S):
        if tags[i] not in states_symbols:
            states_symbols[tags[i]] = i
    numS = np.zeros(S)
    num1S = np.zeros(S)

    # Accumulate initial-tag, emission, and transition counts.
    for data in train_data:
        firsttag = data.tags[0]
        num1S[states_symbols[firsttag]] += 1
        for i in range(data.length):
            word = data.words[i]
            tag = data.tags[i]
            if word not in obs_dict:
                obs_dict[word] = len(obs_dict)
                Bc = np.append(Bc, np.zeros([S, 1]), axis=1)
            Bc[states_symbols[tag], obs_dict[word]] += 1
            numS[states_symbols[tag]] += 1
            if i != data.length - 1:
                Ac[states_symbols[tag], states_symbols[data.tags[i + 1]]] += 1

    # Normalize counts into probabilities, guarding tags that never occur.
    B = np.zeros(np.shape(Bc))
    pi = normalize(num1S)
    for s in range(S):
        for sp in range(S):
            A[s, sp] = 0 if numS[s] == 0 else Ac[s, sp] / numS[s]
    for s in range(len(Bc)):
        for o in range(len(Bc[0])):
            B[s, o] = 0 if numS[s] == 0 else Bc[s, o] / numS[s]

    model = HMM(pi, A, B, obs_dict, states_symbols)
    return model

def test_morphology_only2(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    data = [u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso', u'tozzookakt',
            u'tozzook', u'tozdodata', u'tozdodaso', u'tozdodakt', u'tozdod', u'tozgosata',
            u'tozgosaso', u'tozgosakt', u'tozgos', u'toz', u'dagata', u'dagaso', u'dagakt',
            u'dagzookata', u'dagzookaso', u'dagzookakt', u'dagzook', u'dagdodata',
            u'dagdodaso', u'dagdodakt', u'dagdod', u'daggosata', u'daggosaso', u'daggosakt',
            u'daggos', u'dag', u'gasata', u'gasaso', u'gasakt', u'gaszookata', u'gaszookaso',
            u'gaszookakt', u'gaszook', u'gasdodata', u'gasdodaso', u'gasdodakt', u'gasdod',
            u'gasgosata', u'gasgosaso', u'gasgosakt', u'gasgos', u'gas', u'kodata', u'kodaso',
            u'kodakt', u'kodzookata', u'kodzookaso', u'kodzookakt', u'kodzook', u'koddodata',
            u'koddodaso', u'koddodakt', u'koddod', u'kodgosata', u'kodgosaso', u'kodgosakt',
            u'kodgos', u'kod', u'katata', u'kataso', u'katakt', u'katzookata', u'katzookaso',
            u'katzookakt', u'katzook', u'katdodata', u'katdodaso', u'katdodakt', u'katdod',
            u'katgosata', u'katgosaso', u'katgosakt', u'katgos', u'kat', u'dotata', u'dotaso',
            u'dotakt', u'dotzookata', u'dotzookaso', u'dotzookakt', u'dotzook', u'dotdodata',
            u'dotdodaso', u'dotdodakt', u'dotdod', u'dotgosata', u'dotgosaso', u'dotgosakt',
            u'dotgos', u'dot']
    hmm = HMM({'q0': [u'q1'],
               'q1': ([u'q2', u'q3', u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
               'q2': ([u'q3', u'qf'], ['zook', 'gos', 'dod']),
               'q3': ([u'qf'], ['aso', 'akt', 'ata'])})
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(Grammar(hmm, []))

def model_training(train_data, tags):
    """
    Train HMM based on training data

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an
      object of line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of HMM class initialized with parameters
      (pi, A, B, obs_dict, state_dict) you calculated based on train_data
    """
    N = len(tags)
    A = np.ones((N, N)) / N  # uniform prior as additive smoothing
    pi = np.ones(N) / N
    state_dict, tag_dict, obs_dict = {}, {}, {}
    word_list = []

    for idx, tag in enumerate(tags):
        state_dict[tag] = idx

    # Count initial tags, tag frequencies, and tag-to-tag transitions.
    for cur_line in train_data:
        pi[state_dict[cur_line.tags[0]]] += 1
        for idx in range(cur_line.length):
            tag = cur_line.tags[idx]
            word_list.append(cur_line.words[idx])
            tag_dict[tag] = tag_dict.get(tag, 0) + 1
            if idx < cur_line.length - 1:
                A[state_dict[tag], state_dict[cur_line.tags[idx + 1]]] += 1

    word_list = list(set(word_list))
    for idx, word in enumerate(word_list):
        obs_dict[word] = idx

    # Convert tag counts to relative frequencies.
    total_tags = sum(tag_dict.values())
    for key in tag_dict:
        tag_dict[key] /= total_tags

    # Every word observed with a tag gets that tag's overall relative
    # frequency as its emission score.
    B = np.zeros([N, len(word_list)])
    for line in train_data:
        for word, tag in zip(line.words, line.tags):
            B[state_dict[tag], obs_dict[word]] = tag_dict[tag]

    A /= np.sum(A, axis=1)[:, None]
    pi /= len(train_data)
    model = HMM(pi, A, B, obs_dict, state_dict)
    return model