def train(self):
    mmModel = np.zeros((noteRange + 1, noteRange + 1))
    mm3Model = np.zeros((noteRange + 1, noteRange + 1, noteRange + 1))
    hmmModel = HMM(12, noteRange + 1)
    obs = []
    ground = []
    actions = []
    for i in range(minNote, maxNote):
        actions.append(i)
    qModel = QLearner(actions, epsilon=0.1, alpha=0.2, gamma=0.9)
    for ls in self.clusterData:
        for quadidx, quad in enumerate(ls):
            # take this out for prevnote stuff
            tempquad = [x - minNote for x in quad]
            # this is for the HMM; the same transformation could also be used to change the Q-learning state
            obs.append(tempquad[1:])
            tempquad = [(x - minNote) % 12 for x in quad]
            ground.append(tempquad[:3])
            if quad:
                for idx, note in enumerate(quad):
                    if idx > 0:
                        currNote = note
                        prevNote = quad[idx - 1]
                        # Q-learning: q.learn(state1, action1, reward, state2)
                        qModel.learn(prevNote, note, 1, note)
                        # first-order Markov model
                        mmModel[currNote - minNote, prevNote - minNote] += 1
                    if idx > 2:
                        # higher-order Markov model (two previous notes)
                        currNote = note - minNote
                        prevNote = quad[idx - 1] - minNote
                        prevNote2 = quad[idx - 2] - minNote
                        mm3Model[currNote, prevNote, prevNote2] += 1
    hmmModel.learn(obs, ground)
    return (mmModel, mm3Model, hmmModel, qModel)
class HMMPOSTagger(object):
    """Part-of-speech tagging with an HMM, applied on top of Chinese word segmentation results."""

    def __init__(self):
        self.hmm = HMM()
        self.re_chinese = re.compile(ur"([\u4E00-\u9FA5]+)")    # matches runs of Chinese characters
        self.re_skip = re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")  # matches runs of English letters and digits

    def load(self, model_dir):
        """Load the model files."""
        self.hmm.load(model_dir)

    def pos_tag(self, words):
        """HMM-based part-of-speech tagging."""
        log_prob, pos_list = self._viterbi(words)
        for i, w in enumerate(words):
            yield (w, pos_list[i])
def main(job_id, params):
    num_runs = 20
    obs_length = 100
    num_states = 2
    num_obs = 2
    # read in the HMM index
    t = 0
    try:
        with open(os.path.join('.', 'hmm_index.txt')) as hmm_index_file:
            t = int(hmm_index_file.read())
        sys.stderr.write("!!!!!!!!!!!!!!!!!!HMM INDEX: " + str(t) + " !!!!!!!!!!!!!!!\n")
    except IOError:
        t = 0
    # generate HMM observations
    np.random.seed(0x6b6c26b2)
    seeds = np.random.randint(0x0fffffff, size=num_runs)
    np.random.seed(seeds[t])
    # random HMM
    z_mat, t_mat = random_hmm(num_states, num_obs)
    pi_vec = np.array([1.0 / num_states] * num_states)
    hmm_test = HMM(z_mat, t_mat, pi_vec)
    # random observation trajectory
    obs = hmm_test.generate(obs_length)[np.newaxis, :]
    # calculate log likelihood for the input HMM parameters
    z_mat_p_input = np.array([[params['z_mat_p_0'][0], params['z_mat_p_1'][0]]])
    t_mat_p_input = np.array([[params['t_mat_p_0'][0], params['t_mat_p_1'][0]]])
    # pi_vec_input = np.array([params['pi_0'], 1 - params['pi_0']])
    hmm_estimate = make_parameterized_HMM(z_mat_p_input, t_mat_p_input, pi_vec)
    hmm_loglikelihood = hmm_estimate.loglikelihood(obs[0])
    return -hmm_loglikelihood
def prepare_seqs_nl_dbg(self, decoding="viterbi"):
    params_fixed = (np.load("{}ip.npy".format(self.path)),
                    np.load("{}tp.npy".format(self.path)),
                    np.load("{}fp.npy".format(self.path)),
                    np.load("{}ep.npy".format(self.path)))
    h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False)
    h.dirname = self.path
    self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict)
    # train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
    test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)
    if decoding == "viterbi":
        decoder = h.viterbi_decode_corpus
    elif decoding == "max_emission":
        decoder = h.max_emission_decode_corpus
    elif decoding == "posterior":
        decoder = h.posterior_decode_corpus
    elif decoding == "posterior_cont":
        decoder = h.posterior_cont_decode_corpus
    elif decoding == "posterior_cont_type":
        decoder = h.posterior_cont_type_decode_corpus
    else:
        print("Decoder not defined, using Viterbi.")
        decoder = h.viterbi_decode_corpus
    # print("Decoding word representations on train.")
    # decoder(train_seq)
    print("Decoding word representations on dev.")
    decoder(dev_seq)
def simple_weather_model():
    hmm = HMM(["s1", "s2"], ["R", "NR"])
    init = [0.7, 0.3]
    trans = [[0.8, 0.2],
             [0.1, 0.9]]
    observ = [[0.75, 0.25],
              [0.4, 0.6]]
    hmm.set_hidden_model(init, trans, observ)
    return hmm
def main():
    parser = OptionParser()
    parser.add_option("-d", dest="training", help="training data directory")
    parser.add_option("-k", dest="K", type="int", help="number of latent states", default=6)
    parser.add_option("-a", dest="a", type="float", help="Dirichlet parameter", default=1.0)
    parser.add_option("-i", dest="I", type="int", help="iteration count", default=10)
    parser.add_option("-m", dest="model", help="model data filename to save")
    (options, args) = parser.parse_args()
    if not options.training:
        parser.error("need training data directory(-d)")

    features = load_data(options.training)
    hmm = HMM()
    hmm.set_corpus(features)
    hmm.init_inference(options.K, options.a)
    pre_L = -1e10
    for i in range(options.I):
        log_likelihood = hmm.inference()
        print i, ":", log_likelihood
        if pre_L > log_likelihood:
            break
        pre_L = log_likelihood
    if options.model:
        hmm.save(options.model)
    else:
        hmm.dump()
def main():
    hmm = HMM(*train(sys.argv[1]))
    with open(sys.argv[2]) as f:
        correct = 0
        wrong = 0
        correct_sents = 0
        wrong_sents = 0
        correct_known = 0
        wrong_known = 0
        for i, sent in enumerate(Reader(f)):
            prob, path = hmm.decode([word for (word, pos) in sent])
            correct1 = 0
            wrong1 = 0
            for (gold, predicted) in zip(sent, path):
                if gold == predicted:
                    correct1 += 1
                else:
                    wrong1 += 1
            print('%e\t%.3f\t%s' % (prob, correct1 / (correct1 + wrong1),
                                    ' '.join('%s/%s' % pair for pair in path)))
            if prob > 0:
                correct_sents += 1
                correct_known += correct1
                wrong_known += wrong1
            else:
                wrong_sents += 1
            correct += correct1
            wrong += wrong1
        print("Correctly tagged words: %s" % (correct / (correct + wrong)))
        print("Sentences with non-zero probability: %s" % (correct_sents / (correct_sents + wrong_sents)))
        print("Correctly tagged words when only considering sentences with non-zero probability: %s"
              % (correct_known / (correct_known + wrong_known)))
def toy_model(self):
    hmm = HMM(["s1", "s2"], ["R", "NR"])
    init = [0.5, 0.5]
    trans = [[0.2, 0.8],
             [0.8, 0.2]]
    observ = [[0.8, 0.2],
              [0.2, 0.8]]
    hmm.set_hidden_model(init, trans, observ)
    return hmm
def __init__(self, limb, name='Demo_26_Final'):
    rospy.init_node(name, anonymous=True)
    self._startGesture = '0'
    self._limb = limb
    self._knhObj = KinectNiteHelp()
    self._baeObj = BaxterArmEndpoint(self._limb)
    self._dmObj = Demo26Help()
    self._lhObj = LeapHelp()
    self._handCoordinates = []
    self._baxterCoordinates = []
    self._posCount = 0
    rtMatFile = open("RTMatFile.dat", "r")
    self._rotMat = cPickle.load(rtMatFile)
    self._transMat = cPickle.load(rtMatFile)
    self._gCount = 0
    self._gOn = 0
    self._gPointCount = 0
    self._gPoints = []
    self._hmmObjG1 = HMM('G1.hmm')
    self._hmmObjG2 = HMM('G2.hmm')
    self._hmmObjG3 = HMM('G3.hmm')
    self._hmmObjG4 = HMM('G4.hmm')
    self._flag = '0'
    self._pub = rospy.Publisher('/robot/xdisplay', Image, latch=True)
    # self._flagPub = rospy.Publisher('flag_topic', String)
    self._sub = rospy.Subscriber('/key_tap_topic', String, self._callback)
    rtMatFile.close()
    img = cv.LoadImage('Welcome.png')
    msg = cv_bridge.CvBridge().cv_to_imgmsg(img, encoding="bgr8")
    self._pub.publish(msg)
    # Sleep to allow for image to be published.
    rospy.sleep(3)
def __init__(self, **kwarg):
    # lname, url, other prior knowledge
    super(HMMClassifier, self).__init__()
    self.HMMauthor = HMM('author', 2)
    self.HMMvenue = HMM('venue', 2)    # not important
    self.HMMentire = HMM('entire', 6)  # set empirically
    self.observations_raw = []
    self.observation_sequences = []
    self.labels = []
class TestHMM():
    def __init__(self):
        self.Z = numpy.array([
            [0.8, 0.09, 0.01],
            [0.09, 0.8, 0.01],
            [0.1, 0, 0.8]
        ])
        self.b = numpy.array([
            [0.1, 0.1, 0.8],
            [0.05, 0.9, 0.05],
            [0.8, 0.1, 0.1]
        ])
        self.pi = numpy.array([0.9, 0.05, 0.05])
        self.T = 2000
        # we want the errors to be less than 20%
        self.error_threshold = 0.2

    def setup(self):
        self.model = HMM(self.Z, self.b, self.pi)

    def gen_states_obs(self):
        states = []
        obsvns = []
        for (s, o) in self.model.gen(self.T):
            states.append(s)
            obsvns.append(o)
        return states, obsvns

    def test_init(self):
        self.model = HMM(self.Z, self.b, self.pi)

    def test_gen(self):
        self.setup()
        states = []
        obsvns = []
        for (s, o) in self.model.gen(10):
            states.append(s)
            obsvns.append(o)
        assert len(states) == 10
        assert len(obsvns) == 10

    def test_forward_backward(self):
        self.setup()
        states, obsvns = self.gen_states_obs()
        alpha, beta = self.model.forward_backward(obsvns)
        gamma = [a * b / sum(a * b) for a, b in zip(alpha, beta)]
        state_est = numpy.array([numpy.where(g == max(g))[0][0] for g in gamma])
        err = sum(state_est != numpy.array(states)) / float(len(states))
        assert err < self.error_threshold

    def test_viterbi(self):
        self.setup()
        states, obsvns = self.gen_states_obs()
        state_est = self.model.viterbi(obsvns)
        err = sum(state_est != numpy.array(states)) / float(len(states))
        assert err < self.error_threshold
def test_simple_hmm_learning(self):
    state_seq = [[0, 1, 1, 0, 1, 0, 1, 1], [0, 0, 1, 0]]
    obs_seq = [[0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 0, 0]]
    hmm = HMM(range(2), range(2))
    hmm.learn_from_labeled_data(state_seq, obs_seq)
    print hmm
    eps = 0.00001
    self.assertTrue(max_delta(hmm.initial, [0.750000, 0.250000]) < eps)
    self.assertTrue(max_delta(hmm.transition, [[0.285714, 0.714286],
                                               [0.571429, 0.428571]]) < eps)
    self.assertTrue(max_delta(hmm.observation, [[0.625000, 0.375000],
                                                [0.625000, 0.375000]]) < eps)
def test_hmm():
    m = HMM(2, 2)
    observations = [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1],
                    [0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0]]
    ground = [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
              [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]]
    m.learn(observations, ground, smooth=None)
    trueres = ([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0], -21.944)
    res = m.viterbi(observations[1])
    assert trueres[0] == res[0]
    print trueres[1]
    print res[1]
    assert abs(trueres[1] - res[1]) < 0.1
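
# Not part of the test above: a compact, self-contained sketch of the Viterbi
# max-product recursion in plain NumPy, with toy two-state parameters chosen for
# illustration (not the parameters learned by the test). It shows the kind of
# computation a viterbi() method like the one exercised above performs.
import numpy as np

def viterbi_path(pi, A, B, obs):
    """Return (best state path, its log probability) for an observation sequence."""
    with np.errstate(divide="ignore"):
        log_pi, log_A, log_B = np.log(pi), np.log(A), np.log(B)
    T, S = len(obs), len(pi)
    delta = np.zeros((T, S))
    back = np.zeros((T, S), dtype=int)
    delta[0] = log_pi + log_B[:, obs[0]]
    for t in range(1, T):
        scores = delta[t - 1][:, None] + log_A  # scores[i, j]: best path ending in i, then i -> j
        back[t] = scores.argmax(axis=0)
        delta[t] = scores.max(axis=0) + log_B[:, obs[t]]
    path = [int(delta[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return path[::-1], float(delta[-1].max())

toy_pi = np.array([0.6, 0.4])
toy_A = np.array([[0.8, 0.2], [0.3, 0.7]])
toy_B = np.array([[0.9, 0.1], [0.2, 0.8]])
print(viterbi_path(toy_pi, toy_A, toy_B, [0, 0, 1, 1, 0]))
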
def setUp(self):
    self._model_filename = "hmm_m4n4.pkl"
    self._train_filename = "m4n4.train.data"
    self._num_hidden = 4
    self._num_observ = 4
    transition_matrix = np.random.rand(4, 4)
    observation_matrix = np.random.rand(4, 4)
    hmm = HMM(self._num_hidden, self._num_observ,
              transition_matrix=transition_matrix,
              observation_matrix=observation_matrix)
    sequences = hmm.generate_data(10000, 4, 51)
    io.save_sequences(self._train_filename, sequences)
    HMM.to_file(self._model_filename, hmm)
def main(job_id, params):
    num_runs = 20
    obs_length = 100
    num_states = 2
    num_obs = 2
    # read in the HMM index
    t = 0
    try:
        with open(os.path.join('.', 'hmm_index.txt')) as hmm_index_file:
            t = int(hmm_index_file.read())
        sys.stderr.write("!!!!!!!!!!!!!!!!!!HMM INDEX: " + str(t) + " !!!!!!!!!!!!!!!\n")
    except IOError:
        t = 0
    # generate HMM observations
    np.random.seed(0x6b6c26b2)
    seeds = np.random.randint(0x0fffffff, size=num_runs)
    np.random.seed(seeds[t])
    # random HMM
    z_mat, t_mat = random_hmm(num_states, num_obs)
    pi_vec = np.array([1.0 / num_states] * num_states)
    hmm_test = HMM(z_mat, t_mat, pi_vec)
    # random observation trajectory
    obs = hmm_test.generate(obs_length)[np.newaxis, :]
    # calculate log likelihood for the input HMM parameters
    z_mat_p_input = np.array([[params['z_mat_p_0'][0], params['z_mat_p_1'][0]]])
    t_mat_p_input = np.array([[params['t_mat_p_0'][0], params['t_mat_p_1'][0]]])
    # pi_vec_input = np.array([params['pi_0'], 1 - params['pi_0']])
    hmm_estimate = make_parameterized_HMM(z_mat_p_input, t_mat_p_input, pi_vec)
    hmm_loglikelihood = hmm_estimate.loglikelihood(obs[0])
    # use the current suggested point and run EM to get a new point
    hmm_em_est, _, _ = em.em(hmm_estimate, hmm_estimate.z_mat, hmm_estimate.t_mat, obs, 30, 0.1)
    em_est_z_mat, em_est_t_mat = retrieve_parameterized_HMM(hmm_em_est)
    em_est_ll = -hmm_em_est.loglikelihood(obs[0])
    em_est_z_mat.reshape((em_est_z_mat.size,))
    em_est_t_mat.reshape((em_est_t_mat.size,))
    print em_est_t_mat
    print em_est_z_mat
    historical_points = [{'params': {}}]
    # write z_mat
    for i, v in enumerate(em_est_z_mat[0]):
        historical_points[0]['params']['z_mat_p_' + str(i)] = {'values': np.array([v]), 'type': 'float'}
    # write t_mat
    for i, v in enumerate(em_est_t_mat[0]):
        historical_points[0]['params']['t_mat_p_' + str(i)] = {'values': np.array([v]), 'type': 'float'}
    historical_points[0]['value'] = em_est_ll
    dump_new_history('.', historical_points)
    return -hmm_loglikelihood
def main():
    parser = OptionParser()
    parser.add_option("-t", dest="test", help="test data directory")
    parser.add_option("-m", dest="model", help="model data filename to save")
    (options, args) = parser.parse_args()
    if not options.model:
        parser.error("need model data filename(-m)")

    hmm = HMM()
    hmm.load(options.model)

    if options.test:
        tests = load_data(options.test)
        for x in tests:
            print zip(x, hmm.Viterbi(hmm.words2id(x)))
def train_hmm_from_data(data_filename, debug=False):
    if debug:
        print "\n\nReading dataset %s ..." % data_filename
    data_filename = normalize_filename(data_filename)
    d = DataSet(data_filename)
    # if options.verbose:
    #     print d
    if debug:
        print "Building an HMM from the full training data..."
    hmm = HMM(d.states, d.outputs)
    hmm.learn_from_labeled_data(d.train_state, d.train_output)
    if debug:
        print "The model:"
        print hmm
    return (hmm, d)
def hmmBaumWelchRestarts(nRuns, stopping, obs, parallel=True):
    '''Runs Baum-Welch nRuns times, stopping each run when the likelihood
    changes by less than `stopping`.'''
    models = [HMM.randomStudent() for _ in xrange(nRuns)]
    return baumWelchRestarts(models, obs, stopping, parallel)
def main():
    hmm = HMM(3, ('up', 'down', 'unchanged'),
              initial_probability=[0.5, 0.2, 0.3],
              transition_probability=[[0.6, 0.2, 0.2],
                                      [0.5, 0.3, 0.2],
                                      [0.4, 0.1, 0.5]],
              observation_probability=[[0.7, 0.1, 0.2],
                                       [0.1, 0.6, 0.3],
                                       [0.3, 0.3, 0.4]])
    observation = ("up", "up", "unchanged", "down", "unchanged", "down", "up")
    ob_length = len(observation)
    p, _ = hmm.forward(observation, ob_length)
    path = hmm.decode(observation, ob_length)
    print("P{} = {:.13f}".format(tuple(observation), p))
    print("Decoded state sequence =", tuple(i + 1 for i in path))
def test_accumulative(self):
    hmm = HMM.from_file(self._model_filename)
    for i in xrange(self._num_hidden):
        self.assertAlmostEqual(1.0, hmm._accumulative_transition_matrix[-1, i], delta=1e-6)
        self.assertAlmostEqual(1.0, hmm._accumulative_observation_matrix[-1, i], delta=1e-6)
def hmm_train(results):
    train_file = results.train
    freq_file = results.freq
    logger.debug('Started training HMM with options:' + "\n" +
                 'training file: ' + str(train_file) + "\n" +
                 'frequency file: ' + str(freq_file) + "\n")
    if not os.path.exists('model/hmm-model'):
        classifier = HMM()
        classifier.train(train_file, freq_file)
        logger.info("Done training, model is written to the model file")
        model = classifier.get_theta()
        write_obj(model, 'hmm-model')
    else:
        logger.info('Model already exists, nothing to do!')
def test_slfit(self):
    sequences = io.load_sequences(self._train_filename)
    hmm = HMM.from_file(self._model_filename)
    learner = SLHMM(self._num_hidden, self._num_observ)
    learner.fit(sequences, verbose=True)
    for sequence in sequences:
        pprint("True probability: %f" % hmm.predict(sequence))
        pprint("Inferred probability: %f" % learner.predict(sequence))
def __init__(self, hmm, rule_set=None):
    if isinstance(hmm, HMM):
        self.hmm = hmm
    else:
        self.hmm = HMM(hmm)
    segment_table = SegmentTable()
    self.segment_symbol_length = uniform_encoding.log2(len(segment_table) + 1)  # + 1 for the delimiter
    if rule_set:
        self.rule_set = rule_set
    else:
        self.rule_set = RuleSet(noise=False)
    noises = configurations.get("NOISE_RULE_SET", [])
    self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noises)
    self._cached_hmm_transducer = None
    self._cached_rule_set_transducer = None
    self._cached_noise_rule_set_transducer = None
def hmm_test(results):
    test_file = results.test
    freq_file = results.freq
    logger.debug('Started testing HMM with options:' + "\n" +
                 'test file: ' + str(results.test) + "\n" +
                 'frequency file: ' + str(freq_file) + "\n")
    logger.info("Loading model")
    model = read_obj('hmm-model')
    classifier = HMM()
    classifier.load_theta(model)
    classifier.test(test_file, freq_file)
def testViterbi(self):
    """Test the Viterbi algorithm on a small hand-built POS-tagging example."""
    seq1 = Instance(label="he saw a dog".split(), data=['NN', 'VB', 'D', 'NN'])
    seq2 = Instance(label="the dog sees a saw".split(), data=['D', 'NN', 'VB', 'D', 'NN'])
    seq3 = Instance(label="the saw saw a dog".split(), data=['D', 'NN', 'VB', 'D', 'NN'])
    seq4 = Instance(label="he saw the saw".split(), data=['NN', 'VB', 'D', 'NN'])
    instances = [seq1, seq2, seq3, seq4]
    hmm = HMM()
    hmm.train(instances)
    sawind = hmm.label_alphabet.get_index('saw')
    self.assertNotEqual(hmm.emission_matrix[sawind, 0], 0, 'WRONG')
    self.assertNotEqual(hmm.emission_matrix[sawind, 1], 0, 'WRONG')
def __init__(self, demos, n_state, gamma, n_offspring):
    '''Base RL class. Creates an HMM internally, explores and updates the HMM
    parameters, generates motion for an episode, and stores rollout (episode)
    information.

    :param demos: list of 2D numpy arrays, each of shape (n_keyframe, n_dim);
        n_dim = 7 for the action model, n_dim = 8 for the goal model
    :param n_state: integer, number of possible hidden states
    :param gamma: initial covariance matrix multiplier
    :param n_offspring: number of offspring (rollouts) in an episode
    '''
    self.hmm = HMM(demos, n_state, gamma)
    self.n_offspring = n_offspring
    self.reset_rollout()
def testSupervisedTraining(self):
    """Test supervised parameter fitting on a small hand-built odd/even example."""
    seq1 = Instance(label=['odd', 'even', 'odd', 'even', 'odd'], data=[3, 2, 1, 4, 1])
    seq2 = Instance(label=['even', 'even', 'odd', 'odd', 'even'], data=[2, 4, 1, 3, 2])
    seq3 = Instance(label=['even', 'even', 'odd', 'odd', 'odd'], data=[1, 2, 3, 4, 3])
    seq4 = Instance(label=['odd', 'odd', 'even', 'even', 'even'], data=[4, 3, 4, 1, 2])
    instances = [seq1, seq2, seq3, seq4]
    hmm = HMM()
    hmm.train(instances)
    mystery = Instance(data=[2, 1, 3, 4, 2, 2, 1, 3])
    labels = hmm.classify_instance(mystery)
    self.assertEqual(labels,
                     ['even', 'odd', 'odd', 'even', 'even', 'even', 'odd', 'odd'],
                     'NOOO')
def test_morpheme_boundary(self):
    configurations["MORPHEME_BOUNDARY_FLAG"] = True
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, [])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to testing data')
    args = parser.parse_args()
    test_data_path = args.path
    test_words = load_raw_data(test_data_path)
    hmm = HMM()
    hmm.load_model('./hmmmodel.txt')
    tag_result = []
    for sentence in test_words:
        decoded = hmm.decode(sentence)
        tag_result.append(decoded)
    save_result(test_words, tag_result)
def prepare_seqs_en(self, decoding="viterbi"):
    params_fixed = (np.load("{}/ip.npy".format(self.path)),
                    np.load("{}/tp.npy".format(self.path)),
                    np.load("{}/fp.npy".format(self.path)),
                    np.load("{}/ep.npy".format(self.path)))
    h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
    self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)
    train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
    test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
    muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test) if self.use_muc else None

    decoder = None
    type_decoder = None
    if decoding == "viterbi":
        decoder = h.viterbi_decode_corpus
    elif decoding == "max_emission":
        decoder = h.max_emission_decode_corpus
    elif decoding == "posterior":
        decoder = h.posterior_decode_corpus
    elif decoding == "posterior_cont":
        decoder = h.posterior_cont_decode_corpus
    elif decoding == "posterior_cont_type":
        type_decoder = h.posterior_cont_type_decode_corpus
    else:
        print("Decoder not defined correctly, using Viterbi.")
        decoder = h.viterbi_decode_corpus

    print("Decoding word representations on train. This may take a while...")
    type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq)
    print("Decoding word representations on dev.")
    type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq)
    print("Decoding word representations on test.")
    type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq)
    if self.use_muc:
        print("Decoding word representations on MUC.")
        type_decoder(muc_seq, self.dataset, self.logger) if type_decoder is not None else decoder(muc_seq)
    return train_seq, dev_seq, test_seq, muc_seq
def model_training(train_data, tags):
    """
    Train an HMM on the training data.

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of the line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of the HMM class initialized with the parameters
      (pi, A, B, obs_dict, state_dict) estimated from train_data
    """
    model = None
    N = len(train_data)
    S = len(tags)
    pi = np.zeros(S)
    A = np.zeros((S, S))
    state_dict = {}
    obs_dict = {}
    o = 0
    for t in range(S):
        state_dict[tags[t]] = t
    # count initial tags and tag-to-tag transitions
    for line in train_data:
        pi[state_dict[line.tags[0]]] += 1
        for w in range(line.length - 1):
            A[state_dict[line.tags[w]], state_dict[line.tags[w + 1]]] += 1
    # build the observation dictionary
    for line in train_data:
        for w in range(line.length):
            if line.words[w] not in obs_dict:
                obs_dict[line.words[w]] = o
                o += 1
    pi = pi / N
    A = (A.T / np.sum(A, axis=1)).T
    O = len(obs_dict)
    B = np.zeros((S, O))
    # count emissions and normalize
    for line in train_data:
        for w in range(line.length):
            B[state_dict[line.tags[w]], obs_dict[line.words[w]]] += 1
    B = (B.T / np.sum(B, axis=1)).T
    # replace NaNs produced by zero-count rows with zeros
    A[np.isnan(A)] = 0
    B[np.isnan(B)] = 0
    model = HMM(pi, A, B, obs_dict, state_dict)
    return model
def setUp(self):
    """Initialize the Eisner ice cream HMM (J & M, Figure 6.3)."""
    self.hmm = HMM()
    # These variables have many aliases. J & M call them π, A, B, Q, and V.
    # You don't need to use these names, but you do need to provide a way
    # of initializing them.
    self.hmm.train([],
                   initial_probabilities=[.8, .2],         # P(Hot), P(Cold)
                   transition_probabilities=[[.7, .3],     # P(Hot|Hot), P(Cold|Hot)
                                             [.4, .6]],    # P(Hot|Cold), P(Cold|Cold)
                   emission_probabilities=[[.2, .4, .4],   # P(1, 2, 3|Hot)
                                           [.5, .4, .1]],  # P(1, 2, 3|Cold)
                   states=("Hot", "Cold"),
                   vocabulary=(1, 2, 3))
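
# Not part of the test above: a minimal, self-contained sketch (plain NumPy,
# independent of whatever HMM class the test exercises) of the forward pass over
# the same Eisner ice-cream parameters shown in setUp.
import numpy as np

ICE_PI = np.array([.8, .2])                     # P(Hot), P(Cold)
ICE_A = np.array([[.7, .3], [.4, .6]])          # transition probabilities
ICE_B = np.array([[.2, .4, .4], [.5, .4, .1]])  # emission probabilities for 1, 2, 3 cones

def ice_cream_forward(obs):
    """Return P(obs) for a sequence of ice-cream counts drawn from {1, 2, 3}."""
    alpha = ICE_PI * ICE_B[:, obs[0] - 1]
    for o in obs[1:]:
        alpha = (alpha @ ICE_A) * ICE_B[:, o - 1]
    return alpha.sum()

print(ice_cream_forward([3, 1, 3]))  # ~0.0263 for the classic "3 1 3" sequence
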
def test_predict(self):
    sequences = io.load_sequences(self._train_filename)
    hmm = HMM.from_file(self._model_filename)
    for sequence in sequences:
        self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                         "HMM.prediction Error")
    sequences = [[0, 1], [1, 2, 3, 0], [0, 0, 0, 1]]
    for sequence in sequences:
        self.assertEqual(hmm.predict(sequence), hmm.predict(sequence),
                         "HMM.prediction Error")
def speech_tagging(test_data, model, tags):
    """
    Inputs:
    - test_data: (1*num_sentence) a list of sentences, each sentence is an object of the line class
    - model: an object of the HMM class

    Returns:
    - tagging: (num_sentence*num_tagging) a 2D list of output taggings for the sentences in test_data
    """
    tagging = []
    N, M = model.B.shape
    new_model = model
    new_column = 1e-6 * np.ones([N, 1])
    new_feature_number = 0
    new_b = model.B
    new_obs_dict = model.obs_dict
    for sentence in test_data:
        for word in sentence.words:
            if word not in model.obs_dict:
                # unseen word: append a small-probability column to B
                new_b = np.append(new_b, new_column, axis=1)
                # register the new word in the observation dictionary
                new_obs_dict[word] = len(new_b[0, :]) - 1
                new_feature_number += 1
    if new_feature_number != 0:
        new_model = HMM(model.pi, model.A, new_b, new_obs_dict, model.state_dict)
    for sentence in test_data:
        tag_row = new_model.viterbi(sentence.words)
        tagging.append(tag_row)
    return tagging
def __get_best_pos_to_shoot(self):
    """Returns the position from which the enemy is most likely to be hit."""
    # Gets the state of the Markov model at time t.
    transition_probabilities = self.__get_net_probs()
    emission_probabilities = self.__get_net_probs()
    hmm = HMM(transition_probabilities, emission_probabilities)
    emissions = [2, 1, 0]
    initial = self.__get_net_probs()
    return self.net[self.viterbi(hmm, initial, emissions)[0]].id
def hmm_test():
    st_time = time.time()
    model_file = "hmm_model.json"
    # load data
    with open(model_file, 'r') as f:
        data = json.load(f)
    A = np.array(data['A'])
    B = np.array(data['B'])
    pi = np.array(data['pi'])
    # observation symbols
    obs_dict = data['observations']
    # state symbols
    states_symbols = dict()
    for idx, item in enumerate(data['states']):
        states_symbols[item] = idx
    Osequence = np.array(data['Osequence'])
    N = len(Osequence)
    model = HMM(pi, A, B, obs_dict, states_symbols)

    delta = model.forward(Osequence)
    m_delta = np.array([[3.5000e-01, 1.3600e-01, 0.0000e+00, 0.0000e+00, 1.1136e-05, 1.1136e-05, 0.0000e+00],
                        [1.5000e-01, 3.2000e-02, 4.6400e-03, 2.7840e-04, 3.3408e-05, 1.1136e-05, 8.9088e-07]])
    print("Your forward function output:", delta)
    print("My forward function output:", m_delta)

    gamma = model.backward(Osequence)
    m_gamma = np.array([[1.6896e-06, 3.8400e-06, 6.4000e-05, 2.0000e-03, 1.4000e-02, 2.0000e-02, 1.0000e+00],
                        [1.9968e-06, 1.1520e-05, 1.9200e-04, 3.2000e-03, 2.2000e-02, 6.0000e-02, 1.0000e+00]])
    print("Your backward function output:", gamma)
    print("My backward function output:", m_gamma)

    prob1 = model.sequence_prob(Osequence)
    m_prob1 = 8.908800000000002e-07
    print("Your sequence_prob function output:", prob1)
    print("My sequence_prob function output:", m_prob1)

    prob2 = model.posterior_prob(Osequence)
    m_prob2 = np.array([[0.6637931, 0.5862069, 0., 0., 0.175, 0.25, 0.],
                        [0.3362069, 0.4137931, 1., 1., 0.825, 0.75, 1.]])
    print("Your posterior_prob function output:", prob2)
    print("My posterior_prob function output:", m_prob2)

    viterbi_path = model.viterbi(Osequence)
    m_viterbi_path = ['1', '1', '2', '2', '2', '2', '2']
    print('Your viterbi function output: ', viterbi_path)
    print('My viterbi function output: ', m_viterbi_path)

    en_time = time.time()
    print()
    print("hmm total time: ", en_time - st_time)
def test_hmm_connected_components(self):
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dag', 'kot']),
               'q2': (['q1', FINAL_STATE], ['z'])})
    log_hmm(hmm)
    component_states = hmm.get_connected_components(ignore_initial_and_final_states=False)
    self.write_to_dot_to_file(hmm, 'connected_hmm')
    print(component_states)
    assert component_states[0][0] == FINAL_STATE
    assert component_states[2][0] == INITIAL_STATE
    assert 'q1' in component_states[1]
    assert 'q2' in component_states[1]

    component_states = hmm.get_connected_components(ignore_initial_and_final_states=True)
    assert 'q1' in component_states[0]
    assert 'q2' in component_states[0]
    assert [INITIAL_STATE] not in component_states
    assert [FINAL_STATE] not in component_states
def train_hmm(X, k, prev_hmm=None, window_size=5, num_clusters=5, num_states=4, max_iter=10):
    """Trains an HMM.

    Args:
        X: the data
        k: the number of sequences to divide the data into
        prev_hmm: an HMM object used to initialize the parameters
        window_size: the window size of the exponential weighting
        num_clusters: the number of mixtures in each GMM
        num_states: the number of states in the HMM
        max_iter: the maximum number of iterations allowed in the HMM

    Returns:
        the trained HMM
    """
    temp_X = X.reshape((k, X.shape[0] // k, X.shape[1]))
    num_features = X.shape[1]
    if prev_hmm is None:
        pi = random_splits(num_states, 1)
        A = np.array([random_splits(num_states, 1) for _ in range(num_states)])
        weights = np.random.rand(num_states, num_clusters) / num_clusters
        means = np.random.rand(num_states, num_clusters, num_features) * .6 - .3
        cov = np.tile(np.eye(num_features), (num_states, num_clusters, 1, 1))
        for i in range(num_states):
            weights[i] = weights[i] / weights[i].sum()
    else:
        pi = prev_hmm.pi
        A = prev_hmm.A
        weights = prev_hmm.weights
        means = prev_hmm.means
        cov = prev_hmm.cov
    hmm = HMM(pi, A, weights, means, cov, num_states)
    hmm.em(X, window_size=window_size, max_iter=max_iter)
    return hmm
def test_init(self):
    hmm = HMM(UDDataSet('data/en-ud-train.conllu'))
    self.assertEqual(17, hmm.num_state)
    self.assertEqual(17, hmm.bos_idx)
    self.assertEqual(18, hmm.eos_idx)
    for i in range(hmm.num_state):
        self.assertAlmostEqual(-12.228919653600784, hmm.emission_counter[(i, -1)])
def train_N_state_hmms_from_data(filename, num_states, debug=False):
    """
    Reads all the data, splits it up by category, and then builds a
    separate HMM for each category in the data.
    """
    dataset = DataSet(filename)
    category_seqs = split_into_categories(dataset)

    # Build an HMM for each category in the data
    hmms = {}
    for cat, seqs in category_seqs.items():
        if debug:
            print "\n\nLearning %s-state HMM for category %s" % (num_states, cat)
        model = HMM(range(num_states), dataset.outputs)
        model.learn_from_observations(seqs, debug)
        hmms[cat] = model
        if debug:
            print "The learned model for %s:" % cat
            print model
    return (hmms, dataset)
def model_training(train_data, tags):
    """
    Train an HMM on the training data.

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of the line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of the HMM class initialized with the parameters
      (pi, A, B, obs_dict, state_dict) estimated from train_data
    """
    model = None
    N = len(tags)
    A = np.ones((N, N)) / N
    pi = np.ones(N) / N
    state_dict, tag_dict, obs_dict = {}, {}, {}
    word_list = []
    for idx, tag in enumerate(tags):
        state_dict[tag] = idx
    for cur_line in train_data:
        pi[state_dict[cur_line.tags[0]]] += 1
        for idx in range(cur_line.length):
            tag = cur_line.tags[idx]
            word_list.append(cur_line.words[idx])
            if tag not in tag_dict:
                tag_dict[tag] = 1
            else:
                tag_dict[tag] += 1
            if idx < cur_line.length - 1:
                A[tags.index(cur_line.tags[idx]), tags.index(cur_line.tags[idx + 1])] += 1
    word_list = list(set(word_list))
    for idx, word in enumerate(word_list):
        obs_dict[word] = idx
    total_tags = sum(tag_dict.values())
    for key in tag_dict.keys():
        tag_dict[key] /= total_tags
    B = np.zeros([N, len(word_list)])
    for line in train_data:
        for word, tag in zip(line.words, line.tags):
            B[state_dict[tag], obs_dict[word]] = tag_dict[tag]
    A /= np.sum(A, axis=1)[:, None]
    pi /= len(train_data)
    model = HMM(pi, A, B, obs_dict, state_dict)
    return model
def test_morpheme_boundary(self):
    self.configurations["MORPHEME_BOUNDARY_FLAG"] = True
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm)
    self.assertCountEqual(['dog', 'kat', 'dogz', 'katz'], grammar.get_all_outputs())
def test_morphology_only2(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    data = [u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso', u'tozzookakt', u'tozzook',
            u'tozdodata', u'tozdodaso', u'tozdodakt', u'tozdod', u'tozgosata', u'tozgosaso', u'tozgosakt',
            u'tozgos', u'toz',
            u'dagata', u'dagaso', u'dagakt', u'dagzookata', u'dagzookaso', u'dagzookakt', u'dagzook',
            u'dagdodata', u'dagdodaso', u'dagdodakt', u'dagdod', u'daggosata', u'daggosaso', u'daggosakt',
            u'daggos', u'dag',
            u'gasata', u'gasaso', u'gasakt', u'gaszookata', u'gaszookaso', u'gaszookakt', u'gaszook',
            u'gasdodata', u'gasdodaso', u'gasdodakt', u'gasdod', u'gasgosata', u'gasgosaso', u'gasgosakt',
            u'gasgos', u'gas',
            u'kodata', u'kodaso', u'kodakt', u'kodzookata', u'kodzookaso', u'kodzookakt', u'kodzook',
            u'koddodata', u'koddodaso', u'koddodakt', u'koddod', u'kodgosata', u'kodgosaso', u'kodgosakt',
            u'kodgos', u'kod',
            u'katata', u'kataso', u'katakt', u'katzookata', u'katzookaso', u'katzookakt', u'katzook',
            u'katdodata', u'katdodaso', u'katdodakt', u'katdod', u'katgosata', u'katgosaso', u'katgosakt',
            u'katgos', u'kat',
            u'dotata', u'dotaso', u'dotakt', u'dotzookata', u'dotzookaso', u'dotzookakt', u'dotzook',
            u'dotdodata', u'dotdodaso', u'dotdodakt', u'dotdod', u'dotgosata', u'dotgosaso', u'dotgosakt',
            u'dotgos', u'dot']
    hmm = HMM({'q0': [u'q1'],
               'q1': ([u'q2', u'q3', u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
               'q2': ([u'q3', u'qf'], ['zook', 'gos', 'dod']),
               'q3': ([u'qf'], ['aso', 'akt', 'ata'])})
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(Grammar(hmm, []))
def model_training(train_data, tags):
    # lowercase all words
    for data in train_data:
        for it in range(data.length):
            data.words[it] = data.words[it].lower()

    S = len(tags)
    pi = np.zeros(S)
    A = np.zeros([S, S])
    Bc = np.zeros([S, 1])
    Ac = np.zeros([S, S])
    obs_dict = {}
    states_symbols = {}
    for i in range(S):
        if tags[i] not in states_symbols:
            states_symbols[tags[i]] = i
    numS = np.zeros(S)
    num1S = np.zeros(S)

    # count initial tags, emissions, and transitions
    for data in train_data:
        firsttag = data.tags[0]
        num1S[states_symbols[firsttag]] += 1
        for i in range(data.length):
            word = data.words[i]
            tag = data.tags[i]
            if word not in obs_dict:
                obs_dict[word] = len(obs_dict)
                Bc = np.append(Bc, np.zeros([S, 1]), axis=1)
            Bc[states_symbols[tag], obs_dict[word]] += 1
            numS[states_symbols[tag]] += 1
            if i != data.length - 1:
                Ac[states_symbols[tag], states_symbols[data.tags[i + 1]]] += 1

    # normalize the counts into probabilities
    B = np.zeros(np.shape(Bc))
    pi = normalize(num1S)
    for s in range(S):
        for sp in range(S):
            if numS[s] == 0:
                A[s, sp] = 0
            else:
                A[s, sp] = Ac[s, sp] / numS[s]
    for s in range(len(Bc)):
        for o in range(len(Bc[0])):
            if numS[s] == 0:
                B[s, o] = 0
            else:
                B[s, o] = Bc[s, o] / numS[s]

    model = HMM(pi, A, B, obs_dict, states_symbols)
    return model
def test_crossover_subgraph(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    hmm_1 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q1', 'q2'], ['da']),
                 'q2': ([FINAL_STATE], ['s'])})
    hmm_2 = HMM({INITIAL_STATE: ['q1'],
                 'q1': (['q2'], ['ko']),
                 'q2': (['q3'], ['bo']),
                 'q3': (['q4'], ['go']),
                 'q4': ([FINAL_STATE], ['z'])})
    offspring_1, offspring_2 = HMM.crossover_subgraphs(hmm_1, hmm_2)
    self.write_to_dot_to_file(hmm_1, 'subgraph_parent_1')
    self.write_to_dot_to_file(hmm_2, 'subgraph_parent_2')
    self.write_to_dot_to_file(offspring_1, 'subgraph_offspring_1')
    self.write_to_dot_to_file(offspring_2, 'subgraph_offspring_2')
    offspring_1.get_transducer()
    offspring_2.get_transducer()
def predict_crimes(cls, crimes, limit=16):
    # Sort crimes in chronological order
    crimes.sort(key=lambda crime: crime.date)

    # Find the average duration between crimes
    deltas = [later.date - now.date for now, later in zip(crimes, crimes[1:])]
    delta = sum(deltas, timedelta()) / len(deltas)
    # Only allow a granularity of 1 hour
    delta = max(delta, timedelta(hours=1))

    # Create a timeline that marks each of the given events, and also
    # includes empty non-events at regular intervals (determined by `delta`)
    # between the first and last timestamp
    time, stop = crimes[0].date, crimes[-1].date
    timeline = {crime.date: crime for crime in crimes}
    while time <= stop:
        timeline.setdefault(time, None)
        time += delta

    # Convert the padded timeline back to an ordered time sequence of events
    events = [timeline[k] for k in sorted(timeline.keys())]

    # Create an HMM to predict the regions in which crimes will take place
    regions = [e.region if e is not None else None for e in events]
    regions = iter(HMM.from_events(regions))

    # Create a separate HMM for the crimes' descriptions
    descs = [e.description for e in events if e is not None]
    descs = iter(HMM.from_events(descs))

    future = []
    while len(future) < limit:
        time += delta
        region = next(regions)
        if region is not None:
            future.append({
                'date': time.strftime(cls.DATE_FMT),
                'region': region.to_dict(),
                'description': next(descs)
            })
    return future
def prepare_seqs_en_dbg(self, decoding="viterbi"):
    params_fixed = (np.load("{}ip.npy".format(self.path)),
                    np.load("{}tp.npy".format(self.path)),
                    np.load("{}fp.npy".format(self.path)),
                    np.load("{}ep.npy".format(self.path)))
    h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False)
    h.dirname = self.path
    self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)
    # train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
    # test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
    # muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test)
    if decoding == "viterbi":
        decoder = h.viterbi_decode_corpus
    elif decoding == "max_emission":
        decoder = h.max_emission_decode_corpus
    elif decoding == "posterior":
        decoder = h.posterior_decode_corpus
    elif decoding == "posterior_cont":
        decoder = h.posterior_cont_decode_corpus
    elif decoding == "posterior_cont_type":
        decoder = h.posterior_cont_type_decode_corpus
    else:
        print("Decoder not defined correctly, using Viterbi.")
        decoder = h.viterbi_decode_corpus
    # print("Decoding word representations on train.")
    # decoder(train_seq)
    print("Decoding word representations on dev.")
    decoder(dev_seq, self.dataset)
    # print("Decoding word representations on test.")
    # decoder(test_seq)
    # print("Decoding word representations on MUC.")
    # decoder(muc_seq)
    # return train_seq, dev_seq, test_seq, muc_seq
    return dev_seq
def init_target_hypothesis(self):
    target_tuple = self.simulation.target_tuple
    target_rule_set = RuleSet.load_form_flat_list(target_tuple[1])
    target_hypothesis = Hypothesis.create_hypothesis(HMM(deepcopy(target_tuple[0])), target_rule_set)
    target_energy = target_hypothesis.get_energy()
    self.logger.info('Target hypothesis:')
    log_hypothesis(target_hypothesis, self.logger.info)
    self.logger.info('Target energy: {}'.format(target_energy))
    self.logger.info('Target hypothesis energy signature: {}'.format(
        target_hypothesis.get_recent_energy_signature()))
    return target_hypothesis, target_energy
def generate_samples():
    model = HMM.from_fixed_params()
    T = 10
    Z, X = model.sample(T)
    print(model)
    print(f"X: {X}, True Z: {Z}")
    smcmc = SMCMC(model, T)
    samples = smcmc.sample(N=100, x_sequence=X)
    print(f"Last L samples: \n{samples[:, -1]}")
    # print(np.unique(samples[:, 0], return_counts=True)[1] / len(samples))
    print(f"Mean sample: {samples.mean((0, 1)).round(1)}")
def __init__(self, string_input_words, max_word_length_in_data, initial_hmm=None, alphabet_or_words="words"):
    if not isinstance(string_input_words, list):
        raise ValueError("should be list")
    self.feature_table = get_feature_table()
    self.max_word_length_in_data = max_word_length_in_data
    if not initial_hmm:
        feature_table = get_feature_table()
        if alphabet_or_words == "alphabet":
            alphabet = feature_table.get_alphabet()
            self.hmm = HMM.create_hmm_alphabet(alphabet)
        elif alphabet_or_words == "words":
            self.hmm = HMM.create_hmm_from_list(string_input_words)
    else:
        self.hmm = initial_hmm
    self.words = []
    self._update_words()
def __init__(self):
    # TODO: revisit the smoothing choice on the test set
    self.minfreq = -3.14e+100
    # Build the trie used to scan the full-segmentation directed graph
    self.trie = Trie()
    self.construct_trie()
    # Build the bigram dictionary
    # self.construct_bigram_dic()
    # Load the bigram dictionary
    with open('files/bigram_dic.json', 'r') as f:
        self.bigram_dic = json.load(f)
    # Special-case handling
    self.SP = SpecialProcess()
    # Create the HMM word-segmentation model
    self.hmm = HMM()
    # Load given names and surnames from common personal names
    self.get_second_names()
    self.get_first_name()
def test_plural_english_grammar(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    rule_set = self.get_rule_set("plural_english_rule_set.json")
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, rule_set)
    grammar_transducer = grammar.get_transducer()
def __load_dictionary(self, dir_name):
    print('Loading dictionary from ' + dir_name + "...")
    for dir in os.walk(dir_name).next()[1]:
        self.dictionary[dir] = []
        for file in os.walk(dir_name + '/' + dir).next()[2]:
            if file.endswith('.wav'):
                rate, data = wavfile.read(dir_name + '/' + dir + '/' + file)
                if self.rate is not None and self.rate != rate:
                    print('Error: Dictionary sampling rate not constant.')
                self.rate = rate
                coefficients = self.__get_features(data)
                self.dictionary[dir].append((coefficients, len(data)))
    print('Done.')
    print('Computing HMMs...')
    for key, value in self.dictionary.items():
        hmm = HMM(key)
        model_size = self.__get_model_size([len(x[0]) for x in value])
        hmm.train(model_size, [x[0] for x in value])
        self.hmms.append(hmm)
        print('Trained {0} with {1} states'.format(key, model_size))
    print('Done.')
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10  # arbitrary for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None     # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None     # count from tagged data
    p_initial_tag = None        # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
def main():
    # load data
    train_data = myutils.read_conll_file("data/da_ddt-ud-train.conllu")
    dev_data = myutils.read_conll_file("data/da_ddt-ud-dev.conllu")

    hmm = HMM()
    # fit model to training data
    hmm.fit(train_data)

    # get most likely tag predictions
    most_likely_predictions = hmm.predict(dev_data, method='most_likely')
    viterbi_predictions = hmm.predict(dev_data, method='viterbi')

    # evaluate
    gold = [x[1] for x in dev_data]
    sent_level, word_level = myutils.evaluate(gold, most_likely_predictions)
    print('most likely scores:')
    print('sent level: {:.4f}'.format(sent_level))
    print('word level: {:.4f} \n'.format(word_level))

    sent_level, word_level = myutils.evaluate(gold, viterbi_predictions)
    print('viterbi scores:')
    print('sent level: {:.4f}'.format(sent_level))
    print('word level: {:.4f} \n'.format(word_level))
def test_crossover(self):
    from copy import deepcopy
    rule_1 = Rule.load([[{'cont': '+'}], [{'coronal': '-'}], [{'coronal': '-'}], [], True])
    rule_2 = Rule.load([[{'cons': '+', 'low': '-'}], [{'voice': '-'}], [{'voice': '-'}], [], True])
    crossover_rule_1 = deepcopy(rule_1)
    crossover_rule_2 = deepcopy(rule_2)
    crossover_rule_1.left_context_feature_bundle_list = rule_2.left_context_feature_bundle_list
    crossover_rule_1.right_context_feature_bundle_list = rule_2.right_context_feature_bundle_list
    crossover_rule_1.change_feature_bundle_list = rule_2.change_feature_bundle_list
    crossover_rule_2.left_context_feature_bundle_list = rule_1.left_context_feature_bundle_list
    crossover_rule_2.right_context_feature_bundle_list = rule_1.right_context_feature_bundle_list
    crossover_rule_2.change_feature_bundle_list = rule_1.change_feature_bundle_list

    rule_set_1 = RuleSet([crossover_rule_1])
    rule_set_2 = RuleSet([crossover_rule_2])
    print(rule_set_1)
    print(rule_set_2)

    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
               'q2': (['qf'], ['zo', 'go', 'do'])})
    grammar_1 = Grammar(hmm, rule_set_1)
    grammar_2 = Grammar(hmm, rule_set_2)
    data = ['kat', 'dot', 'dag', 'kod'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo']
    hypothesis_1 = Hypothesis(grammar_1, data)
    hypothesis_2 = Hypothesis(grammar_2, data)
    print(hypothesis_1.get_energy())
    print(hypothesis_2.get_energy())
def model_training(train_data, tags):
    """
    Train an HMM on the training data.

    Inputs:
    - train_data: (1*num_sentence) a list of sentences, each sentence is an object of the line class
    - tags: (1*num_tags) a list of POS tags

    Returns:
    - model: an object of the HMM class initialized with the parameters
      (pi, A, B, obs_dict, state_dict) estimated from train_data
    """
    model = None
    obs_dict = {}
    state_dict = {}
    curr_index = 0
    for tag in tags:
        state_dict[tag] = curr_index
        curr_index += 1
    curr_index = 0
    for line in train_data:
        for word in line.words:
            if word not in obs_dict:
                obs_dict[word] = curr_index
                curr_index += 1
    S = len(state_dict.keys())
    L = len(obs_dict.keys())
    pi = np.zeros([S])
    A = np.zeros([S, S])
    B = np.zeros([S, L])
    # initial-state probabilities
    for line in train_data:
        pi[state_dict[line.tags[0]]] += 1
    pi /= np.sum(pi)
    # transition probabilities
    for line in train_data:
        for i in range(len(line.tags) - 1):
            A[state_dict[line.tags[i]], state_dict[line.tags[i + 1]]] += 1
    for i in range(S):
        A[i, :] /= np.sum(A[i, :])
    # emission probabilities
    for line in train_data:
        for i in range(len(line.words)):
            B[state_dict[line.tags[i]], obs_dict[line.words[i]]] += 1
    for i in range(S):
        B[i, :] /= np.sum(B[i, :])
    model = HMM(pi, A, B, obs_dict, state_dict)
    return model
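
# Not part of the snippet above: a self-contained illustration of the same
# count-and-normalize supervised estimation of pi, A, and B that the
# model_training variants perform, using a hypothetical toy corpus and plain
# NumPy instead of the line/HMM classes those snippets assume.
import numpy as np

toy_sentences = [(["the", "dog", "barks"], ["D", "N", "V"]),
                 (["the", "cat", "sleeps"], ["D", "N", "V"])]
toy_tags = ["D", "N", "V"]

state_dict = {t: i for i, t in enumerate(toy_tags)}
obs_dict = {w: i for i, w in enumerate(sorted({w for ws, _ in toy_sentences for w in ws}))}
S, V = len(state_dict), len(obs_dict)
pi, A, B = np.zeros(S), np.zeros((S, S)), np.zeros((S, V))

for words, sent_tags in toy_sentences:
    pi[state_dict[sent_tags[0]]] += 1                       # initial-tag counts
    for t_prev, t_next in zip(sent_tags, sent_tags[1:]):
        A[state_dict[t_prev], state_dict[t_next]] += 1      # transition counts
    for w, t in zip(words, sent_tags):
        B[state_dict[t], obs_dict[w]] += 1                  # emission counts

pi /= pi.sum()
A /= np.maximum(A.sum(axis=1, keepdims=True), 1)  # avoid dividing zero-count rows
B /= np.maximum(B.sum(axis=1, keepdims=True), 1)
print(pi, A, B, sep="\n")
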