def setUp(self):
    """Build one DiscreteHMM under test and one reference hmmlearn model.

    The model under test has 4 hidden states and 10 possible observation
    symbols.  It is trained on a long supervised sample (10x the base
    sequence length) drawn from the reference (contrast) model, so later
    tests can compare learned parameters against the known truth.
    """
    hidden_states = 4    # number of hidden states
    symbol_kinds = 10    # number of distinct observation symbols
    self.X_length = 1000 # base length of one sampled sequence
    self.n_batch = 100   # number of batches used by batch-training tests
    self.test_hmm = hmm.DiscreteHMM(hidden_states, symbol_kinds)
    self.comp_hmm = ContrastHMM(hidden_states, symbol_kinds)
    # Draw one long supervised sample from the reference model and fit
    # the model under test on it.
    self.X, self.Z = self.comp_hmm.module.sample(self.X_length * 10)
    self.test_hmm.train(self.X, self.Z)
def test_train_batch(self):
    """Batch-train a fresh DiscreteHMM on sampled sequences and check
    its learned parameters against the reference model's true ones."""
    X, Z = [], []
    # Sample n_batch independent supervised sequences from the
    # reference model.
    for _ in range(self.n_batch):
        seq_x, seq_z = self.comp_hmm.module.sample(self.X_length)
        X.append(seq_x)
        Z.append(seq_z)
    batch_hmm = hmm.DiscreteHMM(self.test_hmm.n_state, self.test_hmm.x_num)
    batch_hmm.train_batch(X, Z)
    # Each learned parameter matrix should be close to the truth.
    # NOTE: original author's comment says the initial-probability
    # check did not pass ("初始概率判定没有通过" = the start-probability
    # assertion fails).
    self.assertAlmostEqual(s_error(batch_hmm.start_prob, self.comp_hmm.module.startprob_), 0, 1)
    self.assertAlmostEqual(s_error(batch_hmm.transmat_prob, self.comp_hmm.module.transmat_), 0, 1)
    self.assertAlmostEqual(s_error(batch_hmm.emission_prob, self.comp_hmm.module.emissionprob_), 0, 1)
# Accumulate supervised counts from the tagged corpus (X: observation
# sequences, Z: hidden-state sequences).  The count arrays start_prob,
# transmat_prob and emission_prob are initialised before this chunk.
for i in range(len(X)):
    start_prob[Z[i][0]] += 1.0                       # first state of each sequence
    for j in range(1, len(Z[i])):
        transmat_prob[Z[i][j - 1]][Z[i][j]] += 1.0   # state -> state transitions
    for j in range(len(Z[i])):
        emission_prob[Z[i][j]][X[i][j]] += 1.0       # state -> observation emissions

# Normalise each distribution: start_prob sums to 1, and every row of
# the (4, 4) transition and (4, len(word_dic)) emission matrices sums
# to 1 (keepdims broadcasting replaces the original repeat/reshape).
start_prob = start_prob / np.sum(start_prob)
transmat_prob = transmat_prob / np.sum(transmat_prob, axis=1, keepdims=True)
emission_prob = emission_prob / np.sum(emission_prob, axis=1, keepdims=True)

# Build the word-segmentation HMM directly from the counted parameters.
wordseg_hmm = hmm.DiscreteHMM(start_prob, transmat_prob, emission_prob, 4, len(word_dic))

print("startprob_prior: ", wordseg_hmm.start_prob)
print("transmit: ", wordseg_hmm.transmat_prob)

# Decode four sample sentences into hidden-state sequences.
sentence_1 = "我要回家吃饭"
sentence_2 = "中国人民从此站起来了"
sentence_3 = "经党中央研究决定"
sentence_4 = "江主席发表重要讲话"
Z_1 = wordseg_hmm.decode(word_trans(sentence_1, word_dic))
Z_2 = wordseg_hmm.decode(word_trans(sentence_2, word_dic))
Z_3 = wordseg_hmm.decode(word_trans(sentence_3, word_dic))
Z_4 = wordseg_hmm.decode(word_trans(sentence_4, word_dic))
print(u"我要回家吃饭: ", Z_1)
# -*- coding:utf-8 -*-
# By tostq <*****@*****.**>
# Blog: blog.csdn.net/tostq
"""Dice example: a 3-hidden-state discrete HMM over 8 observable values.

Rows of the emission matrix below give each state's allowed faces
(state 0: faces 1-6, state 1: faces 1-4, state 2: faces 1-8); after
row-normalisation each state emits its allowed faces uniformly.
Start and transition probabilities are uniform.
"""
from hmmlearn.hmm import MultinomialHMM
import numpy as np
import hmm

dice_num = 3  # number of hidden states (dice)
x_num = 8     # number of observable symbols (faces)

dice_hmm = hmm.DiscreteHMM(3, 8)
dice_hmm.start_prob = np.ones(3) / 3.0
dice_hmm.transmat_prob = np.ones((3, 3)) / 3.0
dice_hmm.emission_prob = np.array([
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
    [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
])
# Normalise each state's emission row to sum to 1 (keepdims broadcasting
# replaces the original repeat/reshape construction — same result).
dice_hmm.emission_prob = dice_hmm.emission_prob / np.sum(
    dice_hmm.emission_prob, axis=1, keepdims=True)
dice_hmm.trained = True  # mark as trained so decode/predict can be used directly

# Fixed observation sequence, one symbol per row.
X = np.array([[1], [6], [3], [5], [2], [7], [3], [5], [2], [4], [3], [6], [1], [5], [4]])

Z = dice_hmm.decode(X)        # Problem A: most likely hidden-state path
logprob = dice_hmm.X_prob(X)  # Problem B: (log-)probability of the observations

# Problem C: for every candidate next symbol i, the prediction after
# appending it.  (Fix: the original built an unused local
# `c = np.array([i])` on every iteration; it has been removed.)
x_next = np.zeros((x_num, dice_num))
for i in range(x_num):
    x_next[i] = dice_hmm.predict(X, i)
# --- Tail of a helper whose `def` line is above this chunk (appears to be
# word_trans(wordline, word_dic) — TODO confirm against the full file):
# maps each character of a sentence to its dictionary index, returning a
# column vector of indices.
# NOTE(review): this chunk is Python 2 — bare `print` statements and
# str.decode on a byte string below.
word_inc = []
line = wordline.strip()
# Python 2: decode the raw byte string to unicode, ignoring bad bytes.
line = line.decode("utf-8", "ignore")
for n in range(len(line)):
    word_inc.append([word_dic[line[n]]])
return np.array(word_inc)

# --- Script body: load the tagged corpus and batch-train a
# word-segmentation HMM, then decode four sample sentences.
X, Z, word_dic = precess_data()
print type(X)
print type(Z)
print X[10]
print Z[10]
print len(word_dic)  # number of observation kinds, i.e. how many distinct characters
# 4 hidden states (presumably B/M/E/S word-position tags — verify),
# len(word_dic) observation symbols; third argument's meaning not
# visible here — see hmm.DiscreteHMM.
wordseg_hmm = hmm.DiscreteHMM(4, len(word_dic), 2)
wordseg_hmm.train_batch(X, Z)
print "startprob_prior: ", wordseg_hmm.start_prob
print "transmit: ", wordseg_hmm.transmat_prob

# Decode four sample sentences into hidden-state (tag) sequences.
sentence_1 = "我要回家吃饭"
sentence_2 = "中国人民从此站起来了"
sentence_3 = "经党中央研究决定"
sentence_4 = "江主席发表重要讲话"
Z_1 = wordseg_hmm.decode(word_trans(sentence_1, word_dic))
Z_2 = wordseg_hmm.decode(word_trans(sentence_2, word_dic))
Z_3 = wordseg_hmm.decode(word_trans(sentence_3, word_dic))
Z_4 = wordseg_hmm.decode(word_trans(sentence_4, word_dic))