def HardEM(file, outcome):
    words = loadWords(file)
    T = loadT("T")
    iniT = initTransfer(T)  # randomly initialize transition probabilities
    iniE = initEmission(T, words)  # randomly initialize emission probabilities
    sentences = loadData(file)
    Y = []
    for i in sentences:
        Y.append(viterbi(i, iniT, iniE))
    print(sentences[50])
    print(Y[50])
    for i in range(maxLoop):
        print(i)
        new_T = transfer(Y)
        new_E = emission(sentences, Y)
        new_Y = []
        for j in sentences:
            new_Y.append(viterbi(j, new_T, new_E))
        print(sentences[50])
        print(Y[50])
        if diff(Y, new_Y) < threshold:
            Y = new_Y
            break
        Y = new_Y
    with open(outcome, mode='w', encoding="utf-8") as output:
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                output.write(sentences[i][j] + ' ' + Y[i][j] + ' \n')
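# The convergence test above relies on a diff() helper that is not shown in
# this snippet. A minimal sketch, assuming diff(Y, new_Y) returns the fraction
# of tags that changed between two labelings of the same sentences
# (hypothetical helper, not the original implementation):
def diff(old_Y, new_Y):
    changed = 0
    total = 0
    for old_sent, new_sent in zip(old_Y, new_Y):
        for old_tag, new_tag in zip(old_sent, new_sent):
            total += 1
            if old_tag != new_tag:
                changed += 1
    return changed / total if total else 0.0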
def Perceptron(s):
    global alpha_emision
    global alpha_trans
    tags = ["GENE", "NOGENE", "*", "STOP"]
    n = len(tags)
    z = viterbi.viterbi(s, alpha_emision, alpha_trans, tags, n)
    c1 = {}
    c2 = {}
    c3 = {}
    c4 = {}
    # tag sequence predicted by viterbi on train vs. the gold train tags
    Viterbiseq = {}
    for i in range(1, len(z) - 1):
        Viterbiseq[trigram(tags[z[i - 1]], tags[z[i]], tags[z[i + 1]])] = 0
    tagSeq = {}
    t1 = []
    for i in s:
        t = i.split(" ")
        t1.append(t[1])
    for i in range(len(t1) - 2):
        tagSeq[trigram(t1[i], t1[i + 1], t1[i + 2])] = 0
    # print(tagSeq)
    # print(Viterbiseq)
    for i in range(1, len(s)):
        for u in range(n):
            for v in range(n):
                for w in range(n):
                    c1[trigram(tags[w], tags[u], tags[v])] = 0
                    c2[trigram(tags[w], tags[u], tags[v])] = 0
    for i in range(1, len(s)):
        for u in range(n):
            for v in range(n):
                for w in range(n):
                    if trigram(tags[w], tags[u], tags[v]) in tagSeq:
                        c2[trigram(tags[w], tags[u], tags[v])] += 1
                    if trigram(tags[w], tags[u], tags[v]) in Viterbiseq:
                        c1[trigram(tags[w], tags[u], tags[v])] += 1
                    if c1.get(trigram(tags[w], tags[u], tags[v])) != c2.get(trigram(tags[w], tags[u], tags[v])):
                        alpha_trans[tags[w], tags[u], tags[v]] += c1.get(trigram(tags[w], tags[u], tags[v])) - c2.get(trigram(tags[w], tags[u], tags[v]))
    wordTagSeq = {}
    wordTagVit = {}
    for i in range(1, len(z)):
        t = s[i - 1].split(" ")
        wordTagSeq[words(t[1], t[0])] = 0
        wordTagVit[words(tags[z[i]], t[0])] = 0
    # print(wordTagSeq)
    # print("\n")
    # print(wordTagVit)
    for i in s:
        for j in tags:
            c3[words(j, i)] = 0
            c4[words(j, i)] = 0
    for i in s:
        for j in tags:
            if words(j, i) in wordTagSeq:
                c3[words(j, i)] += 1
            if words(j, i) in wordTagVit:
                c4[words(j, i)] += 1
            if c3.get(words(j, i)) != c4.get(words(j, i)):
                alpha_emision[words(j, i)] += c3.get(words(j, i)) - c4.get(words(j, i))
def cwsSent(sent, model, cwsInfo):
    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    vec = pp.sent2vec2(sent, vocab, ctxWindows=5)
    vec = np.array(vec)
    # infer the [B, E, M, S] tagging from the input
    # per-batch probabilities of each class for the input data
    emit_P = model.predict_proba(vec)
    # per-batch class predictions for the input data
    classes = model.predict_classes(vec)
    # print(classes)  # [3 0 2 3 0 1 2 3 0 1 2 3]
    prob, path = viterbi.viterbi(vec, tags, initProb, tranProb, emit_P.transpose())
    assert len(path) == len(sent)
    result = ''
    for i, t in enumerate(path):
        if tags[t] == 'B':
            result += sent[i]
        elif tags[t] == 'M':
            result += sent[i]
        elif tags[t] == 'E':
            result += sent[i] + ' '
        else:
            result += sent[i] + ' '
    return result.strip()
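# A hypothetical call, assuming a trained model and a cwsInfo tuple loaded
# elsewhere (the input sentence and output below are illustrative only):
#
#   segmented = cwsSent(u'今天天气不错', model, cwsInfo)
#   print(segmented)   # e.g. u'今天 天气 不错' -- spaces inserted after E/S tags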
def start_tagging(filename, outname, outpath, sepnum=50, test=False):
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    file_type = "test" if test else "validation"
    print("========== tagging the " + file_type + " set ==========")
    print("Loading model parameters...")
    pi, transition, emission = load_model('model')
    print("Model parameters loaded...")
    print("Reading the " + file_type + " set for HMM+Viterbi tagging...")
    with open(filename, 'r', encoding='utf-8-sig') as fr:
        for line in fr:
            sent_list = []
            sent_temp = ""
            line = line.rstrip()
            for car in line:
                if car != '。':
                    sent_temp = sent_temp + car
                elif car == '。':
                    sent_temp = sent_temp + car
                    if len(sent_temp) > sepnum:
                        f = math.floor(len(sent_temp) / sepnum)
                        m = len(sent_temp) % sepnum
                        i = 0
                        while i < f:
                            sent_list.append(sent_temp[sepnum * i:sepnum * (i + 1)])
                            i += 1
                        if m != 0:
                            sent_list.append(sent_temp[sepnum * (i):(sepnum * (i)) + m])
                        sent_temp = ""
                    else:
                        sent_list.append(sent_temp)
                        sent_temp = ''
            if sent_temp != '':
                sent_list.append(sent_temp)
            seg = []
            # HMM + Viterbi segmentation
            for sent in sent_list:
                pos_list = viterbi(sent, pi, transition, emission)
                # map the predicted tags back onto the original sentence
                seglist = cut(sent, pos_list)
                seg.append(seglist)
            # append the result back to the output file
            with open(outpath + '/' + outname + '.txt', 'a', encoding='utf-8-sig') as outp:
                for line in seg:
                    for word in line:
                        outp.write(word + ' ')
                    outp.write('\n')
    print("========== " + file_type + " set tagging complete ==========")
    print()
def tpl_model(pinyin):
    for p in pinyin:
        if p not in py2ch:
            return 'wrong spelling!!!!!'
    if len(pinyin) < 3:
        return dbl_model(pinyin)
    weights = []
    ch_pairs = get_pairs(pinyin[0], pinyin[1])
    xs = np.array([freq_dbl(c) for c in ch_pairs])
    win = pinyin[:2]
    for i, p in enumerate(pinyin[2:]):
        win.append(p)  # window of 3 ch
        w = np.ndarray((len(py2ch[win[0]]) * len(py2ch[win[1]]),
                        len(py2ch[win[1]]) * len(py2ch[win[2]])))
        for ip, prev in enumerate(get_pairs(win[0], win[1])):
            for ic, cur in enumerate(get_pairs(win[1], win[2])):
                if not cur[0] == prev[1]:
                    w[ip][ic] = 0
                else:
                    w[ip][ic] = freq_cond_tpl(prev + cur[1])
        weights.append(w)
        win = win[1:]
    path = viterbi.viterbi(xs, weights)
    ans = get_pairs(pinyin[0], pinyin[1])[path[0]][0]
    for i, p in enumerate(path):
        ans += get_pairs(pinyin[i], pinyin[i + 1])[p][1]
    return ans
def decode(word, modelfile, verbose=False, letters=False):
    if os.path.getsize(modelfile) > 0:
        with open(modelfile, "rb") as data:
            unpickler = pickle.Unpickler(data)
            d = unpickler.load()
    else:
        print("Error reading model file")
        return None
    # d = pickle.load(data)
    states = d[0]
    start_p = d[1]
    trans_p = d[2]
    emit_p = d[3]
    word = list(word)
    if verbose:
        print("Segment word:", word)
    estimate = viterbi(word, states, start_p, trans_p, emit_p)
    final = ""
    if not letters:
        for i in range(len(estimate)):
            if estimate[i] == 'w':
                final = final + word[i]
            elif estimate[i] == '|':
                final = final + word[i]
                final = final + ' '
            else:
                break
    else:
        final = "".join(estimate)
    if verbose:
        print(estimate)
        print(final)
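# decode() expects the pickle to hold a 4-element list:
# [states, start_p, trans_p, emit_p]. A minimal sketch of writing a
# compatible model file (save_model and the probability values below are
# illustrative placeholders, not from the source):
import pickle

def save_model(modelfile, states, start_p, trans_p, emit_p):
    with open(modelfile, "wb") as out:
        pickle.dump([states, start_p, trans_p, emit_p], out)

# save_model("seg.model", ['w', '|'],
#            {'w': 0.5, '|': 0.5},
#            {'w': {'w': 0.7, '|': 0.3}, '|': {'w': 0.6, '|': 0.4}},
#            {'w': {'a': 0.1}, '|': {'a': 0.1}})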
def __init__(self, s):
    self.c1 = {}
    self.c2 = {}
    self.c3 = {}
    self.c4 = {}
    global trigram
    global words
    global tags
    global n
    self.z = viterbi.viterbi(s, alpha_emision, alpha_trans, tags, n)
    self.wordTagSeq = {}
    self.wordTagVit = {}
    self.Viterbiseq = {}
    # print(self.z)
    for i in range(len(self.z) - 2):
        self.Viterbiseq[trigram(tags[self.z[i]], tags[self.z[i + 1]], tags[self.z[i + 2]])] = 0
    self.tagSeq = {}
    t1 = []
    for i in s:
        t = i.split(" ")
        t1.append(t[1])
    for i in range(len(s) - 2):
        self.tagSeq[trigram(t1[i], t1[i + 1], t1[i + 2])] = 0
    # print(self.Viterbiseq, "--> viterbi seq")
    # print(self.tagSeq, "--> ti seq")
    for i in range(1, len(self.z)):
        t = s[i - 1].split(" ")
        self.wordTagSeq[words(t[1], t[0])] = 0
        self.wordTagVit[words(tags[self.z[i]], t[0])] = 0
def viterbi_decode(self, seq):
    node_potentials, edge_potentials = self.build_potentials(seq)
    viterbi_path, _ = viterbi(node_potentials, edge_potentials)
    res = viterbi_path
    new_seq = seq.copy_sequence()
    new_seq.y = res
    return new_seq
def cwsSent(sent, model, cwsInfo):
    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    vec = cws.sent2vec(sent, vocab, ctxWindows=7)
    vec = np.array(vec)
    probs = model.predict_proba(vec)
    # classes = model.predict_classes(vec)
    prob, path = viterbi.viterbi(vec, cws.corpus_tags, initProb, tranProb, probs.transpose())
    ss = ''
    for i, t in enumerate(path):
        ss += '%s/%s ' % (sent[i], cws.corpus_tags[t])
    ss = ''  # the tag-annotated string built above is discarded; rebuild as segmented words
    word = ''
    for i, t in enumerate(path):
        if cws.corpus_tags[t] == 'S':
            ss += sent[i] + ' '
            word = ''
        elif cws.corpus_tags[t] == 'B':
            word += sent[i]
        elif cws.corpus_tags[t] == 'E':
            word += sent[i]
            ss += word + ' '
            word = ''
        elif cws.corpus_tags[t] == 'M':
            word += sent[i]
    return ss
def main():
    print("hello world")
    fasta_list = h.process_fasta(c.genome_file, c.fna_exten)
    seq = h.get_seq_list(fasta_list)[0]
    ginfo_list = h.process_gff(c.genome_file, c.gff_exten)
    ginfo_list = sorted(ginfo_list, key=lambda ginfo: ginfo.start)
    viterbi_output = open(c.results_folder + "viterbi" + c.text_exten, "wt")
    viterbi_intervals = v.viterbi(seq, viterbi_output)
    viterbi_output.close()
    viterbi_eval_output = open(c.results_folder + "viterbi_eval" + c.text_exten, "wt")
    evaluate(viterbi_intervals, ginfo_list, viterbi_eval_output)
    viterbi_eval_output.close()
    baum_welch_output = open(c.results_folder + "baum_welch" + c.text_exten, "wt")
    baum_welch_intervals = bw.baum_welch(seq, baum_welch_output)
    baum_welch_output.close()
    baum_welch_eval_output = open(c.results_folder + "baum_welch_eval" + c.text_exten, "wt")
    evaluate(baum_welch_intervals, ginfo_list, baum_welch_eval_output)
    baum_welch_eval_output.close()
    print("done")
def evaluate(model, examples, gold, label=None):
    output = list(model.predict({'input': examples}, batch_size=config.batch_size)['output'])
    pred = np.argmax(np.asarray(output), axis=2).flatten()
    vpred = viterbi.viterbi(np.concatenate(output), *viterbi_probabilities)
    return (common.classification_summary(gold, pred) + '\n' +
            'w/viterbi ' + common.classification_summary(gold, vpred))
def seg(inp):
    "segmenter main function"
    tail = ""
    if len(inp) % 2 != 0:
        tail = inp[-1]
        inp = inp[:-1]
    # load the wubi map
    for line in wubi:
        wubi_map[line[0]] = int(line[1])
    # collect all segmentations here
    seg = []
    all_seg(inp, seg, 0, tail)
    # find the viterbi path for every segmentation
    vit = []
    for segline in seg:
        ans = vt.viterbi(segline, uni_map, big_map, wd_map, wubi)
        if ans is not None:
            # ans[1] = ans[1] - 20*len(ans[0])
            vit.append(ans)
    vit.sort(key=lambda path: path[1], reverse=False)
    for v in vit:
        # print(v)
        pass
    # return the max-scoring viterbi path
    if len(vit) > 0:
        return vit[-1][0]
    else:
        return False
def pinyin(input_path, output_path):
    fin = codecs.open(input_path)
    fout = codecs.open(output_path, 'w')
    # load initial probability & transition probability
    print('loading data...')
    path = '../data/init_prob.json'
    with codecs.open(path) as f:
        init_prob = json.load(f)
    word_list = list(init_prob.keys())
    path = '../data/trans_prob.json'
    with codecs.open(path) as f:
        trans_prob = json.load(f)
    # apply viterbi
    for line in fin:
        if line[-1] == '\n':
            line = line[:-1]
        # print(line)
        fout.write(viterbi.viterbi(init_prob, trans_prob, line))
    fin.close()
    fout.close()
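# The two JSON files are loaded as plain dicts and handed straight to
# viterbi.viterbi. A guess at their layout, based only on how they are
# consumed here (illustrative, not taken from the actual data files):
#
#   init_prob:  {"的": 0.012, "是": 0.009, ...}           # char -> prior prob
#   trans_prob: {"的": {"是": 0.003, ...}, ...}           # char -> next char -> prob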
def evaluate():
    global possible_tags
    global strings
    global cca_length
    get_words()
    get_strings()
    get_alpha()
    get_phi()
    get_regExp()
    # get_codeWords()
    get_cca()
    # cca_length = len(cca1['amended'])
    cca_length = 20
    data = open('inputs/eng.test{0}'.format(sys.argv[2]), 'r')
    s = 'outputs_cca_pos_egw30_rounding_currentOnly/result_{0}_{1}.txt'.format(sys.argv[2], sys.argv[1])
    output = open(s, 'w')
    line = data.readline()
    output.write('{0}\n\n'.format(line.strip()))
    line = data.readline()
    vals = get_sentence(data)
    sentence = vals[0]
    correct_tags = vals[1]
    POS = vals[2]
    count = 0
    time1 = 0.0
    time2 = 0.0
    avg_time = 0.0
    time_val = 0.0
    first = True
    while sentence:
        # ------------------------
        # -------TIME-STATS-------
        # ------------------------
        count += 1
        time2 = time()
        if not first:
            avg_time = (avg_time * (count - 1) + (time2 - time1)) / count
            time_val = int((avg_time) * (number_of_sentences - count))
        first = False
        progress = open('progress_test.txt', 'w')
        progress.write('Percent complete:\n{0}/{1} = {2}%\n\nTime remaining: \n{3} h {4} min {5} sec'.format(
            int(count), int(number_of_sentences),
            float(count * 100) / float(number_of_sentences),
            time_val / 3600, (time_val % 3600) / 60, time_val % 60))
        time1 = time2
        progress.close()
        # --------------------------
        # --------------------------
        tags = viterbi.viterbi(sentence, POS, phi, possible_tags, alpha, strings,
                               Words, regExp, codes, cca1, cca_length)
        for i in range(len(sentence)):
            output.write('{0} {1} {2} {3}\n'.format(sentence[i], POS[i][0], correct_tags[i], tags[i]))
        output.write('\n')
        vals = get_sentence(data)
        sentence = vals[0]
        correct_tags = vals[1]
        POS = vals[2]
def learning(T):
    weight = defaultdict(lambda: uniform(-1.0, 1.0))
    data = []
    possible_tags, transition = set(["<s>", "</s>"]), set()
    for line in iter(sys.stdin.readline, ""):
        X, Y = [], []
        pre_y = "<s>"
        for x_y in line.rstrip().split():
            (x, y) = x_y.split('_')
            X.append(x)
            Y.append(y)
            possible_tags.add(y)
            transition.add(" ".join([pre_y, y]))
            pre_y = y
        transition.add(" ".join([pre_y, "</s>"]))
        data.append((X, Y))
    data_size = len(data)
    for t in range(T):
        for line_num, (X, Y_prime) in enumerate(data):
            sys.stdout.write("\rIteration %d, linenum %d / %d" % (t + 1, line_num + 1, data_size))
            sys.stdout.flush()
            Y_hat = viterbi(weight, X, possible_tags, transition)
            phi_prime = create_feature(X, Y_prime)
            phi_hat = create_feature(X, Y_hat)
            update_weight(weight, phi_prime, phi_hat)
    return (weight, possible_tags, transition)
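# learning() reads word_tag pairs from stdin, one sentence per line, e.g.:
#
#   Natural_JJ language_NN processing_NN is_VBZ fun_JJ
#
# (the sentence above is an illustrative input, not from the source). The
# structured perceptron then updates weights wherever the viterbi prediction
# Y_hat disagrees with the gold features of Y_prime.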
def prediction_structured(features, edge_feat):
    features = features[np.newaxis, :, np.newaxis, :]
    edge_feat = edge_feat[np.newaxis, :, np.newaxis, :]
    unary_lgts = session.run(unary_logits, feed_dict={unary_features: features})
    edge_lgts = session.run(edge_logits, feed_dict={edge_features: edge_feat})
    return viterbi(unary_lgts.reshape([-1, 2]), edge_lgts.reshape([-1, 4]), lam=lamb)
def tagging(set_, tags, word_tag):
    global full_tags
    global init_table
    global full_cpd_tags
    global full_cpd_word_tag
    tagset = []
    for i in range(0, len(set_)):
        tagset.append(viterbi(set_[i], set(full_tags), init_table, tags, word_tag))
    return combine_tag_word(set_, tagset)
def decode(self, initials):
    timer = Timer()
    states = set()
    for obs in initials:
        states.update(self.words_by_letter[obs])
    logger.info("Searching %s possible states", len(states))
    result = viterbi.viterbi(initials, states, self.start_p,
                             self.transition_p, self.emission_p)
    logger.info("Decoding %r took %s s", initials, timer.elapsed())
    return result
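# A hypothetical call, assuming the decoder was built over an English word
# list (instance name and output are illustrative, not from the source):
#
#   decoder.decode("tqbf")   # -> e.g. ("the", "quick", "brown", "fox")
#
# Candidate states are restricted to words starting with each observed
# initial, which keeps the viterbi search space small.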
def IBM1_VB(e, f, Lambda, nr_it=10, alpha=0.01):
    aer_values = []
    elbo_values = []
    # load test set
    count_e = count_words(e)
    count_f = count_words(f)
    theta = init_lexicon(e, f, init="uniform")
    [e_val, f_val] = load_train('data', 'test')
    e_val, f_val = replace_singletons(e_val, count_e), replace_singletons(f_val, count_f)
    print('--Performing EM--')
    for it in range(nr_it):
        print('Expectation...')
        count_f_e = defaultdict(lambda: defaultdict(float))
        for sentnr, (e_sent, f_sent) in enumerate(zip(e, f)):
            if sentnr % 10000 == 0:
                print('#sent:', sentnr)
            for f_w in f_sent:
                sum_pi_t = sum([theta[e_word][f_w] for e_word in e_sent])
                for e_w in e_sent:
                    pi_t = theta[e_w][f_w]
                    # update counts
                    count_f_e[e_w][f_w] += pi_t / sum_pi_t
        print('Maximization')
        for e_w, f_words in Lambda.items():
            X = digamma(sum(Lambda[e_w].values()))
            for f_w, p in f_words.items():
                Lambda[e_w][f_w] = alpha + count_f_e[e_w][f_w]
                theta[e_w][f_w] = math.exp(digamma(Lambda[e_w][f_w]) - X)
        # calculate ELBO
        elbo = calculate_elbo(e, f, count_e, count_f, theta, alpha)
        print('ELBO:', elbo)
        elbo_values.append(elbo)
        # create NAACL file for the current run
        output_naacl(viterbi(e_val, f_val, theta),
                     'AER/naacl_IBM1VB_it{}.txt'.format(it + 1))
        aer_values.append(cmdline(
            'perl data/testing/eval/wa_eval_align.pl data/testing/answers/test.wa.nonullalign AER/naacl_IBM1VB_it{}.txt'.format(it + 1)))
        os.system(
            'perl data/testing/eval/wa_eval_align.pl data/testing/answers/test.wa.nonullalign AER/naacl_IBM1VB_it{}.txt'.format(it + 1))
    # pickle.dump(elbo_values, open("ELBO_IBM_VI.p", "wb"))
    pickle.dump(aer_values, open("AER_IBM_VI.p", "wb"))
    return theta, elbo_values
def tune_params(sentences):
    shuffle(sentences)
    tags = get_tags(sentences)
    for delta in [7, 5, 4, 3, 2, 1, 0.8]:
        for sigma in [0.07, 0.05, 0.04, 0.03, 0.02, 0.01, 0.008]:
            print('Delta, Sigma:', delta, sigma)
            accuracy_results = []
            precision_results = []
            recall_results = []
            for i in range(5):
                # print('Cross-validation:', i)
                training, testing = split_data(sentences, i)
                # print('Training size:', len(training))
                # print('Testing size:', len(testing))
                trs_model, ems_model = build_models(training, delta, sigma)
                map_unk = lambda s: [w if w in ems_model.tokens else 'UNK' for w in s]
                # testing = testing[:5]
                preds = [viterbi(trs_model, ems_model, map_unk(s.words))[0] for s in testing]
                labels = [s.tags for s in testing]
                accuracy, precisions, recalls = eval_metric(tags, preds, labels)
                accuracy_results.append(accuracy)
                precision_results.append(precisions)
                recall_results.append(recalls)
            print('accuracy:', round(mean(accuracy_results), 4))
            precisions, recalls = {}, {}
            for t in precision_results[0].keys():
                precisions[t] = mean([p[t] for p in precision_results])
                recalls[t] = mean([r[t] for r in recall_results])
            print('precisions:', round(mean(precisions.values()), 4))
            print('recalls:', round(mean(recalls.values()), 4))
            print('NN:', round(precisions['NN'], 4), round(recalls['NN'], 4))
            print('VB:', round(precisions['VB'], 4), round(recalls['VB'], 4))
            print('JJ:', round(precisions['JJ'], 4), round(recalls['JJ'], 4))
            print('NNP:', round(precisions['NNP'], 4), round(recalls['NNP'], 4))
def predictCategory(hmm, stc):
    # preprocess: tokenize and lowercase the sentence
    words = nltk.word_tokenize(stc)
    words = [word.lower() for word in words]
    # use DP to maximize the observed probability
    pos = viterbi(*paras, stc=words)
    print('HMM tagged:\n', pos)
    # use nltk
    taggedWords = nltk.pos_tag(words, tagset='universal')
    print('NLTK tagged:\n', taggedWords)
def t_BMES():
    PI, A, B = build()
    S = B.keys()
    for k in S:
        if k not in PI:
            PI[k] = 0.0
    for sen in samples:
        Y = tuple(sen)
        prob, X = viterbi(Y, S, PI, A, B)
        print u''.join(sen[i] + (X[i] in 'ES' and '|' or '') for i in xrange(len(sen)))
def main(args):
    train_set = load_dataset(args.training_file, args.case_sensitive)
    test_set = load_dataset(args.test_file, args.case_sensitive)
    if args.baseline:
        print("You are running the baseline algorithm!")
        accuracy = compute_accuracies(test_set, baseline(train_set, strip_tags(test_set)))
    else:
        print("You are running the Viterbi algorithm!")
        accuracy = compute_accuracies(test_set, viterbi(train_set, strip_tags(test_set)))
    print("Accuracy:", accuracy)
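# A hypothetical invocation, assuming an argparse front end that exposes the
# attributes used above (script and flag names are guesses, not from the source):
#
#   python tagger.py --training_file data/train.txt --test_file data/dev.txt --baseline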
def find_CpG_islands_example2(gene):
    # hidden and observable states
    S = np.array(["AI", "CI", "GI", "TI", "AN", "CN", "GN", "TN"])
    SY = np.array(["A", "C", "G", "T"])
    # transition matrix
    m = [[1.85152516e-01, 2.75974026e-01, 4.00289017e-01, 1.37026750e-01,
          3.19045117e-04, 3.19045117e-04, 6.38090233e-04, 2.81510397e-04],
         [1.89303979e-01, 3.58523577e-01, 2.52868527e-01, 1.97836007e-01,
          4.28792308e-04, 5.72766368e-04, 3.75584503e-05, 4.28792308e-04],
         [1.72369088e-01, 3.29501650e-01, 3.55446538e-01, 1.40829292e-01,
          3.39848138e-04, 4.94038497e-04, 7.64658311e-04, 2.54886104e-04],
         [9.38783432e-02, 3.40823149e-01, 3.75970400e-01, 1.86949063e-01,
          2.56686367e-04, 5.57197235e-04, 1.05804868e-03, 5.07112091e-04],
         [0.00000000e+00, 3.78291020e-05, 0.00000000e+00, 0.00000000e+00,
          2.94813496e-01, 1.94641138e-01, 2.86962055e-01, 2.23545482e-01],
         [0.00000000e+00, 7.57154865e-05, 0.00000000e+00, 0.00000000e+00,
          3.26811872e-01, 2.94079570e-01, 6.17258712e-02, 3.17306971e-01],
         [0.00000000e+00, 5.73810399e-05, 0.00000000e+00, 0.00000000e+00,
          2.57133507e-01, 2.33483327e-01, 2.94234944e-01, 2.15090841e-01],
         [0.00000000e+00, 3.11417347e-05, 0.00000000e+00, 0.00000000e+00,
          1.79565378e-01, 2.32469115e-01, 2.94623408e-01, 2.93310958e-01]]
    M = pd.DataFrame(m, columns=S, index=S)
    # emission probability matrix
    d = np.eye(4)
    E = pd.DataFrame(np.concatenate([d, d]), columns=SY, index=S)
    # initial probabilities
    pinizio = pd.DataFrame([[1 / 8] * 8], columns=S)
    path = viterbi(M, E, S, pinizio, gene)
    for i in range(len(gene)):
        print(gene[i] + " ", end=" ")
    print("\n")
    for i in range(len(path)):
        print(path[i][-1] + " ", end=" ")
    print("\n")
def menu():
    print "1 - EBI Web Service Access"
    print "2 - Hidden Markov Model"
    print "3 - Phylogenetic Tree"
    option = input("Select an option: ")
    if option == 1:
        print "1a - Global Alignment Tool"
        print "1b - Local Alignment Tool"
        option2 = raw_input("Select an option: ")
        if option2 == "1a":
            globalAlign()
        elif option2 == "1b":
            localAlign()
    if option == 2:
        viterbi()
    if option == 3:
        print "3a - UPGMA"
        option2 = raw_input("Select an option: ")
        if option2 == "3a":
            upgma()
def main():
    mode = sys.argv[1]
    test = float(sys.argv[2])
    data = pickle.load(open(sys.argv[3], 'rb'))
    if mode == '-c':
        train_method = HMM.count
    elif mode == '-l':
        train_method = HMM.learn
    else:
        print("INVALID MODE")
        return
    print("Data preprocessing...")
    test_i = random.sample(range(len(data)), int(test * len(data)))
    states = list(set(list(itertools.chain(*data))[1::2]))
    random.shuffle(data)
    train = data[int(len(data) * test):]
    test = data[:int(len(data) * test)]
    vocab = list(set(list(itertools.chain(*train))[0::2]))
    state_map = {state: i for i, state in enumerate(states)}
    vocab_map = {word: i for i, word in enumerate(vocab)}
    for i, sentence in enumerate(train):
        words_i = [vocab_map[x] for x in sentence[0::2]]
        states_i = [state_map[x] for x in sentence[1::2]]
        train[i] = [words_i, states_i]
    print("Training HMM...")
    A, B, pi = train_method(train, len(vocab), len(states))
    correct = 0
    total = 0
    for sentence in test:
        words = sentence[0::2]
        states = sentence[1::2]
        for i, word in enumerate(words):
            if word in vocab_map:
                words[i] = vocab_map[word]
            else:
                words[i] = '<UNK>'
        p_states = viterbi(A, B, pi, words)
        total += len(states)
        for i in range(len(states)):
            if states[i] == p_states[i]:
                correct += 1
    print('Accuracy: ' + str(correct / total))
def test_trellis(self):
    _, actual, _ = viterbi(self.obs, self.A, self.B, self.pi)
    expected = LMatrix(("H", "L"),
                       xrange(len(self.obs)),
                       data=np.array([
                           [-2.737, -5.474, -8.211, -11.533, -14.007,
                            -17.329, -19.54, -22.862, -25.657],
                           [-3.322, -6.059, -8.796, -10.948, -14.007,
                            -16.481, -19.54, -22.014, -24.487]
                       ]))
    for s in actual.rlabels:
        for t in actual.clabels:
            self.assertAlmostEqual(actual[s, t], expected[s, t], 3)
def _do_viterbi(self, sentence):
    word_list = []
    viterbi_results = np.array([])
    for word, tag in sentence:
        if word in self.word_to_index:
            word_list.append(self.word_to_index[word])
        else:
            # if we encountered an OOV word, call Viterbi on the previous words
            if len(word_list) != 0:
                viterbi_results = np.concatenate(
                    (viterbi_results, viterbi(word_list, self.A, self.B, self.Pi)))
            viterbi_results = np.append(viterbi_results, None)
            word_list = []
    # call Viterbi on the last chunk of the sentence (it may be the full
    # sentence, if there wasn't an OOV word in this sentence)
    if len(word_list) != 0:
        viterbi_results = np.concatenate(
            (viterbi_results, viterbi(word_list, self.A, self.B, self.Pi)))
    return viterbi_results
def tagger(sentence):
    Pi, A, B = load_parameters("data/HMMTagger.parameters.npz")
    with open("data/Taggerindex.pkl", "rb") as f:
        word_index = pickle.load(f)
        label_index = pickle.load(f)
    obs = map_obs(word_index, sentence)
    prob, route = viterbi(obs, Pi, A, B)
    sequence = [label_index[i] for i in route]
    result = ''
    for word, tag in zip(sentence, sequence):
        result += ''.join(' ' + word + '/' + tag + ' ')
    return result.strip()
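# A hypothetical call (tag names depend on the pickled label_index, so the
# output below is illustrative only):
#
#   tagger(['I', 'like', 'cats'])   # -> 'I/PRP  like/VBP  cats/NNS'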
def evaluate(self, data):
    word_level_acc = 0.0
    sentence_level_acc = 0.0
    count_words = 0
    for sentence in data:
        word_list = []
        begin_index = 0
        correct_sentence = True
        for i in range(len(sentence)):
            count_words += 1
            word = sentence[i][0]
            label = sentence[i][1]
            if word not in self.unique_words:
                # OOV word, breaks the sentence
                if len(word_list) > 0:
                    seq = vt.viterbi(word_list, self.A, self.B, self.Pi)
                    # computes segment accuracy
                    word_acc, sentence_acc = self.comparisons(begin_index, i, sentence, seq)
                    correct_sentence = correct_sentence and sentence_acc
                    word_level_acc += word_acc
                    word_list.clear()
                # assign a random label to the OOV word
                rand_label = np.random.randint(0, len(self.A))
                begin_index = i + 1
                if rand_label == label:
                    word_level_acc += 1
            else:
                word_list.append(self.unique_words[word])
        # if we reach the end of the sentence
        if len(word_list) > 0:
            seq = vt.viterbi(word_list, self.A, self.B, self.Pi)
            word_acc, sentence_acc = self.comparisons(begin_index, len(sentence), sentence, seq)
            correct_sentence = correct_sentence and sentence_acc
            word_level_acc += word_acc
        if correct_sentence:
            sentence_level_acc += 1
    return word_level_acc / count_words, sentence_level_acc / len(data)
def perceptron(print_alpha=0, mult=0, import_alpha=0):
    global alpha
    global alpha_average
    global possible_tags
    global strings
    global strings_abr
    global add_factor
    global mult_factor
    init_phi_alpha(mult)
    get_strings()
    if import_alpha:
        read_alpha()
    alpha_average = copy.deepcopy(alpha)
    for t in range(T_DEFAULT):
        print '---{0}---'.format(t)
        sys.stdout.flush()
        dont_repeat = True
        data = open(sys.argv[2], 'r')
        vals = get_sentence_and_tags(data)
        j = 0
        while vals:
            sentence = vals[0]
            correct_tags = vals[1]
            result = viterbi.viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, mult)
            z = result[0]
            indices = result[1]
            if not z == correct_tags:
                dont_repeat = False
                correct_indices = get_indices(sentence, correct_tags)
                if mult:
                    for i in indices:
                        alpha[i] = float(alpha[i]) / mult_factor
                    for i in correct_indices:
                        alpha[i] = float(alpha[i]) * mult_factor
                else:
                    for i in indices:
                        alpha[i] += -1 * add_factor
                    for i in correct_indices:
                        alpha[i] += add_factor
            else:
                j += 1
            for i in range(len(alpha)):
                alpha_average[i] += alpha[i]
            vals = get_sentence_and_tags(data)
        data.close()
        if dont_repeat:
            print 'SUCCESS!!!'
            break
        # print 'number correct: {0}'.format(j)
        if print_alpha:
            write_alpha(t)
def perceptron(print_alpha=0):
    global possible_tags
    global strings
    global strings_abr
    global add_factor
    get_regExp()
    get_strings()
    get_tags()
    get_phi()
    for t in range(T_DEFAULT):
        print '---{0}---'.format(t)
        sys.stdout.flush()
        dont_repeat = True
        data = open(sys.argv[1], 'r')
        vals = get_sentence_and_tags(data)
        j = 0
        examp_num = 0
        while vals:
            examp_num += 1
            sentence = vals[0]
            correct_tags = vals[1]
            tags = viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, Words, regExp)
            indices = get_indices(sentence, tags, examp_num)
            correct_indices = get_indices(sentence, correct_tags, examp_num)
            if not tags == correct_tags:
                dont_repeat = False
                for i in indices:
                    alpha[i] += -1 * add_factor
                for i in correct_indices:
                    alpha[i] += add_factor
            else:
                j += 1
            for i in set(indices) | set(correct_indices):
                val1 = alpha_average[i][0] + (examp_num - alpha_average[i][1]) * alpha_average[i][2]
                val2 = examp_num
                val3 = alpha[i]
                alpha_average[i] = (val1, val2, val3)
            vals = get_sentence_and_tags(data)
        data.close()
        if dont_repeat:
            print 'SUCCESS!!!'
            break
        print 'number correct: {0}'.format(j)
    for i in alpha:
        val1 = alpha_average[i][0] + (examp_num + 1 - alpha_average[i][1]) * alpha_average[i][2]
        val2 = 1
        val3 = alpha[i]
        alpha_average[i] = (val1, val2, val3)
    if print_alpha:
        write_alpha(t)
def testTag():
    prob_start = load_model("prob_mat/prob_start.pkl")
    prob_trans = load_model("prob_mat/prob_trans.pkl")
    prob_emit = load_model("prob_mat/prob_emit.pkl")
    test_str_list = []
    test_str_list.append(u"长春市长春节讲话。")
    test_str_list.append(u"他说的确实在理.")
    test_str_list.append(u"毛主席万岁。")
    test_str_list.append(u"我有一台电脑。")
    for test_str in test_str_list:
        pos_list = viterbi(test_str, ('B', 'M', 'E', 'S'), prob_start, prob_trans, prob_emit)
        print(test_str, '\n', pos_list)
def run(self):
    if self.isTest:
        print "Running HMM"
        h = HiddenMarkovModel(self.train_file, smoothed=self.smoothing)
        print "Running Viterbi"
        toc = time.clock()
        predicted = viterbi(h, self.test_file, test=False)
        tic = time.clock()
        print "Viterbi ran in %f seconds" % (tic - toc)
        actual, tokens = zip(*self.parse_file(self.test_answers))
        return (predicted, actual, tokens)
    else:
        print "Splitting Data"
        (train, test) = self.splitCV(self.parse_file(self.train_file), self.cv_validation_percentage)
        print "Converting Lists"
        train_text = "".join(["%s %s\n" % (p, t) for [p, t] in train])
        test_text = "".join(["%s\n" % t for [p, t] in test])
        print "Running HMM"
        h = HiddenMarkovModel(text=train_text, smoothed=self.smoothing)
        print "Running Viterbi"
        predicted = viterbi(h, text=test_text, test=False)
        actual = self.getActual(test)
        return (predicted, actual)
def tagging(set_, tags, word_tag):
    global full_tags
    global init_table
    global full_cpd_tags
    global full_cpd_word_tag
    tagset = []
    init_table = {}
    for tag in full_tag_set:
        init_table[tag] = 1e-20  # tiny smoothing mass for non-start tags
    init_table['<s>'] = 1.0
    for i in range(0, len(set_)):
        tagset.append(viterbi(dict_tags, dict_words, set_[i], full_tag_set,
                              init_table, tags, word_tag))
    return combine_tag_word(set_, tagset)
def t_wordseg():
    PI, A, B = build(True)
    for k in B.keys():
        if '|' == k[-1]:
            B[k[:-1]] = {k[:-1]: 1.0}
        else:
            B[k + '|'] = B[k]
    S = B.keys()
    for k in S:
        if k not in PI:
            PI[k] = 0.0
    for sen in samples:
        Y = tuple(sen)
        prob, X = viterbi(Y, S, PI, A, B)
        print u''.join(X)
def __cut(sentence):
    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
    begin, next = 0, 0
    for i, char in enumerate(sentence):
        pos = pos_list[i][0]
        if pos == 'B':
            begin = i
        elif pos == 'E':
            yield pair(sentence[begin:i + 1], pos_list[i][1])
            next = i + 1
        elif pos == 'S':
            yield pair(char, pos_list[i][1])
            next = i + 1
    if next < len(sentence):
        yield pair(sentence[next:], pos_list[next][1])
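# __cut is a generator of word/POS pairs; a sketch of consuming it (the input
# sentence and the attribute names are illustrative, assuming the pair type
# exposes word/flag attributes as in jieba's posseg module):
#
#   for w in __cut(u'我爱北京'):
#       print(w.word, w.flag)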
def viterbi_run(training, test_file):
    # returns a list of sentence lists containing tuples (word, part of speech)
    corpus_list = viterbi.corpus_list(training)
    # creates a dictionary of corpus part-of-speech tag : occurrences
    corpus_dictionary = viterbi.corpus_dictionary(training)
    # pos keys
    keys = viterbi.key_list(corpus_dictionary)
    # creates the prior-probabilities transition table for the entire corpus
    prior_probabilities_table = viterbi.transition_table(corpus_dictionary, corpus_list)
    # creates a word dictionary:
    # word -> list of parts of speech, incrementing occurrences of the word as each part of speech
    word_dic = viterbi.word_dic(corpus_list, keys)
    # word keys
    words = viterbi.key_list(word_dic)
    # likelihood table
    likelihood_table = viterbi.word_freq(corpus_dictionary, word_dic)
    # emissions and transitions
    sentences = viterbi.corpus_list_2(test_file)
    error_list = []
    error_list_i = []
    new_sentences = []
    count = 0
    for sentence in sentences:
        trans = viterbi.sentence_tag(sentence, keys, words, likelihood_table)
        s_pos = viterbi.sentence_pos(trans)
        transition_table = viterbi.transition_probabilities(trans, s_pos, prior_probabilities_table, keys)
        observed_like = viterbi.observed_likelihoods(sentence, s_pos, trans, likelihood_table, words, keys)
        vit_sent = viterbi.viterbi(observed_like, sentence, s_pos, transition_table)
def evaluate():
    global possible_tags
    global strings
    global strings_abr
    get_words()
    get_strings()
    get_alpha()
    get_phi()
    get_regExp()
    data = open(sys.argv[4], 'r')
    output = open(sys.argv[5], 'w')
    sentence = get_sentence(data)
    while sentence:
        tags = viterbi.viterbi(sentence, phi, possible_tags, alpha, strings,
                               strings_abr, Words, regExp)
        for i in range(len(sentence)):
            output.write('{} {}\n'.format(sentence[i], tags[i]))
        output.write('\n')
        sentence = get_sentence(data)
def gen_couplet(transition_prob_tree, output_prob_tree, unigram_freq, first_half):
    assert type(first_half) == unicode
    couplet_length = len(first_half)
    visible_words = np.array([first_half[i] for i in range(couplet_length)])
    hidden_candidate_words = np.array([u' ' for _ in range(top_k_word * couplet_length)]).reshape(top_k_word, couplet_length)
    output_prob = np.random.rand(top_k_word, couplet_length)
    for i in range(couplet_length):
        key = first_half[i]
        if key not in output_prob_tree:
            print '%s, Cannot generate couplet' % key
            return ''
        hash_leaf = output_prob_tree[key]
        hidden_candidate_words[:, i], output_prob[:, i] = gen_candidates(first_half, hash_leaf, top_k_word)
    for i in range(couplet_length):
        candidate = u''
        for j in range(top_k_word):
            candidate += hidden_candidate_words[j, i]
    try:
        transition_prob, init_prob = init_model(transition_prob_tree, unigram_freq,
                                                hidden_candidate_words, top_k_word)
    except:
        return ''
    optimal_path, prob = viterbi(transition_prob, output_prob, init_prob, [],
                                 visible_words, top_k_word, top_k_candidate)
    optimal_path = deal_repeat(first_half, optimal_path)
    results = []
    for i in range(optimal_path.shape[0]):
        second_half = ''
        for j in range(optimal_path.shape[1]):
            second_half += hidden_candidate_words[optimal_path[i, j], j]
        score = ranking_function(output_prob_tree, first_half, second_half)
        results.append((score, second_half))
    results = sorted(results, reverse=True)[:top_k_output]
    return results
def k_fold_cross_valid_known(k, parsed, known, discounts):
    res = defaultdict(list)
    for train, test in _fold(parsed, k):
        for discount in discounts:
            print 'train: ', len(train), 'test: ', len(test)
            tag2id, word2id = build_dict(parsed)
            id2tag = {v: k for k, v in tag2id.iteritems()}
            id2word = {v: k for k, v in word2id.iteritems()}
            emission, transition = _counter_known(parsed, train, known, 0.85, tag2id, word2id, discount)
            count_ok, count_total = 0., 0.
            for i, seq in enumerate(test):
                out = viterbi(seq, transition, emission, word2id, tag2id)
                ok, total = _compare(seq[1:-1], id_to_token(out, id2word, id2tag))
                count_ok += ok
                count_total += total
                if DEBUG:
                    print 'evaluating', i, 'th sentence.', count_ok / count_total, 'so far.'
            res[discount].append(count_ok / count_total)
            print 'Fold accuracy: ', res[discount][-1], 'discount: ', discount
    for d in res:
        print 'discount:', d, '->', 'avg:', np.mean(res[d])
def perceptron(print_alpha=0):
    global possible_tags
    global strings
    global strings_abr
    global add_factor
    get_regExp()
    get_strings()
    get_tags()
    for t in range(T_DEFAULT):
        print '---{0}---'.format(t)
        sys.stdout.flush()
        dont_repeat = True
        data = open(sys.argv[1], 'r')
        vals = get_sentence_and_tags(data)
        j = 0
        while vals:
            sentence = vals[0]
            correct_tags = vals[1]
            tags = viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, Words, regExp)
            indices = get_indices(sentence, tags)
            if not tags == correct_tags:
                dont_repeat = False
                correct_indices = get_indices(sentence, correct_tags)
                for i in indices:
                    alpha[i] += -1 * add_factor
                for i in correct_indices:
                    alpha[i] += add_factor
            else:
                j += 1
            for i in alpha:
                alpha_average[i] += alpha[i]
            vals = get_sentence_and_tags(data)
        data.close()
        if dont_repeat:
            print 'SUCCESS!!!'
            break
        print 'number correct: {0}'.format(j)
    if print_alpha:
        write_alpha(t)
def test_model(corpus):
    cp = corpus.corpus_sentence
    word_list = list()
    pos_list = list()
    for paragraph in cp:
        text = []
        pos = []
        for tp in paragraph:
            text.append(tp[0])
            pos.append(tp[1])
        word_list.append(text)
        pos_list.append(pos)
    initp, trans_bi, emiss = corpus.get_statistics_model(tri_gram=False)
    _, trans_tri, emiss = corpus.get_statistics_model(tri_gram=True)
    bigram_result = []
    trigram_result = []
    count = 0
    for paragraph in word_list:
        pos_bi = vtb.viterbi(paragraph, corpus.pos_list_sentence, initp, trans_bi, emiss)
        # pos_tri = vtb.viterbi_trigram(paragraph, corpus.pos_list_sentence, initp, trans_tri, emiss)
        bigram_result.append(pos_bi)
        # trigram_result.append(pos_tri)
        print(count)
        count += 1
        if count == 1000:
            break
    tp, tn, fp, fn, other = evaluate_sentence(pos_list[0:1000], bigram_result)
    write_results_to_file("test/test_model_orchid_bigram", word_list[0:1000], pos_list,
                          bigram_result, tp, tn, fp, fn, other,
                          test_text="bigram model test")
H = [Health(0, 'Healthy'), Health(1, 'Fever')]
observed = [Symptom('normal'), Symptom('cold'), Symptom('dizzy')]
start_p = [0.6, 0.4]  # index 0 'Healthy', index 1 'Fever'

'''transition_probability = {
    'Healthy': {'Healthy': 0.7, 'Fever': 0.3},
    'Fever': {'Healthy': 0.4, 'Fever': 0.6},
}'''
'''emission_probability = {
    'Healthy': {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
    'Fever': {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6},
}'''
T = numpy.matrix([[0.7, 0.3],
                  [0.4, 0.6]])

def health_p(state, emission):
    e_p = [{'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
           {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}]
    return e_p[state.i][emission.feeling]

xpath = viterbi.viterbi(observed, H, T, start_p, health_p)
for x in xpath:
    print x
def test_backtrace(self):
    _, _, actual = viterbi(self.obs, self.A, self.B, self.pi)
    expected = [{'H': 'H', 'L': 'H'},
                {'H': 'H', 'L': 'H'},
                {'H': 'H', 'L': 'H'},
                {'H': 'L', 'L': 'L'},
                {'H': 'H', 'L': 'L'},
                {'H': 'L', 'L': 'L'},
                {'H': 'H', 'L': 'L'},
                {'H': 'L', 'L': 'L'}]
    self.assertEqual(actual, expected)
def test_state_sequence(self):
    actual, _, _ = viterbi(self.obs, self.A, self.B, self.pi)
    expected = ("H", "H", "H", "L", "L", "L", "L", "L", "L")
    self.assertEqual(actual, expected)
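# The three unit tests above pin down the assumed interface:
# viterbi(obs, A, B, pi) returns (best_path, trellis, backtrace), where the
# trellis holds base-2 log scores and the backtrace maps each state to its
# best predecessor at every step after the first. A minimal dict-based sketch
# of that interface (the tested module uses an LMatrix for the trellis; this
# stand-in uses plain dicts and is illustrative only):
from math import log

def viterbi_sketch(obs, A, B, pi):
    states = list(pi)
    trellis = [{s: log(pi[s] * B[s][obs[0]], 2) for s in states}]
    backtrace = []
    for t in range(1, len(obs)):
        col, back = {}, {}
        for s in states:
            # best predecessor of state s at time t
            prev = max(states, key=lambda r: trellis[-1][r] + log(A[r][s], 2))
            back[s] = prev
            col[s] = trellis[-1][prev] + log(A[prev][s] * B[s][obs[t]], 2)
        trellis.append(col)
        backtrace.append(back)
    # follow the backpointers from the best final state
    last = max(states, key=lambda s: trellis[-1][s])
    path = [last]
    for back in reversed(backtrace):
        path.append(back[path[-1]])
    return tuple(reversed(path)), trellis, backtrace

# e.g. viterbi_sketch("GGCA",
#                     {'H': {'H': .5, 'L': .5}, 'L': {'H': .4, 'L': .6}},
#                     {'H': {'A': .2, 'C': .3, 'G': .3, 'T': .2},
#                      'L': {'A': .3, 'C': .2, 'G': .2, 'T': .3}},
#                     {'H': .5, 'L': .5})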
generate_d_p(d_p, B)

dprint("\nProbabilities:")
if DEBUG:
    for b in B:
        dprint("{0} \nwith e={1} and d={2}".format(b, e_p[b.i], d_p[b.i],
                                                   sum(e_p[b.i].values()),
                                                   sum(d_p[b.i].values())))
# print "\nSafety check:"
# print accent_p(B[7], S[1])
# print duration_p(B[7], S[0])

# Two emission functions
# xpath = viterbi.viterbi(S, B, T, start_p, accent_p, duration_p)
# One emission function
xpath = viterbi.viterbi(S, B, T, start_p, accent_p)

print "\nAnd they said, in great unison, that The Path shalt be:"
sounder = Sounder(5)
sendlist = [(-1, 1, b) for b in range(0, 5)]
for x in xpath:
    print x
    # print "Hidden state, transition values ", T[x.i]
    sendlist[x.origin] = (ra.randint(60, 80), x.duration, x.origin)
print sendlist
sounder.set_notes(sendlist)
sounder.send_notes()
sounder.close()
def main():
    global mainConfig, vit
    loadConfig()
    if (platform.system() == "Windows"):  # in case we're using... winblows
        mainConfig['path_separator'] = "\\"
        saveConfig()
    while True:
        drawScreen()
        ans = raw_input("* Select an option: ")
        if ans == "":
            break
        elif (ans == '1'):
            print "Refreshing all tables from file..."
            build.refreshAll(mainConfig['learnFile'])
            vit = viterbi.viterbi(build.startProbs, build.transProbs, build.obsProbs)
            print "Refreshing complete"
            wait()
        elif (ans == '2'):
            print "Unpickling tables..."
            build.unpickleTables()
            vit = viterbi.viterbi(build.startProbs, build.transProbs, build.obsProbs)
            print "Unpickling complete."
            wait()
        elif (ans == '3'):
            print "Training text file must be in " + mainConfig['path_separator'] + "train directory!"
            f = raw_input("Select a new training file: ")
            newPath = "train" + mainConfig['path_separator'] + f
            if (not os.path.isfile(newPath)):
                print "ERROR: Not a valid file"
            else:
                mainConfig['learnFile'] = f
                saveConfig()
                print "Updated successfully"
            wait()
        elif (ans == '4'):
            build.printStats()
            wait()
        elif (ans == '5'):
            testLoop()
        elif (ans == '6'):
            test.runTest(mainConfig["testFile"], mainConfig["rslts"])
            wait()
        elif (ans == '7'):
            print "Testing text file must be in " + mainConfig['path_separator'] + "test directory!"
            f = raw_input("Select a new testing file: ")
            newPath = "test" + mainConfig['path_separator'] + f
            if (not os.path.isfile(newPath)):
                print "ERROR: Not a valid file"
            else:
                mainConfig['testFile'] = f
                saveConfig()
                print "Updated successfully"
            wait()
        elif (ans == '8'):
            r = raw_input("Select desired number of results (1-14): ")
            if (r.isdigit() and (int(r) in range(1, 15))):
                mainConfig['rslts'] = int(r)
                saveConfig()
                print "Updated successfully"
            else:
                print "ERROR: Not a valid option. Enter 1-14 only."
            wait()
def predict_one(weight, words, possible_tags, transition):
    return " ".join(viterbi(weight, words, possible_tags, transition))
import numpy as np
from viterbi import viterbi

if __name__ == '__main__':
    n_hid = 2
    n_obs = 3
    trans_hid = np.array([[0.5, 0.5],
                          [0.5, 0.5]])
    trans_obs = np.array([[0.5, 0.4, 0.1],
                          [0.4, 0.1, 0.5]])
    solver = viterbi(n_hid, n_obs, trans_hid, trans_obs)
    obs = np.array([0, 1, 1, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2])
    mlp = solver.get_MLP(obs)
    print mlp
def main():
    if dimension == 1:
        # gmm = np.zeros(number_of_components*size)
        # mu = np.zeros(number_of_components)
        # sigma = np.zeros(number_of_components)
        # for i in range(number_of_components):
        #     gmm[i*size:(i+1)*size], mu[i], sigma[i] = create_data(dimension, size, i)
        gmm = np.zeros((1, number_of_components * size), dtype=float)
        mu = np.zeros((number_of_components, 1), dtype=float)
        sigma = np.zeros((number_of_components, 1, 1), dtype=float)
        matrix = np.zeros((number_of_components, number_of_components), dtype=float)
        # for i in range(number_of_components):
        #     x, mu[i,0], sigma[i,0,0] = create_data(dimension, size, i)
    else:
        gmm = np.zeros((dimension, number_of_components * size), dtype=float)
        mu = np.zeros((number_of_components, dimension), dtype=float)
        sigma = np.zeros((number_of_components, dimension, dimension), dtype=float)
        matrix = np.zeros((number_of_components, number_of_components), dtype=float)
        # for i in range(number_of_components):
        #     x, mu[i,:], sigma[i,:,:] = create_data(dimension, size, i)
    weights = np.array([0.6, 0.4])
    matrix = np.array([[0.7, 0.3],
                       [0.1, 0.9]])
    model = hmm.GaussianHMM(2, "full", weights, matrix)
    model.means_ = mu
    model.covars_ = sigma
    gmm, Z = model.sample(number_of_components * size)
    # else:
    #     gmm = np.zeros((dimension, number_of_components*size))
    #     mu = np.zeros((number_of_components, dimension))
    #     sigma = np.zeros((number_of_components, dimension, dimension))
    #     for i in range(number_of_components):
    #         gmm[:, i*size:(i+1)*size], mu[i,:], sigma[i,:,:] = create_data(dimension, size, i)
    means, variances, pi, a = emHMM_algorithm(gmm, dimension, number_of_components,
                                              number_of_components * size)
    # num_bins = 50
    # n, bins, patches = plt.hist(gmm, num_bins, normed=1, facecolor='green', alpha=0.5)
    # # add a 'best fit' line
    # for i in range(number_of_components):
    #     y = mlab.normpdf(bins, means[i], variances[i])
    #     plt.plot(bins, y, 'r--')
    # plt.xlabel('Values')
    # plt.ylabel('Probability')
    # plt.title('Data Histogram vs predicted distribution')
    # # Tweak spacing to prevent clipping of ylabel
    # plt.subplots_adjust(left=0.15)
    # plt.show()
    b = np.zeros((number_of_components, number_of_components * size))
    # evaluate the posterior
    if dimension == 1:
        for i in range(number_of_components):
            # probability of seeing the observation given each state
            pdf = pi[i] * mlab.normpdf(gmm, means[i], variances[i, 0])
            b[i, :] = pdf[:, 0]
    else:
        centered_data = np.zeros((number_of_components, number_of_components * size, dimension))
        den = np.zeros((number_of_components, number_of_components * size))
        num = np.zeros((number_of_components, number_of_components * size))
        for i in range(number_of_components):
            # probability of seeing the observation given each state
            for n in range(number_of_components * size):
                centered_data[i, n, :] = gmm[n, :] - means[i, :]
                den[i, n] = np.sqrt((2 * math.pi) ** (dimension) * np.linalg.det(variances[i, :, :]))
                num[i, n] = np.exp(-0.5 * np.dot(np.dot(centered_data[i, n, :][np.newaxis],
                                                        np.linalg.inv(variances[i, :, :])),
                                                 centered_data[i, n, :][:, np.newaxis]))
                b[i, n] = num[i, n] / den[i, n]
    # predict
    path, x, y = viterbi(size * number_of_components, a, b, pi)
    plt.figure()
    plt.plot(path[0, :], 'ro')
    plt.plot(path[0, :], 'r')
    plt.plot(Z, 'g')
    plt.show()
    if dimension == 1:
        print "initial means: ", mu[:, 0], "\n", "initial variances: ", sigma[:, 0, 0], "\n", "initial weights: ", weights, "\n"
        print "means:", means, "\n" "sigmas:", variances, "\n", "weights:", pi, "\n"
        print "initial mixing matrix:", matrix, "\n"
        print "mixing matrix:", a, "\n"
    else:
        print "initial means: ", mu, "\n", "initial variances: ", sigma, "\n", "initial weights: ", weights, "\n"
        print "means:", means, "\n" "sigmas:", variances, "\n", "weights:", pi, "\n"
        print "initial mixing matrix:", matrix, "\n"
        print "mixing matrix:", a, "\n"
turned against many dictators , but none quite so resourceful . """

use_filename = False
# cProfile.run("viterbi(hmm, filename='../test-big-sample.pos')")
tic = time.clock()
if use_filename:
    pos = viterbi(hmm, filename=filename)
    toc = time.clock()
    with open(output_filename, 'w') as fp:
        fp.write('\n'.join(pos))
else:
    text = string  # "\n".join(string.split(' '))
    pos = viterbi(hmm, text=text, test=False)
    toc = time.clock()
print str(pos)
print str(toc - tic)
def tag_sen_feats(self, sen_feats):
    logTagProbsByPos = self.getLogTagProbsByPos(sen_feats)
    _, bestTagging = viterbi(self.transProbs, logTagProbsByPos, self.lmw)
    return bestTagging
def viterbi_decode(self, seq):
    node_potentials, edge_potentials = self.build_potentials(seq)
    viterbi_path, _ = viterbi(node_potentials, edge_potentials)
    res = viterbi_path
    new_seq = seq.update_from_sequence(res)
    return new_seq