def get_observations(self):
    self.print_conditions()
    contents = self.contents[self.index + 1:]
    correct_answers = []
    observations = []
    observation = ()
    for (i, line) in enumerate(contents):
        read_line = line.rstrip()
        letters = read_line.split(" ")
        if read_line == "_ _":
            # Blank marker: close off the current observation sequence.
            observations.append(observation)
            observation = ()
            continue
        if i + 1 == len(contents):
            observation = observation + (letters[1],)
            correct_answers.append(letters[0])
            observations.append(observation)
            break
        observation = observation + (letters[1],)
        correct_answers.append(letters[0])
    corrected_letters = []
    hit = 0
    total = 0
    for observation in observations:
        viterbi = Viterbi(observation, self.states, self.start_probability,
                          self.transition_probability, self.emission_probability)
        corrected_letters = corrected_letters + viterbi.run_viterbi()[1]
    print "Some of the reconstructed state sequence: "
    for (i, letter) in enumerate(corrected_letters):
        if letter == correct_answers[i]:
            hit += 1
        if self.iteration < 100:
            print letter,
        self.iteration += 1
        total += 1
    print "\nPercent correctness:", hit / float(total) * 100

def predict(train_path, threshold, reg_lambda, test_path, conf, beam_width, file_name):
    # Use the function's own parameters, not the global `args`.
    v = MaximumEntropyMarkovModel.load_v_from_pickle(
        dump_weights_path='weights', threshold=threshold, reg_lambda=reg_lambda)
    ft_statistics = FeatureStatistics(input_file_path=train_path,
                                      threshold=threshold, config=conf)
    ft_statistics.pre_process(fill_possible_tag_dict=False)
    is_comp = 'comp' in file_name
    if is_comp:
        test_sentence_hist_list = FeatureStatistics.fill_comp_ordered_history_list(
            file_path=test_path)
    else:
        test_sentence_hist_list = FeatureStatistics.fill_tagged_ordered_history_list(
            file_path=test_path, is_test=True)
    tag_set = ft_statistics.tags_set
    all_possible_tags_dict = ft_statistics.hist_to_feature_vec_dict
    get_ft_from_hist_func = ft_statistics.get_non_zero_feature_vec_indices_from_history
    word_possible_tag_set = ft_statistics.word_possible_tag_set
    word_possible_tag_with_threshold_dict = ft_statistics.word_possible_tag_with_threshold_dict
    rare_words_tags = ft_statistics.rare_words_tags
    viterbi = Viterbi(
        v=v,
        sentence_hist_list=test_sentence_hist_list,
        tags_list=tag_set,
        all_possible_tags_dict=all_possible_tags_dict,
        get_feature_from_hist=get_ft_from_hist_func,
        word_possible_tag_set=word_possible_tag_set,
        word_possible_tag_with_threshold_dict=word_possible_tag_with_threshold_dict,
        rare_words_tags=rare_words_tags,
        threshold=threshold,
        reg_lambda=reg_lambda,
        file_name=file_name,
        beam_width=beam_width
    )
    viterbi.predict_all_test(num_workers=4, is_comp=is_comp)

def test(self):
    v = Viterbi("model.txt")
    predicted_slot_count = 0
    actual_slot_count = 0
    hit_count = 0
    test_set_size = len(self.test_set[0])
    print("processing...")
    for i in range(test_set_size):
        if i != 0 and i % 100 == 0:
            print(str(i) + " done")
        sentence = [self.__idx2words[wordidx] for wordidx in self.test_set[0][i]]
        predicted_seq = v.poccess(sentence)
        predicted_slot = extract_slot(predicted_seq)
        label_seq = [self.__idx2labels[labelidx] for labelidx in self.test_set[2][i]]
        actual_slot = extract_slot(label_seq)
        for item in predicted_slot:
            if item in actual_slot:
                hit_count += 1
        predicted_slot_count += len(predicted_slot)
        actual_slot_count += len(actual_slot)
    print("test set size: " + str(test_set_size))
    print("predicted slots: " + str(predicted_slot_count)
          + " actual slots: " + str(actual_slot_count)
          + " hits: " + str(hit_count))
    print("Precision: " + str(hit_count / predicted_slot_count))
    print("Recall: " + str(hit_count / actual_slot_count))
    # F1 = 2PR / (P + R), which simplifies to the expression below.
    print("F1 score: " + str(2 * hit_count / (actual_slot_count + predicted_slot_count)))

def test_wikipedia(self):
    hmm = {
        'Rainy': [('Rainy', 0.7), ('Sunny', 0.3)],
        'Sunny': [('Rainy', 0.4), ('Sunny', 0.6)]
    }
    start_probabilities = {'Rainy': 0.6, 'Sunny': 0.4}
    emission_probabilities = {
        'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
        'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1}
    }
    vit = Viterbi(hmm, lambda state, obs: emission_probabilities[state][obs])
    (v, p) = vit.step('walk', start_probabilities)
    (v, p) = vit.step('shop', v, p)
    (v, p) = vit.step('clean', v, p)
    max_state = max(v, key=lambda x: v[x])
    assert p[max_state] == ['Sunny', 'Rainy', 'Rainy']

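# A minimal sketch (an assumption, not the project's actual class) of the
# incremental API the test above exercises: the constructor takes a
# transition table mapping each state to (next_state, prob) pairs plus an
# emission callable, and step() folds one observation into the running
# probabilities `v` and back-pointer paths `p`. It reproduces the
# ['Sunny', 'Rainy', 'Rainy'] result for the Wikipedia example.
class SketchViterbi(object):
    def __init__(self, hmm, emission_probability):
        self.hmm = hmm  # state -> [(next_state, trans_prob), ...]
        self.emission = emission_probability  # emission(state, obs) -> prob

    def step(self, obs, v, p=None):
        if p is None:
            # First observation: combine start probabilities with emissions.
            v = {s: v[s] * self.emission(s, obs) for s in self.hmm}
            return v, {s: [s] for s in self.hmm}
        new_v, new_p = {}, {}
        for state in self.hmm:
            # Pick the best predecessor of `state` given the previous column `v`.
            prev, prob = max(((s, v[s] * dict(self.hmm[s])[state]) for s in self.hmm),
                             key=lambda x: x[1])
            new_v[state] = prob * self.emission(state, obs)
            new_p[state] = p[prev] + [state]
        return new_v, new_p
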
def main():
    bijen = Bijenkhan(BIJEN_CORPUS)
    sents_tags = []
    for sents, tags in bijen.sent_tag_gen(100):
        sents_tags.extend(zip(sents, tags))
    random.shuffle(sents_tags)
    test_sents_tags = sents_tags[:NUM_TEST_SAMPES]
    train_sents_tags = sents_tags[NUM_TEST_SAMPES:]
    viterbi = Viterbi(len(bijen.get_tags()), len(bijen.get_vocab()),
                      bijen.get_tags(), bijen.get_bigram_tags(), train_sents_tags)
    for sent, true_labels in test_sents_tags:
        print(GREEN + 'True labels: ', true_labels)
        # Decode all but the first and last tokens.
        pred_labels = viterbi.viterbi(sent[1:-1])
        print(RED + 'Pred labels: ', pred_labels)
        print(CYAN + f'Accuracy: {accuracy_score(true_labels, pred_labels)}')
        print(CYAN + f'Precision: {precision_score(true_labels, pred_labels, average="macro")}')
        print(CYAN + f'Recall: {recall_score(true_labels, pred_labels, average="macro")}')
        print('\n' * 2)

def run_viterbi(self):
    viterbi = Viterbi(self.observations, self.states, self.start_probability,
                      self.transition_probability, self.emission_probability)
    (_, deduced_path) = viterbi.run_viterbi()
    self.checkSolutions(deduced_path)

def get_example(self, idx):
    valid_indices = np.arange(self.dataset_length)[self._label_mask(idx)]
    # Computing distances over all states takes too long, so subsample
    # 1000 candidate states.
    valid_indices_choice = np.random.choice(valid_indices, size=1000)
    valid_states = self.states[valid_indices_choice][:, 0, ...]
    observation = self.states[idx]
    hidden_states = Viterbi(valid_states)(observation, self.blanket_size)
    example = self.base_dataset[idx]
    example["neighbours"] = self.base_dataset[hidden_states[:, 0].astype(int)]
    return example

def example2():
    likelihood = np.loadtxt('likelihood.txt')
    print('probs shape: %s' % str(likelihood.shape))
    transcript = [2, 1, 3, 1, 3]
    viterbi = Viterbi(transcript, likelihood)
    alignment = viterbi.inference()
    # The alignment assigns one transcript state per frame.
    assert len(alignment) == likelihood.shape[0]
    counter = count(alignment, transcript)
    print(alignment)
    print(counter)

def example3():
    likelihood = np.loadtxt('likelihood.txt')
    print('probs shape: %s' % str(likelihood.shape))
    transcript = ['a', 'b', 'c', 'b', 'c']
    # Map symbolic state names to column indices of the likelihood matrix.
    state2idx = {'a': 2, 'b': 1, 'c': 3}
    viterbi = Viterbi(transcript, likelihood, state2idx=state2idx)
    alignment = viterbi.inference()
    assert len(alignment) == likelihood.shape[0]
    counter = count(alignment, transcript)
    print(alignment)
    print(counter)

def test(self, word_seq_path, output_path):
    original_seq, processed_seq = self.__prepare_word_seq(word_seq_path)
    decoder = Viterbi(self.vocab_list, self.tags, self.trans_prob, self.emit_prob)
    tags_pred, prob = decoder.decode(processed_seq)
    with open(output_path, "w") as out:
        for word, tag in zip(original_seq, tags_pred):
            if not word:
                # An empty word marks a sentence boundary.
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))

def train(self, sequences, iterations=3):
    vit = Viterbi()
    for _ in range(iterations):
        self.log_space()
        for name, seq in sequences.items():
            seq['Z'] = vit.decode(self, seq['X'])
            print seq['Z']
        # Return from log space before re-estimating by counting.
        self.delog()
        self.train_by_counting(sequences)
        print Model(self.keys, self.model, self.labels)
    return Model(self.keys, self.model, self.labels)

def __init__(self, hmm, emission_probability, constraint_length=10,
             MAX_DIST=500, priors=None, smallV=0.00000000001):
    self.previous_obs = None
    if priors is None:
        priors = dict([(state, 1.0 / len(hmm)) for state in hmm])
    # Build a spatial index over the states that have a geometry.
    state_spatial_index = Rtree()
    unlocated_states = []
    id_to_state = {}
    state_id = 0
    for state in hmm:
        geom = self.geometry_of_state(state)
        if not geom:
            unlocated_states.append(state)
        else:
            ((lat1, lon1), (lat2, lon2)) = geom
            state_spatial_index.insert(
                state_id,
                (min(lon1, lon2), min(lat1, lat2),
                 max(lon1, lon2), max(lat1, lat2)))
            id_to_state[state_id] = state
            state_id += 1

    def candidate_states(obs):
        # States within MAX_DIST of the observation are candidates;
        # states without a geometry are always candidates.
        geom = self.geometry_of_observation(obs)
        if geom is None:
            return hmm.keys()
        (lat, lon) = geom
        nearby_ids = state_spatial_index.intersection(
            (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
             lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
             lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
             lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))
        return [id_to_state[i] for i in nearby_ids] + unlocated_states

    self.viterbi = Viterbi(hmm, emission_probability,
                           constraint_length=constraint_length,
                           priors=priors,
                           candidate_states=candidate_states,
                           smallV=smallV)

def _writer_viterbi(self):
    sentence = []
    original_sentence = []
    tag_set = []
    lines_to_write = []
    with open(self.input_file, "r") as f:
        data = f.readlines()
    for line in data:
        words = line.split()
        if words and words[1] != '.':
            current_word = words[1]
            local_tag_set = []
            # Collect every tag this word was seen with in the emission table.
            for k, v in emission_probability.iteritems():
                keys = k.split('|')
                if keys[0] == words[1]:
                    local_tag_set.append(keys[1])
            if not local_tag_set:
                words[1] = UNKNOWN_WORD
                local_tag_set = get_unknown_word_tags()
            sentence.append(words[1])
            original_sentence.append(current_word)
            tag_set.extend(local_tag_set)
        elif words and words[1] == '.':
            # End of sentence: run Viterbi over the collected words.
            viterbi = Viterbi(
                tag_set=list(set(tag_set)),
                word_set=sentence,
                transition_probability=transition_probability,
                emission_probability=emission_probability)
            viterbi_states = viterbi.get_viterbi_states()
            for word in range(len(sentence)):
                lines_to_write.append(
                    str(word + 1) + '\t' + original_sentence[word] + '\t'
                    + viterbi_states[word] + '\n')
            lines_to_write.append(str(len(sentence) + 1) + '\t' + '.' + '\t' + '.' + '\n')
            lines_to_write.append('\n')
            sentence = []
            original_sentence = []
            tag_set = []
    with open(self.output_path, 'w') as of:
        of.writelines(lines_to_write)

def cross_validation(self):
    cv_data = self.group_data(self.data)
    # Transition probabilities are estimated on the entire training data,
    # not on each fold.
    trans_probs = self.comp_transition_prob(self.data)
    # 10-fold cross validation.
    k = 10
    for i in range(k):
        train_set = []
        valid_set = cv_data[i]
        print("Validation Fold ", i + 1)
        for j in range(k):
            if j != i:
                train_set += cv_data[j]
        # Naive Bayes classification.
        self.estimate_nb(train_set)
        nb_pred_labels = self.predict(valid_set)
        nb_act_labels = [item[0] for item in valid_set]
        nb_acc = len(np.where(np.array(nb_pred_labels) == np.array(nb_act_labels))[0])
        print("Validation Accuracy of Naive Bayes ", nb_acc / len(nb_act_labels))
        # Emission probabilities are estimated per fold from the NB output.
        emission_probs = self.comp_emission_prob(nb_pred_labels, nb_act_labels)
        valid_words = self.dt.build_test_words(valid_set)
        # Viterbi step: decode each word with the HMM.
        vt_pred_labels = []
        itr = 0
        for w in valid_words:
            nb_pred_word = self.predict(valid_set[itr:itr + len(w)])
            vit = Viterbi(emission_probs, trans_probs, nb_pred_word)
            vt_pred_labels += vit.hmmWord()
            itr += len(w)
        vt_acc = len(np.where(np.array(vt_pred_labels) == np.array(nb_act_labels))[0])
        print("Validation Accuracy of Viterbi ", vt_acc / len(nb_act_labels))

def __init__(self, ref_frames_data_filename, ref_pickle_filename, test_pickle_filename):
    print "init..."
    self.previous_obs = None
    self.image_processor = ImageProcessor()
    self.descriptors_ref = self.image_processor.load_sift(ref_pickle_filename)
    self.descriptors_test = self.image_processor.load_sift(test_pickle_filename)
    hmm = self.ref_frames_data_to_hmm(ref_frames_data_filename)
    priors = dict([(state, 1.0 / len(hmm)) for state in hmm])
    # Be careful with constraint_length: walking may take a long time,
    # and an even higher value may be needed.
    self.viterbi = Viterbi(hmm, self.emission_probability,
                           constraint_length=2500,
                           priors=priors)

def test(self):
    print('Test started...')
    start_test = time.time()
    self.pred_tags = []
    test_orig, test_prep = dataloader(self.corpus + TEST_WORDS, 'test')
    tagger = Viterbi(self.vocab, self.tags, test_prep, self.A, self.B)
    preds = tagger.decode()
    for word, tag in zip(test_orig, preds):
        self.pred_tags.append((word, tag))
    with open(PRED_T_POS, 'w') as out:
        for word, tag in self.pred_tags:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))
    print('Test finished, file written in {:.2f} s'.format(time.time() - start_test))

def viterbi(self, train_path, test_path, output_path):
    self._nerdic = NERDic(train_path)
    io = self._io
    train_sentences = []
    test_sentences = []
    for words, poss, labels in io.read_sentences(train_path):
        train_sentences.append(Sentence(labels, words, poss, self._nerdic))
    for words, poss, labels in io.read_sentences(test_path):
        test_sentences.append(Sentence(labels, words, poss, self._nerdic))
    viterbi = Viterbi(9)  # 9 appears to be the size of the label set
    viterbi.train(train_sentences)
    for sent in test_sentences:
        predict_ids = viterbi.search(sent)
        sent.add_predict(predict_ids)
    io.write_sentences(output_path, test_sentences)

def validate(self):
    print('Validation started...')
    start_val = time.time()
    self.pred_tags = []
    valid_orig, valid_prep = dataloader(self.corpus + VALIDATE_WORDS, 'validate')
    tagger = Viterbi(self.vocab, self.tags, valid_prep, self.A, self.B)
    preds = tagger.decode()
    for word, tag in zip(valid_orig, preds):
        self.pred_tags.append((word, tag))
    with open(PRED_V_POS, 'w') as out:
        for word, tag in self.pred_tags:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))
    print('Validation ended, file written in {:.2f} s'.format(time.time() - start_val))

def predict(self):
    nvi = 12
    for run in range(nvi):
        predicted_labels_training_set = []
        print("starting viterbi run {}...".format(run))
        for j, sent in enumerate(self.training_set):
            predicted_labels_training_set.append(
                Viterbi(sent, self.event_names, self.fweights).run())
            tmp_sent = copy.deepcopy(sent)
            tmp_sent["events"] = predicted_labels_training_set[j]
            for i, w in enumerate(sent["words"]):
                # Extract features from the correctly labelled sentence...
                ff = self.create_features(sent, i, "train")
                # ...and from the labelling produced by Viterbi.
                ff_pr = self.create_features(tmp_sent, i, "train")
                if sent["events"][i] != tmp_sent["events"][i]:
                    # Perceptron-style update: demote the predicted
                    # features, promote the gold ones.
                    for k in ff_pr:
                        if k in self.fweights:
                            self.fweights[k] -= 1
                    for g in ff:
                        if g in self.fweights:
                            self.fweights[g] += 1
        # Now get scores for this Viterbi iteration.
        training_labels = [st["events"] for st in self.training_set]
        Scores(training_labels, predicted_labels_training_set).show()
    with open(self.feature_trained_file_path, "w+") as f:
        json.dump(self.fweights, f)

def test_viterbi_decode():
    '''Test case based on HW3 question.'''
    log = logging.getLogger('test viterbi')
    ZERO = 0.000000000000000001  # stand-in for zero probability, avoids log(0)
    obs_space = ["moo", "hello", "quack", START_END_OBS]
    states = ["Cow", "Duck", START_END_TAG]
    trans_prob = [[0.5, 0.3, 0.2],
                  [0.3, 0.5, 0.2],
                  [1.0, ZERO, ZERO]]
    emit_prob = [[0.9, 0.1, ZERO, ZERO],
                 [ZERO, 0.4, 0.6, ZERO],
                 [ZERO, ZERO, ZERO, 1.0]]
    decoder = Viterbi(obs_space, states, trans_prob, emit_prob)
    obs = ["moo", "hello", "quack", START_END_OBS]
    seq, prob = decoder.decode(obs)
    log.debug("seq: " + str(seq))
    log.debug("log_prob: " + str(prob))
    # The expected log probability is rounded, so compare with a tolerance.
    assert abs(prob - (-5.03903)) < 1e-4
    assert seq == ["Cow", "Duck", "Duck", START_END_TAG]

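# For context, a minimal log-space sketch of the decode() interface the test
# above assumes: trans_prob[i][j] = P(state_j | state_i), emit_prob[i][k] =
# P(obs_k | state_i), with decoding seeded from the last state
# (START_END_TAG). This is an illustrative guess, not the graded solution.
import math

class SketchDecoder(object):
    def __init__(self, obs_space, states, trans_prob, emit_prob):
        self.obs_idx = {o: k for k, o in enumerate(obs_space)}
        self.states = states
        self.log_a = [[math.log(p) for p in row] for row in trans_prob]
        self.log_b = [[math.log(p) for p in row] for row in emit_prob]

    def decode(self, obs):
        n = len(self.states)
        start = n - 1  # the start/end tag is the last state
        k0 = self.obs_idx[obs[0]]
        v = [self.log_a[start][i] + self.log_b[i][k0] for i in range(n)]
        back = []
        for o in obs[1:]:
            k = self.obs_idx[o]
            new_v, ptr = [], []
            for j in range(n):
                # Best predecessor of state j under the previous column.
                best = max(range(n), key=lambda i: v[i] + self.log_a[i][j])
                new_v.append(v[best] + self.log_a[best][j] + self.log_b[j][k])
                ptr.append(best)
            back.append(ptr)
            v = new_v
        tail = max(range(n), key=lambda i: v[i])
        seq = [tail]
        for ptr in reversed(back):
            seq.append(ptr[seq[-1]])
        seq.reverse()
        return [self.states[i] for i in seq], v[tail]
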
def run_viterbi(self):
    contents = self.contents[self.index + 1:]
    observations = ()
    correct_path = []
    for line in contents:
        read_line = line.rstrip()
        if read_line == ".":
            # End of one observation sequence: decode and score it.
            viterbi = Viterbi(observations, self.states,
                              self.start_probability,
                              self.transition_probability,
                              self.emission_probability)
            _, guessed_path = viterbi.run_viterbi()
            self.iteration += 1
            self.check_correctness(guessed_path, correct_path)
            observations = ()
            correct_path = []
            continue
        coordinate, color = read_line.split(" ")
        observations = observations + (color,)
        correct_path.append(coordinate)

import numpy as np

from viterbi import Viterbi, add_one_smoothing

TAGS = ['N', 'C', 'V', 'J']
LEXICON = ['that', 'is', 'not', 'it', 'good', 'bad']
Pi = [1 / 8, 3 / 8, 3 / 8, 1 / 8]
count_A = np.array([[2., 0., 3., 1.],
                    [2., 0., 0., 0.],
                    [4., 0., 1., 0.],
                    [0., 0., 0., 0.]])
count_B = np.array([[4., 0., 2., 2., 0., 0.],
                    [2., 0., 0., 0., 0., 0.],
                    [0., 6., 0., 0., 0., 0.],
                    [0., 0., 0., 0., 1., 0.]])

if __name__ == '__main__':
    A = add_one_smoothing(count_A)
    B = add_one_smoothing(count_B)
    viterbi = Viterbi(Pi, A, B, TAGS, LEXICON)
    sentence1 = 'bad is not good'
    sentence2 = 'is it bad'
    pred1 = viterbi.predict_tags(sentence1)
    pred2 = viterbi.predict_tags(sentence2)
    print(pred1)
    print(pred2)

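# For reference, a plausible add_one_smoothing (Laplace smoothing): add one
# to every count and renormalize each row of the NumPy count matrix into a
# probability distribution. This is a sketch of the helper's assumed
# behaviour, not the module's actual code.
def add_one_smoothing_sketch(counts):
    smoothed = counts + 1.0
    return smoothed / smoothed.sum(axis=1, keepdims=True)
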
import argparse

import train
from opt_results1 import simple_vec  # optionally reuse a precomputed weight vector
from viterbi import Viterbi

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--lamb', type=float, default=0)
    parser.add_argument('-f', '--families', nargs='+', type=int,
                        default=[0, 1, 2, 3, 4, 5, 7, 8, 15, 16, 17, 19, 20])
    args = parser.parse_args()
    print(args.lamb, " ", args.families)
    vec = train.calc_weight_vector("train.wtag", families=args.families, lamb=args.lamb)
    path = train.create_and_get_path(args.families, args.lamb)
    with open(path, "w") as f:
        f.write("simple_vec = %s\n" % vec.x.tolist())
    # vec = simple_vec
    vit = Viterbi(vec.x.tolist(), args.families)
    vit.evaluate("test.wtag", 3, 0, args.lamb)
    print(args.lamb)

from utils import HmmParam
from viterbi import Viterbi

hmm = HmmParam()
# Decode a pinyin sequence; the trailing 5 appears to be the number of
# candidate paths kept.
print(Viterbi(hmm, ['baobao', 'ye', 'tai', 'bang', 'le', 'ba'], 5))

print('Beam search:', beam_diameter)
tagger = BeamSearch(counts, beam_diameter)
test_sentences = conll.read_sentences(test_file)
formatted_test_corpus = [conll.split_rows(sentence, column_names)
                         for sentence in test_sentences]
for sentence in tqdm(formatted_test_corpus):
    tagger.tag(sentence)
cm = ConfusionMatrix(formatted_test_corpus, POS_key)
cm.compute_matrix()
print("Accuracy: ", cm.compute_accuracy())

print('Viterbi')
tagger = Viterbi(counts)
test_sentences = conll.read_sentences(test_file)
formatted_test_corpus = [conll.split_rows(sentence, column_names)
                         for sentence in test_sentences]
for sentence in tqdm(formatted_test_corpus):
    tagger.tag(sentence)
cm = ConfusionMatrix(formatted_test_corpus, POS_key)
cm.compute_matrix()
print("Accuracy: ", cm.compute_accuracy())
cm.print()

if __name__ == '__main__':
    model = hmm.Model(KEYS)
    model.load(HMMFILE)
    sequences = sequences.Sequences(SEQUENCEFILE)
    # Load the decoding methods.
    vit = Viterbi()
    post = Posterior()
    # Viterbi decoding.
    probs = {}
    for key, sequence in sequences.get().items():
        probs[key] = vit.decode(model, sequence)
    outputs.to_project_2_viterbi(sequences.get(), probs,
                                 'pred-test-sequences-project2-viterbi.txt')
    # Posterior decoding.
    probs = {}
    for key, value in sequences.get().items():
        sequence = {'Z': post.decode(model, value), 'X': value}
        log_joint = compute_hmm(model, sequence)

except FileNotFoundError:
    print("Weights were not found")
    exit(0)

predictions = list()
incorrect_count = 0
correct_count = 0
start_time = time()
incorrect_tags = dict()
confusion_matrix = pd.DataFrame(index=tags, columns=tags).fillna(0)
for i in range(test_data_size):
    sentence = [x[0] for x in test_data.data[i]]
    test_tags = [x[1] for x in test_data.data[i]]
    viterbi = Viterbi(tags, gen.transform, sentence, w_0, 5)
    predicted_tags = viterbi.run()
    predictions.append((sentence, predicted_tags))
    for t, p in zip(test_tags, predicted_tags):
        if t == p:
            correct_count += 1
        else:
            incorrect_count += 1
            incorrect_tags[t] = incorrect_tags.get(t, 0) + 1
        confusion_matrix.loc[t, p] += 1
end_time = time() - start_time

import json

from tqdm import tqdm

from utils import HmmParam
from viterbi import Viterbi

with open('./test/test_set.json') as f:
    test_set = json.load(f)

hmm = HmmParam()
count_single = 0
correct_single = 0
count_sentence = 0
correct_sentence = 0
for i in tqdm(range(len(test_set))):
    count_sentence += 1
    test = test_set[i]
    flag = True
    # Take the best-scoring candidate path for the pinyin sequence.
    answer = Viterbi(hmm, test['py'], 5)[0].path
    for idx, an in enumerate(answer):
        count_single += 1
        if an == test['hz'][idx]:
            correct_single += 1
        else:
            flag = False
    if flag:
        correct_sentence += 1
print('single:', correct_single / count_single)
print('sentence:', correct_sentence / count_sentence)

# Set up the Viterbi inputs.
states = ["Buy", "Sell"]
obs = []
obs_prices = []
obs_delta = []
prev_price = hist_prices["Adj Close"][0]
for price in hist_prices["Adj Close"]:
    # Encode each day as an "Up" or "Down" observation.
    if price >= prev_price:
        obs.append("Up")
    else:
        obs.append("Down")
    obs_prices.append(price)
    obs_delta.append(price - prev_price)
    prev_price = price
possible_obs = ["Up", "Down"]

v = Viterbi(initial, states, obs, possible_obs, trans, emiss)
v.run()

# Make a graph of the decoded Buy/Sell sequence against the prices.
backtrack = v.get_backtrack()
backtrack.pop(0)
to_print = pd.DataFrame(hist_prices['Adj Close'])
to_print["Delta"] = obs_delta
to_print["Output"] = backtrack
print(to_print)
fig = hist_prices['Adj Close'].plot(grid=True)
i = start
tmp_backtrack = backtrack

# Checking arguments. The original combined a positional `sequence` argument
# with a broken -s/--sequence option whose `const` referenced an undefined
# variable; a single optional positional covers both cases.
parser = argparse.ArgumentParser(description='Output Viterbi Sequence.')
parser.add_argument('sequence', metavar='N', type=str, nargs='*',
                    help='the observation sequence to decode :: Required for any output.')
args = parser.parse_args()

# If sequence not provided, return help and exit.
if not args.sequence:
    parser.print_help()
    exit()

sequence = args.sequence[0]
print 'Observation(Input Sequence):', sequence
element = Viterbi(sequence)
element.viterbi()