def predict(train_path, threshold, reg_lambda, test_path, conf, beam_width, file_name):
    """Load trained MEMM weights and Viterbi-decode a test file.

    train_path: training corpus used to rebuild the feature statistics.
    threshold / reg_lambda: feature-count threshold and regularization
        coefficient the weights were trained with (used to locate the pickle).
    test_path: file to decode.
    conf: FeatureStatistics configuration object.
    beam_width: beam size for the Viterbi search.
    file_name: output file name; a name containing 'comp' selects the
        untagged competition input format.
    """
    # BUG FIX: the original read `args.threshold` / `args.reg_lambda` from a
    # module-level argparse namespace instead of using the function's own
    # `threshold` / `reg_lambda` parameters.
    v = MaximumEntropyMarkovModel.load_v_from_pickle(
        dump_weights_path='weights', threshold=threshold, reg_lambda=reg_lambda)
    ft_statistics = FeatureStatistics(
        input_file_path=train_path, threshold=threshold, config=conf)
    ft_statistics.pre_process(fill_possible_tag_dict=False)

    is_comp = 'comp' in file_name
    if is_comp:
        # Competition files carry no gold tags.
        test_sentence_hist_list = FeatureStatistics.fill_comp_ordered_history_list(
            file_path=test_path)
    else:
        test_sentence_hist_list = FeatureStatistics.fill_tagged_ordered_history_list(
            file_path=test_path, is_test=True)

    viterbi = Viterbi(
        v=v,
        sentence_hist_list=test_sentence_hist_list,
        tags_list=ft_statistics.tags_set,
        all_possible_tags_dict=ft_statistics.hist_to_feature_vec_dict,
        get_feature_from_hist=ft_statistics.get_non_zero_feature_vec_indices_from_history,
        word_possible_tag_set=ft_statistics.word_possible_tag_set,
        word_possible_tag_with_threshold_dict=ft_statistics.word_possible_tag_with_threshold_dict,
        rare_words_tags=ft_statistics.rare_words_tags,
        threshold=threshold,
        reg_lambda=reg_lambda,
        file_name=file_name,
        beam_width=beam_width
    )
    viterbi.predict_all_test(num_workers=4, is_comp=is_comp)
def main(logger): print "Initializing Data Parser..." data_parser = OrwellDataParser(logger) print "Initializing HMM..." hmm_obj = HMM(logger, data_parser, skew_unseen=True) print "Initializing Viterbi..." viterbi_obj = Viterbi(logger, hmm_obj) print "Initializing Accuracy Estimator..." accuracy_estimator = AccuracyEstimator(logger, data_parser) for language, language_file in DATA_FILES: print "******************************************************************************************" print language print "******************************************************************************************" print "Training HMM with %s data..." % language viterbi_obj.train(language_file, START_LINE - 1) print "Estimating accuracy of the model..." total_accuracy, unseen_accuracy = accuracy_estimator.compute_parameters(viterbi_obj, language_file, START_LINE) print "TOTAL ACCURACY : %.10f" % total_accuracy print "UNSEEN_ACCURACY : %.10f" % unseen_accuracy print "Resetting model and estimator parameters..." viterbi_obj.reset() accuracy_estimator.reset()
def test(self):
    """Decode every test sentence, compare extracted slots to the gold slots,
    and print precision / recall / F1.

    Reads self.test_set (parallel index arrays) and the idx->word / idx->label
    lookup tables built at construction time.
    """
    v = Viterbi("model.txt")
    predicted_slot_count = 0
    actual_slot_count = 0
    hit_count = 0
    test_set_size = len(self.test_set[0])
    print("poccessing...")
    for i in range(test_set_size):
        if i != 0 and i % 100 == 0:
            print(str(i) + " done")
        # Map word indices back to surface words for the decoder.
        sentence = list()
        for wordidx in self.test_set[0][i]:
            sentence.append(self.__idx2words[wordidx])
        predicted_seq = v.poccess(sentence)
        predicted_slot = extract_slot(predicted_seq)
        # Map label indices back to label strings for the gold slots.
        label_seq = list()
        for labelidx in self.test_set[2][i]:
            label_seq.append(self.__idx2labels[labelidx])
        actual_slot = extract_slot(label_seq)
        for item in predicted_slot:
            if item in actual_slot:
                hit_count += 1
        predicted_slot_count += len(predicted_slot)
        actual_slot_count += len(actual_slot)
    print("test set size:" + str(test_set_size))
    print("predicted slot:" + str(predicted_slot_count) + " actual slot:" + str(actual_slot_count) + " hit:" + str(hit_count))
    # FIX: guard against ZeroDivisionError when no slots were predicted
    # or present; report 0.0 in those degenerate cases.
    precision = hit_count / predicted_slot_count if predicted_slot_count else 0.0
    recall = hit_count / actual_slot_count if actual_slot_count else 0.0
    denom = actual_slot_count + predicted_slot_count
    f1 = 2 * hit_count / denom if denom else 0.0
    print("Precision:" + str(precision))
    print("Recall:" + str(recall))
    print("F1score:" + str(f1))
def test_wikipedia(self):
    """Reproduce the classic Wikipedia rainy/sunny Viterbi example step by step."""
    # Transition structure: state -> list of (next_state, probability).
    hmm = {
        'Rainy': [('Rainy', 0.7), ('Sunny', 0.3)],
        'Sunny': [('Rainy', 0.4), ('Sunny', 0.6)]
    }
    start_probabilities = {'Rainy': 0.6, 'Sunny': 0.4}
    emission_probabilities = {
        'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
        'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1}
    }
    vit = Viterbi(hmm, lambda state, obs: emission_probabilities[state][obs])
    # Feed the three observations one at a time, threading (V, p) through.
    (v, p) = vit.step('walk', start_probabilities)
    (v, p) = vit.step('shop', v, p)
    (v, p) = vit.step('clean', v, p)
    # The best final state's back-pointer chain must be the known answer.
    max_state = max(v, key=lambda x: v[x])
    assert (p[max_state] == ['Sunny', 'Rainy', 'Rainy'])
def main():
    """Train/evaluate a Viterbi POS tagger on a shuffled Bijankhan corpus split."""
    bijen = Bijenkhan(BIJEN_CORPUS)
    sents_tags = []
    # Pair sentences with their tag sequences, batched 100 at a time.
    for sents, tags in bijen.sent_tag_gen(100):
        sents_tags.extend(zip(sents, tags))
    random.shuffle(sents_tags)
    test_sents_tags = sents_tags[:NUM_TEST_SAMPES]
    train_sents_tags = sents_tags[NUM_TEST_SAMPES:]
    viterbi = Viterbi(len(bijen.get_tags()), len(bijen.get_vocab()), bijen.get_tags(),
                      bijen.get_bigram_tags(), train_sents_tags)
    for sample in test_sents_tags:
        true_labels = sample[1]
        print(GREEN + 'True labels: ', true_labels)
        # Strip the boundary markers before decoding.
        words = sample[0]
        pred_labels = viterbi.viterbi(words[1:-1])
        print(RED + 'Pred labels: ', pred_labels)
        print(CYAN + f'Accuracy: {accuracy_score(true_labels, pred_labels)}')
        print(CYAN + f'Precision: {precision_score(true_labels, pred_labels, average="macro")}')
        print(CYAN + f'Recall: {recall_score(true_labels, pred_labels, average="macro")}')
        print('\n'*2)
def run_viterbi(self):
    """Decode the stored observation sequence and verify the deduced path."""
    decoder = Viterbi(self.observations, self.states, self.start_probability,
                      self.transition_probability, self.emission_probability)
    # run_viterbi() returns (probability, path); only the path is checked.
    (junk, deduced_path) = decoder.run_viterbi()
    self.checkSolutions(deduced_path)
def example2():
    """Align an integer-state transcript against per-frame likelihoods from disk."""
    likelihood = np.loadtxt('likelihood.txt')
    print('probs shape: %s ' % str(likelihood.shape))
    transcript = [2, 1, 3, 1, 3]
    decoder = Viterbi(transcript, likelihood)
    alignement = decoder.inference()
    # Every frame must receive exactly one alignment label.
    assert len(alignement) == likelihood.shape[0]
    counter = count(alignement, transcript)
    print(alignement)
    print(counter)
def test(self, word_seq_path, output_path):
    """Tag the words in word_seq_path and write word<TAB>tag lines to output_path.

    Empty entries in the original sequence (sentence boundaries) are written
    as blank lines.
    """
    original_seq, processed_seq = self.__prepare_word_seq(word_seq_path)
    decoder = Viterbi(self.vocab_list, self.tags, self.trans_prob, self.emit_prob)
    tags_pred, prob = decoder.decode(processed_seq)
    with open(output_path, "w") as out:
        for word, tag in zip(original_seq, tags_pred):
            if word:
                out.write("{0}\t{1}\n".format(word, tag))
            else:
                out.write("\n")
def example3():
    """Align a string-state transcript using an explicit state->index mapping."""
    likelihood = np.loadtxt('likelihood.txt')
    print('probs shape: %s ' % str(likelihood.shape))
    transcript = ['a', 'b', 'c', 'b', 'c']
    # Map symbolic states onto the likelihood matrix's column indices.
    state2idx = {'a': 2, 'b': 1, 'c': 3}
    decoder = Viterbi(transcript, likelihood, state2idx=state2idx)
    alignement = decoder.inference()
    assert len(alignement) == likelihood.shape[0]
    counter = count(alignement, transcript)
    print(alignement)
    print(counter)
def get_observations(self): self.print_conditions() contents = self.contents[self.index + 1:] correct_answers = [] correct_answer = () observations = [] observation = () for (i, line) in enumerate(contents): read_line = line.rstrip() letters = read_line.split(" ") if read_line == "_ _": observations.append(observation) observation = () continue if i + 1 == len(contents): observation = observation + (letters[1], ) correct_answers.append(letters[0]) observations.append(observation) observation = () break observation = observation + (letters[1], ) correct_answers.append(letters[0]) correct_letters = [] corrected_letters = [] viterbi = Viterbi(observations[0], self.states, self.start_probability, self.transition_probability, self.emission_probability) hit = 0 total = 0 for (i, observation) in enumerate(observations): viterbi = Viterbi(observation, self.states, self.start_probability, self.transition_probability, self.emission_probability) corrected_letters = corrected_letters + viterbi.run_viterbi()[1] print "Some of the reconstructed state sequence: " for (i, letter) in enumerate(corrected_letters): if letter == correct_answers[i]: hit += 1 if self.iteration < 100: print letter, self.iteration += 1 total += 1 print "\nPercent correctness:", hit / float(total) * 100
def train(self, sequences, iterations=3): vit = Viterbi() for x in range(iterations): self.log_space() for name, seq in sequences.items(): seq['Z'] = vit.decode(self, seq['X']) print seq['Z'] #we return from log space self.delog() self.train_by_counting( sequences ) print Model(self.keys, self.model, self.labels) return Model(self.keys, self.model, self.labels)
def train(self, sequences, iterations=3): vit = Viterbi() for x in range(iterations): self.log_space() for name, seq in sequences.items(): seq['Z'] = vit.decode(self, seq['X']) print seq['Z'] #we return from log space self.delog() self.train_by_counting(sequences) print Model(self.keys, self.model, self.labels) return Model(self.keys, self.model, self.labels)
def get_observations(self): self.print_conditions() contents = self.contents[self.index + 1:] correct_answers = [] correct_answer = () observations = [] observation = () for (i, line) in enumerate(contents): read_line = line.rstrip() letters = read_line.split(" ") if read_line == "_ _": observations.append(observation) observation = () continue if i + 1 == len(contents): observation = observation + (letters[1], ) correct_answers.append(letters[0]) observations.append(observation) observation = () break observation = observation + (letters[1], ) correct_answers.append(letters[0]) correct_letters = [] corrected_letters = [] viterbi = Viterbi(observations[0], self.states, self.start_probability, self.transition_probability, self.emission_probability) hit = 0 total = 0 for (i, observation) in enumerate(observations): viterbi = Viterbi(observation, self.states, self.start_probability, self.transition_probability, self.emission_probability) corrected_letters = corrected_letters + viterbi.run_viterbi()[1] print "Some of the reconstructed state sequence: " for (i, letter) in enumerate(corrected_letters): if letter == correct_answers[i]: hit += 1 if self.iteration < 100: print letter, self.iteration += 1 total += 1 print "\nPercent correctness:", hit/float(total) * 100
def __init__(self, hmm, emission_probability, constraint_length=10, MAX_DIST=500, priors=None, smallV=0.00000000001):
    """Index located states in an R-tree and build the Viterbi decoder.

    hmm: mapping from state to its transitions; states with geometry are
        spatially indexed so candidates near an observation can be found fast.
    emission_probability: callable scoring an observation against a state.
    MAX_DIST: search radius (meters) around an observation for candidates.
    priors: per-state prior probabilities; defaults to uniform.
    """
    # initialize spatial index
    self.previous_obs = None
    # FIX: compare to None with `is`, not `==`.
    if priors is None:
        priors = dict([(state, 1.0 / len(hmm)) for state in hmm])
    state_spatial_index = Rtree()
    unlocated_states = []
    id_to_state = {}
    next_id = 0  # FIX: renamed from `id`, which shadowed the builtin
    for state in hmm:
        geom = self.geometry_of_state(state)
        if not geom:
            # States without geometry are always candidates.
            unlocated_states.append(state)
        else:
            ((lat1, lon1), (lat2, lon2)) = geom
            state_spatial_index.insert(
                next_id,
                (min(lon1, lon2), min(lat1, lat2), max(lon1, lon2), max(lat1, lat2)))
            id_to_state[next_id] = state
            next_id = next_id + 1

    def candidate_states(obs):  # was (lat,lon) in place of obs
        """States within MAX_DIST of the observation plus all unlocated ones;
        every state when the observation has no geometry."""
        geom = self.geometry_of_observation(obs)
        if geom is None:
            return hmm.keys()
        (lat, lon) = geom
        nearby_states = state_spatial_index.intersection(
            (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
             lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
             lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
             lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))
        return [id_to_state[sid] for sid in nearby_states] + unlocated_states

    self.viterbi = Viterbi(hmm, emission_probability,
                           constraint_length=constraint_length,
                           priors=priors,
                           candidate_states=candidate_states,
                           smallV=smallV)
def main():
    """Run the toy two-state Viterbi example with 1-based index padding."""
    # Index 0 of every row is unused padding (offset = 1).
    init_prob = [0, 0.5, 0.5]
    num_state = 2
    transition = [[0, 0.5, 0.5],
                  [0, 0.6, 0.4],
                  [0, 0.7, 0.3]]
    emission = [[0, 0, 0],
                [0, 0.7, 0.3],
                [0, 0.2, 0.8]]
    T = 2
    offset = 1
    Viterbi.get_optimal_state_sequence(init_prob, transition, emission, num_state, T, offset)
def _writer_viterbi(self):
    """Tag each '.'-terminated sentence of the input file with Viterbi and
    write numbered word/tag lines to the output path (Python 2 dialect).

    Words with no entry in the global emission table are replaced by
    UNKNOWN_WORD and tagged from get_unknown_word_tags().
    """
    sentence = []
    original_sentence = []
    tag_set = []
    lines_to_write = []
    with open(self.input_file, "r") as f:
        data = f.readlines()
    for line in data:
        words = line.split()
        if words and words[1] != '.':
            current_word = words[1]
            # Collect every tag the emission table allows for this word.
            local_tag_set = []
            for k, v in emission_probability.iteritems():
                keys = k.split('|')
                if keys[0] == words[1]:
                    local_tag_set.append(keys[1])
            if not local_tag_set:
                # Unseen word: substitute the unknown token and its tags.
                words[1] = UNKNOWN_WORD
                local_tag_set = get_unknown_word_tags()
            sentence.append(words[1])
            original_sentence.append(current_word)
            tag_set.extend(local_tag_set)
        elif words and words[1] == '.':
            # End of sentence: send it to Viterbi to compute tags.
            viterbi = Viterbi(
                tag_set=list(set(tag_set)),
                word_set=sentence,
                transition_probability=transition_probability,
                emission_probability=emission_probability)
            viterbi_states = viterbi.get_viterbi_states()
            for idx in range(0, len(sentence)):
                lines_to_write.append(
                    str(idx + 1) + '\t' + original_sentence[idx] + '\t' + viterbi_states[idx] + '\n')
            lines_to_write.append(
                str(len(sentence) + 1) + '\t' + '.' + '\t' + '.' + '\n')
            lines_to_write.append('\n')
            sentence = []
            original_sentence = []
            tag_set = []
    with open(self.output_path, 'w') as of:
        of.writelines(lines_to_write)
def __init__(self, ref_frames_data_filename, ref_pickle_filename, test_pickle_filename): print "init..." self.previous_obs = None self.image_processor = ImageProcessor() self.descriptors_ref = self.image_processor.load_sift(ref_pickle_filename) self.descriptors_test = self.image_processor.load_sift(test_pickle_filename) hmm = self.ref_frames_data_to_hmm(ref_frames_data_filename) #emission_probabilities = map(lambda x: complementary_normal_distribution_cdf(x,0,EMISSION_SIGMA),range(0,int(3.0*EMISSION_SIGMA))) priors=dict([(state,1.0/len(hmm)) for state in hmm]) self.viterbi = Viterbi(hmm,self.emission_probability, constraint_length=2500, # BE CAREFUL with it. walking may take long time and higher value may be needed here priors=priors)
def cross_validation(self):
    """10-fold cross-validation: Naive Bayes per-item prediction followed by a
    Viterbi smoothing pass, printing accuracy for both stages.

    Transition probabilities come from the full training set; emission
    probabilities are re-estimated per fold from the NB predictions.
    """
    cv_data = self.group_data(self.data)
    # The transition probabilities are done on the entire train and not on each fold.
    trans_probs = self.comp_transition_prob(self.data)
    k = 10
    for i in range(0, k):
        valid_set = cv_data[i]
        print("Validation Fold ", i + 1)
        # All folds except the held-out one form the training set.
        train_set = []
        for j in range(0, k):
            if j != i:
                train_set += cv_data[j]
        # Naive Bayes classification stage.
        self.estimate_nb(train_set)
        nb_pred_labels = self.predict(valid_set)
        nb_act_labels = [item[0] for item in valid_set]
        nb_acc = len(
            np.where(np.array(nb_pred_labels) == np.array(nb_act_labels))[0])
        print("Validation Accuracy of Naive Bayes ", nb_acc / len(nb_act_labels))
        # The emission probabilities are done for each cv dataset.
        emission_probs = self.comp_emission_prob(nb_pred_labels, nb_act_labels)
        valid_words = self.dt.build_test_words(valid_set)
        # Viterbi stage: decode word-by-word over the NB predictions.
        vt_pred_labels = []
        vt_act_labels = []
        nb_pred_labels = []
        itr = 0
        for w in valid_words:
            nb_pred_word = self.predict(valid_set[itr:(itr + len(w))])
            nb_pred_labels += nb_pred_word
            vit = Viterbi(emission_probs, trans_probs, nb_pred_word)
            vt_pred_labels += vit.hmmWord()
            itr += len(w)
        nb_acc = len(
            np.where(np.array(nb_pred_labels) == np.array(nb_act_labels))[0])
        vt_acc = len(
            np.where(np.array(vt_pred_labels) == np.array(nb_act_labels))[0])
        print("Validation Accuracy of Viterbi ", vt_acc / len(nb_act_labels))
def main(logger): print "Initializing Data Parser..." data_parser = OrwellDataParser(logger) print "Initializing Linear Sequence Model..." ls_obj = LinearSequence(logger, data_parser, use_avg=True, use_suffix=True) print "Initializing Viterbi..." viterbi_obj = Viterbi(logger, ls_obj) print "Initializing Accuracy Estimator..." accuracy_estimator = AccuracyEstimator(logger, data_parser) for language, language_file in DATA_FILES: print "******************************************************************************************" print language print "******************************************************************************************" print "Training Linear Sequence Linear Sequence Model with %s data..." % language viterbi_obj.train(language_file, START_LINE - 1) #import pdb;pdb.set_trace() print viterbi_obj.predict_sequence(["his", "breast", "rose", "and", "fell", "a", "little", "faster", "."]) print "Estimating accuracy of the model..." total_accuracy, unseen_accuracy = accuracy_estimator.compute_parameters(viterbi_obj, language_file, START_LINE) print "TOTAL ACCURACY : %.10f" % total_accuracy print "UNSEEN_ACCURACY : %.10f" % unseen_accuracy print "Resetting model and estimator parameters..." viterbi_obj.reset() accuracy_estimator.reset()
def viterbi(self, train_path, test_path, output_path):
    """Train a 9-label Viterbi NER tagger on train_path, tag test_path, and
    write the tagged sentences to output_path."""
    self._nerdic = NERDic(train_path)
    io = self._io
    # Wrap every (words, pos, labels) triple in a Sentence.
    train_sentences = []
    for words, poss, labels in io.read_sentences(train_path):
        train_sentences.append(Sentence(labels, words, poss, self._nerdic))
    test_sentences = []
    for words, poss, labels in io.read_sentences(test_path):
        test_sentences.append(Sentence(labels, words, poss, self._nerdic))
    decoder = Viterbi(9)  # 9 = number of NER label ids
    decoder.train(train_sentences)
    for sent in test_sentences:
        sent.add_predict(decoder.search(sent))
    io.write_sentences(output_path, test_sentences)
def test(self):
    """Decode the test corpus and write predicted (word, tag) pairs to the
    PRED_T_POS file, one pair per line (blank line for empty words)."""
    print('Test started...')
    start_test = time.time()
    self.pred_tags = []
    test_orig, test_prep = dataloader(self.corpus + TEST_WORDS, 'test')
    tagger = Viterbi(self.vocab, self.tags, test_prep, self.A, self.B)
    preds = tagger.decode()
    for word, tag in zip(test_orig, preds):
        self.pred_tags.append((word, tag))
    # FIX: removed the redundant out.close() — the `with` block already
    # closes the file on exit.
    with open(PRED_T_POS, 'w') as out:
        for word, tag in self.pred_tags:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))
    print('Test finished, file has been written in ' + str(time.time() - start_test))
def validate(self):
    """Decode the validation corpus and write predicted (word, tag) pairs to
    the PRED_V_POS file, one pair per line (blank line for empty words)."""
    print('Validation started...')
    start_val = time.time()
    self.pred_tags = []
    valid_orig, valid_prep = dataloader(self.corpus + VALIDATE_WORDS, 'validate')
    tagger = Viterbi(self.vocab, self.tags, valid_prep, self.A, self.B)
    preds = tagger.decode()
    for word, tag in zip(valid_orig, preds):
        self.pred_tags.append((word, tag))
    # FIX: removed the redundant out.close() — the `with` block already
    # closes the file on exit.
    with open(PRED_V_POS, 'w') as out:
        for word, tag in self.pred_tags:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))
    print('Validation ended, file has been written in ' + str(time.time() - start_val))
def train_iteration(self, filepath):
    """One structured-perceptron pass over the training corpus: decode each
    sentence and shift weight mass from predicted features to gold features."""
    viterbi = Viterbi("EMPTY")
    viterbi.v = self.v
    with open(filepath) as train_file:
        corpus = gen_sentence_train(train_file)
        count = 0
        for doc in corpus:
            count += 1
            if count % 1000 == 0:
                _logger.debug("%d sentence processed" % count)
            sent = [pair[0] for pair in doc]
            tags = [pair[1] for pair in doc]
            tags_pred = viterbi.decode_one(list(sent))
            assert len(sent) == len(tags) == len(tags_pred)
            feat_gold = feat_vect(sent, tags)
            feat_pred = feat_vect(sent, tags_pred)
            # Perceptron update: demote predicted features, promote gold ones.
            for feat, val in feat_pred.items():
                self.v[feat] -= val
            for feat, val in feat_gold.items():
                self.v[feat] += val
def get_example(self, idx):
    """Return the base example at idx, augmented with neighbours chosen by a
    Viterbi pass over a random subsample of label-compatible states."""
    valid_indices = np.arange(self.dataset_length)[self._label_mask(idx)]
    # calculation of distance taking too long on all the states, so subsample
    sampled = np.random.choice(valid_indices, size=1000)
    candidate_states = self.states[sampled][:, 0, ...]
    observation = self.states[idx]
    hidden_states = Viterbi(candidate_states)(observation, self.blanket_size)
    example = self.base_dataset[idx]
    example["neighbours"] = self.base_dataset[hidden_states[:, 0].astype(int)]
    return example
def main(logger): out_ptr = open(OUTPUT, "w") print "Initializing Data Parser..." data_parser = OrwellDataParser(logger) print "Initializing Linear Sequence Model..." ls_obj = LinearSequence(logger, data_parser) print "Initializing Viterbi..." viterbi_obj = Viterbi(logger, ls_obj) print "Initializing Accuracy Estimator..." accuracy_estimator = AccuracyEstimator(logger, data_parser) for language, language_file in TRAINING_FILES: language_file = "Data/%s" % language_file print "******************************************************************************************" print language print "******************************************************************************************" print "Training Linear Sequence Linear Sequence Model with %s data..." % language viterbi_obj.train(language_file, START_LINE - 1) #import pdb;pdb.set_trace() #print viterbi_obj.predict_sequence(["his", "breast", "rose", "and", "fell", "a", "little", "faster", "."]) print "Estimating accuracy of the model..." total_accuracy, unseen_accuracy = accuracy_estimator.compute_parameters(viterbi_obj, language_file, "inaccurate_words_%s.txt"%language, language, START_LINE) print "TOTAL ACCURACY : %.10f" % total_accuracy print "UNSEEN_ACCURACY : %.10f" % unseen_accuracy o_line = "Language : %s\nTotal Accuracy : %.10f\nUnseen Accuracy : %.10f\n\n\n" % (language, total_accuracy, unseen_accuracy) out_ptr.write(o_line) print "Resetting model and estimator parameters..." viterbi_obj.reset() accuracy_estimator.reset() out_ptr.close()
def test_viterbi_decode():
    '''
    Test case based on HW3 question: a Cow/Duck HMM with a start/end
    pseudo-state. Checks both the decoded tag sequence and its log
    probability.
    '''
    log = logging.getLogger('test viterbi')
    ZERO = 0.000000000000000001  # stand-in for impossible transitions/emissions
    obs_space = ["moo", "hello", "quack", START_END_OBS]
    states = ["Cow", "Duck", START_END_TAG]
    trans_prob = [[0.5, 0.3, 0.2],
                  [0.3, 0.5, 0.2],
                  [1.0, ZERO, ZERO]]
    emit_prob = [[0.9, 0.1, ZERO, ZERO],
                 [ZERO, 0.4, 0.6, ZERO],
                 [ZERO, ZERO, ZERO, 1.0]]
    decoder = Viterbi(obs_space, states, trans_prob, emit_prob)
    obs = ["moo", "hello", "quack", START_END_OBS]
    seq, prob = decoder.decode(obs)
    log.debug("seq: " + str(seq))
    log.debug("log_prob: " + str(prob))
    # FIX: the original asserted `prob - (-5.03903) < ZERO`, which is
    # satisfied by ANY prob below the target (the difference is negative);
    # compare the absolute error against a sensible tolerance instead.
    assert abs(prob - (-5.03903)) < 1e-4 and \
        seq == ["Cow", "Duck", "Duck", START_END_TAG]
def main():
    """CLI entry point: tag the sentences of an input file with Viterbi.

    Usage: prog [-r] INPUT [OUTPUT]. The -r flag forces the probability
    tables to be recomputed even if their pickles already exist.
    """
    p = optparse.OptionParser()
    p.add_option('-r', action='store_true', dest="redo", default=False)
    opts, args = p.parse_args()
    output_file = ''
    if len(args) == 1:
        fileName = args[0]
    elif len(args) == 2:
        fileName, output_file = args
    elif not args:
        sys.stderr.write("Error: please specify a file name\n")
        raise SystemExit(1)
    else:
        sys.stderr.write("Error: too much argument\n")
        raise SystemExit(1)
    # split the sentences
    processor = Preprocessor(fileName)
    sentences = processor.getSentences()
    # create the likelihood table, prior probability table and so on when
    # forced or when any pickle is missing
    tables = ("likelihood.pkl", "prior_prob.pkl", "tags.pkl", "vocabulary.pkl")
    if opts.redo or not all(os.path.isfile(t) for t in tables):
        viterbi_util.compute_table("training.pos")
    # run viterbi algorithm
    viterbi = Viterbi()
    output = [(sentence, viterbi.go(sentence)) for sentence in sentences]
    # write the result into a file
    viterbi_util.write_out(output, output_file)
def label(self): vit_obs = [] hidden_states = [] vit = Viterbi() with open(self.testing_file, 'r') as in_file: text = in_file.read() for word_pos in text.split(): word_pos_split = (word_pos.split('_')) word = word_pos_split[0] pos = word_pos_split[1] self.real.append(pos) # record the true value self.words.append(word) if pos not in hidden_states: hidden_states.append(pos) vit_obs.append(word) # print "observation: ", vit_obs # print "hidden states: ", hidden_states # print "transition: ", self.transition # print "emission: ", self.emission # print "start: ", self.start print "Beginning viterbi algorithm" probability, self.predicted = vit.viterbi(vit_obs, hidden_states, self.start, self.transition, self.emission)
def run_viterbi(self):
    """Decode each '.'-terminated observation block and score the guessed path
    against the coordinates given on the same lines.

    Each data line is "<coordinate> <color>"; a lone '.' ends a block.
    """
    contents = self.contents[self.index + 1:]
    observations = ()
    correct_path = []
    for line in contents:
        stripped = line.rstrip()
        if stripped == ".":
            # Block finished: decode it and check correctness.
            decoder = Viterbi(observations, self.states,
                              self.start_probability,
                              self.transition_probability,
                              self.emission_probability)
            junk, guessed_path = decoder.run_viterbi()
            self.iteration += 1
            self.check_correctness(guessed_path, correct_path)
            observations = ()
            correct_path = []
            continue
        coordinate, color = stripped.split(" ")
        observations = observations + (color,)
        correct_path.append(coordinate)
def __init__(self, hmm, emission_probability, constraint_length=10, MAX_DIST=500, priors=None, smallV=0.00000000001):
    """Build the spatial index over located states and the Viterbi decoder.

    hmm: mapping from state to transitions; states with geometry are indexed
        in an R-tree for fast nearest-candidate lookup.
    emission_probability: callable scoring an observation against a state.
    MAX_DIST: candidate search radius in meters around an observation.
    priors: per-state prior probabilities; defaults to uniform.
    """
    # initialize spatial index
    self.previous_obs = None
    # FIX: compare to None with `is`, not `==`.
    if priors is None:
        priors = dict([(state, 1.0 / len(hmm)) for state in hmm])
    state_spatial_index = Rtree()
    unlocated_states = []
    id_to_state = {}
    next_id = 0  # FIX: renamed from `id`, which shadowed the builtin
    for state in hmm:
        geom = self.geometry_of_state(state)
        if not geom:
            # Geometry-less states are always candidates.
            unlocated_states.append(state)
        else:
            ((lat1, lon1), (lat2, lon2)) = geom
            state_spatial_index.insert(
                next_id,
                (min(lon1, lon2), min(lat1, lat2), max(lon1, lon2), max(lat1, lat2)))
            id_to_state[next_id] = state
            next_id = next_id + 1

    def candidate_states(obs):  # was (lat,lon) in place of obs
        """States within MAX_DIST of the observation plus unlocated states;
        all states when the observation has no geometry."""
        geom = self.geometry_of_observation(obs)
        if geom is None:
            return hmm.keys()
        (lat, lon) = geom
        nearby_states = state_spatial_index.intersection(
            (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
             lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
             lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
             lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))
        return [id_to_state[sid] for sid in nearby_states] + unlocated_states

    self.viterbi = Viterbi(hmm, emission_probability,
                           constraint_length=constraint_length,
                           priors=priors,
                           candidate_states=candidate_states,
                           smallV=smallV)
def __init__(self, logger, data_parser, use_avg=False, use_suffix=False, training_level=5, start_tag="START", stop_tag="STOP"):
    """Initialize an (untrained) linear sequence model.

    use_avg: use averaged weights after training.
    use_suffix: include word-suffix features.
    training_level: number of training passes.
    start_tag / stop_tag: sentence boundary pseudo-tags.
    """
    self.logger = logger
    self.data_parser = data_parser
    self.start_tag = start_tag
    self.stop_tag = stop_tag
    self.training_level = training_level
    # Feature spaces, filled during training.
    self.tag_features = set()
    self.word_features = set()
    self.suffix_features = set()
    self.weights = {}
    self.avg_weights = {}
    self.seen_words = set()
    self.viterbi_obj = Viterbi(logger, self)
    # Keys into the weights dictionaries, one per feature family.
    self.KEY_TAG = "TAG_FEATURE"
    self.KEY_WORD = "WORD_FEATURE"
    self.KEY_SUFFIX = "SUFFIX_FEATURE"
    self.special_tags = [start_tag, stop_tag]
    self.hidden_states = []
    self.use_avg = use_avg
    self.use_suffix = use_suffix
    self.trained = False
def predict(self):
    """Run several perceptron/Viterbi passes over the training set, updating
    feature weights wherever the Viterbi labelling disagrees with the gold
    labelling, and dump the trained weights to JSON."""
    nvi = 12  # number of Viterbi refinement iterations
    for it in range(nvi):
        predicted_labels_training_set = []
        print("starting viterbi run {}...".format(it))
        for j, sent in enumerate(self.training_set):
            predicted_labels_training_set.append(
                Viterbi(sent, self.event_names, self.fweights).run())
            tmp_sent = copy.deepcopy(sent)
            tmp_sent["events"] = predicted_labels_training_set[j]
            # FIX: this inner loop previously reused `i`, clobbering the outer
            # iteration counter (so the "starting viterbi run" message printed
            # the wrong number); renamed to w_idx.
            for w_idx, w in enumerate(sent["words"]):
                # extract features from each word from the correctly labelled sentence..
                ff = self.create_features(sent, w_idx, "train")
                # and the labelling by Viterbi
                ff_pr = self.create_features(tmp_sent, w_idx, "train")
                if sent["events"][w_idx] != tmp_sent["events"][w_idx]:
                    # Perceptron update: demote predicted, promote gold.
                    for k in ff_pr:
                        if k in self.fweights:
                            self.fweights[k] -= 1
                    for g in ff:
                        if g in self.fweights:
                            self.fweights[g] += 1
        # now get scores for this Viterbi iteration
        training_labels = [st["events"] for st in self.training_set]
        Scores(training_labels, predicted_labels_training_set).show()
    with open(self.feature_trained_file_path, "w+") as f:
        json.dump(self.fweights, f)
from opt_results1 import simple_vec
from viterbi import Viterbi

if __name__ == '__main__':
    # Train a weight vector for the requested feature families / lambda, save
    # it as a Python literal, then evaluate Viterbi decoding with it.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lamb', type=float, default=0)
    parser.add_argument('-f', '--families', nargs='+', type=int,
                        default=[0, 1, 2, 3, 4, 5, 7, 8, 15, 16, 17, 19, 20])
    args = parser.parse_args()
    print(args.lamb, " ", args.families)
    vec = train.calc_weight_vector("train.wtag", families=args.families, lamb=args.lamb)
    path = train.create_and_get_path(args.families, args.lamb)
    # FIX: use a context manager instead of manual open/write/close so the
    # file is closed even if the write raises.
    with open(path, "w") as file:
        file.write("simple_vec = %s\n" % vec.x.tolist())
    # vec = simple_vec
    vit = Viterbi(vec.x.tolist(), args.families)
    vit.evaluate("test.wtag", 3, 0, args.lamb)
    print(args.lamb)
import utils
from viterbi import Viterbi
from utils import HmmParam

# Load the HMM parameters and decode a pinyin sequence, printing the 5 best
# character-sequence candidates.
hmm = HmmParam()
print(Viterbi(hmm, ['baobao', 'ye', 'tai', 'bang', 'le', 'ba'], 5))
class GPSMatcher:
    """Matches a stream of GPS observations onto HMM states (typically road
    edges) using a spatially-indexed Viterbi decoder."""

    def __init__(self, hmm, emission_probability, constraint_length=10, MAX_DIST=500, priors=None, smallV=0.00000000001):
        """Index located states in an R-tree and build the Viterbi decoder.

        hmm: mapping from state to transitions.
        emission_probability: callable scoring an observation against a state.
        MAX_DIST: candidate search radius in meters around an observation.
        priors: per-state prior probabilities; defaults to uniform.
        """
        # initialize spatial index
        self.previous_obs = None
        # FIX: compare to None with `is`, not `==`.
        if priors is None:
            priors = dict([(state, 1.0 / len(hmm)) for state in hmm])
        state_spatial_index = Rtree()
        unlocated_states = []
        id_to_state = {}
        next_id = 0  # FIX: renamed from `id`, which shadowed the builtin
        for state in hmm:
            geom = self.geometry_of_state(state)
            if not geom:
                # Geometry-less states are always candidates.
                unlocated_states.append(state)
            else:
                ((lat1, lon1), (lat2, lon2)) = geom
                state_spatial_index.insert(
                    next_id,
                    (min(lon1, lon2), min(lat1, lat2), max(lon1, lon2), max(lat1, lat2)))
                id_to_state[next_id] = state
                next_id = next_id + 1

        def candidate_states(obs):  # was (lat,lon) in place of obs
            """States within MAX_DIST of the observation plus unlocated ones;
            every state when the observation has no geometry."""
            geom = self.geometry_of_observation(obs)
            if geom is None:
                return hmm.keys()
            (lat, lon) = geom
            nearby_states = state_spatial_index.intersection(
                (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                 lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
                 lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                 lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))
            return [id_to_state[sid] for sid in nearby_states] + unlocated_states

        self.viterbi = Viterbi(hmm, emission_probability,
                               constraint_length=constraint_length,
                               priors=priors,
                               candidate_states=candidate_states,
                               smallV=smallV)

    def step(self, obs, V, p):
        """Advance the decoder by one observation, first stepping through any
        interpolated observations between the previous fix and this one."""
        # FIX: `is not None` instead of `!= None`.
        if self.previous_obs is not None:
            for int_obs in self.interpolated_obs(self.previous_obs, obs):
                V, p = self.viterbi.step(int_obs, V, p)
        V, p = self.viterbi.step(obs, V, p)
        self.previous_obs = obs
        return V, p

    def interpolated_obs(self, prev, obs):
        """Hook for subclasses: observations to insert between two fixes."""
        return []

    def geometry_of_observation(self, obs):
        """Observations are assumed to carry their own geometry by default."""
        return obs

    def geometry_of_state(self, state):
        """ Subclasses should override this method to return the geometry of a given state, typically an edge."""
        if state == 'unknown':
            return None
        else:
            return state
def run(self, verbose):
    """Print the HMM's start/transition/output probability tables, then decode
    every test sequence with Viterbi and report per-sequence and overall
    error rates.

    verbose: when True, print all outputs and all sequences instead of
    truncating long listings.
    """
    data = self.data
    hmm = self.hmm
    viterbi = Viterbi(hmm)
    # Start probabilitites
    print("Start probabilities:")
    for state in data.statekeys:
        print(state, ':\t', "{0:.3f}".format(hmm.start_prob(state)))
    # Transition probabilities
    print("\nTransition probabilities:")
    for state in data.states:
        sys.stdout.write('\t' + state)
    sys.stdout.write('\n')
    sys.stdout.flush()
    for from_state in data.states:
        sys.stdout.write(from_state + ' :')
        for to_state in data.states:
            trans_prob = hmm.trans_prob(from_state, to_state)
            sys.stdout.write('\t' + "{0:.3f}".format(trans_prob))
        sys.stdout.write('\n')
        sys.stdout.flush()
    # Output probabilities
    print("\nOutput probabilities:")
    print_outputs = (len(data.outputs) < 30) or verbose
    if not print_outputs:
        print("*" * 32)
        print("Too many outputs to display... calculating the outputs...")
        print("Run with '-v' to see all outputs")
        print("*" * 32)
    if print_outputs:
        for output in sorted(data.outputs):
            sys.stdout.write('\t' + output)
        sys.stdout.write('\n')
        sys.stdout.flush()
    for state in data.states:
        sys.stdout.write(state + ' :')
        for output in sorted(data.outputs):
            out_prob = hmm.output_prob(state, output)
            if print_outputs:
                sys.stdout.write('\t' + "{0:.3f}".format(float(out_prob)))
        if print_outputs:
            sys.stdout.write('\n')
            sys.stdout.flush()
    # Most likely sequence
    overall_error = 0
    for i, sequence in enumerate(data.testing.sequences):
        print_mls = (i < 4) or verbose
        if (i == 4) and not verbose:
            print("")
            print("*" * 32)
            # FIX: this string literal was split by a raw newline in the
            # source (a syntax error); rejoined into a single literal.
            print("There are too many sequences to display... Calculating")
            print("Run with '-v' to see all outputs")
            print("*" * 32)
        outputs = sequence.outputs()
        inputs = sequence.inputs()
        _, mls = viterbi.most_likely_sequence(outputs)
        if print_mls:
            print("\nMost likely sequence #" + str(i) + ":")
            print('input\tcalc\toutput')
        errors = 0
        inputs_len = len(inputs)
        # FIX: this loop previously reused `i`, clobbering the outer sequence
        # index; renamed to j.
        for j in range(inputs_len):
            if print_mls:
                print(inputs[j], '\t', mls[j], '\t', outputs[j])
            if inputs[j] != mls[j]:
                errors += 1
        err_percentage = errors / float(inputs_len)
        if print_mls:
            print('Errors:', errors, '/', len(inputs), '=', err_percentage)
        # Average the per-sequence error rates into the overall error.
        seq_len = float(len(data.testing.sequences))
        overall_error += err_percentage / seq_len
    correct_percent = 1 - overall_error
    print("\nThe overall percent correct is " + "{0:.3f}".format(correct_percent) + "%")
class LinearSequence:
    """Structured-perceptron sequence tagger (Python 2 codebase).

    Learns three families of feature weights -- tag transitions, tag/word
    emissions, and word-suffix features -- by repeatedly decoding the
    training sentences with Viterbi and nudging weights toward the gold
    tags (+1) and away from the predicted tags (-1).  Optionally keeps
    multiplier-weighted sums (`use_avg`) so averaged weights are used for
    scoring once training has finished.
    """

    def __init__(self, logger, data_parser, use_avg=False, use_suffix=False,
                 training_level=5, start_tag="START", stop_tag="STOP"):
        self.logger = logger
        self.data_parser = data_parser          # .next(file) yields (line_no, [(word, tag), ...])
        self.start_tag = start_tag
        self.stop_tag = stop_tag
        self.training_level = training_level    # number of perceptron passes over the data
        self.tag_features = set()               # observed (prev_tag, tag) pairs
        self.word_features = set()              # observed (tag, word) pairs
        self.weights = {}                       # feature-family key -> {feature: weight}
        self.seen_words = set()                 # vocabulary seen during training
        self.viterbi_obj = Viterbi(logger, self)
        # Keys naming the weight families inside self.weights / self.avg_weights.
        self.KEY_TAG = "TAG_FEATURE"
        self.KEY_WORD = "WORD_FEATURE"
        self.KEY_SUFFIX = "SUFFIX_FEATURE"
        self.special_tags = [start_tag, stop_tag]
        self.hidden_states = []
        self.avg_weights = {}                   # multiplier-weighted sums (weight averaging)
        self.use_avg = use_avg
        self.use_suffix = use_suffix
        self.trained = False
        self.suffix_features = set()            # observed (suffix, tag) pairs

    def reset(self):
        # Drop everything learned so the model can be retrained from scratch.
        self.tag_features = set()
        self.word_features = set()
        self.weights = {}
        self.seen_words = set()
        self.trained = False
        self.avg_weights = {}
        self.hidden_states = []
        self.suffix_features = set()

    def is_unseen(self, word):
        # True when `word` never occurred in the training data.
        if word in self.seen_words:
            return False
        return True

    def train(self, training_file, end_line=5500):
        """Scan the training file (up to `end_line`) to build the feature
        space and the tag inventory, then run perceptron weight estimation."""
        self.logger.info("Started training data from %s upto line %d" % (training_file, end_line))
        tags_info = {}                          # tag -> occurrence count
        for line_no, word_list in self.data_parser.next(training_file):
            if line_no > end_line:
                break
            prev_tag = None
            for index, (word, tag) in enumerate(word_list):
                # create feature space
                if prev_tag is not None:
                    self.tag_features.add((prev_tag, tag))
                self.word_features.add((tag, word))
                # suffix features: last 1-3 characters of the word
                if len(word) > 1:
                    self.suffix_features.add((word[-1:], tag))
                if len(word) > 2:
                    self.suffix_features.add((word[-2:], tag))
                if len(word) > 3:
                    self.suffix_features.add((word[-3:], tag))
                prev_tag = tag
                #if tag not in self.hidden_states:
                #    self.hidden_states.append(tag)
                tags_info[tag] = tags_info.setdefault(tag, 0) + 1
                self.seen_words.add(word)
        #self.viterbi_obj.tag_list = [tag for tag, count in sorted(tags_info.iteritems(), key=lambda x:x[1])]
        #self.hidden_states = [tag for tag, count in sorted(tags_info.iteritems(), key=lambda x:x[1])]
        self.viterbi_obj.tag_list = tags_info.keys()
        self.hidden_states = tags_info.keys()
        print self.hidden_states
        self.logger.info("Completed parsing the training data to form feature space")
        self.logger.info("Tag features : %d" % len(self.tag_features))
        self.logger.info("Word features : %d" % len(self.word_features))
        self.logger.info("Suffix features : %d" % len(self.suffix_features))
        self.logger.info("Hidden States : %d" % len(self.hidden_states))
        self.estimate_weights(training_file, end_line)
        self.trained = True

    def get_suffix_feature(self, tag, word):
        """Sum of the length-1..3 suffix weights for (word, tag).
        Reads averaged weights once trained when use_avg is set."""
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        wt = 0
        if len(word) > 1:
            wt += weights.get(self.KEY_SUFFIX, {}).get((word[-1:], tag), 0)
        if len(word) > 2:
            wt += weights.get(self.KEY_SUFFIX, {}).get((word[-2:], tag), 0)
        if len(word) > 3:
            wt += weights.get(self.KEY_SUFFIX, {}).get((word[-3:], tag), 0)
        return wt

    def get_cand_suffix_feature(self, tag, word):
        # Same shape as get_suffix_feature but reads the "CAND_SUF" family,
        # which reestimate_weights keys on the full tag rather than the
        # coarse main tag.
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        wt = 0
        if len(word) > 1:
            wt += weights.get("CAND_SUF", {}).get((word[-1:], tag), 0)
        if len(word) > 2:
            wt += weights.get("CAND_SUF", {}).get((word[-2:], tag), 0)
        if len(word) > 3:
            wt += weights.get("CAND_SUF", {}).get((word[-3:], tag), 0)
        return wt

    def get_transition_feature(self, prev_tag, next_tag=None):
        # Weight of the (prev_tag, next_tag) transition; next_tag defaults
        # to the STOP tag so the final transition can be scored.
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        if next_tag is None:
            next_tag = self.stop_tag
        return weights.get(self.KEY_TAG, {}).get((prev_tag, next_tag), 0)

    def get_emission_feature(self, tag, word):
        # Weight of emitting `word` from `tag` (KEY_WORD family).
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        return weights.get(self.KEY_WORD, {}).get((tag, word), 0)

    def get_cand_emission_feature(self, tag, word):
        # Candidate emission weight, keyed on the full tag ("CAND_EMI").
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        return weights.get("CAND_EMI", {}).get((tag, word), 0)

    def main_compute_prev(self, result_list, t, i, j, word_list, tag_list):
        """Viterbi scoring callback: score of reaching tag j at position
        t+1 from tag i at position t (transition + emission, plus suffix
        weight when use_suffix).  t < 0 means the virtual position before
        the first word, where only the START tag is a valid source; other
        sources get a large negative penalty."""
        addn_wt = 0
        if self.use_suffix:
            addn_wt = self.get_suffix_feature(tag_list[j], word_list[t + 1])
        if t < 0:
            if tag_list[i] == self.start_tag:
                return self.get_transition_feature(tag_list[i], tag_list[j]) + self.get_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt
            else:
                return -10000000 + self.get_transition_feature(tag_list[i], tag_list[j]) + self.get_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt
        return result_list[t][i][0] + self.get_transition_feature(tag_list[i], tag_list[j]) + self.get_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt

    def main_compute_final(self, result_list, i, num_words, tag_list):
        # Score of ending the sequence on tag i: best score at the last
        # word plus the implicit transition into STOP.
        return result_list[num_words - 1][i][0] + self.get_transition_feature(tag_list[i])

    def compute_prev(self, result_list, t, j, word_list, tag_list):
        # Candidate-family variant of the position score (full-tag
        # emission + suffix weights only).
        addn_wt = self.get_cand_suffix_feature(tag_list[j], word_list[t + 1])
        return self.get_cand_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt

    def estimate_weights(self, training_file, end_line=5500):
        """Perceptron training: `training_level` passes over the file,
        decoding each sentence and applying reestimate_weights with a
        decaying multiplier (for weight averaging)."""
        self.logger.info("Started estimating weights...")
        parse_level = 0
        multiplier = self.training_level * end_line   # decays by 1 per sentence
        for r_level in xrange(self.training_level):
            parse_level += 1
            for line_no, word_list in self.data_parser.next(training_file):
                if line_no > end_line:
                    break
                if line_no % 500 == 0:
                    print "LEVEL: %d : Processed %d lines..." % (r_level, line_no)
                # Strip START/STOP markers before decoding.
                new_word_list = [(word, tag) for word, tag in word_list if tag not in self.special_tags]
                predicted_tags = self.viterbi_obj.predict_sequence([word for word, tag in new_word_list])
                self.reestimate_weights(new_word_list, predicted_tags, multiplier)
                multiplier -= 1
        self.logger.info("Completed parsing %d time(s) for estimating weights" % parse_level)
        self.logger.info("Completed estimating weights")
        self.logger.info("TAG WEIGHTS : %d" % len(self.weights.get(self.KEY_TAG, {})))
        self.logger.info("WORD WEIGHTS : %d" % len(self.weights.get(self.KEY_WORD, {})))

    def reestimate_weights(self, word_list, predicted_tags, multiplier):
        """One perceptron update for one sentence: +1 for every gold
        feature, -1 for every predicted feature, collected in local_diff
        and then folded into self.weights and self.avg_weights (the
        latter scaled by `multiplier` to implement weight averaging).

        A "main" tag is the first character of the tag (coarse group),
        except PUN/START/STOP which are kept whole.
        """
        prev_tag = self.start_tag
        pred_prev_tag = self.start_tag
        prev_main_tag = self.start_tag
        pred_prev_main_tag = self.start_tag
        local_diff = {}                       # feature-family -> {feature: +/- count}
        for index, (word, tag) in enumerate(word_list):
            if tag in self.special_tags:
                pred_tag = tag
            else:
                if predicted_tags:
                    pred_tag = predicted_tags[index]
                else:
                    pred_tag = None
            if tag in ["PUN", "START", "STOP"]:
                main_tag = tag
            else:
                main_tag = tag[0]
            # NOTE(review): pred_tag may be None when predicted_tags is
            # empty, in which case pred_tag[0] below would raise -- assumed
            # the caller always supplies a non-empty prediction.
            if pred_tag in ["PUN", "START", "STOP"]:
                pred_main_tag = pred_tag
            else:
                pred_main_tag = pred_tag[0]
            # weights of tags: promote the gold transition, demote the predicted one
            if prev_main_tag is not None:
                #count = self.weights.setdefault(self.KEY_TAG, {}).setdefault((prev_tag, tag), 0)
                #self.weights[self.KEY_TAG][(prev_tag, tag)] = count + 1
                count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((prev_main_tag, main_tag), 0)
                local_diff[self.KEY_TAG][(prev_main_tag, main_tag)] = count + 1
            if predicted_tags and pred_prev_main_tag is not None:
                #count = self.weights.setdefault(self.KEY_TAG, {}).setdefault((pred_prev_tag, pred_tag), 0)
                #self.weights[self.KEY_TAG][(pred_prev_tag, pred_tag)] = count - 1
                count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((pred_prev_main_tag, pred_main_tag), 0)
                local_diff[self.KEY_TAG][(pred_prev_main_tag, pred_main_tag)] = count - 1
            # weights of words: same +/- scheme for emissions, in both the
            # main-tag family and the full-tag candidate family
            if tag not in self.special_tags:
                #count = self.weights.setdefault(self.KEY_WORD, {}).setdefault((tag, word), 0)
                #self.weights[self.KEY_WORD][(tag, word)] = count + 1
                count = local_diff.setdefault(self.KEY_WORD, {}).setdefault((main_tag, word), 0)
                local_diff[self.KEY_WORD][(main_tag, word)] = count + 1
                count = local_diff.setdefault("CAND_EMI", {}).setdefault((tag, word), 0)
                local_diff["CAND_EMI"][(tag, word)] = count + 1
                if predicted_tags:
                    #count = self.weights.setdefault(self.KEY_WORD, {}).setdefault((pred_tag, word), 0)
                    #self.weights[self.KEY_WORD][(pred_tag, word)] = count - 1
                    count = local_diff.setdefault(self.KEY_WORD, {}).setdefault((pred_main_tag, word), 0)
                    local_diff[self.KEY_WORD][(pred_main_tag, word)] = count - 1
                    count = local_diff.setdefault("CAND_EMI", {}).setdefault((pred_tag, word), 0)
                    local_diff["CAND_EMI"][(pred_tag, word)] = count - 1
                # weight of suffix: +1 for the gold suffix feature, -1 for the
                # predicted one, for suffix lengths 1-3 in both families.
                # NOTE(review): nesting reconstructed from flattened source;
                # the -1 updates assume a prediction exists (see note above).
                if len(word) > 1:
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-1:], main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-1:], main_tag)] = count + 1
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-1:], pred_main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-1:], pred_main_tag)] = count - 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-1:], tag), 0)
                    local_diff["CAND_SUF"][(word[-1:], tag)] = count + 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-1:], pred_tag), 0)
                    local_diff["CAND_SUF"][(word[-1:], pred_tag)] = count - 1
                if len(word) > 2:
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-2:], main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-2:], main_tag)] = count + 1
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-2:], pred_main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-2:], pred_main_tag)] = count - 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-2:], tag), 0)
                    local_diff["CAND_SUF"][(word[-2:], tag)] = count + 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-2:], pred_tag), 0)
                    local_diff["CAND_SUF"][(word[-2:], pred_tag)] = count - 1
                if len(word) > 3:
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-3:], main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-3:], main_tag)] = count + 1
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-3:], pred_main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-3:], pred_main_tag)] = count - 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-3:], tag), 0)
                    local_diff["CAND_SUF"][(word[-3:], tag)] = count + 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-3:], pred_tag), 0)
                    local_diff["CAND_SUF"][(word[-3:], pred_tag)] = count - 1
            prev_tag = tag
            prev_main_tag = main_tag
            pred_prev_tag = pred_tag
            pred_prev_main_tag = pred_main_tag
        # Final transition into STOP for both the gold and the predicted path
        # (uses the last iteration's main_tag / pred_main_tag).
        count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((main_tag, self.stop_tag), 0)
        local_diff[self.KEY_TAG][(main_tag, self.stop_tag)] = count + 1
        count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((pred_main_tag, self.stop_tag), 0)
        local_diff[self.KEY_TAG][(pred_main_tag, self.stop_tag)] = count - 1
        # Fold the per-sentence diff into the running and averaged weights.
        for tag_type, info_hash in local_diff.iteritems():
            for key, value in info_hash.iteritems():
                count = self.weights.setdefault(tag_type, {}).setdefault(key, 0)
                self.weights[tag_type][key] = count + value
                wt = self.avg_weights.setdefault(tag_type, {}).setdefault(key, 0)
                self.avg_weights[tag_type][key] = wt + multiplier * value
# Tail of a try/except wrapping the weight-loading step (the `try:` sits
# above this chunk): bail out cleanly when the saved weights are absent.
except FileNotFoundError:
    print("Weights were not found")
    exit(0)

# Evaluation loop: decode every test sentence with Viterbi and tally
# per-tag accuracy plus a confusion matrix of true vs. predicted tags.
predictions = list()
incorrect_count = 0
correct_count = 0
start_time = time()
incorrect_tags = dict()   # true tag -> number of times it was mis-predicted
confusion_matrix = pd.DataFrame(index=tags, columns=tags).fillna(0)
for i in range(test_data_size):
    # print(i+1,'/',test_data_size)
    sentence = [x[0] for x in test_data.data[i]]    # words of sentence i
    test_tags = [x[1] for x in test_data.data[i]]   # gold tags of sentence i
    viterbi = Viterbi(tags, gen.transform, sentence, w_0, 5)
    predicted_tags = viterbi.run()
    predictions.append((sentence, predicted_tags))
    for t, p in zip(test_tags, predicted_tags):
        if t == p:
            correct_count += 1
        else:
            incorrect_count += 1
            if t in incorrect_tags:
                incorrect_tags[t] += 1
            else:
                incorrect_tags[t] = 1
        # NOTE(review): placement reconstructed from flattened source --
        # assumed to count every (true, predicted) pair; confirm whether
        # it belonged only under the mismatch branch.
        confusion_matrix.loc[t, p] += 1
end_time = time() - start_time   # total decoding time in seconds
# Checking arguments parser = argparse.ArgumentParser(description='Output Viterbi Sequence.') parser.add_argument('sequence', metavar='N', type=str, nargs='+', help='an integer for the sequence') parser.add_argument( '-s', '--sequence', action='store_const', dest='sequence', const=sequence, default=None, help= 'A sequence of numbers for the output sequence :: Required for any output.' ) args = parser.parse_args() sequence = args.sequence[0] # If sequence not provided, return help and exit if not sequence: parser.print_help() exit() print 'Observation(Input Sequence):', sequence element = Viterbi(sequence) element.viterbi()
import numpy as np
from viterbi import Viterbi, add_one_smoothing

# Tag inventory and vocabulary of the toy corpus.
TAGS = ['N', 'C', 'V', 'J']
LEXICON = ['that', 'is', 'not', 'it', 'good', 'bad']

# Initial state distribution, one entry per tag in TAGS.
Pi = [1 / 8, 3 / 8, 3 / 8, 1 / 8]

# Raw tag-to-tag transition counts (rows: from-tag, cols: to-tag).
count_A = np.array([[2., 0., 3., 1.],
                    [2., 0., 0., 0.],
                    [4., 0., 1., 0.],
                    [0., 0., 0., 0.]])

# Raw tag-to-word emission counts (rows: tag, cols: word in LEXICON).
count_B = np.array([[4., 0., 2., 2., 0., 0.],
                    [2., 0., 0., 0., 0., 0.],
                    [0., 6., 0., 0., 0., 0.],
                    [0., 0., 0., 0., 1., 0.]])

if __name__ == '__main__':
    # Turn the raw counts into smoothed probability matrices.
    A = add_one_smoothing(count_A)
    B = add_one_smoothing(count_B)
    tagger = Viterbi(Pi, A, B, TAGS, LEXICON)
    # Decode and print the tag sequence for each example sentence.
    for sentence in ('bad is not good', 'is it bad'):
        print(tagger.predict_tags(sentence))
def callViterbi(self, observations):
    """Decode `observations` with a fresh Viterbi instance using this
    object's model parameters.

    Returns a (probabilities, state_sequence) tuple taken from the first
    two elements of Viterbi.execute's result.
    """
    decoder = Viterbi()
    outcome = decoder.execute(observations, self.state, self.startProb,
                              self.transSt, self.emission)
    return (outcome[0], outcome[1])
import json  # FIX: json.load below was a NameError -- json was never imported

import utils
from ChineseTone import PinyinHelper
from tqdm import tqdm

# Load the pinyin -> hanzi test cases; each entry has 'py' (pinyin input)
# and 'hz' (expected hanzi string).
with open('./test/test_set.json') as f:
    test_set = json.load(f)

hmm = HmmParam()

# Accuracy counters: per-character ("single") and whole-sentence.
count_single = 0
correct_single = 0
count_sentence = 0
correct_sentence = 0
for test in tqdm(test_set):  # idiomatic: iterate entries directly, not indices
    count_sentence += 1
    flag = True  # stays True only while every character matches
    # Best decoding: top path of a beam-5 Viterbi over the pinyin input.
    answer = Viterbi(hmm, test['py'], 5)[0].path
    # print(answer)
    for idx, an in enumerate(answer):
        count_single += 1
        if an == test['hz'][idx]:
            correct_single += 1
        else:
            flag = False
    if flag:
        correct_sentence += 1

print('single:', correct_single / count_single)
print('sentence:', correct_sentence / count_sentence)
class GPSMatcher:
    """Matches a stream of GPS observations against the states of an HMM
    (typically map edges) using a Viterbi decoder whose candidate states
    are pruned with a spatial index.

    `hmm` maps each state to whatever the Viterbi implementation expects;
    `emission_probability` scores an observation against a state.
    """

    def __init__(self, hmm, emission_probability, constraint_length=10,
                 MAX_DIST=500, priors=None, smallV=0.00000000001):
        # initialize spatial index
        self.previous_obs = None
        # FIX: was `priors == None` -- use identity comparison for None.
        if priors is None:
            # Uniform prior over all states when none is supplied.
            priors = dict([(state, 1.0 / len(hmm)) for state in hmm])
        state_spatial_index = Rtree()
        unlocated_states = []      # states with no geometry (e.g. 'unknown')
        id_to_state = {}
        # FIX: the counter was named `id`, shadowing the builtin.
        state_id = 0
        for state in hmm:
            geom = self.geometry_of_state(state)
            if not geom:
                unlocated_states.append(state)
            else:
                # Index the state's bounding box (note Rtree boxes are
                # (min_lon, min_lat, max_lon, max_lat)).
                ((lat1, lon1), (lat2, lon2)) = geom
                state_spatial_index.insert(
                    state_id,
                    (min(lon1, lon2), min(lat1, lat2),
                     max(lon1, lon2), max(lat1, lat2)))
                id_to_state[state_id] = state
                state_id = state_id + 1

        def candidate_states(obs):  # was (lat, lon) in place of obs
            # States within MAX_DIST meters of the observation, plus all
            # unlocated states; all states when the observation has no
            # geometry.
            geom = self.geometry_of_observation(obs)
            # FIX: was `geom == None`.
            if geom is None:
                return hmm.keys()
            else:
                (lat, lon) = geom
                nearby_states = state_spatial_index.intersection(
                    (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                     lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
                     lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                     lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))
                candidates = [id_to_state[i] for i in nearby_states] + unlocated_states
                return candidates

        self.viterbi = Viterbi(hmm, emission_probability,
                               constraint_length=constraint_length,
                               priors=priors,
                               candidate_states=candidate_states,
                               smallV=smallV)

    def step(self, obs, V, p):
        """Advance the decoder by one observation (plus any interpolated
        observations between the previous one and `obs`); returns the
        updated (V, p) Viterbi state."""
        # FIX: was `self.previous_obs != None`.
        if self.previous_obs is not None:
            for int_obs in self.interpolated_obs(self.previous_obs, obs):
                V, p = self.viterbi.step(int_obs, V, p)
        V, p = self.viterbi.step(obs, V, p)
        self.previous_obs = obs
        return V, p

    def interpolated_obs(self, prev, obs):
        # Hook for subclasses: synthetic observations between prev and obs.
        # The base implementation interpolates nothing.
        return []

    def geometry_of_observation(self, obs):
        # Base implementation: the observation is its own (lat, lon) geometry.
        return obs

    def geometry_of_state(self, state):
        """ Subclasses should override this method to return the geometry of a given state, typically an edge."""
        if state == 'unknown':
            return None
        else:
            return state
#Set up the VITERBI states = ["Buy", "Sell"] obs = [] obs_prices = [] obs_delta = [] prev_price = hist_prices["Adj Close"][0] for price in hist_prices["Adj Close"]: if price >= prev_price: obs.append("Up") else: obs.append("Down") obs_prices.append(price) obs_delta.append(price - prev_price) prev_price = price possible_obs = ["Up", "Down"] v = Viterbi(initial, states, obs, possible_obs, trans, emiss) v.run() #v.print_table() #v.print_backtrack_table() #v.print_backtrack() #make a graph backtrack = v.get_backtrack() backtrack.pop(0) to_print = pd.DataFrame(hist_prices['Adj Close']) to_print["Delta"] = obs_delta to_print["Output"] = backtrack print(to_print) fig = hist_prices['Adj Close'].plot(grid="True") i = start tmp_backtrack = backtrack
def cross_validation(sequences, training_method, decoder):
    """
    Performs leave-one-out cross-validation over `sequences` (one fold
    per sequence), comparing Viterbi and Posterior decoding.

    Requires an array of dict sequences (each with 'X' observations and
    'Z' true state annotations).
    Requires the training function: takes merged training data, returns
    a model.
    NOTE(review): the `decoder` parameter is unused -- both a Viterbi()
    and a Posterior() decoder are instantiated internally.
    Prints mean/variance of the per-fold accuracy for both decoders.
    """
    # here we store the total_ac (accuracy) for each cross-validation fold
    vit_total_ac = np.array([.0] * len(sequences))
    post_total_ac = np.array([.0] * len(sequences))
    vit = Viterbi()
    post = Posterior()
    for i in range(len(sequences)):
        # tp/fp/tn/fn accumulators for this fold
        vit_total_scores = np.zeros([4])
        post_total_scores = np.zeros([4])
        # arrays with the sequences for training and for validation:
        # fold i is held out, the rest is used for training
        training_data_array = sequences[:]
        validation_data_array = [training_data_array.pop(i)]
        # merging the arrays into dictionaries
        training_data = merge(training_data_array)
        validation_data = merge(validation_data_array)
        # the training function returns a model
        model = training_method(training_data)
        # do viterbi prediction on set i
        for key, sequence in validation_data.items():
            # the true state sequence from the file
            true_seq = sequence['Z']
            # the sequence decoded using viterbi, or posterior, and the model generated
            vit_pred_seq = vit.decode(model, sequence['X'])
            post_pred_seq = post.decode(model, sequence['X'])
            """
            print key
            print "PREDICTED"
            print pred_seq
            print "TRUE"
            print true_seq
            """
            tp, fp, tn, fn = compare_tm_pred.count(true_seq, vit_pred_seq)
            vit_total_scores += np.array([tp, fp, tn, fn])
            tp, fp, tn, fn = compare_tm_pred.count(true_seq, post_pred_seq)
            post_total_scores += np.array([tp, fp, tn, fn])
            if VERBOSE:
                print ">" + key
                compare_tm_pred.print_stats(tp, fp, tn, fn)
                print
        # index 3 of compute_stats is the fold's accuracy figure
        vit_total_ac[i] = compare_tm_pred.compute_stats(*vit_total_scores)[3]
        post_total_ac[i] = compare_tm_pred.compute_stats(*post_total_scores)[3]
        #print total_ac
        if VERBOSE:
            print "Summary 10-fold cross validation over index %i :" % (i)
            # compare_tm_pred.print_stats( *total_scores )
            print
            print
            print
            print "-------------------------------------------------------"
        if DEBUG:
            raw_input("press any key to continue\n")
    print "Overall viterbi result mean: %s, variance: %s" % (np.mean(vit_total_ac), np.var(vit_total_ac))
    print "Posterior mean: %s, variance %s" % (np.mean(post_total_ac), np.var(post_total_ac))