def test(self, word_seq_path, output_path):
    """Tag the word sequence read from ``word_seq_path`` and save the result.

    The sequence is loaded through ``self.__prepare_word_seq``, which yields
    the original words and a processed copy. The processed copy is decoded
    with a ``Viterbi`` instance built from this object's vocabulary, tag set,
    transition and emission probabilities.

    One line is written to ``output_path`` per original word, formatted as
    ``word<TAB>tag``. A falsy word (e.g. an empty string) is written as a
    blank line instead.
    """
    original_seq, processed_seq = self.__prepare_word_seq(word_seq_path)
    decoder = Viterbi(self.vocab_list, self.tags,
                      self.trans_prob, self.emit_prob)
    # decode() returns a pair; only the predicted tags are used here.
    tags_pred, _ = decoder.decode(processed_seq)

    with open(output_path, "w") as out:
        for word, tag in zip(original_seq, tags_pred):
            line = "{0}\t{1}\n".format(word, tag) if word else "\n"
            out.write(line)
def train(self, sequences, iterations=3):
    """Re-estimate the model by alternating Viterbi decoding and counting.

    For each of ``iterations`` rounds:
      1. the model is switched to log space (``self.log_space()``),
      2. the 'X' entry of every sequence is decoded with Viterbi, using
         ``self`` as the model, and the result is stored under 'Z',
      3. the model leaves log space again (``self.delog()``),
      4. ``self.train_by_counting`` is run on the re-labelled sequences.

    ``sequences`` is a dict whose values are dicts with an 'X' entry; it is
    modified in place. Each decoded 'Z' and the final model are printed.

    Returns a ``Model`` built from ``self.keys``, ``self.model`` and
    ``self.labels``.
    """
    decoder = Viterbi()
    for _ in range(iterations):
        self.log_space()
        for seq in sequences.values():
            seq['Z'] = decoder.decode(self, seq['X'])
            print(seq['Z'])
        # leave log space again before counting
        self.delog()
        self.train_by_counting(sequences)
    print(Model(self.keys, self.model, self.labels))
    return Model(self.keys, self.model, self.labels)
def train(self, sequences, iterations=3):
    """Run Viterbi training for a fixed number of iterations.

    Every iteration decodes all sequences with the current model and then
    re-estimates the model from the decoded labels:

    * ``self.log_space()`` is called before decoding,
    * each sequence's 'X' entry is decoded by ``Viterbi.decode`` (with
      ``self`` passed as the model) and written back under 'Z',
    * ``self.delog()`` is called once decoding is done,
    * ``self.train_by_counting(sequences)`` updates the model.

    The ``sequences`` dict is updated in place. The decoded 'Z' of every
    sequence and the resulting model are printed.

    Returns ``Model(self.keys, self.model, self.labels)``.
    """
    viterbi = Viterbi()
    remaining = iterations
    while remaining > 0:
        remaining -= 1
        self.log_space()
        for _name, entry in sequences.items():
            decoded = viterbi.decode(self, entry['X'])
            entry['Z'] = decoded
            print(entry['Z'])
        # we return from log space
        self.delog()
        self.train_by_counting(sequences)
    print(Model(self.keys, self.model, self.labels))
    return Model(self.keys, self.model, self.labels)
def test(self):
    """Decode the test split with Viterbi and write the tagged words to disk.

    Loads the data via ``dataloader(self.corpus + TEST_WORDS, 'test')``,
    which returns two sequences (presumably the raw words and a preprocessed
    copy -- confirm against ``dataloader``). The second one is decoded by a
    ``Viterbi`` tagger built from ``self.vocab``, ``self.tags``, ``self.A``
    and ``self.B``.

    Side effects:
        * ``self.pred_tags`` is replaced by a list of ``(word, tag)`` pairs.
        * ``PRED_T_POS`` is overwritten with one ``word<TAB>tag`` line per
          pair; a falsy word is written as a blank line.
        * Start and end messages (with elapsed seconds) are printed.
    """
    print('Test started...')
    start_test = time.time()
    # Reset first so the attribute is empty if loading or decoding fails.
    self.pred_tags = []

    test_orig, test_prep = dataloader(self.corpus + TEST_WORDS, 'test')
    tagger = Viterbi(self.vocab, self.tags, test_prep, self.A, self.B)
    preds = tagger.decode()
    self.pred_tags = list(zip(test_orig, preds))

    # The context manager closes the file; no explicit close() is needed.
    with open(PRED_T_POS, 'w') as out:
        for word, tag in self.pred_tags:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))

    print('Test finished, file has been written in ' +
          str(time.time() - start_test))
def validate(self):
    """Decode the validation split with Viterbi and write the tagged words.

    Loads the data via ``dataloader(self.corpus + VALIDATE_WORDS,
    'validate')``, which returns two sequences (presumably the raw words and
    a preprocessed copy -- confirm against ``dataloader``). The second one is
    decoded by a ``Viterbi`` tagger built from ``self.vocab``, ``self.tags``,
    ``self.A`` and ``self.B``.

    Side effects:
        * ``self.pred_tags`` is replaced by a list of ``(word, tag)`` pairs.
        * ``PRED_V_POS`` is overwritten with one ``word<TAB>tag`` line per
          pair; a falsy word is written as a blank line.
        * Start and end messages (with elapsed seconds) are printed.
    """
    print('Validation started...')
    start_val = time.time()
    # Reset first so the attribute is empty if loading or decoding fails.
    self.pred_tags = []

    valid_orig, valid_prep = dataloader(self.corpus + VALIDATE_WORDS,
                                        'validate')
    tagger = Viterbi(self.vocab, self.tags, valid_prep, self.A, self.B)
    preds = tagger.decode()
    self.pred_tags = list(zip(valid_orig, preds))

    # The context manager closes the file; no explicit close() is needed.
    with open(PRED_V_POS, 'w') as out:
        for word, tag in self.pred_tags:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))

    print('Validation ended, file has been written in ' +
          str(time.time() - start_val))
def test_viterbi_decode():
    '''
    Test case based on HW3 question.

    Builds a two-state (Cow/Duck) HMM plus a start/end state, decodes the
    observation sequence "moo hello quack <end>" and checks both the best
    tag sequence and its log-probability.

    The expected path Cow -> Duck -> Duck -> end has probability
    1.0*0.9 * 0.3*0.4 * 0.5*0.6 * 0.2*1.0 = 0.00648, whose natural log is
    about -5.03903.
    '''
    log = logging.getLogger('test viterbi')
    # Stand-in for "impossible" events so no probability is exactly zero.
    ZERO = 0.000000000000000001
    # The expected value is only given to five decimals, so compare with a
    # tolerance of that order rather than with ZERO.
    LOG_PROB_TOLERANCE = 1e-4

    obs_space = ["moo", "hello", "quack", START_END_OBS]
    states = ["Cow", "Duck", START_END_TAG]
    trans_prob = [[0.5, 0.3, 0.2],
                  [0.3, 0.5, 0.2],
                  [1.0, ZERO, ZERO]]
    emit_prob = [[0.9, 0.1, ZERO, ZERO],
                 [ZERO, 0.4, 0.6, ZERO],
                 [ZERO, ZERO, ZERO, 1.0]]

    decoder = Viterbi(obs_space, states, trans_prob, emit_prob)
    obs = ["moo", "hello", "quack", START_END_OBS]
    seq, prob = decoder.decode(obs)
    log.debug("seq: %s", seq)
    log.debug("log_prob: %s", prob)

    assert seq == ["Cow", "Duck", "Duck", START_END_TAG]
    # Two-sided check: without abs() any arbitrarily low value would pass.
    assert abs(prob - (-5.03903)) < LOG_PROB_TOLERANCE
outputs.to_project_1_probs_file(sequences.get(), probs, 'viterbi-probs.txt') """ if __name__ == '__main__': model = hmm.Model(KEYS) model.load(HMMFILE) sequences = sequences.Sequences(SEQUENCEFILE) # load methods vit = Viterbi() post = Posterior() # viterbi probs = {} for key, sequence in sequences.get().items(): probs[key] = vit.decode(model, sequence) outputs.to_project_2_viterbi(sequences.get(), probs, 'pred-test-sequences-project2-viterbi.txt') probs = {} for key, value in sequences.get().items(): sequence = {'Z': post.decode(model, value), 'X': value} log_joint = compute_hmm(model, sequence) probs[key] = (log_joint, sequence['Z']) #outputs.to_project_2_posterior(sequences.get(), probs, 'posterior-output.txt') outputs.to_project_2_posterior( sequences.get(), probs, 'pred-test-sequences-project2-posterior.txt') # testing
def cross_validation(sequences, training_method, decoder):
    """Run leave-one-out cross-validation with Viterbi and posterior decoding.

    Every element of ``sequences`` is held out once. The remaining elements
    are merged into a training set, ``training_method`` builds a model from
    it, and the held-out element is decoded with both ``Viterbi`` and
    ``Posterior``. Predictions are compared with the true annotation through
    ``compare_tm_pred.count`` and the counts are summed per fold.

    Args:
        sequences: list of sequence collections accepted by ``merge``. After
            merging, each value must be a dict holding the observations
            under 'X' and the true annotation under 'Z'.
        training_method: callable taking the merged training data and
            returning a model usable by the decoders.
        decoder: unused. Both decoders are always instantiated and run; the
            parameter is kept so existing callers keep working.

    Returns:
        None. The mean and variance of the per-fold scores (element 3 of
        ``compare_tm_pred.compute_stats``) are printed for both decoders.
    """
    fold_count = len(sequences)
    # per-fold score of each decoder
    vit_total_ac = np.zeros(fold_count)
    post_total_ac = np.zeros(fold_count)
    vit = Viterbi()
    post = Posterior()

    for i, held_out in enumerate(sequences):
        # running (tp, fp, tn, fn) totals for this fold
        vit_total_scores = np.zeros(4)
        post_total_scores = np.zeros(4)

        # everything except element i is used for training
        training_data = merge(sequences[:i] + sequences[i + 1:])
        validation_data = merge([held_out])

        model = training_method(training_data)

        for key, sequence in validation_data.items():
            true_seq = sequence['Z']
            vit_pred_seq = vit.decode(model, sequence['X'])
            post_pred_seq = post.decode(model, sequence['X'])

            vit_counts = compare_tm_pred.count(true_seq, vit_pred_seq)
            vit_total_scores += np.array(vit_counts)
            post_counts = compare_tm_pred.count(true_seq, post_pred_seq)
            post_total_scores += np.array(post_counts)

            if VERBOSE:
                # Only the posterior counts are reported per sequence.
                print(">" + key)
                compare_tm_pred.print_stats(*post_counts)
                print("")

        vit_total_ac[i] = compare_tm_pred.compute_stats(*vit_total_scores)[3]
        post_total_ac[i] = compare_tm_pred.compute_stats(
            *post_total_scores)[3]

        if VERBOSE:
            print("Summary 10-fold cross validation over index %i :" % (i))
            print("")
            print("")
            print("")
            print("-------------------------------------------------------")
        if DEBUG:
            raw_input("press any key to continue\n")

    print("Overall viterbi result mean: %s, variance: %s" % (
        np.mean(vit_total_ac), np.var(vit_total_ac)))
    print("Posterior mean: %s, variance %s" % (
        np.mean(post_total_ac), np.var(post_total_ac)))
def cross_validation(sequences, training_method, decoder):
    """Cross-validate a training method, holding out one element per fold.

    There are ``len(sequences)`` folds. In fold ``i`` element ``i`` is the
    validation set and all other elements form the training set. The model
    returned by ``training_method`` is evaluated with both Viterbi and
    posterior decoding on the validation set.

    Args:
        sequences: list whose elements can be combined by ``merge`` into a
            dict mapping a key to ``{'X': observations, 'Z': annotation}``.
        training_method: function that receives the merged training dict
            and returns a model.
        decoder: not used by this function (both decoders are created
            internally); retained for interface compatibility.

    Returns:
        None. Prints the mean and variance over folds of element 3 of
        ``compare_tm_pred.compute_stats`` for each decoder.
    """
    # here we store the score of each fold
    vit_total_ac = np.array([.0] * len(sequences))
    post_total_ac = np.array([.0] * len(sequences))

    vit = Viterbi()
    post = Posterior()

    for i in range(len(sequences)):
        vit_total_scores = np.zeros([4])
        post_total_scores = np.zeros([4])

        # split a copy of the input: element i validates, the rest trains
        training_data_array = sequences[:]
        validation_data_array = [training_data_array.pop(i)]

        # merging the arrays into dictionaries
        training_data = merge(training_data_array)
        validation_data = merge(validation_data_array)

        # the training function returns a model
        model = training_method(training_data)

        for key, sequence in validation_data.items():
            # the annotation from the file
            true_seq = sequence['Z']
            observed = sequence['X']

            tp, fp, tn, fn = compare_tm_pred.count(
                true_seq, vit.decode(model, observed))
            vit_total_scores += np.array([tp, fp, tn, fn])

            # tp/fp/tn/fn now hold the posterior counts; the verbose
            # report below therefore describes the posterior decoding only
            tp, fp, tn, fn = compare_tm_pred.count(
                true_seq, post.decode(model, observed))
            post_total_scores += np.array([tp, fp, tn, fn])

            if VERBOSE:
                print(">" + key)
                compare_tm_pred.print_stats(tp, fp, tn, fn)
                print("")

        vit_total_ac[i] = compare_tm_pred.compute_stats(*vit_total_scores)[3]
        post_total_ac[i] = compare_tm_pred.compute_stats(
            *post_total_scores)[3]

        if VERBOSE:
            print("Summary 10-fold cross validation over index %i :" % (i))
            for _ in range(3):
                print("")
            print("-------------------------------------------------------")
        if DEBUG:
            raw_input("press any key to continue\n")

    # The message is a single literal; a string cannot span physical lines.
    print("Overall viterbi result mean: %s, variance: %s" % (
        np.mean(vit_total_ac), np.var(vit_total_ac)))
    print("Posterior mean: %s, variance %s" % (
        np.mean(post_total_ac), np.var(post_total_ac)))