class Tagger:
    ''' Labels every sentence of a test file with a loaded model and, when the
    file also carries reference labels, reports token-level accuracy. '''

    def __init__(self, test_data_file, feature_template_list, model_file, test_output_file, word_sta=None):
        '''
        test_data_file        -- path handed to read_test_data()
        feature_template_list -- forwarded to Model
        model_file            -- forwarded to Model as the model to load
        test_output_file      -- path the tagged sentences are written to
        word_sta              -- prior passed to Model.viterbi(); defaults to
                                 a fresh empty dict
        '''
        self.test_data_file = test_data_file
        self.model = Model(feature_template_list, list(ALL_LABEL), model_file)
        self.test_output_file = test_output_file
        # Build the fallback per instance: a `{}` default in the signature
        # would be one dict shared by every Tagger ever constructed.
        self.prior = {} if word_sta is None else word_sta

    @staticmethod
    def _accuracy(hits, total):
        ''' Return hits / total as a float, or 0.0 when nothing was scored. '''
        return float(hits) / total if total else 0.0

    def tag(self):
        '''
        decode every sentence of the test file

        Writes one "word<TAB>predicted<TAB>reference" line per token followed
        by a blank line per sentence to test_output_file (the reference column
        is empty when the input has no labels), shows a running accuracy on
        stdout and prints the final accuracy at the end.
        '''
        sentence_count = 0
        hits = 0
        total = 0
        # The context manager guarantees the output is flushed and closed,
        # even if decoding raises part-way through the corpus.
        with open(self.test_output_file, 'w') as out_file:
            for (_chunk, line) in read_test_data(self.test_data_file):
                observe_data = [w[0] for w in line]
                infer_label = self.model.viterbi(observe_data, self.prior)
                if line and len(line[0]) > 1:
                    # Items carry a second field: treat it as the reference label.
                    ideal_label = [w[1] for w in line]
                    # assumes viterbi() returns one label per token -- TODO confirm
                    hits += sum(1 for gold, guess in zip(ideal_label, infer_label)
                                if gold == guess)
                    total += len(ideal_label)
                else:
                    ideal_label = [''] * len(infer_label)
                for (word, label, labelr) in zip(observe_data, infer_label, ideal_label):
                    out_file.write(word + '\t' + label + '\t' + labelr + '\n')
                sentence_count += 1
                # _accuracy() keeps unlabelled input (total == 0) from dividing by zero.
                sys.stdout.write("tag %d sentence p(f, r) %f \r"
                                 % (sentence_count, self._accuracy(hits, total)))
                sys.stdout.flush()
                # Blank line separates sentences in the output file.
                out_file.write('\n')
        # Move past the '\r'-terminated progress line before the summary.
        sys.stdout.write('\n')
        sys.stdout.write("correctness: %f\n" % self._accuracy(hits, total))
class Perceptron:
    ''' Perceptron trainer for the sequence model.

    word_sta is in the form: word -> tag -> count
    '''

    def __init__(self, train_data_file, feature_template_list, model_file, old_model_file=None, word_sta=None):
        '''
        train_data_file       -- path handed to read_train_data()
        feature_template_list -- forwarded to Model
        model_file            -- path the trained model is written to
        old_model_file        -- forwarded to Model as the model to start from
        word_sta              -- prior passed to Model.viterbi(); defaults to
                                 a fresh empty dict
        '''
        self.train_data_file = train_data_file
        self.model = Model(feature_template_list, list(ALL_LABEL), old_model_file)
        self.model_file = model_file
        # Build the fallback per instance: a `{}` default in the signature
        # would be one dict shared by every Perceptron ever constructed.
        self.prior = {} if word_sta is None else word_sta

    @staticmethod
    def _ratio(numerator, denominator):
        ''' Return numerator / denominator as a float, or 0.0 for a zero denominator. '''
        return float(numerator) / denominator if denominator else 0.0

    def _save_model(self, path):
        ''' Write str(self.model) plus a newline to path and close the file. '''
        with open(path, 'w') as model_out:
            model_out.write(str(self.model) + '\n')

    def train(self, iteration, keep):
        '''
        perceptron train algorithm

        iteration -- number of passes over the training data
        keep      -- a sentence is skipped when random.random() > keep

        After every pass the model is dumped to "<model_file>.delta<pass>";
        after the last pass it is written to model_file.
        '''
        # time.clock no longer exists on Python >= 3.8; fall back to it only
        # where perf_counter is unavailable (Python 2).
        timer = getattr(time, 'perf_counter', None) or time.clock
        for it in range(iteration):
            viterbi_time = 0
            sentence_no = 0
            precision_sum = 0.0
            sampled = 0
            sys.stdout.write('perceptron iteration %d\n' % (it + 1))
            for (_chunk, line) in read_train_data(self.train_data_file):
                sentence_no += 1
                if sentence_no % 1000 == 0:
                    sys.stdout.write('complete %d sentences in %d secs\n'
                                     % (sentence_no, viterbi_time))
                if random.random() > keep:
                    continue
                # Checked after the sampling draw so the random sequence is
                # unchanged; an empty sentence has nothing to learn from and
                # would divide by zero in the precision below.
                if not line:
                    continue
                observe_data = [w[0] for w in line]
                ideal_label = [w[1] for w in line]
                start = timer()
                infer_label = self.model.viterbi(observe_data, self.prior)
                viterbi_time += timer() - start
                # assumes viterbi() returns one label per token -- TODO confirm
                matches = sum(1 for gold, guess in zip(ideal_label, infer_label)
                              if gold == guess)
                precision_sum += float(matches) / len(ideal_label)
                sampled += 1
                self.model.update(observe_data, ideal_label, infer_label)
            # _ratio() covers a pass in which no sentence was sampled.
            sys.stdout.write('complete %d iteration in %d secs with precision %f\n'
                             % (it + 1, viterbi_time, self._ratio(precision_sum, sampled)))
            self._save_model(self.model_file + '.delta' + str(it + 1))
        self._save_model(self.model_file)