def __init__(self, epochs=10, eta=.0001):
    """Set up the perceptron trainer.

    epochs -- maximum number of passes over the training data
    eta    -- learning rate applied in the weight updates
    """
    self.n_epochs = epochs
    self.eta = eta
    self.decoder = ViterbiDecompounder()
    self.n_features = ViterbiDecompounder.n_features
    # One weight-vector snapshot per epoch, kept for later averaging.
    self.parameters_for_epoch = []
class StructuredPerceptron:
    """Averaged structured perceptron trainer for compound splitting.

    Each epoch decodes every training compound with a ViterbiDecompounder,
    compares the predicted splits against the gold splits and applies
    (multiplicative) perceptron updates to the decoder's weight vector.
    After training stops, the per-epoch weight vectors are averaged.
    """

    def __init__(self, epochs=10, eta=.0001):
        # epochs: maximum number of passes over the training data.
        # eta:    learning rate for the weight updates.
        self.decoder = ViterbiDecompounder()
        self.parameters_for_epoch = []  # weight snapshot after each epoch
        self.n_epochs = epochs
        self.eta = eta
        self.n_features = ViterbiDecompounder.n_features

    @staticmethod
    def _prf(tp, fp, fn):
        """Return (precision, recall, f1), zero-safe on empty denominators.

        Robustness fix: the original raised ZeroDivisionError when a count
        pair (or precision + recall) was zero; we report 0.0 instead.
        """
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        return precision, recall, f1

    def train(self, data, heldout, verbose=0, run_label=None):
        """Train the decoder weights on *data*.

        Stops early when training F1 no longer improves.  With verbose=1,
        also evaluates on *heldout* each epoch; with verbose=2, exports an
        accuracy curve to eval/<run_label>_training.png.
        """
        # Uniform start weights summing to 1.
        self.decoder.w = np.ones(self.n_features, dtype=float) / self.n_features
        print("Start weights: %s" % self.decoder.w, file=sys.stderr)

        training_accuracy = [0.0]
        heldout_accuracy = [0.0]
        for i_epoch in range(self.n_epochs):
            tp, fp, fn = 0, 0, 0
            for compound in data:
                tp, fp, fn = self.train_one(compound, tp, fp, fn)
            self.parameters_for_epoch.append(self.decoder.w.copy())

            precision, recall, f1 = self._prf(tp, fp, fn)
            training_accuracy.append(f1)

            if verbose == 1:
                heldout_accuracy.append(self.test(heldout))
                print("Training", training_accuracy)

            # Stop if the error on the training data does not decrease.
            if training_accuracy[-1] <= training_accuracy[-2]:
                break

            print("Weights: %s" % self.decoder.w, file=sys.stderr)
            print("Epoch %i, F1: %f" % (i_epoch, f1), file=sys.stderr)

        # Average the per-epoch weight vectors.
        averaged_parameters = 0
        for epoch_parameters in self.parameters_for_epoch:
            averaged_parameters += epoch_parameters
        averaged_parameters /= len(self.parameters_for_epoch)
        self.decoder.w = averaged_parameters

        # Finished training.
        self.trained = True

        if verbose == 1:
            print("Heldout accs:", str(heldout_accuracy))
            print(self.decoder.w)

        # Export training info in verbose mode.
        if verbose == 2:
            x = np.arange(0, len(training_accuracy), 1.0)
            plt.plot(x, training_accuracy, marker='o', linestyle='--',
                     color='r', label='Training')
            plt.plot(x, heldout_accuracy, marker='o', linestyle='--',
                     color='b', label='Heldout')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.title('Training and Heldout Accuracy')
            plt.ylim([0.9, 1.0])
            plt.legend(bbox_to_anchor=(1., 0.2))
            plt.savefig('eval/%s_training.png' % run_label)
            plt.close()

    def train_one(self, compound, tp, fp, fn):
        """Perceptron update for one compound; returns updated (tp, fp, fn)."""
        # Both are lists of (start, stop) position tuples.
        predicted_splits = self.decoder.viterbi_decode(compound)
        gold_splits = compound.get_gold_splits()

        gold_splits_set = set(gold_splits)
        predicted_splits_set = set(predicted_splits)

        for split in gold_splits_set.union(predicted_splits_set):
            if split in predicted_splits_set and split in gold_splits_set:
                tp += 1  # correct split: no weight update needed
            if split[1] == len(compound.string) and split[0] != 0:
                # Ignore the final artificial path.
                continue
            if split in predicted_splits_set and split not in gold_splits_set:
                # Bad split: push the weights away from its features.
                # NOTE(review): update is multiplicative (w * features), not
                # the standard additive perceptron update — kept as-is.
                prev_split = get_prev_split(predicted_splits, split)
                predicted_split_features = self.decoder.fs(
                    compound, prev_split, split, compound.predicted_lattice)
                print("Pred fs:", predicted_split_features, file=sys.stderr)
                self.decoder.w -= self.eta * (self.decoder.w *
                                              predicted_split_features)
                fp += 1
            if split not in predicted_splits_set and split in gold_splits_set:
                # Missed gold split: push the weights towards its features.
                prev_split = get_prev_split(gold_splits, split)
                gold_split_features = self.decoder.fs(
                    compound, prev_split, split, compound.predicted_lattice)
                print("Gold fs:", gold_split_features, file=sys.stderr)
                print("w:", self.decoder.w, file=sys.stderr)
                self.decoder.w += self.eta * (self.decoder.w *
                                              gold_split_features)
                fn += 1
        return tp, fp, fn

    def test(self, compounds):
        """Decode *compounds*, print precision/recall/F1 and return F1."""
        tp, fp, fn = 0, 0, 0
        for compound in compounds:
            z = self.decoder.viterbi_decode(compound)
            gold_splits = set(compound.gold_splits)
            for split in z:
                if split in gold_splits:
                    tp += 1
                else:
                    fp += 1
            for gold_split in gold_splits:
                if gold_split not in z:
                    fn += 1
        precision, recall, f1 = self._prf(tp, fp, fn)
        # BUG FIX: the precision and recall labels were swapped in the
        # original output (precision was printed as "Recall" and vice versa).
        print("Test Precision: %f" % precision)
        print("Test Recall: %f" % recall)
        print("Test F1: %f" % f1)
        return f1
print >> sys.stderr, "Loading gensim model..." model = gensim.models.Word2Vec.load_word2vec_format(args.model_folder + '/w2v.bin', binary=True) print >> sys.stderr, "Done." if args.mode == "lattices": for line in sys.stdin: print( get_decompound_lattice( line.decode('utf8').rstrip('\n').title(), args.nAccuracy, args.similarityThreshold ) ) elif args.mode in ["1-best", "dict_w2v"]: vit = ViterbiDecompounder() vit.load_weights(args.weightsFile) if args.mode == "1-best": words = map(lambda line: line.decode('utf8').strip(), fileinput.input()) else: words = list(model.vocab.keys()) for word in words: lattice = Lattice(get_decompound_lattice(word, args.nAccuracy, args.similarityThreshold)) viterbi_path = vit.viterbi_decode(Compound(word, None, lattice)) print " ".join(map(lambda p: "%d,%d" % p, viterbi_path)
# NOTE(review): incomplete fragment — begins mid-call (trailing keyword args of
# a constructor started in an unseen chunk) and ends mid-list inside
# process_word's return; presumably superseded by the fuller variant that
# follows. TODO confirm against the original, un-collapsed file before editing.
modelSetup, nAccuracy=args.nAccuracy, globalNN=args.globalNN, similarityThreshold=args.similarityThreshold, prototype_file=args.prototypeFile) if args.mode == "lattices": for line in sys.stdin: print( base_decompounder.get_decompound_lattice( line.decode('utf8').rstrip('\n').title(), )) elif args.mode == "w2v_dict": for word in base_decompounder.model.vocab.keys(): print word.encode('utf-8') elif args.mode in ["1-best", "dict_w2v"]: vit = ViterbiDecompounder() vit.load_weights(modelSetup["WEIGHTS"]) words = [] if args.mode == "1-best": words = map(lambda line: line.decode('utf8').strip(), sys.stdin) else: words = base_decompounder.model.vocab.keys() print >> sys.stderr, "# words: %d" % len(words) def process_word(word): lattice = Lattice(base_decompounder.get_decompound_lattice(word)) viterbi_path = vit.viterbi_decode(Compound(word, None, lattice)) return [ word.encode('utf-8'),
# NOTE(review): incomplete fragment — begins mid-call (trailing keyword args of
# a constructor started in an unseen chunk); process_word is complete here but
# the surrounding call context is not visible. TODO confirm against the
# original, un-collapsed file before editing.
nAccuracy=args.nAccuracy, globalNN=args.globalNN, similarityThreshold=args.similarityThreshold, prototype_file=args.prototypeFile) if args.mode == "lattices": for line in sys.stdin: print( base_decompounder.get_decompound_lattice( line.decode('utf8').rstrip('\n').title(), ) ) elif args.mode == "w2v_dict": for word in base_decompounder.model.vocab.keys(): print word.encode('utf-8') elif args.mode in ["1-best", "dict_w2v"]: vit = ViterbiDecompounder() vit.load_weights(modelSetup["WEIGHTS"]) words = [] if args.mode == "1-best": words = map(lambda line: line.decode('utf8').strip(), sys.stdin) else: words = base_decompounder.model.vocab.keys() print >>sys.stderr, "# words: %d" % len(words) def process_word(word): lattice = Lattice(base_decompounder.get_decompound_lattice(word)) viterbi_path = vit.viterbi_decode(Compound(word, None, lattice)) return [word.encode('utf-8'), print_path(viterbi_path).encode('utf-8')]
class StructuredPerceptron: def __init__(self, epochs=10, eta=.0001): self.decoder = ViterbiDecompounder() self.parameters_for_epoch = [] self.n_epochs = epochs self.eta = eta self.n_features = ViterbiDecompounder.n_features def train(self, data, heldout, verbose=0, run_label=None): self.decoder.w = np.ones(self.n_features, dtype=float) / self.n_features print >> sys.stderr, "Start weights: %s" % self.decoder.w training_accuracy = [0.0] heldout_accuracy = [0.0] for i_epoch in xrange(self.n_epochs): tp, fp, fn = 0, 0, 0 for compound in data: tp, fp, fn = self.train_one(compound, tp, fp, fn) self.parameters_for_epoch.append(self.decoder.w.copy()) precision = tp / (tp + fp) recall = tp / (tp + fn) f1 = 2 * ((precision * recall) / (precision + recall)) training_accuracy.append(f1) if verbose == 1: acc = self.test(heldout) heldout_accuracy.append(acc) print "Training", training_accuracy # Stop if the error on the training data does not decrease if training_accuracy[-1] <= training_accuracy[-2]: break print >> sys.stderr, "Weights: %s" % self.decoder.w print >> sys.stderr, "Epoch %i, F1: %f" % (i_epoch, f1) # Average! 
averaged_parameters = 0 for epoch_parameters in self.parameters_for_epoch: averaged_parameters += epoch_parameters averaged_parameters /= len(self.parameters_for_epoch) self.decoder.w = averaged_parameters # Finished training self.trained = True if verbose == 1: print "Heldout accs:", str(heldout_accuracy) print self.decoder.w # Export training info in verbose mode: if verbose == 2: x = np.arange(0, len(training_accuracy), 1.0) plt.plot(x, training_accuracy, marker='o', linestyle='--', color='r', label='Training') plt.plot(x, heldout_accuracy, marker='o', linestyle='--', color='b', label='Heldout') plt.xlabel('Epoch') plt.ylabel('Accuracy') plt.title('Training and Heldout Accuracy') plt.ylim([0.9, 1.0]) plt.legend(bbox_to_anchor=(1., 0.2)) plt.savefig('eval/%s_training.png' % run_label) plt.close() def train_one(self, compound, tp, fp, fn): # Returns a list of tuples with (start, stop) position predicted_splits = self.decoder.viterbi_decode(compound) gold_splits = compound.get_gold_splits() gold_splits_set = set(gold_splits) predicted_splits_set = set(predicted_splits) for split in gold_splits_set.union(predicted_splits_set): if split in predicted_splits_set and split in gold_splits_set: # Do nothing tp += 1 if split[1] == len(compound.string) and split[0] != 0: # Ignore the final artificial path continue if split in predicted_splits_set and split not in gold_splits_set: # This is a bad split! prev_split = get_prev_split(predicted_splits, split) predicted_split_features = self.decoder.fs(compound, prev_split, split, compound.predicted_lattice) print >> sys.stderr, "Pred fs:", predicted_split_features self.decoder.w -= self.eta * (self.decoder.w * predicted_split_features) fp += 1 if split not in predicted_splits_set and split in gold_splits_set: # This split should have been there! 
prev_split = get_prev_split(gold_splits, split) gold_split_features = self.decoder.fs(compound, prev_split, split, compound.predicted_lattice) print >> sys.stderr, "Gold fs:", gold_split_features print >> sys.stderr, "w:", self.decoder.w self.decoder.w += self.eta * (self.decoder.w * gold_split_features) fn += 1 return tp, fp, fn def test(self, compounds): tp, fp, fn = 0, 0, 0 for compound in compounds: z = self.decoder.viterbi_decode(compound) gold_splits = set(compound.gold_splits) for split in z: if split in gold_splits: tp += 1 else: fp += 1 for gold_split in gold_splits: if gold_split not in z: fn += 1 precision = tp / (tp + fp) recall = tp / (tp + fn) f1 = 2 * ((precision * recall) / (precision + recall)) print "Test Precision: %f" % recall print "Test Recall: %f" % precision print "Test F1: %f" % f1 return f1