示例#1
0
    def __init__(self, epochs=10, eta=.0001):
        """Create the decoder and store the training hyper-parameters.

        Args:
            epochs: Stored on the instance as ``n_epochs``.
            eta: Stored on the instance as ``eta``.
        """
        # Hyper-parameters are kept verbatim on the instance.
        self.n_epochs, self.eta = epochs, eta

        # A fresh decoder plus an (initially empty) list of per-epoch
        # parameter snapshots.
        self.decoder = ViterbiDecompounder()
        self.parameters_for_epoch = []

        # The feature count is read from the decoder class itself.
        self.n_features = ViterbiDecompounder.n_features
    def __init__(self, epochs=10, eta=.0001):
        """Initialise the perceptron state.

        Args:
            epochs: Value stored as ``self.n_epochs``.
            eta: Value stored as ``self.eta``.
        """
        # Keep the hyper-parameters exactly as they were passed in.
        self.n_epochs, self.eta = epochs, eta

        # New decoder instance and an empty container for the parameters
        # collected per epoch.
        self.decoder = ViterbiDecompounder()
        self.parameters_for_epoch = []

        # Number of features as declared on the decoder class.
        self.n_features = ViterbiDecompounder.n_features
示例#3
0
class StructuredPerceptron:
    """Perceptron-style trainer for the weights of a ``ViterbiDecompounder``.

    After every epoch a copy of the decoder weights is stored; once training
    stops, the decoder weights are replaced by the mean of those copies.
    """

    def __init__(self, epochs=10, eta=.0001):
        """Create the decoder and store the training hyper-parameters.

        Args:
            epochs: Maximum number of passes over the training data.
            eta: Step size used to scale every weight update.
        """
        self.decoder = ViterbiDecompounder()
        self.parameters_for_epoch = []

        self.n_epochs = epochs
        self.eta = eta

        self.n_features = ViterbiDecompounder.n_features

        # Flipped to True at the end of train(); defined here so the
        # attribute exists even before training.
        self.trained = False

    @staticmethod
    def _precision_recall_f1(tp, fp, fn):
        """Return ``(precision, recall, f1)`` for the given counts.

        Any ratio whose denominator is zero is reported as 0.0 instead of
        raising ``ZeroDivisionError``.
        """
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        if precision + recall:
            f1 = 2 * ((precision * recall) / (precision + recall))
        else:
            f1 = 0.0
        return precision, recall, f1

    def train(self, data, heldout, verbose=0, run_label=None):
        """Train the decoder weights on ``data``.

        Args:
            data: Iterable of compounds passed one by one to ``train_one``.
                It is iterated once per epoch.
            heldout: Compounds passed to ``test`` after each epoch when
                ``verbose`` is 1 or 2.
            verbose: 0 for plain training, 1 to also evaluate on ``heldout``
                and print the results, 2 to also evaluate on ``heldout`` and
                save an accuracy plot.
            run_label: Used in the file name of the plot (``verbose == 2``).
        """
        # Start from uniform weights that sum to one.
        self.decoder.w = np.ones(self.n_features,
                                 dtype=float) / self.n_features
        print("Start weights: %s" % self.decoder.w, file=sys.stderr)

        # Both histories start with a 0.0 sentinel so that the first epoch
        # has something to be compared against.
        training_accuracy = [0.0]
        heldout_accuracy = [0.0]

        for i_epoch in range(self.n_epochs):

            tp, fp, fn = 0, 0, 0

            for compound in data:
                tp, fp, fn = self.train_one(compound, tp, fp, fn)

            self.parameters_for_epoch.append(self.decoder.w.copy())

            _, _, f1 = self._precision_recall_f1(tp, fp, fn)
            training_accuracy.append(f1)

            # The plot in verbose mode 2 needs one held-out value per epoch
            # as well, otherwise x and y have different lengths.
            if verbose in (1, 2):
                heldout_accuracy.append(self.test(heldout))

            print("Training", training_accuracy)
            # Stop as soon as the training F1 no longer improves.
            if training_accuracy[-1] <= training_accuracy[-2]:
                break

            print("Weights: %s" % self.decoder.w, file=sys.stderr)
            print("Epoch %i, F1: %f" % (i_epoch, f1), file=sys.stderr)

        # Average the per-epoch snapshots. With zero epochs there is nothing
        # to average, so the initial weights are kept.
        if self.parameters_for_epoch:
            self.decoder.w = np.mean(self.parameters_for_epoch, axis=0)

        # Finished training
        self.trained = True

        if verbose == 1:
            print("Heldout accs:", str(heldout_accuracy))
            print(self.decoder.w)

        # Export training info in verbose mode:
        if verbose == 2:
            x = np.arange(0, len(training_accuracy), 1.0)
            plt.plot(x,
                     training_accuracy,
                     marker='o',
                     linestyle='--',
                     color='r',
                     label='Training')
            plt.plot(x,
                     heldout_accuracy,
                     marker='o',
                     linestyle='--',
                     color='b',
                     label='Heldout')

            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.title('Training and Heldout Accuracy')

            plt.ylim([0.9, 1.0])

            plt.legend(bbox_to_anchor=(1., 0.2))

            plt.savefig('eval/%s_training.png' % run_label)

            plt.close()

    def train_one(self, compound, tp, fp, fn):
        """Decode one compound, update the weights and the running counts.

        Args:
            compound: Object providing ``get_gold_splits()``, ``string`` and
                ``predicted_lattice``.
            tp, fp, fn: Running true-positive / false-positive /
                false-negative counts.

        Returns:
            The updated ``(tp, fp, fn)`` tuple.
        """
        # Returns a list of tuples with (start, stop) position
        predicted_splits = self.decoder.viterbi_decode(compound)

        gold_splits = compound.get_gold_splits()
        gold_splits_set = set(gold_splits)
        predicted_splits_set = set(predicted_splits)
        word_length = len(compound.string)

        for split in gold_splits_set.union(predicted_splits_set):
            in_predicted = split in predicted_splits_set
            in_gold = split in gold_splits_set

            # Correct split: counted, no weight update needed.
            if in_predicted and in_gold:
                tp += 1

            # Ignore the final artificial path
            if split[1] == word_length and split[0] != 0:
                continue

            if in_predicted and not in_gold:
                # This is a bad split: scale the weights down along its
                # features.
                prev_split = get_prev_split(predicted_splits, split)

                predicted_split_features = self.decoder.fs(
                    compound, prev_split, split, compound.predicted_lattice)
                print("Pred fs:", predicted_split_features, file=sys.stderr)
                self.decoder.w -= self.eta * (self.decoder.w *
                                              predicted_split_features)

                fp += 1

            elif in_gold and not in_predicted:
                # This split should have been there: scale the weights up
                # along its features.
                prev_split = get_prev_split(gold_splits, split)

                gold_split_features = self.decoder.fs(
                    compound, prev_split, split, compound.predicted_lattice)
                print("Gold fs:", gold_split_features, file=sys.stderr)
                print("w:", self.decoder.w, file=sys.stderr)
                self.decoder.w += self.eta * (self.decoder.w *
                                              gold_split_features)

                fn += 1

        return tp, fp, fn

    def test(self, compounds):
        """Evaluate the decoder on ``compounds`` and print the metrics.

        Args:
            compounds: Iterable of objects with a ``gold_splits`` attribute.
                NOTE(review): ``train_one`` reads the gold splits through
                ``get_gold_splits()`` instead - confirm both are equivalent.

        Returns:
            The F1 score (0.0 when it is undefined).
        """
        tp, fp, fn = 0, 0, 0

        for compound in compounds:
            z = self.decoder.viterbi_decode(compound)
            predicted = set(z)  # constant-time membership for the fn count

            gold_splits = set(compound.gold_splits)
            for split in z:
                if split in gold_splits:
                    tp += 1
                else:
                    fp += 1

            for gold_split in gold_splits:
                if gold_split not in predicted:
                    fn += 1

        precision, recall, f1 = self._precision_recall_f1(tp, fp, fn)

        print("Test Precision: %f" % precision)
        print("Test Recall: %f" % recall)
        print("Test F1: %f" % f1)

        return f1
    print >> sys.stderr, "Loading gensim model..."
    model = gensim.models.Word2Vec.load_word2vec_format(args.model_folder + '/w2v.bin', binary=True)

    print >> sys.stderr, "Done."

    if args.mode == "lattices":
        for line in sys.stdin:
            print(
                get_decompound_lattice(
                    line.decode('utf8').rstrip('\n').title(),
                    args.nAccuracy,
                    args.similarityThreshold
                )
            )

    elif args.mode in ["1-best", "dict_w2v"]:
        vit = ViterbiDecompounder()
        vit.load_weights(args.weightsFile)

        if args.mode == "1-best":
            words = map(lambda line: line.decode('utf8').strip(), fileinput.input())
        else:
            words = list(model.vocab.keys())

        for word in words:
            lattice = Lattice(get_decompound_lattice(word, args.nAccuracy, args.similarityThreshold))
            viterbi_path = vit.viterbi_decode(Compound(word, None, lattice))
            print " ".join(map(lambda p: "%d,%d" % p, viterbi_path)

示例#5
0
        modelSetup,
        nAccuracy=args.nAccuracy,
        globalNN=args.globalNN,
        similarityThreshold=args.similarityThreshold,
        prototype_file=args.prototypeFile)

    # Dispatch on the requested mode.
    if args.mode == "lattices":
        # One word per stdin line: decode it, strip the newline, title-case
        # it and print the lattice returned by the base decompounder.
        for line in sys.stdin:
            print(
                base_decompounder.get_decompound_lattice(
                    line.decode('utf8').rstrip('\n').title(), ))
    elif args.mode == "w2v_dict":
        # Dump every word of the model vocabulary, UTF-8 encoded.
        for word in base_decompounder.model.vocab.keys():
            print word.encode('utf-8')
    elif args.mode in ["1-best", "dict_w2v"]:
        # Both modes decode with a Viterbi decompounder whose weights come
        # from the model setup; they differ only in where the words come from.
        vit = ViterbiDecompounder()
        vit.load_weights(modelSetup["WEIGHTS"])

        words = []
        if args.mode == "1-best":
            # Words are read from stdin, one per line.
            words = map(lambda line: line.decode('utf8').strip(), sys.stdin)
        else:
            # Words are the model vocabulary.
            words = base_decompounder.model.vocab.keys()

        # len() needs a sized container here, i.e. the list that Python 2's
        # map()/keys() return.
        print >> sys.stderr, "# words: %d" % len(words)

        def process_word(word):
            lattice = Lattice(base_decompounder.get_decompound_lattice(word))
            viterbi_path = vit.viterbi_decode(Compound(word, None, lattice))
            return [
                word.encode('utf-8'),
            nAccuracy=args.nAccuracy, globalNN=args.globalNN,
            similarityThreshold=args.similarityThreshold,
            prototype_file=args.prototypeFile)

    # Dispatch on the requested mode.
    if args.mode == "lattices":
        # One word per stdin line: decode it, strip the newline, title-case
        # it and print the lattice returned by the base decompounder.
        for line in sys.stdin:
            print(
                base_decompounder.get_decompound_lattice(
                    line.decode('utf8').rstrip('\n').title(),
                )
            )
    elif args.mode == "w2v_dict":
        # Dump every word of the model vocabulary, UTF-8 encoded.
        for word in base_decompounder.model.vocab.keys():
            print word.encode('utf-8')
    elif args.mode in ["1-best", "dict_w2v"]:
        # Both modes decode with a Viterbi decompounder whose weights come
        # from the model setup; they differ only in where the words come from.
        vit = ViterbiDecompounder()
        vit.load_weights(modelSetup["WEIGHTS"])

        words = []
        if args.mode == "1-best":
            # Words are read from stdin, one per line.
            words = map(lambda line: line.decode('utf8').strip(),
                    sys.stdin)
        else:
            # Words are the model vocabulary.
            words = base_decompounder.model.vocab.keys()

        # len() needs a sized container here, i.e. the list that Python 2's
        # map()/keys() return.
        print >>sys.stderr, "# words: %d" % len(words)

        def process_word(word):
            """Decode one word; return [word, printed path], both UTF-8 encoded."""
            # Build the lattice for the word, then take the Viterbi path
            # through it (the Compound is created without gold data: None).
            lattice = Lattice(base_decompounder.get_decompound_lattice(word))
            viterbi_path = vit.viterbi_decode(Compound(word, None, lattice))
            return [word.encode('utf-8'), print_path(viterbi_path).encode('utf-8')]
class StructuredPerceptron:
    """Perceptron-style trainer for the weights of a ``ViterbiDecompounder``.

    After every epoch a copy of the decoder weights is stored; once training
    stops, the decoder weights are replaced by the mean of those copies.

    All output uses constructs that behave the same on Python 2 and 3.
    """

    def __init__(self, epochs=10, eta=.0001):
        """Create the decoder and store the training hyper-parameters.

        Args:
            epochs: Maximum number of passes over the training data.
            eta: Step size used to scale every weight update.
        """
        self.decoder = ViterbiDecompounder()
        self.parameters_for_epoch = []

        self.n_epochs = epochs
        self.eta = eta

        self.n_features = ViterbiDecompounder.n_features

        # Flipped to True at the end of train(); defined here so the
        # attribute exists even before training.
        self.trained = False

    @staticmethod
    def _precision_recall_f1(tp, fp, fn):
        """Return ``(precision, recall, f1)`` for the given counts.

        ``float()`` forces true division (plain ``/`` on two ints truncates
        under Python 2). Any ratio whose denominator is zero is reported as
        0.0 instead of raising ``ZeroDivisionError``.
        """
        precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
        recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
        if precision + recall:
            f1 = 2 * ((precision * recall) / (precision + recall))
        else:
            f1 = 0.0
        return precision, recall, f1

    def train(self, data, heldout, verbose=0, run_label=None):
        """Train the decoder weights on ``data``.

        Args:
            data: Iterable of compounds passed one by one to ``train_one``.
                It is iterated once per epoch.
            heldout: Compounds passed to ``test`` after each epoch when
                ``verbose`` is 1 or 2.
            verbose: 0 for plain training, 1 to also evaluate on ``heldout``
                and print the results, 2 to also evaluate on ``heldout`` and
                save an accuracy plot.
            run_label: Used in the file name of the plot (``verbose == 2``).
        """
        # Start from uniform weights that sum to one.
        self.decoder.w = np.ones(self.n_features, dtype=float) / self.n_features
        sys.stderr.write("Start weights: %s\n" % self.decoder.w)

        # Both histories start with a 0.0 sentinel so that the first epoch
        # has something to be compared against.
        training_accuracy = [0.0]
        heldout_accuracy = [0.0]

        for i_epoch in range(self.n_epochs):

            tp, fp, fn = 0, 0, 0

            for compound in data:
                tp, fp, fn = self.train_one(compound, tp, fp, fn)

            self.parameters_for_epoch.append(self.decoder.w.copy())

            _, _, f1 = self._precision_recall_f1(tp, fp, fn)
            training_accuracy.append(f1)

            # The plot in verbose mode 2 needs one held-out value per epoch
            # as well, otherwise x and y have different lengths.
            if verbose in (1, 2):
                heldout_accuracy.append(self.test(heldout))

            print("Training %s" % (training_accuracy,))
            # Stop as soon as the training F1 no longer improves.
            if training_accuracy[-1] <= training_accuracy[-2]:
                break

            sys.stderr.write("Weights: %s\n" % self.decoder.w)
            sys.stderr.write("Epoch %i, F1: %f\n" % (i_epoch, f1))

        # Average the per-epoch snapshots. With zero epochs there is nothing
        # to average, so the initial weights are kept.
        if self.parameters_for_epoch:
            self.decoder.w = np.mean(self.parameters_for_epoch, axis=0)

        # Finished training
        self.trained = True

        if verbose == 1:
            print("Heldout accs: %s" % str(heldout_accuracy))
            print(self.decoder.w)

        # Export training info in verbose mode:
        if verbose == 2:
            x = np.arange(0, len(training_accuracy), 1.0)
            plt.plot(x, training_accuracy, marker='o', linestyle='--', color='r', label='Training')
            plt.plot(x, heldout_accuracy, marker='o', linestyle='--', color='b', label='Heldout')

            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.title('Training and Heldout Accuracy')

            plt.ylim([0.9, 1.0])

            plt.legend(bbox_to_anchor=(1., 0.2))

            plt.savefig('eval/%s_training.png' % run_label)

            plt.close()

    def train_one(self, compound, tp, fp, fn):
        """Decode one compound, update the weights and the running counts.

        Args:
            compound: Object providing ``get_gold_splits()``, ``string`` and
                ``predicted_lattice``.
            tp, fp, fn: Running true-positive / false-positive /
                false-negative counts.

        Returns:
            The updated ``(tp, fp, fn)`` tuple.
        """
        # Returns a list of tuples with (start, stop) position
        predicted_splits = self.decoder.viterbi_decode(compound)

        gold_splits = compound.get_gold_splits()
        gold_splits_set = set(gold_splits)
        predicted_splits_set = set(predicted_splits)
        word_length = len(compound.string)

        for split in gold_splits_set.union(predicted_splits_set):
            in_predicted = split in predicted_splits_set
            in_gold = split in gold_splits_set

            # Correct split: counted, no weight update needed.
            if in_predicted and in_gold:
                tp += 1

            # Ignore the final artificial path
            if split[1] == word_length and split[0] != 0:
                continue

            if in_predicted and not in_gold:
                # This is a bad split: scale the weights down along its
                # features.
                prev_split = get_prev_split(predicted_splits, split)

                predicted_split_features = self.decoder.fs(compound, prev_split, split, compound.predicted_lattice)
                sys.stderr.write("Pred fs: %s\n" % (predicted_split_features,))
                self.decoder.w -= self.eta * (self.decoder.w * predicted_split_features)

                fp += 1

            elif in_gold and not in_predicted:
                # This split should have been there: scale the weights up
                # along its features.
                prev_split = get_prev_split(gold_splits, split)

                gold_split_features = self.decoder.fs(compound, prev_split, split, compound.predicted_lattice)
                sys.stderr.write("Gold fs: %s\n" % (gold_split_features,))
                sys.stderr.write("w: %s\n" % (self.decoder.w,))
                self.decoder.w += self.eta * (self.decoder.w * gold_split_features)

                fn += 1

        return tp, fp, fn

    def test(self, compounds):
        """Evaluate the decoder on ``compounds`` and print the metrics.

        Args:
            compounds: Iterable of objects with a ``gold_splits`` attribute.
                NOTE(review): ``train_one`` reads the gold splits through
                ``get_gold_splits()`` instead - confirm both are equivalent.

        Returns:
            The F1 score (0.0 when it is undefined).
        """
        tp, fp, fn = 0, 0, 0

        for compound in compounds:
            z = self.decoder.viterbi_decode(compound)
            predicted = set(z)  # constant-time membership for the fn count

            gold_splits = set(compound.gold_splits)
            for split in z:
                if split in gold_splits:
                    tp += 1
                else:
                    fp += 1

            for gold_split in gold_splits:
                if gold_split not in predicted:
                    fn += 1

        precision, recall, f1 = self._precision_recall_f1(tp, fp, fn)

        print("Test Precision: %f" % precision)
        print("Test Recall: %f" % recall)
        print("Test F1: %f" % f1)

        return f1