Пример #1
0
def unavg_perc_train(trainfile, devfile, dictionary, model):
    """Train an unaveraged perceptron, updating ``model`` in place.

    Makes five passes over ``trainfile``. Whenever the decoded sequence
    differs from the gold one, every feature present only in the gold
    feature collection is incremented and every feature present only in
    the predicted one is decremented. After each pass the train/dev error
    returned by ``test`` is printed; the update counter is cumulative
    across passes. Total wall-clock time is printed at the end.

    Args:
        trainfile: Training data source handed to ``readfile`` and ``test``.
        devfile: Held-out data source handed to ``test``.
        dictionary: Passed through unchanged to ``decode`` and ``test``.
        model: Mutable feature -> weight mapping; modified in place.
    """
    banner = "-------------------------------------------------------"
    num_epochs = 5
    step = 1
    mistakes = 0

    print(banner)
    print("UNAVERAGED PERCEPTRON - BIGRAM")

    t0 = time.time()

    for epoch_no in range(1, num_epochs + 1):
        for words, gold in readfile(trainfile):
            guess = decode(words, dictionary, model)
            if gold == guess:
                # Correct prediction: the perceptron leaves weights alone.
                continue

            mistakes += 1
            gold_feats = phi(words, gold)
            guess_feats = phi(words, guess)

            # Reward features that only the gold analysis has ...
            for feat in gold_feats:
                if feat not in guess_feats:
                    model[feat] += step

            # ... and penalize those that only the wrong guess has.
            for feat in guess_feats:
                if feat not in gold_feats:
                    model[feat] -= step

        train_err = test(trainfile, dictionary, model)
        dev_err = test(devfile, dictionary, model)
        print("epoch:", epoch_no, "updates:", mistakes,
              "train_err: {0:.2%}".format(train_err),
              "dev_err: {0:.2%}".format(dev_err))

    elap_time = time.time() - t0
    print("elapsed time:", elap_time)
    print(banner)
Пример #2
0
def train(trainfile, devfile, dictionary, epochs=1, isAVG = 1):
    """Train a structured perceptron tagger and return its weight dict.

    Features are (tag, previous word, next word) triples and
    ((tag[i-2], tag[i-1]), tag[i]) tag trigrams. Each sentence is padded
    with two ``startsym`` / ``stopsym`` symbols on either side before the
    feature delta between the gold and the decoded tag sequence is built.

    When ``isAVG`` is truthy, ``wa_model`` accumulates ``c * delta`` (with
    ``c`` counting examples seen) and the averaged weights are obtained as
    ``dictadd(w_model, wa_model, -1/c)``.

    Per-epoch statistics are printed: number of updates, number of distinct
    features touched, token-level training error measured during the pass,
    dev error of the raw weights and (averaged mode) of the averaged weights.

    Args:
        trainfile: Training data source handed to ``tagger.readfile``.
        devfile: Held-out data source handed to ``tagger.test``.
        dictionary: Passed through unchanged to ``tagger.decode``/``test``.
        epochs: Number of passes over the training data.
        isAVG: Truthy to train and return averaged weights.

    Returns:
        The raw weights if ``isAVG`` is falsy, otherwise the averaged ones.
    """
    w_model = defaultdict(float)
    wa_model = defaultdict(float)
    features = set()

    train_set = list(tagger.readfile(trainfile))
    c = 1.

    for epoch in range(epochs):
        update = 0.
        error = 0.
        total_words = 0.
        for wordseq, tagseq in train_set:
            total_words += len(wordseq)
            zseq = tagger.decode(wordseq, dictionary, w_model)
            if isAVG:
                c += 1.
            if zseq != tagseq:
                delta = defaultdict(float)
                # Padding builds new lists, so train_set is left untouched.
                wordseq = [startsym]*2 + wordseq + [stopsym]*2
                tagseq = [startsym]*2 + tagseq + [stopsym]*2
                zseq = [startsym]*2 + zseq + [stopsym]*2
                update += 1

                # list(...) keeps the slice valid on Python 3, where zip()
                # returns an iterator.
                aligned = list(zip(wordseq, tagseq, zseq))[2:]
                for i, (_word, t1, t2) in enumerate(aligned, 2):
                    if t1 != t2:
                        # Both sequences end in stopsym padding, so this
                        # branch never fires on the final position and
                        # wordseq[i+1] stays in range.
                        gold_feat = (t1, wordseq[i-1], wordseq[i+1])
                        pred_feat = (t2, wordseq[i-1], wordseq[i+1])
                        delta[gold_feat] += 1
                        delta[pred_feat] -= 1
                        # set.add takes exactly one argument: add the tuple.
                        features.add(gold_feat)
                        features.add(pred_feat)
                        error += 1
                    if t1 != t2 or tagseq[i-2] != zseq[i-2]:
                        delta[(tagseq[i-2],tagseq[i-1]), t1] += 1
                        delta[(zseq[i-2],zseq[i-1]), t2] -= 1
                        features.add(((tagseq[i-2],tagseq[i-1]), t1))
                        features.add(((zseq[i-2],zseq[i-1]), t2))
                w_model = dictadd(w_model, delta)
                if isAVG:
                    wa_model = dictadd(wa_model, delta, c)

        # Guard against an empty training set.
        train_err = error / total_words if total_words else 0.
        dev_err = tagger.test(devfile, dictionary, w_model)
        feature_size = len(features)
        if not isAVG:
            print("epoch %d, update %d,featrues %d, train_err %.2f%%, dev_err %.2f%%" % (
                epoch+1, update, feature_size, 100*train_err, 100*dev_err))
        else:
            dev_AVG_err = tagger.test(devfile, dictionary,
                                      dictadd(w_model, wa_model, -1/c))
            print("epoch %d, update %d,featrues %d, train_err %.2f%%, dev_err %.2f%%, dev_avg %.2f%%" % (
                epoch+1, update, feature_size, 100*train_err, 100*dev_err,
                100*dev_AVG_err))
    if not isAVG:
        return w_model
    else:
        return dictadd(w_model, wa_model, -1/c)
Пример #3
0
def output_dev(inputfile, dictionary, model):
    """Tag every sentence of ``inputfile`` and write it to ``dev.lower.unk.best``.

    Each sentence becomes one output line of ``word/tag`` tokens, every
    token (including the last) followed by a single space.

    Args:
        inputfile: Data source handed to ``tagger.readfile``; the gold tags
            it yields are ignored.
        dictionary: Passed through unchanged to ``tagger.decode3``.
        model: Weights passed through unchanged to ``tagger.decode3``.
    """
    # Read everything first so a read failure does not leave an empty file.
    dev_set = list(tagger.readfile(inputfile))
    # The context manager guarantees the file is flushed and closed; the
    # original referenced ``f.close`` without calling it.
    with open("dev.lower.unk.best", "w") as f:
        for wordseq, _tagseq in dev_set:
            best_tag = tagger.decode3(wordseq, dictionary, model)
            output = "".join(word + "/" + tag + " "
                             for word, tag in zip(wordseq, best_tag))
            f.write(output + "\n")
Пример #4
0
def avg_perc_train(trainfile, devfile, dictionary, model):
    """Train an averaged perceptron and report per-epoch error rates.

    ``model`` itself is never modified: training works on two deep copies.
    ``model_0`` holds the current weights, and ``model_a`` accumulates each
    update scaled by the example counter ``c``, so the averaged weights can
    be recovered as ``model_0[k] - model_a[k] / c`` without summing every
    intermediate weight vector.

    Makes five passes over ``trainfile``. After each pass the averaged
    weights are evaluated with ``test`` on the train and dev data and the
    cumulative number of updates is printed. Total wall-clock time is
    printed at the end.

    Args:
        trainfile: Training data source handed to ``readfile`` and ``test``.
        devfile: Held-out data source handed to ``test``.
        dictionary: Passed through unchanged to ``decode`` and ``test``.
        model: Initial feature -> weight mapping (deep-copied, not mutated).
    """
    print("-------------------------------------------------------")
    print("AVERAGED PERCEPTRON - BIGRAM")
    updates = 0
    c = 1
    epoch = 5
    lr = 1
    model_0 = copy.deepcopy(model)
    model_a = copy.deepcopy(model)

    start = time.time()

    for i in range(epoch):
        for x_i, y_i in readfile(trainfile):
            z_i = decode(x_i, dictionary, model_0)
            if y_i != z_i:
                updates += 1
                phi_xy = phi(x_i, y_i)
                phi_xz = phi(x_i, z_i)
                for key in phi_xy:
                    if key not in phi_xz:
                        model_0[key] += lr
                        model_a[key] += c * lr

                for key in phi_xz:
                    if key not in phi_xy:
                        model_0[key] -= lr
                        model_a[key] -= c * lr
            c += 1

        # Iterate the trained weights, not the untouched input ``model``:
        # features first seen during training exist only in ``model_0``.
        weights = {key: model_0[key] - model_a.get(key, 0) / c
                   for key in model_0}

        train_err = test(trainfile, dictionary, weights)
        dev_err = test(devfile, dictionary, weights)
        # Report the number of updates, not the example counter ``c``.
        print("epoch:", i + 1, "updates:", updates,
              "train_err: {0:.2%}".format(train_err),
              "dev_err: {0:.2%}".format(dev_err))

    end = time.time()
    elap_time = end - start
    print("elapsed time:", elap_time)
    print("-------------------------------------------------------")
Пример #5
0
def train(trainfile,
          devfile,
          dictionary,
          Average=False,
          MultiGrams=False,
          epochs=10):
    """Train a structured perceptron tagger, tracking the best dev epoch.

    For every mis-decoded training sentence the sparse difference between
    the gold and the predicted feature vectors is added to ``weight`` once.
    In averaged mode the same difference scaled by the example counter
    ``c`` is accumulated in ``avg_weight``, and the averaged weights
    ``weight[e] - avg_weight[e] / c`` are rebuilt in a separate dict at the
    end of each epoch so the accumulator stays intact.

    After each epoch the weights under evaluation (raw, or averaged when
    ``Average`` is true) are scored with ``tagger.test`` on the train and
    dev data, and a snapshot is kept whenever the dev error improves.

    Args:
        trainfile: Training data source handed to ``tagger.readfile``/``test``.
        devfile: Held-out data source handed to ``tagger.test``.
        dictionary: Passed through unchanged to ``tagger.decode``/``test``.
        Average: Use averaged weights for evaluation and for the result.
        MultiGrams: Enable the additional word/tag n-gram features.
        epochs: Number of passes over the training data.

    Returns:
        Tuple ``(train error per epoch, dev error per epoch, epoch numbers,
        best dev error, weights of the best epoch)``. If no epoch improves
        on the initial error of 1, the weights are an empty defaultdict.
    """
    weight = defaultdict(float)
    avg_weight = defaultdict(float)
    # Materialize once: a generator would be exhausted after the first epoch.
    trainset = list(tagger.readfile(trainfile))
    c = 0
    best_dev_err = 1
    # Defined up front so the summary and the return value are valid even
    # when no epoch beats the initial error.
    best_epoch = 0
    best_weight = defaultdict(float)
    error_rates = []
    error_rates_train = []
    location_epoch = []

    if not Average:
        print('Unaverage Structure Perceptron, MultiGrams = %s' % (MultiGrams))
    else:
        print('Average Structure Perceptron, MultiGrams = %s' % (MultiGrams))

    dev_label = "dev_avg_err" if Average else "dev_err"

    for epoch in range(1, epochs + 1):
        errors = 0

        for wordseq, gold_tagseq in trainset:
            c += 1
            cur_tagseq = tagger.decode(wordseq, dictionary, weight, MultiGrams)

            if cur_tagseq == gold_tagseq:
                continue

            errors += 1
            # Padding builds new lists, so trainset is left untouched.
            phi_total = _feature_delta([startsym] + wordseq + [stopsym],
                                       [startsym] + gold_tagseq + [stopsym],
                                       [startsym] + cur_tagseq + [stopsym],
                                       MultiGrams)

            # Apply the complete delta exactly once per sentence.
            for e, value in phi_total.items():
                weight[e] += value
                if Average:
                    avg_weight[e] += c * value

        if Average:
            # Build the averaged weights in a fresh dict; overwriting
            # avg_weight would destroy the running accumulator.
            eval_weight = defaultdict(float)
            for e in weight:
                eval_weight[e] = weight[e] - avg_weight[e] / c
        else:
            eval_weight = weight

        train_err = tagger.test(trainfile, dictionary, eval_weight, MultiGrams)
        dev_err = tagger.test(devfile, dictionary, eval_weight, MultiGrams)

        if best_dev_err > dev_err:
            best_dev_err = dev_err
            best_epoch = epoch
            # Snapshot the evaluated weights: ``weight`` keeps changing in
            # later epochs, while ``eval_weight`` in averaged mode is
            # already a fresh dict.
            best_weight = eval_weight if Average else weight.copy()

        error_rates.append(dev_err)
        error_rates_train.append(train_err)
        location_epoch.append(epoch)
        print(
            "epoch %d, updates %d, feature = %d, train_err = %.2f%%, %s = %.2f%%"
            % (epoch, errors, num_feature(weight), train_err * 100,
               dev_label, dev_err * 100))

    print("The best %s = %.2f%% at %d epoch" %
          (dev_label, best_dev_err * 100, best_epoch))

    return error_rates_train, error_rates, location_epoch, best_dev_err, best_weight


def _feature_delta(wordseq, gold_tagseq, cur_tagseq, MultiGrams):
    """Return phi(x, gold) - phi(x, predicted) for one padded sentence.

    All three sequences must already carry one boundary symbol on each
    side; position 0 (the start symbol) is only used as left context.
    """
    phi_total = defaultdict(float)

    for i, (word, tag_gold, tag_cur) in enumerate(
            zip(wordseq[1:], gold_tagseq[1:], cur_tagseq[1:]), 1):
        prev_word = wordseq[i - 1]
        prev_gold = gold_tagseq[i - 1]
        prev_cur = cur_tagseq[i - 1]

        if tag_gold != tag_cur:
            # Features that depend on the current tag only (plus context).
            phi_total[('tw', tag_gold, word)] += 1
            phi_total[('tw', tag_cur, word)] -= 1

            if MultiGrams:
                phi_total[('tt_1w', tag_gold, prev_gold, word)] += 1
                phi_total[('tt_1w', tag_cur, prev_cur, word)] -= 1

        if tag_gold != tag_cur or prev_gold != prev_cur:
            # Features that also involve the previous tag.
            phi_total[(tag_gold, prev_gold)] += 1
            phi_total[(tag_cur, prev_cur)] -= 1

            if MultiGrams:
                phi_total[(tag_gold, word, prev_word)] += 1
                phi_total[(tag_cur, word, prev_word)] -= 1

                phi_total[('tt_1w_1', tag_gold, prev_gold, prev_word)] += 1
                phi_total[('tt_1w_1', tag_cur, prev_cur, prev_word)] -= 1

                phi_total[(tag_gold, prev_gold, word, prev_word)] += 1
                phi_total[(tag_cur, prev_cur, word, prev_word)] -= 1

                if i >= 2:
                    phi_total[(tag_gold, gold_tagseq[i - 2], prev_gold)] += 1
                    phi_total[(tag_cur, cur_tagseq[i - 2], prev_cur)] -= 1
                if i >= 3:
                    phi_total[(tag_gold, gold_tagseq[i - 3],
                               gold_tagseq[i - 2], prev_gold)] += 1
                    phi_total[(tag_cur, cur_tagseq[i - 3],
                               cur_tagseq[i - 2], prev_cur)] -= 1

    return phi_total