예제 #1
0
def main():
    """Train feature weights with an averaged pairwise perceptron and print them.

    Steps:
      1. Read the reference sentences from ``opts.en`` (one per line).
      2. Obtain the n-best structure: ``read_ds_from_file(opts.nbestDS)`` is
         tried first; if it returns ``None`` the structure is rebuilt from
         the ``|||``-separated file ``opts.nbest`` and handed to
         ``write_ds_to_file``.
      3. For ``opts.epo`` epochs, each n-best list is passed to
         ``get_sample``; the returned pairs are ordered by decreasing
         smoothed-BLEU difference and at most ``opts.xi`` of them are used.
         Whenever ``theta`` does not score the first element of a pair
         strictly higher than the second, ``theta`` is updated with step
         ``opts.eta``.
      4. The running sum of ``theta`` over all visited pairs is divided by
         the number of visited pairs and printed to stdout, one weight per
         line.  Progress and the per-epoch mistake count go to stderr.

    Raises:
        ValueError: if an n-best line does not split into exactly three
            ``|||``-separated fields, or a field cannot be converted.
        IndexError: if the n-best structure is empty, or a sentence index
            has no matching reference or jumps ahead by more than one.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    # The context manager guarantees the reference file is closed.
    with open(opts.en) as reference_file:
        for i, line in enumerate(reference_file):
            references.append(line)
            if i % 100 == 0:
                sys.stderr.write(".")

    sys.stderr.write("\nTry reading nbests datastructure from disk ... \n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
        with open(opts.nbest) as nbest_file:
            for j, line in enumerate(nbest_file):
                (i, sentence, features) = line.strip().split("|||")
                i = int(i)
                stats = list(bleu.bleu_stats(sentence, references[i]))
                smoothed_bleu_score = bleu.smoothed_bleu(stats)
                # Turn the whitespace-separated feature string into floats.
                feature_list = [float(x) for x in features.split()]
                # Only one slot is added per line, so sentence indices are
                # expected to start at 0 and never skip a value.
                if len(nbests) <= i:
                    nbests.append([])
                nbests[i].append(
                    entry(sentence, smoothed_bleu_score, feature_list))

                if j % 5000 == 0:
                    sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # uniform start

    avg_theta = [0.0] * arg_num
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for _ in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first, so the opts.xi pairs that are
            # kept are the most clearly separated ones.
            sample.sort(
                key=lambda pair: pair[0].smoothed_bleu - pair[1].smoothed_bleu,
                reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    # presumably theta + eta * (v1 - v2); vector_plus is
                    # defined elsewhere -- confirm its scaling argument.
                    theta = vector_plus(
                        theta, vector_plus(v1, v2, -1), opts.eta)

                # theta is accumulated after every pair, updated or not.
                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake,))

    # With no visited pairs there is nothing to average; fall back to the
    # uniform weights.
    weights = [
        avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
        for avg in avg_theta
    ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # The averaged weights are what is printed.  A single parenthesised
    # argument behaves the same under the Python 2 print statement and the
    # Python 3 print function.
    print("\n".join([str(weight) for weight in weights]))
예제 #2
0
def main():
    """Train feature weights with an averaged pairwise perceptron and print them.

    Steps:
      1. Read the reference sentences from ``opts.en`` (one per line).
      2. Obtain the n-best structure: ``read_ds_from_file(opts.nbestDS)`` is
         tried first; if it returns ``None`` the structure is rebuilt from
         the ``|||``-separated file ``opts.nbest`` and handed to
         ``write_ds_to_file``.
      3. For ``opts.epo`` epochs, each n-best list is passed to
         ``get_sample``; the returned pairs are ordered by decreasing
         smoothed-BLEU difference and at most ``opts.xi`` of them are used.
         Whenever ``theta`` does not score the first element of a pair
         strictly higher than the second, ``theta`` is updated with step
         ``opts.eta``.
      4. The running sum of ``theta`` over all visited pairs is divided by
         the number of visited pairs and printed to stdout, one weight per
         line.  Progress and the per-epoch mistake count go to stderr.

    Raises:
        ValueError: if an n-best line does not split into exactly three
            ``|||``-separated fields, or a field cannot be converted.
        IndexError: if the n-best structure is empty, or a sentence index
            has no matching reference or jumps ahead by more than one.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    # The context manager guarantees the reference file is closed.
    with open(opts.en) as reference_file:
        for i, line in enumerate(reference_file):
            references.append(line)
            if i % 100 == 0:
                sys.stderr.write(".")

    sys.stderr.write("\nTry reading nbests datastructure from disk ... \n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
        with open(opts.nbest) as nbest_file:
            for j, line in enumerate(nbest_file):
                (i, sentence, features) = line.strip().split("|||")
                i = int(i)
                stats = list(bleu.bleu_stats(sentence, references[i]))
                smoothed_bleu_score = bleu.smoothed_bleu(stats)
                # Turn the whitespace-separated feature string into floats.
                feature_list = [float(x) for x in features.split()]
                # Only one slot is added per line, so sentence indices are
                # expected to start at 0 and never skip a value.
                if len(nbests) <= i:
                    nbests.append([])
                nbests[i].append(
                    entry(sentence, smoothed_bleu_score, feature_list))

                if j % 5000 == 0:
                    sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # uniform start

    avg_theta = [0.0] * arg_num
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for _ in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first, so the opts.xi pairs that are
            # kept are the most clearly separated ones.
            sample.sort(
                key=lambda pair: pair[0].smoothed_bleu - pair[1].smoothed_bleu,
                reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    # presumably theta + eta * (v1 - v2); vector_plus is
                    # defined elsewhere -- confirm its scaling argument.
                    theta = vector_plus(
                        theta, vector_plus(v1, v2, -1), opts.eta)

                # theta is accumulated after every pair, updated or not.
                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake, ))

    # With no visited pairs there is nothing to average; fall back to the
    # uniform weights.
    weights = [
        avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
        for avg in avg_theta
    ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # The averaged weights are what is printed.  A single parenthesised
    # argument behaves the same under the Python 2 print statement and the
    # Python 3 print function.
    print("\n".join([str(weight) for weight in weights]))
예제 #3
0
def main():
    """Train feature weights with a margin-based pairwise update and print them.

    Steps:
      1. Read the reference sentences from ``opts.en`` (one per line).
      2. Obtain the n-best structure: ``read_ds_from_file(opts.nbestDS)`` is
         tried first; if it returns ``None`` the structure is rebuilt from
         the ``|||``-separated file ``opts.nbest`` and handed to
         ``write_ds_to_file``.
      3. For ``opts.epo`` epochs and for every n-best list, all index pairs
         ``(j, l)`` with ``j < l`` are compared against two smoothed-BLEU
         thresholds (taken from the BLEU-sorted list at positions derived
         from ``opts.r`` and ``opts.k``) and the margin ``opts.tau``.  Each
         matching pair adjusts the per-candidate coefficients ``mu``; the
         ``mu``-weighted combination of the candidates' feature lists is
         then applied to ``theta`` with step ``opts.eta``.
      4. The final ``theta`` is printed to stdout, one weight per line.
         Progress and the per-epoch count of pairs that triggered no update
         go to stderr.

    Raises:
        ValueError: if an n-best line does not split into exactly three
            ``|||``-separated fields, or a field cannot be converted.
        IndexError: if the n-best structure or one of its lists is empty,
            if a sentence index has no matching reference or jumps ahead by
            more than one, or if a threshold position falls outside the
            sorted list.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    # The context manager guarantees the reference file is closed.
    with open(opts.en) as reference_file:
        for i, line in enumerate(reference_file):
            references.append(line)
            if i % 100 == 0:
                sys.stderr.write(".")

    sys.stderr.write("\nTry reading %s from disk ... \n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ... \n" % opts.nbestDS)
        with open(opts.nbest) as nbest_file:
            for j, line in enumerate(nbest_file):
                (i, sentence, features) = line.strip().split("|||")
                i = int(i)
                stats = list(bleu.bleu_stats(sentence, references[i]))
                smoothed_bleu_score = bleu.smoothed_bleu(stats)
                # Turn the whitespace-separated feature string into floats.
                feature_list = [float(x) for x in features.split()]
                # Only one slot is added per line, so sentence indices are
                # expected to start at 0 and never skip a value.
                if len(nbests) <= i:
                    nbests.append([])
                nbests[i].append(
                    entry(sentence, smoothed_bleu_score, feature_list))

                if j % 5000 == 0:
                    sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ... \n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finish writing %s\n" % opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # uniform start

    tau = opts.tau  # positive learning margin
    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0  # pairs that matched neither update rule in this epoch

        # theta is updated once per sentence (per n-best list).
        for nbest in nbests:
            y = sorted(nbest, key=lambda h: h.smoothed_bleu, reverse=True)
            size = len(nbest)
            mu = [0.0] * size
            # Model score of every candidate under the current theta.
            w_times_x = [dot_product(theta, best.feature_list)
                         for best in nbest]

            top_r = int(len(y) * opts.r)
            bottom_k = int(len(y) * opts.k)
            # The two BLEU thresholds do not change while the pairs are
            # scanned, so they are looked up once.  They are only needed
            # (and only indexable) when at least one pair exists.
            # NOTE(review): when bottom_k is 0, y[-bottom_k] is y[0], i.e.
            # the highest-BLEU candidate; when top_r equals len(y) the
            # lookup raises IndexError.  Confirm the intended ranges of
            # opts.r and opts.k.
            if size > 1:
                top_bleu = y[top_r].smoothed_bleu
                bottom_bleu = y[-bottom_k].smoothed_bleu

            for j in xrange(size - 1):
                bleu_j = nbest[j].smoothed_bleu
                score_j = w_times_x[j]
                for l in xrange(j + 1, size):
                    bleu_l = nbest[l].smoothed_bleu
                    score_l = w_times_x[l]
                    # NOTE(review): this branch raises mu[j] when j already
                    # outscores l by more than tau; confirm the direction of
                    # the comparison against the algorithm being implemented.
                    if (bleu_j <= top_bleu
                            and bleu_l >= bottom_bleu
                            and score_j > score_l + tau):
                        mu[j] = mu[j] + 1
                        mu[l] = mu[l] - 1
                    elif (bleu_j >= bottom_bleu
                            and bleu_l <= top_bleu
                            and score_j > score_l - tau):
                        mu[j] = mu[j] - 1
                        mu[l] = mu[l] + 1
                    else:
                        cnt += 1
                if (j + 1) % 100 == 0:
                    sys.stderr.write(".")

            # presumably sum_m mu[m] * features[m]; scale_product and
            # vector_plus are defined elsewhere -- confirm.
            vector_sum = [0] * len(nbest[0].feature_list)
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(
                    vector_sum, scale_product(mu[m], best.feature_list))

            theta = vector_plus(theta, vector_sum, opts.eta)

        sys.stderr.write("\n Non-supported vectors:  %s\n" % (cnt,))

    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # The final (non-averaged) theta is what is printed.  A single
    # parenthesised argument behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("\n".join([str(weight) for weight in theta]))
예제 #4
0
def main():
    """Train feature weights with an uneven-margin pairwise update and print them.

    Steps:
      1. Read the reference sentences from ``opts.en`` (one per line).
      2. Obtain the n-best structure: ``read_ds_from_file(opts.nbestDS)`` is
         tried first; if it returns ``None`` the structure is rebuilt from
         the ``|||``-separated file ``opts.nbest`` and handed to
         ``write_ds_to_file``.
      3. For ``opts.epo`` epochs and for every n-best list, all index pairs
         ``(j, l)`` with ``j < l`` are examined.  A pair whose smoothed-BLEU
         values differ, whose ``dist`` exceeds ``opts.epsilon`` and whose
         model-score difference is below ``g_learn(...) * opts.tau``
         adjusts the per-candidate coefficients ``mu`` by ``g_learn(...)``.
         The ``mu``-weighted combination of the candidates' feature lists is
         then applied to ``theta`` with step ``opts.eta``.
      4. The final ``theta`` is printed to stdout, one weight per line.
         Progress and the per-epoch count of pairs that triggered no update
         go to stderr.

    Raises:
        ValueError: if an n-best line does not split into exactly three
            ``|||``-separated fields, or a field cannot be converted.
        IndexError: if the n-best structure or one of its lists is empty,
            or a sentence index has no matching reference or jumps ahead by
            more than one.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    # The context manager guarantees the reference file is closed.
    with open(opts.en) as reference_file:
        for i, line in enumerate(reference_file):
            references.append(line)
            if i % 100 == 0:
                sys.stderr.write(".")

    sys.stderr.write("\nTry reading %s from disk ... \n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ... \n" %
                         opts.nbestDS)
        with open(opts.nbest) as nbest_file:
            for j, line in enumerate(nbest_file):
                (i, sentence, features) = line.strip().split("|||")
                i = int(i)
                stats = list(bleu.bleu_stats(sentence, references[i]))
                smoothed_bleu_score = bleu.smoothed_bleu(stats)
                # Turn the whitespace-separated feature string into floats.
                feature_list = [float(x) for x in features.split()]
                # Only one slot is added per line, so sentence indices are
                # expected to start at 0 and never skip a value.
                if len(nbests) <= i:
                    nbests.append([])
                nbests[i].append(
                    entry(sentence, smoothed_bleu_score, feature_list))

                if j % 5000 == 0:
                    sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ... \n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finish writing %s\n" % opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # uniform start

    tau = opts.tau  # positive learning margin
    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0  # pairs that matched neither update rule in this epoch

        # theta is updated once per sentence (per n-best list).
        for nbest in nbests:
            size = len(nbest)
            mu = [0.0] * size
            # Model score of every candidate under the current theta.
            w_times_x = [dot_product(theta, best.feature_list)
                         for best in nbest]

            # NOTE(review): the labels compared here are smoothed-BLEU
            # values (larger is better); if g_learn/dist were written for
            # rank labels (smaller is better) the branches are mirrored --
            # confirm against their definitions.
            for j in xrange(size - 1):
                yj = nbest[j].smoothed_bleu
                for l in xrange(j + 1, size):
                    yl = nbest[l].smoothed_bleu
                    if (yj < yl
                            and dist(yj, yl) > opts.epsilon
                            and w_times_x[j] - w_times_x[l]
                            < g_learn(yj, yl) * tau):
                        # Assumes g_learn is a pure function of its two
                        # arguments, so one call serves both updates.
                        step = g_learn(yj, yl)
                        mu[j] = mu[j] + step
                        mu[l] = mu[l] - step
                    elif (yj > yl
                            and dist(yj, yl) > opts.epsilon
                            # Mirror image of the branch above: the score
                            # difference is taken between candidates l and j.
                            and w_times_x[l] - w_times_x[j]
                            < g_learn(yl, yj) * tau):
                        step = g_learn(yl, yj)
                        mu[j] = mu[j] - step
                        mu[l] = mu[l] + step
                    else:
                        cnt += 1
                if (j + 1) % 10000 == 0:
                    sys.stderr.write(".")

            # presumably sum_m mu[m] * features[m]; scale_product and
            # vector_plus are defined elsewhere -- confirm.
            vector_sum = [0] * len(nbest[0].feature_list)
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(
                    vector_sum, scale_product(mu[m], best.feature_list))

            theta = vector_plus(theta, vector_sum, opts.eta)

        sys.stderr.write("\n Non-supported vectors:  %s\n" % (cnt, ))

    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # The final (non-averaged) theta is what is printed.  A single
    # parenthesised argument behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("\n".join([str(weight) for weight in theta]))