def main():
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        # initialize references to the correct English sentences
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nTrying to read the nbests data structure from disk ...\n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating nbests ...\n")
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # convert the feature string into a list of floats
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
            if j % 5000 == 0:
                sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # initialization
    avg_theta = [0.0 for _ in xrange(arg_num)]
    avg_cnt = 0

    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            # sample candidate pairs and keep the opts.xi pairs with the
            # largest smoothed-BLEU gaps
            sample = get_sample(nbest)
            sample.sort(key=lambda pair: pair[0].smoothed_bleu - pair[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # perceptron update when the model misranks the pair
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1
        sys.stderr.write("Mistake: %s\n" % (mistake,))

    weights = [avg / avg_cnt if avg_cnt != 0 else 1.0 / arg_num
               for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputting...\n")
    # instead of printing the averaged-out weights, print the weights that
    # maximize the BLEU score
    print "\n".join([str(weight) for weight in weights])
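# main() above relies on helpers defined elsewhere in the script: `entry`,
# `get_sample`, the vector utilities, plus `import sys`, `import bleu`, and an
# optparse `opts` at module top. The record type and a PRO-style pair sampler
# are sketched below; the sample size (5000) and minimum BLEU gap (0.05) are
# illustrative assumptions, not values taken from this code.
import random
from collections import namedtuple

entry = namedtuple("entry", "sentence, smoothed_bleu, feature_list")

def get_sample(nbest, n_samples=5000, min_gap=0.05):
    """Sample candidate pairs whose smoothed-BLEU gap exceeds min_gap.

    Each returned pair is ordered (better, worse) by smoothed BLEU.
    """
    sample = []
    for _ in xrange(n_samples):
        s1 = random.choice(nbest)
        s2 = random.choice(nbest)
        if abs(s1.smoothed_bleu - s2.smoothed_bleu) > min_gap:
            if s1.smoothed_bleu > s2.smoothed_bleu:
                sample.append((s1, s2))
            else:
                sample.append((s2, s1))
    return sample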
def main():
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        # initialize references to the correct English sentences
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nTrying to read %s from disk ...\n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ...\n" % opts.nbestDS)
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # convert the feature string into a list of floats
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
            if j % 5000 == 0:
                sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ...\n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finished writing %s\n" % opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # initialization
    # avg_theta = [0.0 for _ in xrange(arg_num)]
    # avg_cnt = 0
    tau = opts.tau  # positive learning margin

    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0
        # sentence-wise updating
        for i, nbest in enumerate(nbests):
            y = sorted(nbest, key=lambda h: h.smoothed_bleu, reverse=True)
            mu = [0.0] * len(nbest)
            w_times_x = [0.0] * len(nbest)
            for j, best in enumerate(nbest):
                # calculate the linear model score for each candidate
                w_times_x[j] = dot_product(theta, best.feature_list)
            # rank cutoffs for the top-r and bottom-k candidates
            top_r = int(len(y) * opts.r)
            bottom_k = int(len(y) * opts.k)
            # process pairs whose model scores violate the margin tau
            for j in xrange(len(nbest) - 1):
                for l in xrange(j + 1, len(nbest)):
                    if nbest[j].smoothed_bleu <= y[top_r].smoothed_bleu \
                            and nbest[l].smoothed_bleu >= y[-bottom_k].smoothed_bleu \
                            and w_times_x[j] > w_times_x[l] + tau:
                        mu[j] += 1
                        mu[l] -= 1
                    elif nbest[j].smoothed_bleu >= y[-bottom_k].smoothed_bleu \
                            and nbest[l].smoothed_bleu <= y[top_r].smoothed_bleu \
                            and w_times_x[j] > w_times_x[l] - tau:
                        mu[j] -= 1
                        mu[l] += 1
                    else:
                        cnt += 1
                if (j + 1) % 100 == 0:
                    sys.stderr.write(".")
            # accumulate the weighted feature vectors and update theta
            vector_sum = [0 for _ in xrange(len(nbest[0].feature_list))]
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(vector_sum,
                                         scale_product(mu[m], best.feature_list))
            theta = vector_plus(theta, vector_sum, opts.eta)
        # avg_theta = vector_plus(avg_theta, theta)
        # avg_cnt += 1
        sys.stderr.write("\nNon-supported vectors: %s\n" % (cnt,))

    # weights = [avg / avg_cnt if avg_cnt != 0 else 1.0 / arg_num for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputting...\n")
    # instead of printing the averaged-out weights, print the weights that
    # maximize the BLEU score
    print "\n".join([str(weight) for weight in theta])
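# `read_ds_from_file`/`write_ds_to_file` and the vector utilities used above
# are defined elsewhere in the script. A minimal sketch follows, assuming
# pickle-based caching and that vector_plus(u, v, c) computes u + c*v
# (consistent with how it is called above); these are reconstructions, not
# the original definitions.
import os
import cPickle as pickle

def read_ds_from_file(filename):
    """Return the cached data structure, or None if no cache exists."""
    if not os.path.isfile(filename):
        return None
    with open(filename, 'rb') as f:
        return pickle.load(f)

def write_ds_to_file(ds, filename):
    """Cache the data structure to disk with pickle."""
    with open(filename, 'wb') as f:
        pickle.dump(ds, f)

def dot_product(u, v):
    """Inner product of two equal-length float lists."""
    return sum(a * b for a, b in zip(u, v))

def vector_plus(u, v, scale=1.0):
    """Elementwise u + scale * v."""
    return [a + scale * b for a, b in zip(u, v)]

def scale_product(c, v):
    """Multiply every component of v by the scalar c."""
    return [c * x for x in v]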
def main():
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        # initialize references to the correct English sentences
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nTrying to read %s from disk ...\n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ...\n" % opts.nbestDS)
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # convert the feature string into a list of floats
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
            if j % 5000 == 0:
                sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ...\n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finished writing %s\n" % opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  # initialization
    # avg_theta = [0.0 for _ in xrange(arg_num)]
    # avg_cnt = 0
    tau = opts.tau  # positive learning margin

    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0
        # sentence-wise updating
        for i, nbest in enumerate(nbests):
            y = sorted(nbest, key=lambda h: h.smoothed_bleu, reverse=True)
            mu = [0.0] * len(nbest)
            w_times_x = [0.0] * len(nbest)
            for j, best in enumerate(nbest):
                # calculate the linear model score for each candidate
                w_times_x[j] = dot_product(theta, best.feature_list)
            # rank cutoffs (computed but unused in this distance-based variant)
            top_r = int(len(y) * opts.r)
            bottom_k = int(len(y) * opts.k)
            # process pairs whose BLEU gap exceeds epsilon and whose model
            # scores violate the g_learn-scaled margin
            for j in xrange(len(nbest) - 1):
                for l in xrange(j + 1, len(nbest)):
                    yj = nbest[j].smoothed_bleu
                    yl = nbest[l].smoothed_bleu
                    if yj < yl \
                            and dist(yj, yl) > opts.epsilon \
                            and w_times_x[j] - w_times_x[l] < g_learn(yj, yl) * tau:
                        mu[j] += g_learn(yj, yl)
                        mu[l] -= g_learn(yj, yl)
                    elif yj > yl \
                            and dist(yj, yl) > opts.epsilon \
                            and w_times_x[l] - w_times_x[j] < g_learn(yl, yj) * tau:
                        mu[j] -= g_learn(yl, yj)
                        mu[l] += g_learn(yl, yj)
                    else:
                        cnt += 1
                if (j + 1) % 10000 == 0:
                    sys.stderr.write(".")
            # accumulate the weighted feature vectors and update theta
            vector_sum = [0 for _ in xrange(len(nbest[0].feature_list))]
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(vector_sum,
                                         scale_product(mu[m], best.feature_list))
            theta = vector_plus(theta, vector_sum, opts.eta)
        # avg_theta = vector_plus(avg_theta, theta)
        # avg_cnt += 1
        sys.stderr.write("\nNon-supported vectors: %s\n" % (cnt,))

    # weights = [avg / avg_cnt if avg_cnt != 0 else 1.0 / arg_num for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputting...\n")
    # instead of printing the averaged-out weights, print the weights that
    # maximize the BLEU score
    print "\n".join([str(weight) for weight in theta])
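# `dist` and `g_learn` are referenced above but not defined in this section.
# A plausible minimal sketch, assuming `dist` is the absolute smoothed-BLEU
# gap and `g_learn` returns that gap as a pair-specific update weight; both
# forms are assumptions, not the original definitions.
def dist(y1, y2):
    """Absolute difference between two smoothed BLEU scores."""
    return abs(y1 - y2)

def g_learn(y_worse, y_better):
    """Update weight for a pair: larger BLEU gaps give larger updates
    (assumed form; called with the lower-BLEU score first)."""
    return y_better - y_worse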