def validate_model(model, valset, refset, search_option, name, reader, option, **variables):
    """Translate the validation set, report BLEU, and checkpoint on improvement.

    Decodes ``valset`` with the given beam-search options, scores the output
    against ``refset``, prints a progress line, and — when the score beats the
    best recorded in ``option["bleu"]`` — updates that record and saves the
    model.
    """
    hypotheses = translate(model, valset, **search_option)
    score = bleu(hypotheses, refset)

    progress = (variables["global_step"], variables["epoch"],
                variables["step"], score)
    print("global_step: %d, epoch: %d, step: %d, bleu: %2.4f" % progress)

    # Persist only when the new score improves on the recorded best.
    if score > option["bleu"]:
        option["bleu"] = score
        save_model(model, name, reader, option, **variables)
# coding=utf-8
from metric import bleu, otem, utem

# Sanity-check script: one reference sentence against three flawed
# candidates (word substitution, duplicated word, missing word).
ref = "You should learn to use the computer ."
can1 = "You should learn to use the car ."
can2 = "You should learn to use the the computer ."
can3 = "You should learn to use the ."

print('Reference: {}'.format(ref))
print('Candidate1: {}'.format(can1))
print('Candidate2: {}'.format(can2))
print('Candidate3: {}'.format(can3))

# The metrics expect token lists; references are doubly nested
# (corpus -> sentence -> reference list).
refs = [[ref.split()]]
can1, can2, can3 = [can1.split()], [can2.split()], [can3.split()]


def _scores(metric_fn, **kwargs):
    # Evaluate one metric over all three candidates against the reference.
    return tuple(metric_fn(c, refs, **kwargs) for c in (can1, can2, can3))


print('BLEU {} vs {} vs {}'.format(*_scores(bleu)))
print('OTEM {} vs {} vs {}'.format(*_scores(otem, n=1)))
print('UTEM {} vs {} vs {}'.format(*_scores(utem)))
def train(args):
    """Train an RNNsearch model with MLE, validating periodically with BLEU.

    Resumes from ``args.model`` when that checkpoint file exists; otherwise
    initializes parameters uniformly in [-0.08, 0.08].  Side effects: writes
    ``<model>.autosave.pkl`` and ``<model>.best.pkl`` checkpoints next to
    ``args.model`` and prints training progress to stdout.
    """
    option = getoption()

    # Resume from an existing checkpoint if present.
    if os.path.exists(args.model):
        option, params = loadmodel(args.model)
        init = False
    else:
        init = True

    override(option, args)
    print_option(option)

    # set seed (shuffling and sampling below use numpy's global RNG)
    numpy.random.seed(option["seed"])

    # References for BLEU validation (optional).
    if option["ref"]:
        references = loadreferences(option["ref"])
    else:
        references = None

    # Vocabularies: (word -> id, id -> word) pairs for source and target.
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    # Checkpoint file names derived from the model path.
    pathname, basename = os.path.split(args.model)
    modelname = getfilename(basename)
    autoname = os.path.join(pathname, modelname + ".autosave.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    batch = option["batch"]
    sortk = option["sort"] or 1
    # The RNG seed doubles as the shuffle seed when shuffling is enabled.
    shuffle = option["seed"] if option["shuffle"] else None
    reader = textreader(option["corpus"], shuffle)
    processor = [getlen, getlen]
    # Read batch * sortk sentences at once so they can be length-sorted.
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    # Restore the reader's shuffling state saved in the checkpoint.
    if shuffle and option["indices"] is not None:
        reader.set_indices(option["indices"])

    if args.reset:
        option["count"] = [0, 0]
        option["epoch"] = 0
        option["cost"] = 0.0

    # Skip sentences already consumed in the interrupted epoch.
    skipstream(reader, option["count"][1])
    epoch = option["epoch"]
    maxepoch = option["maxepoch"]

    model = rnnsearch(**option)

    if init:
        uniform(model.parameter, -0.08, 0.08)
    else:
        set_variables(model.parameter, params)

    print "parameters:", parameters(model.parameter)

    # tuning option (optimizer configuration)
    toption = {}
    toption["algorithm"] = option["optimizer"]
    toption["variant"] = option["variant"]
    toption["constraint"] = ("norm", option["norm"])  # gradient-norm clipping
    toption["norm"] = True
    trainer = optimizer(model, **toption)
    alpha = option["alpha"]  # learning rate

    # beamsearch option (used for validation decoding)
    doption = {}
    doption["beamsize"] = option["beamsize"]
    doption["normalize"] = option["normalize"]
    doption["maxlen"] = option["maxlen"]
    doption["minlen"] = option["minlen"]

    best_score = option["bleu"]
    unk_sym = option["unk"]
    eos_sym = option["eos"]
    count = option["count"][0]  # updates made in the current epoch
    totcost = option["cost"]    # accumulated cost for the current epoch

    for i in range(epoch, maxepoch):
        for data in stream:
            xdata, xmask = processdata(data[0], svocab, unk_sym, eos_sym)
            ydata, ymask = processdata(data[1], tvocab, unk_sym, eos_sym)

            t1 = time.time()
            cost, norm = trainer.optimize(xdata, xmask, ydata, ymask)
            trainer.update(alpha=alpha)
            t2 = time.time()

            count += 1
            # Rescale by batch size over target-token count (presumably
            # per-sentence mean -> per-token cost — confirm against trainer),
            # then convert from nats to bits via log(2).
            cost = cost * ymask.shape[1] / ymask.sum()
            totcost += cost / math.log(2)
            print i + 1, count, cost, norm, t2 - t1

            # autosave every "freq" updates
            if count % option["freq"] == 0:
                model.option["indices"] = reader.get_indices()
                model.option["bleu"] = best_score
                model.option["cost"] = totcost
                model.option["count"] = [count, reader.count]
                serialize(autoname, model)

            # validate every "vfreq" updates; keep the best-scoring model
            if count % option["vfreq"] == 0:
                if option["validation"] and references:
                    trans = translate(model, option["validation"], **doption)
                    bleu_score = bleu(trans, references)
                    print "bleu: %2.4f" % bleu_score
                    if bleu_score > best_score:
                        best_score = bleu_score
                        model.option["indices"] = reader.get_indices()
                        model.option["bleu"] = best_score
                        model.option["cost"] = totcost
                        model.option["count"] = [count, reader.count]
                        serialize(bestname, model)

            # print a sample translation every "sfreq" updates
            if count % option["sfreq"] == 0:
                n = len(data[0])
                ind = numpy.random.randint(0, n)
                sdata = data[0][ind]
                tdata = data[1][ind]
                # Slice keeps the batch dimension (columns are sentences).
                xdata = xdata[:, ind:ind + 1]
                hls = beamsearch(model, xdata)
                if len(hls) > 0:
                    best, score = hls[0]
                    print sdata
                    print tdata
                    print " ".join(best[:-1])  # drop the trailing eos symbol
                else:
                    print sdata
                    print tdata
                    print "warning: no translation"

        print "--------------------------------------------------"

        # End-of-epoch validation.
        if option["validation"] and references:
            trans = translate(model, option["validation"], **doption)
            bleu_score = bleu(trans, references)
            print "iter: %d, bleu: %2.4f" % (i + 1, bleu_score)
            if bleu_score > best_score:
                best_score = bleu_score
                model.option["indices"] = reader.get_indices()
                model.option["bleu"] = best_score
                model.option["cost"] = totcost
                model.option["count"] = [count, reader.count]
                serialize(bestname, model)

        print "averaged cost: ", totcost / count
        print "--------------------------------------------------"

        # early stopping (actually a learning-rate decay schedule:
        # after "stop" epochs the rate is multiplied by "decay" each epoch)
        if i + 1 >= option["stop"]:
            alpha = alpha * option["decay"]

        # Reset per-epoch counters and rewind the data stream.
        count = 0
        totcost = 0.0
        stream.reset()

        # update autosave checkpoint at the epoch boundary
        model.option["epoch"] = i + 1
        model.option["alpha"] = alpha
        model.option["indices"] = reader.get_indices()
        model.option["bleu"] = best_score
        model.option["cost"] = totcost
        model.option["count"] = [0, 0]
        serialize(autoname, model)

    print "best(bleu): %2.4f" % best_score

    stream.close()
def decode(args):
    """Decode stdin line by line with an ensemble of models via beam search.

    Loads one model per path in ``args.model`` (each under its own variable
    scope) and translates every input line, writing the translation to
    stdout.  When ``args.oracle`` is given, the n-best list is instead
    reranked by sentence BLEU against the oracle references and both the
    model-best and oracle-best hypotheses are emitted.  A running count,
    score, and decode time go to stderr.
    """
    num_models = len(args.model)
    models = [None for i in range(num_models)]

    # Load every ensemble member into its own variable scope.
    for i in range(num_models):
        option, params = load_model(args.model[i])
        scope = "rnnsearch_%d" % i
        model = rnnsearch(scope=scope, **option)
        var_list = get_variables_with_prefix(scope)
        set_variables(var_list, params)
        models[i] = model

    # use the first model's vocabulary and special symbols for the ensemble
    svocabs, tvocabs = models[0].option["vocabulary"]
    unk_sym = models[0].option["unk"]
    eos_sym = models[0].option["eos"]

    count = 0  # number of input lines processed so far

    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    # Beam-search options taken from the command line.
    option = {}
    option["maxlen"] = args.maxlen
    option["minlen"] = args.minlen
    option["beamsize"] = args.beamsize
    option["normalize"] = args.normalize
    # presumably selects arithmetic vs geometric ensemble averaging — confirm
    option["arithmetic"] = args.arithmetic

    if args.oracle:
        references = load_references(args.oracle)
    else:
        references = None

    while True:
        line = sys.stdin.readline()
        if line == "":  # EOF
            break

        data = [line]
        seq, mask = convert_data(data, svocab, unk_sym, eos_sym)
        t1 = time.time()
        tlist = beamsearch(models, seq, **option)
        t2 = time.time()

        if len(tlist) == 0:
            # NOTE(review): nothing is written to stdout in this branch, so
            # output lines desynchronize from input lines — confirm intended.
            translation = ""
            score = -10000.0
        else:
            if references is None:
                # Plain decoding: emit the highest-scoring hypothesis.
                best, score = tlist[0]
                translation = " ".join(best[:-1])  # drop trailing eos
                sys.stdout.write(translation)
                sys.stdout.write("\n")
            else:
                best_ind = 0
                best_score = 0
                # find the best translation according to oracle
                for i, (trans, score) in enumerate(tlist):
                    trans = trans[:-1]
                    bleu_score = bleu([trans], [references[count]],
                                      smoothing=True)
                    if bleu_score > best_score:
                        best_score = bleu_score
                        best_ind = i
                # Output fields: model-best ||| its score ||| oracle index
                # ||| oracle-best ||| its score
                output = " ".join(tlist[0][0][:-1]) + " ||| "
                output += str(tlist[0][1]) + " ||| " + str(best_ind) + " ||| "
                output += " ".join(tlist[best_ind][0][:-1]) + " ||| "
                output += str(tlist[best_ind][1])
                sys.stdout.write(output)
                sys.stdout.write("\n")

        count = count + 1
        # Note: with oracle reranking, "score" here is the last loop value.
        sys.stderr.write(str(count) + " ")
        sys.stderr.write(str(score) + " " + str(t2 - t1) + "\n")
def train(args):
    """Train an RNNsearch model with MLE or MRT (minimum risk training).

    Resumes from ``args.model`` when that checkpoint exists, optionally
    warm-starting (and, with ``args.finetune``, partially freezing) from
    ``args.initialize``.  Side effects: writes ``<model>.autosave.pkl`` and
    ``<model>.best.pkl`` option checkpoints next to ``args.model`` and prints
    training progress to stdout.
    """
    option = default_option()

    # predefined model names (checkpoint paths derived from args.model)
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname = os.path.join(pathname, modelname + ".autosave.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models (resume from checkpoint if present)
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        option = opt
        init = False
    else:
        init = True

    # Optional warm-start parameters from another model.
    if args.initialize:
        init_params = load_model(args.initialize)
        init_params = init_params[1]  # keep only the parameter part
        restore = True
    else:
        restore = False

    override(option, args_to_dict(args))
    print_option(option)

    # load references for BLEU validation (optional)
    if option["references"]:
        references = load_references(option["references"])
    else:
        references = None

    if args.skip_val:
        references = None

    criterion = option["criterion"]

    if criterion == "mrt":
        sys.stderr.write("warning: In MRT mode, batch is set to 1\n")

    # input corpus: MRT samples per sentence, so batching is disabled
    batch = option["batch"] if criterion == "mle" else 1
    # NOTE(review): precedence makes this option["sort"] or (1 if ... else 1),
    # i.e. always option["sort"] or 1 — likely intended
    # (option["sort"] or 1) if criterion == "mle" else 1; confirm.
    sortk = option["sort"] or 1 if criterion == "mle" else 1
    # The RNG seed doubles as the shuffle seed when shuffling is enabled.
    shuffle = option["seed"] if option["shuffle"] else None
    reader = textreader(option["corpus"], shuffle)
    processor = [data_length, data_length]
    # Read batch * sortk sentences at once so they can be length-sorted.
    stream = textiterator(reader, [batch, batch * sortk], processor,
                          option["limit"], option["sort"])

    # Restore the reader's shuffling state saved in the checkpoint.
    if shuffle and option["indices"] is not None:
        reader.set_indices(option["indices"])

    if args.reset:
        option["count"] = [0, 0]
        option["epoch"] = 0
        option["cost"] = 0.0

    # Skip sentences already consumed in the interrupted epoch.
    skip_stream(reader, option["count"][1])
    epoch = option["epoch"]
    maxepoch = option["maxepoch"]

    # create model (optional L1/L2 regularization, uniform initialization)
    regularizer = []

    if option["l1_scale"]:
        regularizer.append(ops.l1_regularizer(option["l1_scale"]))

    if option["l2_scale"]:
        regularizer.append(ops.l2_regularizer(option["l2_scale"]))

    scale = option["scale"]
    initializer = ops.random_uniform_initializer(-scale, scale)
    regularizer = ops.sum_regularizer(regularizer)

    # set seed (shuffling and sampling below use numpy's global RNG)
    numpy.random.seed(option["seed"])

    model = rnnsearch(initializer=initializer, regularizer=regularizer,
                      **option)

    variables = None  # subset of variables to tune; None means all

    if restore:
        matched, not_matched = match_variables(ops.trainable_variables(),
                                               init_params)
        if args.finetune:
            # Finetune only the variables NOT covered by the init model.
            variables = not_matched
            if not variables:
                raise RuntimeError("no variables to finetune")

    if not init:
        set_variables(ops.trainable_variables(), params)

    if restore:
        restore_variables(matched, not_matched)

    print "parameters:", count_parameters(ops.trainable_variables())

    # tuning option (optimizer configuration)
    tune_opt = {}
    tune_opt["algorithm"] = option["optimizer"]
    tune_opt["constraint"] = ("norm", option["norm"])  # gradient-norm clipping
    tune_opt["norm"] = True
    tune_opt["variables"] = variables

    # create optimizer
    trainer = optimizer(model, **tune_opt)

    # beamsearch option (used for validation decoding)
    search_opt = {}
    search_opt["beamsize"] = option["beamsize"]
    search_opt["normalize"] = option["normalize"]
    search_opt["maxlen"] = option["maxlen"]
    search_opt["minlen"] = option["minlen"]

    # vocabulary and special symbol
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs
    unk_sym = option["unk"]
    eos_sym = option["eos"]

    # summary (per-epoch counters restored from the checkpoint)
    count = option["count"][0]
    totcost = option["cost"]
    best_score = option["bleu"]
    alpha = option["alpha"]  # learning rate
    sharp = option["sharp"]  # presumably the MRT sharpness temperature — confirm

    for i in range(epoch, maxepoch):
        for data in stream:
            xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
            ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym)

            if criterion == "mrt":
                # MRT: map reference words outside the vocabulary to unk.
                refs = []

                for item in data[1]:
                    item = item.split()
                    item = [unk_sym if word not in tvocab else word
                            for word in item]
                    refs.append(" ".join(item))

                t1 = time.time()

                # sample from model (reserve slots for the references so
                # the candidate space totals option["sample"] entries)
                nsample = option["sample"] - len(refs)
                # Batch is 1 in MRT mode; tile the single source sentence.
                xdata = numpy.repeat(xdata, nsample, 1)
                xmask = numpy.repeat(xmask, nsample, 1)
                maxlen = int(1.5 * len(ydata))
                examples = batchsample(model, xdata, xmask, maxlen)
                space = build_sample_space(refs, examples)
                # Risk of each candidate: 1 - smoothed sentence BLEU.
                score = numpy.zeros((len(space),), "float32")
                refs = [ref.split() for ref in refs]

                for j in range(len(space)):
                    example = space[j].split()
                    score[j] = 1.0 - bleu([example], [refs], smoothing=True)

                ydata, ymask = convert_data(space, tvocab, unk_sym, eos_sym)
                # Only one source column is needed — all columns are copies.
                cost, norm = trainer.optimize(xdata[:, 0:1], xmask[:, 0:1],
                                              ydata, ymask, score, sharp)
                trainer.update(alpha=alpha)
                t2 = time.time()

                totcost += cost
                count += 1
                t = t2 - t1
                ac = totcost / count  # running average cost
                print i + 1, count, len(space), cost, norm, ac, t
            else:
                # MLE: plain cross-entropy update on the batch.
                t1 = time.time()
                cost, norm = trainer.optimize(xdata, xmask, ydata, ymask)
                trainer.update(alpha=alpha)
                t2 = time.time()

                count += 1
                # Rescale by batch size over target-token count (presumably
                # per-sentence mean -> per-token cost — confirm against
                # trainer), then convert from nats to bits via log(2).
                cost = cost * ymask.shape[1] / ymask.sum()
                totcost += cost / math.log(2)
                print i + 1, count, cost, norm, t2 - t1

            # autosave every "freq" updates
            if count % option["freq"] == 0:
                option["indices"] = reader.get_indices()
                option["bleu"] = best_score
                option["cost"] = totcost
                option["count"] = [count, reader.count]
                serialize(autoname, option)

            # validate every "vfreq" updates; keep the best-scoring model
            if count % option["vfreq"] == 0:
                if option["validation"] and references:
                    trans = translate(model, option["validation"],
                                      **search_opt)
                    bleu_score = bleu(trans, references)
                    print "bleu: %2.4f" % bleu_score
                    if bleu_score > best_score:
                        best_score = bleu_score
                        option["indices"] = reader.get_indices()
                        option["bleu"] = best_score
                        option["cost"] = totcost
                        option["count"] = [count, reader.count]
                        serialize(bestname, option)

            # print a sample translation every "sfreq" updates
            if count % option["sfreq"] == 0:
                n = len(data[0])
                ind = numpy.random.randint(0, n)
                sdata = data[0][ind]
                tdata = data[1][ind]
                # Slices keep the batch dimension (columns are sentences).
                xdata = xdata[:, ind:ind + 1]
                xmask = xmask[:, ind:ind + 1]
                hls = beamsearch(model, xdata, xmask)
                best, score = hls[0]
                print sdata
                print tdata
                print " ".join(best[:-1])  # drop the trailing eos symbol

        print "--------------------------------------------------"

        # End-of-epoch validation.
        if option["validation"] and references:
            trans = translate(model, option["validation"], **search_opt)
            bleu_score = bleu(trans, references)
            print "iter: %d, bleu: %2.4f" % (i + 1, bleu_score)
            if bleu_score > best_score:
                best_score = bleu_score
                option["indices"] = reader.get_indices()
                option["bleu"] = best_score
                option["cost"] = totcost
                option["count"] = [count, reader.count]
                serialize(bestname, option)

        print "averaged cost: ", totcost / count
        print "--------------------------------------------------"

        # early stopping (actually a learning-rate decay schedule:
        # after "stop" epochs the rate is multiplied by "decay" each epoch)
        if i + 1 >= option["stop"]:
            alpha = alpha * option["decay"]

        # Reset per-epoch counters and rewind the data stream.
        count = 0
        totcost = 0.0
        stream.reset()

        # update autosave checkpoint at the epoch boundary
        option["epoch"] = i + 1
        option["alpha"] = alpha
        option["indices"] = reader.get_indices()
        option["bleu"] = best_score
        option["cost"] = totcost
        option["count"] = [0, 0]
        serialize(autoname, option)

    print "best(bleu): %2.4f" % best_score

    stream.close()