def evaluate(args):
    """Score a parallel corpus with one trained model and print the costs.

    With ``args.sntcost`` set, every value yielded by ``evaluate_snt_cost``
    is printed one per line.  Otherwise the source file and the first
    target file (plus the optional alignment file) are read in batches and
    scored with ``evaluate_model``; each sentence gets a ``cost:`` line,
    preceded by ``src:`` / ``tgt:`` lines when ``args.verbose`` is set.
    """
    option, params = load_model(args.model)
    model = build_model(**option)
    set_variables(ops.trainable_variables(), params)

    if args.sntcost:
        cost_batches = evaluate_snt_cost(model, option, args.source,
                                         args.target, args.batch,
                                         args.normalize)
        for cost_batch in cost_batches:
            for value in cost_batch:
                sys.stdout.write("{}\n".format(value))
        return

    # Vocabularies and special symbols come from the loaded model.
    src_vocabs, tgt_vocabs = model.option["vocabulary"]
    unk_symbol = model.option["unk"]
    eos_symbol = model.option["eos"]
    svocab, _isvocab = src_vocabs
    tvocab, _itvocab = tgt_vocabs

    use_align = args.align
    if use_align:
        paths = [args.source, args.target[0], args.align]
    else:
        paths = [args.source, args.target[0]]

    stream = textiterator(textreader(paths, False), [args.batch, args.batch])

    for batch in stream:
        sources = batch[0]
        targets = batch[1]
        xdata, xmask = convert_data(sources, svocab, unk_symbol, eos_symbol)
        ydata, ymask = convert_data(targets, tvocab, unk_symbol, eos_symbol)

        align = convert_align(sources, targets, batch[2]) if args.align else None

        costs = evaluate_model(model, xdata, xmask, ydata, ymask, align,
                               verbose=args.verbose)

        for pos in range(len(costs)):
            if args.verbose:
                sys.stdout.write("src: %s\n" % sources[pos])
                sys.stdout.write("tgt: %s\n" % targets[pos])
            sys.stdout.write("cost: %f\n" % costs[pos])

    stream.close()
def decode(args):
    """Translate stdin line by line with an ensemble, optionally as n-best.

    Every path in ``args.model`` is loaded into its own ``rnnsearch_<i>``
    variable scope.  The vocabulary and special symbols of the first model
    are used to encode the input.

    Output:
      * ``args.n_best`` set: one ``idx ||| translation ||| score`` line per
        hypothesis (a single placeholder line with score -10000.0 when the
        search returns nothing).
      * otherwise: the best translation, one line per input line.

    A ``count score seconds`` line is written to stderr for every input.
    """
    models = []
    for i, path in enumerate(args.model):
        option, params = load_model(path)
        scope = "rnnsearch_%d" % i
        option['scope'] = scope
        model = build_model(**option)
        var_list = get_variables_with_prefix(scope)
        set_variables(var_list, params)
        models.append(model)

    # use the first model
    svocabs, tvocabs = models[0].option["vocabulary"]
    unk_sym = models[0].option["unk"]
    eos_sym = models[0].option["eos"]

    count = 0

    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    option = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize,
        "arithmetic": args.arithmetic,
        "suppress_unk": args.suppress_unk,
    }

    # readline() returns "" only at EOF (a blank line is "\n").
    for line in iter(sys.stdin.readline, ""):
        data = [line]
        seq, mask = convert_data(data, svocab, unk_sym, eos_sym)
        t1 = time.time()
        hlist = beamsearch(models, seq, **option)
        t2 = time.time()

        if args.n_best:
            if len(hlist) == 0:
                # Previously `score` was left unbound (first sentence) or
                # stale (later sentences) here, breaking the stderr report.
                score = -10000.0
                sys.stdout.write('%d ||| %s ||| %f\n' % (count, '', score))
            else:
                # The last token of each hypothesis is dropped
                # (presumably the end-of-sentence symbol).
                for trans, score in hlist:
                    translation = ' '.join(trans[:-1])
                    sys.stdout.write('%d ||| %s ||| %f\n' %
                                     (count, translation, score))
        else:
            if len(hlist) == 0:
                translation = ""
                score = -10000.0
            else:
                best, score = hlist[0]
                translation = " ".join(best[:-1])
            sys.stdout.write('%s\n' % translation)

        sys.stderr.write('%d %f %f\n' % (count, score, t2 - t1))
        count = count + 1
def decode(args):
    """Translate stdin line by line with an ensemble of models.

    Each path in ``args.model`` is loaded under its own ``rnnsearch_<i>``
    scope.  The best hypothesis for every input line is written to stdout
    (an empty line when the search returns nothing) and a
    ``count score seconds`` record is written to stderr.
    """
    ensemble = []
    for index, path in enumerate(args.model):
        hparams, weights = load_model(path)
        prefix = "rnnsearch_%d" % index
        hparams['scope'] = prefix
        network = build_model(**hparams)
        scoped_vars = get_variables_with_prefix(prefix)
        set_variables(scoped_vars, weights)
        ensemble.append(network)

    # The first model supplies the vocabulary and special symbols.
    src_vocabs, tgt_vocabs = ensemble[0].option["vocabulary"]
    unk_sym = ensemble[0].option["unk"]
    eos_sym = ensemble[0].option["eos"]
    svocab, _isvocab = src_vocabs
    _tvocab, _itvocab = tgt_vocabs

    search_args = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize,
        "arithmetic": args.arithmetic,
    }

    processed = 0
    # readline() yields "" only at end of input.
    for line in iter(sys.stdin.readline, ""):
        seq, mask = convert_data([line], svocab, unk_sym, eos_sym)

        started = time.time()
        hypotheses = beamsearch(ensemble, seq, **search_args)
        finished = time.time()

        if len(hypotheses) == 0:
            text, score = "", -10000.0
        else:
            tokens, score = hypotheses[0]
            # Drop the final token of the hypothesis before joining.
            text = " ".join(tokens[:-1])

        sys.stdout.write(text)
        sys.stdout.write("\n")

        processed += 1
        sys.stderr.write(str(processed) + " ")
        sys.stderr.write(str(score) + " " + str(finished - started) + "\n")
def evaluate_snt_cost(model, option, src, refs, batch, normalize):
    """Yield sentence-level costs of each reference for every batch.

    Args:
        model: object exposing ``get_snt_cost(xdata, xmask, ydata, ymask)``.
        option: model option dict (``vocabulary``, ``unk``, ``eos``).
        src: path of the source file.
        refs: list of reference file paths.
        batch: batch size passed to the text iterator.
        normalize: when true, divide each cost by the target length.

    Yields:
        One cost array per (batch, reference) pair, in reading order.
    """
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs
    unk_sym = option["unk"]
    eos_sym = option["eos"]

    reader = textreader([src] + refs)
    stream = textiterator(reader, [batch, batch])
    get_cost = model.get_snt_cost

    for data in stream:
        xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
        for y in data[1:]:
            ydata, ymask = convert_data(y, tvocab, unk_sym, eos_sym)
            snt_cost = get_cost(xdata, xmask, ydata, ymask)
            if normalize:
                # Per-word cost.  The items of `y` are raw text lines, so
                # len(item) counted characters rather than words; use the
                # number of unmasked target positions instead, the same
                # normalisation `rescore` applies.
                # NOTE(review): assumes ymask is (length, batch) and counts
                # every position convert_data produced -- confirm.
                # Not done in place, so the model's array is left untouched.
                snt_cost = snt_cost / numpy.sum(ymask, 0)
            yield snt_cost
def train_step_fn(data, **variables):
    """Run one optimisation step on a batch and report its cost.

    Args:
        data: batch whose first two items are the source and target
            sentences.
        **variables: must contain ``global_step``, ``step`` and ``epoch``.

    Returns:
        The batch cost divided by ln(2) (presumably nats -> bits; confirm
        against the cost definition of the optimiser).

    The learning rate follows
    ``embedding**-0.5 * min(step**-0.5, step * warmup**-1.5)``,
    so ``global_step`` must be non-zero.
    """
    global_step = variables["global_step"]
    step = variables["step"]
    epoch = variables["epoch"]

    xdata, _, xlen = convert_data(data[0], svocab, unk, eos)
    ydata, _, ylen = convert_data(data[1], tvocab, unk, eos)

    t1 = time.time()
    cost, norm = optim.optimize(xdata, xlen, ydata, ylen)
    # The rate is computed after the gradients and applied in update().
    alpha = (1 / float(option["embedding"])**0.5) * min(
        1 / float(global_step)**0.5,
        global_step / float(option["warmup"])**1.5)
    optim.update(alpha=alpha)
    t2 = time.time()

    msg = "G/E/S: %d/%d/%d cost: %f norm: %f time: %f"
    print(msg % (global_step, epoch, step, cost, norm, t2 - t1))

    return cost / math.log(2)
def sample_fn(*args, **kwargs):
    """Translate one random sentence of the batch and print it.

    ``args[0]`` is the batch; its first two items are the source and
    target sentences.  Three lines are printed: source, reference and the
    best beam-search hypothesis (empty when the search returns nothing).
    """
    data = args[0]
    batch = len(data[0])
    ind = np.random.randint(0, batch)
    sdata = data[0][ind]
    tdata = data[1][ind]

    # The whole batch is converted and then sliced, so the chosen sentence
    # keeps the batch's padding.
    xdata, _, xlen = convert_data(data[0], svocab, unk, eos)
    xdata = xdata[ind:ind + 1, :]
    xlen = xlen[ind:ind + 1]

    hls = beamsearch(model, xdata, xlen, **search_opt)

    # Beam search may return no finished hypothesis; indexing hls[0]
    # unconditionally would abort training with an IndexError.
    if len(hls) > 0:
        best, score = hls[0]
        # The last token of the hypothesis is dropped before printing.
        translation = " ".join(best[:-1])
    else:
        translation = ""

    print("> " + sdata)
    print("> " + tdata)
    print("> " + translation)
def decode(args):
    """Translate stdin line by line with a single ``rnnsearch`` model.

    The best hypothesis for each line goes to stdout (an empty line when
    the search returns nothing); ``count score seconds`` goes to stderr.
    """
    hparams, weights = load_model(args.model)
    model = rnnsearch(**hparams)
    set_variables(ops.trainable_variables(), weights)

    # Vocabulary and special symbols are taken from the model itself.
    src_vocabs, tgt_vocabs = model.option["vocabulary"]
    unk_sym = model.option["unk"]
    eos_sym = model.option["eos"]
    svocab, _isvocab = src_vocabs
    _tvocab, _itvocab = tgt_vocabs

    search_args = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize,
    }

    processed = 0
    # readline() yields "" only at end of input.
    for line in iter(sys.stdin.readline, ""):
        seq, mask = convert_data([line], svocab, unk_sym, eos_sym)

        started = time.time()
        hypotheses = beamsearch(model, seq, **search_args)
        finished = time.time()

        if len(hypotheses) == 0:
            text, score = "", -10000.0
        else:
            tokens, score = hypotheses[0]
            # Drop the final token of the hypothesis before joining.
            text = " ".join(tokens[:-1])

        sys.stdout.write(text)
        sys.stdout.write("\n")

        processed += 1
        sys.stderr.write(str(processed) + " ")
        sys.stderr.write(str(score) + " " + str(finished - started) + "\n")
def sample(args):
    """Draw up to ``args.batch`` samples for every line read from stdin.

    The encoded input is repeated ``args.batch`` times along axis 1 and
    passed to ``batchsample``.  Each returned example is written on its own
    line (a single empty line when nothing is returned); ``count seconds``
    is written to stderr.
    """
    hparams, weights = load_model(args.model)
    model = build_model(**hparams)
    set_variables(ops.trainable_variables(), weights)

    src_vocabs, tgt_vocabs = model.option["vocabulary"]
    unk_symbol = model.option["unk"]
    eos_symbol = model.option["eos"]
    svocab, _isvocab = src_vocabs
    _tvocab, _itvocab = tgt_vocabs

    processed = 0
    copies = args.batch

    # readline() yields "" only at end of input.
    for line in iter(sys.stdin.readline, ""):
        seq, mask = convert_data([line], svocab, unk_symbol, eos_symbol)

        # Timing deliberately includes the tiling of the input.
        started = time.time()
        tiled_seq = numpy.repeat(seq, copies, 1)
        tiled_mask = numpy.repeat(mask, copies, 1)
        samples = batchsample(model, tiled_seq, tiled_mask,
                              maxlen=args.maxlen)
        finished = time.time()

        processed += 1

        if len(samples) == 0:
            sys.stdout.write("\n")
        else:
            limit = min(args.batch, len(samples))
            for pos in range(limit):
                sys.stdout.write(" ".join(samples[pos]))
                sys.stdout.write("\n")

        sys.stderr.write(str(processed) + " " + str(finished - started) + "\n")
def translate(model, corpus, **opt):
    """Translate every line of a text file with beam search.

    Args:
        model: model exposing ``option`` with ``vocabulary``, ``unk`` and
            ``eos`` entries.
        corpus: path of the file to translate, one sentence per line.
        **opt: keyword options forwarded to ``beamsearch``.

    Returns:
        A list with one entry per input line: the token list of the best
        hypothesis without its last token, or ``[]`` when the search
        returned nothing.
    """
    svocab = model.option["vocabulary"][0][0]
    unk_symbol = model.option["unk"]
    eos_symbol = model.option["eos"]

    trans = []

    # `with` closes the corpus even if conversion or search raises.
    with open(corpus, "r") as fd:
        for line in fd:
            line = line.strip()
            # NOTE(review): the sequence length returned here is not passed
            # to beamsearch -- confirm that this matches its signature.
            data, _, length = convert_data([line], svocab, unk_symbol,
                                           eos_symbol)
            hls = beamsearch(model, data, **opt)
            if len(hls) > 0:
                best, score = hls[0]
                trans.append(best[:-1])
            else:
                trans.append([])

    return trans
def train(args):
    """Train (or resume training of) a model described by ``args``.

    Steps, in order: merge default, saved and command-line options;
    validate the external validation script; dump the settings to a YAML
    file; build the data stream, the progress tracker, the model and the
    optimizer; optionally copy pretrained parameters; then run the epoch
    loop with periodic printing, validation, saving and sampling.

    Any exception (including Ctrl-C) is printed, the progress tracker is
    terminated and the process exits with status 1.
    """
    option = default_option()

    # predefined model names
    pathname, basename = os.path.split(args.model)
    modelname = get_filename(basename)
    autoname_format = os.path.join(pathname, modelname + ".iter{epoch}-{batch}.pkl")
    bestname = os.path.join(pathname, modelname + ".best.pkl")

    # load models
    # An existing model file means we are resuming: its options override
    # the defaults and its parameters are restored after the model is built.
    if os.path.exists(args.model):
        opt, params = load_model(args.model)
        override(option, opt)
        init = False
    else:
        init = True

    # Optional second checkpoint used only as a source of pretrained values.
    if args.initialize:
        pretrain_params = load_model(args.initialize)
        pretrain_params = pretrain_params[1]
        pretrain = True
    else:
        pretrain = False

    # Command-line arguments are applied last.
    override(option, args_to_dict(args))

    # check external validation script
    ext_val_script = option['ext_val_script']
    if not os.path.exists(ext_val_script):
        raise ValueError('File doesn\'t exist: %s' % ext_val_script)
    elif not os.access(ext_val_script, os.X_OK):
        raise ValueError('File is not executable: %s' % ext_val_script)

    # check references format
    ref_stem = None
    if option['validation'] and option['references']:
        ref_stem = misc.infer_ref_stem([option['validation']], option['references'])
        ref_stem = ref_stem[0]

    # .yaml for ultimate options
    # Written on a fresh run, or when the settings file is missing.
    yaml_name = "%s.settings.yaml" % modelname
    if init or not os.path.exists(yaml_name):
        with open(yaml_name, "w") as w:
            _opt = args.__dict__.copy()
            # Only existing keys are reassigned, so mutating the dict
            # while iterating it is safe here.
            for k, v in _opt.iteritems():
                if k in option:
                    _opt[k] = option[k]
            yaml.dump(_opt, w, default_flow_style=False)
            del _opt

    print_option(option)

    # reader
    batch = option["batch"]
    sortk = option["sort"]
    shuffle = option["shuffle"]
    reader = textreader(option["corpus"], shuffle)
    processor = [data_length, data_length]
    stream = textiterator(reader, [batch, batch * sortk], processor, option["limit"], option["sort"])

    # progress
    # initialize before building model
    progress = Progress(option["delay_val"], stream, option["seed"])

    # create model
    regularizer = []

    if option["l1_scale"]:
        regularizer.append(ops.l1_regularizer(option["l1_scale"]))

    if option["l2_scale"]:
        regularizer.append(ops.l2_regularizer(option["l2_scale"]))

    scale = option["scale"]
    initializer = ops.random_uniform_initializer(-scale, scale)
    regularizer = ops.sum_regularizer(regularizer)

    option["scope"] = "rnnsearch"
    model = build_model(initializer=initializer, regularizer=regularizer, **option)

    # Passed to the optimizer as tune_opt["variables"]; never reassigned here.
    variables = None

    if pretrain:
        print "using pretrain"
        # Re-key the pretrained values: the first path component is always
        # dropped; for non-embedding entries the second one is dropped too.
        _pp1 = {}
        for name, val in pretrain_params:
            names = name.split('/')[1:]
            if "embedding" in names[0]:
                _pp1['/'.join(names)] = val
            else:
                _pp1['/'.join(names[1:])] = val
        matched = []
        not_matched = []
        for var in ops.trainable_variables():
            names = var.name.split('/')[1:]
            # Variables whose name contains "decoder2" keep their fresh
            # initialisation.
            if "decoder2" in var.name:
                not_matched.append((var.name, var.get_value().size))
                continue
            # Same re-keying as above; a missing key raises KeyError.
            if "embedding" in names[0]:
                match_name = '/'.join(names)
                var.set_value(_pp1[match_name])
            else:
                match_name = '/'.join(names[1:])
                var.set_value(_pp1[match_name])
            matched.append((var.name, var.get_value().size))
        print "------------------- matched -------------------"
        for name, size in matched:
            print name, size
        print "------------------- not matched -------------------"
        for name, size in not_matched:
            print name, size
        print "------------------- end -------------------\n"

    # When resuming, the saved parameters are applied after (and therefore
    # on top of) any pretrained values.
    if not init:
        set_variables(ops.trainable_variables(), params)

    print "parameters: %d\n" % count_parameters(ops.trainable_variables())

    # tuning option
    tune_opt = {}
    tune_opt["algorithm"] = option["optimizer"]
    tune_opt["constraint"] = ("norm", option["norm"])
    tune_opt["norm"] = True
    tune_opt["variables"] = variables

    # create optimizer
    scopes = [".*"]
    trainer = optimizer(model.inputs, model.outputs, model.cost, scopes, **tune_opt)

    # vocabulary and special symbol
    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs
    unk_sym = option["unk"]
    eos_sym = option["eos"]

    alpha = option["alpha"]

    maxepoch = option["maxepoch"]

    # restore right before training to avoid randomness changing when trying to resume progress
    if not args.reset:
        if "#progress" in option:
            print 'Restore progress >>'
            progress = (option["#progress"])
            stream = progress.iterator
            stream.set_processor(processor)
            # Every restored task gets status 4 and result 0.0.
            # NOTE(review): the meaning of status code 4 is not visible
            # here -- confirm against the task manager.
            for ttt in progress.task_manager.tasks:
                ttt.status = 4
                ttt.result = 0.0
        else:
            print 'New progress >>'
    else:
        print 'Discard progress >>'

    # setup progress
    progress.oldname = args.model
    progress.serializer = serialize

    stream = progress.iterator
    overwrite = not args.no_overwrite

    if progress.task_manager:
        print progress.task_manager

    try:
        while progress.epoch < maxepoch:
            epc = progress.epoch
            for data in stream:
                progress.tic()
                if progress.failed():
                    raise RuntimeError("progress failure")

                xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym)
                ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym)
                # Second conversion of the targets with an extra positional
                # flag (True); its effect is defined by convert_data.
                bydata, _ = convert_data(data[1], tvocab, unk_sym, eos_sym, True)

                t1 = time.time()
                tot_cost, soft_cost, true_cost, norm = trainer.optimize(
                    xdata, xmask, ydata, ymask, bydata)
                trainer.update(alpha=alpha)
                t2 = time.time()

                # per word cost
                w_cost = true_cost * ymask.shape[1] / ymask.sum()

                progress.batch_count += 1
                progress.batch_total += 1
                progress.loss_hist.append(w_cost)

                count = progress.batch_count

                # A falsy pfreq prints every batch.
                if not args.pfreq or count % args.pfreq == 0:
                    print epc + 1, progress.batch_count, w_cost, tot_cost, soft_cost, true_cost, norm, t2 - t1

                if count % option["vfreq"] == 0 and not should_skip_val(
                        args.skip_val, option["vfreq"], epc, progress.batch_total):
                    if option["validation"] and option["references"]:
                        progress.add_valid(option['scope'], option['validation'], ref_stem, ext_val_script, __file__, option, modelname, bestname, serialize)

                # save after validation
                progress.toc()

                if count % option["freq"] == 0:
                    progress.save(option, autoname_format, overwrite)

                progress.tic()

                # Periodically translate one random sentence of the batch.
                if count % option["sfreq"] == 0:
                    n = len(data[0])
                    ind = numpy.random.randint(0, n)
                    sdata = data[0][ind]
                    tdata = data[1][ind]
                    xdata = xdata[:, ind:ind + 1]
                    xmask = xmask[:, ind:ind + 1]

                    hls = beamsearch(model, xdata, xmask)
                    # NOTE(review): raises IndexError if beamsearch returns
                    # an empty list.
                    best, score = hls[0]

                    print "--", sdata
                    print "--", tdata
                    print "--", " ".join(best[:-1])
                progress.toc()

            # End of epoch: validate once more.
            print "--------------------------------------------------"
            progress.tic()
            if option["validation"] and option["references"]:
                progress.add_valid(option['scope'], option['validation'], ref_stem, ext_val_script, __file__, option, modelname, bestname, serialize)
            print "--------------------------------------------------"
            progress.toc()

            # early stopping
            # (what actually happens: from epoch option["stop"] on, the
            # learning rate is multiplied by option["decay"] every epoch)
            if epc + 1 >= option["stop"]:
                alpha = alpha * option["decay"]

            stream.reset()

            progress.epoch += 1
            progress.batch_count = 0
            # update autosave
            option["alpha"] = alpha
            progress.save(option, autoname_format, overwrite)

        stream.close()

        progress.tic()
        print "syncing ..."
        progress.barrier()  # hangup and wait
        progress.toc()

        # NOTE(review): max() raises ValueError when valid_hist is empty;
        # that is caught below and turns into exit status 1.
        best_valid = max(progress.valid_hist, key=lambda item: item[1])
        (epc, count), score = best_valid

        print "best bleu {}-{}: {:.4f}".format(epc + 1, count, score)

        if progress.delay_val:
            task_elapse = sum(
                [task.elapse for task in progress.task_manager.tasks])
            print "training finished in {}({})".format(
                datetime.timedelta(seconds=int(progress.elapse)),
                datetime.timedelta(seconds=int(progress.elapse + task_elapse)))
        else:
            print "training finished in {}".format(
                datetime.timedelta(seconds=int(progress.elapse)))
        progress.save(option, autoname_format, overwrite)

    except KeyboardInterrupt:
        traceback.print_exc()
        progress.terminate()
        sys.exit(1)
    except Exception:
        traceback.print_exc()
        progress.terminate()
        sys.exit(1)
def replace(args):
    """Replace unknown-word symbols in translations using attention.

    ``args.text`` is read in batches of (source, target) lines.  The
    attention matrices of all models in ``args.model`` are averaged
    (arithmetically when ``args.arithmetic`` is set, geometrically
    otherwise).  Every target token equal to the unk symbol is replaced by
    the source word it attends to most, optionally mapped through the
    dictionary according to ``args.heuristic``.  Results go to stdout.

    Raises:
        ValueError: if ``args.heuristic > 0`` without ``args.dictionary``.
    """
    num_models = len(args.model)

    if args.dictionary:
        mapping = load_dictionary(args.dictionary)
        heuristic = args.heuristic
    elif args.heuristic > 0:
        raise ValueError("heuristic > 0, but no dictionary available")
    else:
        heuristic = 0

    models = []
    for index in range(num_models):
        hparams, weights = load_model(args.model[index])
        prefix = "rnnsearch_%d" % index
        hparams["scope"] = prefix
        network = build_model(**hparams)
        scoped_vars = get_variables_with_prefix(prefix)
        set_variables(scoped_vars, weights)
        models.append(network)

    # The first model supplies the vocabulary and special symbols.
    src_vocabs, tgt_vocabs = models[0].option["vocabulary"]
    unk_symbol = models[0].option["unk"]
    eos_symbol = models[0].option["eos"]
    svocab, _isvocab = src_vocabs
    tvocab, _itvocab = tgt_vocabs

    stream = textiterator(textreader(args.text, False),
                          [args.batch, args.batch])

    for batch in stream:
        src_lines = batch[0]
        tgt_lines = batch[1]
        xdata, xmask = convert_data(src_lines, svocab, unk_symbol, eos_symbol)
        ydata, ymask = convert_data(tgt_lines, tvocab, unk_symbol, eos_symbol)

        # Attention of every model; each is tgt_len * src_len * batch.
        attentions = [m.align(xdata, xmask, ydata, ymask) for m in models]

        if args.arithmetic:
            combined = sum(attentions) / num_models
        else:
            logs = [numpy.log(a) for a in attentions]
            combined = numpy.exp(sum(logs) / num_models)

        # Most-attended source position for each target position.
        indices = numpy.argmax(combined, 1)

        for i in range(len(tgt_lines)):
            source_words = src_lines[i].strip().split()
            target_words = tgt_lines[i].strip().split()
            source_length = len(source_words)
            output = []

            for j, word in enumerate(target_words):
                if word != unk_symbol:
                    output.append(word)
                    continue

                source_index = indices[j, i]
                # Attention pointing past the real sentence: keep the unk.
                if source_index >= source_length:
                    output.append(word)
                    continue

                source_word = source_words[source_index]
                if not (heuristic and source_word in mapping):
                    output.append(source_word)
                elif heuristic == 1:
                    output.append(mapping[source_word])
                elif source_word.decode("utf-8")[0].islower():
                    # Only words starting with a lower-case letter are
                    # looked up; others are copied verbatim.
                    output.append(mapping[source_word])
                else:
                    output.append(source_word)

            sys.stdout.write(" ".join(output))
            sys.stdout.write("\n")

    stream.close()
def train(args): option = default_option() # predefined model names pathname, basename = os.path.split(args.model) modelname = get_filename(basename) autoname_format = os.path.join(pathname, modelname + ".iter{epoch}-{batch}.pkl") bestname = os.path.join(pathname, modelname + ".best.pkl") # load models if os.path.exists(args.model): opt, params = load_model(args.model) override(option, opt) init = False else: init = True if args.initialize: print "initialize:", args.initialize pretrain_params = load_model(args.initialize) pretrain_params = pretrain_params[1] pretrain = True else: pretrain = False override(option, args_to_dict(args)) # check external validation script ext_val_script = option['ext_val_script'] if not os.path.exists(ext_val_script): raise ValueError('File doesn\'t exist: %s' % ext_val_script) elif not os.access(ext_val_script, os.X_OK): raise ValueError('File is not executable: %s' % ext_val_script) # check references format ref_stem = option['references'] if option['validation'] and option['references']: ref_stem = misc.infer_ref_stem([option['validation']], option['references']) ref_stem = ref_stem[0] # .yaml for ultimate options yaml_name = "%s.settings.yaml" % modelname if init or not os.path.exists(yaml_name): with open(yaml_name, "w") as w: _opt = args.__dict__.copy() for k, v in _opt.iteritems(): if k in option: _opt[k] = option[k] yaml.dump(_opt, w, default_flow_style=False) del _opt print_option(option) # reader batch = option["batch"] sortk = option["sort"] shuffle = option["shuffle"] reader = textreader(option["corpus"][:3], shuffle) processor = [data_length, data_length, data_length] stream = textiterator(reader, [batch, batch * sortk], processor, option["limit"], option["sort"]) reader = textreader(option["corpus"][3:], shuffle) processor = [data_length, data_length, data_length] dstream = textiterator(reader, [batch, batch * sortk], processor, None, option["sort"]) # progress # initialize before building model progress = 
Progress(option["delay_val"], stream, option["seed"]) # create model regularizer = [] if option["l1_scale"]: regularizer.append(ops.l1_regularizer(option["l1_scale"])) if option["l2_scale"]: regularizer.append(ops.l2_regularizer(option["l2_scale"])) scale = option["scale"] initializer = ops.random_uniform_initializer(-scale, scale) regularizer = ops.sum_regularizer(regularizer) option["scope"] = "rnnsearch" model = build_model(initializer=initializer, regularizer=regularizer, **option) variables = None if pretrain: matched, not_matched = match_variables(ops.trainable_variables(), pretrain_params) if args.finetune: variables = not_matched if not variables: raise RuntimeError("no variables to finetune") if pretrain: restore_variables(matched, not_matched) if not init: set_variables(ops.trainable_variables(), params) print "parameters: %d\n" % count_parameters(ops.trainable_variables()) # tuning option tune_opt = {} tune_opt["algorithm"] = option["optimizer"] tune_opt["constraint"] = ("norm", option["norm"]) tune_opt["norm"] = True tune_opt["variables"] = variables # create optimizer scopes = ["((?!Shared).)*$"] trainer = optimizer(model.inputs, model.outputs, model.cost, scopes, **tune_opt) clascopes = [".*(Shared).*"] clatrainer = optimizer(model.inputs_cla, model.outputs_cla, model.cost_cla, clascopes, **tune_opt) #scopes = [".*(DSAenc).*"] #domain_trainer = optimizer(model.inputs, model.toutputs, model.domaincost, scopes, **tune_opt) # vocabulary and special symbol svocabs, tvocabs = option["vocabulary"] svocab, isvocab = svocabs tvocab, itvocab = tvocabs unk_sym = option["unk"] eos_sym = option["eos"] alpha = option["alpha"] maxepoch = option["maxepoch"] # restore right before training to avoid randomness changing when trying to resume progress if not args.reset: if "#progress" in option: print 'Restore progress >>' progress = (option["#progress"]) stream = progress.iterator stream.set_processor(processor) else: print 'New progress >>' else: print 'Discard 
progress >>' if args.drop_tasks: print 'drop tasks' progress.drop_tasks() # setup progress progress.oldname = args.model progress.serializer = serialize stream = progress.iterator overwrite = not args.no_overwrite if progress.task_manager: print progress.task_manager register_killer() tagvocab = {} for idx, d in enumerate(option["dvocab"]): tagvocab[d] = idx if len(tagvocab) != option["dnum"]: raise ValueError('length of domain vocab %f not equal to domain num %f!' % (len(tagvocab), option["dnum"])) try: while progress.epoch < maxepoch: epc = progress.epoch for data in stream: progress.tic() if progress.failed(): raise RuntimeError("progress failure") # data = _stream.next() xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym) ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym) tag = convert_tag(data[2], tagvocab) t1 = time.time() cost, dcost, scost, tdcost, norm = trainer.optimize(xdata, xmask, ydata, ymask, tag) clacost, _ = clatrainer.optimize(xdata, xmask, tag) trainer.update(alpha=alpha) clatrainer.update(alpha=alpha) t2 = time.time() # per word cost w_cost = cost * ymask.shape[1] / ymask.sum() progress.batch_count += 1 progress.batch_total += 1 progress.loss_hist.append(w_cost) if not args.pfreq or count % args.pfreq == 0: print epc + 1, progress.batch_count, w_cost, dcost, tdcost, scost, clacost, norm, t2 - t1 count = progress.batch_count if count % option["sfreq"] == 0: dright = 0.0 sright = 0.0 tdright = 0.0 total = 0.0 for ddata in dstream: txdata, txmask = convert_data(ddata[0], svocab, unk_sym, eos_sym) tydata, tymask = convert_data(ddata[1], tvocab, unk_sym, eos_sym) txtag = convert_tag(ddata[2], tagvocab) dtag_pred, stag_pred = model.tag_predict(txdata, txmask) txtag = txtag[0] dpretag = [] for i in dtag_pred: dpretag.append(int(i)) spretag = [] for i in stag_pred: spretag.append(int(i)) tdtag_pred = model.tgt_tag_predict(txdata, txmask, tydata, tymask) tdpretag = [] for i in tdtag_pred[0]: tdpretag.append(int(i)) dright = dright 
+ sum([m == n for m, n in zip(txtag, dpretag)]) sright = sright + sum([m == n for m, n in zip(txtag, spretag)]) tdright = tdright + sum([m == n for m, n in zip(txtag, tdpretag)]) total = total + len(dpretag) dstream.reset() dacc = dright * 1.0 / total sacc = sright * 1.0 / total tdacc = tdright * 1.0 / total print "dacc:", dright, dacc print "sacc", sright, sacc print "tdacc", tdright, tdacc if count % option["vfreq"] == 0 and not should_skip_val(args.skip_val, option["vfreq"], epc, progress.batch_total): if option["validation"] and option["references"]: progress.add_valid(option['scope'], option['validation'], ref_stem, ext_val_script, __file__, option, modelname, bestname, serialize) # save after validation progress.toc() if count % option["freq"] == 0: progress.save(option, autoname_format, overwrite) progress.tic() if count % option["sfreq"] == 0: n = len(data[0]) ind = numpy.random.randint(0, n) sdata = data[0][ind] tdata = data[1][ind] xdata = xdata[:, ind: ind + 1] xmask = xmask[:, ind: ind + 1] hls = beamsearch(model, xdata, xmask) best, score = hls[0] print "--", sdata print "--", tdata print "--", " ".join(best[:-1]) progress.toc() print "--------------------------------------------------" progress.tic() if option["validation"] and option["references"]: progress.add_valid(option['scope'], option['validation'], ref_stem, ext_val_script, __file__, option, modelname, bestname, serialize) print "--------------------------------------------------" progress.toc() print "epoch cost {}".format(numpy.mean(progress.loss_hist)) progress.loss_hist = [] # early stopping if epc + 1 >= option["stop"]: alpha = alpha * option["decay"] stream.reset() progress.epoch += 1 progress.batch_count = 0 # update autosave option["alpha"] = alpha progress.save(option, autoname_format, overwrite) stream.close() progress.tic() print "syncing ..." 
progress.barrier() # hangup and wait progress.toc() best_valid = max(progress.valid_hist, key=lambda item: item[1]) (epc, count), score = best_valid print "best bleu {}-{}: {:.4f}".format(epc + 1, count, score) if progress.delay_val: task_elapse = sum([task.elapse for task in progress.task_manager.tasks]) print "training finished in {}({})".format(datetime.timedelta(seconds=int(progress.elapse)), datetime.timedelta(seconds=int(progress.elapse + task_elapse))) else: print "training finished in {}".format(datetime.timedelta(seconds=int(progress.elapse))) progress.save(option, autoname_format, overwrite) except KeyboardInterrupt: traceback.print_exc() progress.terminate() sys.exit(1) except Exception: traceback.print_exc() progress.terminate() sys.exit(1)
def rescore(args):
    """Append each model's sentence cost to every line of an n-best list.

    The n-best list (``args.n_best_list`` or stdin) has lines of the form
    ``idx ||| translation ||| ...`` where ``idx`` is the 0-based line
    number in ``args.source``.  For every model in ``args.model`` the
    (source, translation) pairs are scored with ``get_snt_cost``; with
    ``args.normalize`` the cost is divided by the column sums of the
    target mask.  Each input line is written back, stripped, followed by
    the space-separated scores, to ``args.output_n_best`` or stdout.
    """
    models = []
    for i, path in enumerate(args.model):
        option, params = load_model(path)
        scope = "rnnsearch_%d" % i
        option['scope'] = scope
        model = build_model(**option)
        var_list = get_variables_with_prefix(scope)
        set_variables(var_list, params)
        models.append(model)

    with open(args.source) as source_file:
        lines = source_file.readlines()

    if args.n_best_list:
        with open(args.n_best_list) as nbest_file:
            nbest_lines = nbest_file.readlines()
    else:
        nbest_lines = sys.stdin.readlines()

    scores = []
    for model in models:
        # The pairs are materialised as two parallel temporary files
        # because textreader is given file names.
        # NOTE(review): NamedTemporaryFile defaults to binary mode;
        # writing str objects relies on Python 2 semantics.
        with tempfile.NamedTemporaryFile() as src, tempfile.NamedTemporaryFile() as tgt:
            for line in nbest_lines:
                linesplit = line.split(' ||| ')
                idx = int(linesplit[0])  # index from the source file, starting from 0
                src.write(lines[idx])
                tgt.write(linesplit[1] + '\n')
            src.seek(0)
            tgt.seek(0)

            option = model.option
            svocabs, tvocabs = option["vocabulary"]
            unk_sym = option["unk"]
            eos_sym = option["eos"]
            svocab, isvocab = svocabs
            tvocab, itvocab = tvocabs

            reader = textreader([src.name, tgt.name])
            stream = textiterator(reader, [args.batch, args.batch])
            scores_i = []
            for x, y in stream:
                xdata, xmask = convert_data(x, svocab, unk_sym, eos_sym)
                ydata, ymask = convert_data(y, tvocab, unk_sym, eos_sym)
                _scores = model.get_snt_cost(xdata, xmask, ydata, ymask)
                if args.normalize:
                    _scores = _scores / numpy.sum(ymask, 0)
                scores_i.append(_scores)
            scores.append(numpy.concatenate(scores_i))

    if args.output_n_best:
        writer = open(args.output_n_best, 'w')
    else:
        writer = sys.stdout

    try:
        write = writer.write
        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            write('{0} {1}\n'.format(line.strip(), score_str))
    finally:
        # Close the output file even if writing fails; never close stdout.
        if args.output_n_best:
            writer.close()
def decode(args):
    """Translate ``args.corpus`` into ``args.translation`` with an NMT model.

    The model is rebuilt from the options stored in ``args.model`` and its
    trainable variables are set from the stored values inside a TensorFlow
    session.  Exactly one output line is written per input line (an empty
    line when beam search returns nothing), and a ``count score seconds``
    record is written to stderr.
    """
    option, values = load_model(args.model)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    svocabs, tvocabs = option["vocabulary"]
    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    unk_sym = option["unk"]
    eos_sym = option["eos"]

    source_word2vec, target_word2vec = option["word2vecs"]

    count = 0

    doption = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize
    }

    # create graph
    model = NMT(option["num_layers"], option["num_heads"],
                option["attention_dropout"], option["residual_dropout"],
                option["relu_dropout"], option["embedding"],
                option["hidden"], option["filter"], len(isvocab),
                len(itvocab), source_word2vec, target_word2vec)

    model.option = option

    # `with` closes both files even if decoding raises.
    with open(args.corpus, 'r') as input_file, \
            open(args.translation, 'w') as output_file:
        with tf.Session(config=config):
            tf.global_variables_initializer().run()
            set_variables(tf.trainable_variables(), values)

            for line in iter(input_file.readline, ""):
                data = [line]
                seq, _, seq_len = convert_data(data, svocab, unk_sym, eos_sym)
                t1 = time.time()
                tlist = beamsearch(model, seq, seq_len, **doption)
                t2 = time.time()

                if len(tlist) == 0:
                    # The empty line used to go to stdout, leaving the
                    # translation file one line short and misaligned.
                    output_file.write("\n")
                    score = -10000.0
                else:
                    best, score = tlist[0]
                    # Drop the final token of the hypothesis.
                    output_file.write(" ".join(best[:-1]))
                    output_file.write("\n")

                count = count + 1
                sys.stderr.write(str(count) + " ")
                sys.stderr.write(str(score) + " " + str(t2 - t1) + "\n")
def decode(args):
    """Translate stdin with an ensemble, optionally reporting an oracle.

    Each path in ``args.model`` is loaded under its own ``rnnsearch_<i>``
    scope.  Without ``args.oracle`` the best hypothesis is written per
    input line.  With it, references are loaded and every output line is::

        best ||| best_score ||| oracle_index ||| oracle ||| oracle_score

    where the oracle is the beam entry with the highest smoothed BLEU
    against the reference of the current sentence.  When the search
    returns nothing an empty line is written, so output stays aligned
    with input.  A ``count score seconds`` record (``score`` being the
    model score of the top hypothesis) is written to stderr.
    """
    models = []
    for i, path in enumerate(args.model):
        option, params = load_model(path)
        scope = "rnnsearch_%d" % i
        model = rnnsearch(scope=scope, **option)
        var_list = get_variables_with_prefix(scope)
        set_variables(var_list, params)
        models.append(model)

    # use the first model
    svocabs, tvocabs = models[0].option["vocabulary"]
    unk_sym = models[0].option["unk"]
    eos_sym = models[0].option["eos"]

    count = 0

    svocab, isvocab = svocabs
    tvocab, itvocab = tvocabs

    option = {
        "maxlen": args.maxlen,
        "minlen": args.minlen,
        "beamsize": args.beamsize,
        "normalize": args.normalize,
        "arithmetic": args.arithmetic,
    }

    if args.oracle:
        references = load_references(args.oracle)
    else:
        references = None

    # readline() returns "" only at EOF.
    for line in iter(sys.stdin.readline, ""):
        data = [line]
        seq, mask = convert_data(data, svocab, unk_sym, eos_sym)
        t1 = time.time()
        tlist = beamsearch(models, seq, **option)
        t2 = time.time()

        if len(tlist) == 0:
            # Previously nothing was written here, which shifted all
            # following translations up by one line.
            score = -10000.0
            sys.stdout.write("\n")
        elif references is None:
            best, score = tlist[0]
            translation = " ".join(best[:-1])
            sys.stdout.write(translation)
            sys.stdout.write("\n")
        else:
            # Report the top hypothesis' score; the loop below uses its
            # own variable so it no longer overwrites `score` with the
            # last beam entry's value.
            score = tlist[0][1]
            best_ind = 0
            best_score = 0
            # find the best translation according to oracle
            for i, (trans, hyp_score) in enumerate(tlist):
                trans = trans[:-1]
                bleu_score = bleu([trans], [references[count]],
                                  smoothing=True)
                if bleu_score > best_score:
                    best_score = bleu_score
                    best_ind = i
            output = " ".join(tlist[0][0][:-1]) + " ||| "
            output += str(tlist[0][1]) + " ||| " + str(best_ind) + " ||| "
            output += " ".join(tlist[best_ind][0][:-1]) + " ||| "
            output += str(tlist[best_ind][1])
            sys.stdout.write(output)
            sys.stdout.write("\n")

        # `count` indexes `references` above, so bump it only afterwards.
        count = count + 1
        sys.stderr.write(str(count) + " ")
        sys.stderr.write(str(score) + " " + str(t2 - t1) + "\n")
def train(args): option = default_option() # predefined model names pathname, basename = os.path.split(args.model) modelname = get_filename(basename) autoname = os.path.join(pathname, modelname + ".autosave.pkl") bestname = os.path.join(pathname, modelname + ".best.pkl") # load models if os.path.exists(args.model): opt, params = load_model(args.model) option = opt init = False else: init = True if args.initialize: init_params = load_model(args.initialize) init_params = init_params[1] restore = True else: restore = False override(option, args_to_dict(args)) print_option(option) # load references if option["references"]: references = load_references(option["references"]) else: references = None if args.skip_val: references = None criterion = option["criterion"] if criterion == "mrt": sys.stderr.write("warning: In MRT mode, batch is set to 1\n") # input corpus batch = option["batch"] if criterion == "mle" else 1 sortk = option["sort"] or 1 if criterion == "mle" else 1 shuffle = option["seed"] if option["shuffle"] else None reader = textreader(option["corpus"], shuffle) processor = [data_length, data_length] stream = textiterator(reader, [batch, batch * sortk], processor, option["limit"], option["sort"]) if shuffle and option["indices"] is not None: reader.set_indices(option["indices"]) if args.reset: option["count"] = [0, 0] option["epoch"] = 0 option["cost"] = 0.0 skip_stream(reader, option["count"][1]) epoch = option["epoch"] maxepoch = option["maxepoch"] # create model regularizer = [] if option["l1_scale"]: regularizer.append(ops.l1_regularizer(option["l1_scale"])) if option["l2_scale"]: regularizer.append(ops.l2_regularizer(option["l2_scale"])) scale = option["scale"] initializer = ops.random_uniform_initializer(-scale, scale) regularizer = ops.sum_regularizer(regularizer) # set seed numpy.random.seed(option["seed"]) model = rnnsearch(initializer=initializer, regularizer=regularizer, **option) variables = None if restore: matched, not_matched = 
match_variables(ops.trainable_variables(), init_params) if args.finetune: variables = not_matched if not variables: raise RuntimeError("no variables to finetune") if not init: set_variables(ops.trainable_variables(), params) if restore: restore_variables(matched, not_matched) print "parameters:", count_parameters(ops.trainable_variables()) # tuning option tune_opt = {} tune_opt["algorithm"] = option["optimizer"] tune_opt["constraint"] = ("norm", option["norm"]) tune_opt["norm"] = True tune_opt["variables"] = variables # create optimizer trainer = optimizer(model, **tune_opt) # beamsearch option search_opt = {} search_opt["beamsize"] = option["beamsize"] search_opt["normalize"] = option["normalize"] search_opt["maxlen"] = option["maxlen"] search_opt["minlen"] = option["minlen"] # vocabulary and special symbol svocabs, tvocabs = option["vocabulary"] svocab, isvocab = svocabs tvocab, itvocab = tvocabs unk_sym = option["unk"] eos_sym = option["eos"] # summary count = option["count"][0] totcost = option["cost"] best_score = option["bleu"] alpha = option["alpha"] sharp = option["sharp"] for i in range(epoch, maxepoch): for data in stream: xdata, xmask = convert_data(data[0], svocab, unk_sym, eos_sym) ydata, ymask = convert_data(data[1], tvocab, unk_sym, eos_sym) if criterion == "mrt": refs = [] for item in data[1]: item = item.split() item = [unk_sym if word not in tvocab else word for word in item] refs.append(" ".join(item)) t1 = time.time() # sample from model nsample = option["sample"] - len(refs) xdata = numpy.repeat(xdata, nsample, 1) xmask = numpy.repeat(xmask, nsample, 1) maxlen = int(1.5 * len(ydata)) examples = batchsample(model, xdata, xmask, maxlen) space = build_sample_space(refs, examples) score = numpy.zeros((len(space),), "float32") refs = [ref.split() for ref in refs] for j in range(len(space)): example = space[j].split() score[j] = 1.0 - bleu([example], [refs], smoothing=True) ydata, ymask = convert_data(space, tvocab, unk_sym, eos_sym) cost, norm = 
trainer.optimize(xdata[:, 0:1], xmask[:, 0:1], ydata, ymask, score, sharp) trainer.update(alpha=alpha) t2 = time.time() totcost += cost count += 1 t = t2 - t1 ac = totcost / count print i + 1, count, len(space), cost, norm, ac, t else: t1 = time.time() cost, norm = trainer.optimize(xdata, xmask, ydata, ymask) trainer.update(alpha = alpha) t2 = time.time() count += 1 cost = cost * ymask.shape[1] / ymask.sum() totcost += cost / math.log(2) print i + 1, count, cost, norm, t2 - t1 # autosave if count % option["freq"] == 0: option["indices"] = reader.get_indices() option["bleu"] = best_score option["cost"] = totcost option["count"] = [count, reader.count] serialize(autoname, option) if count % option["vfreq"] == 0: if option["validation"] and references: trans = translate(model, option["validation"], **search_opt) bleu_score = bleu(trans, references) print "bleu: %2.4f" % bleu_score if bleu_score > best_score: best_score = bleu_score option["indices"] = reader.get_indices() option["bleu"] = best_score option["cost"] = totcost option["count"] = [count, reader.count] serialize(bestname, option) if count % option["sfreq"] == 0: n = len(data[0]) ind = numpy.random.randint(0, n) sdata = data[0][ind] tdata = data[1][ind] xdata = xdata[:, ind : ind + 1] xmask = xmask[:, ind : ind + 1] hls = beamsearch(model, xdata, xmask) best, score = hls[0] print sdata print tdata print " ".join(best[:-1]) print "--------------------------------------------------" if option["validation"] and references: trans = translate(model, option["validation"], **search_opt) bleu_score = bleu(trans, references) print "iter: %d, bleu: %2.4f" % (i + 1, bleu_score) if bleu_score > best_score: best_score = bleu_score option["indices"] = reader.get_indices() option["bleu"] = best_score option["cost"] = totcost option["count"] = [count, reader.count] serialize(bestname, option) print "averaged cost: ", totcost / count print "--------------------------------------------------" # early stopping if i + 1 >= 
option["stop"]: alpha = alpha * option["decay"] count = 0 totcost = 0.0 stream.reset() # update autosave option["epoch"] = i + 1 option["alpha"] = alpha option["indices"] = reader.get_indices() option["bleu"] = best_score option["cost"] = totcost option["count"] = [0, 0] serialize(autoname, option) print "best(bleu): %2.4f" % best_score stream.close()