def train_model(self):
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

    trace('making model ...')
    model = self.new(src_vocab, trg_vocab, self.embed, self.hidden, self.parameter_dict)
    random_number = random.randint(0, self.minibatch)

    for i_epoch in range(self.epoch):
        trace('epoch %d/%d: ' % (i_epoch + 1, self.epoch))
        trained = 0
        gen1 = gens.word_list(self.source)
        gen2 = gens.word_list(self.target)
        gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
        model.init_optimizer()

        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch = model.train(src_batch, trg_batch)

            if trained == 0:
                self.print_out(random_number, i_epoch, trained, src_batch, trg_batch, hyp_batch)

            trained += K

    trace('saving model ...')
    model.save("ChainerMachineTranslation" + '.%03d' % (self.epoch + 1))

    trace('finished.')
def wakati_all_text(path_list, mode):
    leng = len(path_list) - 1
    out = ""
    for i, path in enumerate(path_list):
        trace(mode, "wakati", i, "/", leng)
        out += wakati(path, mode)
    return out
def test(self):
    """
    Call the Attention Dialogue Test
    """
    trace('initializing ...')
    encoderDecoderModel = EncoderDecoderModelAttention(self.parameter_dict)
    encoderDecoderModel.test()
def train(self):
    """
    Call the Attention Dialogue Training
    """
    trace('initializing ...')
    encoderDecoderModel = EncoderDecoderModelAttention(self.parameter_dict)
    encoderDecoderModel.train()
def test(self):
    """
    Call the Dialogue Test
    """
    trace("initializing ...")
    encoderDecoderModel = EncoderDecoderModel(self.parameter_dict)
    encoderDecoderModel.test()
def main():
    args = parse_args()

    trace('initializing ...')
    wrapper.init()

    if args.mode == 'train':
        train_model(args)
    elif args.mode == 'test':
        test_model(args)
def train(self):
    """
    Call the Dialogue Training
    """
    trace("initializing ...")
    encoderDecoderModel = EncoderDecoderModel(self.parameter_dict)
    encoderDecoderModel.train()
def train(self):
    """
    Train method.
    If a word2vec model is given, its weights are copied into the embedding
    (and decoder) layers. The optimizer is AdaGrad.
    """
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

    trace('making model ...')
    self.attention_dialogue = AttentionDialogue(self.vocab, self.embed, self.hidden, self.XP)
    if self.word2vecFlag:
        self.copy_model(self.word2vec, self.attention_dialogue.emb)
        self.copy_model(self.word2vec, self.attention_dialogue.dec, dec_flag=True)

    for epoch in range(self.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
        trained = 0
        gen1 = gens.word_list(self.source)
        gen2 = gens.word_list(self.target)
        gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(self.attention_dialogue)
        opt.add_hook(optimizer.GradientClipping(5))

        random_number = random.randint(0, self.minibatch - 1)
        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch, loss = self.forward_implement(src_batch, trg_batch, src_vocab, trg_vocab,
                                                     self.attention_dialogue, True, 0)
            loss.backward()
            opt.update()
            self.print_out(random_number, epoch, trained, src_batch, trg_batch, hyp_batch)
            trained += K

    trace('saving model ...')
    prefix = self.model
    model_path = APP_ROOT + "/model/" + prefix
    src_vocab.save(model_path + '.srcvocab')
    trg_vocab.save(model_path + '.trgvocab')
    self.attention_dialogue.save_spec(model_path + '.spec')
    serializers.save_hdf5(model_path + '.weights', self.attention_dialogue)

    trace('finished.')
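# ---------------------------------------------------------------------------
# Illustrative sketch only: the docstring above mentions copying pretrained
# word2vec weights into the embedding layer. The helper below shows roughly
# what such a copy could look like with gensim and a Chainer EmbedID. The
# function name, the word_to_id mapping, and the file path are hypothetical;
# this is NOT the copy_model implementation used in the code above.
# ---------------------------------------------------------------------------
from gensim.models import word2vec


def copy_word2vec_into_embed(w2v_path, word_to_id, embed):
    """Overwrite rows of embed.W with word2vec vectors for words it knows.

    Assumes the word2vec vector size equals the embedding size of `embed`
    (a chainer.links.EmbedID).
    """
    w2v = word2vec.Word2Vec.load(w2v_path)
    for word, idx in word_to_id.items():
        if word in w2v.wv:
            embed.W.data[idx, :] = w2v.wv[word]


# usage (hypothetical sizes and path):
# embed = chainer.links.EmbedID(vocab_size, embed_size)
# copy_word2vec_into_embed("word2vec.model", word_to_id, embed)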
def main():
    args = parse_args()

    trace("initializing CUDA ...")
    wrapper.init()

    if args.mode == "train":
        train_model(args)
    elif args.mode == "test":
        test_model(args)
def __init__(self, args):
    trace('loading model ...')
    self.args = args
    self.src_vocab = Vocabulary.load(args.model + '.srcvocab')
    self.trg_vocab = Vocabulary.load(args.model + '.trgvocab')
    self.encdec = EncoderDecoder.load_spec(args.model + '.spec')
    if args.use_gpu:
        self.encdec.to_gpu()
    serializers.load_hdf5(args.model + '.weights', self.encdec)

    trace('generating translation ...')
def test(args):
    trace('loading model ...')
    src_vocab = Vocabulary.load(args.model + '.srcvocab')
    trg_vocab = Vocabulary.load(args.model + '.trgvocab')
    attmt = AttentionMT.load_spec(args.model + '.spec')
    if args.use_gpu:
        attmt.to_gpu()
    serializers.load_hdf5(args.model + '.weights', attmt)

    trace('generating translation ...')
    generated = 0

    with open(args.target, 'w') as fp:
        for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
            src_batch = fill_batch(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, attmt, False, args.generation_limit)

            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print(' '.join(hyp), file=fp)

            generated += K

    trace('finished.')
def test_model(args):
    trace('loading model ...')
    model = EncoderDecoderModel.load(args.model)

    trace('generating translation ...')
    generated = 0
    src_vectors = read_src_vectors(args.source)
    src_size = len(src_vectors[0])

    with open(args.target, 'w') as fp:
        for src_batch in gens.batch(src_vectors, args.minibatch):
            # src_batch = fill_batch(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = model.predict(src_batch, args.generation_limit)

            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                six.print_(' '.join(hyp), file=fp)

            generated += K

    trace('finished.')
def test_model(self, model_name):
    trace('loading model ...')
    model = self.load(model_name)

    trace('generating translation ...')
    generated = 0

    with open(self.test_target, 'w') as fp:
        for src_batch in gens.batch(gens.word_list(self.test_source), self.minibatch):
            src_batch = fill_batch(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = model.predict(src_batch, self.generation_limit)

            source_count = 0
            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                # Also print the source sentence so the BLEU score can be computed.
                print("".join(src_batch[source_count]).replace("</s>", ""))
                print(' '.join(hyp))
                print(' '.join(hyp), file=fp)
                source_count += 1

            generated += K

    trace('finished.')
def test(self):
    trace('loading model ...')
    src_vocab = Vocabulary.load(self.model + '.srcvocab')
    trg_vocab = Vocabulary.load(self.model + '.trgvocab')
    encdec = EncoderDecoder.load_spec(self.model + '.spec')
    serializers.load_hdf5(self.model + '.weights', encdec)

    trace('generating translation ...')
    generated = 0

    with open(self.target, 'w') as fp:
        for src_batch in gens.batch(gens.word_list(self.source), self.minibatch):
            src_batch = fill_batch(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = self.forward(src_batch, None, src_vocab, trg_vocab, encdec, False, self.generation_limit)

            source_count = 0
            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print("src : " + "".join(src_batch[source_count]).replace("</s>", ""))
                print('hyp : ' + ''.join(hyp))
                print(' '.join(hyp), file=fp)
                source_count += 1

            generated += K

    trace('finished.')
def train(self):
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

    trace('making model ...')
    encdec = EncoderDecoder(self.vocab, self.embed, self.hidden)
    if self.word2vecFlag:
        self.copy_model(self.word2vec, encdec.enc)
        self.copy_model(self.word2vec, encdec.dec, dec_flag=True)

    for epoch in range(self.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
        trained = 0
        gen1 = gens.word_list(self.source)
        gen2 = gens.word_list(self.target)
        gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(encdec)
        opt.add_hook(optimizer.GradientClipping(5))

        random_number = random.randint(0, self.minibatch - 1)
        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            # If you use the IPython notebook, you have to use the forward function instead:
            # hyp_batch, loss = self.forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
            hyp_batch, loss = self.forward_implement(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
            loss.backward()
            opt.update()
            self.print_out(random_number, epoch, trained, src_batch, trg_batch, hyp_batch)
            trained += K

    trace('saving model ...')
    prefix = self.model
    src_vocab.save(prefix + '.srcvocab')
    trg_vocab.save(prefix + '.trgvocab')
    encdec.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', encdec)

    trace('finished.')
def print_out(self, K, i_epoch, trained, src_batch, trg_batch, hyp_batch):
    trace('epoch %3d/%3d, sample %8d' % (i_epoch + 1, self.epoch, trained + K + 1))
    trace(' src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[K]]))
    trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[K]]))
    trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[K]]))
def train(self):
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(self.source), self.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

    trace('making model ...')
    encdec = EncoderDecoder(self.vocab, self.embed, self.hidden)
    if self.word2vecFlag:
        self.copy_model(self.word2vec, encdec.enc)
        self.copy_model(self.word2vec, encdec.dec, dec_flag=True)
    else:
        encdec = self.encdec

    for epoch in range(self.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
        trained = 0
        gen1 = gens.word_list(self.source)
        gen2 = gens.word_list(self.target)
        gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * self.minibatch), self.minibatch)
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(encdec)
        opt.add_hook(optimizer.GradientClipping(5))

        random_number = random.randint(0, self.minibatch - 1)
        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch, loss = self.forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
            loss.backward()
            opt.update()

            if trained == 0:
                self.print_out(random_number, epoch, trained, src_batch, trg_batch, hyp_batch)

            trained += K

    trace('saving model ...')
    prefix = self.model
    src_vocab.save(prefix + '.srcvocab')
    trg_vocab.save(prefix + '.trgvocab')
    encdec.save_spec(prefix + '.spec')
    serializers.save_hdf5(prefix + '.weights', encdec)

    trace('finished.')
def test(args):
    trace('loading model ...')
    word_vocab = Vocabulary.load(args.model + '.words')
    phrase_vocab = Vocabulary.load(args.model + '.phrases')
    semiterminal_vocab = Vocabulary.load(args.model + '.semiterminals')
    parser = Parser.load_spec(args.model + '.spec')
    if args.use_gpu:
        parser.to_gpu()
    serializers.load_hdf5(args.model + '.weights', parser)

    embed_cache = {}
    parser.reset()

    trace('generating parse trees ...')
    with open(args.source) as fp:
        for l in fp:
            word_list = to_vram_words(convert_word_list(l.split(), word_vocab))
            tree = combine_xbar(
                restore_labels(
                    parser.forward(word_list, None, args.unary_limit, embed_cache),
                    phrase_vocab,
                    semiterminal_vocab))
            print('( ' + tree_to_string(tree) + ' )')

    trace('finished.')
def train_multi_model(self):
    """
    Call the Dialogue Training for multiple models
    """
    trace('initializing ...')
    train_path = APP_ROOT + "/../twitter/data/"
    file_list = os.listdir(train_path)
    twitter_source_dict = {}
    twitter_replay_dict = {}
    for file in file_list:
        word_class = re.sub(r"_replay_twitter_data\.txt|_source_twitter_data\.txt", "", file.strip())
        if word_class not in twitter_source_dict:
            twitter_source_dict.update({word_class: file.strip()})
        if word_class not in twitter_replay_dict:
            twitter_replay_dict.update({word_class: file.strip()})

    for word_class in twitter_source_dict.keys():
        self.parameter_dict["source"] = train_path + word_class + "_source_twitter_data.txt"
        print(self.parameter_dict["source"])
        self.parameter_dict["target"] = train_path + word_class + "_replay_twitter_data.txt"
        print(self.parameter_dict["target"])
        self.parameter_dict["model"] = "ChainerDialogue_" + word_class
        encoderDecoderModel = EncoderDecoderModelAttention(self.parameter_dict)
        encoderDecoderModel.train()
def main():
    args = parse_args()
    data, target, ids = load_data(args.train)
    test_data, test_target, ids = load_data(args.test, ids)
    model = init_model(input_size=len(ids),
                       depth=args.depth,
                       hidden_size=args.hidden_size,
                       output_size=2)
    optimizer = optimizers.SGD()

    # Begin Training
    optimizer.setup(model)
    for ep in range(epoch):
        UF.trace("Training Epoch %d" % ep)
        indexes = np.random.permutation(len(data))
        for i in range(0, len(data), batchsize):
            x_batch = data[indexes[i: i+batchsize]]
            y_batch = target[indexes[i: i+batchsize]]
            optimizer.zero_grads()
            loss, accuracy = forward(model, x_batch, y_batch)
            loss.backward()
            optimizer.update()
            UF.trace(accuracy.data)

    # Begin Testing
    sum_loss, sum_accuracy = 0, 0
    for i in range(0, len(test_data), batchsize):
        x_batch = test_data[i: i+batchsize]
        y_batch = test_target[i: i+batchsize]
        loss, accuracy = forward(model, x_batch, y_batch)
        sum_loss += loss.data * batchsize
        sum_accuracy += accuracy.data * batchsize
    mean_loss = sum_loss / len(test_data)
    mean_accuracy = sum_accuracy / len(test_data)
    print("Mean Loss", mean_loss)
    print("Mean Accuracy", mean_accuracy)
def print_out(self, K, i_epoch, trained, src_batch, trg_batch, hyp_batch):
    """
    Print out training progress for one sample.
    :param K(int): index of the sample to print (chosen at random)
    :param i_epoch(int): current epoch
    :param trained: number of samples trained so far
    :param src_batch: source data
    :param trg_batch: target data
    :param hyp_batch: hypothesis data
    :return:
    """
    if K > len(src_batch) and K > len(trg_batch) and K > len(hyp_batch):
        K = len(src_batch) - 1

    trace('epoch %3d/%3d, sample %8d' % (i_epoch + 1, self.epoch, trained + K + 1))
    trace(' src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[K]]))
    trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[K]]))
    trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[K]]))
def print_out(self, K, i_epoch, trained, src_batch, trg_batch, hyp_batch):
    """
    Print out
    :param K:
    :param i_epoch:
    :param trained: train times
    :param src_batch:
    :param trg_batch:
    :param hyp_batch:
    :return:
    """
    if K > len(src_batch) and K > len(trg_batch) and K > len(hyp_batch):
        K = len(src_batch) - 1

    trace("epoch %3d/%3d, sample %8d" % (i_epoch + 1, self.epoch, trained + K + 1))
    trace(" src = " + " ".join([x if x != "</s>" else "*" for x in src_batch[K]]))
    trace(" trg = " + " ".join([x if x != "</s>" else "*" for x in trg_batch[K]]))
    trace(" hyp = " + " ".join([x if x != "</s>" else "*" for x in hyp_batch[K]]))
def test(self):
    """
    Test method.
    You have to prepare the trained model first.
    """
    trace('loading model ...')
    prefix = self.model
    model_path = APP_ROOT + "/model/" + prefix
    src_vocab = Vocabulary.load(model_path + '.srcvocab')
    trg_vocab = Vocabulary.load(model_path + '.trgvocab')
    self.attention_dialogue = AttentionDialogue.load_spec(model_path + '.spec', self.XP)
    serializers.load_hdf5(model_path + '.weights', self.attention_dialogue)

    trace('generating translation ...')
    generated = 0

    with open(self.test_target, 'w') as fp:
        for src_batch in gens.batch(gens.word_list(self.source), self.minibatch):
            src_batch = fill_batch(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = self.forward_implement(src_batch, None, src_vocab, trg_vocab,
                                               self.attention_dialogue, False, self.generation_limit)

            source_count = 0
            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print("src : " + "".join(src_batch[source_count]).replace("</s>", ""))
                print('hyp : ' + ''.join(hyp))
                print(' '.join(hyp), file=fp)
                source_count += 1

            generated += K

    trace('finished.')
def main():
    global xp
    args = parse_args()
    x_ids = defaultdict(lambda: len(x_ids))
    y_ids = defaultdict(lambda: len(y_ids))
    init_wrapper(not args.use_cpu)
    data, target = load_data(args.train, x_ids, y_ids)
    test_data, test_target = load_data(args.test, x_ids, y_ids)
    model = init_model(input_size=args.input_size,
                       embed_size=args.embed_size,
                       hidden_size=args.hidden_size,
                       output_size=len(y_ids))
    optimizer = optimizers.SGD(lr=0.5)

    # Begin Training
    UF.init_model_parameters(model)
    model = UF.convert_to_GPU(not args.use_cpu, model)
    optimizer.setup(model)
    prev_acc = 0
    for ep in range(epoch):
        UF.trace("Training Epoch %d" % ep)
        epoch_acc = 0
        total = 0
        for i in range(0, len(data), batchsize):
            x_batch = data[i: i+batchsize]
            y_batch = target[i: i+batchsize]
            optimizer.zero_grads()
            loss, accuracy = forward(model, x_batch, y_batch, args.hidden_size)
            loss.backward()
            optimizer.update()

            # Counting epoch accuracy
            epoch_acc += 100 * accuracy.data
            total += 1

        epoch_acc /= total
        if prev_acc > epoch_acc:
            optimizer.lr *= 0.9
            UF.trace("Reducing LR:", optimizer.lr)
        prev_acc = epoch_acc
        UF.trace("Epoch Accuracy: %.2f" % (epoch_acc))

    # Begin Testing
    sum_loss, sum_accuracy = 0, 0
    for i in range(0, len(test_data), batchsize):
        x_batch = test_data[i: i+batchsize]
        y_batch = test_target[i: i+batchsize]
        loss, accuracy = forward(model, x_batch, y_batch, args.hidden_size)
        sum_loss += loss.data * batchsize
        sum_accuracy += accuracy.data * batchsize
    mean_loss = sum_loss / len(test_data)
    mean_accuracy = sum_accuracy / len(test_data)
    print("Mean Loss", mean_loss)
    print("Mean Accuracy", mean_accuracy)
def test(args):
    trace('loading model ...')
    src_vocab = Vocabulary.load(args.model + '.srcvocab')
    trg_vocab = Vocabulary.load(args.model + '.trgvocab')
    encdec = EncoderDecoder.load_spec(args.model + '.spec')
    if args.use_gpu:
        encdec.to_gpu()
    serializers.load_hdf5(args.model + '.weights', encdec)

    trace('generating translation ...')
    generated = 0

    temp = gens.to_words(args.target)
    # temp.append("</s>")
    src_batch = []
    src_batch.append(temp)
    # src_batch = [['私は', '太郎', 'です', '(´', 'ー', '`*)', 'ウンウン', '</s>']]
    src_batch = fill_batch(src_batch)
    print("src_batch:", src_batch)
    K = len(src_batch)

    trace('sample %8d - %8d ...' % (generated + 1, generated + K))

    print("question:")
    for srp in src_batch:
        srp.append('</s>')
        srp = srp[:srp.index('</s>')]
        print(''.join(srp))

    hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, encdec, False, args.generation_limit)

    print("answer:")
    for hyp in hyp_batch:
        hyp.append('</s>')
        hyp = hyp[:hyp.index('</s>')]
        print(''.join(hyp))
        print("----------------")

    generated += K

    trace('finished.')
def test_model(args):
    trace('loading model ...')
    model = AttentionalTranslationModel.load(args.model)

    trace('generating translation ...')
    generated = 0

    with open(args.target, 'w') as fp:
        for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
            src_batch = fill_batch2(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = model.predict(src_batch, args.generation_limit)

            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                six.print_(' '.join(hyp), file=fp)

            generated += K

    trace('finished.')
def test_model(args):
    trace('loading model ...')
    model = EncoderDecoderModel.load(args.model)

    trace('generating translation ...')
    generated = 0

    with open(args.target, 'w') as fp:
        for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
            src_batch = fill_batch(src_batch)
            K = len(src_batch)

            trace('sample %8d - %8d ...' % (generated + 1, generated + K))
            hyp_batch = model.predict(src_batch, args.generation_limit)

            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print(' '.join(hyp), file=fp)

            generated += K

    trace('finished.')
def test_model(args):
    trace('loading model ...')
    model = TransSegmentationModel.load(args.model)

    trace('generating output ...')

    with open(args.corpus) as fp:
        for text in fp:
            letters = ''.join(text.split())
            if not letters:
                print()
                continue
            scores = model.predict(text)
            hyp = make_hyp(letters, scores)
            print(hyp)

    trace('finished.')
def test_model(args):
    trace('loading model ...')
    model = RNNSegmentationModel.load(args.model)

    trace('generating output ...')

    with open(args.corpus) as fp:
        for text in fp:
            letters = ''.join(text.split())
            if not letters:
                print()
                continue
            scores = model.predict(text)
            hyp = make_hyp(letters, scores)
            print(hyp)

    trace('finished.')
def test(args):
    trace('loading model ...')
    word_vocab = Vocabulary.load(args.model + '.words')
    phrase_vocab = Vocabulary.load(args.model + '.phrases')
    semi_vocab = Vocabulary.load(args.model + '.semiterminals')
    parser = Parser.load_spec(args.model + '.spec')
    if USE_GPU:
        parser.to_gpu()
    serializers.load_hdf5(args.model + '.weights', parser)

    trace('generating parse trees ...')
    with open(args.source) as fp:
        for l in fp:
            word_list = convert_word_list(l.split(), word_vocab)
            tree = restore_labels(
                parser.forward(word_list, None, args.unary_limit),
                phrase_vocab,
                semi_vocab)
            print('( ' + tree_to_string(tree) + ' )')

    trace('finished.')
import sys
import math
import numpy as np

from net import BasicEncoderDecoderModel
from util.functions import trace, fill_batch, parse_args
from util.vocabulary import Vocabulary
from util import generators as gens
from util.controller import Controller
from util.wrapper import wrapper
from util.const import *

if __name__ == '__main__':
    args = parse_args()

    trace('initializing ...')
    wrapper = wrapper(args.gpu_id)
    wrapper.init()

    trace('loading vocab ...')
    # src_vocab = Vocabulary.load(args.src_vocab)
    # trg_vocab = Vocabulary.load(args.trg_vocab)
    src_vocab = Vocabulary.load(VOCAB_SRC)
    trg_vocab = Vocabulary.load(VOCAB_TRG)

    controller = Controller(args.folder_name)
    if args.mode == 'train':
        controller.train_model(BasicEncoderDecoderModel, src_vocab, trg_vocab, args)
    elif args.mode == 'dev':
def train(args):
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

    trace('making model ...')
    attmt = AttentionMT(args.vocab, args.embed, args.hidden)
    if args.use_gpu:
        attmt.to_gpu()

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        gen1 = gens.word_list(args.source)
        gen2 = gens.word_list(args.target)
        gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(attmt)
        opt.add_hook(optimizer.GradientClipping(5))

        for src_batch, trg_batch in gen3:
            src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, attmt, True, 0)
            loss.backward()
            opt.update()

            for k in range(K):
                trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
                trace(' src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
                trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
                trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))

            trained += K

        trace('saving model ...')
        prefix = args.model + '.%03d' % (epoch + 1)
        src_vocab.save(prefix + '.srcvocab')
        trg_vocab.save(prefix + '.trgvocab')
        attmt.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', attmt)

    trace('finished.')
def train(args):
    trace('loading corpus ...')
    with open(args.source) as fp:
        trees = [make_tree(l) for l in fp]

    trace('extracting leaf nodes ...')
    word_lists = [extract_words(t) for t in trees]
    lower_lists = [[w.lower() for w in words] for words in word_lists]

    trace('extracting gold operations ...')
    op_lists = [make_operations(t) for t in trees]

    trace('making vocabulary ...')
    word_vocab = Vocabulary.new(lower_lists, args.vocab)
    phrase_set = set()
    semiterminal_set = set()
    for tree in trees:
        phrase_set |= set(extract_phrase_labels(tree))
        semiterminal_set |= set(extract_semiterminals(tree))
    phrase_vocab = Vocabulary.new([list(phrase_set)], len(phrase_set), add_special_tokens=False)
    semiterminal_vocab = Vocabulary.new([list(semiterminal_set)], len(semiterminal_set), add_special_tokens=False)

    trace('converting data ...')
    word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
    op_lists = [convert_op_list(x, phrase_vocab, semiterminal_vocab) for x in op_lists]

    trace('start training ...')
    parser = Parser(
        args.vocab, args.embed, args.char_embed, args.queue, args.stack, args.srstate,
        len(phrase_set), len(semiterminal_set),
    )
    if args.use_gpu:
        parser.to_gpu()
    opt = optimizers.SGD(lr=0.1)
    opt.setup(parser)
    opt.add_hook(optimizer.GradientClipping(10))
    opt.add_hook(optimizer.WeightDecay(0.0001))

    batch_set = list(zip(word_lists, op_lists))

    for epoch in range(args.epoch):
        n = 0
        random.shuffle(batch_set)

        for samples in batch(batch_set, args.minibatch):
            parser.zerograds()
            loss = XP.fzeros(())

            for word_list, op_list in zip(*samples):
                trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
                loss += parser.forward_train(word_list, op_list)
                n += 1

            loss.backward()
            opt.update()

        trace('saving model ...')
        prefix = args.model + '.%03d' % (epoch + 1)
        word_vocab.save(prefix + '.words')
        phrase_vocab.save(prefix + '.phrases')
        semiterminal_vocab.save(prefix + '.semiterminals')
        parser.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', parser)

        opt.lr *= 0.92

    trace('finished.')
# -*- coding: utf-8 -*-
from gensim.models import word2vec

from util.constants import *
from util.functions import check_directory
from util.functions import trace

if __name__ == '__main__':
    trace("check directory")
    dir_path = [W2V_MECAB_MODEL, W2V_JUMAN_MODEL]
    check_directory(dir_path)

    trace("load mecab wakati file")
    data = word2vec.Text8Corpus(WAKATI_MECAB)
    trace("train mecab word2vec")
    model = word2vec.Word2Vec(data, size=UNIT)
    trace("save mecab word2vec model")
    model.save(W2V_MECAB_MODEL)

    trace("load juman wakati file")
    data = word2vec.Text8Corpus(WAKATI_JUMAN)
    trace("train juman word2vec")
    model = word2vec.Word2Vec(data, size=UNIT)
    trace("save juman word2vec model")
    model.save(W2V_JUMAN_MODEL)

    trace("finish!")
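# ---------------------------------------------------------------------------
# Illustrative sanity check (not part of the original script): load the saved
# MeCab word2vec model and print a few nearest neighbours. The query word and
# topn value are arbitrary examples; the word must exist in the trained
# vocabulary, and most_similar() matches the pre-gensim-4 API implied by the
# Word2Vec(size=...) call above.
# ---------------------------------------------------------------------------
from gensim.models import word2vec
from util.constants import W2V_MECAB_MODEL

model = word2vec.Word2Vec.load(W2V_MECAB_MODEL)
for word, similarity in model.most_similar("日本", topn=5):
    print(word, similarity)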
If the number of units exceeds the vocabulary size, the mapping into the latent
space cannot really be learned, so the whole computation becomes meaningless.
"""
parameter_dict["embed"] = 500

"""
The larger this number, the more complex the model becomes. Increasing it means
training inevitably needs more iterations before it converges.
"""
parameter_dict["hidden"] = 20

"""
Number of training epochs. Larger is basically better, but if it is too large
the training does not converge.
"""
parameter_dict["epoch"] = 20

"""
Size of each minibatch. This is usually tuned empirically: a larger value
improves training accuracy at the cost of training speed, while a smaller value
speeds training up at the cost of accuracy.
"""
parameter_dict["minibatch"] = 64

"""
Maximum number of words generated at prediction time. A longer limit lets you
inspect longer outputs, but neural translation generally does not handle long
outputs well, so a small value is recommended.
"""
parameter_dict["generation_limit"] = 256

#--------Hands on 2----------------------------------------------------------------#

trace('initializing ...')
wrapper.init()

encoderDecoderModel = EncoderDecoderModelForward(parameter_dict)
encoderDecoderModel.test()
def train(args):
    trace('loading corpus ...')
    with open(args.source) as fp:
        trees = [make_tree(l) for l in fp]

    trace('extracting leaf nodes ...')
    word_lists = [extract_words(t) for t in trees]
    lower_lists = [[w.lower() for w in words] for words in word_lists]

    trace('extracting gold operations ...')
    op_lists = [make_operations(t) for t in trees]

    trace('making vocabulary ...')
    word_vocab = Vocabulary.new(lower_lists, args.vocab)
    phrase_set = set()
    semiterminal_set = set()
    for tree in trees:
        phrase_set |= set(extract_phrase_labels(tree))
        semiterminal_set |= set(extract_semiterminals(tree))
    phrase_vocab = Vocabulary.new([list(phrase_set)], len(phrase_set), add_special_tokens=False)
    semiterminal_vocab = Vocabulary.new([list(semiterminal_set)], len(semiterminal_set), add_special_tokens=False)

    trace('converting data ...')
    word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
    op_lists = [convert_op_list(x, phrase_vocab, semiterminal_vocab) for x in op_lists]

    trace('start training ...')
    parser = Parser(
        args.vocab, args.embed, args.queue, args.stack, args.srstate,
        len(phrase_set), len(semiterminal_set),
    )
    if args.use_gpu:
        parser.to_gpu()
    opt = optimizers.AdaGrad(lr=0.005)
    opt.setup(parser)
    opt.add_hook(optimizer.GradientClipping(5))

    for epoch in range(args.epoch):
        n = 0

        for samples in batch(zip(word_lists, op_lists), args.minibatch):
            parser.zerograds()
            loss = XP.fzeros(())

            for word_list, op_list in zip(*samples):
                trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
                loss += parser.forward(word_list, op_list, 0)
                n += 1

            loss.backward()
            opt.update()

        trace('saving model ...')
        prefix = args.model + '.%03d' % (epoch + 1)
        word_vocab.save(prefix + '.words')
        phrase_vocab.save(prefix + '.phrases')
        semiterminal_vocab.save(prefix + '.semiterminals')
        parser.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', parser)

    trace('finished.')
def train(args):
    trace('loading corpus ...')
    with open(args.source) as fp:
        trees = [make_tree(l) for l in fp]

    trace('extracting leaf nodes ...')
    word_lists = [extract_words(t) for t in trees]

    trace('extracting gold operations ...')
    op_lists = [make_operations(t) for t in trees]

    trace('making vocabulary ...')
    word_vocab = Vocabulary.new(word_lists, args.vocab)
    phrase_set = set()
    semi_set = set()
    for tree in trees:
        phrase_set |= set(extract_phrase_labels(tree))
        semi_set |= set(extract_semi_labels(tree))
    phrase_vocab = Vocabulary.new([list(phrase_set)], len(phrase_set), add_special_tokens=False)
    semi_vocab = Vocabulary.new([list(semi_set)], len(semi_set), add_special_tokens=False)

    trace('converting data ...')
    word_lists = [convert_word_list(x, word_vocab) for x in word_lists]
    op_lists = [convert_op_list(x, phrase_vocab, semi_vocab) for x in op_lists]

    trace('start training ...')
    parser = Parser(
        args.vocab, args.embed, args.queue, args.stack,
        len(phrase_set), len(semi_set),
    )
    if USE_GPU:
        parser.to_gpu()
    opt = optimizers.AdaGrad(lr=0.005)
    opt.setup(parser)
    opt.add_hook(optimizer.GradientClipping(5))

    for epoch in range(args.epoch):
        n = 0

        for samples in batch(zip(word_lists, op_lists), args.minibatch):
            parser.zerograds()
            loss = my_zeros((), np.float32)

            for word_list, op_list in zip(*samples):
                trace('epoch %3d, sample %6d:' % (epoch + 1, n + 1))
                loss += parser.forward(word_list, op_list, 0)
                n += 1

            loss.backward()
            opt.update()

        trace('saving model ...')
        prefix = args.model + '.%03d' % (epoch + 1)
        word_vocab.save(prefix + '.words')
        phrase_vocab.save(prefix + '.phrases')
        semi_vocab.save(prefix + '.semiterminals')
        parser.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', parser)

    trace('finished.')
def train_model(args):
    train_begin = time.time()
    trace('making vocabularies ...')
    vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)

    trace('begin training ...')
    model = TransSegmentationModel.new(vocab, args.context, args.hidden, args.labels, args.eta)

    for epoch in range(args.epoch):
        epoch_beg = time.time()
        trace('START epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        total_loss = 0
        model.init_optimizer()

        with open(args.corpus) as fp:
            for text in fp:
                word_list = text.split()
                if not word_list:
                    continue

                text = ' '.join(word_list)
                letters = ''.join(word_list)
                labels, accum_loss_f = model.train(text)
                total_loss += accum_loss_f
                trained += 1
                hyp = make_hyp(letters, labels)

                """for per-sentence output
                trace("accum_loss : %lf" % (accum_loss_f))
                trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
                trace('trained %d: ' % trained)
                trace(text)
                trace(hyp)
                """

                """
                if trained % 100 == 0:
                    trace(' %8d' % trained)
                """

        trace('FINISHED epoch %d/%d: ' % (epoch + 1, args.epoch))
        trace('total_loss : %lf' % total_loss)
        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))
        epoch_time = time.time() - epoch_beg
        trace('elapsed_time/1epoch : %lf' % epoch_time)

    trace('finished.')
    elapsed_time = time.time() - train_begin
    trace('train_time : %lf' % elapsed_time)
    trace('')
def train(args):
    vocab = Vocabulary.from_conll(args.train, args.vocab)
    train_dataset = [conll_to_train(x, vocab) for x in read_conll(args.train)]
    dev_dataset = [conll_to_train(x, vocab) for x in read_conll(args.dev)]

    parser = Parser(args.vocab, args.embed, args.hidden)
    if args.gpu >= 0:
        parser.to_gpu()

    opt = optimizers.AdaGrad(lr=0.01)
    opt.setup(parser)
    opt.add_hook(optimizer.GradientClipping(10))
    opt.add_hook(optimizer.WeightDecay(0.0001))

    for epoch in range(args.epoch):
        random.shuffle(train_dataset)

        parser.zerograds()
        loss = XP.fzeros(())

        for i, data in enumerate(train_dataset):
            trace('epoch %3d: train sample %6d:' % (epoch + 1, i + 1))
            parent_scores, root_scores = parser.forward(data)
            if len(data) > 1:
                parent_scores = functions.split_axis(parent_scores, len(data), 0)
            else:
                parent_scores = (parent_scores,)

            root = -1
            for j, (p_scores, (wid, parent)) in enumerate(zip(parent_scores, data)):
                if parent == -1:
                    trace(' %3d: root' % j)
                    root = j
                else:
                    parent_est = p_scores.data.argmax()
                    trace('%c %3d -> %3d (%3d)' % ('*' if parent == parent_est else ' ', j, parent_est, parent))
                    loss += functions.softmax_cross_entropy(p_scores, XP.iarray([parent]))

            root_est = root_scores.data.argmax()
            trace('ROOT: %3d (%3d)' % (root_est, root))
            loss += functions.softmax_cross_entropy(root_scores, XP.iarray([root]))

            if (i + 1) % 200 == 0:
                loss.backward()
                opt.update()
                parser.zerograds()
                loss = XP.fzeros(())

        loss.backward()
        opt.update()
        trace('epoch %3d: trained. ' % (epoch + 1))

        parent_num = 0
        parent_match = 0
        root_num = 0
        root_match = 0

        for i, data in enumerate(dev_dataset):
            trace('epoch %3d: dev sample %6d:' % (epoch + 1, i + 1), rollback=True)
            parent_scores, root_scores = parser.forward(data)
            if len(data) > 1:
                parent_scores = functions.split_axis(parent_scores, len(data), 0)
            else:
                parent_scores = (parent_scores,)

            root = -1
            for j, (p_scores, (wid, parent)) in enumerate(zip(parent_scores, data)):
                if parent == -1:
                    root = j
                else:
                    parent_est = p_scores.data.argmax()
                    parent_num += 1
                    parent_match += 1 if parent_est == parent else 0

            root_est = root_scores.data.argmax()
            root_num += 1
            root_match += 1 if root_est == root else 0

        result_str = (
            'epoch %3d: dev: parent-acc = %.4f (%5d/%5d), root-acc = %.4f (%4d/%4d)' % (
                epoch + 1,
                parent_match / parent_num, parent_match, parent_num,
                root_match / root_num, root_match, root_num))
        trace(result_str)

        with open(args.model + '.log', 'a') as fp:
            print(result_str, file=fp)

        trace('epoch %3d: saving models ...' % (epoch + 1))
        prefix = args.model + '.%03d' % (epoch + 1)
        vocab.save(prefix + '.vocab')
        parser.save_spec(prefix + '.parent_spec')
        serializers.save_hdf5(prefix + '.parent_weights', parser)

    trace('finished.')
def main():
    args = parse_args()
    init_program_state(args)
    vocab = make_vocab()
    data, batched_data = load_data(args.train, vocab, args.batch_size)
    dev, batched_dev = load_data(args.dev, vocab, 1)
    test, batched_test = load_data(args.test, vocab, 1)
    model = init_model(input_size=len(vocab),
                       embed_size=args.embed_size,
                       hidden_size=args.hidden_size,
                       output_size=len(vocab))
    optimizer = optimizers.SGD(lr=args.lr)

    # Begin Training
    UF.init_model_parameters(model)
    model = UF.convert_to_GPU(USE_GPU, model)
    optimizer.setup(model)
    batchsize = args.batch_size
    epoch = args.epoch
    accum_loss = Variable(xp.zeros((), dtype=np.float32))
    counter = 0

    # For each epoch..
    for ep in range(epoch):
        UF.trace("Training Epoch %d" % ep)
        total_tokens = 0
        log_ppl = 0.0

        # For each batch, do forward & backward computations
        for i, batch in enumerate(batched_data):
            loss, nwords = forward(model, batch)
            accum_loss += loss
            log_ppl += loss.data.reshape(())

            # Tracing...
            total_tokens += nwords
            # UF.trace(' %d/%d = %.5f' % (min(i*batchsize, len(data)), len(data), loss.data.reshape(())*batchsize))

            # Counting
            if (counter + 1) % bp_len == 0:
                optimizer.zero_grads()
                accum_loss.backward()
                accum_loss.unchain_backward()
                accum_loss = Variable(xp.zeros((), dtype=np.float32))
                optimizer.clip_grads(grad_clip)
                optimizer.update()
            counter += 1

        # Counting Perplexity
        log_ppl /= total_tokens
        UF.trace("  PPL (Train) = %.10f" % math.exp(UF.to_cpu(USE_GPU, log_ppl)))
        dev_ppl = evaluate(model, batched_dev)
        UF.trace("  PPL (Dev)   = %.10f" % math.exp(UF.to_cpu(USE_GPU, dev_ppl)))

        # Reducing learning rate
        if ep > 6:
            optimizer.lr /= 1.2
            UF.trace("Reducing LR:", optimizer.lr)

    # Begin Testing
    UF.trace("Begin Testing...")
    test_ppl = evaluate(model, batched_test)
    UF.trace("  log(PPL) = %.10f" % test_ppl)
    UF.trace("  PPL      = %.10f" % math.exp(UF.to_cpu(USE_GPU, test_ppl)))
def wakati_2_method(path):
    text = codecs.open(path, "r", "utf-8").readlines()[2:]  # Delete URL & date
    text = [i.replace("\n", "") for i in text if not i == "\n"]
    text = [i for i in text if len(i) > 10]
    samp_line = random.sample(text, 1)[0]

    mecab_sp = MECAB_SEP.parse(samp_line)
    mecab_sp = "/".join(mecab_sp.split())

    juman_sp = JUMAN_SEP.analysis(replace_head2jumanpp(samp_line))
    juman_sp = "/".join([i.midasi for i in juman_sp.mrph_list()])

    print()
    print("=== path ===")
    print(path)
    print()
    print("=== mecab split ===")
    print(mecab_sp)
    print()
    print("=== juman split ===")
    print(juman_sp)
    print()
    print("--------------------------------------------------------------")


if __name__ == '__main__':
    random.seed(SEED)
    trace("read text file")
    path_list = get_text_file_path(CORPUS_DIR)
    samp_path = random.sample(path_list, 100)
    for path in samp_path:
        wakati_2_method(path)
def train_model(args):
    trace('making vocabularies ...')
    # src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
    src_vectors = read_src_vectors(args.source)
    src_size = len(src_vectors[0])
    trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

    trace('making model ...')
    model = EncoderDecoderModel.new(src_size, trg_vocab, args.embed, args.hidden)

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        # gen1 = gens.word_list(args.source)
        gen2 = gens.word_list(args.target)
        gen3 = gens.batch(gens.sorted_parallel(src_vectors, gen2, 100 * args.minibatch), args.minibatch)
        model.init_optimizer()

        for src_batch, trg_batch in gen3:
            # src_batch = fill_batch(src_batch)
            trg_batch = fill_batch(trg_batch)
            K = len(src_batch)
            hyp_batch = model.train(src_batch, trg_batch)

            for k in range(K):
                trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
                trace(' src = ' + str(src_batch[k]))
                trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
                trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))

            trained += K

    # trace('saving model ...')
    # model.save(args.model + '.%03d' % (epoch + 1))

    trace('finished.')
def train_model(args):
    trace('making vocabularies ...')
    src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
    trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)

    trace('making model ...')
    model = AttentionalTranslationModel.new(src_vocab, trg_vocab, args.embed, args.hidden)

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        gen1 = gens.word_list(args.source)
        gen2 = gens.word_list(args.target)
        gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch, order=0), args.minibatch)
        model.init_optimizer()

        for src_batch, trg_batch in gen3:
            src_batch = fill_batch2(src_batch)
            trg_batch = fill_batch2(trg_batch)
            K = len(src_batch)
            hyp_batch = model.train(src_batch, trg_batch)

            for k in range(K):
                trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
                trace(' src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
                trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
                trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))

            trained += K

        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))

    trace('finished.')
def train_model(args):
    trace('making vocabularies ...')
    vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)

    trace('start training ...')
    model = RNNSegmentationModel.new(vocab, args.embed, args.hidden)

    for epoch in range(args.epoch):
        trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
        trained = 0
        model.init_optimizer()

        with open(args.corpus) as fp:
            for text in fp:
                word_list = text.split()
                if not word_list:
                    continue

                text = ' '.join(word_list)
                letters = ''.join(word_list)
                scores = model.train(text)
                trained += 1
                hyp = make_hyp(letters, scores)

                trace(trained)
                trace(text)
                trace(hyp)
                trace(' '.join('%+.1f' % x for x in scores))

                if trained % 100 == 0:
                    trace(' %8d' % trained)

        trace('saving model ...')
        model.save(args.model + '.%03d' % (epoch + 1))

    trace('finished.')