def train():
    # Load the training data and build trainable examples
    train_sor_data, train_mub_data = load_sentences(FLAGS.train_sor_path, FLAGS.train_mub_path)
    # Split the training data into N batches
    train_manager = BatchManager(train_sor_data, train_mub_data, FLAGS.batch_size)
    # Configure GPU options
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # Load FLAGS parameters
    config = config_model()
    logger = get_logger(config["logger_path"])
    # Compute the number of batches per epoch
    word2id, id2word = load_sor_vocab()
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model_and_embedding(sess, Model, FLAGS.model_path, config, True)
        logger.info("start training")
        loss = []
        with tf.device('/gpu:0'):
            for i in range(FLAGS.num_of_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info(
                            "iteration:{} step:{}/{},chatbot loss:{:>9.6f}".format(
                                iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []
                if i % 10 == 0:
                    save_model(sess, model, FLAGS.model_path, logger)
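None of the examples on this page include the BatchManager class itself. As a rough sketch of the interface the example above relies on (a constructor taking source and target data plus a batch size, a len_data count of batches, and an iter_batch(shuffle=...) generator), something like the following would fit; every name and detail here is inferred from the call sites, not taken from the original project.

import random


class BatchManager:
    """Minimal sketch inferred from usage: groups parallel source/target
    sequences into fixed-size batches and iterates over them."""

    def __init__(self, sor_data, mub_data, batch_size):
        # pair up the parallel source/target sequences
        pairs = list(zip(sor_data, mub_data))
        self.batch_data = [pairs[i:i + batch_size]
                           for i in range(0, len(pairs), batch_size)]
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch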
def train(self):
    batch_manager = BatchManager(self.encoder_vec, self.decoder_vec, self.batch_size)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    loss_track = []
    total_time = 0
    nums_batch = len(batch_manager.batch_data)
    for epoch in range(self.max_epoch):
        print("[->] epoch {}".format(epoch))
        batch_index = 0
        for batch in batch_manager.batch():
            batch_index += 1
            # Build the feed dict: [time_steps, batch_size]
            fd = self.get_fd(batch, self.model)
            _, loss, logits, labels = self.sess.run(
                [self.model.train_op, self.model.loss, self.model.logits, self.model.decoder_labels], fd)
            loss_track.append(loss)
            if batch_index % self.show_batch == 0:
                print("\tstep: {}/{}".format(batch_index, nums_batch))
                print('\tloss: {}'.format(loss))
                print("\t" + "-" * 50)
        checkpoint_path = self.model_path + "chatbot_seq2seq.ckpt"
        # Save the model checkpoint
        self.model.saver.save(self.sess, checkpoint_path, global_step=self.model.global_step)
def train(self):
    print("++++++++train+++++++")
    batch_manager = BatchManager(self.encoder_vec, self.decoder_vec, self.batch_size)
    # Configure the TF session (GPU or CPU) and allow memory growth
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Stores the cross-entropy losses
    loss_track = []
    total_time = 0
    # Used to report which batch we are on
    nums_batch = len(batch_manager.batch_data)
    for epoch in range(self.max_epoch):
        print("[->] epoch {}".format(epoch))
        batch_index = 0
        for batch in batch_manager.batch():
            batch_index += 1
            # Build the feed dict: [time_steps, batch_size]
            fd = self.get_fd(batch, self.model)
            # Run the graph: the optimizer op plus the loss, logits and labels
            _, loss, logits, labels = self.sess.run([
                self.model.train_op, self.model.loss, self.model.logits,
                self.model.decoder_labels
            ], fd)
            loss_track.append(loss)
            if batch_index % self.show_batch == 0:
                print("\tstep: {}/{}".format(batch_index, nums_batch))
                print('\tloss: {}'.format(loss))
                print("\t" + "-" * 50)
        checkpoint_path = self.model_path + "chatbot_seq2seq.ckpt"
        # Save the model checkpoint
        self.model.saver.save(self.sess, checkpoint_path, global_step=self.model.global_step)
def main():
    print(args)
    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    # emb_file = '/home/tiankeke/workspace/embeddings/giga-vec1.bin'
    # vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 60
    max_tgt_len = 20

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()

    # elmo_requires_grad=True after epoch 3
    model = ElmoTransformer(max_src_len, len(vocab), 2, 8, 64, 64, 256, 512, 2048,
                            dropout=0.5, elmo_requires_grad=False).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    # eval_model(valid_x, valid_y, vocab, model)
    train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler,
          args.n_epochs, saved_state['epoch'])
def main():
    print(args)

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size

    data_dir = args.data_dir
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    """
    vocab_file = os.path.join(data_dir, "vocab.json")
    if not os.path.exists(vocab_file):
        utils.build_vocab([TRAIN_X, TRAIN_Y], vocab_file, n_vocab=80000)
    vocab = json.load(open(vocab_file))
    """
    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)
    print(len(vocab), embeddings.shape)

    train_x = BatchManager(load_data(TRAIN_X, vocab, N_TRAIN), BATCH_SIZE)
    train_y = BatchManager(load_data(TRAIN_Y, vocab, N_TRAIN), BATCH_SIZE)
    valid_x = BatchManager(load_data(VALID_X, vocab, N_VALID), BATCH_SIZE)
    valid_y = BatchManager(load_data(VALID_Y, vocab, N_VALID), BATCH_SIZE)

    model = Model(vocab, emb_dim=256, hid_dim=512, embeddings=embeddings).cuda()
    # model.embedding_look_up.to(torch.device("cpu"))

    ckpt_file = args.ckpt_file
    saved_state = {'lr': 0.001, 'epoch': 0}
    if os.path.exists(ckpt_file):
        saved_state = torch.load(ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % ckpt_file)

    optimizer = torch.optim.Adam(model.parameters(), lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    scheduler.step()

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          saved_state['epoch'], N_EPOCHS)
def main():
    # if not os.path.exists(args.ckpt_file):
    #     raise FileNotFoundError("model file not found")

    data_dir = '~/Textsum/textsum-transformer-master/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    max_src_len = 60
    max_tgt_len = 20

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test), args.batch_size, small_vocab)

    model = TransformerShareEmbedding(len(small_vocab), max_src_len, 1, 6,
                                      300, 50, 50, 1200, False).cuda()

    # saved_state = torch.load(args.ckpt_file)
    # model.load_state_dict(saved_state['state_dict'])
    # print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
def get_train_data():
    normal_train, normal_test = get_sentence(args.train_data, args.test_data)
    transfer_train, transfer_test = get_sentence(args.transfer_train_data, args.transfer_test_data)

    char2id, id2char, tag2id, id2tag, transfer_tag2id, transfer_id2tag = get_transform(
        normal_train + transfer_train, args.map_path, args.tag2label_path, args.transfer_tag2label_path)

    train_data = preprocess_data(normal_train, char2id, tag2id)
    train_manager = BatchManager(train_data, args.batch_size)
    test_data = preprocess_data(normal_test, char2id, tag2id)
    test_manager = BatchManager(test_data, args.batch_size)

    transfer_train_data = preprocess_data(transfer_train, char2id, transfer_tag2id)
    transfer_train_manager = BatchManager(transfer_train_data, args.batch_size)
    transfer_test_data = preprocess_data(transfer_test, char2id, transfer_tag2id)
    transfer_test_manager = BatchManager(transfer_test_data, args.batch_size)

    return train_manager, test_manager, transfer_train_manager, transfer_test_manager, id2char, id2tag, transfer_id2tag
def main():
    vocab, max_src_len, max_tgt_len, inputs, targets = load_data('vocab.json', n_data=850)
    inputs, targets = shuffle(inputs, targets)

    # set d_model = d_word_vec
    model = Transformer(n_src_vocab=len(vocab), n_tgt_vocab=len(vocab),
                        max_src_len=max_src_len, max_tgt_len=max_tgt_len,
                        d_word_vec=32, N=6, n_head=4, d_q=16, d_k=16, d_v=16,
                        d_model=32, d_inner=64)
    model.cuda()
    # model = Encoder(len(vocab), max_src_len, d_src_emb=32, N=3, n_head=4,
    #                 d_q=16, d_k=16, d_v=16, d_model=32, d_inner=32)

    model_file = 'models/params_transformer.pkl'
    if os.path.exists(model_file):
        print("Loading parameters from %s" % model_file)
        model.load_state_dict(torch.load(model_file))

    train_idx = int(len(inputs) * 0.90)
    valid_idx = int(len(inputs) * 0.95)
    train_x = BatchManager(inputs[:train_idx], 32)
    train_y = BatchManager(targets[:train_idx], 32)
    valid_x = BatchManager(inputs[train_idx:valid_idx], 64)
    valid_y = BatchManager(targets[train_idx:valid_idx], 64)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = Adam(parameters, lr=0.0001)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)

    # train(train_x, train_y, valid_x, valid_y, model, optimizer, n_epochs=100, scheduler=scheduler)
    eval(model, vocab, inputs[train_idx:], targets[train_idx:], out_len=12)
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test), args.batch_size, small_vocab)

    model = Transformer(len(small_vocab), len(small_vocab), max_src_len,
                        d_word_vec=300, d_model=300, d_inner=1200,
                        n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)
    model.eval()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    max_src_len = 60
    max_tgt_len = 20

    bs = args.batch_size
    n_test = args.n_test

    vocab = small_vocab

    test_x = BatchManager(load_data(TEST_X, max_src_len, n_test), bs, vocab)

    model = ElmoTransformer(max_src_len, len(vocab), 2, 8, 64, 64, 256, 512, 2048,
                            dropout=0.5, elmo_requires_grad=False).cuda()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
def main():
    print(args)

    data_dir = 'data/'
    TRAIN_X = os.path.join(data_dir, 'train/in.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/out.txt')
    VALID_X = os.path.join(data_dir, 'dev/in.txt')
    VALID_Y = os.path.join(data_dir, 'dev/out.txt')
    EVAL_X = os.path.join(data_dir, 'test/in.txt')
    EVAL_Y = os.path.join(data_dir, 'test/out.txt')

    small_vocab_file = os.path.join(data_dir, 'vocab.json')
    if os.path.exists(small_vocab_file):
        print("Vocab exists!")
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=800000)

    max_src_len = 34
    max_tgt_len = 34

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)

    print("vocab length is: " + str(len(vocab)))
    model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 6, 8, 256,
                        64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    # myeval(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler, args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)
def main():
    # if not os.path.exists(args.ckpt_file):
    #     raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    max_src_len = 100
    max_tgt_len = 40

    vocab = small_vocab

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test), args.batch_size, small_vocab)

    model = Transformer(len(vocab), len(vocab), 200, 200, 2, 4, 256, 1024,
                        src_tgt_emb_share=True, tgt_prj_wt_share=True).cuda()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
def main():
    N_TEST = args.n_test
    BATCH_SIZE = args.batch_size

    # vocab = json.load(open('sumdata/vocab.json'))
    embedding_path = '/home/kaiying/coco/embeddings/giga-256d.bin'
    vocab, embeddings = utils.load_word2vec_embedding(embedding_path)

    test_x = BatchManager(load_data(args.input_file, vocab, N_TEST), BATCH_SIZE)

    # model = Seq2SeqAttention(len(vocab), EMB_DIM, HID_DIM, BATCH_SIZE, vocab, max_trg_len=25).cuda()
    model = Model(vocab, emb_dim=256, hid_dim=512, embeddings=embeddings).cuda()
    model.eval()

    file = args.ckpt_file
    if os.path.exists(file):
        saved_state = torch.load(file)
        model.load_state_dict(saved_state['state_dict'])
        print('Load model parameters from %s' % file)

    my_test(test_x, model)
def train(inputs, targets, model, optimizer, batch_size=32, epochs=200):
    inputs_batch_manager = BatchManager(inputs, batch_size)
    targets_batch_manager = BatchManager(targets, batch_size)
    steps = inputs_batch_manager.steps
    for epoch in range(epochs):
        for i in range(steps):
            optimizer.zero_grad()
            batch_inputs = torch.tensor(inputs_batch_manager.next_batch(), dtype=torch.long)
            batch_targets = torch.tensor(targets_batch_manager.next_batch(), dtype=torch.long)
            logits = model(batch_inputs, batch_targets)
            # exclude start token
            loss = model.loss_layer(logits.transpose(1, 2), batch_targets[:, 1:])
            loss.backward()
            optimizer.step()
            print(loss)
        torch.save(model.state_dict(), os.path.join("models", "params.pkl"))
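This train function and the eval function that follows use a cursor-style BatchManager instead: a steps attribute and a next_batch() method that walks through the data and wraps around. A minimal sketch consistent with that usage, assuming the sequences are already padded to a common length (the example converts a batch straight into a torch tensor), might look like this; the implementation details are an assumption, not the original class.

import math


class BatchManager:
    """Sketch inferred from usage: serves fixed-size batches in order, wrapping around."""

    def __init__(self, data, batch_size):
        self.data = data                    # sequences of token ids, padded to equal length
        self.batch_size = batch_size
        self.steps = math.ceil(len(data) / batch_size)  # batches per pass over the data
        self.bid = 0                        # index of the next batch to serve

    def next_batch(self):
        batch = self.data[self.bid * self.batch_size:(self.bid + 1) * self.batch_size]
        self.bid = (self.bid + 1) % self.steps  # wrap around after the last batch
        return batch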
def eval(model, vocab, inputs, targets, out_len=12):
    model.eval()

    batch_x = BatchManager(inputs, 32)
    batch_y = BatchManager(targets, 32)

    hits = 0
    total = 0
    for i in range(batch_x.steps):
        x = torch.tensor(batch_x.next_batch(), dtype=torch.long).cuda()
        y = torch.tensor(batch_y.next_batch(), dtype=torch.long).cuda()

        tgt_seq = torch.ones(x.shape[0], out_len, dtype=torch.long).cuda()
        tgt_seq *= vocab['<pad>']
        tgt_seq[:, 0] = vocab['<s>']

        for j in range(1, out_len):
            logits = model(x, tgt_seq)
            last_word = torch.argmax(logits[:, j - 1, :], dim=-1).view(-1, 1)
            tgt_seq[:, j] = last_word.squeeze()
            if j != out_len - 1:
                tgt_seq[:, j + 1] = vocab['</s>']

        hits += visualize(x, y, tgt_seq, vocab)
        total += x.shape[0]
    print('%d/%d, accuracy=%f' % (hits, total, hits / total))
    model.train()
def main():
    print(args)

    data_dir = '/home/disk3/tiankeke/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab_file = 'sumdata/src_vocab.txt'
    if not os.path.exists(src_vocab_file):
        build_vocab([TRAIN_X], src_vocab_file)
    src_vocab = load_vocab(src_vocab_file, vocab_size=90000)

    tgt_vocab_file = 'sumdata/tgt_vocab.txt'
    if not os.path.exists(tgt_vocab_file):
        build_vocab([TRAIN_Y], tgt_vocab_file)
    tgt_vocab = load_vocab(tgt_vocab_file)

    # emb_file = '/home/tiankeke/workspace/embeddings/giga-vec1.bin'
    # vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 100
    max_tgt_len = 40
    max_pos = 200

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, src_vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, tgt_vocab)
    train_x, train_y = utils.shuffle(train_x, train_y)

    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, src_vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, tgt_vocab)
    valid_x, valid_y = utils.shuffle(valid_x, valid_y)

    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_wt_share=True).cuda()
    model = Transformer(len(src_vocab), len(tgt_vocab), max_pos, max_pos, 2, 4, 256,
                        1024, src_tgt_emb_share=False, tgt_prj_wt_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 2, 4,
    #                                   256, 1024, False, True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    # eval_model(valid_x, valid_y, vocab, model)
    train(train_x, train_y, valid_x, valid_y, model, optimizer, tgt_vocab, scheduler,
          args.n_epochs, saved_state['epoch'])
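This example also shuffles two BatchManager objects in lockstep via utils.shuffle, which is not shown anywhere on this page. One plausible, purely illustrative implementation, assuming each manager keeps its batches in a batch_data list (as the chatbot examples above do), is:

import random


def shuffle(x_manager, y_manager):
    """Hypothetical paired shuffle: apply the same permutation to both managers
    so that source and target batches stay aligned."""
    paired = list(zip(x_manager.batch_data, y_manager.batch_data))
    random.shuffle(paired)
    x_manager.batch_data, y_manager.batch_data = map(list, zip(*paired))
    return x_manager, y_manager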
with tf.Session(config=config) as sess:
    print('============= demo =============')
    saver.restore(sess, ckpt_file)
    while True:
        print('Please input your sentence (or key \'exit\' to exit):')
        demo_sent = input().strip()
        demo_sent = demo_sent.replace(" ", "")
        # if demo_sent == '' or demo_sent.isspace():
        if demo_sent == 'exit':
            print('See you next time!')
            break
        else:
            demo_transfer_test = load_input_sentence(demo_sent)
            demo_transfer_test_data = preprocess_data(demo_transfer_test, char2id, transfer_tag2id)
            demo_transfer_test_manager = BatchManager(demo_transfer_test_data, args.batch_size)
            demo_data = model.evaluate(sess, demo_transfer_test_manager, transfer_id2tag)
            """
            demo_data format:
            [
                [
                    'char <O> <pred>',
                    ...,
                ]
            ]
            Notes:
                char <O>: default tag (no meaning)
                <pred>: predicted transfer tag
            """
            ret = {
                "product_name": [],
def train(conf):
    train_sentences = load_sentences(conf.train_file, conf.zeros)
    dev_sentences = load_sentences(conf.dev_file, conf.zeros)
    test_sentences = load_sentences(conf.test_file, conf.zeros)

    dico_chars_train = char_mapping(train_sentences, conf.lower)[0]
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        dico_chars_train.copy(), conf.emb_file,
        list(itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences])))
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, conf.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, conf.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, conf.lower)

    # load pretrained word embeddings
    all_word_embeds = {}
    for i, line in enumerate(codecs.open(conf.emb_file, 'r', 'utf-8')):
        s = line.strip().split()
        if len(s) == conf.embedding_dim + 1:
            all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])

    word_embeds_dict = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06),
                                         (len(char_to_id), conf.embedding_dim))
    for w in char_to_id:
        if w in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w]
        elif w.lower() in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w.lower()]
    print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

    train_manager = BatchManager(train_data, conf.batch_size)

    model = BiLSTM_CRF(conf, tag_to_id, char_to_id, word_embeds_dict)
    optimizer = torch.optim.SGD(model.parameters(), lr=conf.learning_rate, weight_decay=1e-4)

    n_epochs = conf.epochs
    dev_f1_ = 0
    for epoch in range(1, n_epochs + 1):
        print(f'train on epoch {epoch}')
        j = 1
        for batch in train_manager.iter_batch(shuffle=True):
            batch_loss = 0.0
            sentences = batch[1]
            tags = batch[-1]
            for i, index in enumerate(np.random.permutation(len(sentences))):
                model.zero_grad()
                sentence_in = sentences[index]
                tags_in = tags[index]
                loss = model.neg_log_likelihood(sentence_in, tags_in)
                loss.backward()
                optimizer.step()
                batch_loss += loss.data
            print(f'[batch {j}, batch size: {conf.batch_size}] loss on this batch: {batch_loss}')
            j = j + 1

        print(f'Begin validating results on the [epoch {epoch}] dev dataset ...')
        dev_results = get_predictions(model, dev_data, id_to_tag)
        dev_f1 = evaluate_ner(dev_results, conf)
        if dev_f1 > dev_f1_:
            dev_f1_ = dev_f1  # track the best dev F1 so far
            torch.save(model, conf.model_file)
            print('save model success.')

        test_results = get_predictions(model, test_data, id_to_tag)
        test_f1 = evaluate_ner(test_results, conf)
        print(f'[epoch {epoch}] f1 on test dataset: {test_f1:.3f}')
def train():
    train_sentences, dico, char_to_id, id_to_char = load_sentence(FLAGS.train_file)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico.copy(),
                FLAGS.emb_file,
            )
        else:
            sentences, dico, char_to_id, id_to_char = load_sentence(FLAGS.train_file)
        print(train_sentences[0])
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char = pickle.load(f)

    train_data, test_data, dev_data = prepare_dataset(train_sentences, char_to_id)
    print(train_data[0])
    print(test_data[0])
    print(dev_data[0])
    print(len(train_data), len(dev_data), len(test_data))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, 100)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True

    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        best = 0
        # sess.graph.finalize()
        for i in range(50):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{},".format(
                        iteration, step % steps_per_epoch, steps_per_epoch))
                    loss = []

            Acc_result = evaluate(sess, model, "dev", dev_manager, logger)
            logger.info("Acc{}".format(Acc_result))
            logger.info("test")
            # precision, recall, f1_score = model.evaluete_(sess, test_manager)
            # logger.info("P, R, F,{},{},{}".format(precision, recall, f1_score))
            test_result = evaluate(sess, model, "test", test_manager, logger)
            if test_result > best:
                best = test_result
                save_model(sess, model, FLAGS.ckpt_path, logger)
def main():
    print(args)

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab, tgt_vocab = get_vocab(TRAIN_X, TRAIN_Y)

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    model = Transformer(len(vocab), len(vocab), max_src_len,
                        d_word_vec=300, d_model=300, d_inner=1200,
                        n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          args.n_epochs, saved_state['epoch'])
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)

    tag_to_id = {
        "O": 0,
        "B-LOC": 1,
        "I-LOC": 2,
        "B-PER": 3,
        "I-PER": 4,
        "B-ORG": 5,
        "I-ORG": 6
    }

    # load data
    id_to_word, id_to_tag, train_data, dev_data, test_data = load_data(FLAGS, tag_to_id)

    train_manager = BatchManager(train_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)

    with tf.Session() as sess:
        model = create_model(sess, id_to_word, id_to_tag)

        loss = 0
        best_test_f1 = 0
        steps_per_epoch = len(train_data) // FLAGS.batch_size + 1
        for _ in range(FLAGS.max_epoch):
            iteration = (model.global_step.eval()) // steps_per_epoch + 1
            train_manager.shuffle()
            for batch in train_manager.iter_batch():
                global_step = model.global_step.eval()
                step = global_step % steps_per_epoch
                batch_loss = model.run_step(sess, True, batch)
                loss += batch_loss / FLAGS.steps_per_checkpoint
                if global_step % FLAGS.steps_per_checkpoint == 0:
                    model.logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step, steps_per_epoch, loss))
                    loss = 0

            model.logger.info("validating ner")
            ner_results = model.predict(sess, dev_manager)
            eval_lines = test_ner(ner_results, FLAGS.result_path)
            for line in eval_lines:
                model.logger.info(line)
            test_f1 = float(eval_lines[1].strip().split()[-1])
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                model.logger.info("new best f1 score:{:>.3f}".format(test_f1))
                model.logger.info("saving model ...")
                checkpoint_path = os.path.join(FLAGS.model_path, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = test_ner(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
    if singletons is not None:
        words = insert_singletons(words, singletons)
    if parameters['cap_dim']:
        caps = data['caps']
    char_for, char_rev, char_pos = pad_word_chars(chars)
    input = []
    if parameters['word_dim']:
        input.append(words)
    if parameters['char_dim']:
        input.append(char_for)
        if parameters['char_bidirect']:
            input.append(char_rev)
        input.append(char_pos)
    if parameters['cap_dim']:
        input.append(caps)
    if add_label:
        input.append(data['tags'])
    return input


if __name__ == "__main__":
    train_sentences = load_sentences("./data/input.train", True)
    print(train_sentences)

    # create maps if not exist
    _c, char_to_id, id_to_char = char_mapping(train_sentences, True)
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, True)
    train_manager = BatchManager(train_data, 100)
    for batch in train_manager.iter_batch(shuffle=True):
        print(batch[0])
        print(batch[-1])
def main():
    print(args)

    # local
    """
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    EVAL_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    EVAL_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """

    # server
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article_01_new.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title_01_new.txt')
    VALID_X = os.path.join(data_dir, 'train/train.article_000_new.txt')
    VALID_Y = os.path.join(data_dir, 'train/train.title_000_new.txt')
    EVAL_X = os.path.join(data_dir, 'train/train.article_001_new.txt')
    EVAL_Y = os.path.join(data_dir, 'train/train.title_001_new.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    # bert embeddings
    emb_file = 'sumdata/bert-large-uncased.30522.1024d.vec'
    vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 101
    max_tgt_len = 47

    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    # vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)

    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 300, 50, 50, 1200, False).cuda()
    model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 1024,
                                      50, 50, 1200, False, embeddings=embeddings).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    # eval_model(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler, args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)