opt = optimizer.SGDOptimizer(hp.lr)
train_op = opt.minimize(loss)
executor = ad.Executor([loss, train_op], ctx=ctx)

logging.info("# Session")
for ep in range(hp.num_epochs):
    dataloader.make_epoch_data(hp.batch_size)
    for i in tqdm(range(dataloader.batch_num)):
        xs_val, ys_val = dataloader.get_batch()
        # st = time.time()
        xs_val = xs_val[0]
        ys1_val = ys_val[0][:, :-1]
        ys2_val = ys_val[0][:, 1:]
        nonpadding_val = np.not_equal(ys2_val, dataloader.get_pad()).astype(np.float32)
        _loss, _ = executor.run(feed_dict={
            xs: xs_val,
            ys1: ys1_val,
            ys2: ys2_val,
            nonpadding: nonpadding_val,
        })
        # en = time.time()
        # if i == 100:
        #     exit()
        log_str = 'Iteration %d, loss %f' % (i, _loss.asnumpy())
        print(log_str)
        # print('time: ', (en - st))
logging.info("Done")
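# --- Illustrative sketch, not part of the training script above ---
# The feed values implement teacher forcing on a padded target batch:
# the decoder input drops the last token, the decoder target drops the
# first token, and the non-padding mask marks positions that count toward
# the loss. The pad id (0) and the toy batch below are assumptions for
# illustration only.
import numpy as np

ys = np.array([[2, 7, 5, 3, 0, 0]], dtype=np.int32)       # <s> w1 w2 </s> <pad> <pad>
ys1_demo = ys[:, :-1]                                      # decoder input:  [2, 7, 5, 3, 0]
ys2_demo = ys[:, 1:]                                       # decoder target: [7, 5, 3, 0, 0]
mask_demo = np.not_equal(ys2_demo, 0).astype(np.float32)   # [1, 1, 1, 0, 0]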
parser = hparams.parser
hp = parser.parse_args()
print(hp)
# save_hparams(hp, hp.logdir)

logging.info("# Prepare train/eval batches")
dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)

xs = tf.placeholder(name='xs', dtype=tf.int32, shape=[16, 100])
ys1 = tf.placeholder(name='ys1', dtype=tf.int32, shape=[16, 99])
ys2 = tf.placeholder(name='ys2', dtype=tf.int32, shape=[16, 99])

logging.info("# Load model")
m = Transformer(hp)
loss = m.train(xs, (ys1, ys2))
nonpadding = tf.to_float(tf.not_equal(ys2, dataloader.get_pad()))  # 0: <pad>
loss = tf.reduce_sum(loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

global_step = tf.train.get_or_create_global_step()
optimizer = tf.train.GradientDescentOptimizer(hp.lr)
train_op = optimizer.minimize(loss, global_step=global_step)
# y_hat, eval_summaries = m.eval(xs, ys)
# y_hat = m.infer(xs, ys)

logging.info("# Session")
saver = tf.train.Saver(max_to_keep=hp.num_epochs)
with tf.Session() as sess:
    ckpt = tf.train.latest_checkpoint(hp.logdir)
    if ckpt is None:
        logging.info("Initializing from scratch")
        sess.run(tf.global_variables_initializer())
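# --- Illustrative sketch, not part of the TensorFlow script above ---
# The loss normalization above averages the per-token cross entropy only
# over non-padding target positions: sum(loss * mask) / (sum(mask) + 1e-7).
# NumPy equivalent with made-up per-token losses (last two positions are pads):
import numpy as np

tok_loss = np.array([[1.2, 0.8, 0.5, 3.0, 3.0]], dtype=np.float32)  # hypothetical per-token losses
mask = np.array([[1.0, 1.0, 1.0, 0.0, 0.0]], dtype=np.float32)      # 0 where the target is <pad>
masked_mean = np.sum(tok_loss * mask) / (np.sum(mask) + 1e-7)       # (1.2 + 0.8 + 0.5) / 3 ~= 0.833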