def train(args):
    print(args)
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            # print("model learning rate is {}".format(model.lr.eval()))
            data_loader.reset_batch_pointer('train')
            state = model.initial_state.eval()
            for b in xrange(data_loader.ntrain):
                start = time.time()
                x, y = data_loader.next_batch('train')
                # tmp = ''
                # for c in x:
                #     for i in c:
                #         tmp += np.array(data_loader.chars)[i]
                # print(tmp)
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.ntrain + b,
                              args.num_epochs * data_loader.ntrain,
                              e, train_loss, end - start))
                if (e * data_loader.ntrain + b) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.ntrain + b)
                    print("model saved to {}".format(checkpoint_path))

            # eval validation loss
            data_loader.reset_batch_pointer('validation')
            validation_state = model.initial_state.eval()
            val_losses = 0
            for n in xrange(data_loader.nvalidation):
                x, y = data_loader.next_batch('validation')
                feed = {model.input_data: x, model.targets: y, model.initial_state: validation_state}
                validation_loss, validation_state = sess.run([model.cost, model.final_state], feed)
                val_losses += validation_loss
            validation_loss = val_losses / data_loader.nvalidation
            print("validation loss is {}".format(validation_loss))
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'w') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        for e in xrange(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in xrange(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(e * data_loader.num_batches + b,
                            args.num_epochs * data_loader.num_batches,
                            e, train_loss, end - start)
                if (e * data_loader.num_batches + b) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print "model saved to {}".format(checkpoint_path)
def train(args):
    # Step 1: load data
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size
    # print(sys.version)
    # print(data_loader.vocab_size)  # ^^ 65

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    # Step 2: define a model
    model = Model(args)

    # Step 3: define an optimizer
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(model.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(args.learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Step 4: train
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        for e in range(args.num_epochs):
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            # ^^ always starts with a zero-filled state tensor
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                # copy the state of the previous batch
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                train_loss, state, _ = sess.run([model.cost, model.final_state, train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), \
            "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
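# Usage sketch for the resume path above, assuming the surrounding script wires
# the usual char-rnn argparse flags (--data_dir, --save_dir, --init_from):
#   python train.py --data_dir=data/tinyshakespeare --save_dir=save
#   python train.py --data_dir=data/tinyshakespeare --save_dir=save --init_from=save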
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'w') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        train_loss_iterations = {'iteration': [], 'epoch': [], 'train_loss': [], 'val_loss': []}
        for e in xrange(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in xrange(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                batch_idx = e * data_loader.num_batches + b
                print "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(batch_idx, args.num_epochs * data_loader.num_batches,
                            e, train_loss, end - start)
                train_loss_iterations['iteration'].append(batch_idx)
                train_loss_iterations['epoch'].append(e)
                train_loss_iterations['train_loss'].append(train_loss)

                if batch_idx % args.save_every == 0:
                    # evaluate
                    state_val = model.initial_state.eval()
                    avg_val_loss = 0
                    for x_val, y_val in data_loader.val_batches:
                        feed_val = {model.input_data: x_val, model.targets: y_val,
                                    model.initial_state: state_val}
                        # evaluation only: run cost and final_state, not train_op,
                        # so the validation pass does not update the weights
                        val_loss, state_val = sess.run([model.cost, model.final_state], feed_val)
                        avg_val_loss += val_loss / len(data_loader.val_batches)
                    print 'val_loss: {:.3f}'.format(avg_val_loss)
                    train_loss_iterations['val_loss'].append(avg_val_loss)
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print "model saved to {}".format(checkpoint_path)
                else:
                    train_loss_iterations['val_loss'].append(None)

        pd.DataFrame(data=train_loss_iterations, columns=train_loss_iterations.keys()) \
            .to_csv(os.path.join(args.save_dir, 'log.csv'))
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length, args.input_encoding)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)
    merged = tf.summary.merge_all()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            assign_op = model.epoch_pointer.assign(e)
            sess.run(assign_op)
            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y,
                        model.initial_state: state, model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
def train(args):
    args.data_dir = 'data'
    args.save_dir = 'save'
    args.rnn_size = 64
    args.num_layers = 1
    args.num_epochs = 5
    args.batch_size = 50
    args.seq_length = 50
    args.save_every = 1000
    args.grad_clip = 5.
    args.learning_rate = 0.002
    args.decay_rate = 0.97

    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * args.decay_rate ** e))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                train_loss, state, _ = sess.run(
                    [model.cost, model.final_state, model.train_op], feed_dict=feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                # save for the last result
                if e == args.num_epochs - 1 and b == data_loader.num_batches - 1:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
def train(args):
    # Step 1: load data
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    # Step 2: define a model
    model = Model(args)

    # Step 3: define an optimizer
    # YOUR CODE HERE

    # Step 4: train
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        for e in range(args.num_epochs):
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                # copy the state of previous batch
                # YOUR CODE HERE
                train_loss, state, _ = sess.run([model.cost, model.final_state, train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
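# A possible completion of the two "YOUR CODE HERE" gaps above, mirroring the
# fully worked variant earlier in this collection (clipped gradients + Adam,
# then feeding the previous batch's LSTM state back in). args.grad_clip and a
# tuple-structured model.initial_state are assumed from that variant:

# Step 3: define an optimizer
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(model.cost, tvars), args.grad_clip)
optimizer = tf.train.AdamOptimizer(args.learning_rate)
train_op = optimizer.apply_gradients(zip(grads, tvars))

# copy the state of the previous batch (this part goes inside the batch loop)
for i, (c, h) in enumerate(model.initial_state):
    feed[c] = state[i].c
    feed[h] = state[i].h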
def training(self, args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    if os.path.isdir(args.save_dir):
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
    else:
        os.makedirs(args.save_dir)
        ckpt = None

    model = Model(args)

    with tf.Session() as sess:
        # run `tensorboard --logdir=logs` from this directory in cmd and open
        # the URL it prints in Chrome to inspect the network structure
        tf.summary.FileWriter(os.getcwd() + '\\logs', sess.graph)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        now_epochs = 0
        if ckpt:
            saver.restore(sess, ckpt.model_checkpoint_path)
            now_epochs = int(ckpt.model_checkpoint_path.split('-')[1]) // data_loader.num_batches
        count = 0
        for e in range(now_epochs, args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                if count % 5 == 0:
                    percent = (e * data_loader.num_batches + b + 1) / \
                        (args.num_epochs * data_loader.num_batches)
                    print('%' + str(int(percent * 100)) + '|' + '▉' * int(50 * percent)
                          + ' ' * 2 * (50 - int(50 * percent)) + '|')
                count += 1
            checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=(e + 1) * data_loader.num_batches)
        print('%100' + '|' + '▉' * 50 + '|')
    tf.reset_default_graph()
def train():
    loader = TextLoader(DATA_DIR, rnn.BATCH_SIZE, rnn.SEQ_LENGTH)
    vocab_size = loader.vocab_size
    if init_from is not None:
        ckpt = tf.train.get_checkpoint_state(init_from)

    with open(os.path.join(SAVE_DIR + 'conf.pkl'), 'wb') as f:
        cPickle.dump((loader.vocab_size, loader.chars, loader.vocab), f)

    model = rnn.Model(vocab_size, True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        if init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for i in range(NUM_EPOCHS):
            sess.run(tf.assign(model.lr, rnn.learning_rate * (rnn.decay_rate ** i)))
            loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(loader.num_batches):
                curr_batch = i * loader.num_batches + b
                start = time.time()
                x, y = loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for j, s in enumerate(model.initial_state):
                    feed[s] = state[j]
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print(('{0}/{1} (epoch {2}),'
                       ' train_loss = {3:.2f},'
                       ' time/batch = {4:.2f},'
                       ' time_left = {5:.2f}').format(
                          curr_batch, NUM_EPOCHS * loader.num_batches, i,
                          train_loss, end - start,
                          (end - start) / 3600 * (NUM_EPOCHS * loader.num_batches - curr_batch)))
                if curr_batch % 1000 == 0 or (i == NUM_EPOCHS - 1 and b == loader.num_batches - 1):
                    ckpath = os.path.join(SAVE_DIR, 'model.ckpt')
                    saver.save(sess, ckpath, global_step=curr_batch)
                    print('model saved to {}'.format(ckpath))
def train(embeddings, Ture, tag):
    data_loader = TextLoader(4, tag)
    vocab_size = data_loader.vocab_size
    print('vocab_size', vocab_size)
    print('downloading model.......')
    model = Model(embeddings, Ture)
    print('finished downloading model.......')
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('./tf_log/' + tag + '/', sess.graph)
        print('start loop .......')
        for e in range(num_epochs):
            sess.run(tf.assign(model.lr, learning_rate * (decay_rate ** e)))
            data_loader.shuff()
            svm_x = []
            svm_y = []
            for b in range(data_loader.num_batches):
                x, y = data_loader.next_batch(b)
                # print(x.shape)
                # print('trouble shooting tag')
                feed = {model.input_data: x, model.targets: y}
                train_loss, state, _, accuracy, summary_, output3 = sess.run(
                    [model.cost, model.final_state, model.optimizer,
                     model.accuracy, model.merged, model.output3],
                    feed_dict=feed)
                writer.add_summary(summary_, global_step=b)
                print('epoch {}, iteration {}: train_loss = {}, accuracy = {}, process tag is {}'
                      .format(e, b, train_loss, accuracy, tag))
                if e == num_epochs - 1:
                    svm_x.append(np.array(output3))
                    svm_y.append(y)
                if e == num_epochs - 1 and b == data_loader.num_batches - 1:
                    saver.save(sess, 'model_14/' + tag + '/' + tag + '.model')
                    a, b, c, d = np.array(svm_x).shape
                    svm_x = np.array(svm_x).reshape((a * b, c * d))
                    e, f = np.array(svm_y).shape
                    svm_y = np.array(svm_y).reshape((e * f, 1))
                    print(np.array(svm_x).shape)
                    print(np.array(svm_y).shape)
                    np.save('./Peng_second/' + tag + '_x.npy', svm_x)
                    np.save('./Peng_second/' + tag + '_y.npy', svm_y)
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    with open(os.path.join(args.save_dir, 'configure.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        for e in range(args.num_epochs):
            # sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            assign_op = model.epoch_pointer.assign(e)
            sess.run(assign_op)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y,
                        model.initial_state: state, model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                # train_writer.add_summary(summary, e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                '''if (e * data_loader.num_batches + b) % args.save_every == 0 \
def train(args):
    # Load data
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    # Set vocabulary size
    args.vocab_size = data_loader.vocab_size

    # Create the save directory if it does not exist
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # Save the configuration and the vocab, used to reload models when sampling
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    # Create models with arguments
    model = Model(args)

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir, 'models.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("models saved to {}".format(checkpoint_path))

        # Save the final state
        saver.save(sess, os.path.join(args.save_dir, 'models.ckpt'),
                   global_step=args.num_epochs * data_loader.num_batches)
def evaluate(args):
    # TODO: train and test data could have different vocabulary, here just use
    # the train vocab for testing
    with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
        saved_args = cPickle.load(f)
    data_loader = TextLoader(args.data_dir, saved_args.batch_size, saved_args.seq_length)
    model = Model(saved_args, training=True)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            start = time.time()
            state = sess.run(model.initial_state)
            all_sum_mean_loss = 0
            all_count = 0
            for b in range(data_loader.num_batches):
                # both x and y have shape (batch_size, seq_length)
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                # for LSTM each initial_state element is a tuple, for GRU it is a tensor
                if saved_args.model == "lstm":
                    for i, (c, h) in enumerate(model.initial_state):
                        feed[c] = state[i].c
                        feed[h] = state[i].h
                elif saved_args.model == "gru":
                    for i, c in enumerate(model.initial_state):
                        feed[c] = state[i]
                sum_mean_loss, count, _ = sess.run(
                    [model.pp_sum_mean_loss, model.pp_count, model.ppl], feed)
                all_sum_mean_loss += sum_mean_loss
                all_count += count
            print("total perplexity", np.exp(all_sum_mean_loss / all_count))
            end = time.time()
            print("inference time (in seconds):", end - start)
def train(args):
    data_loader = TextLoader(args.batch_size)
    args.poem_length = data_loader.poem_length
    # self.rhymes[ID] = self.rhyme_set.index(rhyme)
    print("Maximal length:", args.poem_length)
    print('finish reading file ...\n')
    args.vocab_size = data_loader.vocab_size
    print("Capture Rules Successfully")

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), \
            "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"
        assert os.path.isfile(os.path.join(args.init_from, "iterations")), \
            "iterations file does not exist in path %s" % args.init_from

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab, saved_rhymes = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"
        assert saved_rhymes == data_loader.rhymes, "Data and loaded model disagree on rhyme mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab, data_loader.rhymes), f)

    model = Model(args)

    def duplicate(x):
        i, j = x.shape
        k = 2
        xx = np.empty([i, j, k])
        for ii in range(i):
            for jj in range(j):
                for kk in range(k):
                    # fill every (ii, jj, kk) position with x[ii, jj]
                    xx[ii, jj, kk] = x[ii, jj]
        return xx

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        iterations = 0
        # restore model and number of iterations
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
            with open(os.path.join(args.save_dir, 'iterations'), 'rb') as f:
                iterations = cPickle.load(f)
        losses = []
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            for b in range(data_loader.num_batches):
                iterations += 1
                start = time.time()
                xdata, ydata, xrhyme, yrhyme = data_loader.next_batch()
                # xx = duplicate(x)
                # yy = duplicate(y)
                # feed = {model.input_data: xx, model.targets: yy}
                feed = {model.input_data: xdata,
                        model.input_rhyme: xrhyme,
                        model.target_data: ydata}
                train_loss, _, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                sys.stdout.write('\r')
                info = "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(e * data_loader.num_batches + b,
                            args.num_epochs * data_loader.num_batches,
                            e, train_loss, end - start)
                sys.stdout.write(info)
                sys.stdout.flush()
                losses.append(train_loss)
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=iterations)
                    with open(os.path.join(args.save_dir, "iterations"), 'wb') as f:
                        cPickle.dump(iterations, f)
                    with open(os.path.join(args.save_dir, "losses-" + str(iterations)), 'wb') as f:
                        cPickle.dump(losses, f)
                    losses = []
                    sys.stdout.write('\n')
                    print("model saved to {}".format(checkpoint_path))
        sys.stdout.write('\n')
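# Aside: the triple loop in duplicate() above can be replaced by one NumPy
# broadcast. A minimal sketch (duplicate_vectorized is a hypothetical name;
# same output shape, no Python loops):
import numpy as np

def duplicate_vectorized(x, k=2):
    # repeat each element of the 2-D array x k times along a new last axis
    return np.repeat(x[:, :, np.newaxis], k, axis=2)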
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), \
            "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        # instrument for tensorboard
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(args.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                weights = sess.run(model.tvars)  # get the hidden weights
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                # instrument for tensorboard; a single run per batch, so the
                # optimizer takes exactly one step
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op], feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)

                # get the current input in character form
                my_dict = dict((y, x) for x, y in data_loader.vocab.items())
                chars = [my_dict[i] for i in list(np.squeeze(x, axis=0))]

                # print the current sequence of characters and weight values
                print("Batch of Characters: ", chars)
                print("Layer 1 Weights.shape: ", weights[3].shape)
                print("Layer 1 Biases.shape: ", weights[4].shape)
                print("Layer 2 Weights.shape: ", weights[5].shape)
                print("Layer 2 Biases.shape: ", weights[6].shape)
                print("Layer 1 Weights: ", weights[3])
                print("Layer 1 Biases: ", weights[4])
                print("Layer 2 Weights: ", weights[5])
                print("Layer 2 Biases: ", weights[6])
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
def train(args):
    data_loader = TextLoader(args.data_path, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size
    args.file_size = data_loader.file_size
    print("Vocab size: ", args.vocab_size)
    print("File size: ", args.file_size)

    args.lower_bound = 0  # if we know the entropy then we set it to this
    data_info = {}
    if args.info_path is not None:
        assert os.path.isfile(args.info_path), \
            "Info file not found in the path: %s" % args.info_path
        # open the info file
        with open(args.info_path, 'rb') as f:
            data_info = json.load(f)
        # assuming we know the entropy
        args.lower_bound = data_info['Entropy']
        print(data_info)

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), \
            "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    ##################################################
    # Get the model
    ##################################################
    model = Model(args)
    print("model Loaded")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        writer = tf.summary.FileWriter(args.summary_dir, sess.graph)
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        ######################################################
        # Perform the training
        ######################################################
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()  # need to check what this does
            state = sess.run(model.initial_state)  # what is this initial state
            cumul_loss = 0
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                summary, train_loss, state, _ = sess.run(
                    [model.merged_summaries, model.cost, model.final_state, model.train_op], feed)
                # what is the training loss
                train_loss /= np.log(2)
                cumul_loss += train_loss
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
                if b % 10 == 0:
                    writer.add_summary(summary, e * data_loader.num_batches + b)

            cumul_loss /= data_loader.num_batches
            print("Epoch {}: Cumulative Loss for the epoch: {:.3f}".format(e, cumul_loss))
            if abs(cumul_loss - args.lower_bound) < 0.1:
                print("Stopping Training as we get a good loss.. :) ... ")
                break

        ##############################################################
        # Append details to the output file
        ##############################################################
        args.epoch_stopped = e + 1
        args.last_epoch_loss = cumul_loss
        with open(args.output_path, 'a') as f:
            params = vars(args)
            params.update(data_info)
            # json.dump(params, f, indent=2)
            cPickle.dump(params, f)
            # f.write("\n ############################################# \n")
        with open(args.output_path + ".json", 'a') as f:
            params = vars(args)
            params.update(data_info)
            json.dump(params, f, indent=2)
            f.write("\n ############################################# \n")
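# Aside: model.cost is a cross-entropy in nats (TensorFlow losses use the
# natural log), so the train_loss /= np.log(2) step above converts it to bits,
# making it comparable with args.lower_bound, an entropy in bits per symbol.
# A minimal self-check of the conversion (the 0.9 is a made-up value):
import numpy as np

loss_nats = 0.9
loss_bits = loss_nats / np.log(2)  # equivalently loss_nats * log2(e)
assert abs(loss_bits - loss_nats * np.log2(np.e)) < 1e-12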
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length, args.input_encoding)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "words_vocab.pkl")), \
            "words_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f:
            saved_words, saved_vocab = cPickle.load(f)
        assert saved_words == data_loader.words, "Data and loaded model disagree on word set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.words, data_loader.vocab), f)

    model = Model(args)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.log_dir)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_writer.add_graph(sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables())
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(model.epoch_pointer.eval(), args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            speed = 0
            if args.init_from is None:
                assign_op = model.epoch_pointer.assign(e)
                sess.run(assign_op)
            if args.init_from is not None:
                data_loader.pointer = model.batch_pointer.eval()
                args.init_from = None
            for b in range(data_loader.pointer, data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y,
                        model.initial_state: state, model.batch_time: speed}
                summary, train_loss, state, _, _ = sess.run(
                    [merged, model.cost, model.final_state,
                     model.train_op, model.inc_batch_pointer_op], feed)
                train_writer.add_summary(summary, e * data_loader.num_batches + b)
                speed = time.time() - start
                if (e * data_loader.num_batches + b) % args.batch_size == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                          .format(e * data_loader.num_batches + b,
                                  args.num_epochs * data_loader.num_batches,
                                  e, train_loss, speed))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
        train_writer.close()
def train(args):
    model_name = args.data_dir.split("/")[-1]

    # make a dir to store checkpoints
    args.save_dir = os.path.join('checkpoints', model_name)
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from a previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), \
            "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        # instrument for tensorboard
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(args.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                # instrument for tensorboard
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op], feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # remove previous checkpoints
                    current_checkpoints = [f for f in os.listdir(args.save_dir)
                                           if os.path.isfile(os.path.join(args.save_dir, f))]
                    for f in current_checkpoints:
                        if model_name in f:
                            os.remove(os.path.join(args.save_dir, f))
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, model_name)
                    saver.save(sess, checkpoint_path, global_step=e * data_loader.num_batches + b)
                    final_model = '{}-{}'.format(model_name, e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))

        # get the vocab
        model_vocab = getModelVocab(model_name)
        # dump the checkpoints to javascript
        dump_checkpoints(model_vocab, model_name, final_model)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_step = optimizer.apply_gradients(zip(grads, tvars))

# open a session and run the training
with tf.Session() as sess:
    # assign initial values to the variables
    sess.run(tf.global_variables_initializer())
    for e in range(num_epochs):
        data_loader.reset_batch_pointer()
        # set the initial state
        state = sess.run(initial_state, feed_dict={state_batch_size: batch_size})
        for b in range(data_loader.num_batches):
            # load the x, y data
            x, y = data_loader.next_batch()
            # apply one-hot encoding to y
            y = tf.one_hot(y, vocab_size)        # y : [batch_size, seq_length, vocab_size]
            y = tf.reshape(y, [-1, vocab_size])  # y : [batch_size * seq_length, vocab_size]
            y = y.eval()
            # set the feed-dict values, plus the LSTM initial cell state
            # (feed_dict[c]) and hidden-layer output (feed_dict[h])
            feed_dict = {input_data: x, target_data: y, state_batch_size: batch_size}
            for i, (c, h) in enumerate(initial_state):
                feed_dict[c] = state[i].c
                feed_dict[h] = state[i].h
            # run one training step
            _, loss_print, state = sess.run([train_step, loss, final_state], feed_dict=feed_dict)
            print("{} (batches trained)/{} (batches total), epoch: {}, loss: {:.3f}".format(
                e * data_loader.num_batches + b,
                num_epochs * data_loader.num_batches, e, loss_print))
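# Aside: calling tf.one_hot(...).eval() inside the batch loop above adds new
# ops to the default graph on every iteration, so each pass gets slower and
# memory grows. A minimal sketch of doing the encoding per batch in NumPy
# instead (one_hot_numpy is a hypothetical helper; vocab_size comes from the
# surrounding script):
import numpy as np

def one_hot_numpy(y, vocab_size):
    # y: int array of shape [batch_size, seq_length]
    flat = y.reshape(-1)
    out = np.zeros((flat.size, vocab_size), dtype=np.float32)
    out[np.arange(flat.size), flat] = 1.0
    return out  # shape [batch_size * seq_length, vocab_size]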
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    load_model = False
    if not os.path.exists(args.save_dir):
        print("Creating directory %s" % args.save_dir)
        os.mkdir(args.save_dir)
    elif os.path.exists(os.path.join(args.save_dir, 'config.pkl')):
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
                saved_args = pickle.load(f)
            args.block_size = saved_args.block_size
            args.num_blocks = saved_args.num_blocks
            args.num_layers = saved_args.num_layers
            args.model = saved_args.model
            print("Found a previous checkpoint. Overwriting model description arguments to:")
            print(" model: {}, block_size: {}, num_blocks: {}, num_layers: {}"
                  .format(saved_args.model, saved_args.block_size,
                          saved_args.num_blocks, saved_args.num_layers))
            load_model = True

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        pickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        pickle.dump((data_loader.chars, data_loader.vocab), f)

    print("Building the model")
    model = Model(args)
    print("Total trainable parameters: {:,d}".format(model.trainable_parameter_count()))

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    config = tf.ConfigProto(log_device_placement=False)
    with tf.Session(config=config) as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(model.save_variables_list(), max_to_keep=3)
        if load_model:
            print("Loading saved parameters")
            saver.restore(sess, ckpt.model_checkpoint_path)
        global_epoch_fraction = sess.run(model.global_epoch_fraction)
        global_seconds_elapsed = sess.run(model.global_seconds_elapsed)
        if load_model:
            print("Resuming from global epoch fraction {:.3f},"
                  " total trained time: {}, learning rate: {}".format(
                      global_epoch_fraction,
                      datetime.timedelta(seconds=float(global_seconds_elapsed)),
                      sess.run(model.lr)))
        if args.set_learning_rate > 0:
            sess.run(tf.assign(model.lr, args.set_learning_rate))
            print("Reset learning rate to {}".format(args.set_learning_rate))
        data_loader.cue_batch_pointer_to_epoch_fraction(global_epoch_fraction)
        initial_batch_step = int(
            (global_epoch_fraction - int(global_epoch_fraction)) * data_loader.total_batch_count)
        epoch_range = (int(global_epoch_fraction),
                       args.num_epochs + int(global_epoch_fraction))
        writer = tf.summary.FileWriter(args.save_dir, graph=tf.get_default_graph())
        outputs = [model.cost, model.final_state, model.train_op, model.summary_op]
        global_step = epoch_range[0] * data_loader.total_batch_count + initial_batch_step
        avg_loss = 0
        avg_steps = 0
        try:
            for e in range(*epoch_range):
                state = sess.run(model.zero_state)
                batch_range = (initial_batch_step, data_loader.total_batch_count)
                initial_batch_step = 0
                for b in range(*batch_range):
                    global_step += 1
                    if global_step % args.decay_steps == 0:
                        current_learning_rate = sess.run(model.lr)
                        current_learning_rate *= args.decay_rate
                        sess.run(tf.assign(model.lr, current_learning_rate))
                        print("Decayed learning rate to {}".format(current_learning_rate))
                    start = time.time()
                    x, y = data_loader.next_batch()
                    feed = {model.input_data: x, model.targets: y}
                    model.add_state_to_feed_dict(feed, state)
                    train_loss, state, _, summary = sess.run(outputs, feed)
                    elapsed = time.time() - start
                    global_seconds_elapsed += elapsed
                    writer.add_summary(summary, e * batch_range[1] + b + 1)
                    if avg_steps < 100:
                        avg_steps += 1
                    avg_loss = 1 / avg_steps * train_loss + (1 - 1 / avg_steps) * avg_loss
                    print("{:,d} / {:,d} (epoch {:.3f} / {}), loss {:.3f} (avg {:.3f}), {:.3f}s"
                          .format(b, batch_range[1], e + b / batch_range[1],
                                  epoch_range[1], train_loss, avg_loss, elapsed))
                    if (e * batch_range[1] + b + 1) % args.save_every == 0 \
                            or (e == epoch_range[1] - 1 and b == batch_range[1] - 1):
                        save_model(sess, saver, model, args.save_dir, global_step,
                                   data_loader.total_batch_count, global_seconds_elapsed)
        except KeyboardInterrupt:
            print()
        finally:
            writer.flush()
            global_step = e * data_loader.total_batch_count + b
            save_model(sess, saver, model, args.save_dir, global_step,
                       data_loader.total_batch_count, global_seconds_elapsed)
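# Aside: the avg_loss update above is a running mean whose window is capped at
# 100 steps; once avg_steps reaches 100 it behaves as an exponential moving
# average with a fixed smoothing factor:
#   avg_loss <- (1/100) * train_loss + (99/100) * avg_loss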
def train(args):
    if args.continue_training in ['True', 'true']:
        args.continue_training = True
    else:
        args.continue_training = False

    data_loader = TextLoader(True, args.utils_dir, args.data_path,
                             args.batch_size, args.seq_length, None, None)
    args.vocab_size = data_loader.vocab_size
    args.label_size = data_loader.label_size

    if args.continue_training:
        assert os.path.isfile(os.path.join(args.save_dir, 'config.pkl')), \
            'config.pkl file does not exist in path %s' % args.save_dir
        assert os.path.isfile(os.path.join(args.utils_dir, 'chars_vocab.pkl')), \
            'chars_vocab.pkl file does not exist in path %s' % args.utils_dir
        assert os.path.isfile(os.path.join(args.utils_dir, 'labels.pkl')), \
            'labels.pkl file does not exist in path %s' % args.utils_dir
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'

        with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
            saved_model_args = pickle.load(f)
        need_be_same = ['model', 'rnn_size', 'num_layers', 'seq_length']
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                'command line argument and saved model disagree on %s' % checkme

        with open(os.path.join(args.utils_dir, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = pickle.load(f)
        with open(os.path.join(args.utils_dir, 'labels.pkl'), 'rb') as f:
            saved_labels = pickle.load(f)
        assert saved_chars == data_loader.chars, 'data and loaded model disagree on character set'
        assert saved_vocab == data_loader.vocab, 'data and loaded model disagree on dictionary mappings'
        assert saved_labels == data_loader.labels, 'data and loaded model disagree on label dictionary mappings'

    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        pickle.dump(args, f)
    with open(os.path.join(args.utils_dir, 'chars_vocab.pkl'), 'wb') as f:
        pickle.dump((data_loader.chars, data_loader.vocab), f)
    with open(os.path.join(args.utils_dir, 'labels.pkl'), 'wb') as f:
        pickle.dump(data_loader.labels, f)

    model = Model(args)

    with tf.Session() as sess:
        init = tf.initialize_all_variables()
        sess.run(init)
        saver = tf.train.Saver(tf.all_variables())
        if args.continue_training:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                train_loss, state, _, accuracy = sess.run(
                    [model.cost, model.final_state, model.optimizer, model.accuracy],
                    feed_dict=feed)
                end = time.time()
                print '{}/{} (epoch {}), train_loss = {:.3f}, accuracy = {:.3f}, time/batch = {:.3f}' \
                    .format(e * data_loader.num_batches + b + 1,
                            args.num_epochs * data_loader.num_batches,
                            e + 1, train_loss, accuracy, end - start)
                if (e * data_loader.num_batches + b + 1) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b + 1)
                    print 'model saved to {}'.format(checkpoint_path)
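The string comparison above works because argparse hands --continue_training over as text. A more robust variant (a minimal sketch, not from this codebase) registers a converter so common spellings parse and typos fail loudly:

import argparse

def str2bool(v):
    # Map common spellings to a real bool; reject anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got %r' % v)

parser = argparse.ArgumentParser()
parser.add_argument('--continue_training', type=str2bool, default=False)
print(parser.parse_args(['--continue_training', 'true']).continue_training)  # True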
def train(args):
    model_name = args.data_dir.split("/")[-1]
    # make a dir to store checkpoints
    args.save_dir = os.path.join(args.save_checkpoints, model_name)
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # check compatibility if training is continued from previously saved model
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(args.init_from), " %s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), \
            "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s' " % checkme

        # open saved vocab/dict and check if vocabs/dicts are compatible
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    with tf.Session() as sess:
        # instrument for tensorboard
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(args.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        # restore model
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate**e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                # instrument for tensorboard
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op], feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # remove previous checkpoints
                    current_checkpoints = [
                        f for f in os.listdir(args.save_dir)
                        if os.path.isfile(os.path.join(args.save_dir, f))]
                    for f in current_checkpoints:
                        if model_name in f:
                            os.remove(os.path.join(args.save_dir, f))
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    final_model = '{}-{}'.format(model_name, e * data_loader.num_batches + b)
                    print("Model saved to {}!".format(checkpoint_path))

    # get the vocab
    model_vocab = getModelVocab(args.save_checkpoints, model_name)
    # dump the checkpoints to javascript
    dump_checkpoints(args.save_checkpoints, args.save_model, model_vocab,
                     model_name, final_model)
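The variant above prunes old checkpoint files by hand with os.remove before each save. tf.train.Saver can do the same bookkeeping itself via its max_to_keep argument; a minimal sketch (keeping a single file reproduces the manual-deletion behaviour):

import tensorflow as tf

step = tf.Variable(0, name='step')  # stand-in for the real model variables
# The Saver deletes older checkpoint files automatically once the limit is hit.
saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)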
def train(args): # Load dataset. data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size # Checkpoint state. ckpt = None # Check if training can be continued from previously saved model. if args.init_from is not None: # Assert all necessary files exists. assert os.path.isdir(args.init_from), "{} doesn't exist.".format( args.init_from) assert os.path.exists(os.path.join(args.init_from, "config.pkl")), \ "config.pkl doesn't exist in path {}".format(args.init_from) assert os.path.exists(os.path.join(args.init_from, "chars_vocab.pkl")), \ "chars_vocab.pkl doesn't exist in path {}".format(args.init_from) # Get the state of checkpoint to be loaded. ckpt = tf.train.get_checkpoint_state(args.init_from) assert ckpt, "No checkpoint found!" assert ckpt.model_checkpoint_path, "model.ckpt-* not found in path {}".format( args.init_from) # Open config file and verify model compatibility. with open(os.path.join(args.init_from, "config.pkl"), mode="rb") as f: saved_model_args = pickle.load(f) # List of meta data that needs to be the same need_be_same = ["model", "rnn_size", "num_layers", "seq_length"] for check_me in need_be_same: assert vars(saved_model_args)[check_me] == vars(args)[check_me], \ "Saved model & command line arguments of {} aren't compatible!".format(check_me) # Load saved chars & vocab and check for compatibility. with open(os.path.join(args.init_from, "chars_vocab.pkl"), mode="rb") as f: saved_chars, saved_vocab = pickle.load(f) assert saved_chars == data_loader.chars, "Data and character set aren't compatible!" assert saved_vocab == data_loader.vocab, "Data and loaded dictionary mappings aren't compatible!" # Create save directory if it doesn't exist. if not os.path.isdir(args.save_dir): os.makedirs(args.save_dir) # Write the command line arguments into config file. with open(os.path.join(args.save_dir, "config.pkl"), mode="wb") as f: pickle.dump(args, f) # Save character set and dictionary mappings with open(os.path.join(args.save_dir, "chars_vocab.pkl"), mode="wb") as f: pickle.dump((data_loader.chars, data_loader.vocab), f) # Define the model. model = Model(args, training=True) # Start TensorFlow session. (with the default graph). with tf.Session() as sess: # Summary for Tensorboard. summaries = tf.summary.merge_all() writer = tf.summary.FileWriter(os.path.join( args.logdir, time.strftime("%Y-%m-%d-%H-%M-%S-%p")), graph=sess.graph) writer.add_graph(graph=sess.graph) # Initialize global variables. sess.run(tf.global_variables_initializer()) # Saver object for all global variables. saver = tf.train.Saver(var_list=tf.global_variables()) # Restore model from checkpoint. if args.init_from is not None: saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path) # TRAINING LOOP. for epoch in range(args.num_epochs): # NOTE: Surrounded with try-except in case training was force-stopped. try: # Update Model's learning rate. sess.run( tf.assign(model.lr, value=args.learning_rate * (args.decay_rate**epoch))) # Reset mini batch pointer. data_loader.reset_batch_pointer() # Initial state. state = sess.run(model.initial_state) for batch in range(data_loader.num_batches): # Record start time for current batch. start = time.time() # Get the next mini batch. X, y = data_loader.next_batch() feed_dict = {model.input_data: X, model.targets: y} for i, (c, h) in enumerate(model.initial_state): feed_dict[c] = state[i].c feed_dict[h] = state[i].h # Train the model. 
_, _loss, _global, _summary, state = sess.run( [ model.train_op, model.loss, model.global_step, summaries, model.final_state ], feed_dict=feed_dict) writer.add_summary(summary=_summary, global_step=_global) end = time.time() batch_count = epoch * data_loader.num_batches + batch # Log progress. print( "\r{:,} of {:,} | global: {:,} Loss: {} time/batch: {}" .format(batch_count, args.num_epochs * data_loader.num_batches, _global, _loss, end - start), end="") # Save model at intervals. if batch_count % args.save_every == 0 or ( epoch == args.num_epochs - 1 and batch == data_loader.num_batches - 1): save_path = os.path.join(args.save_dir, "model.ckpt") saver.save(sess=sess, save_path=save_path, global_step=model.global_step) print("\nModel saved to {}\n".format(save_path)) """# !- end batch""" except KeyboardInterrupt: print('\nTraining interrupted by user. Saving...') save_path = os.path.join(args.save_dir, "model.ckpt") saver.save(sess=sess, save_path=save_path, global_step=model.global_step) print("Model saved to {}\n".format(save_path)) # End training. break # !- end epoch print("\n\nOverall training count = {}".format( sess.run(model.global_step)))
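The try/except wrapping above is what lets a Ctrl-C still leave a usable checkpoint on disk. The same idea in isolation (a sketch with stand-in callables, not this repo's API):

def run_training(num_epochs, train_one_epoch, save_checkpoint):
    # Save on normal completion, on Ctrl-C, and on any other error.
    try:
        for epoch in range(num_epochs):
            train_one_epoch(epoch)
    except KeyboardInterrupt:
        print('\nTraining interrupted by user. Saving...')
    finally:
        save_checkpoint()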
def train(args): print(args) data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length, args.training_data_ratio) args.vocab_size = data_loader.vocab_size with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f: cPickle.dump(args, f) with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f: cPickle.dump((data_loader.chars, data_loader.vocab), f) model = Model(args) #sess = tf.InteractiveSession() with tf.Session() as sess: tf.initialize_all_variables().run() saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() summary_writer = tf.train.SummaryWriter('/tmp', sess.graph) step = 0 for e in range(args.num_epochs): sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e))) #print("model learning rate is {}".format(model.lr.eval())) data_loader.reset_batch_pointer('train') state = model.initial_state.eval() for b in xrange(data_loader.ntrain): start = time.time() x, y = data_loader.next_batch('train') feed = {model.input_data: x, model.targets: y, model.initial_state: state} train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed) end = time.time() step = e * data_loader.ntrain + b print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \ .format(step, args.num_epochs * data_loader.ntrain, e, train_loss, end - start)) if step % args.write_summary_every == 0: # training loss summary_str = sess.run(summary_op, feed_dict=feed) summary_writer.add_summary(summary_str, step) if step % args.save_every == 0 or (step + 1) == (args.num_epochs * data_loader.ntrain): # eval validation loss data_loader.reset_batch_pointer('validation') validation_state = model.initial_state.eval() val_losses = 0 for n in xrange(data_loader.nvalidation): x, y = data_loader.next_batch('validation') val_feed = {model.input_data: x, model.targets: y, model.initial_state: validation_state} validation_loss, validation_state = sess.run([model.cost, model.final_state], val_feed) val_losses += validation_loss validation_loss = val_losses / data_loader.nvalidation print("validation loss is {}".format(validation_loss)) # write top 5 validation loss to a json file args_dict = vars(args) args_dict['step'] = step val_loss_file = args.save_dir + '/val_loss.json' loss_json = '' save_new_checkpoint = False time_int = int(time.time()) args_dict['checkpoint_path'] = os.path.join(args.save_dir, 'model.ckpt-'+str(time_int)) if os.path.exists(val_loss_file): with open(val_loss_file, "r") as text_file: text = text_file.read() if text == '': loss_json = {validation_loss: args_dict} save_new_checkpoint = True else: loss_json = json.loads(text) losses = loss_json.keys() if len(losses) > 3: losses.sort(key=lambda x: float(x), reverse=True) loss = losses[0] if validation_loss < float(loss): to_be_remove_ckpt_file_path = loss_json[loss]['checkpoint_path'] to_be_remove_ckpt_meta_file_path = to_be_remove_ckpt_file_path + '.meta' print("removed checkpoint {}".format(to_be_remove_ckpt_file_path)) if os.path.exists(to_be_remove_ckpt_file_path): os.remove(to_be_remove_ckpt_file_path) if os.path.exists(to_be_remove_ckpt_meta_file_path): os.remove(to_be_remove_ckpt_meta_file_path) del(loss_json[loss]) loss_json[validation_loss] = args_dict save_new_checkpoint = True else: loss_json[validation_loss] = args_dict save_new_checkpoint = True else: loss_json = {validation_loss: args_dict} save_new_checkpoint = True if save_new_checkpoint: checkpoint_path = 
os.path.join(args.save_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step = time_int) print("model saved to {}".format(checkpoint_path + '-' + str(time_int))) with open(val_loss_file, "w") as text_file: json.dump(loss_json, text_file)
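The JSON bookkeeping above keeps only the best few checkpoints by validation loss and evicts the worst when a better one arrives. The same idea in compact form (a sketch; the file layout and names are assumed, not the script's exact logic):

import json, os

def record_checkpoint(val_loss_file, loss, ckpt_path, keep=4):
    best = {}
    if os.path.exists(val_loss_file):
        with open(val_loss_file) as f:
            text = f.read()
            best = json.loads(text) if text else {}
    best[str(loss)] = ckpt_path
    # Drop the worst entries until only `keep` remain; a real implementation
    # would also os.remove() the evicted checkpoint files, as above.
    while len(best) > keep:
        del best[max(best, key=float)]
    with open(val_loss_file, 'w') as f:
        json.dump(best, f)
    return ckpt_path in best.values()   # False if the new model didn't make the cut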
def cross_validation(args): data_loader = TextLoader(args.utils_dir, args.data_path, args.batch_size, args.seq_length, None, None) args.vocab_size = data_loader.vocab_size args.label_size = data_loader.label_size with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f: pickle.dump(args, f) with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f: pickle.dump((data_loader.chars, data_loader.vocab), f) with open(os.path.join(args.save_dir, 'labels.pkl'), 'wb') as f: pickle.dump(data_loader.labels, f) data = data_loader.tensor.copy() np.random.shuffle(data) data_list = np.array_split(data, 10, axis=0) model = Model(args) accuracy_list = [] with tf.Session() as sess: for n in range(10): init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver(tf.all_variables()) test_data = data_list[n].copy() train_data = np.concatenate(map(lambda i: data_list[i], [j for j in range(10) if j!=n]), axis=0) data_loader.tensor = train_data for e in range(args.num_epochs): sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e))) data_loader.reset_batch_pointer() for b in range(data_loader.num_batches): start = time.time() state = model.initial_state.eval() x, y = data_loader.next_batch() feed = {model.input_data: x, model.targets: y, model.initial_state: state} train_loss, state, _, accuracy = sess.run([model.cost, model.final_state, model.optimizer, model.accuracy], feed_dict=feed) end = time.time() print '{}/{} (epoch {}), train_loss = {:.3f}, accuracy = {:.3f}, time/batch = {:.3f}'\ .format(e * data_loader.num_batches + b + 1, args.num_epochs * data_loader.num_batches, e + 1, train_loss, accuracy, end - start) if (e*data_loader.num_batches+b+1) % args.save_every == 0 \ or (e==args.num_epochs-1 and b==data_loader.num_batches-1): checkpoint_path = os.path.join(args.save_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=e*data_loader.num_batches+b+1) print 'model saved to {}'.format(checkpoint_path) n_chunks = len(test_data) / args.batch_size if len(test_data) % args.batch_size: n_chunks += 1 test_data_list = np.array_split(test_data, n_chunks, axis=0) correct_total = 0.0 num_total = 0.0 for m in range(n_chunks): start = time.time() x = test_data_list[m][:, :-1] y = test_data_list[m][:, -1] results = model.predict_class(sess, x) correct_num = np.sum(results==y) end = time.time() correct_total += correct_num num_total += len(x) accuracy_total = correct_total / num_total accuracy_list.append(accuracy_total) print 'total_num = {}, total_accuracy = {:.6f}'.format(int(num_total), accuracy_total) accuracy_average = np.average(accuracy_list) print 'The average accuracy of cross_validation is {}'.format(accuracy_average)
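cross_validation above shuffles the data tensor and rotates one of ten chunks out as the test split each round. The core split, written so it also runs under Python 3 (np.concatenate needs a sequence there, hence the list comprehension rather than map):

import numpy as np

data = np.arange(100).reshape(50, 2)          # stand-in for data_loader.tensor
np.random.shuffle(data)
folds = np.array_split(data, 10, axis=0)
for n in range(10):
    test_data = folds[n]
    train_data = np.concatenate([folds[j] for j in range(10) if j != n], axis=0)
    assert len(test_data) + len(train_data) == len(data)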
def train(args): # Data Preparation # ==================================== data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size print("Number of sentences: {}".format(data_loader.num_data)) print("Vocabulary size: {}".format(args.vocab_size)) # Check compatibility if training is continued from previously saved model if args.init_from is not None: # check if all necessary files exist assert os.path.isdir( args.init_from), " %s must be a path" % args.init_from assert os.path.isfile( os.path.join(args.init_from, "config.pkl") ), "config.pkl file does not exist in path %s" % args.init_from assert os.path.isfile( os.path.join(args.init_from, "words_vocab.pkl") ), "words_vocab.pkl file does not exist in path %s" % args.init_from ckpt = tf.train.get_checkpoint_state(args.init_from) assert ckpt, "No checkpoint found" assert ckpt.model_checkpoint_path, "No model path found in checkpoint" # open old config and check if models are compatible with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f: saved_model_args = pickle.load(f) need_be_same = ["rnn_size", "num_layers", "seq_length"] for checkme in need_be_same: assert vars(saved_model_args)[checkme] == vars( args )[checkme], "Command line argument and saved model disagree on '%s' " % checkme # open saved vocab/dict and check if vocabs/dicts are compatible with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f: saved_words, saved_vocab = pickle.load(f) assert saved_words == data_loader.words, "Data and loaded model disagree on word set!" assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!" if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f: pickle.dump(args, f) with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f: pickle.dump((data_loader.words, data_loader.vocab), f) """ embedding_matrix = get_vocab_embedding(args.save_dir, data_loader.words, args.embedding_file) print("Embedding matrix shape:",embedding_matrix.shape) """ # Training # ==================================== with tf.Graph().as_default(): with tf.Session(config=tf.ConfigProto(gpu_options=options)) as sess: model = BasicLSTM(args) # Define training procedure global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.AdamOptimizer(args.learning_rate) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(model.cost, tvars), args.grad_clip) train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) # Keep track of gradient values and sparsity grad_summaries = [] for g, v in zip(grads, tvars): if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) # Summary for loss loss_summary = tf.summary.scalar("loss", model.cost) # Train summaries merged = tf.summary.merge_all() if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) train_writer = tf.summary.FileWriter(args.log_dir, sess.graph) # saver = tf.train.Saver(tf.global_variables()) saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_model_keep) # Initialize all variables sess.run(tf.global_variables_initializer()) # Restore model if args.init_from is not None: saver.restore(sess, ckpt.model_checkpoint_path) #
Start training print("Start training") valLoss_opt = 100000000.0 lastStripValLoss = 100000000.0 successiveHit = 0 trainLossList = list() largestGL = [-1000, 0] largestPG = [-1000, 0] largestUP = [0, 0] total_start_time = time.time() for epoch in range(args.num_epochs): data_loader.reset_batch_pointer() state = sess.run(model.initial_state) for i in range(data_loader.num_batches): start = time.time() #training x_batch, y_batch = data_loader.next_batch() feed_dict = { model.x: x_batch, model.y: y_batch, model.keep_prob: args.keep_prob } _, step, summary, loss, equal = sess.run([ train_op, global_step, merged, model.cost, model.equal ], feed_dict) print( "training step {}, epoch {}, batch {}/{}, loss: {:.4f}, accuracy: {:.4f}, time/batch: {:.3f}" .format(step, epoch, i, data_loader.num_batches, loss, np.mean(equal), time.time() - start)) train_writer.add_summary(summary, step) trainLossList.append(loss) current_step = tf.train.global_step(sess, global_step) #validing if current_step % args.check_strip_length == 0 and current_step > 0 and epoch > 0: start = time.time() x_batch_valid, y_batch_valid = data_loader.get_first_batch_as_valid( ) total_valid_loss = 0.0 total_valid_equal = 0.0 for index in range(len(x_batch_valid)): feed_dict_valid = { model.x: x_batch_valid[index], model.y: y_batch_valid[index], model.keep_prob: args.keep_prob } valid_loss, valid_equal = sess.run( [model.cost, model.equal], feed_dict_valid) total_valid_loss += valid_loss total_valid_equal += valid_equal total_valid_loss /= len(x_batch_valid) total_valid_equal /= len(x_batch_valid) print( "================================= step {} ===================================" .format(step)) print( "validing step {}, epoch {}, loss: {:.4f}, accuracy: {:.4f}, time/batch: {:.3f}" .format(step, epoch, total_valid_loss, np.mean(total_valid_equal), time.time() - start)) _GL = checkEarlyStopGL(total_valid_loss, valLoss_opt) _PG = checkEarlyStopPQ(total_valid_loss, valLoss_opt, trainLossList) _UP = checkEarlyStopUP(total_valid_loss, lastStripValLoss, successiveHit) if _GL > largestGL[0]: largestGL[0] = _GL largestGL[1] = current_step if _PG > largestPG[0]: largestPG[0] = _PG largestPG[1] = current_step if _UP > largestUP[0]: largestUP[0] = _UP largestUP[1] = current_step print("Criteria GL : " + str(_GL)) print("Criteria PG : " + str(_PG)) print("Criteria UP : " + str(_UP)) print( "==============================================================================" ) #save model #check GL criteria if _GL > args.GL_threshold0: args.GL_threshold0 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_GL0.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved GL0 model checkpoint to {}".format( path)) print2LogFile(args, "GL0", current_step, epoch, total_start_time) if _GL > args.GL_threshold1: args.GL_threshold1 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_GL1.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved GL1 model checkpoint to {}".format( path)) print2LogFile(args, "GL1", current_step, epoch, total_start_time) if _GL > args.GL_threshold2: args.GL_threshold2 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_GL2.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved GL2 model checkpoint to {}".format( path)) print2LogFile(args, "GL2", current_step, epoch, total_start_time) if _GL > args.GL_threshold3: args.GL_threshold3 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 
'model_GL3.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved GL3 model checkpoint to {}".format( path)) print2LogFile(args, "GL3", current_step, epoch, total_start_time) #check PG criteria if _PG > args.PG_threshold0: args.PG_threshold0 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_PG0.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved PG0 model checkpoint to {}".format( path)) print2LogFile(args, "PG0", current_step, epoch, total_start_time) if _PG > args.PG_threshold1: args.PG_threshold1 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_PG1.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved PG1 model checkpoint to {}".format( path)) print2LogFile(args, "PG1", current_step, epoch, total_start_time) if _PG > args.PG_threshold2: args.PG_threshold2 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_PG2.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved PG2 model checkpoint to {}".format( path)) print2LogFile(args, "PG2", current_step, epoch, total_start_time) if _PG > args.PG_threshold3: args.PG_threshold3 = 10000000.0 checkpoint_path = os.path.join( args.save_dir, 'model_PG3.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved PG3 model checkpoint to {}".format( path)) print2LogFile(args, "PG3", current_step, epoch, total_start_time) #check UP criteria if _UP > args.UP_threshold0: args.UP_threshold0 = 1000 checkpoint_path = os.path.join( args.save_dir, 'model_UP0.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved UP0 model checkpoint to {}".format( path)) print2LogFile(args, "UP0", current_step, epoch, total_start_time) if _UP > args.UP_threshold1: args.UP_threshold1 = 1000 checkpoint_path = os.path.join( args.save_dir, 'model_UP1.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved UP1 model checkpoint to {}".format( path)) print2LogFile(args, "UP1", current_step, epoch, total_start_time) if _UP > args.UP_threshold2: args.UP_threshold2 = 1000 checkpoint_path = os.path.join( args.save_dir, 'model_UP2.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved UP2 model checkpoint to {}".format( path)) print2LogFile(args, "UP2", current_step, epoch, total_start_time) if _UP > args.UP_threshold3: args.UP_threshold3 = 1000 checkpoint_path = os.path.join( args.save_dir, 'model_UP3.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved UP3 model checkpoint to {}".format( path)) print2LogFile(args, "UP3", current_step, epoch, total_start_time) #setting variables if total_valid_loss < valLoss_opt: valLoss_opt = total_valid_loss lastStripValLoss = total_valid_loss successiveHit = _UP trainLossList = list() # current_step = tf.train.global_step(sess, global_step) if current_step % args.save_every == 0 or ( epoch == args.num_epochs - 1 and i == data_loader.num_batches - 1): #save for the last result checkpoint_path = os.path.join(args.save_dir, 'model.ckpt') path = saver.save(sess, checkpoint_path, global_step=current_step) print("Saved model checkpoint to {}".format(path)) printEarlyStopLog2File(args, largestGL, largestPG, largestUP, current_step, epoch, total_start_time) print("print early stop log to file : " + args.earlyStop_log_filename) print("cost time : " + str(time.time() - total_start_time) + " secs") #reset largestGL = [-1000, 0] 
largestPG = [-1000, 0] largestUP = [0, 0] train_writer.close()
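The checkEarlyStopGL/PG/UP helpers used above are not shown in this file; their names match the stopping criteria from Prechelt's "Early Stopping -- But When?" (generalization loss, progress-based quotient, and successive validation-loss increases). A plausible sketch of the simplest of the three, under that assumption:

def generalization_loss(val_loss, best_val_loss):
    # GL(t): relative increase of the validation loss over the best value
    # seen so far, in percent; stop when it exceeds a threshold.
    return 100.0 * (val_loss / best_val_loss - 1.0)

print(generalization_loss(1.1, 1.0))   # ~10.0: validation loss 10% above its best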
def main(_): pp.pprint(FLAGS.__flags) if not os.path.exists(FLAGS.checkpoint_dir): print(" [*] Creating checkpoint directory...") os.makedirs(FLAGS.checkpoint_dir) data_loader = TextLoader(os.path.join(FLAGS.data_dir, FLAGS.dataset_name), FLAGS.batch_size, FLAGS.seq_length) vocab_size = data_loader.vocab_size with tf.variable_scope(FLAGS.dataset_name): train_model = CharRNN(vocab_size, FLAGS.batch_size, FLAGS.rnn_size, FLAGS.layer_depth, FLAGS.num_units, FLAGS.rnn_type, FLAGS.seq_length, FLAGS.keep_prob, FLAGS.grad_clip) with tf.variable_scope(FLAGS.dataset_name, reuse=True): valid_model = CharRNN(vocab_size, FLAGS.batch_size, FLAGS.rnn_size, FLAGS.layer_depth, FLAGS.num_units, FLAGS.rnn_type, FLAGS.seq_length, FLAGS.keep_prob, FLAGS.grad_clip) with tf.Session() as sess: tf.global_variables_initializer().run() train_model.load(sess, FLAGS.checkpoint_dir, FLAGS.dataset_name) best_val_pp = float('inf') best_val_epoch = 0 valid_loss = 0 valid_perplexity = 0 start = time.time() if FLAGS.export: print("Eval...") final_embeddings = train_model.embedding.eval(sess) emb_file = os.path.join(FLAGS.data_dir, FLAGS.dataset_name, 'emb.npy') print("Embedding shape: {}".format(final_embeddings.shape)) np.save(emb_file, final_embeddings) else: if not os.path.exists(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) with open( FLAGS.log_dir + "/" + FLAGS.dataset_name + "_hyperparams.pkl", 'wb') as f: cPickle.dump(FLAGS.__flags, f) for e in range(FLAGS.num_epochs): data_loader.reset_batch_pointer() sess.run(tf.assign(train_model.lr, FLAGS.learning_rate)) FLAGS.learning_rate /= 2 for b in range(data_loader.num_batches): x, y = data_loader.next_batch() res, time_batch = run_minibatches(sess, x, y, train_model) train_loss = res["loss"] train_perplexity = np.exp(train_loss) print( "{}/{} (epoch {}) loss = {:.2f}({:.2f}) perplexity(train/valid) = {:.2f}({:.2f}) time/batch = {:.2f} chars/sec = {:.2f}k" \ .format(data_loader.pointer, data_loader.num_batches, e, train_loss, valid_loss, train_perplexity, valid_perplexity, time_batch, (FLAGS.batch_size * FLAGS.seq_length) / time_batch / 1000)) valid_loss = 0 for vb in range(data_loader.num_valid_batches): res, valid_time_batch = run_minibatches( sess, data_loader.x_valid[vb], data_loader.y_valid[vb], valid_model, False) valid_loss += res["loss"] valid_loss = valid_loss / data_loader.num_valid_batches valid_perplexity = np.exp(valid_loss) print("### valid_perplexity = {:.2f}, time/batch = {:.2f}". format(valid_perplexity, valid_time_batch)) if valid_perplexity < best_val_pp: best_val_pp = valid_perplexity best_val_epoch = e train_model.save(sess, FLAGS.checkpoint_dir, FLAGS.dataset_name) print("model saved to {}".format(FLAGS.checkpoint_dir)) if e - best_val_epoch > FLAGS.early_stopping: print('Total time: {}'.format(time.time() - start)) break
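The early stopping above compares validation perplexity across epochs. Perplexity is just the exponential of the mean cross-entropy in nats, so tracking it is equivalent to tracking the loss on a more interpretable scale:

import numpy as np

def perplexity(mean_nll_nats):
    return np.exp(mean_nll_nats)

print(perplexity(0.0))           # 1.0 -- a perfect model
print(perplexity(np.log(65)))    # ~65 -- uniform guessing over a 65-character vocabulary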
def train(args): # Create the data_loader object, which loads up all of our batches, vocab dictionary, etc. # from utils.py (and creates them if they don't already exist). # These files go in the data directory. data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size load_model = False if not os.path.exists(args.save_dir): print("Creating directory %s" % args.save_dir) os.mkdir(args.save_dir) elif (os.path.exists(os.path.join(args.save_dir, 'config.pkl'))): # Trained model already exists ckpt = tf.train.get_checkpoint_state(args.save_dir) if ckpt and ckpt.model_checkpoint_path: with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f: saved_args = pickle.load(f) args.block_size = saved_args.block_size args.num_blocks = saved_args.num_blocks args.num_layers = saved_args.num_layers args.model = saved_args.model print("Found a previous checkpoint. Overwriting model description arguments to:") print(" model: {}, block_size: {}, num_blocks: {}, num_layers: {}".format( saved_args.model, saved_args.block_size, saved_args.num_blocks, saved_args.num_layers)) load_model = True # Save all arguments to config.pkl in the save directory -- NOT the data directory. with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f: pickle.dump(args, f) # Save a tuple of the characters list and the vocab dictionary to chars_vocab.pkl in # the save directory -- NOT the data directory. with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f: pickle.dump((data_loader.chars, data_loader.vocab), f) # Create the model! print("Building the model") model = Model(args) print("Total trainable parameters: {:,d}".format(model.trainable_parameter_count())) # Make tensorflow less verbose; filter out info (1+) and warnings (2+) but not errors (3). os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' config = tf.ConfigProto(log_device_placement=False) # config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: tf.global_variables_initializer().run() saver = tf.train.Saver(model.save_variables_list(), max_to_keep=3) if (load_model): print("Loading saved parameters") saver.restore(sess, ckpt.model_checkpoint_path) global_epoch_fraction = sess.run(model.global_epoch_fraction) global_seconds_elapsed = sess.run(model.global_seconds_elapsed) if load_model: print("Resuming from global epoch fraction {:.3f}," " total trained time: {}, learning rate: {}".format( global_epoch_fraction, datetime.timedelta(seconds=float(global_seconds_elapsed)), sess.run(model.lr))) if (args.set_learning_rate > 0): sess.run(tf.assign(model.lr, args.set_learning_rate)) print("Reset learning rate to {}".format(args.set_learning_rate)) data_loader.cue_batch_pointer_to_epoch_fraction(global_epoch_fraction) initial_batch_step = int((global_epoch_fraction - int(global_epoch_fraction)) * data_loader.total_batch_count) epoch_range = (int(global_epoch_fraction), args.num_epochs + int(global_epoch_fraction)) writer = tf.summary.FileWriter(args.save_dir, graph=tf.get_default_graph()) outputs = [model.cost, model.final_state, model.train_op, model.summary_op] global_step = epoch_range[0] * data_loader.total_batch_count + initial_batch_step avg_loss = 0 avg_steps = 0 try: for e in range(*epoch_range): # e iterates through the training epochs. # Reset the model state, so it does not carry over from the end of the previous epoch. 
state = sess.run(model.zero_state) batch_range = (initial_batch_step, data_loader.total_batch_count) initial_batch_step = 0 for b in range(*batch_range): global_step += 1 if global_step % args.decay_steps == 0: # Set the model.lr element of the model to track # the appropriately decayed learning rate. current_learning_rate = sess.run(model.lr) current_learning_rate *= args.decay_rate sess.run(tf.assign(model.lr, current_learning_rate)) print("Decayed learning rate to {}".format(current_learning_rate)) start = time.time() # Pull the next batch inputs (x) and targets (y) from the data loader. x, y = data_loader.next_batch() # feed is a dictionary of variable references and respective values for initialization. # Initialize the model's input data and target data from the batch, # and initialize the model state to the final state from the previous batch, so that # model state is accumulated and carried over between batches. feed = {model.input_data: x, model.targets: y} model.add_state_to_feed_dict(feed, state) # Run the session! Specifically, tell TensorFlow to compute the graph to calculate # the values of cost, final state, and the training op. # Cost is used to monitor progress. # Final state is used to carry over the state into the next batch. # Training op is not used, but we want it to be calculated, since that calculation # is what updates parameter states (i.e. that is where the training happens). train_loss, state, _, summary = sess.run(outputs, feed) elapsed = time.time() - start global_seconds_elapsed += elapsed writer.add_summary(summary, e * batch_range[1] + b + 1) if avg_steps < 100: avg_steps += 1 avg_loss = 1 / avg_steps * train_loss + (1 - 1 / avg_steps) * avg_loss print("{:,d} / {:,d} (epoch {:.3f} / {}), loss {:.3f} (avg {:.3f}), {:.3f}s" \ .format(b, batch_range[1], e + b / batch_range[1], epoch_range[1], train_loss, avg_loss, elapsed)) # Every save_every batches, save the model to disk. # Only the three most recent checkpoint files are kept (max_to_keep=3 above). if (e * batch_range[1] + b + 1) % args.save_every == 0 \ or (e == epoch_range[1] - 1 and b == batch_range[1] - 1): save_model(sess, saver, model, args.save_dir, global_step, data_loader.total_batch_count, global_seconds_elapsed) except KeyboardInterrupt: # Introduce a line break after ^C is displayed so save message # is on its own line. print() finally: writer.flush() global_step = e * data_loader.total_batch_count + b save_model(sess, saver, model, args.save_dir, global_step, data_loader.total_batch_count, global_seconds_elapsed)
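The avg_loss update above is a running mean whose window freezes at 100 steps, after which it behaves as an exponential moving average with coefficient 1/100. The same update factored out:

def smoothed_loss(avg, steps, new_loss, cap=100):
    # Exact mean for the first `cap` steps, then an EMA with weight 1/cap.
    steps = min(steps + 1, cap)
    return (1.0 / steps) * new_loss + (1.0 - 1.0 / steps) * avg, steps

avg, steps = 0.0, 0
for loss in [4.0, 3.0, 2.0]:
    avg, steps = smoothed_loss(avg, steps, loss)
print(avg)   # ~3.0, the exact mean of the three losses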
def test(args): with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f: saved_args = cPickle.load(f) with open(os.path.join(args.save_dir, 'event_words_vocab.pkl'), 'rb') as f: event_words, event_vocab, event_vocab_rev = cPickle.load(f) with open(os.path.join(args.save_dir, 'para_words_vocab.pkl'), 'rb') as f: para_words, para_vocab, para_vocab_rev = cPickle.load(f) onlyfiles = [f for f in listdir(args.data_dir) if isfile(join(args.data_dir, f)) and (not ("pkl" in f) and not ("npy" in f)) ] data_loader = TextLoader(args.data_dir, onlyfiles, 1, 50, args.cid_num) data_loader.reset_batch_pointer() arg1 = args.arg_1 arg2 = args.arg_2 model = Model(saved_args, False) with tf.Session() as sess: tf.global_variables_initializer().run() saver = tf.train.Saver(tf.global_variables()) ckpt = tf.train.get_checkpoint_state(args.save_dir) eventWin = [] if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) #state = sess.run(model.cell.zero_state(1, tf.float32)) x_e, y_e, x_p1, y_p1, x_p2, y_p2= data_loader.next_batch() print(x_e, y_e, x_p1, y_p1) argpara = zip(x_e, x_p1, x_p2, y_e, y_p1, y_p2) # get a sequence of (e, p1, p2) #tgtargpara = zip(y_e, y_p1, y_p2) # get a sequence of (e, p1, p2) #print (argpara) predicated_list = "" start = time.time() count = 0 print (argpara) for (elist, p1list, p2list, elistNext, p1listNext, p2listNext) in argpara: for e, p1, p2, et, p1t, p2t in zip(elist, p1list, p2list, elistNext, p1listNext, p2listNext): state = sess.run(model.cell.zero_state(1, tf.float32)) count += 1 suspiciousRank = args.susp_rank eventStr = data_loader.event_vocab_rev.get(e) if count > 5 and not eventStr in predicated_list and 'EVENT_READ' not in eventStr and 'EVENT_ACCEPT' not in eventStr: #print ("observed:" + data_loader.event_vocab_rev.get(e))# + ' ' + data_loader.para_vocab_rev.get(p1) + ' ' + data_loader.para_vocab_rev.get(p2)) #print (e) if 'FORK' in eventStr: print("abnormal events alerted:") print(sortedevent) print ("observed:" + eventStr + ' ' + args.arg_1) #print (predicated_list) if ("none" not in args.arg_2) and 'FORK' not in eventStr and 'SEPARATE' not in eventStr:# and 'EVENT_READ' not in eventStr and 'EVENT_ACCEPT' not in eventStr: print("abnormal events alerted:") print(sortedevent) print ("observed:" + eventStr + ' ' + args.arg_2) #print (predicated_list) print (predicated_list) print("======== ") #print (e, p1, p2) x = np.zeros((1, 1)) x[0, 0] = e y1 = np.zeros((1, 1)) y1[0, 0] = p1 y2 = np.zeros((1, 1)) y2[0, 0] = p2 feed = {model.event_input_data: x, model.para1_input_data : y1, model.para2_input_data : y2, model.initial_state:state} [state, probs, probs1, probs2] = sess.run([model.final_state, model.probs, model.probs1, model.probs2], feed) #print(probs)#, probs1, probs2) #maxval = tf.reduce_max(probs, 1, keep_dims=False) #eventval = np.argmax(probs[0]) sortedevent = np.argsort(probs[0])[::-1] #desentsortedevent = sortedevent.reverse() #print(sortedevent[len(sortedevent)-1], probs[0]) #print (eventval) argval1 = np.argmax(probs1[0]) argval2 = np.argmax(probs2[0]) predicated_list = "predicate:[" for x in range(len(sortedevent)) : if sortedevent[x]==et: a = x print (a+1) eventWin.append(a+1) if (len(eventWin)==5): eventWin.pop(0) total = 0 for i in eventWin: total += i arg = total/len(eventWin) for i in range(suspiciousRank): #print(i) predicated_list += data_loader.event_vocab_rev.get(sortedevent[i]) + ' ' predicated_list += '] ' + data_loader.para_vocab_rev.get(argval1) + ' ' + data_loader.para_vocab_rev.get(argval2) #if arg > 3: # 
print("Average suspicious ranking:" + str(arg)) """ if count > 3 and a+1 > suspiciousRank: print("abnormal events alerted:") print(sortedevent) """ i = 3 if count == 2 : while i <= 4 : print("======== ") print(i) i += 1 #if 'EVENT_UPDATE' in eventStr: print ("observed:" + 'EVENT_WRITE' + ' ' + args.arg_1) #print ("observed:" + data_loader.event_vocab_rev.get(e) + ' ' + args.arg_2) print("abnormal alerted:") print(sortedevent) i = 1 while i <= 10 : print("======== ") print(i) i += 1 #e = eventval.eval() #arg1 = argval1.eval() #arg2 = argval2.eval() #print(data_loader.event_vocab_rev.get(e[0]), data_loader.event_vocab_rev.get(arg1[0]), data_loader.event_vocab_rev.get(arg2[0])) end = time.time() print("time/batch = {:.3f}".format(end - start))
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        train_loss_iterations = {'iteration': [], 'epoch': [],
                                 'train_loss': [], 'val_loss': []}
        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate**e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run(
                    [model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                batch_idx = e * data_loader.num_batches + b
                print('{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}'
                      .format(batch_idx, args.num_epochs * data_loader.num_batches,
                              e, train_loss, end - start))
                train_loss_iterations['iteration'].append(batch_idx)
                train_loss_iterations['epoch'].append(e)
                train_loss_iterations['train_loss'].append(train_loss)
                if batch_idx % args.save_every == 0:
                    # Evaluate validation loss without running the training op,
                    # so validation batches never update the parameters.
                    state_val = model.initial_state.eval()
                    avg_val_loss = 0
                    for x_val, y_val in data_loader.val_batches:
                        feed_val = {model.input_data: x_val, model.targets: y_val,
                                    model.initial_state: state_val}
                        val_loss, state_val = sess.run(
                            [model.cost, model.final_state], feed_val)
                        avg_val_loss += val_loss / len(data_loader.val_batches)
                    print('val_loss: {:.3f}'.format(avg_val_loss))
                    train_loss_iterations['val_loss'].append(avg_val_loss)
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=batch_idx)
                    print('model saved to {}'.format(checkpoint_path))
                else:
                    train_loss_iterations['val_loss'].append(None)
        pd.DataFrame(data=train_loss_iterations,
                     columns=list(train_loss_iterations.keys())).to_csv(
                         os.path.join(args.save_dir, 'log.csv'))
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    if args.init_from is not None:
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

    Disc = Discriminator(args)
    Gen = Generator(args)
    # D_tvars = [Disc.W1,Disc.W2]
    # G_tvars = [Gen.weight]
    # `Fake_data`, `prob` and `logit` are assumed to be tensors defined
    # elsewhere in this script (generator output and discriminator outputs).
    t_vars = tf.trainable_variables()

    fp1 = open('G_loss_training', 'w')
    fp2 = open('D_loss_training', 'w')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for e in range(args.num_epochs):
            print('{}th epoch'.format(e))
            sess.run(tf.assign(Disc.lr, args.disc_learning_rate))
            sess.run(tf.assign(Gen.lr, args.gen_learning_rate))
            data_loader.reset_batch_pointer()
            for b in range(data_loader.num_batches):
                start = time.time()
                con, res = data_loader.next_batch()
                real_data = np.concatenate((con, res), axis=1)
                fake_data = sess.run(Fake_data, feed_dict={Gen.input_data: con})
                D_real, D_logit_real = sess.run([prob, logit],
                                                feed_dict={Disc.input_data: real_data})
                D_fake, D_logit_fake = sess.run([prob, logit],
                                                feed_dict={Disc.input_data: fake_data})
                # NOTE: these loss/optimizer ops are re-built on every batch,
                # which grows the graph as training runs; see the sketch below
                # for the usual build-once wiring.
                D_loss = -tf.reduce_mean(tf.log(D_real) + tf.log(1 - D_fake))
                G_loss = -tf.reduce_mean(tf.log(D_fake))
                D_tvars = [v for v in t_vars if v.name.startswith('disc')]
                G_tvars = [v for v in t_vars if v.name.startswith('gen')]
                D_solver = tf.train.AdamOptimizer(Disc.lr).minimize(D_loss, var_list=D_tvars)
                G_solver = tf.train.AdamOptimizer(Gen.lr).minimize(G_loss, var_list=G_tvars)
                _, d_loss = sess.run([D_solver, D_loss],
                                     feed_dict={Disc.input_data: real_data,
                                                Gen.input_data: con})
                _, g_loss = sess.run([G_solver, G_loss],
                                     feed_dict={Disc.input_data: fake_data,
                                                Gen.input_data: con})
                fp1.write(str(g_loss) + '\n')
                fp2.write(str(d_loss) + '\n')
                end = time.time()
                print("{}/{} (epoch {}), Generator_loss = {:.3f}, Discriminator_loss = {:.3f}, time/batch = {:.3f}"
                      .format(e * data_loader.num_batches + b,
                              args.num_epochs * data_loader.num_batches,
                              e, g_loss, d_loss, end - start))
                if (e * data_loader.num_batches + b) % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path,
                               global_step=e * data_loader.num_batches + b)
                    print("model saved to {}".format(checkpoint_path))
    fp1.close()
    fp2.close()
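As noted in the comments above, building D_loss, G_loss and the two Adam optimizers inside the batch loop adds fresh ops to the graph on every step. The usual wiring (a minimal sketch of the standard pattern, not this repo's API) constructs the losses once from symbolic discriminator logits, using the numerically safer cross-entropy-on-logits form:

import tensorflow as tf

def gan_losses(d_logit_real, d_logit_fake):
    # Discriminator: push real logits toward 1 and fake logits toward 0.
    d_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logit_real, labels=tf.ones_like(d_logit_real)))
    d_loss += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logit_fake, labels=tf.zeros_like(d_logit_fake)))
    # Generator: non-saturating loss, push fake logits toward 1.
    g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logit_fake, labels=tf.ones_like(d_logit_fake)))
    return d_loss, g_loss

# Build once before the Session, then sess.run the two minimize ops per batch,
# with var_list filtered by the 'disc'/'gen' name prefixes as above.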
def train(args): # Data Preparation # ==================================== data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size print("Number of sentences: {}" .format(data_loader.num_data)) print("Vocabulary size: {}" .format(args.vocab_size)) # Check compatibility if training is continued from previously saved model if args.init_from is not None: # check if all necessary files exist assert os.path.isdir(args.init_from)," %s must be a path" % args.init_from assert os.path.isfile(os.path.join(args.init_from,"config.pkl")),"config.pkl file does not exist in path %s"%args.init_from assert os.path.isfile(os.path.join(args.init_from,"words_vocab.pkl")),"words_vocab.pkl file does not exist in path %s" % args.init_from ckpt = tf.train.get_checkpoint_state(args.init_from) assert ckpt,"No checkpoint found" assert ckpt.model_checkpoint_path,"No model path found in checkpoint" # open old config and check if models are compatible with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f: saved_model_args = pickle.load(f) need_be_same=["rnn_size","num_layers","seq_length"] for checkme in need_be_same: assert vars(saved_model_args)[checkme]==vars(args)[checkme],"Command line argument and saved model disagree on '%s' "%checkme # open saved vocab/dict and check if vocabs/dicts are compatible with open(os.path.join(args.init_from, 'words_vocab.pkl'), 'rb') as f: saved_words, saved_vocab = pickle.load(f) assert saved_words==data_loader.words, "Data and loaded model disagree on word set!" assert saved_vocab==data_loader.vocab, "Data and loaded model disagree on dictionary mappings!" if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f: pickle.dump(args, f) with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f: pickle.dump((data_loader.words, data_loader.vocab), f) """ embedding_matrix = get_vocab_embedding(args.save_dir, data_loader.words, args.embedding_file) print("Embedding matrix shape:",embedding_matrix.shape) """ # Training # ==================================== with tf.Graph().as_default(): with tf.Session() as sess: model = BasicLSTM(args) # Define training procedure global_step = tf.Variable(0, name='global_step', trainable=False) optimizer = tf.train.AdamOptimizer(args.learning_rate) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(model.cost, tvars), args.grad_clip) train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) # Keep track of gradient values and sparsity grad_summaries = [] for g, v in zip(grads, tvars): if g is not None: grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) # Summary for loss loss_summary = tf.summary.scalar("loss", model.cost) # Train summaries merged = tf.summary.merge_all() if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) train_writer = tf.summary.FileWriter(args.log_dir, sess.graph) saver = tf.train.Saver(tf.global_variables()) # Initialize all variables sess.run(tf.global_variables_initializer()) # Restore model if args.init_from is not None: saver.restore(sess, ckpt.model_checkpoint_path) # Start training print("Start training") for epoch in range(args.num_epochs): data_loader.reset_batch_pointer() state = sess.run(model.initial_state)
for i in range(data_loader.num_batches): start = time.time() x_batch, y_batch = data_loader.next_batch() feed_dict = {model.x: x_batch, model.y: y_batch, model.keep_prob: args.keep_prob } _, step, summary, loss, equal = sess.run([train_op, global_step, merged, model.cost, model.equal], feed_dict) print("training step {}, epoch {}, batch {}/{}, loss: {:.4f}, accuracy: {:.4f}, time/batch: {:.3f}" .format(step, epoch, i, data_loader.num_batches, loss, np.mean(equal), time.time()-start)) train_writer.add_summary(summary, step) current_step = tf.train.global_step(sess, global_step) if current_step % args.save_every == 0 or (epoch == args.num_epochs-1 and i == data_loader.num_batches-1): #save for the last result checkpoint_path = os.path.join(args.save_dir, 'model.ckpt') path = saver.save(sess, checkpoint_path, global_step = current_step) print("Saved model checkpoint to {}".format(path)) train_writer.close()
def train(args): data_loader = TextLoader(args.data_path, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size args.file_size = data_loader.file_size print("Vocab size: ",args.vocab_size) print("File size: ",args.file_size) args.lower_bound = 0 # entropy lower bound; overwritten below if the info file provides it data_info = {} if args.info_path is not None: assert os.path.isfile(args.info_path),"Info file not found in the path: %s"%args.info_path #Open the info file with open(args.info_path, 'rb') as f: data_info = json.load(f) #Assuming we know entropy args.lower_bound = data_info['Entropy'] print(data_info) # check compatibility if training is continued from previously saved model if args.init_from is not None: # check if all necessary files exist assert os.path.isdir(args.init_from)," %s must be a path" % args.init_from assert os.path.isfile(os.path.join(args.init_from,"config.pkl")),"config.pkl file does not exist in path %s"%args.init_from assert os.path.isfile(os.path.join(args.init_from,"chars_vocab.pkl")),"chars_vocab.pkl file does not exist in path %s" % args.init_from ckpt = tf.train.get_checkpoint_state(args.init_from) assert ckpt,"No checkpoint found" assert ckpt.model_checkpoint_path,"No model path found in checkpoint" # open old config and check if models are compatible with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f: saved_model_args = cPickle.load(f) need_be_same=["model","rnn_size","num_layers","seq_length"] for checkme in need_be_same: assert vars(saved_model_args)[checkme]==vars(args)[checkme],"Command line argument and saved model disagree on '%s' "%checkme # open saved vocab/dict and check if vocabs/dicts are compatible with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f: saved_chars, saved_vocab = cPickle.load(f) assert saved_chars==data_loader.chars, "Data and loaded model disagree on character set!" assert saved_vocab==data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"
with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f: cPickle.dump(args, f) with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f: cPickle.dump((data_loader.chars, data_loader.vocab), f) ################################################## # Get the model ################################################## model = Model(args) print("model Loaded") with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.global_variables()) writer = tf.summary.FileWriter(args.summary_dir,sess.graph) # restore model if args.init_from is not None: saver.restore(sess, ckpt.model_checkpoint_path) ###################################################### # Perform the training ##################################################### for e in range(args.num_epochs): sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e))) data_loader.reset_batch_pointer() # rewind to the first batch of this epoch state = sess.run(model.initial_state) # zero-filled RNN state at the start of each epoch cumul_loss = 0 for b in range(data_loader.num_batches): start = time.time() x, y = data_loader.next_batch() feed = {model.input_data: x, model.targets: y} for i, (c, h) in enumerate(model.initial_state): feed[c] = state[i].c feed[h] = state[i].h summary, train_loss, state, _ = sess.run([model.merged_summaries, model.cost, model.final_state, model.train_op], feed) # convert the cross-entropy from nats to bits per character train_loss /= np.log(2) cumul_loss += train_loss end = time.time() print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \ .format(e * data_loader.num_batches + b, args.num_epochs * data_loader.num_batches, e, train_loss, end - start)) if (e * data_loader.num_batches + b) % args.save_every == 0\ or (e==args.num_epochs-1 and b == data_loader.num_batches-1): # save for the last result checkpoint_path = os.path.join(args.save_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step = e * data_loader.num_batches + b) print("model saved to {}".format(checkpoint_path)) if b%10 == 0: writer.add_summary(summary,e*data_loader.num_batches + b) cumul_loss /= data_loader.num_batches print("Epoch {}: Average loss (bits/char) for the epoch: {:.3f}".format(e,cumul_loss)) if (abs(cumul_loss - args.lower_bound) < 0.1): print("Stopping training: loss reached the entropy lower bound.") break ############################################################## # Append details to the output file ############################################################## args.epoch_stopped=e+1 args.last_epoch_loss = cumul_loss with open(args.output_path, 'a') as f: params = vars(args) params.update(data_info) #json.dump(params, f,indent=2) cPickle.dump(params,f) #f.write("\n ############################################# \n") with open(args.output_path+".json", 'a') as f: params = vars(args) params.update(data_info) json.dump(params, f,indent=2) #cPickle.dump(params) f.write("\n ############################################# \n")
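The division by np.log(2) above converts the model's cross-entropy from nats to bits per character, the unit in which the entropy lower bound from the info file is expressed, so the stopping test compares like with like:

import numpy as np

def nats_to_bits(loss_nats):
    return loss_nats / np.log(2)

print(nats_to_bits(np.log(2)))   # 1.0: ln(2) nats is exactly one bit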
def train(args):
    # Create the data_loader object, which loads up all of our batches, vocab dictionary, etc.
    # from utils.py (and creates them if they don't already exist).
    # These files go in the data directory.
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    load_model = False
    if not os.path.exists(args.save_dir):
        print("Creating directory %s" % args.save_dir)
        os.mkdir(args.save_dir)
    elif os.path.exists(os.path.join(args.save_dir, 'config.pkl')):
        # A trained model already exists.
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            with open(os.path.join(args.save_dir, 'config.pkl')) as f:
                saved_args = cPickle.load(f)
            args.rnn_size = saved_args.rnn_size
            args.num_layers = saved_args.num_layers
            args.model = saved_args.model
            print("Found a previous checkpoint. Overwriting model description arguments to:")
            print(" model: {}, rnn_size: {}, num_layers: {}".format(
                saved_args.model, saved_args.rnn_size, saved_args.num_layers))
            load_model = True

    # Save all arguments to config.pkl in the save directory -- NOT the data directory.
    with open(os.path.join(args.save_dir, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    # Save a tuple of the characters list and the vocab dictionary to chars_vocab.pkl in
    # the save directory -- NOT the data directory.
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'w') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    # Create the model!
    print("Building the model")
    model = Model(args)

    config = tf.ConfigProto(log_device_placement=False)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(model.save_variables_list())
        if load_model:
            print("Loading saved parameters")
            saver.restore(sess, ckpt.model_checkpoint_path)
        global_epoch_fraction = sess.run(model.global_epoch_fraction)
        global_seconds_elapsed = sess.run(model.global_seconds_elapsed)
        if load_model:
            print("Resuming from global epoch fraction {:.3f},"
                  " total trained time: {}, learning rate: {}".format(
                      global_epoch_fraction, global_seconds_elapsed, sess.run(model.lr)))
        data_loader.cue_batch_pointer_to_epoch_fraction(global_epoch_fraction)
        initial_batch_step = int((global_epoch_fraction
                                  - int(global_epoch_fraction)) * data_loader.total_batch_count)
        epoch_range = (int(global_epoch_fraction),
                       args.num_epochs + int(global_epoch_fraction))
        writer = tf.train.SummaryWriter(args.save_dir, graph=tf.get_default_graph())
        outputs = [model.cost, model.final_state, model.train_op, model.summary_op]
        is_lstm = args.model == 'lstm'
        global_step = epoch_range[0] * data_loader.total_batch_count + initial_batch_step
        try:
            for e in xrange(*epoch_range):
                # e iterates through the training epochs.
                # Reset the model state, so it does not carry over from the end of the previous epoch.
                state = sess.run(model.initial_state)
                batch_range = (initial_batch_step, data_loader.total_batch_count)
                initial_batch_step = 0
                for b in xrange(*batch_range):
                    global_step += 1
                    if global_step % args.decay_steps == 0:
                        # Set the model.lr element of the model to track
                        # the appropriately decayed learning rate.
                        current_learning_rate = sess.run(model.lr)
                        current_learning_rate *= args.decay_rate
                        sess.run(tf.assign(model.lr, current_learning_rate))
                        print("Decayed learning rate to {}".format(current_learning_rate))
                    start = time.time()

                    # Pull the next batch inputs (x) and targets (y) from the data loader.
                    x, y = data_loader.next_batch()
                    # feed is a dictionary of variable references and respective values for initialization.
                    # Initialize the model's input data and target data from the batch,
                    # and initialize the model state to the final state from the previous batch, so that
                    # model state is accumulated and carried over between batches.
                    feed = {model.input_data: x, model.targets: y}
                    if is_lstm:
                        for i, (c, h) in enumerate(model.initial_state):
                            feed[c] = state[i].c
                            feed[h] = state[i].h
                    else:
                        for i, c in enumerate(model.initial_state):
                            feed[c] = state[i]
                    # Run the session! Specifically, tell TensorFlow to compute the graph to calculate
                    # the values of cost, final state, and the training op.
                    # Cost is used to monitor progress.
                    # Final state is used to carry over the state into the next batch.
                    # The training op's output is not used, but we want it to be computed, since that
                    # computation is what updates the parameters (i.e. that is where the training happens).
                    train_loss, state, _, summary = sess.run(outputs, feed)
                    elapsed = time.time() - start
                    global_seconds_elapsed += elapsed
                    writer.add_summary(summary, e * batch_range[1] + b + 1)
                    print("{}/{} (epoch {}/{}), loss = {:.3f}, time/batch = {:.3f}s"
                          .format(b, batch_range[1], e, epoch_range[1], train_loss, elapsed))
                    # Every save_every batches, save the model to disk.
                    # By default, only the five most recent checkpoint files are kept.
                    if (e * batch_range[1] + b + 1) % args.save_every == 0 \
                            or (e == epoch_range[1] - 1 and b == batch_range[1] - 1):
                        save_model(sess, saver, model, args.save_dir, global_step,
                                   data_loader.total_batch_count, global_seconds_elapsed)
        except KeyboardInterrupt:
            # Introduce a line break after ^C is displayed so the save message
            # is on its own line.
            print()
        finally:
            writer.flush()
            global_step = e * data_loader.total_batch_count + b
            save_model(sess, saver, model, args.save_dir, global_step,
                       data_loader.total_batch_count, global_seconds_elapsed)
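# A standalone sketch of the state carry-over pattern used above, with hypothetical
# `initial_state`/`prev_state` arguments: LSTM cells expose a (c, h) tuple per layer,
# while GRU/vanilla RNN cells expose a single tensor per layer, so the feed dictionary
# is built differently for each. This mirrors the loop above, not an exact model API.
def build_state_feed(initial_state, prev_state, is_lstm):
    feed = {}
    if is_lstm:
        for i, (c, h) in enumerate(initial_state):
            feed[c] = prev_state[i].c  # cell state of layer i from the previous batch
            feed[h] = prev_state[i].h  # hidden state of layer i from the previous batch
    else:
        for i, s in enumerate(initial_state):
            feed[s] = prev_state[i]
    return feed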
def train(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # Derive a per-dataset name from the data directory when saving separately.
    name = ""
    if args.separate != 0 and args.data_dir is not None:
        name = args.data_dir
        if '\\' in name:
            name = name[name.rfind('\\') + 1:]
        if '/' in name:
            name = name[name.rfind('/') + 1:]
        print("Name: " + name)
        args.save_dir = os.path.join(args.save_dir, name)

    # Check compatibility if training is continued from a previously saved model.
    if args.init_from is not None:
        # Check that all necessary files exist.
        assert os.path.isdir(args.init_from), "%s must be a path" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), "config.pkl file does not exist in path %s" % args.init_from
        assert os.path.isfile(os.path.join(args.init_from, "chars_vocab.pkl")), "chars_vocab.pkl file does not exist in path %s" % args.init_from
        ckpt = tf.train.latest_checkpoint(args.init_from)
        assert ckpt, "No checkpoint found"

        # Open the old config and check whether the models are compatible.
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = cPickle.load(f)
        need_be_same = ["model", "rnn_size", "num_layers", "seq_length"]
        for checkme in need_be_same:
            assert vars(saved_model_args)[checkme] == vars(args)[checkme], "Command line argument and saved model disagree on '%s'" % checkme

        # Open the saved vocab/dict and check whether they are compatible.
        with open(os.path.join(args.init_from, 'chars_vocab.pkl'), 'rb') as f:
            saved_chars, saved_vocab = cPickle.load(f)
        assert saved_chars == data_loader.chars, "Data and loaded model disagree on character set!"
        assert saved_vocab == data_loader.vocab, "Data and loaded model disagree on dictionary mappings!"

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    with codecs.open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(args, f)
    with codecs.open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(args)

    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instrument for TensorBoard.
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(args.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S") + ' ' + name))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        # Restore a previously saved model.
        if args.init_from is not None:
            saver.restore(sess, ckpt)

        for e in range(args.num_epochs):
            sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h

                # Instrument for TensorBoard.
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op], feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)
                end = time.time()

                # Extrapolate the remaining time from the duration of the last batch;
                # str(timedelta)[:-7] strips the microseconds.
                step = e * data_loader.num_batches + b
                total_steps = args.num_epochs * data_loader.num_batches
                tDelta = str(datetime.timedelta(seconds=(total_steps - step) * (end - start)))[:-7]

                if step % args.print_every == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}, remaining = {}"
                          .format(step, total_steps, e, train_loss, end - start, tDelta))
                if step % args.save_every == 0 \
                        or (e == args.num_epochs - 1 and b == data_loader.num_batches - 1):
                    # Also save after the very last batch.
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(args.save_dir, 'status.txt'), 'w') as f:
                        f.write("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}, remaining = {}"
                                .format(step, total_steps, e, train_loss, end - start, tDelta))
                        f.write('\nRNN size: {}\nLayers: {}\nSequence length: {}\nModel: {}'
                                .format(args.rnn_size, args.num_layers, args.seq_length, args.model))
                        f.write('\n\nNum epochs: {}\nGradient clip: {}\nLearning rate: {}\nDecay rate: {}\n'
                                'Output-keep-prob: {}\nInput-keep-prob: {}'
                                .format(args.num_epochs, args.grad_clip, args.learning_rate,
                                        args.decay_rate, args.output_keep_prob, args.input_keep_prob))
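# Aside (an assumption: intended behavior matches the manual rfind() logic above):
# extracting the last path component to name the per-dataset save directory can also
# be done with the standard library, which handles a trailing separator as well.
import os

def dataset_name(data_dir):
    # basename('data/shakespeare') -> 'shakespeare'; normpath drops a trailing '/'.
    return os.path.basename(os.path.normpath(data_dir))

assert dataset_name('data/shakespeare/') == 'shakespeare'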
def train(args):
    """Trains an RNN model.

    Args:
        args (argparse.Namespace): arguments to train the RNN.

    Returns:
        None.
    """
    s_time = time.time()

    # Check compatibility to continue training from a previous model.
    if args.init_from:
        assert os.path.isdir(args.init_from), \
            "{} does not exist".format(args.init_from)
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path {}".format(args.init_from)
        ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # Check if the models are compatible.
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_model_args = pickle.load(f)
        check_list = ["rnn_size", "seq_length"]
        for check in check_list:
            assert vars(saved_model_args)[check] == vars(args)[check], \
                "CLI argument and saved model disagree on {}".format(check)

    # Store configuration arguments.
    args.save_dir = os.path.join(args.save_dir, str(args.shard))
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    args.save_dir = os.path.abspath(args.save_dir)
    with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
        pickle.dump(args, f)

    # Load input data.
    args.data_dir = os.path.join(args.data_dir, str(args.shard))
    if not os.path.isdir(args.data_dir):
        sys.exit('{} does not exist'.format(args.data_dir))
    args.data_dir = os.path.abspath(args.data_dir)
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length)

    # Set the logs directories.
    args.log_dir = os.path.join(args.log_dir, args.option)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    args.log_dir = os.path.abspath(args.log_dir)

    # Create an instance of the tensorflow model.
    tf.reset_default_graph()
    model = Model(args)

    with tf.Session() as sess:
        # Tensorboard summaries.
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(args.log_dir, sess.graph)

        # Initialize variables (weights and biases) with a Xavier uniform
        # initializer. See tf.glorot_uniform_initializer().
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        saver = tf.train.Saver()

        # Restore a previous model and session.
        if args.init_from:
            saver.restore(sess, ckpt.model_checkpoint_path)

        losses, accuracies = [], []
        global_step_init = model.global_step.eval()

        # Run the model for training or validation/testing.
        for epoch_id in range(args.num_epochs):
            data_loader.reset_batch_pointer()
            # Reset the states at the beginning of each epoch.
            h_state = np.zeros([args.batch_size, args.rnn_size])
            c_state = np.zeros([args.batch_size, args.rnn_size])
            for batch_id in range(data_loader.num_batches):
                x, y = data_loader.next_batch()

                # Update the learning rate, with linear decay.
                global_step = model.global_step.eval()
                if args.lr_decay and global_step != 0:
                    total_weight_updates = args.train_bytes \
                        / (args.batch_size * args.seq_length)
                    lr = args.lr_init - (args.lr_init / total_weight_updates) \
                        * global_step
                    if lr < 1.5e-13:
                        lr = 1.5e-13
                else:
                    lr = args.lr_init
                sess.run(tf.assign(model.lr, lr))

                # Keep the states between batches to simulate full
                # backpropagation (stateful RNN).
                feed = {
                    model.initial_hidden_state: h_state,
                    model.initial_cell_state: c_state,
                    model.batchX_placeholder: x,
                    model.batchY_placeholder: y
                }
                if args.option == 'train':
                    _, h_state, c_state, loss, accuracy, summary = sess.run(
                        [model.train_step, model.final_hidden_state,
                         model.final_cell_state, model.total_loss,
                         model.accuracy, summaries],
                        feed_dict=feed)
                elif args.option == 'validate':
                    h_state, c_state, loss, accuracy, summary = sess.run(
                        [model.final_hidden_state, model.final_cell_state,
                         model.total_loss, model.accuracy, summaries],
                        feed_dict=feed)
                losses.append(loss)
                accuracies.append(accuracy)

                if args.option == 'train' and global_step % args.print_every == 0:
                    # Record training for tensorboard.
                    writer.add_summary(summary, global_step)
                    writer.flush()
                    print("Shard {} Epoch {}/{} Batch {}/{} ({}) -- "
                          "loss: {:.3f}, acc: {:.3f}".format(
                              args.shard, epoch_id, args.num_epochs - 1,
                              batch_id, data_loader.num_batches - 1,
                              global_step, loss, accuracy))
                    sys.stdout.flush()

        # Save the model at the end of training.
        if args.option == 'train':
            save_model(args, sess, saver, global_step)

        # Save losses and accuracies.
        np.save(os.path.join(args.log_dir, 'loss_' + str(global_step_init)),
                losses)
        np.save(os.path.join(args.log_dir, 'accuracy_' + str(global_step_init)),
                accuracies)

        # Record the final batch for tensorboard.
        writer.add_summary(summary, global_step)
        writer.flush()
        print("Shard {} Epoch {}/{} Batch {}/{} ({}) -- "
              "loss: {:.3f}, acc: {:.3f}".format(
                  args.shard, epoch_id, args.num_epochs - 1, batch_id,
                  data_loader.num_batches - 1, global_step, loss, accuracy))
        sys.stdout.flush()

    # Record the time spent.
    time_spent = time.time() - s_time
    hours, rem = divmod(time_spent, 3600)
    minutes, seconds = divmod(rem, 60)
    print('Train time: {:0>2}:{:0>2}:{:05.2f}'.format(
        int(hours), int(minutes), seconds))
    print('Time per batch: {:.3f}ms, time per byte: {:.3f}ms'.format(
        time_spent / (args.num_epochs * data_loader.num_batches) * 1000,
        time_spent / (args.num_epochs * data_loader.num_batches
                      * args.batch_size * args.seq_length) * 1000))
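# A small sketch of the linear decay schedule used above (function and argument names
# are illustrative): the rate falls from lr_init toward zero over the total number of
# weight updates, i.e. lr(t) = lr_init * (1 - t / T), clamped to a tiny positive floor
# so it never reaches zero or goes negative.
def linear_decay(lr_init, step, total_updates, floor=1.5e-13):
    lr = lr_init - (lr_init / total_updates) * step
    return max(lr, floor)

# e.g. halfway through training the rate is half the initial value:
assert abs(linear_decay(0.002, 500, 1000) - 0.001) < 1e-12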
def train2(args):
    data_loader = TextLoader(args.data_dir, args.batch_size, args.seq_length, args.reprocess)
    args.vocab_size = data_loader.vocab_size
    totalTask = args.num_epochs * data_loader.num_batches

    lastCheckpoint = tf.train.latest_checkpoint(args.save_dir)
    if lastCheckpoint is None:
        startEpoch = 0
    else:
        print("Last checkpoint: {}".format(lastCheckpoint))
        startEpoch = int(lastCheckpoint.split("-")[-1])
    print("startEpoch = {}".format(startEpoch))

    with open(os.path.join(args.save_dir, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'w') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = ConstrainedModel(args)

    etaCount = 0
    etaString = "-"
    etaStart = time.time()
    etaTime = 0

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        if startEpoch > 0:
            # Load the latest checkpoint.
            print("Loading last checkpoint")
            saver.restore(sess, lastCheckpoint)
        for e in xrange(startEpoch, args.num_epochs):
            sess.run(tf.assign(model.lr, decayForEpoch(args, e)))
            data_loader.reset_batch_pointer()
            state = model.initial_state.eval()
            for b in xrange(data_loader.num_batches):
                start = time.time()
                x, y, con = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y,
                        model.initial_state: state, model.con_data: con}
                train_loss, state, _ = sess.run(
                    [model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                taskNum = e * data_loader.num_batches + b
                etaCount += 1
                if etaCount % 25 == 0:
                    # Re-estimate the ETA from the wall time of the last 25 batches.
                    duration = time.time() - etaStart
                    etaTime = (totalTask - (taskNum + 1)) / 25 * duration
                    m, s = divmod(etaTime, 60)
                    h, m = divmod(m, 60)
                    etaString = "%d:%02d:%02d" % (h, m, s)
                    etaStart = time.time()
                print("{}/{} (epoch {}), loss = {:.3f}, time/batch = {:.3f}, ETA: {} ({})"
                      .format(taskNum, totalTask, e, train_loss, end - start,
                              time.ctime(time.time() + etaTime), etaString))
            if (e + 1) % args.save_every == 0 or e == args.num_epochs - 1:
                checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=e + 1)
                print("model saved to {}".format(checkpoint_path))
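# Sketch of the ETA estimate used above as a standalone function (names are
# illustrative): every window of 25 batches, the remaining time is extrapolated as
# (tasks left) / window * (wall time of the last window).
def eta_seconds(total_tasks, tasks_done, window, window_duration):
    return (total_tasks - tasks_done) / float(window) * window_duration

# 1000 tasks left, and the last 25 tasks took 50s -> 2000s remaining.
assert eta_seconds(2000, 1000, 25, 50.0) == 2000.0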
def main(_):
    pp.pprint(FLAGS.__flags)

    if not os.path.exists(FLAGS.checkpoint_dir):
        print(" [*] Creating checkpoint directory...")
        os.makedirs(FLAGS.checkpoint_dir)

    data_loader = TextLoader(os.path.join(FLAGS.data_dir, FLAGS.dataset_name),
                             FLAGS.batch_size, FLAGS.seq_length)
    vocab_size = data_loader.vocab_size
    valid_size = 50
    valid_window = 100

    with tf.variable_scope('model'):
        train_model = CharRNN(vocab_size, FLAGS.batch_size, FLAGS.rnn_size,
                              FLAGS.layer_depth, FLAGS.num_units, FLAGS.rnn_type,
                              FLAGS.seq_length, FLAGS.keep_prob, FLAGS.grad_clip)
    with tf.variable_scope('model', reuse=True):
        simple_model = CharRNN(vocab_size, 1, FLAGS.rnn_size,
                               FLAGS.layer_depth, FLAGS.num_units, FLAGS.rnn_type,
                               1, FLAGS.keep_prob, FLAGS.grad_clip)
    with tf.variable_scope('model', reuse=True):
        valid_model = CharRNN(vocab_size, FLAGS.batch_size, FLAGS.rnn_size,
                              FLAGS.layer_depth, FLAGS.num_units, FLAGS.rnn_type,
                              FLAGS.seq_length, FLAGS.keep_prob, FLAGS.grad_clip)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        train_model.load(sess, FLAGS.checkpoint_dir, FLAGS.dataset_name)

        best_val_pp = float('inf')
        best_val_epoch = 0
        valid_loss = 0
        valid_perplexity = 0
        start = time.time()

        if FLAGS.export:
            print("Eval...")
            final_embeddings = train_model.embedding.eval(sess)
            emb_file = os.path.join(FLAGS.data_dir, FLAGS.dataset_name, 'emb.npy')
            print("Embedding shape: {}".format(final_embeddings.shape))
            np.save(emb_file, final_embeddings)
        else:
            # Train.
            current_step = 0
            similarity, valid_examples, _ = compute_similarity(train_model, valid_size,
                                                               valid_window, 6)

            # Save the hyper-parameters.
            with open(FLAGS.log_dir + "/hyperparams.pkl", 'wb') as f:
                cPickle.dump(FLAGS.__flags, f)

            # Run it!
            for e in range(FLAGS.num_epochs):
                data_loader.reset_batch_pointer()
                # Set the learning rate.
                sess.run(tf.assign(train_model.lr, FLAGS.learning_rate))

                # Iterate by batch.
                for b in range(data_loader.num_batches):
                    x, y = data_loader.next_batch()
                    res, time_batch = run_epochs(sess, x, y, train_model)
                    train_loss = res["loss"]
                    train_perplexity = np.exp(train_loss)
                    iterate = e * data_loader.num_batches + b

                    # Print a log line.
                    print("{}/{} (epoch {}) loss = {:.2f}({:.2f}) "
                          "perplexity(train/valid) = {:.2f}({:.2f}) "
                          "time/batch = {:.2f} chars/sec = {:.2f}k".format(
                              iterate, FLAGS.num_epochs * data_loader.num_batches,
                              e, train_loss, valid_loss,
                              train_perplexity, valid_perplexity, time_batch,
                              (FLAGS.batch_size * FLAGS.seq_length) / time_batch / 1000))

                    current_step = tf.train.global_step(sess, train_model.global_step)

                # Validate.
                valid_loss = 0
                for vb in range(data_loader.num_valid_batches):
                    res, valid_time_batch = run_epochs(sess, data_loader.x_valid[vb],
                                                       data_loader.y_valid[vb],
                                                       valid_model, False)
                    valid_loss += res["loss"]
                valid_loss = valid_loss / data_loader.num_valid_batches
                valid_perplexity = np.exp(valid_loss)
                print("### valid_perplexity = {:.2f}, time/batch = {:.2f}".format(
                    valid_perplexity, valid_time_batch))

                log_str = ""
                # Generate samples from a few fixed primes.
                smp1 = simple_model.sample(sess, data_loader.chars, data_loader.vocab,
                                           UNK_ID, 5, u"我喜歡做")
                smp2 = simple_model.sample(sess, data_loader.chars, data_loader.vocab,
                                           UNK_ID, 5, u"他吃飯時會用")
                smp3 = simple_model.sample(sess, data_loader.chars, data_loader.vocab,
                                           UNK_ID, 5, u"人類總要重複同樣的")
                smp4 = simple_model.sample(sess, data_loader.chars, data_loader.vocab,
                                           UNK_ID, 5, u"天色暗了,好像快要")
                log_str = log_str + smp1 + "\n"
                log_str = log_str + smp2 + "\n"
                log_str = log_str + smp3 + "\n"
                log_str = log_str + smp4 + "\n"

                # Write a similarity log.
                # Note that this is expensive (~20% slowdown if computed every 500 steps).
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = data_loader.chars[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = log_str + "Nearest to %s:" % valid_word
                    for k in range(top_k):
                        close_word = data_loader.chars[nearest[k]]
                        log_str = "%s %s," % (log_str, close_word)
                    log_str = log_str + "\n"
                print(log_str)

                # Write to the log file.
                text_file = codecs.open(FLAGS.log_dir + "/similarity.txt", "w", "utf-8")
                text_file.write(log_str)
                text_file.close()

                if valid_perplexity < best_val_pp:
                    best_val_pp = valid_perplexity
                    best_val_epoch = iterate
                    # Save the best model.
                    train_model.save(sess, FLAGS.checkpoint_dir, FLAGS.dataset_name)
                    print("model saved to {}".format(FLAGS.checkpoint_dir))

                # Early stopping.
                if iterate - best_val_epoch > FLAGS.early_stopping:
                    print('Total time: {}'.format(time.time() - start))
                    break
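# Sketch of the nearest-neighbor lookup above (assuming `sim` holds cosine
# similarities, as compute_similarity presumably returns): negating the row before
# argsort() sorts by descending similarity, and slicing from index 1 skips the word
# itself, whose self-similarity of 1.0 is always the top hit. Data is illustrative.
import numpy as np

sim_row = np.array([1.0, 0.2, 0.9, 0.5])  # similarity of word 0 to words 0..3
top_k = 2
nearest = (-sim_row).argsort()[1:top_k + 1]
assert list(nearest) == [2, 3]  # indices 2 and 3 are the closest neighbors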