def tower_loss(scope, maze_ims, maze_labels, config):
    '''Compute the loss for a single GPU tower.

    Args:
      scope: tower scope
      maze_ims: Tensor of [batch_size, maze_size, maze_size, 1] of maze images
      maze_labels: Tensor of [batch_size, maze_size] with the target labels for
        the connectivity of the diagonal elements
      config: configuration of the Predictron hyperparameters

    Returns:
      total_loss to optimize, the preturn regression loss and the
      \lambda-preturn loss
    '''
    model = Predictron(maze_ims, maze_labels, config)
    model.build()

    loss_preturns = model.loss_preturns
    loss_lambda_preturns = model.loss_lambda_preturns

    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')
    return total_loss, loss_preturns, loss_lambda_preturns
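# Illustrative sketch (not part of this repo): how the k-step preturns g^k and the
# lambda-preturn g^lambda regressed by the two losses above can be computed for a
# single rollout, following the definitions in the Predictron paper. The function
# name, argument names and array shapes below are assumptions for the example only.
import numpy as np

def preturns_example(rewards, gammas, values, lambdas):
    """Compute k-step preturns and the lambda-preturn for one rollout.

    rewards, gammas, lambdas: arrays of length K (core steps 1..K).
    values: array of length K + 1 (value estimates v_0..v_K).
    Returns (g, g_lambda), where g[k] is the k-step preturn and g[0] = values[0].
    """
    K = len(rewards)
    g = np.zeros(K + 1)
    g[0] = values[0]
    acc, discount = 0.0, 1.0
    for k in range(K):
        acc += discount * rewards[k]               # r_1 + gamma_1 r_2 + ...
        discount *= gammas[k]                      # running product gamma_1 ... gamma_{k+1}
        g[k + 1] = acc + discount * values[k + 1]  # bootstrap with v_{k+1}
    # lambda-preturn via the backward recursion
    # g_lambda_k = (1 - lambda_k) v_k + lambda_k (r_{k+1} + gamma_{k+1} g_lambda_{k+1})
    g_lambda = values[K]
    for k in reversed(range(K)):
        g_lambda = (1.0 - lambdas[k]) * values[k] + \
            lambdas[k] * (rewards[k] + gammas[k] * g_lambda)
    return g, g_lambda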
# types and sequence steps for all allowed actions.
action_space = list(chain.from_iterable(my_sim.station_HT_seq.values()))
action_size = len(action_space)
state_size = len(state)

step_counter = 0

# setup of predictron
config = Config_predictron()
config.state_size = state_size
state_queue = list([])
for i in range(config.episode_length):
    state_queue.append(np.zeros(config.state_size))
reward_queue = list(np.zeros(config.episode_length))
replay_buffer = Replay_buffer(memory_size=config.replay_memory_size)

predictron = Predictron(config)
model = predictron.model
model.load_weights("Predictron_CR.h5")

preturn_loss_arr = []
max_preturn_loss = 0
lambda_preturn_loss_arr = []
max_lambda_preturn_loss = 0

DQN_arr = []
predictron_lambda_arr = []
reward_episode_arr = []

# Creating the DQN agent
dqn_agent = DeepQNet.DQN(state_space_dim=state_size,
                         action_space=action_space,
                         epsilon_max=0.,
def train():
    config = FLAGS
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0),
        trainable=False)

    maze_ims_ph = tf.placeholder(
        tf.float32, [None, FLAGS.maze_size, FLAGS.maze_size, 1])
    maze_labels_ph = tf.placeholder(tf.float32, [None, FLAGS.maze_size])

    model = Predictron(maze_ims_ph, maze_labels_ph, config)
    model.build()

    loss = model.total_loss
    loss_preturns = model.loss_preturns
    loss_lambda_preturns = model.loss_lambda_preturns

    opt = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

    grad_vars = opt.compute_gradients(loss, tf.trainable_variables())
    grads, vars = zip(*grad_vars)
    grads_clipped, _ = tf.clip_by_global_norm(grads, FLAGS.max_grad_norm)
    grad_vars = zip(grads_clipped, vars)
    apply_gradient_op = opt.apply_gradients(grad_vars, global_step=global_step)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    update_op = tf.group(*update_ops)

    # Group all updates into a single train op.
    train_op = tf.group(apply_gradient_op, update_op)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    saver = tf.train.Saver(tf.global_variables())

    tf.train.start_queue_runners(sess=sess)

    train_dir = os.path.join(FLAGS.train_dir,
                             'max_steps_{}'.format(FLAGS.max_depth))
    summary_merged = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(train_dir, sess.graph)

    maze_queue = Queue.Queue(100)

    def maze_generator():
        maze_gen = MazeGenerator(height=FLAGS.maze_size,
                                 width=FLAGS.maze_size,
                                 density=FLAGS.maze_density)
        while True:
            maze_ims, maze_labels = maze_gen.generate_labelled_mazes(
                FLAGS.batch_size)
            maze_queue.put((maze_ims, maze_labels))

    for thread_i in xrange(FLAGS.num_threads):
        t = threading.Thread(target=maze_generator)
        t.start()

    for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        maze_ims_np, maze_labels_np = maze_queue.get()
        _, loss_value, loss_preturns_val, loss_lambda_preturns_val, summary_str = sess.run(
            [train_op, loss, loss_preturns, loss_lambda_preturns, summary_merged],
            feed_dict={
                maze_ims_ph: maze_ims_np,
                maze_labels_ph: maze_labels_np
            })
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
            num_examples_per_step = FLAGS.batch_size
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = duration

            format_str = ('%s: step %d, loss = %.4f, loss_preturns = %.4f, '
                          'loss_lambda_preturns = %.4f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            logger.info(format_str %
                        (datetime.datetime.now(), step, loss_value,
                         loss_preturns_val, loss_lambda_preturns_val,
                         examples_per_sec, sec_per_batch))

        if step % 100 == 0:
            summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            checkpoint_path = os.path.join(train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=step)
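# Illustrative only (not part of the original file): the checkpoints written by
# train() can be restored later, e.g. for evaluation. This assumes the graph has
# been rebuilt exactly as in train() so that variable names match, and that
# `train_dir` points at the same directory used during training.
def restore_latest_checkpoint(sess, saver, train_dir):
    ckpt_path = tf.train.latest_checkpoint(train_dir)
    if ckpt_path is not None:
        saver.restore(sess, ckpt_path)
    return ckpt_path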
def main():
    parser = argparse.ArgumentParser(description='Predictron on random mazes')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of transitions in each mini-batch')
    parser.add_argument('--max-iter', type=int, default=10000,
                        help='Number of iterations to run')
    parser.add_argument('--n-model-steps', type=int, default=16,
                        help='Number of model steps')
    parser.add_argument('--n-channels', type=int, default=32,
                        help='Number of channels for hidden units')
    parser.add_argument('--maze-size', type=int, default=20,
                        help='Size of random mazes')
    parser.add_argument('--use-reward-gamma', type=bool, default=True,
                        help='Use reward and gamma')
    parser.add_argument('--use-lambda', type=bool, default=True,
                        help='Use lambda-network')
    parser.add_argument('--usage-weighting', type=bool, default=True,
                        help='Enable usage weighting')
    parser.add_argument('--n-unsupervised-updates', type=int, default=0,
                        help='Number of unsupervised updates per supervised '
                             'update')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # chainer.set_debug(True)

    model = Predictron(n_tasks=args.maze_size,
                       n_channels=args.n_channels,
                       model_steps=args.n_model_steps,
                       use_reward_gamma=args.use_reward_gamma,
                       use_lambda=args.use_lambda,
                       usage_weighting=args.usage_weighting)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    opt = optimizers.Adam()
    opt.setup(model)

    for i in range(args.max_iter):
        x, t = generate_supervised_batch(maze_size=args.maze_size,
                                         batch_size=args.batchsize)
        if args.gpu >= 0:
            x = chainer.cuda.to_gpu(x)
            t = chainer.cuda.to_gpu(t)
        model.cleargrads()
        g_k_loss, g_lambda_loss = model.supervised_loss(x, t)
        supervised_loss = g_k_loss + g_lambda_loss
        supervised_loss.backward()
        opt.update()

        for _ in range(args.n_unsupervised_updates):
            x = generate_unsupervised_batch(maze_size=args.maze_size,
                                            batch_size=args.batchsize)
            if args.gpu >= 0:
                x = chainer.cuda.to_gpu(x)
            model.cleargrads()
            unsupervised_loss = model.unsupervised_loss(x)
            unsupervised_loss.backward()
            opt.update()

        print(i, g_k_loss.data, g_lambda_loss.data, (g_lambda_loss.data ** 0.5))
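# Note (illustrative, not part of the original script): argparse's type=bool treats
# any non-empty string as True, so e.g. "--use-lambda False" still parses as True.
# A common workaround is an explicit string-to-bool converter such as the sketch
# below, passed as type=str2bool to the boolean arguments above.
def str2bool(s):
    return str(s).lower() in ('1', 'true', 'yes', 'y')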