def test_observation_zeroing(self):
    """ Tests zeroing out of frames not from current episode """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for terminal_idx in range(5):
        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            terminal = 1 if i == terminal_idx else 0
            er.append(partial_obs, 0, 0, terminal)
            if i <= terminal_idx:
                partial_obs *= 0
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
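# A minimal sketch of the masking behaviour this test checks, using a
# hypothetical helper (not part of ExperienceReplay's actual API): frames
# recorded before the current episode's start are zeroed so they cannot leak
# into a stacked observation that straddles an episode boundary.
def _zero_pre_episode_frames(frames, terminals):
    """frames: list of (H, W, 1) arrays; terminals: 0/1 flag per frame."""
    frames = [f.copy() for f in frames]
    # Everything up to and including the most recent terminal frame belongs to
    # a previous episode and must not appear in the stacked observation.
    last_terminal = max((i for i, t in enumerate(terminals) if t), default=-1)
    for i in range(last_terminal + 1):
        frames[i][:] = 0
    return frames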
def run_episode(plan_step_fn, learner, dataset, cache_subtree, add_returns,
                preproc_obs_fn=None, render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    # New auxiliary buffer to save current episode transitions
    aux_replay = ExperienceReplay()

    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a, cache_subtree, render,
                                                       render_size=(512, 512))
        aux_replay.append({"observations": prev_root_data["obs"],
                           "target_policy": tree_policy})
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"], dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards, discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    # Add transitions to the buffer that will be used for learning
    dataset.extend(aux_replay)

    return episode_rewards
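# compute_returns is called above but not defined in this file; this is a
# minimal sketch, assuming it performs the usual discounted-return computation
# ("backpropagate rewards") by accumulating backwards through the episode. The
# project's actual helper may differ.
def compute_returns(rewards, discount_factor):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    # R_t = r_t + discount_factor * R_{t+1}
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        returns[t] = running
    return returns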
def test_sampling(self):
    """ Tests sampling of rewards, actions and terminals """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        er.append(partial_obs, 1, 1, 0)

    batch = er.sample(1)
    _, rewards, actions, _, terminals = batch
    assert np.array_equal(rewards, np.array([1]))
    assert np.array_equal(actions, np.array([1]))
    assert np.array_equal(terminals, np.array([0]))
def test_observation_construction(self):
    """ Tests observation construction from partial observations """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    obs_ = []
    obs_next_ = []
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        if i < 5:
            obs_.append(partial_obs)
        if i > 1:
            obs_next_.append(partial_obs)
        er.append(partial_obs, 0, 0, 0)
    obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
    obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

    batch = er.sample(1)
    obs, rewards, actions, obs_next, terminals = batch
    assert np.array_equal(obs_, obs)
    assert np.array_equal(obs_next_, obs_next)
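# A small standalone numpy check (an assumed illustration, not part of the
# test suite) of the shape handling above: stacking four (84, 84, 1) partial
# observations and transposing with axes (3, 1, 2, 0) yields a (1, 84, 84, 4)
# array, i.e. one frame-stacked observation with batch size 1.
def _stacking_shape_example():
    frames = [np.ones((84, 84, 1)) * i for i in range(1, 5)]
    stacked = np.transpose(np.array(frames), (3, 1, 2, 0))
    assert stacked.shape == (1, 84, 84, 4)
    # The last channel holds the most recently appended frame.
    assert np.all(stacked[0, :, :, -1] == 4)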
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)
    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)  # TODO handle this

    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}\n".format(cfg.model_freq,
                                                               cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape

    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # If in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32, shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image(
                        "VAE_OUT_{}_{}".format(i, j),
                        tf.reshape(image_summaries_ph[i, :, :, j],
                                   (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)
        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame,
                                               feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames,
              disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                 reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while (not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
                (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal
                and env.episode_count >= cfg.max_episode_count)):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * (
                    (cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                     reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(
                                batch[0][np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every == 0:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i], global_step=steps)

                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])

                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess, "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" % (env.episode_count,
                                                      env.episode_reward,
                                                      env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward",
                                   env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames",
                                   env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
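# add_simple_summary is used above but not defined in this file; a minimal
# sketch, assuming it wraps the usual TF1 scalar-summary pattern. The project's
# actual helper may differ.
def add_simple_summary(summary_writer, tag, value, step):
    # Build a protobuf Summary with a single scalar and write it for this step.
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    summary_writer.add_summary(summary, global_step=step)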