def collect_steps(env: tf_py_environment.TFPyEnvironment, policy: tf_policy.Base, buffer: ReplayBuffer):
    """Take one policy-driven step in ``env`` and store the transition.

    The environment's current time step is read, the policy picks an action
    for it, the environment is advanced with that action, and the resulting
    (time step, action step, next time step) triple is converted into a
    trajectory that is appended to ``buffer``.

    :param env: Environment to read the current time step from and to step.
    :param policy: Policy whose ``action`` method selects the action.
    :param buffer: Replay buffer receiving the trajectory via ``add_batch``.
    """
    before = env.current_time_step()
    decision = policy.action(before)
    after = env.step(decision.action)
    buffer.add_batch(trajectory.from_transition(before, decision, after))
def save_environment_agent_video(
    filename: str,
    agent: tf_agent.TFAgent,
    tf_env: TFPyEnvironment,
    py_env: TimeLimit,
    num_episodes: int = 1,
) -> None:
    """
    Save a video of an agent acting in the environment.

    Render method needs to be available in the python version of the environment.
    The python environment is closed when the function finishes, whether the
    recording succeeded or an exception was raised part-way through.

    TODO:
        - how to prevent opening a window when saving a video?
        - sometimes nothing is saved?
        - gym wrappers monitoring VideoRecorder

    :param filename: A valid path to which a file with the video will be saved.
    :param agent: An agent whose policy will be evaluated.
    :param tf_env: A TensorFlow environment used for interaction with the agent.
    :param py_env: A Python OpenAI Gym environment used for rendering the video.
        Environment has to provide `render` method.
        NOTE(review): presumably this is the environment wrapped by ``tf_env``,
        otherwise the rendered frames would not follow the agent - confirm.
    :param num_episodes: A number of episodes to evaluate.
    :return: A video is saved to filename.
    """
    try:
        with imageio.get_writer(filename, fps=60) as video:
            for _ in range(num_episodes):
                time_step = tf_env.reset()
                # Record the initial frame before the agent acts.
                video.append_data(py_env.render())
                while not time_step.is_last():
                    action_step = agent.policy.action(time_step)
                    time_step = tf_env.step(action_step.action)
                    video.append_data(py_env.render())
    finally:
        # Always release the rendering resources, even if a step or a
        # render call failed while the video was being written.
        py_env.close()
def compute_total_reward(env: TFPyEnvironment, policy):
    """Run one episode with ``policy`` and return the summed reward.

    The environment is reset, then stepped with the policy's actions until a
    time step reports ``is_last()``. Rewards of all steps taken are added up
    and the first element of the accumulated value's ``numpy()`` array is
    returned.

    :param env: Environment to reset and step.
    :param policy: Object whose ``action(time_step)`` yields a step with an
        ``action`` attribute.
    :return: First entry of the accumulated reward.
    """
    current = env.reset()
    accumulated = 0.0
    while not current.is_last():
        current = env.step(policy.action(current).action)
        accumulated += current.reward
    return accumulated.numpy()[0]
def collect_step(env: tf_py_environment.TFPyEnvironment, policy, buffer):
    """Collect a single transition from ``env`` and push it to ``buffer``.

    :param env: Environment providing the current time step and being stepped.
    :param policy: Policy used to choose the action for the current time step.
    :param buffer: Replay buffer; the transition is stored with ``add_batch``.
    """
    observed = env.current_time_step()
    chosen = policy.action(observed)
    resulting = env.step(chosen.action)

    # Bundle the transition into a trajectory and hand it to the replay buffer.
    transition = trajectory.from_transition(observed, chosen, resulting)
    buffer.add_batch(transition)
def step(
    environment: TFPyEnvironment, policy: tf_policy.TFPolicy, replay_buffer: ReplayBuffer
) -> typing.Tuple[float, bool]:
    """Advance ``environment`` by one policy action and record the transition.

    :param environment: Environment whose current time step is acted upon.
    :param policy: Policy selecting the action.
    :param replay_buffer: Buffer that receives the resulting trajectory.
    :return: A pair of the first entry of the new time step's reward (as a
        numpy value) and the result of the new time step's ``is_last()``.
    """
    previous = environment.current_time_step()
    decision = policy.action(previous)
    current = environment.step(decision.action)
    replay_buffer.add_batch(trajectory.from_transition(previous, decision, current))

    reward = current.reward.numpy()[0]
    finished = current.is_last()
    return reward, finished
def create_policy_eval_video(self, env, policy, filename, num_episodes=5, fps=30):
    """Record ``policy`` acting in ``env`` as an mp4 and return it embedded.

    ``env`` is wrapped in a ``TFPyEnvironment`` for stepping, while frames are
    taken from ``env.render()``. Each episode starts with a reset followed by
    one step with action ``1`` (presumably the FIRE action that launches the
    game - confirm against the environment's action set).

    :param env: Python environment to wrap, step and render.
    :param policy: Policy whose ``action`` method picks the actions.
    :param filename: Output path without extension; ``".mp4"`` is appended.
    :param num_episodes: Number of episodes to record.
    :param fps: Frame rate passed to the video writer.
    :return: Whatever ``self.embed_mp4`` returns for the written file.
    """
    filename = filename + ".mp4"
    tf_env = TFPyEnvironment(env)
    # The context manager closes the writer; no explicit close() is needed.
    with imageio.get_writer(filename, fps=fps) as video:
        for _ in range(num_episodes):
            tf_env.reset()
            # Keep the time step produced by the initial action so the policy
            # acts on the current observation instead of the stale reset one.
            time_step = tf_env.step(1)
            video.append_data(env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(env.render())
    return self.embed_mp4(filename)
def _evaluate_dyke_agent(env: TFPyEnvironment, agent: DqnAgent, num_episodes: int = 10) -> np.ndarray:
    """Evaluate the agent's policy and return the undiscounted return per episode.

    For each episode the environment is reset and stepped with the actions of
    ``agent.policy`` until a time step reports ``is_last()``.

    :param env: Environment to evaluate in. Each reward must hold exactly one
        value (the original element assignment had the same requirement).
    :param agent: Agent whose ``policy`` is evaluated.
    :param num_episodes: Number of episodes to run.
    :return: Array of shape ``(num_episodes,)`` with the summed reward of each
        episode.
    """
    returns: np.ndarray = np.zeros(shape=(num_episodes, ))
    for ep in range(num_episodes):
        time_step: TimeStep = env.reset()
        episode_return: float = 0.0
        while not time_step.is_last():
            action_step = agent.policy.action(time_step)
            time_step = env.step(action_step.action)
            # Convert the reward to a Python float so the accumulator really is
            # a float and the assignment below does not depend on NumPy's
            # deprecated conversion of a 1-element array to a scalar.
            episode_return += time_step.reward.numpy().item()
        returns[ep] = episode_return
    return returns
def compute_avg_return(env: tf_py_environment.TFPyEnvironment, policy, num_episodes):
    """Average the per-episode sum of rewards over ``num_episodes`` episodes.

    :param env: Environment that is reset at the start of every episode.
    :param policy: Policy providing ``action(time_step)``.
    :param num_episodes: Number of episodes to run; used as the divisor.
    :return: First entry of the averaged return's ``numpy()`` array.
    """
    cumulative = 0.0
    for _ in range(num_episodes):
        current = env.reset()
        episode_sum = 0.0
        while not current.is_last():
            current = env.step(policy.action(current).action)
            episode_sum += current.reward
        cumulative += episode_sum
    return (cumulative / num_episodes).numpy()[0]
def create_video(py_environment: PyEnvironment, tf_environment: TFPyEnvironment, policy: tf_policy, num_episodes=10, video_filename='imageio.mp4'):
    """Write a 60 fps video of ``policy`` acting for ``num_episodes`` episodes.

    The TensorFlow environment is used for resetting and stepping, while every
    frame comes from ``py_environment.render()``. Progress messages are
    printed for the whole video and for each episode.

    :param py_environment: Environment used for rendering frames.
    :param tf_environment: Environment used for interaction with the policy.
    :param policy: Policy providing ``action(time_step)``.
    :param num_episodes: Number of episodes to record.
    :param video_filename: Path of the video file to write.
    """
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as writer:

        def record_frame():
            writer.append_data(py_environment.render())

        for episode in range(num_episodes):
            print("Generating episode %d of %d" % (episode, num_episodes))
            current = tf_environment.reset()
            record_frame()
            while not current.is_last():
                decision = policy.action(current)
                current = tf_environment.step(decision.action)
                record_frame()
def compute_average_reward(env: tf_py_environment.TFPyEnvironment, policy: tf_policy.Base, num_episodes=10) -> float:
    """Run ``num_episodes`` episodes and return the mean summed reward.

    Each episode resets ``env`` and steps it with the policy's actions until a
    time step reports ``is_last()``. The observation of every step is printed
    as a debugging aid.

    NOTE(review): the value returned is ``total_reward / num_episodes`` where
    the rewards added come straight from ``time_step.reward``; it is therefore
    presumably a tensor rather than the annotated ``float`` - confirm.

    :param env: Environment to evaluate in.
    :param policy: Policy providing ``action(time_step)``.
    :param num_episodes: Number of episodes to run; used as the divisor.
    :return: Accumulated reward divided by ``num_episodes``.
    """
    total_reward = 0
    for _ in range(num_episodes):
        time_step: ts.TimeStep = env.reset()
        episode_reward = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_reward += time_step.reward
            # print(action_step.action.numpy()[0], end=' ')
            # NOTE(review): assumed to run once per step; the original
            # indentation was lost, verify it was not meant to run once per
            # episode after the loop.
            print(time_step.observation.numpy())
        total_reward += episode_reward
    return total_reward / num_episodes
def evaluate_episode(policy, env_params):
    """Play one episode with ``policy`` and report its outcome.

    The environment class is selected from the keys of ``env_params``:
    ``'n_monsters'`` takes precedence, then ``'is_jumping'``, otherwise the
    plain lake monster environment is used. The chosen class is instantiated
    with ``env_params`` as keyword arguments.

    Returns the reward of the final time step and the number of steps taken
    multiplied by the environment's ``step_size``.
    """
    env_cls = (
        MultiMonsterEnvironment if 'n_monsters' in env_params
        else JumpingEnvironment if 'is_jumping' in env_params
        else LakeMonsterEnvironment
    )
    py_env = env_cls(**env_params)
    tf_env = TFPyEnvironment(py_env)

    steps_taken = 0
    current = tf_env.reset()
    while not current.is_last():
        current = tf_env.step(policy.action(current).action)
        steps_taken += 1

    final_reward = current.reward.numpy().item()
    return final_reward, steps_taken * py_env.step_size
def create_video(py_environment: PyEnvironment, tf_environment: TFPyEnvironment, policy: tf_policy, num_episodes=10, video_filename='imageio.mp4'):
    """Record ``num_episodes`` episodes of ``policy`` to a 60 fps video.

    Stepping happens on ``tf_environment``; frames are rendered from
    ``py_environment``. After each episode a message with the episode index
    and the summed reward of that episode is printed.

    :param py_environment: Environment used for rendering frames.
    :param tf_environment: Environment used for interaction with the policy.
    :param policy: Policy providing ``action(time_step)``.
    :param num_episodes: Number of episodes to record.
    :param video_filename: Path of the video file to write.
    """
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as writer:
        for episode in range(num_episodes):
            current = tf_environment.reset()
            episode_return = 0.0
            writer.append_data(py_environment.render())
            while not current.is_last():
                current = tf_environment.step(policy.action(current).action)
                episode_return += current.reward
                writer.append_data(py_environment.render())
            summary = f"Generated episode {episode} of {num_episodes}. Return:{episode_return} "
            print(summary)
def compute_average_return(env: tf_py_environment.TFPyEnvironment, policy, num_episodes: int = 1) -> float:
    """Average the per-episode return of ``policy`` over ``num_episodes``.

    An episode stops as soon as any entry of the time step's ``is_last()``
    result is true. At every step the mean of the step's reward is added to
    the episode return.

    :param env: Environment that is reset at the start of every episode.
    :param policy: Policy providing ``action(time_step)``.
    :param num_episodes: Number of episodes to run; used as the divisor.
    :return: Sum of episode returns divided by ``num_episodes``.
    """
    grand_total = 0.0
    for _ in range(num_episodes):
        current = env.reset()
        running = 0.0
        while not any(current.is_last()):
            chosen = policy.action(current)
            current = env.step(action=chosen.action)
            running += np.mean(current.reward)
        grand_total += running
    return grand_total / num_episodes
def create_many_policy_gif(uid, file_path, monster_speed=4.0):
    """Create a gif superimposing the actions of many policies.

    Every saved policy whose path matches ``configs.POLICY_DIR + uid + '*'``
    is loaded and played in its own ``LakeMonsterEnvironment`` for a fixed
    number of steps. The environment parameters come from the policy's
    metadata, with ``step_size`` and ``monster_speed`` overwritten. The agent
    position recorded for each step is then drawn, one frame per step, with
    ``render_many_agents`` and the resulting gif is optimized in place.

    :param uid: Prefix (after ``configs.POLICY_DIR``) of the policies to load.
    :param file_path: Path of the gif to write.
    :param monster_speed: Monster speed forced on every environment and passed
        to the renderer.
    """
    n_steps = 300  # = timeout_factor / step_size
    step_size = 0.01
    fps = 10
    p_paths = glob.glob(configs.POLICY_DIR + uid + '*')
    all_positions = []
    colors = []
    for p_path in tqdm(p_paths):
        # One random colour per policy (third channel fixed at 0).
        color = (np.random.randint(256), np.random.randint(128), 0)
        policy = tf.saved_model.load(p_path)
        env_params = policy.get_metadata()
        env_params = tf_to_py(env_params)
        # overwriting parameters
        env_params['step_size'] = step_size
        env_params['monster_speed'] = monster_speed
        py_env = LakeMonsterEnvironment(**env_params)
        tf_env = TFPyEnvironment(py_env)
        time_step = tf_env.reset()
        agent_positions = {}
        for step in range(n_steps):
            # Only advance the environment while the episode is still running.
            if not time_step.is_last():
                action = policy.action(time_step)
                time_step = tf_env.step(action.action)
            # NOTE(review): the position is assumed to be recorded on every
            # step (the frame loop below indexes every step of every policy);
            # the original indentation was lost, confirm.
            # Rotate the point (r, 0) by the difference between the monster's
            # and the agent's total rotation.
            theta = py_env.total_monster_rotation - py_env.total_agent_rotation
            c, s = np.cos(theta), np.sin(theta)
            rot_matrix = np.array(((c, -s), (s, c)))
            agent_positions[step] = np.dot(rot_matrix, np.array((py_env.r, 0)))
        all_positions.append(agent_positions)
        colors.append(color)
    with imageio.get_writer(file_path, mode='I', fps=fps) as gif:
        for step in range(n_steps):
            # Gather the position of every policy's agent at this step.
            positions = [item[step] for item in all_positions]
            im = render_many_agents(positions, colors, step, step_size, 4, monster_speed)
            gif.append_data(np.array(im))
    pygifsicle.optimize(file_path)
def episode_as_video(py_env, policy, filepath, fps=10):
    """Create mp4 video through py_environment render method.

    One episode is played with ``policy``; every step is rendered into a
    temporary file ``tmp.mp4`` in the current working directory, followed by
    three extra seconds of the final frame. The file is then renamed to
    ``filepath`` with ``-<result>`` inserted before the extension, where
    ``result`` is the second value returned by ``py_env.determine_reward()``.

    :param py_env: Python environment providing ``render`` and
        ``determine_reward``.
    :param policy: Policy providing ``action(time_step)``.
    :param filepath: Destination path; must end in ``.mp4``. Directory parts
        containing dots (e.g. ``./videos/run.mp4``) are accepted.
    :param fps: Frame rate of the video.
    :raises ValueError: If ``filepath`` does not end in ``.mp4``. Raised before
        any rendering takes place.
    """
    # Validate the destination first so an invalid name does not cost a full
    # episode of rendering; splitext only looks at the final extension.
    root, extension = os.path.splitext(filepath)
    if extension != '.mp4':
        raise ValueError('filepath must end in .mp4, got: ' + repr(filepath))

    tf_env = TFPyEnvironment(py_env)
    with imageio.get_writer('tmp.mp4', fps=fps) as video:
        time_step = tf_env.reset()
        video.append_data(py_env.render())
        while not time_step.is_last():
            action = policy.action(time_step).action
            time_step = tf_env.step(action)
            video.append_data(py_env.render())
        for _ in range(3 * fps):  # play for 3 more seconds
            video.append_data(py_env.render())

    # giving video file a more descriptive name
    _, result = py_env.determine_reward()
    os.rename('tmp.mp4', root + '-' + result + extension)
def compute_mean_reward(environment: TFPyEnvironment, policy: tf_policy.Base, num_episodes=10) -> float:
    """
    Evaluate mean reward over `num_episodes`

    Implementation is taken from Tensorflow official documentation tutorial:
    https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial#metrics_and_evaluation

    :param environment: Environment that is reset for every episode.
    :param policy: Policy providing ``action(time_step)``.
    :param num_episodes: Number of episodes to run; used as the divisor.
    :return: First entry of the averaged reward's ``numpy()`` array.
    """
    reward_sum = 0.0
    for _ in range(num_episodes):
        current = environment.reset()
        per_episode = 0.0
        while not current.is_last():
            chosen = policy.action(current)
            current = environment.step(chosen.action)
            per_episode += current.reward
        reward_sum += per_episode
    return (reward_sum / num_episodes).numpy()[0]
def episode_as_gif(py_env, policy, save_path, fps=10, show_path=True):
    """Create gif through py_environment render method.

    Plays one episode, writing one frame per step. Frames inside the episode
    come from ``py_env.render('return_real')``, which also yields the agent's
    position; when ``show_path`` is set the positions visited so far are drawn
    onto the frame with ``render_agent_path``. After the episode the plain
    render is repeated ``fps`` times and the gif is optimized in place.
    """
    tf_env = TFPyEnvironment(py_env)
    visited = []
    with imageio.get_writer(save_path, mode='I', fps=fps) as gif:
        current = tf_env.reset()
        # The policy state is threaded through so scripted policies work too.
        state = policy.get_initial_state(batch_size=1)
        gif.append_data(py_env.render())

        while not current.is_last():
            decision = policy.action(current, state)
            state = decision.state
            current = tf_env.step(decision.action)

            frame, position = py_env.render('return_real')
            visited.append(position)
            frame = render_agent_path(frame, visited) if show_path else frame
            gif.append_data(np.array(frame))

        # Hold the final view for one more second.
        for _ in range(fps):
            gif.append_data(py_env.render())
    pygifsicle.optimize(save_path)
print('Step = {0}: Loss = {1}'.format(step, train_loss.loss)) # Save to checkpoint checkpointer.save(global_step) if it % eval_interval == 0: reward = compute_total_reward(eval_env, ppo_agent.policy) print('Step = {0}: Average reward = {1}'.format(step, reward)) rewards.append([reward]) # Save policy policy_saver.save(os.path.relpath('ppo_policy')) # View a video of the robot video_filename = 'ppo_minitaur_{0}.mp4'.format(video_num) print('Creating video...') writer = imageio.get_writer(video_filename, fps=30) ts = eval_env.reset() writer.append_data(eval_py_env.render()) while not ts.is_last(): ts = eval_env.step(ppo_agent.policy.action(ts).action) writer.append_data(eval_py_env.render()) writer.close() # Show the video os.startfile(video_filename) # Increment counter video_num += 1 # View the average reward over training time steps = range(0, num_iter + 1, eval_interval) plt.plot(steps, rewards) plt.ylabel('Average reward') plt.xlabel('Step') plt.show()
if cumulative_done: self._episode_ended = True return ts.termination(self._state, reward) else: return ts.transition(self._state, reward, discount=0.98) from tf_agents.environments.tf_py_environment import TFPyEnvironment tf_env = TFPyEnvironment(YoushiEnv) #tf_env = YoushiEnv() #tf_agent = tf.saved_model.load(saved_models_path) q_net = tf.saved_model.load("MyPolicyHard") time_step = tf_env.reset() display = DisplayIA.Display() lost = False score = 0 print(type(q_net)) while not time_step.is_last(): display.refresh(time_step.observation) action = q_net.action(time_step) time_step = tf_env.step(action) score += 1 print(score)
def reset_and_fire_on_life_lost(trajectory):
    """Driver observer: reset the env and take action 1 when the life count changes.

    The current number of lives is read from the first wrapped python
    environment's ALE handle and compared with the module-level
    ``prev_lives``, which is updated afterwards.

    :param trajectory: Trajectory passed in by the driver; not used here.
    """
    global prev_lives
    lives = tf_env.pyenv.envs[0].ale.lives()
    if prev_lives != lives:
        tf_env.reset()
        tf_env.step(1)
        prev_lives = lives


# Driver that runs the saved policy for 1000 steps, notifying each observer
# after every step.
watch_driver = DynamicStepDriver(tf_env, saved_policy, observers=[ save_frames, reset_and_fire_on_life_lost, ShowProgress(1000) ], num_steps=1000)
tf_env.reset()  # reset the env
time_step = tf_env.step(1)  # fire the ball to begin playing
policy_state = saved_policy.get_initial_state()  # empty state ()
final_time_step, final_policy_state = watch_driver.run( time_step, policy_state)
# render a window that shows the agent plays (works on the jupyter notebook)
renderingUtils = RenderingUtils(frames)
renderingUtils.plot_animation()
renderingUtils.generate_gif("breakout.gif")
renderingUtils.create_policy_eval_video(env, saved_policy, "trained-agent")
# Main training loop time_step, policy_state = None, None for it in range(N_ITERATIONS): if COLLECT_RANDOM: print('Running random driver...') time_step, policy_state = random_driver.run(time_step, policy_state) print('Running agent driver...') time_step, policy_state = driver.run(time_step, policy_state) print('Training...') for train_it in range(BUFFER_LENGTH//BATCH_SIZE): experience, _ = replay_buffer.get_next(sample_batch_size=BATCH_SIZE, num_steps=2) agent.train(experience) if (train_it + 1) % 100 == 0: print('{0} training iterations'.format(train_it + 1)) print('Saving...') # Save to checkpoint checkpointer.save(global_step) # Save policy policy_saver.save(os.path.relpath('policy')) # Show total reward of actual policy for 1 episode total_reward = 0.0 eval_ts = eval_env.reset() num_steps = 0 while (not eval_ts.is_last()) and num_steps < EVAL_MAX_STEPS: action_step = agent.policy.action(eval_ts) eval_ts = eval_env.step(action_step.action) total_reward += eval_ts.reward num_steps += 1 print('Iteration = {0}: Steps taken: = {1} of {2}: Total reward = {3}'.format(it, num_steps, EVAL_MAX_STEPS, total_reward))