def collect_steps(env: tf_py_environment.TFPyEnvironment,
                  policy: tf_policy.Base, buffer: ReplayBuffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
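A minimal usage sketch for collect_steps above, assuming a CartPole task, a random collection policy, and a TFUniformReplayBuffer; the environment, buffer size, and iteration count are illustrative choices, not part of the original snippet.

from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Illustrative setup: CartPole wrapped as a TF environment plus a random policy.
env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
collect_policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(),
                                                 env.action_spec())
buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=collect_policy.trajectory_spec,
    batch_size=env.batch_size,
    max_length=10000)

env.reset()
for _ in range(100):  # collect 100 single-step transitions into the buffer
    collect_steps(env, collect_policy, buffer)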
Example #2
def save_environment_agent_video(
    filename: str,
    agent: tf_agent.TFAgent,
    tf_env: TFPyEnvironment,
    py_env: TimeLimit,
    num_episodes: int = 1,
) -> None:
    """
    Save a video of an agent acting in the environment. Render method needs to be available in the
    python version of the environment.
    TODO:
    - how to prevent opening a window when saving a video?
    - sometimes nothing is saved?
    - gym wrappers monitoring VideoRecorder

    :param filename: A valid path to which a file with the video will be saved.
    :param agent: An agent whose policy will be evaluated.
    :param tf_env: A TensorFlow environment used for interaction with the agent.
    :param py_env: A Python OpenAI Gym environment used for rendering the video. Environment has
        to provide `render` method.
    :param num_episodes: A number of episodes to evaluate.

    :return: A video is saved to filename.
    """
    with imageio.get_writer(filename, fps=60) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            video.append_data(py_env.render())
            while not time_step.is_last():
                action_step = agent.policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(py_env.render())
    py_env.close()
Example #3
def compute_total_reward(env: TFPyEnvironment, policy):
    total_reward = 0.0
    time_step = env.reset()
    while not time_step.is_last():
        policy_step = policy.action(time_step)
        time_step = env.step(policy_step.action)
        total_reward += time_step.reward
    return total_reward.numpy()[0]
Example #4
def collect_step(env: tf_py_environment.TFPyEnvironment, policy, buffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)
Example #5
def step(
    environment: TFPyEnvironment, policy: tf_policy.TFPolicy, replay_buffer: ReplayBuffer
) -> typing.Tuple[float, bool]:
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    return next_time_step.reward.numpy()[0], next_time_step.is_last()
Example #6
def create_policy_eval_video(self,
                             env,
                             policy,
                             filename,
                             num_episodes=5,
                             fps=30):
    filename = filename + ".mp4"
    tf_env = TFPyEnvironment(env)
    with imageio.get_writer(filename, fps=fps) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            tf_env.step(1)  # hard-coded "fire"-style action to start the episode
            video.append_data(env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(env.render())
    return self.embed_mp4(filename)
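The method above returns self.embed_mp4(filename), a helper that is not part of this snippet. A typical implementation, following the pattern used in the TF-Agents tutorials (an illustrative sketch, not the original author's code), looks like this:

import base64
from IPython import display

def embed_mp4(filename):
    """Embed an mp4 video file in a Jupyter notebook output cell."""
    video = open(filename, 'rb').read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
      <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())
    return display.HTML(tag)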
Example #7
def _evaluate_dyke_agent(env: TFPyEnvironment,
                         agent: DqnAgent,
                         num_episodes: int = 10) -> np.ndarray:
    returns: np.ndarray = np.zeros(shape=(num_episodes, ))
    for ep in range(num_episodes):
        time_step: TimeStep = env.reset()
        episode_return: float = 0.0
        while not time_step.is_last():
            action_step = agent.policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        returns[ep] = episode_return
    return returns
Example #8
def compute_avg_return(env: tf_py_environment.TFPyEnvironment, policy,
                       num_episodes):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]
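A short usage sketch for compute_avg_return, assuming CartPole and a random policy as an untrained baseline; in a training script the second argument would typically be agent.policy evaluated on a separate evaluation environment.

from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies import random_tf_policy

eval_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
baseline_policy = random_tf_policy.RandomTFPolicy(eval_env.time_step_spec(),
                                                  eval_env.action_spec())

# Average return of a purely random policy over 10 episodes.
print(compute_avg_return(eval_env, baseline_policy, num_episodes=10))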
Example #9
def create_video(py_environment: PyEnvironment,
                 tf_environment: TFPyEnvironment,
                 policy: tf_policy,
                 num_episodes=10,
                 video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            print("Generating episode %d of %d" % (episode, num_episodes))

            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)

                time_step = tf_environment.step(action_step.action)
                video.append_data(py_environment.render())
Example #10
def compute_average_reward(env: tf_py_environment.TFPyEnvironment,
                           policy: tf_policy.Base,
                           num_episodes=10) -> float:
    total_reward = 0
    for _ in range(num_episodes):
        time_step: ts.TimeStep = env.reset()
        episode_reward = 0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_reward += time_step.reward
            # print(action_step.action.numpy()[0], end=' ')
            print(time_step.observation.numpy())

        total_reward += episode_reward

    return total_reward / num_episodes
Example #11
def evaluate_episode(policy, env_params):
    """Use naive while loop to evaluate policy in single episode."""
    if 'n_monsters' in env_params:
        env = MultiMonsterEnvironment
    elif 'is_jumping' in env_params:
        env = JumpingEnvironment
    else:
        env = LakeMonsterEnvironment
    py_env = env(**env_params)
    tf_env = TFPyEnvironment(py_env)
    ts = tf_env.reset()
    n_steps = 0
    while not ts.is_last():
        action = policy.action(ts)
        ts = tf_env.step(action.action)
        n_steps += 1

    reward = ts.reward.numpy().item()
    return reward, n_steps * py_env.step_size
Example #12
def create_video(py_environment: PyEnvironment,
                 tf_environment: TFPyEnvironment,
                 policy: tf_policy,
                 num_episodes=10,
                 video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            episode_return = 0.0
            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_environment.step(action_step.action)
                episode_return += time_step.reward
                video.append_data(py_environment.render())
            print(
                f"Generated episode {episode} of {num_episodes}. Return:{episode_return} "
            )
Example #13
def compute_average_return(env: tf_py_environment.TFPyEnvironment,
                           policy,
                           num_episodes: int = 1) -> float:
    total_return = 0.0

    for _ in range(num_episodes):
        time_step_ = env.reset()
        episode_return = 0.0

        while not any(time_step_.is_last()):
            action_step = policy.action(time_step_)
            time_step_ = env.step(action=action_step.action)
            episode_return += np.mean(time_step_.reward)

        total_return += episode_return

    average_return = total_return / num_episodes

    return average_return
Example #14
def create_many_policy_gif(uid, file_path, monster_speed=4.0):
    """Create a gif superimposing the actions of many policies."""
    n_steps = 300  # = timeout_factor / step_size
    step_size = 0.01
    fps = 10
    p_paths = glob.glob(configs.POLICY_DIR + uid + '*')

    all_positions = []
    colors = []
    for p_path in tqdm(p_paths):
        color = (np.random.randint(256), np.random.randint(128), 0)
        policy = tf.saved_model.load(p_path)
        env_params = policy.get_metadata()
        env_params = tf_to_py(env_params)

        # overwriting parameters
        env_params['step_size'] = step_size
        env_params['monster_speed'] = monster_speed
        py_env = LakeMonsterEnvironment(**env_params)
        tf_env = TFPyEnvironment(py_env)

        time_step = tf_env.reset()
        agent_positions = {}
        for step in range(n_steps):
            if not time_step.is_last():
                action = policy.action(time_step)
                time_step = tf_env.step(action.action)
            theta = py_env.total_monster_rotation - py_env.total_agent_rotation
            c, s = np.cos(theta), np.sin(theta)
            rot_matrix = np.array(((c, -s), (s, c)))
            agent_positions[step] = np.dot(rot_matrix, np.array((py_env.r, 0)))
        all_positions.append(agent_positions)
        colors.append(color)

    with imageio.get_writer(file_path, mode='I', fps=fps) as gif:
        for step in range(n_steps):
            positions = [item[step] for item in all_positions]
            im = render_many_agents(positions, colors, step, step_size, 4,
                                    monster_speed)
            gif.append_data(np.array(im))
    pygifsicle.optimize(file_path)
Example #15
def episode_as_video(py_env, policy, filepath, fps=10):
    """Create mp4 video through py_environment render method."""

    tf_env = TFPyEnvironment(py_env)
    with imageio.get_writer('tmp.mp4', fps=fps) as video:
        time_step = tf_env.reset()
        video.append_data(py_env.render())
        while not time_step.is_last():
            action = policy.action(time_step).action
            time_step = tf_env.step(action)
            video.append_data(py_env.render())
        for _ in range(3 * fps):  # play for 3 more seconds
            video.append_data(py_env.render())

    # giving video file a more descriptive name
    _, result = py_env.determine_reward()

    assert filepath.split('.')[1] == 'mp4'
    split = filepath.split('.')
    split[0] += '-' + result
    filepath = '.'.join(split)
    os.rename('tmp.mp4', filepath)
Example #16
def compute_mean_reward(environment: TFPyEnvironment,
                        policy: tf_policy.Base,
                        num_episodes=10) -> float:
    """
    Evaluate mean reward over `num_episodes`
    Implementation is taken from Tensorflow official documentation tutorial:
    https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial#metrics_and_evaluation
    """
    total_reward = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_reward = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_reward += time_step.reward

        total_reward += episode_reward

    avg_rewards = total_reward / num_episodes
    return avg_rewards.numpy()[0]
Example #17
def episode_as_gif(py_env, policy, save_path, fps=10, show_path=True):
    """Create gif through py_environment render method."""

    tf_env = TFPyEnvironment(py_env)
    path = []
    with imageio.get_writer(save_path, mode='I', fps=fps) as gif:
        time_step = tf_env.reset()
        # using the policy_state to deal with scripted_policy possibility
        policy_state = policy.get_initial_state(batch_size=1)
        gif.append_data(py_env.render())

        while not time_step.is_last():
            action = policy.action(time_step, policy_state)
            time_step = tf_env.step(action.action)
            im, real_position = py_env.render('return_real')
            path.append(real_position)
            if show_path:
                im = render_agent_path(im, path)
            policy_state = action.state
            gif.append_data(np.array(im))

        for _ in range(fps):  # play for 1 more second
            gif.append_data(py_env.render())
    pygifsicle.optimize(save_path)
Example #18
        print('Step = {0}: Loss = {1}'.format(step, train_loss.loss))
        # Save to checkpoint
        checkpointer.save(global_step)
        if it % eval_interval == 0:
            reward = compute_total_reward(eval_env, ppo_agent.policy)
            print('Step = {0}: Average reward = {1}'.format(step, reward))
            rewards.append([reward])
            # Save policy
            policy_saver.save(os.path.relpath('ppo_policy'))
            # View a video of the robot
            video_filename = 'ppo_minitaur_{0}.mp4'.format(video_num)
            print('Creating video...')
            writer = imageio.get_writer(video_filename, fps=30)
            ts = eval_env.reset()
            writer.append_data(eval_py_env.render())
            while not ts.is_last():
                ts = eval_env.step(ppo_agent.policy.action(ts).action)
                writer.append_data(eval_py_env.render())
            writer.close()
            # Show the video
            os.startfile(video_filename)
            # Increment counter
            video_num += 1

    # View the average reward over training time
    steps = range(0, num_iter + 1, eval_interval)
    plt.plot(steps, rewards)
    plt.ylabel('Average reward')
    plt.xlabel('Step')
    plt.show()
Example #19
        if cumulative_done:
            self._episode_ended = True
            return ts.termination(self._state, reward)
        else:
            return ts.transition(self._state, reward, discount=0.98)


from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(YoushiEnv())  # TFPyEnvironment expects an environment instance, not the class
#tf_env = YoushiEnv()

#tf_agent = tf.saved_model.load(saved_models_path)
q_net = tf.saved_model.load("MyPolicyHard")

time_step = tf_env.reset()
display = DisplayIA.Display()

lost = False
score = 0

print(type(q_net))

while not time_step.is_last():
    display.refresh(time_step.observation)
    action = q_net.action(time_step)
    time_step = tf_env.step(action.action)  # policy.action() returns a PolicyStep; step with its action tensor
    score += 1

print(score)
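The script above loads a policy exported with tf.saved_model. A hedged sketch of how a directory such as "MyPolicyHard" is typically produced with TF-Agents' PolicySaver, assuming `agent` is an already-trained agent from the same project:

from tf_agents.policies import policy_saver

saver = policy_saver.PolicySaver(agent.policy)  # `agent` is assumed to exist
saver.save('MyPolicyHard')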
Example #20
    def reset_and_fire_on_life_lost(trajectory):
        global prev_lives
        lives = tf_env.pyenv.envs[0].ale.lives()
        if prev_lives != lives:
            tf_env.reset()
            tf_env.step(1)
            prev_lives = lives

    watch_driver = DynamicStepDriver(tf_env,
                                     saved_policy,
                                     observers=[
                                         save_frames,
                                         reset_and_fire_on_life_lost,
                                         ShowProgress(1000)
                                     ],
                                     num_steps=1000)

    tf_env.reset()  # reset the env
    time_step = tf_env.step(1)  # fire the ball to begin playing
    policy_state = saved_policy.get_initial_state()  # empty state ()
    final_time_step, final_policy_state = watch_driver.run(
        time_step, policy_state)

    # render a window that shows the agent playing (works in a Jupyter notebook)
    renderingUtils = RenderingUtils(frames)

    renderingUtils.plot_animation()

    renderingUtils.generate_gif("breakout.gif")

    renderingUtils.create_policy_eval_video(env, saved_policy, "trained-agent")
Example #21
    # Main training loop
    time_step, policy_state = None, None
    for it in range(N_ITERATIONS):
        if COLLECT_RANDOM:
            print('Running random driver...')
            time_step, policy_state = random_driver.run(time_step, policy_state)
        print('Running agent driver...')
        time_step, policy_state = driver.run(time_step, policy_state)
        print('Training...')
        for train_it in range(BUFFER_LENGTH//BATCH_SIZE):
            experience, _ = replay_buffer.get_next(sample_batch_size=BATCH_SIZE, num_steps=2)
            agent.train(experience)
            if (train_it + 1) % 100 == 0:
                print('{0} training iterations'.format(train_it + 1))
        print('Saving...')
        # Save to checkpoint
        checkpointer.save(global_step)
        # Save policy
        policy_saver.save(os.path.relpath('policy'))
        # Show total reward of actual policy for 1 episode
        total_reward = 0.0
        eval_ts = eval_env.reset()
        num_steps = 0
        while (not eval_ts.is_last()) and num_steps < EVAL_MAX_STEPS:
            action_step = agent.policy.action(eval_ts)
            eval_ts = eval_env.step(action_step.action)
            total_reward += eval_ts.reward
            num_steps += 1
        print('Iteration = {0}: Steps taken: = {1} of {2}: Total reward = {3}'.format(it, num_steps,
                                                                                      EVAL_MAX_STEPS, total_reward))