def train_dyke_agent(train_env: TFPyEnvironment, eval_env: TFPyEnvironment,
                     agent: DqnAgent, train_steps: int, steps_per_episode: int,
                     eval_episodes: int) -> Dict[str, Any]:
    """
	Trains the DQN agent on the dyke maintenance task.

	:param train_env: The training environment.
	:param eval_env: The environment for testing agent performance.
	:param agent: The agent.
	:param train_steps: The number of training steps to use.
	:param steps_per_episode: The number of time steps that can be taken in a single dyke environment episode.
	:param eval_episodes: The number of episodes to use per evaluation.
	:return: A mapping to various metrics pertaining to the training's results.
	"""
    losses: np.ndarray = np.zeros(shape=(train_steps, steps_per_episode))
    evaluations: np.ndarray = np.zeros(shape=(train_steps, eval_episodes))
    train_metrics: Tuple = (AverageReturnMetric, )
    train_metric_results: np.ndarray = np.zeros(shape=(len(train_metrics),
                                                       train_steps,
                                                       steps_per_episode))
    for step in range(train_steps):
        # we uniformly sample experiences (single time steps) from one episode per train step
        print('STEP %d/%d' % (step + 1, train_steps))
        train_env.reset()
        rep_buf = _dyke_replay_buffer(train_env, agent, steps_per_episode)
        train_metric_inst: Tuple = tuple(
            [metric() for metric in train_metrics])  # instantiate the metrics
        obs: Tuple = (rep_buf.add_batch, ) + train_metric_inst
        _ = DynamicStepDriver(
            env=train_env,
            policy=agent.collect_policy,
            observers=obs,
            num_steps=steps_per_episode
        ).run(
        )  # experience a single episode using the agent's current configuration
        dataset: tf.data.Dataset = rep_buf.as_dataset(
            sample_batch_size=_REP_BUF_BATCH_SIZE,
            num_steps=_REP_BUF_NUM_STEPS)
        iterator = iter(dataset)
        for tr in range(steps_per_episode):
            trajectories, _ = next(iterator)
            losses[step, tr] = agent.train(experience=trajectories).loss
            for met in range(len(train_metrics)):
                train_metric_results[
                    met, step, tr] = train_metric_inst[met].result().numpy()
        evaluations[step, :] = _evaluate_dyke_agent(eval_env, agent,
                                                    eval_episodes)
    return {
        'loss': losses,
        'eval': evaluations,
        'train-metrics': train_metric_results
    }
Example #2
def save_environment_agent_video(
    filename: str,
    agent: tf_agent.TFAgent,
    tf_env: TFPyEnvironment,
    py_env: TimeLimit,
    num_episodes: int = 1,
) -> None:
    """
    Save a video of an agent acting in the environment. Render method needs to be available in the
    python version of the environment.
    TODO:
    - how to prevent opening a window when saving a video?
    - sometimes nothing is saved?
    - gym wrappers monitoring VideoRecorder

    :param filename: A valid path to which a file with the video will be saved.
    :param agent: An agent whose policy will be evaluated.
    :param tf_env: A TensorFlow environment used for interaction with the agent.
    :param py_env: A Python OpenAI Gym environment used for rendering the video. Environment has
        to provide `render` method.
    :param num_episodes: A number of episodes to evaluate.

    :return: A video is saved to filename.
    """
    with imageio.get_writer(filename, fps=60) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            video.append_data(py_env.render())
            while not time_step.is_last():
                action_step = agent.policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(py_env.render())
    py_env.close()
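# A hedged usage sketch for save_environment_agent_video: `MyRenderableEnv` is a
# placeholder for any Gym environment whose render() returns an RGB array, and
# `agent` is assumed to be an already-trained TF-Agents agent.
from gym.wrappers import TimeLimit
from tf_agents.environments.gym_wrapper import GymWrapper
from tf_agents.environments.tf_py_environment import TFPyEnvironment

py_env = TimeLimit(MyRenderableEnv(), max_episode_steps=200)
tf_env = TFPyEnvironment(GymWrapper(py_env))
save_environment_agent_video("agent.mp4", agent, tf_env, py_env, num_episodes=2)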
def compute_total_reward(env: TFPyEnvironment, policy):
    total_reward = 0.0
    time_step = env.reset()
    while not time_step.is_last():
        policy_step = policy.action(time_step)
        time_step = env.step(policy_step.action)
        total_reward += time_step.reward
    return total_reward.numpy()[0]
Example #4
def step_episode(
    environment: TFPyEnvironment, policy: tf_policy.TFPolicy, replay_buffer: ReplayBuffer
) -> typing.Tuple[int, int]:
    done = False
    environment.reset()

    curr_episode_rewards = []
    episode_reward = 0
    episode_length = 0

    while not done:
        reward, done = step(environment, policy, replay_buffer)
        curr_episode_rewards.append(reward)
        episode_length += 1

        if done:
            episode_reward = sum(curr_episode_rewards)

    return episode_reward, episode_length
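# The `step` helper called by `step_episode` is not included in this example. A minimal
# sketch consistent with its call signature (advance the environment by one action,
# store the transition, and report the scalar reward plus whether the episode ended)
# might look like this, assuming a batch size of 1:
from tf_agents.trajectories import trajectory as trajectory_lib


def step(environment, policy, replay_buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory_lib.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    reward = float(next_time_step.reward.numpy()[0])
    done = bool(next_time_step.is_last().numpy()[0])
    return reward, done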
def _evaluate_dyke_agent(env: TFPyEnvironment,
                         agent: DqnAgent,
                         num_episodes: int = 10) -> np.ndarray:
    returns: np.ndarray = np.zeros(shape=(num_episodes, ))
    for ep in range(num_episodes):
        time_step: TimeStep = env.reset()
        episode_return: float = 0.0
        while not time_step.is_last():
            action_step = agent.policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        returns[ep] = episode_return
    return returns
Example #6
    def test_ppo(self):
        env_class = PolicyUnittestEnv
        learning_rate = 1e-1
        iterations = 20
        batch_size = 100
        steps_per_episode = 13
        env = env_class(batch_size, steps_per_episode)
        env = TFPyEnvironment(env)

        eval_env = env_class(batch_size, steps_per_episode)
        eval_env = TFPyEnvironment(eval_env)

        algorithm = create_algorithm(env, learning_rate=learning_rate)
        driver = SyncOffPolicyDriver(env,
                                     algorithm,
                                     debug_summaries=DEBUGGING,
                                     summarize_grads_and_vars=DEBUGGING)
        replayer = driver.exp_replayer
        eval_driver = OnPolicyDriver(eval_env,
                                     algorithm,
                                     training=False,
                                     greedy_predict=True)

        env.reset()
        eval_env.reset()
        time_step = driver.get_initial_time_step()
        policy_state = driver.get_initial_policy_state()
        for i in range(iterations):
            time_step, policy_state = driver.run(max_num_steps=batch_size *
                                                 steps_per_episode,
                                                 time_step=time_step,
                                                 policy_state=policy_state)

            experience = replayer.replay_all()
            driver.train(experience, num_updates=4, mini_batch_size=25)
            replayer.clear()
            eval_env.reset()
            eval_time_step, _ = eval_driver.run(
                max_num_steps=(steps_per_episode - 1) * batch_size)
            logging.info("%d reward=%f", i,
                         float(tf.reduce_mean(eval_time_step.reward)))

        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("reward=%f", float(tf.reduce_mean(eval_time_step.reward)))
        self.assertAlmostEqual(1.0,
                               float(tf.reduce_mean(eval_time_step.reward)),
                               delta=1e-1)
Example #7
def compute_avg_return(env: tf_py_environment.TFPyEnvironment, policy,
                       num_episodes):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]
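# Hedged usage sketch: `eval_env` is assumed to be a TFPyEnvironment and `agent` an
# initialized TF-Agents agent; ten episodes is an arbitrary choice.
avg_return = compute_avg_return(eval_env, agent.policy, num_episodes=10)
print("Average return over 10 episodes:", avg_return)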
Example #8
def load_model_checkpoint(c):
    """Return the model restored from the latest checkpoint in c.model_dir."""
    dir_name = tf.train.latest_checkpoint(c.model_dir)

    # A dummy environment is built purely to obtain an observation with which
    # to create the network's variables before restoring the checkpoint.
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    time_step = dummy_env.reset()

    temp = ValueNet(**c.model_vars)
    temp(time_step.observation)  # initialize the model's variables
    checkpoint2 = tf.train.Checkpoint(module=temp)
    checkpoint2.restore(dir_name)
    return temp, checkpoint2
Example #9
def create_video(py_environment: PyEnvironment,
                 tf_environment: TFPyEnvironment,
                 policy: tf_policy,
                 num_episodes=10,
                 video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            print("Generating episode %d of %d" % (episode, num_episodes))

            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)

                time_step = tf_environment.step(action_step.action)
                video.append_data(py_environment.render())
def compute_average_reward(env: tf_py_environment.TFPyEnvironment,
                           policy: tf_policy.Base,
                           num_episodes=10) -> float:
    total_reward = 0
    for _ in range(num_episodes):
        time_step: ts.TimeStep = env.reset()
        episode_reward = 0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_reward += time_step.reward
            # print(action_step.action.numpy()[0], end=' ')
            print(time_step.observation.numpy())

        total_reward += episode_reward

    return total_reward / num_episodes
Example #11
def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
def compute_average_return(env: tf_py_environment.TFPyEnvironment,
                           policy,
                           num_episodes: int = 1) -> float:
    total_return = 0.0

    for _ in range(num_episodes):
        time_step_ = env.reset()
        episode_return = 0.0

        while not any(time_step_.is_last()):
            action_step = policy.action(time_step_)
            time_step_ = env.step(action=action_step.action)
            episode_return += np.mean(time_step_.reward)

        total_return += episode_return

    average_return = total_return / num_episodes

    return average_return
Example #13
def create_video(py_environment: PyEnvironment,
                 tf_environment: TFPyEnvironment,
                 policy: tf_policy,
                 num_episodes=10,
                 video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            episode_return = 0.0
            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_environment.step(action_step.action)
                episode_return += time_step.reward
                video.append_data(py_environment.render())
            print(
                f"Generated episode {episode + 1} of {num_episodes}. Return: {episode_return}"
            )
Example #14
def evaluate_episode(policy, env_params):
    """Use naive while loop to evaluate policy in single episode."""
    if 'n_monsters' in env_params:
        env = MultiMonsterEnvironment
    elif 'is_jumping' in env_params:
        env = JumpingEnvironment
    else:
        env = LakeMonsterEnvironment
    py_env = env(**env_params)
    tf_env = TFPyEnvironment(py_env)
    ts = tf_env.reset()
    n_steps = 0
    while not ts.is_last():
        action = policy.action(ts)
        ts = tf_env.step(action.action)
        n_steps += 1

    reward = ts.reward.numpy().item()
    return reward, n_steps * py_env.step_size
Example #15
    def create_policy_eval_video(self,
                                 env,
                                 policy,
                                 filename,
                                 num_episodes=5,
                                 fps=30):
        filename = filename + ".mp4"
        tf_env = TFPyEnvironment(env)
        with imageio.get_writer(filename, fps=fps) as video:
            for _ in range(num_episodes):
                time_step = tf_env.reset()
                tf_env.step(1)  # initial action (e.g. FIRE in Breakout-style games)
                video.append_data(env.render())
                while not time_step.is_last():
                    action_step = policy.action(time_step)
                    time_step = tf_env.step(action_step.action)
                    video.append_data(env.render())
        return self.embed_mp4(filename)
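    # `self.embed_mp4` is not shown in this snippet. A common implementation, along the
    # lines of the TF-Agents tutorials, embeds the saved mp4 into a Jupyter notebook as
    # an HTML video tag:
    def embed_mp4(self, filename):
        """Embed an mp4 file in a Jupyter notebook (hypothetical helper)."""
        import base64
        import IPython

        video = open(filename, 'rb').read()
        b64 = base64.b64encode(video)
        tag = '''
        <video width="640" height="480" controls>
          <source src="data:video/mp4;base64,{0}" type="video/mp4">
        Your browser does not support the video tag.
        </video>'''.format(b64.decode())
        return IPython.display.HTML(tag)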
Example #16
def create_many_policy_gif(uid, file_path, monster_speed=4.0):
    """Create a gif superimposing the actions of many policies."""
    n_steps = 300  # = timeout_factor / step_size
    step_size = 0.01
    fps = 10
    p_paths = glob.glob(configs.POLICY_DIR + uid + '*')

    all_positions = []
    colors = []
    for p_path in tqdm(p_paths):
        color = (np.random.randint(256), np.random.randint(128), 0)
        policy = tf.saved_model.load(p_path)
        env_params = policy.get_metadata()
        env_params = tf_to_py(env_params)

        # overwriting parameters
        env_params['step_size'] = step_size
        env_params['monster_speed'] = monster_speed
        py_env = LakeMonsterEnvironment(**env_params)
        tf_env = TFPyEnvironment(py_env)

        time_step = tf_env.reset()
        agent_positions = {}
        for step in range(n_steps):
            if not time_step.is_last():
                action = policy.action(time_step)
                time_step = tf_env.step(action.action)
            theta = py_env.total_monster_rotation - py_env.total_agent_rotation
            c, s = np.cos(theta), np.sin(theta)
            rot_matrix = np.array(((c, -s), (s, c)))
            agent_positions[step] = np.dot(rot_matrix, np.array((py_env.r, 0)))
        all_positions.append(agent_positions)
        colors.append(color)

    with imageio.get_writer(file_path, mode='I', fps=fps) as gif:
        for step in range(n_steps):
            positions = [item[step] for item in all_positions]
            im = render_many_agents(positions, colors, step, step_size, 4,
                                    monster_speed)
            gif.append_data(np.array(im))
    pygifsicle.optimize(file_path)
Example #17
def episode_as_video(py_env, policy, filepath, fps=10):
    """Create mp4 video through py_environment render method."""

    tf_env = TFPyEnvironment(py_env)
    with imageio.get_writer('tmp.mp4', fps=fps) as video:
        time_step = tf_env.reset()
        video.append_data(py_env.render())
        while not time_step.is_last():
            action = policy.action(time_step).action
            time_step = tf_env.step(action)
            video.append_data(py_env.render())
        for _ in range(3 * fps):  # play for 3 more seconds
            video.append_data(py_env.render())

    # giving video file a more descriptive name
    _, result = py_env.determine_reward()

    assert filepath.split('.')[1] == 'mp4'
    split = filepath.split('.')
    split[0] += '-' + result
    filepath = '.'.join(split)
    os.rename('tmp.mp4', filepath)
Example #18
def compute_mean_reward(environment: TFPyEnvironment,
                        policy: tf_policy.Base,
                        num_episodes=10) -> float:
    """
    Evaluate mean reward over `num_episodes`
    Implementation is taken from Tensorflow official documentation tutorial:
    https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial#metrics_and_evaluation
    """
    total_reward = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_reward = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_reward += time_step.reward

        total_reward += episode_reward

    avg_rewards = total_reward / num_episodes
    return avg_rewards.numpy()[0]
Example #19
def episode_as_gif(py_env, policy, save_path, fps=10, show_path=True):
    """Create gif through py_environment render method."""

    tf_env = TFPyEnvironment(py_env)
    path = []
    with imageio.get_writer(save_path, mode='I', fps=fps) as gif:
        time_step = tf_env.reset()
        # using the policy_state to deal with scripted_policy possibility
        policy_state = policy.get_initial_state(batch_size=1)
        gif.append_data(py_env.render())

        while not time_step.is_last():
            action = policy.action(time_step, policy_state)
            time_step = tf_env.step(action.action)
            im, real_position = py_env.render('return_real')
            path.append(real_position)
            if show_path:
                im = render_agent_path(im, path)
            policy_state = action.state
            gif.append_data(np.array(im))

        for _ in range(fps):  # play for 1 more second
            gif.append_data(py_env.render())
    pygifsicle.optimize(save_path)
Example #20
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(lr=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
Example #21
    def test_off_policy_algorithm(self, algorithm_ctor, use_rollout_state,
                                  sync_driver):
        logging.info("{} {}".format(algorithm_ctor.__name__, sync_driver))

        batch_size = 128
        if use_rollout_state:
            steps_per_episode = 5
            mini_batch_length = 8
            unroll_length = 8
            env_class = RNNPolicyUnittestEnv
        else:
            steps_per_episode = 12
            mini_batch_length = 2
            unroll_length = 12
            env_class = PolicyUnittestEnv
        env = TFPyEnvironment(
            env_class(
                batch_size,
                steps_per_episode,
                action_type=ActionType.Continuous))

        eval_env = TFPyEnvironment(
            env_class(
                batch_size,
                steps_per_episode,
                action_type=ActionType.Continuous))

        common.set_global_env(env)
        algorithm = algorithm_ctor()
        algorithm.set_summary_settings(summarize_grads_and_vars=True)
        algorithm.use_rollout_state = use_rollout_state

        if sync_driver:
            driver = SyncOffPolicyDriver(env, algorithm)
        else:
            driver = AsyncOffPolicyDriver([env],
                                          algorithm,
                                          num_actor_queues=1,
                                          unroll_length=unroll_length,
                                          learn_queue_cap=1,
                                          actor_queue_cap=1)
        eval_driver = OnPolicyDriver(eval_env, algorithm, training=False)

        eval_env.reset()
        driver.start()
        if sync_driver:
            time_step = driver.get_initial_time_step()
            policy_state = driver.get_initial_policy_state()
            for i in range(5):
                time_step, policy_state = driver.run(
                    max_num_steps=batch_size * steps_per_episode,
                    time_step=time_step,
                    policy_state=policy_state)

        for i in range(500):
            if sync_driver:
                time_step, policy_state = driver.run(
                    max_num_steps=batch_size * mini_batch_length * 2,
                    time_step=time_step,
                    policy_state=policy_state)
                whole_replay_buffer_training = False
                clear_replay_buffer = False
            else:
                driver.run_async()
                whole_replay_buffer_training = True
                clear_replay_buffer = True

            driver.algorithm.train(
                mini_batch_size=128,
                mini_batch_length=mini_batch_length,
                whole_replay_buffer_training=whole_replay_buffer_training,
                clear_replay_buffer=clear_replay_buffer)
            eval_env.reset()
            eval_time_step, _ = eval_driver.run(
                max_num_steps=(steps_per_episode - 1) * batch_size)
            logging.log_every_n_seconds(
                logging.INFO,
                "%d reward=%f" %
                (i, float(tf.reduce_mean(eval_time_step.reward))),
                n_seconds=1)
        driver.stop()

        self.assertAlmostEqual(
            1.0, float(tf.reduce_mean(eval_time_step.reward)), delta=2e-1)
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        model will not be saved.
    :param tf_log_stream_path: Path to a file to which tf.print calls are written during
        evaluation. If None, tf.print statements print to sys.stdout.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the check point at
        # different training steps. We save the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path, max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    # Default the policy-metric lists so the zip over them below is safe.
    if policy_metrics is None:
        policy_metrics = []
    if policy_summary_writers is None:
        policy_summary_writers = []
    # Set up a minimal training loop to simply test training mechanics work.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train the
            # agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()
            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )
            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )
        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)
        if i % (num_iters // 100) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")
    # Only save a final checkpoint if the saving infrastructure was set up.
    if save_model:
        checkpoint_manager.save(num_iters)
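# A minimal, hedged usage sketch for train_agent: a REINFORCE agent on CartPole, with
# illustrative network sizes, learning rate, and iteration counts. Only the required
# arguments are supplied; metrics, evaluation, and model saving are left at defaults.
import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

env = TFPyEnvironment(suite_gym.load("CartPole-v0"))
actor_net = ActorDistributionNetwork(env.observation_spec(),
                                     env.action_spec(),
                                     fc_layer_params=(64,))
global_step = tf.compat.v1.train.get_or_create_global_step()
agent = reinforce_agent.ReinforceAgent(env.time_step_spec(),
                                       env.action_spec(),
                                       actor_network=actor_net,
                                       optimizer=tf.keras.optimizers.Adam(1e-3),
                                       train_step_counter=global_step)
agent.initialize()

replay_buffer = TFUniformReplayBuffer(agent.collect_data_spec,
                                      batch_size=env.batch_size,
                                      max_length=10000)
driver = DynamicEpisodeDriver(env,
                              agent.collect_policy,
                              observers=[replay_buffer.add_batch],
                              num_episodes=2)

train_agent(env=env,
            agent=agent,
            data_collection_driver=driver,
            replay_buffer=replay_buffer,
            num_iters=100,
            global_step=global_step)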
Example #23
    def reset_and_fire_on_life_lost(trajectory):
        global prev_lives
        lives = tf_env.pyenv.envs[0].ale.lives()
        if prev_lives != lives:
            tf_env.reset()
            tf_env.step(1)
            prev_lives = lives

    watch_driver = DynamicStepDriver(tf_env,
                                     saved_policy,
                                     observers=[
                                         save_frames,
                                         reset_and_fire_on_life_lost,
                                         ShowProgress(1000)
                                     ],
                                     num_steps=1000)

    tf_env.reset()  # reset the env
    time_step = tf_env.step(1)  # fire the ball to begin playing
    policy_state = saved_policy.get_initial_state()  # empty state ()
    final_time_step, final_policy_state = watch_driver.run(
        time_step, policy_state)

    # render a window that shows the agent playing (works in a Jupyter notebook)
    renderingUtils = RenderingUtils(frames)

    renderingUtils.plot_animation()

    renderingUtils.generate_gif("breakout.gif")

    renderingUtils.create_policy_eval_video(env, saved_policy, "trained-agent")
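    # The `frames` list and `save_frames` observer referenced above are created
    # elsewhere in the original program. A plausible sketch is an observer that
    # renders the underlying Python environment after every driver step:
    frames = []

    def save_frames(trajectory):
        frames.append(tf_env.pyenv.envs[0].render(mode="rgb_array"))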
Example #24
    # Main training loop
    time_step, policy_state = None, None
    for it in range(N_ITERATIONS):
        if COLLECT_RANDOM:
            print('Running random driver...')
            time_step, policy_state = random_driver.run(time_step, policy_state)
        print('Running agent driver...')
        time_step, policy_state = driver.run(time_step, policy_state)
        print('Training...')
        for train_it in range(BUFFER_LENGTH//BATCH_SIZE):
            experience, _ = replay_buffer.get_next(sample_batch_size=BATCH_SIZE, num_steps=2)
            agent.train(experience)
            if (train_it + 1) % 100 == 0:
                print('{0} training iterations'.format(train_it + 1))
        print('Saving...')
        # Save to checkpoint
        checkpointer.save(global_step)
        # Save policy
        policy_saver.save(os.path.relpath('policy'))
        # Show total reward of actual policy for 1 episode
        total_reward = 0.0
        eval_ts = eval_env.reset()
        num_steps = 0
        while (not eval_ts.is_last()) and num_steps < EVAL_MAX_STEPS:
            action_step = agent.policy.action(eval_ts)
            eval_ts = eval_env.step(action_step.action)
            total_reward += eval_ts.reward
            num_steps += 1
        print('Iteration = {0}: Steps taken: = {1} of {2}: Total reward = {3}'.format(it, num_steps,
                                                                                      EVAL_MAX_STEPS, total_reward))
Example #25
def test_all_mepo_variants_work(transition_model, trajectory_sampler,
                                model_free_agent_type):
    """
    Mepo Agent has prespecified transition model, trajectory sampler and model-free agent
    types. Here we check that all combinations execute without errors.
    """

    # setup the environment and a prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define agent, many transition model and trajectory optimiser parameters can
    # be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(
        tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
# 8. Evaluating the agent.
def evaluate(env, policy, num_episodes):
    total_return = 0.0

    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward

        total_return += episode_return

    average_return = total_return / num_episodes
    return average_return.numpy()[0]


# Resetting the train step.
agent.train_step_counter.assign(0)

# Resetting eval environment.
eval_env.reset()

# Evaluate the agent's policy once before training.
num_of_episodes = 1
avg_return = evaluate(eval_env, agent.policy, num_of_episodes)

print('\nAverage return in', num_of_episodes, 'episodes =', avg_return)

carla_environment.close()
Example #27
        if cumulative_done:
            self._episode_ended = True
            return ts.termination(self._state, reward)
        else:
            return ts.transition(self._state, reward, discount=0.98)


from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(YoushiEnv())  # wrap an instance of the environment
#tf_env = YoushiEnv()

#tf_agent = tf.saved_model.load(saved_models_path)
q_net = tf.saved_model.load("MyPolicyHard")

time_step = tf_env.reset()
display = DisplayIA.Display()

lost = False
score = 0

print(type(q_net))

while not time_step.is_last():
    display.refresh(time_step.observation)
    action_step = q_net.action(time_step)
    time_step = tf_env.step(action_step.action)
    score += 1

print(score)
Example #28
    def test_off_policy_algorithm(self, algorithm_ctor, use_rollout_state,
                                  sync_driver):
        logging.info("{} {}".format(algorithm_ctor.__name__, sync_driver))

        batch_size = 128
        if use_rollout_state:
            steps_per_episode = 5
            mini_batch_length = 8
            unroll_length = 8
            env_class = RNNPolicyUnittestEnv
        else:
            steps_per_episode = 12
            mini_batch_length = 2
            unroll_length = 12
            env_class = PolicyUnittestEnv
        env = TFPyEnvironment(
            env_class(batch_size,
                      steps_per_episode,
                      action_type=ActionType.Continuous))

        eval_env = TFPyEnvironment(
            env_class(batch_size,
                      steps_per_episode,
                      action_type=ActionType.Continuous))

        algorithm = algorithm_ctor(env)
        algorithm.use_rollout_state = use_rollout_state

        if sync_driver:
            driver = SyncOffPolicyDriver(env,
                                         algorithm,
                                         use_rollout_state=use_rollout_state,
                                         debug_summaries=True,
                                         summarize_grads_and_vars=True)
        else:
            driver = AsyncOffPolicyDriver(
                [env],
                algorithm,
                use_rollout_state=algorithm.use_rollout_state,
                num_actor_queues=1,
                unroll_length=unroll_length,
                learn_queue_cap=1,
                actor_queue_cap=1,
                debug_summaries=True,
                summarize_grads_and_vars=True)
        replayer = driver.exp_replayer
        eval_driver = OnPolicyDriver(eval_env,
                                     algorithm,
                                     training=False,
                                     greedy_predict=True)

        eval_env.reset()
        driver.start()
        if sync_driver:
            time_step = driver.get_initial_time_step()
            policy_state = driver.get_initial_policy_state()
            for i in range(5):
                time_step, policy_state = driver.run(max_num_steps=batch_size *
                                                     steps_per_episode,
                                                     time_step=time_step,
                                                     policy_state=policy_state)

        for i in range(500):
            if sync_driver:
                time_step, policy_state = driver.run(max_num_steps=batch_size *
                                                     mini_batch_length * 2,
                                                     time_step=time_step,
                                                     policy_state=policy_state)
                experience, _ = replayer.replay(
                    sample_batch_size=128, mini_batch_length=mini_batch_length)
            else:
                driver.run_async()
                experience = replayer.replay_all()

            driver.train(experience,
                         mini_batch_size=128,
                         mini_batch_length=mini_batch_length)
            eval_env.reset()
            eval_time_step, _ = eval_driver.run(
                max_num_steps=(steps_per_episode - 1) * batch_size)
            logging.info("%d reward=%f", i,
                         float(tf.reduce_mean(eval_time_step.reward)))
        driver.stop()

        self.assertAlmostEqual(1.0,
                               float(tf.reduce_mean(eval_time_step.reward)),
                               delta=2e-1)
def evaluate_policy(metrics: List[Any],
                    environment: TFPyEnvironment,
                    policy: tf_agents.policies.tf_policy.Base,
                    per_step_metrics: Optional[List[tf.Module]] = None,
                    num_episodes: int = 1,
                    train_step: Optional[Any] = None,
                    summary_writer: Optional[tf.summary.SummaryWriter] = None,
                    summary_prefix: str = "Eval",
                    logging: bool = False,
                    tf_log_stream_path: Optional[str] = None) -> None:
    """
    Track performance (via metrics) using policy in the environment provided.
    Prints a dictionary of results {metric_name: metric_value}.

    *NOTE*: Because placeholders are not compatible with Eager mode this is not compatible with
    python policies.

    This function is adapted from tf_agents.eval.metric_utils.eager_compute to allow for per time
    step logging.

    :param metrics: List of metrics to compute.
    :param environment: tf_environment instance.
    :param policy: tf_policy instance used to step the environment.
    :param per_step_metrics: List of metrics to be passed as observers to run every time step during
        evaluation.
    :param num_episodes: Number of episodes to compute the metrics over.
    :param train_step: An optional step to write summaries against.
    :param summary_writer: An optional writer for generating metric summaries.
    :param summary_prefix: An optional prefix scope for metric summaries.
    :param logging: Option to enable logging to the console of standard metrics.
    :param tf_log_stream_path: Path to a file which tf.print calls are set to write to. If none
        tf.print statements print to sys.stdout.
    """
    # Default to no per-step metrics so the list concatenation below is safe.
    if per_step_metrics is None:
        per_step_metrics = []
    # Reset the state of all metrics (e.g. running totals for averages).
    for metric in metrics + per_step_metrics:
        metric.reset()

    # Attain the initial state of the environment and policy.
    time_step = environment.reset()
    policy_state = policy.get_initial_state(environment.batch_size)

    # Set up a driver to run the evaluation episodes while logging the desired metrics.
    driver = DynamicEpisodeDriver(
        environment,
        policy,
        observers=metrics,
        transition_observers=per_step_metrics,
        num_episodes=num_episodes)

    # Run the driver; this plays out the evaluation episodes and updates the metrics.
    driver.run(time_step, policy_state)

    # If we have the required prerequisites then perform the TensorBoard logging as well as logging
    # results to the console.
    if train_step and summary_writer:
        # Utilise a (possibly) different summary writer to put the evaluation metrics to
        # TensorBoard.
        with summary_writer.as_default():
            for m in metrics:
                # Attain the full name of the metric to record.
                tag = "/".join([summary_prefix, m.name])
                # Simply calculating and forming the scalar summary in the current context with a
                # default summary writer does the logging to TensorBoard for us.
                tf.summary.scalar(name=tag, data=m.result(), step=train_step)
    # If requested to then log metrics to the console.
    if logging and train_step:
        for m in metrics:
            tf.print(f"Evaluation at step {train_step.numpy()}: {m.name}\t{m.result()}",
                     output_stream=f'file://{tf_log_stream_path}' if tf_log_stream_path else
                     sys.stdout)
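# A hedged usage sketch for evaluate_policy: `eval_env` is assumed to be a
# TFPyEnvironment and `agent` an already-initialized TF-Agents agent; the metric
# choices and episode count below are illustrative.
from tf_agents.metrics import tf_metrics

eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=10),
    tf_metrics.AverageEpisodeLengthMetric(buffer_size=10),
]

evaluate_policy(eval_metrics,
                eval_env,
                agent.policy,
                num_episodes=10,
                train_step=agent.train_step_counter,
                logging=True)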