Example no. 1
class ExperienceReplay(object):
    def __init__(self, agent, environment, batch_size):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=100000)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        self._fill_buffer(environment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2,
            single_deterministic_pass=False).prefetch(3)

        self.iterator = iter(self.dataset)

    def _fill_buffer(self, environment, policy, steps):
        for _ in range(steps):
            self.timestamp_data(environment, policy)

    def timestamp_data(self, environment, policy):
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(
            time_step, action_step, next_time_step)

        self._replay_buffer.add_batch(timestamp_trajectory)
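A minimal usage sketch, not part of the original example, showing how the iterator built above could drive a training loop. It assumes a standard TF-Agents DQN setup on CartPole and that the excerpt's own imports (TFUniformReplayBuffer, RandomTFPolicy, trajectory) are in scope; every name below is illustrative.

import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import q_network
from tf_agents.utils import common

# Hypothetical environment and agent; any agent/environment pair with
# matching specs would do.
env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
q_net = q_network.QNetwork(env.observation_spec(), env.action_spec(),
                           fc_layer_params=(64,))
agent = dqn_agent.DqnAgent(
    env.time_step_spec(), env.action_spec(), q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss)
agent.initialize()

replay = ExperienceReplay(agent, env, batch_size=64)  # class defined above
for _ in range(10):
    # Keep adding fresh experience, then sample a batch of two-step
    # trajectories from the buffer's dataset iterator and train on it.
    replay.timestamp_data(env, agent.collect_policy)
    experience, _ = next(replay.iterator)
    loss = agent.train(experience).loss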
Example no. 2
def test_reinforce_agent_learning(env_name):
    """
    Extension of the test for an agent playing in the environment to include training.
    Note: This does not test that training improves the policy. It simply tests that the training
    loop runs effectively.
    """
    # Set up environment using default parameters.
    # Environment parameters do not affect the test result here.
    tf_env, _ = rl_env_from_snc_env(load_scenario(
        env_name,
        job_gen_seed=10,
        override_env_params={'max_episode_length': 25})[1],
                                    discount_factor=0.99)

    # Set up a training step counter.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    # Instantiate a REINFORCE agent
    reinforce_agent = create_reinforce_agent(tf_env,
                                             training_step_counter=global_step)

    # Instantiate a replay buffer.
    replay_buffer = TFUniformReplayBuffer(
        data_spec=reinforce_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000)

    # Initialise the action network weights etc.
    reinforce_agent.initialize()

    # Use a driver to handle data collection for the agent. This handles a lot of the backend
    # TensorFlow set up and solves previous errors with episodes of differing lengths.
    collect_driver = DynamicEpisodeDriver(tf_env,
                                          reinforce_agent.collect_policy,
                                          observers=[replay_buffer.add_batch],
                                          num_episodes=2)

    # Get the initial states of the agent and environment before training.
    time_step = tf_env.reset()
    policy_state = reinforce_agent.collect_policy.get_initial_state(
        tf_env.batch_size)

    # Take a copy of the variables in order to ensure that training does lead to parameter changes.
    initial_vars = deepcopy(reinforce_agent.trainable_variables)
    assert len(initial_vars) > 0, "Agent has no trainable variables."

    # Set up a minimal training loop to simply test training mechanics work.
    for _ in range(5):
        # Collect experience.
        time_step, policy_state = collect_driver.run(time_step=time_step,
                                                     policy_state=policy_state)
        # Now the replay buffer should have data in it so we can collect the data and train the
        # agent.
        experience = replay_buffer.gather_all()
        reinforce_agent.train(experience)
        # Clear the replay buffer and return to play.
        replay_buffer.clear()

    # Check that training has had some effect
    for v1, v2 in zip(initial_vars, reinforce_agent.trainable_variables):
        assert not np.allclose(v1.numpy(), v2.numpy())
Example no. 3
    def __init__(self, experience_spec, batch_size):
        # TFUniformReplayBuffer does not support list in spec, we have to do
        # some conversion.
        self._experience_spec = experience_spec
        self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
        tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
        self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
        self._data_iter = None
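The conversion this comment refers to is shown in full in Example no. 25 further down; the short, hedged sketch below restates it in isolation using only the nest_utils calls that appear in these excerpts. The spec and experience names are assumptions.

# Hedged sketch of the list<->tuple conversion around TFUniformReplayBuffer;
# `experience_spec` and `exp` are assumed to be a spec nest and a matching
# batched experience nest from the surrounding framework.
def make_buffer(experience_spec, batch_size):
    # Lists inside the spec are converted to tuples before the buffer sees them.
    tuple_spec = nest_utils.nest_list_to_tuple(experience_spec)
    return TFUniformReplayBuffer(tuple_spec, batch_size)

def add(buffer, exp):
    # The experience is converted the same way its spec was converted.
    buffer.add_batch(nest_utils.nest_list_to_tuple(exp))

def read_all(buffer, experience_spec):
    # Restore the original list structure for downstream code.
    return nest_utils.nest_tuple_to_list(buffer.gather_all(), experience_spec)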
Example no. 4
    def collect_step(self, env: tf_py_environment.TFPyEnvironment,
                     policy: tf_policy.Base,
                     replay_buffer: TFUniformReplayBuffer):
        time_step = env.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = env.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)
        replay_buffer.add_batch(traj)
def main():

    env = suite_gym.load('Trajectory-v0', gym_kwargs={
        'num_dimensions': 2,
        'num_observables': 3,
        'max_targets': 100,
        'max_steps': 5000,
        'max_steps_without_target': 5000,
        'max_position': 100.0,
        'max_acceleration': 10.2,
        'max_velocity': 15.0,
        'collision_epsilon': 10.0
    })
    tf_env = tf_py_environment.TFPyEnvironment(env)

    agent = RandomAgent(tf_env.time_step_spec(), tf_env.action_spec())
    uniform_replay_buffer = TFUniformReplayBuffer(agent.collect_data_spec, batch_size=1)

    transitions = []

    driver = DynamicStepDriver(
        tf_env,
        policy=agent.policy,
        observers=[uniform_replay_buffer.add_batch],
        transition_observers=[transitions.append],
        num_steps=500
    )

    initial_time_step = tf_env.reset()
    final_time_step, final_policy_state = driver.run(initial_time_step)
    dataset = uniform_replay_buffer.as_dataset()

    input_state = []
    input_action = []
    output_state = []
    output_reward = []
    for transition in transitions:
        input_state.append(tf.concat(tf.nest.flatten(transition[0].observation), axis=-1))
        input_action.append(tf.concat(tf.nest.flatten(transition[1].action), axis=-1))
        output_state.append(tf.concat(tf.nest.flatten(transition[2].observation), axis=-1))
        output_reward.append(tf.concat(tf.nest.flatten(transition[2].reward), axis=-1))

    tf_input_state = tf.squeeze(tf.stack(input_state), axis=1)
    tf_input_action = tf.squeeze(tf.stack(input_action), axis=1)
    tf_output_state = tf.squeeze(tf.stack(output_state), axis=1)
    tf_output_reward = tf.stack(output_reward)
     
    # dataset = (features, labels)

    # (time_step_before, policy_step_action, time_step_after) = transitions[0]
    # observation = time_step_before.observation
    # action = policy_step_action.action
    # # (discount_, observation_, reward_, step_type_) = time_step_after
    # observation_ = time_step_after.observation

    pass
def main():

    env = suite_gym.load('Trajectory-v0',
                         gym_kwargs={
                             'num_dimensions': 2,
                             'num_observables': 15,
                             'max_targets': 100,
                             'max_steps': 5000,
                             'max_steps_without_target': 5000,
                             'max_position': 100.0,
                             'max_acceleration': 10.2,
                             'max_velocity': 15.0,
                             'collision_epsilon': 10.0
                         })
    tf_env = tf_py_environment.TFPyEnvironment(env)

    agent = RandomAgent(time_step_spec=tf_env.time_step_spec(),
                        action_spec=tf_env.action_spec())

    metric = AverageReturnMetric()
    replay_buffer = []
    # uniform_replay_buffer = PyUniformReplayBuffer(data_spec=agent.collect_data_spec, capacity=2000)
    uniform_replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec, batch_size=1)
    # observers = [replay_buffer.append, metric]

    # driver = PyDriver(
    #     env,
    #     policy=RandomPyPolicy(env.time_step_spec(), env.action_spec()),
    #     observers=[replay_buffer.append, metric],
    #     max_steps=2000
    # )

    # driver = TFDriver(
    #     tf_env,
    #     # policy=RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec()),
    #     policy=agent.policy,
    #     observers=[uniform_replay_buffer],
    #     max_steps=2000
    # )

    driver = DynamicStepDriver(
        tf_env,
        policy=agent.policy,
        observers=[uniform_replay_buffer.add_batch],  #, metric],
        # transition_observers=None,
        num_steps=1000)

    agent.initialize()
    initial_time_step = tf_env.reset()
    final_time_step, final_policy_state = driver.run(initial_time_step)

    dataset = uniform_replay_buffer.as_dataset()
Example no. 7
def collect_step(environment: TFEnvironment, policy: TFPyPolicy,
                 replay_buffer: TFUniformReplayBuffer):
    """
    Coleta uma iteração com o ambiente e devolve o resultado m
    :param environment:
    :param policy:
    :param replay_buffer:
    :return:
    """
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
Example no. 8
    def __init__(self, agent, environment):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=50000)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        self._fill_buffer(environment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3, sample_batch_size=BATCH_SIZE,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)
Example no. 9
    def __init__(self, **kwargs):
        self.batch_size = 1
        self.tf_agent = kwargs["tf_agent"]

        self.replay_buffer = TFUniformReplayBuffer(
            self.tf_agent.collect_data_spec,
            batch_size=self.batch_size,
            max_length=kwargs[MAX_REPLAY_BUFFER_LENGTH])
Example no. 10
    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            lr: Learning rate for the optimizer (default Adam).
            epsilon: Used for the default epsilon-greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Ratio of imbalance. Used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-Network (e.g. conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)
Example no. 11
    def __init__(self, agent, environment, batch_size):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=50000)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        self._fill_buffer(environment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2,
            single_deterministic_pass=False).prefetch(3)

        self.iterator = iter(self.dataset)
Example no. 12
def train(agent: DdqnAgent, train_env: TFEnvironment,
          replay_buffer: TFUniformReplayBuffer, num_episodes: int,
          replay_buffer_batch_size: int, save_path: str, randomize_step: int,
          validate_step: int):

    train_dataset = replay_buffer.as_dataset(
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        num_steps=REPLAY_BUFFER_NUM_STEPS,
        sample_batch_size=replay_buffer_batch_size)

    train_iterator = iter(train_dataset)

    # Savers
    checkpointer, saver = create_savers(save_path, agent, replay_buffer)
    policy_path = os.path.join(save_path, "saved", "policy")

    checkpointer.initialize_or_restore()

    global_step = 0
    episode = 0
    tf.print("Aguardando conexão do cliente")

    random_policy = create_random_policy(train_env)
    collect_episode_data(train_env,
                         random_policy,
                         replay_buffer,
                         repeats=3,
                         phase="Random")

    for episode in range(num_episodes):
        tf.print(f"Episódio {episode} iniciado")
        # Colect random data
        episode_info = collect_episode_data(train_env,
                                            agent.collect_policy,
                                            replay_buffer,
                                            repeats=1)

        experience, unused_info = next(train_iterator)
        train_loss = agent.train(experience).loss
        global_step = agent.train_step_counter.numpy()

        tf.print(
            f"Episode {episode} finished with loss {train_loss} and reward {episode_info['reward']}"
        )
        collect_episode_data(train_env,
                             agent.policy,
                             replay_buffer,
                             phase="Inference")

        # TODO: Save metrics to TensorBoard or another tracker

        # Save the policy and the agent
        checkpointer.save(global_step=global_step)
        saver.save(policy_path)

    tf.print(
        f"Training finished at episode {episode}, step {global_step}")
Example no. 13
    def create_real_replay_buffer(self) -> ReplayBuffer:
        """
        Create the replay buffer for storing data from the real environment.
        """
        return TFUniformReplayBuffer(
            self._agent.collect_policy.trajectory_spec,
            batch_size=1,
            max_length=self._real_replay_buffer_capacity,
        )
Example no. 14
    def _init_replay_buffer(self, batch_size, data_spec):
        self._batch_size = batch_size
        buffer_config = {
            "batch_size": self._batch_size,
            "data_spec": data_spec,
            "max_length": 1
        }
        tf.compat.v2.summary.scalar(name="replay_buffer_size",
                                    data=self._batch_size)
        self._replay_buffer = TFReplayBuffer(**buffer_config)
Example no. 15
class TFUniformReplayBuffer(TFReplayBufferAbstract):
    def _init_replay_buffer(self, batch_size, data_spec):
        self._batch_size = batch_size
        buffer_config = {
            "batch_size": self._batch_size,
            "data_spec": data_spec,
            "max_length": 1
        }
        tf.compat.v2.summary.scalar(name="replay_buffer_size",
                                    data=self._batch_size)
        self._replay_buffer = TFReplayBuffer(**buffer_config)

    def add_batch(self, traj_dict):
        """
        add a trajectory to the replay buffer

        Params
            traj (dict[dim]:numpy): a dict of tensors representing the trajectory to be added it to the replay buffer
        """

        collect_spec_dict = self.collect_data_spec._asdict()
        traj_tf, traj_spec = build_tf_trajectory(traj_dict, collect_spec_dict)

        if not self._replay_buffer:
            batch_size = len(traj_dict["observation"])
            self._init_replay_buffer(batch_size, traj_spec)

        self._replay_buffer.add_batch(traj_tf)

    def get_batch(self, batch_size):

        if batch_size is None:
            batch_size = self._batch_size

        # TODO: convert the replay buffer to a dataset and iterate over it
        traj, metadata = self._replay_buffer.get_next(
            sample_batch_size=batch_size)
        return traj, metadata
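A hedged sketch of the TODO above, assuming the wrapped buffer exposes the as_dataset API of tf_agents' TFUniformReplayBuffer; the cached _data_iter attribute and the method name are hypothetical.

    def get_batch_from_dataset(self, batch_size=None):
        # Hypothetical alternative to get_next(): sample through a cached
        # tf.data pipeline instead of calling get_next() directly.
        if batch_size is None:
            batch_size = self._batch_size
        if getattr(self, "_data_iter", None) is None:
            dataset = self._replay_buffer.as_dataset(
                sample_batch_size=batch_size,
                num_steps=1,
                num_parallel_calls=3).prefetch(3)
            self._data_iter = iter(dataset)
        return next(self._data_iter)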
Example no. 16
def replay_actions_across_batch_transition_models(env_model,
                                                  actions) -> Trajectory:
    """
    Use an open loop policy to apply a sequence of actions to the environment model. This returns
    at least one episode per environment batch (the same action sequence is applied to each batch).
    """
    open_loop_policy = TFOpenLoopPolicy(env_model.time_step_spec(),
                                        env_model.action_spec(), actions)
    buffer = TFUniformReplayBuffer(open_loop_policy.trajectory_spec,
                                   batch_size=env_model.batch_size,
                                   max_length=1000)
    driver = TFDriver(
        env_model,
        open_loop_policy,
        observers=[buffer.add_batch],
        max_steps=env_model.batch_size * actions.shape[0],
        disable_tf_function=True,
    )
    driver.run(env_model.reset())

    trajectories = buffer.gather_all()

    return trajectories
def get_replay_buffer(
        env: TFPyEnvironment, agent: Union[ReinforceAgent, PPOAgent], max_length: int = 100000
    ) -> TFUniformReplayBuffer:
    """
    Sets up a replay buffer object for use in training the agent.

    :param env: TensorFlow environment which provides specifications for use in setting up a replay
        buffer.
    :param agent: The agent which provides specifications for use in setting up a replay buffer.
    :param max_length: The maximum length/capacity of the replay buffer.
    :return: A replay buffer (TFUniformReplayBuffer)
    """
    replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=max_length
    )
    return replay_buffer
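A hypothetical usage sketch that wires the returned buffer to an episode driver and pulls whole episodes back out, in the style of the REINFORCE test in Example no. 2; tf_env and agent are assumed to already exist with matching specs.

# Hypothetical usage; `tf_env` and `agent` are assumed to be an existing
# TFPyEnvironment and REINFORCE/PPO agent with matching specs.
replay_buffer = get_replay_buffer(tf_env, agent, max_length=10000)
collect_driver = DynamicEpisodeDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],   # every collected trajectory is stored
    num_episodes=2)
collect_driver.run()
experience = replay_buffer.gather_all()    # whole episodes for on-policy training
agent.train(experience)
replay_buffer.clear()                      # on-policy: discard used experience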
Example no. 18
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """
    def __init__(self, experience_spec, batch_size):
        self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
        self._data_iter = None

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always ignores
        `env_ids`.
        """
        self._buffer.add_batch(exp)

    def replay(self, sample_batch_size, mini_batch_length):
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        return next(self._data_iter)

    def replay_all(self):
        return self._buffer.gather_all()

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
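A hypothetical usage sketch of the replayer above; experience_spec, env_batch_size and exp are assumed to come from the surrounding training framework.

# Hypothetical usage; `experience_spec`, `env_batch_size` and `exp` are
# assumed to come from the surrounding training framework.
replayer = SyncUniformExperienceReplayer(experience_spec, env_batch_size)
replayer.observe(exp)                      # one (env_batch_size, ...) step
experience, info = replayer.replay(sample_batch_size=64, mini_batch_length=2)
# ... train on `experience` (batch-major), then optionally:
replayer.clear()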
                    optimizer,
                    actor_net,
                    value_net,
                    num_epochs=num_epochs,
                    train_step_counter=global_step,
                    discount_factor=0.995,
                    gradient_clipping=0.5,
                    entropy_regularization=1e-2,
                    importance_ratio_clipping=0.2,
                    use_gae=True,
                    use_td_lambda_return=True)

agent.initialize()

replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                      batch_size=env.batch_size,
                                      max_length=100000)


def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    return total_return / num_episodes
def _dyke_replay_buffer(env: TFPyEnvironment, agent: DqnAgent,
                        steps_per_episode: int) -> TFUniformReplayBuffer:
    return TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                 batch_size=env.batch_size,
                                 max_length=steps_per_episode)
Example no. 21
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step
)
agent.initialize()

# 4. Constructing the Replay Memory.
memory_size = 20000
batch_size = 64

replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=memory_size
)

# Initializing Observer of replay buffer to store experiences (trajectories) to memory.
replay_buffer_observer = replay_buffer.add_batch

# Defining Metrics for measuring training progress.
train_metrics = [ AverageReturnMetric(), AverageEpisodeLengthMetric() ]

# 5. Defining initial policy as random to collect enough examples to fill the memory buffer (Training delay).
initial_collect_policy = random_tf_policy.RandomTFPolicy( train_env.time_step_spec(), train_env.action_spec() )
initial_collect_steps = 2000


class ShowProgress:
Example no. 22
    def build_replay_buffer(self):
        """Build replay buffer."""
        return TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=1,
            max_length=self.replay_buffer_max_length)
Example no. 23
class DQNAgent:
    def __init__(self) -> None:
        """
        A class for training a TF-agent
        based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
        """

        self.train_env = None  # Training environment
        self.agent = None  # The algorithm used to solve an RL problem is represented by a TF-Agent
        self.replay_buffer = None  # The replay buffer keeps track of data collected from the environment
        self.dataset = None  # The agent needs access to the replay buffer via an iterable tf.data.Dataset
        self.iterator = None  # The iterator of self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            lr: Learning rate for the optimizer (default Adam).
            epsilon: Used for the default epsilon-greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Ratio of imbalance. Used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-Network (e.g. conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int, log_step: int,
            collect_steps_per_episode: int) -> None:
        """
        Starts the training of the Agent.

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            epochs: Number of epochs to train the agent.
            batch_size: The batch size.
            eval_step: Evaluate the model every 'eval_step' training steps.
            log_step: Log the model's results every 'log_step' training steps.
            collect_steps_per_episode: Number of steps to collect with collect_policy and save to the replay buffer per epoch.
        """

        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

        def collect_step(environment, policy, buffer):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)

            # Add trajectory to the replay buffer
            buffer.add_batch(traj)

        def collect_data(env, policy, buffer, steps):
            for _ in range(steps):
                collect_step(env, policy, buffer)

        # (Optional) Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)

        # Reset the train step
        self.agent.train_step_counter.assign(0)

        for _ in range(epochs):
            #print("epoch: ", _)
            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer, collect_steps_per_episode)

            # Sample a batch of data from the buffer and update the agent's network.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                metrics = self.compute_metrics(X_train, y_train)
                print(metrics)

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Compute Metrics for Evaluation"""
        # TODO: apply softmax layer for q logits?

        q, _ = self.agent._target_q_network(X, training=False)

        # y_scores = np.max(q.numpy(), axis=1)  # predicted scores (Q-Values)
        y_pred = np.argmax(q.numpy(), axis=1)  # predicted class label

        metrics = custom_metrics(y_true, y_pred)

        return metrics

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """
         Evaluation of trained Q-network
        """
        metrics = self.compute_metrics(X, y)

        print("evaluation: ", metrics)
        return metrics
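A hypothetical end-to-end use of the class above on random data; the feature shape and all hyper-parameters are illustrative and depend on what ClassifyEnv actually expects.

# Hypothetical usage with random data; shapes and hyper-parameters are
# illustrative and depend on what ClassifyEnv expects.
import numpy as np

X_train = np.random.rand(1000, 10).astype(np.float32)
y_train = np.random.randint(0, 2, size=1000)

model = DQNAgent()
model.compile(X_train, y_train, lr=1e-3, epsilon=0.1, gamma=0.99,
              imb_ratio=0.2, replay_buffer_max_length=10000,
              layers={"dense": (64, 64), "conv": None, "dropout": None})
model.fit(X_train, y_train, epochs=200, batch_size=64,
          eval_step=100, log_step=50, collect_steps_per_episode=10)
model.evaluate(X_train, y_train.tolist())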
def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        model will not be saved.
    :param tf_log_stream_path: Path passed through to evaluate_policy for log streaming.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the check point at
        # different training steps. We save the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path, max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if policy_metrics is None:
        policy_metrics = []
    if policy_summary_writers is None:
        policy_summary_writers = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    # Set up a minimal training loop to simply test training mechanics work.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train the
            # agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()
            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )
            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)

        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )
        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)
        if i % (num_iters // 100) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")
    if save_model:
        checkpoint_manager.save(num_iters)
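A hedged sketch of assembling the pieces this function expects, reusing create_reinforce_agent and get_replay_buffer from the other examples on this page; anything not shown in those examples is an assumption.

# Hedged assembly sketch; `tf_env` is assumed to be an existing
# TFPyEnvironment, and create_reinforce_agent / get_replay_buffer are the
# helpers shown in the other examples on this page.
global_step = tf.compat.v1.train.get_or_create_global_step()
agent = create_reinforce_agent(tf_env, training_step_counter=global_step)
agent.initialize()

replay_buffer = get_replay_buffer(tf_env, agent, max_length=100000)
data_collection_driver = DynamicEpisodeDriver(tf_env,
                                              agent.collect_policy,
                                              observers=[replay_buffer.add_batch],
                                              num_episodes=2)

train_agent(tf_env, agent, data_collection_driver, replay_buffer,
            num_iters=100, global_step=global_step)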
Example no. 25
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        # TFUniformReplayBuffer does not support list in spec, we have to do
        # some conversion.
        self._experience_spec = experience_spec
        self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
        tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
        self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
        self._data_iter = None

    def _list_to_tuple(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_list_to_tuple(exp)
        else:
            return exp

    def _tuple_to_list(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_tuple_to_list(exp, self._experience_spec)
        else:
            return exp

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always ignores
        `env_ids`.
        """
        self._buffer.add_batch(self._list_to_tuple(exp))

    def replay(self, sample_batch_size, mini_batch_length):
        """Get a random batch.

        Args:
            sample_batch_size (int): number of sequences
            mini_batch_length (int): the length of each sequence
        Returns:
            Experience: experience batch in batch major (B, T, ...)
            tf_uniform_replay_buffer.BufferInfo: information about the batch
        """
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        exp, info = next(self._data_iter)
        return self._tuple_to_list(exp), info

    def replay_all(self):
        return self._tuple_to_list(self._buffer.gather_all())

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
Example no. 26
        ]

        output_layer = Dense(num_actions, activation=None)

        cloning_net = Sequential(dense_layers + [output_layer])
        optimizer = Adam(learning_rate=learning_rate)
        train_step_counter = train_utils.create_train_step()
        agent = BehavioralCloningAgent(env.time_step_spec(),
                                       env.action_spec(),
                                       cloning_network=cloning_net,
                                       optimizer=optimizer)

    policy = agent.policy

    replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                          batch_size=env.batch_size,
                                          max_length=replay_buffer_capacity)

    agent.train_step_counter.assign(0)

    replay_observer = [replay_buffer.add_batch]
    with strategy.scope():
        driver = TFDriver(env,
                          collect_policy,
                          replay_observer,
                          max_episodes=100)

    average = AverageReturnMetric()
    metrics_observer = [average]
    metrics_driver = TFRenderDriver(env,
                                    policy,
Example no. 27
    # Initialize agent
    agent.initialize()
    # Wrap the training function in a TF graph
    agent.train = common.function(agent.train)

    # Create game environments: training and evaluation
    train_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))
    eval_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))

    # Random policy for data collection
    random_policy = RandomTFPolicy(time_step_spec=train_env.time_step_spec(),
                                   action_spec=train_env.action_spec())

    # Create replay buffer for data collection
    replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                          batch_size=train_env.batch_size,
                                          max_length=BUFFER_LENGTH)

    # Create driver for the agent
    driver = DynamicStepDriver(env=train_env,
                               policy=agent.collect_policy,
                               observers=[replay_buffer.add_batch],
                               num_steps=STEPS_PER_ITER)
    # Wrap the run function in a TF graph
    driver.run = common.function(driver.run)
    # Create driver for the random policy
    random_driver = DynamicStepDriver(env=train_env,
                                      policy=random_policy,
                                      observers=[replay_buffer.add_batch],
                                      num_steps=STEPS_PER_ITER)
    # Wrap the run function in a TF graph
Example no. 28
        total_return = 0
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return

        return total_return / num_episodes

    # Replay buffer
    replay_buffer = TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=REPLAY_BUFFER_MAX
    )

    driver = DynamicStepDriver(
        train_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=1
    )

    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=BATCH_SIZE,
        num_steps=2).prefetch(3)
    iterator = iter(dataset)
Example no. 29
    def __init__(self, experience_spec, batch_size):
        self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
        self._data_iter = None
Example no. 30
def create_replay_buffer(agent: DdqnAgent, environment: TFEnvironment,
                         max_length: int) -> TFUniformReplayBuffer:
    return TFUniformReplayBuffer(agent.collect_data_spec,
                                 environment.batch_size,
                                 max_length=max_length)