Example #1
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.trajectories import trajectory


class ExperienceReplay(object):
    def __init__(self, agent, environment, batch_size):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=100000)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        # Seed the buffer with experience collected by a random policy.
        self._fill_buffer(environment, self._random_policy, steps=100)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2,
            single_deterministic_pass=False).prefetch(3)

        self.iterator = iter(self.dataset)

    def _fill_buffer(self, environment, policy, steps):
        for _ in range(steps):
            self.timestamp_data(environment, policy)

    def timestamp_data(self, environment, policy):
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(
            time_step, action_step, next_time_step)

        self._replay_buffer.add_batch(timestamp_trajectory)
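
To make Example #1 concrete, here is a minimal, hedged usage sketch. It pairs the class with a TF-Agents DQN agent on CartPole purely for illustration; any agent/environment pair whose specs match would work the same way.

import tensorflow as tf
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks.q_network import QNetwork

env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
q_net = QNetwork(env.observation_spec(), env.action_spec(), fc_layer_params=(64,))
agent = DqnAgent(env.time_step_spec(), env.action_spec(), q_network=q_net,
                 optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3))
agent.initialize()

replay = ExperienceReplay(agent, env, batch_size=64)
experience, _ = next(replay.iterator)   # trajectories of shape (64, 2, ...)
loss_info = agent.train(experience)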
Example #2
def collect_step(self, env: tf_py_environment.TFPyEnvironment,
                 policy: tf_policy.Base,
                 replay_buffer: TFUniformReplayBuffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
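
A brief, hedged sketch of how this helper might be driven; `collector`, `env`, `policy`, `replay_buffer` and the step count are assumptions standing in for the surrounding training code.

# Hypothetical warm-up loop: `collector` is an instance of the class that
# defines collect_step above; env, policy and replay_buffer already exist.
for _ in range(1000):
    collector.collect_step(env, policy, replay_buffer)

# The filled buffer can then be turned into a dataset for training.
dataset = replay_buffer.as_dataset(sample_batch_size=32, num_steps=2).prefetch(3)
experience, _ = next(iter(dataset))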
Example #3
def collect_step(environment: TFEnvironment, policy: TFPyPolicy,
                 replay_buffer: TFUniformReplayBuffer):
    """
    Coleta uma iteração com o ambiente e devolve o resultado m
    :param environment:
    :param policy:
    :param replay_buffer:
    :return:
    """
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
Example #4
class TFUniformReplayBuffer(TFReplayBufferAbstract):
    def _init_replay_buffer(self, batch_size, data_spec):
        self._batch_size = batch_size
        buffer_config = {
            "batch_size": self._batch_size,
            "data_spec": data_spec,
            "max_length": 1
        }
        tf.compat.v2.summary.scalar(name="replay_buffer_size",
                                    data=self._batch_size)
        self._replay_buffer = TFReplayBuffer(**buffer_config)

    def add_batch(self, traj_dict):
        """
        add a trajectory to the replay buffer

        Params
            traj (dict[dim]:numpy): a dict of tensors representing the trajectory to be added it to the replay buffer
        """

        collect_spec_dict = self.collect_data_spec._asdict()
        traj_tf, traj_spec = build_tf_trajectory(traj_dict, collect_spec_dict)

        if not self._replay_buffer:
            batch_size = len(traj_dict["observation"])
            self._init_replay_buffer(batch_size, traj_spec)

        self._replay_buffer.add_batch(traj_tf)

    def get_batch(self, batch_size):

        if batch_size is None:
            batch_size = self._batch_size

        # TODO: convert the replay buffer to a dataset and iterate over it
        traj, metadata = self._replay_buffer.get_next(
            sample_batch_size=batch_size)
        return traj, metadata
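
The TODO in get_batch points at TF-Agents' dataset-based sampling. As a hedged sketch (num_steps, the prefetch depth and the iterator handling are illustrative assumptions), the method could eventually look like this:

    def get_batch(self, batch_size):
        if batch_size is None:
            batch_size = self._batch_size
        # Dataset-based sampling instead of get_next(); a real implementation
        # would build the dataset and its iterator once and cache them.
        dataset = self._replay_buffer.as_dataset(
            sample_batch_size=batch_size, num_steps=2,
            num_parallel_calls=3).prefetch(3)
        traj, metadata = next(iter(dataset))
        return traj, metadata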
Example #5
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """
    def __init__(self, experience_spec, batch_size):
        self._buffer = TFUniformReplayBuffer(experience_spec, batch_size)
        self._data_iter = None

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always ignores
        `env_ids`.
        """
        self._buffer.add_batch(exp)

    def replay(self, sample_batch_size, mini_batch_length):
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        return next(self._data_iter)

    def replay_all(self):
        return self._buffer.gather_all()

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
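
A short, hedged usage sketch; `experience_spec`, `env_batch_size` and the experience batch `exp` are assumed to come from the surrounding training loop.

replayer = SyncUniformExperienceReplayer(experience_spec, env_batch_size)
replayer.observe(exp)                  # exp has shape (env_batch_size, ...)
# Sample 64 sequences of length 2 in batch-major (B, T, ...) layout.
batch, buffer_info = replayer.replay(sample_batch_size=64, mini_batch_length=2)
everything = replayer.replay_all()     # gather the whole buffer at once
replayer.clear()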
Example #6
class SyncUniformExperienceReplayer(ExperienceReplayer):
    """
    For synchronous off-policy training.

    Example algorithms: DDPG, SAC
    """

    def __init__(self, experience_spec, batch_size):
        # TFUniformReplayBuffer does not support list in spec, we have to do
        # some conversion.
        self._experience_spec = experience_spec
        self._exp_has_list = nest_utils.nest_contains_list(experience_spec)
        tuple_experience_spec = nest_utils.nest_list_to_tuple(experience_spec)
        self._buffer = TFUniformReplayBuffer(tuple_experience_spec, batch_size)
        self._data_iter = None

    def _list_to_tuple(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_list_to_tuple(exp)
        else:
            return exp

    def _tuple_to_list(self, exp):
        if self._exp_has_list:
            return nest_utils.nest_tuple_to_list(exp, self._experience_spec)
        else:
            return exp

    def observe(self, exp, env_ids=None):
        """
        For the sync driver, `exp` has the shape (`env_batch_size`, ...)
        with `num_envs`==1 and `unroll_length`==1. This function always ignores
        `env_ids`.
        """
        self._buffer.add_batch(self._list_to_tuple(exp))

    def replay(self, sample_batch_size, mini_batch_length):
        """Get a random batch.

        Args:
            sample_batch_size (int): number of sequences
            mini_batch_length (int): the length of each sequence
        Returns:
            Experience: experience batch in batch major (B, T, ...)
            tf_uniform_replay_buffer.BufferInfo: information about the batch
        """
        if self._data_iter is None:
            dataset = self._buffer.as_dataset(
                num_parallel_calls=3,
                sample_batch_size=sample_batch_size,
                num_steps=mini_batch_length).prefetch(3)
            self._data_iter = iter(dataset)
        exp, info = next(self._data_iter)
        return self._tuple_to_list(exp), info

    def replay_all(self):
        return self._tuple_to_list(self._buffer.gather_all())

    def clear(self):
        self._buffer.clear()

    @property
    def batch_size(self):
        return self._buffer._batch_size
Example #7
class TFAgentsPPOAgent(RLAgent):
    def __init__(self,
                 name=None,
                 actor_net=None,
                 value_net=None,
                 predictor=None,
                 keep_models_fixed=False,
                 featurizer=None):
        super().__init__(name, predictor, keep_models_fixed, featurizer)

        action_spec = BoundedTensorSpec((1, ), tf.int64, 0,
                                        ACTION_DIMENSIONS - 1)

        # We store both the mask and the actual observation in the observation
        # given to the agent so that the two stay associated.
        # See also https://github.com/tensorflow/agents/issues/125#issuecomment-496583325
        observation_spec = {
            'state': TensorSpec((self.featurizer.state_dimension(), ),
                                tf.float32),
            'mask': TensorSpec((ACTION_DIMENSIONS, ), tf.float32)
        }

        layers = equal_spacing_fc(5, self.featurizer.state_dimension())

        if actor_net is None:
            self.actor_net = MaskedActorNetwork(observation_spec, action_spec,
                                                layers)
        else:
            self.actor_net = actor_net

        if value_net is None:
            self.value_net = DummyMaskedValueNetwork(observation_spec,
                                                     fc_layer_params=layers)
        else:
            self.value_net = value_net

        self.agent = tf_agents.agents.ppo.ppo_agent.PPOAgent(
            time_step_spec=ts.time_step_spec(observation_spec),
            action_spec=action_spec,
            actor_net=self.actor_net,
            value_net=self.value_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-5),
            discount_factor=1,
            use_gae=True,
            use_td_lambda_return=True,
            lambda_value=0.85,
            num_epochs=30,

            # the observations are dicts { 'state': ..., 'mask': ... }
            # normalization does not make any sense for the mask
            normalize_observations=False,
        )

        if actor_net is not None or value_net is not None:
            self.agent.initialize()
        else:
            self._create_train_checkpointer()

            # All the variables are in fact restored successfully, but not
            # immediately: restoration only happens once the relevant shapes
            # are known. If the shapes never become known, the variables are
            # never restored. That is no problem in self play, where all the
            # shapes are known after the first training, but it is a problem
            # when playing against old versions, because some of the old
            # versions are often not used (and their value nets are never
            # used, since the old versions are not trained). It is not an
            # error, but TensorFlow prints confusing warnings at the end
            # unless one knows about this, so we silence those warnings with
            # .expect_partial(). For more information see
            # https://github.com/tensorflow/tensorflow/issues/27937#issuecomment-484683443
            # https://github.com/tensorflow/tensorflow/issues/27937#issuecomment-488356053

            self.train_checkpointer.initialize_or_restore().expect_partial()

        # There is also agent.policy. It is still not entirely clear when one
        # should use which of the two, but collect_policy works for now.
        self.policy = self.agent.collect_policy

        # tf_agents wants the data as trajectories
        # (prev_time_step, action, new_time_step), so we keep the
        # prev_time_step around until we have the new_time_step and can build
        # the trajectory; at that point the new_time_step becomes the next
        # prev_time_step. This variable keeps track of the prev_time_step.
        self.last_time_step = None

        # Even though PPO is on-policy, buffering the experience briefly seems
        # to be fine, and the examples in the tf_agents repo also use a buffer.
        self.replay_buffer = TFUniformReplayBuffer(
            self.agent.collect_data_spec,
            batch_size=1,
            max_length=REPLAY_BUFFER_SIZE)
        self.replay_buffer_position = 0

        self.clone_counter = 0

    def _create_train_checkpointer(self):
        self.train_checkpointer = tf_agents.utils.common.Checkpointer(
            ckpt_dir=os.path.join(MODELS_PATH, self.name, 'Agent'),
            agent=self.agent)

    def _add_trajectory(self, prev_time_step, action, new_time_step):
        """Add a trajectory (prev_time_step, action, new_time_step) to the replay buffer

        Also train the agent on the whole buffer if it is full.
        """

        traj = tf_agents.trajectories.trajectory.from_transition(
            prev_time_step, action, new_time_step)

        self.replay_buffer.add_batch(traj)
        self.replay_buffer_position += 1

        if self.replay_buffer_position == REPLAY_BUFFER_SIZE + 1:
            if not self.keep_models_fixed:
                self.agent.train(self.replay_buffer.gather_all())
            self.replay_buffer_position = 0
            self.replay_buffer.clear()

    def act(self, observation, valid_action_mask):
        observation = {
            'state': np.array(observation, dtype=np.float32),
            'mask': valid_action_mask
        }

        if self.last_time_step is None:
            # a new episode started
            self.last_time_step = _to_tf_timestep(ts.restart(observation))
            self.last_action_step = self.policy.action(self.last_time_step)
            return self.last_action_step.action.numpy()[0, 0]

        new_time_step = _to_tf_timestep(
            ts.transition(observation, self.prev_reward))
        self._add_trajectory(self.last_time_step, self.last_action_step,
                             new_time_step)

        self.last_time_step = new_time_step
        self.last_action_step = self.policy.action(new_time_step)
        self.prev_reward = None

        return self.last_action_step.action.numpy()[0, 0]

    def observe(self, reward, terminal):
        if not terminal:
            self.prev_reward = reward
            return

        # Even when the episode ends, tf_agents expects an observation in
        # addition to the reward. Because there is no meaningful one for us,
        # we just pass an all-zeros observation.
        new_time_step = _to_tf_timestep(
            ts.termination(
                {
                    'state': np.zeros(self.featurizer.state_dimension()),
                    'mask': np.zeros(ACTION_DIMENSIONS)
                }, reward))

        self._add_trajectory(self.last_time_step, self.last_action_step,
                             new_time_step)

        self.last_time_step = None
        self.last_action_step = None
        self.prev_reward = None

    def clone(self, name=None):
        """Return a clone of this agent with networks & predictor shared"""

        if name is None:
            self.clone_counter += 1
            name = self.name + 'Clone' + str(self.clone_counter)

        return TFAgentsPPOAgent(name=name,
                                actor_net=self.actor_net,
                                value_net=self.value_net,
                                predictor=self.predictor,
                                keep_models_fixed=self.keep_models_fixed,
                                featurizer=self.featurizer)

    def save_models(self):
        """Save actor, critic and predictor

        Args:
            global_step: the current game number, is appended to
                the filenames of the saved models
        """

        if self.keep_models_fixed:
            return

        super().save_models(os.path.join(MODELS_PATH, self.name))
        if not hasattr(self, 'train_checkpointer'):
            self._create_train_checkpointer()
        self.train_checkpointer.save(0)
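
Finally, a hedged sketch of the act/observe cycle described in the comments above; `featurizer`, `predictor` and the `game` interface are assumptions specific to the surrounding project.

agent = TFAgentsPPOAgent(name='ppo', featurizer=featurizer, predictor=predictor)

observation, mask = game.reset()              # hypothetical game interface
done = False
while not done:
    action = agent.act(observation, mask)     # also buffers the previous transition
    observation, mask, reward, done = game.step(action)
    agent.observe(reward, terminal=done)      # on terminal, closes the episode
agent.save_models()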