Example #1
    def rollout(self,
                time_step: ActionTimeStep,
                state,
                mode,
                epsilon_greedy=1.0):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        # Advance the RNN with the current latent vector.
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        # Generate a query key from the RNN output and read from external memory.
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        # The policy sees the (gradient-stopped) latent, the RNN output and
        # the memory readout.
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(policy_input,
                                                 step_type=time_step.step_type,
                                                 network_state=None)

        value, _ = self._value_net(latent_vector,
                                   step_type=time_step.step_type,
                                   network_state=None)

        info = ActorCriticInfo(action_distribution=action_distribution,
                               value=value)
        action = common.epsilon_greedy_sample(action_distribution,
                                              epsilon_greedy)
        return PolicyStep(action=action, state=rnn_state, info=info)
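
Note: common.epsilon_greedy_sample appears in all four examples. Its exact behavior is ALF-specific, but the idea it implements is standard: with probability epsilon_greedy draw a random sample from the action distribution, otherwise take the distribution's mode (the greedy action). Below is only a minimal sketch of that idea for a single tfp distribution, not ALF's implementation:

import tensorflow as tf


def epsilon_greedy_sample_sketch(distribution, epsilon_greedy=1.0):
    """With probability `epsilon_greedy` sample from `distribution`,
    otherwise return its mode (the greedy action)."""
    sampled = distribution.sample()
    greedy = distribution.mode()
    use_sample = tf.less(tf.random.uniform((), 0., 1.), epsilon_greedy)
    return tf.cond(use_sample, lambda: sampled, lambda: greedy)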
Example #2
    def predict(self, time_step: ActionTimeStep, state: ActorCriticState,
                epsilon_greedy):
        """Predict for one step."""
        action_dist, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        action = common.epsilon_greedy_sample(action_dist, epsilon_greedy)
        return PolicyStep(
            action=action,
            state=ActorCriticState(actor=actor_state),
            info=ActorCriticInfo(action_distribution=action_dist))
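
At evaluation time predict is typically called with epsilon_greedy=0.0 (always take the greedy action) and with a positive value, often 1.0, during data collection. The two extremes reduce to mode() versus sample() on the action distribution; a toy illustration with a categorical distribution (assumed semantics, independent of ALF):

import tensorflow_probability as tfp

dist = tfp.distributions.Categorical(logits=[[0.1, 2.0, 0.3]])
dist.sample()  # epsilon_greedy=1.0: draw a random action
dist.mode()    # epsilon_greedy=0.0: greedy action, here index 1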
Example #3
    def _predict(self,
                 time_step: ActionTimeStep,
                 state=None,
                 epsilon_greedy=1.):
        action_dist, state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.share.actor)
        # Build a train_state_spec-shaped state with empty leaves everywhere
        # except the shared actor state.
        empty_state = tf.nest.map_structure(lambda x: (),
                                            self.train_state_spec)
        state = empty_state._replace(share=SacShareState(actor=state))
        action = common.epsilon_greedy_sample(action_dist, epsilon_greedy)
        return PolicyStep(action=action,
                          state=state,
                          info=SacInfo(action_distribution=action_dist))
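
Example #3 builds its return state by mapping every leaf of train_state_spec to an empty tuple and then filling in only the share field via namedtuple._replace. A self-contained toy version of that trick (the spec structure and the critic field are hypothetical; only SacShareState appears in the example):

import collections

import tensorflow as tf

SacShareState = collections.namedtuple('SacShareState', ['actor'])
SacState = collections.namedtuple('SacState', ['share', 'critic'])

train_state_spec = SacState(
    share=SacShareState(actor=tf.TensorSpec([16], tf.float32)),
    critic=tf.TensorSpec([32], tf.float32))

# Replace every leaf spec with an empty tuple ...
empty_state = tf.nest.map_structure(lambda x: (), train_state_spec)
# ... then fill in only the actor state actually produced by the network.
state = empty_state._replace(share=SacShareState(actor=tf.zeros([1, 16])))
# state == SacState(share=SacShareState(actor=<1x16 tensor>), critic=())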
Example #4
    def _get_action(self, actor_network, time_step, state, epsilon_greedy=1.0):
        action_distribution, state = actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state)
        if isinstance(actor_network, DistributionNetwork):
            # Stochastic actor: sample from the action distribution.
            action = common.epsilon_greedy_sample(action_distribution,
                                                  epsilon_greedy)
        else:
            # Deterministic actor: add exploration noise from the OU process
            # with probability `epsilon_greedy`.
            def _sample(a, ou):
                return tf.cond(
                    tf.less(tf.random.uniform((), 0, 1), epsilon_greedy),
                    lambda: a + ou(), lambda: a)

            action = tf.nest.map_structure(_sample, action_distribution,
                                           self._ou_process)
            # No distribution to report in the deterministic case.
            action_distribution = ()
        return action_distribution, action, state
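
In the non-distributional branch of Example #4, each entry of self._ou_process is a callable that returns the next value of an exploration-noise process; the name suggests Ornstein-Uhlenbeck noise as used in DDPG. A minimal sketch of such a process (class name and hyperparameters are made up, not ALF's helper):

import tensorflow as tf


class OUProcessSketch(object):
    """Mean-reverting (Ornstein-Uhlenbeck) exploration noise."""

    def __init__(self, shape, damping=0.15, stddev=0.2, dtype=tf.float32):
        self._x = tf.Variable(tf.zeros(shape, dtype=dtype), trainable=False)
        self._damping = damping
        self._stddev = stddev

    def __call__(self):
        # x_{t+1} = x_t - damping * x_t + N(0, stddev); the noise drifts
        # back toward zero instead of accumulating like a random walk.
        noise = tf.random.normal(self._x.shape, stddev=self._stddev,
                                 dtype=self._x.dtype)
        self._x.assign_add(-self._damping * self._x + noise)
        return self._x.value()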