def test_append(self):
    """Appending beyond max_length should evict the oldest entries (FIFO)."""
    count = 100
    start_length = count // 2
    max_length = count
    buffer = ReplayBuffer(start_length=start_length, max_length=max_length)
    for append_count in range(max_length * 2):
        buffer.append(append_count)
        self.assertEqual(len(buffer.buffer), min(append_count + 1, max_length),
                         "Incorrect buffer size.")
        self.assertEqual(buffer.buffer[0], max(0, (append_count + 1) - max_length),
                         "Incorrect first value.")
        self.assertEqual(buffer.buffer[-1], append_count,
                         "Incorrect last value.")
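The assertions above imply a FIFO buffer that evicts its oldest entries once max_length is reached. The following is a hypothetical minimal ReplayBuffer consistent with this test; the real implementation, and the role of start_length (which the test does not exercise), may differ.

from collections import deque


class ReplayBuffer:
    """Hypothetical minimal buffer satisfying the assertions in test_append."""

    def __init__(self, start_length=None, max_length=10000):
        # start_length would normally gate when sampling may begin;
        # it is not checked by test_append
        self.start_length = start_length if start_length is not None else max_length
        # deque(maxlen=...) drops the oldest element once full (FIFO)
        self.buffer = deque(maxlen=max_length)

    def append(self, item):
        self.buffer.append(item)

    def __len__(self):
        return len(self.buffer)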
Example #2
# Standard imports needed by this example; ActorNetwork, CriticNetwork,
# OUActionNoise, ReplayBuffer, update_target and the hyperparameter constants
# (GAMMA, RHO, STD_DEV, BUFFER_SIZE, BATCH_SIZE, CRITIC_LR, ACTOR_LR) are
# assumed to be provided by the surrounding project.
import os
import logging

import numpy as np
import tensorflow as tf


class Brain:
    """
    The Brain that contains all the models: the actor and critic networks,
    their target copies, the replay buffer, exploration noise and optimizers
    """
    def __init__(self,
                 num_states,
                 num_actions,
                 action_high,
                 action_low,
                 gamma=GAMMA,
                 rho=RHO,
                 std_dev=STD_DEV):
        # initialize everything
        self.actor_network = ActorNetwork(num_states, num_actions, action_high)
        self.critic_network = CriticNetwork(num_states, num_actions,
                                            action_high)
        self.actor_target = ActorNetwork(num_states, num_actions, action_high)
        self.critic_target = CriticNetwork(num_states, num_actions,
                                           action_high)

        # Making the weights equal initially
        self.actor_target.set_weights(self.actor_network.get_weights())
        self.critic_target.set_weights(self.critic_network.get_weights())

        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
        self.gamma = tf.constant(gamma)
        self.rho = rho
        self.action_high = action_high
        self.action_low = action_low
        self.num_states = num_states
        self.num_actions = num_actions
        self.noise = OUActionNoise(mean=np.zeros(1),
                                   std_deviation=float(std_dev) * np.ones(1))

        # optimizers
        self.critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR,
                                                         amsgrad=True)
        self.actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR, amsgrad=True)

        # last action taken; remember() stores it together with the transition
        self.cur_action = None

        # define update weights with tf.function for improved performance
        @tf.function(input_signature=[
            tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
            tf.TensorSpec(shape=(None, num_actions), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
        ])
        def update_weights(s, a, r, sn, d):
            """
            Function to update weights with optimizer
            """
            with tf.GradientTape() as tape:
                # TD target: y = r + gamma * (1 - done) * Q_target(s', mu_target(s'))
                y = r + self.gamma * (1 - d) * self.critic_target(
                    [sn, self.actor_target(sn)])
                # critic loss: mean absolute TD error
                critic_loss = tf.math.reduce_mean(
                    tf.math.abs(y - self.critic_network([s, a])))
            critic_grad = tape.gradient(
                critic_loss, self.critic_network.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic_network.trainable_variables))

            with tf.GradientTape() as tape:
                # actor loss: negative mean critic value of the actor's actions
                actor_loss = -tf.math.reduce_mean(
                    self.critic_network([s, self.actor_network(s)]))
            actor_grad = tape.gradient(actor_loss,
                                       self.actor_network.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor_network.trainable_variables))
            return critic_loss, actor_loss

        self.update_weights = update_weights

    def act(self, state, _notrandom=True, noise=True):
        """
        Run action by the actor network

        Args:
            state: the current state (batched tensor of shape (1, num_states))
            _notrandom: if True, use the actor's greedy action; otherwise
                sample uniformly from the action space
            noise: whether exploration noise is added to the resulting action

        Returns:
            the resulting action
        """
        # greedy action from the actor or a uniformly random action;
        # exploration noise, if enabled, is added to either choice
        base_action = (self.actor_network(state)[0].numpy() if _notrandom else
                       np.random.uniform(self.action_low, self.action_high,
                                         self.num_actions))
        self.cur_action = base_action + (self.noise() if noise else 0)
        self.cur_action = np.clip(self.cur_action, self.action_low,
                                  self.action_high)

        return self.cur_action

    def remember(self, prev_state, reward, state, done):
        """
        Store the transition (previous state, last action, reward, next state, done) in the buffer
        """
        # record it in the buffer based on its reward
        self.buffer.append(prev_state, self.cur_action, reward, state, done)

    def learn(self, entry):
        """
        Run one training update for all networks

        Args:
            entry: a batch of (state, action, reward, next_state, done) transitions
        """
        s, a, r, sn, d = zip(*entry)

        c_l, a_l = self.update_weights(
            tf.convert_to_tensor(s, dtype=tf.float32),
            tf.convert_to_tensor(a, dtype=tf.float32),
            tf.convert_to_tensor(r, dtype=tf.float32),
            tf.convert_to_tensor(sn, dtype=tf.float32),
            tf.convert_to_tensor(d, dtype=tf.float32))

        update_target(self.actor_target, self.actor_network, self.rho)
        update_target(self.critic_target, self.critic_network, self.rho)

        return c_l, a_l

    def save_weights(self, path):
        """
        Save weights to `path`
        """
        parent_dir = os.path.dirname(path)
        if parent_dir and not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        # Save the weights
        self.actor_network.save_weights(path + "an.h5")
        self.critic_network.save_weights(path + "cn.h5")
        self.critic_target.save_weights(path + "ct.h5")
        self.actor_target.save_weights(path + "at.h5")

    def load_weights(self, path):
        """
        Load weights from path
        """
        try:
            self.actor_network.load_weights(path + "an.h5")
            self.critic_network.load_weights(path + "cn.h5")
            self.critic_target.load_weights(path + "ct.h5")
            self.actor_target.load_weights(path + "at.h5")
        except OSError as err:
            logging.warning("Weights files cannot be found, %s", err)
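A hypothetical usage sketch of the Brain class above. The DummyEnv, the state/action sizes and the way the training batch is assembled are illustrative assumptions only; the project's ActorNetwork, CriticNetwork, OUActionNoise, ReplayBuffer, update_target and hyperparameter constants still need to be importable for this to run.

import numpy as np
import tensorflow as tf


class DummyEnv:
    """Toy 3-state / 1-action environment used only to exercise the API."""

    def reset(self):
        return np.zeros(3, dtype=np.float32)

    def step(self, action):
        next_state = np.random.randn(3).astype(np.float32)
        reward = -float(np.sum(np.square(action)))
        done = bool(np.random.rand() < 0.05)
        return next_state, reward, done


env = DummyEnv()
brain = Brain(num_states=3, num_actions=1, action_high=2.0, action_low=-2.0)

transitions = []
prev_state = env.reset()
for _ in range(64):
    # act() expects a batched state of shape (1, num_states)
    tf_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
    action = brain.act(tf_state, _notrandom=True, noise=True)
    state, reward, done = env.step(action)
    brain.remember(prev_state, reward, state, done)
    # learn() takes an iterable of (state, action, reward, next_state, done)
    transitions.append((prev_state, action,
                        np.array([reward], dtype=np.float32),
                        state,
                        np.array([float(done)], dtype=np.float32)))
    prev_state = env.reset() if done else state

critic_loss, actor_loss = brain.learn(transitions)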