Example No. 1
class Agent(object):
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action

        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size,
                                     self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(
            self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)

    def step(self, s, a, r, s_1, t, train=True):
        self.replay_buffer.add(s, a, r, s_1, t)
        if (train and self.replay_buffer.size() >= self.MINIBATCH_SIZE):
            minibatch = self.replay_buffer.sample_batch(self.MINIBATCH_SIZE)
            self.learn(minibatch)

    @tf.function
    def critic_train(self, minibatch):
        s_batch, a_batch, r_batch, s_1_batch, t_batch = minibatch

        mu_prime = self.actor_target_network(s_1_batch)
        q_prime = self.critic_target_network([s_1_batch, mu_prime])

        ys = r_batch + self.GAMMA * (1 - t_batch) * q_prime

        with tf.GradientTape() as tape:
            predicted_qs = self.critic_network([s_batch, a_batch])
            loss = (predicted_qs - ys) * (predicted_qs - ys)
            loss = tf.reduce_mean(loss)
        dloss = tape.gradient(loss, self.critic_network.trainable_weights)

        self.critic_optimizer.apply_gradients(
            zip(dloss, self.critic_network.trainable_weights))

    def actor_train(self, minibatch):
        s_batch, _, _, _, _ = minibatch

        with tf.GradientTape() as tape:
            next_action = self.actor_network(s_batch)
            actor_loss = -tf.reduce_mean(
                self.critic_network([s_batch, next_action]))
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_network.trainable_weights)

        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_weights))

    def learn(self, minibatch):
        s, a, r, s_1, t = minibatch

        s = np.array(s, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                  self.state_size)
        s = tf.convert_to_tensor(s)
        a = np.array(a, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                  self.action_size)
        a = tf.convert_to_tensor(a)
        r = np.array(r, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        s_1 = np.array(s_1, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                      self.state_size)
        s_1 = tf.convert_to_tensor(s_1)
        t = np.array(t, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)

        minibatch = (s, a, r, s_1, t)

        self.critic_train(minibatch)
        self.actor_train(minibatch)
        self.update_target_networks()

    def act(self, state, t=0):
        state = np.array(state).reshape(1, self.state_size)
        action = self.actor_network(state)[0]
        noisy = self.noise.get_action(action, t)
        return action, noisy

    def update_target_networks(self):
        # Soft update each weight tensor: target <- TAU * online + (1 - TAU) * target
        self.actor_target_network.set_weights([
            self.TAU * w + (1 - self.TAU) * w_t
            for w, w_t in zip(self.actor_network.get_weights(),
                              self.actor_target_network.get_weights())
        ])
        self.critic_target_network.set_weights([
            self.TAU * w + (1 - self.TAU) * w_t
            for w, w_t in zip(self.critic_network.get_weights(),
                              self.critic_target_network.get_weights())
        ])
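
A minimal usage sketch for the Agent above, assuming a Gym-style environment with the old 4-tuple step API and that the project's Actor, Critic, ReplayBuffer, and OUNoise classes are importable. The environment name, episode counts, and hyperparameters are illustrative placeholders, not values from the original code.

import gym
import numpy as np

env = gym.make("Pendulum-v1")  # placeholder environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

agent = Agent(state_size, action_size, max_action,
              minibatch_size=64, a_lr=1e-4, c_lr=1e-3,
              gamma=0.99, tau=1e-3)

for episode in range(100):
    state = env.reset()
    episode_reward = 0.0
    for t in range(200):
        # act() returns the deterministic action and its noisy version
        action, noisy_action = agent.act(state, t)
        next_state, reward, done, _ = env.step(noisy_action)
        # step() stores the transition and trains once enough samples exist
        agent.step(state, noisy_action, reward, next_state, done, train=True)
        state = next_state
        episode_reward += reward
        if done:
            break
    print(f"episode {episode}: reward {episode_reward:.1f}")
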
Example No. 2
class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device
        self.actor_local = Actor(state_space.shape, action_space.high.size,
                                 max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size,
                                  max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)

        self.current_steps = 0

    def step(self,
             state,
             action,
             reward,
             done,
             next_state,
             train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > BATCH_SIZE and self.memory.count > MIN_MEM_SIZE:
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # Compute yi
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = tf.cast(rewards, dtype=tf.float64) + \
                 tf.cast(GAMMA, dtype=tf.float64) * \
                 tf.cast((1 - tf.cast(dones, dtype=tf.int64)), dtype=tf.float64) * \
                 tf.cast(q_t, dtype=tf.float64)

            # Compute MSE
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]),
                              dtype=tf.float64)
                loss = (q_l - yi) * (q_l - yi)
                loss = tf.reduce_mean(loss)
            # Update critic by minimizing the loss (gradient taken outside the tape context)
            dloss_dql = tape.gradient(loss,
                                      self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))
        return

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                q_l = -tf.reduce_mean(self.critic_local([states, u_l]))
            j = tape.gradient(q_l, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(j, self.actor_local.trainable_variables))
        return

    def learn(self, experiences, gamma) -> None:
        states, actions, rewards, dones, next_states = experiences
        states = np.array(states).reshape(BATCH_SIZE, self.state_size)
        states = tf.convert_to_tensor(states)
        actions = np.array(actions).reshape(BATCH_SIZE, self.action_size)
        actions = tf.convert_to_tensor(actions)
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE,
                                                    self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()
        return

    def update_local(self):
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> list:
            local_weights = local_model.get_weights()
            target_weights = target_model.get_weights()

            assert len(local_weights) == len(target_weights)
            # Per-layer soft update: new <- TAU * local + (1 - TAU) * target
            return [TAU * lw + (1 - TAU) * tw
                    for lw, tw in zip(local_weights, target_weights)]

        self.actor_target.set_weights(
            soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(
            soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(
            join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(
            join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))
        return

    def act(self, state, add_noise=True) -> tuple:
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        # Only add exploration noise when requested
        action = self.noise.get_action(pure_action) if add_noise else pure_action
        return action, pure_action

    def reset(self):
        self.noise.reset()
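
Both agents above, and the script below, depend on an OUNoise helper that is not shown. A minimal Ornstein-Uhlenbeck noise sketch compatible with the calls used here (get_action(action, t=0) and reset()) might look like the following; the parameter values and sigma-annealing schedule are assumptions, not the project's actual class, and a real implementation would usually also clip the result to the action bounds. Note that Example 3 constructs it from env.action_space rather than an integer dimension, so the real class presumably reads the dimension and bounds from the space.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise (sketch)."""

    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.3,
                 min_sigma=0.1, decay_period=100000):
        self.action_size = action_size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.max_sigma = sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.reset()

    def reset(self):
        # Restart the process at its mean
        self.state = np.ones(self.action_size) * self.mu

    def evolve_state(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(self.action_size)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        # Anneal sigma over time, then add the correlated noise to the action
        self.sigma = self.max_sigma \
            - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.asarray(action) + self.evolve_state()
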
Example No. 3
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make("MountainCarContinuous-v0")

    #Parameters
    memory_size = 100000
    batch_size = 32
    tau = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    discount_factor = 0.99
    episodes = 1001
    time_steps = 501
    collect_experience = 50000
    save_frequency = 250
    ep_reward = []
    training = False

    #Noise object
    noise = OUNoise(env.action_space)
    #Initialize actor and critic objects
    actor = Actor(env, sess, lr_actor, tau)

    #Uncomment the following line to save the actor model architecture as a JSON file.
    #It only needs to be saved once.

    # actor.save_model_architecture("Actor_model_architecture.json")
    critic = Critic(env, sess, lr_critic, tau, discount_factor)

    #Initialize replay memory of size defined by memory_size
    replay_memory = ReplayMemory(memory_size)

    #Toggle between True and False for debugging purposes. For training it is always True
    run = True
    if run:
        #Loop over the number of episodes. At each new episode reset the environment, reset the
        #noise state and set the total episode reward to 0
        for episode in range(episodes):
            state = env.reset()
            noise.reset()
            episode_reward = 0

            #Loop over the number of steps in an episode
            for time in range(time_steps):
                #Uncomment the following line if you want to visualize the mountain car during training.
                #Training also works without visualization when position and velocity are used
                #as the state variables.

                # env.render()

                #Predict an action from the actor model using the current state
                action = actor.predict_action(state.reshape((1, 2)))[0]

                #Add Ornstein-Uhlenbeck noise to the predicted action to encourage exploration of the environment
                exploratory_action = noise.get_action(action, time)

                #Take the noisy action to enter the next state
                next_state, reward, done, _ = env.step(exploratory_action)

                #Predict the action to be taken given the next_state. This next state action is predicted
                #using the actor's target model
                next_action = actor.predict_next_action(
                    next_state.reshape((1, 2)))[0]

                #Append this experience sample to the replay memory
                replay_memory.append(state, exploratory_action, reward,
                                     next_state, next_action, done)

                #Only start training when there are a minimum number of experience samples available in
                #memory
                if replay_memory.count() == collect_experience:
                    training = True
                    print('Start training')

                #When training:
                if training:
                    # 1) first draw a random batch of samples from the replay memory
                    batch = replay_memory.sample(batch_size)
                    # 2) using this batch, calculate dQ/dA from the critic model
                    grads = critic.calc_grads(batch)
                    # 3) calculate dA/dTheta from the actor using the same batch
                    # 4) multiply dA/dTheta by the negative dQ/dA to get -dJ/dTheta
                    # 5) update the actor weights in the direction that maximizes J
                    # 6) in practice this is done by minimizing the value obtained in (4)
                    #    (see the sketch after this example)
                    actor.train(batch, grads)

                    # update critic weights by minimizing the bellman loss. Use actor target to compute
                    # next action in the next state (already computed and stored in replay memory)
                    # in order to compute TD target
                    critic.train(batch)

                    #After each weight update of the actor and critic online model perform soft updates
                    # of their targets so that they can smoothly and slowly track the online model's
                    #weights
                    actor.update_target()
                    critic.update_target()

                #Add each step reward to the episode reward
                episode_reward += reward

                #Set current state as next state
                state = next_state

                #If target reached before the max allowed time steps, break the inner for loop
                if done:
                    break

            #Store episode reward
            ep_reward.append([episode, episode_reward])

            #Print info for each episode to track training progress
            print(
                "Completed in {} steps.... episode: {}/{}, episode reward: {} "
                .format(time, episode, episodes, episode_reward))

            #Save model's weights and episode rewards after each save_frequency episode
            if training and (episode % save_frequency) == 0:
                print('Data saved at episode:', episode)
                actor.save_weights(
                    './Model/DDPG_actor_model_{}.h5'.format(episode))
                pickle.dump(
                    ep_reward,
                    open('./Rewards/rewards_{}.dump'.format(episode), 'wb'))

        # Close the mountain car environment
        env.close()
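
Steps (1)-(6) in the comments above describe the classic DDPG chain rule, dJ/dTheta = dQ/dA * dA/dTheta, implemented with critic-supplied action gradients. The fragment below is a rough TF 1.x sketch of how critic.calc_grads and actor.train might be wired with tf.gradients; the toy networks and all names are assumptions for illustration, not this project's Actor/Critic code.

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K

sess = tf.Session()                      # TF 1.x graph mode, as in the example
K.set_session(sess)

state_size, action_size, lr_actor = 2, 1, 0.0001

# Toy stand-ins for the project's actor and critic networks
s_in = layers.Input((state_size,))
a_out = layers.Dense(action_size, activation='tanh')(
    layers.Dense(32, activation='relu')(s_in))
actor_model = models.Model(s_in, a_out)

cs_in = layers.Input((state_size,))
ca_in = layers.Input((action_size,))
h = layers.Concatenate()([layers.Dense(32, activation='relu')(cs_in), ca_in])
critic_model = models.Model([cs_in, ca_in], layers.Dense(1)(h))

# Step 2: dQ/dA from the critic, evaluated on the sampled batch
dq_da_op = tf.gradients(critic_model.output, critic_model.inputs[1])

# Steps 3-6: push -dQ/dA back through the actor; minimizing this ascends J
dq_da_ph = tf.placeholder(tf.float32, [None, action_size])
actor_grads = tf.gradients(actor_model.output,
                           actor_model.trainable_weights, -dq_da_ph)
actor_train_op = tf.train.AdamOptimizer(lr_actor).apply_gradients(
    zip(actor_grads, actor_model.trainable_weights))

sess.run(tf.global_variables_initializer())

# One illustrative update on a random batch
states = np.random.randn(32, state_size).astype(np.float32)
actions = sess.run(actor_model.output, {actor_model.inputs[0]: states})
dq_da = sess.run(dq_da_op, {critic_model.inputs[0]: states,
                            critic_model.inputs[1]: actions})[0]
sess.run(actor_train_op, {actor_model.inputs[0]: states, dq_da_ph: dq_da})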