def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.n)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Store q values for illustration purposes
    q_max_array = []
    reward_array = []

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            # if i % 40 == 0 and i > 1:
            #     env.render()

            # Begin "Experimentation and Evaluation Phase"

            # Select the next experimental action by adding noise to the action prescribed by the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # If in a testing episode, do not add noise
            # if i%100 is not 49 and i%100 is not 99:
            noise = exploration_noise.noise()
            a = a + noise

            # Take step with experimental action
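            # (discrete env: np.argmax collapses the continuous actor output to an action index)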
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)
            # s2, r, terminal, info = env.step(np.reshape(a.T,newshape=(env.action_space.n,)))

            # Add transition to replay buffer if not testing episode
            # if i%100 is not 49 and i%100 is not 99:
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    MINIBATCH_SIZE)

                # Find target estimate to use for updating the Q-function

                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate (R(t+1) + Q(s(t+1),a(t+1)))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If episode is finished, print results
            if terminal:
                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))
                #reward_array.append(ep_reward)
                break

        ep_reward = 0
        s = env.reset()
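        # Evaluation: run one noise-free episode with the current greedy policy and record its reward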

        for j in range(MAX_EP_STEPS):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            # Take step with the greedy policy action (no exploration noise)
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)

            ep_reward += r
            s = s2

            if terminal:
                print('Normal | Reward: %.2i' % int(ep_reward), " | Episode",
                      i)
                reward_array.append(ep_reward)
                break

    # Max Q plot
    plt.plot(range(1, MAX_EPISODES + 1), q_max_array, 'b-')
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.savefig('Q.png')
    plt.show()

    # Reward plot
    plt.plot(range(1, MAX_EPISODES + 1), reward_array, 'g-')
    plt.xlabel('Episode Number')
    plt.ylabel('Reward')
    plt.savefig('Reward.png')
    plt.show()
    save_result([[str(i[0]) for i in q_max_array],
                 [str(i) for i in reward_array]])
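
OUNoise and ReplayBuffer are assumed to be defined elsewhere in this repository. For reference, a minimal Ornstein-Uhlenbeck noise process matching the OUNoise(dim) / noise() interface used above might look like the following sketch; the mu, theta, and sigma defaults are assumptions, not values taken from the original code.

import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck process sketch (default parameters are assumptions)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # start every episode at the mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated exploration noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state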
Example #2
class DDPGAgent(object):
    """ class of the DDPG Agent """
    def __init__(self, config):
        """Initialize an Agent object.

        Args:
            param1: (config)
        """

        self.state_size = config.state_dim
        self.action_size = config.action_dim
        self.seed = config.seed
        np.random.seed(config.seed)
        self.n_agents = config.n_agents
        self.batch_size = config.batch_size
        self.tau = config.tau
        self.gamma = config.gamma
        self.device = config.device
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config).to(config.device)
        self.actor_target = Actor(config).to(config.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config).to(config.device)
        self.critic_target = Critic(config).to(config.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic)

        # Noise process
        self.noise = OUNoise(config)

        # Replay memory
        self.memory = ReplayBuffer(config)
        #self.timesteps = 0

    def act(self, states, epsilon, add_noise=True):
        """ Given a list of states for each agent it returns the actions to be
        taken by each agent based on the current policy.
        Returns a numpy array of shape [n_agents, n_actions]
        NOTE: clips actions to be between -1, 1
        Args:
            states:    (torch) states
            epsilon: (float)
            add_noise: (bool) add noise to the actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.n_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ reset noise"""
        self.noise.reset()

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        actor_target(state) -> action
        critic_target(state, action) -> Q-value
        """
        if self.batch_size > self.memory.size():
            return
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---------------------------- update critic ----------------------------

        # Get predicted next-state actions and Q values from target model

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
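        # y_i = r + gamma * Q_target(s', mu_target(s')) * (1 - done);
        # the (1 - dones) factor zeroes the bootstrap term at terminal states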
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Args:
         param1: (torch network) local_model
         param2: (torch network) target_model
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
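
The Actor, Critic, OUNoise, and ReplayBuffer classes are defined elsewhere. As a rough illustration, a replay buffer compatible with the size() / sample() interface this agent relies on could look like the sketch below; the add() signature, the config.buffer_size field, and the exact tensor shapes are assumptions rather than the original implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer sketch; sample() returns torch tensors as DDPGAgent.learn() expects."""

    def __init__(self, config):
        self.memory = deque(maxlen=config.buffer_size)  # assumed config field
        self.batch_size = config.batch_size
        self.device = config.device

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def size(self):
        return len(self.memory)

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        to_tensor = lambda x: torch.from_numpy(np.vstack(x)).float().to(self.device)
        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([np.uint8(e.done) for e in batch])
        return states, actions, rewards, next_states, dones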
Example #3
class NECAgent:
    """
    NEC agent
    """
    def __init__(self, config):
        self.nec_net = NEC(config).to(config['device'])
        self.train_eps = config['train_eps']
        self.eval_eps = config['eval_eps']
        self.num_actions = config['num_actions']
        self.replay_buffer = ReplayBuffer(config['observation_shape'],
                                          config['replay_buffer_size'])
        self.batch_size = config['batch_size']
        self.discount = config['discount']
        self.n_step_horizon = config['horizon']
        self.episode = 0
        self.logger = ScoreLogger(config['env_name'], config['exp_name'])
        self.env_name = config['env_name']
        self.exp_name = config['exp_name']
        self.device = config['device']
        self.train()

        # make sure model is on appropriate device at this point before constructing optimizer
        self.optimizer = RMSprop(self.nec_net.parameters(),
                                 lr=config['learning_rate'],
                                 alpha=config['rmsprop_alpha'],
                                 eps=config['rmsprop_epsilon'])
        self.loss_fn = MSELoss()

    def train(self):
        self.training = True
        self.nec_net.train()

    def eval(self):
        self.training = False
        self.nec_net.eval()

    def new_episode(self):
        # trackers for computing N-step returns and updating replay and dnd memories at the end of episode
        self.observations, self.keys, self.actions, self.values, self.rewards = [], [], [], [], []
        self.episode += 1

    def set_epsilon(self, eps):
        self.train_eps = eps

    def step(self, obs):
        q_values, key = self.nec_net.lookup(obs)

        eps = self.train_eps if self.training else self.eval_eps

        # epsilon-greedy action selection
        if np.random.rand() < eps:
            action = np.random.choice(self.num_actions)
        else:
            action = _argmax(q_values)

        # update trackers
        if self.training:
            self.actions.append(action)
            self.observations.append(obs)
            self.keys.append(key)
            self.values.append(np.max(q_values))

        return action

    def update(self, consequence):
        """
        Called from main training loop to inform agent of consequence of last action including
        reward and if the episode terminated
        """
        reward, done = consequence

        if self.env_name.startswith("CartPole"):
            reward = reward if not done else -reward

        # update reward tracker
        self.rewards.append(reward)

        if done:
            episode_length = len(self.actions)

            # compute N-step returns in reverse order
            returns = [None] * (episode_length + 1)
            n_step_returns = [None] * episode_length
            returns[episode_length] = 0
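            # returns[t] is the full discounted Monte Carlo return from step t;
            # n_step_returns[t] swaps the tail beyond the horizon for the stored value estimate:
            #   G_t^(N) = sum_{i=0..N-1} gamma^i * r_{t+i} + gamma^N * V(s_{t+N})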

            for t in range(episode_length - 1, -1, -1):
                returns[t] = self.rewards[t] + self.discount * returns[t + 1]
                if episode_length - t > self.n_step_horizon:
                    n_step_returns[t] = (
                        returns[t]
                        + self.discount ** self.n_step_horizon
                        * (self.values[t + self.n_step_horizon]
                           - returns[t + self.n_step_horizon]))
                else:  # use on-policy Monte Carlo returns when below the horizon
                    n_step_returns[t] = returns[t]

            # stack keys / convert returns to an array for fancy indexing below
            self.keys = torch.stack(self.keys)
            n_step_returns = np.array(n_step_returns, dtype=np.float32)

            # batch update of replay memory
            self.replay_buffer.append_batch(
                np.stack(self.observations),
                np.asarray(self.actions, dtype=np.int64), n_step_returns)

            # batch update of episodic memories
            unique_actions = np.unique(self.actions)
            for action in unique_actions:
                action_idxs = np.nonzero(np.asarray(self.actions) == action)[0]
                self.nec_net.update_memory(action, self.keys[action_idxs],
                                           n_step_returns[action_idxs])

            # log the episode score; add_score reports whether the environment is considered solved
            solved = self.logger.add_score(sum(self.rewards), self.episode)
            if solved:
                path = f'{os.getcwd()}/cartpole/trained_agents/nec_{self.exp_name}.pth'
                torch.save(self.nec_net.state_dict(), path)
                return True

        return False

    def optimize(self):
        """
        Here, we sample from the replay buffer and train the NEC model end-to-end with backprop
        """
        if self.replay_buffer.size() < self.batch_size:
            return

        observations, actions, returns = self.replay_buffer.sample(
            self.batch_size)
        self.optimizer.zero_grad()
        q_values = self.nec_net(observations.to(self.device))[range(
            self.batch_size), actions]  # pick q_values for chosen actions
        loss = self.loss_fn(q_values, returns.to(self.device))
        loss.backward()
        self.optimizer.step()

    def get_q_values(self, observations, actions):
        """
        Computes q_values for observation, action pairs passed in.

        Used for testing
        """
        with torch.no_grad():
            self.eval()
            observations = torch.from_numpy(observations)
            q_values = self.nec_net(observations)[range(len(actions)), actions]

            return q_values.numpy()
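
The _argmax helper used in step() above is not defined in this example. A plausible sketch (an assumption, not the original implementation) that breaks ties uniformly at random:

import numpy as np

def _argmax(q_values):
    # hypothetical helper: greedy action with uniform random tie-breaking;
    # assumes q_values is a 1-D NumPy array (or a CPU tensor convertible via np.asarray)
    q_values = np.asarray(q_values)
    best = np.flatnonzero(q_values == q_values.max())
    return np.random.choice(best)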
Example #4
def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.shape[0])

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    totSteps = 0

    # Store q values for illustration purposes
    q_max_array = []

    actor.learning_rate = MAX_ACTOR_LEARNING_RATE
    critic.learning_rate = MAX_CRITIC_LEARNING_RATE

    for i in range(MAX_EPISODES):

        s = env.reset()
        s = normalize(s)

        ep_reward = 0
        ep_ave_max_q = 0

        # update learning rates using cosine annealing
        T_cur = i % LR_CYCLE
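        # SGDR-style schedule: lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_cur / LR_CYCLE)),
        # with a warm restart every LR_CYCLE episodes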
        actor.learning_rate = (MIN_ACTOR_LEARNING_RATE +
                               0.5 * (MAX_ACTOR_LEARNING_RATE - MIN_ACTOR_LEARNING_RATE) *
                               (1 + np.cos(np.pi * T_cur / LR_CYCLE)))

        critic.learning_rate = (MIN_CRITIC_LEARNING_RATE +
                                0.5 * (MAX_CRITIC_LEARNING_RATE - MIN_CRITIC_LEARNING_RATE) *
                                (1 + np.cos(np.pi * T_cur / LR_CYCLE)))

        for j in range(MAX_EP_STEPS):

            totSteps += 1

            # Begin "Experimentation and Evaluation Phase"

            # Select next experimental action by adding noise to action prescribed by policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim, 1)))

            # If in a testing episode, do not add noise
            if i < EXPLORATION_SIZE and not (i % 100 == 49 or i % 100 == 99):
                noise = exploration_noise.noise()
                a = a + noise

            # Constrain action
            a = np.clip(a, -15, 15)

            # Take step with experimental action
            s2, r, terminal, info = env.step(
                np.reshape(a.T, newshape=(env.action_space.shape[0], )),
                CONST_THROTTLE)

            #print("car pos: " + str(env.car_dist_s))
            #print("action: " + str(a))
            #print("reward: " + str(r))

            s2 = normalize(s2)

            # Add transition to replay buffer if not testing episode
            if i % 100 != 49 and i % 100 != 99:
                replay_buffer.add(np.reshape(s, (actor.s_dim, 1)),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, 1)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MEMORY_WARMUP:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        MINIBATCH_SIZE)

                    # Find target estimate to use for updating the Q-function

                    # predict_target determines the Q-value of the next state
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    # Complete target estimate (R(t+1) + Q(s(t+1),a(t+1)))
                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Perform gradient descent to update critic
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                    ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                    # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            s = s2
            ep_reward += r

            # If episode is finished, print results
            if terminal:

                if i % 100 == 49 or i % 100 == 99:
                    print("Testing")

                    kmodel = Sequential()
                    actVars = []
                    for var in tf.trainable_variables():
                        if 'non-target' in str(var):
                            actVars.append(var)

                    kmodel.add(
                        Dense(units=l1size,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[0]),
                                  sess.run(actVars[1])
                              ],
                              input_dim=actor.s_dim))
                    kmodel.add(
                        Dense(units=l2size,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[2]),
                                  sess.run(actVars[3])
                              ]))
                    kmodel.add(
                        Dense(units=1,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[4]),
                                  sess.run(actVars[5])
                              ]))
                    optimizer = optimizers.RMSprop(lr=0.00025,
                                                   rho=0.9,
                                                   epsilon=1e-06)
                    kmodel.compile(loss="mse", optimizer=optimizer)
                    kmodel.save(modelfile)

                else:
                    print("Training")

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))

                print('Finished in ' + str(j) + ' steps')

                break

    plt.plot(q_max_array)
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.show()

    kmodel = Sequential()
    actVars = []
    for var in tf.trainable_variables():
        if 'non-target' in str(var):
            actVars.append(var)

    kmodel.add(
        Dense(units=l1size,
              activation='tanh',
              weights=[sess.run(actVars[0]),
                       sess.run(actVars[1])],
              input_dim=actor.s_dim))
    kmodel.add(
        Dense(units=l2size,
              activation='tanh',
              weights=[sess.run(actVars[2]),
                       sess.run(actVars[3])]))
    kmodel.add(
        Dense(units=1,
              activation='tanh',
              weights=[sess.run(actVars[4]),
                       sess.run(actVars[5])]))
    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
    kmodel.compile(loss="mse", optimizer=optimizer)
    kmodel.summary()
    kmodel.save(modelfile)
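
The export above copies the trained actor's weights into a Keras model saved at modelfile. A hypothetical way to reload that model for noise-free inference, assuming the env, actor, normalize, modelfile, and CONST_THROTTLE names from this example are still in scope, might be:

from keras.models import load_model

policy = load_model(modelfile)

s = normalize(env.reset())
terminal = False
while not terminal:
    # the Keras model expects a flat (1, s_dim) input rather than the (1, s_dim, 1) used by the TF actor
    a = policy.predict(np.reshape(s, (1, actor.s_dim)))
    a = np.clip(a, -15, 15)  # same action bound as during training
    s2, r, terminal, info = env.step(
        np.reshape(a.T, newshape=(env.action_space.shape[0], )), CONST_THROTTLE)
    s = normalize(s2)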