Example #1
class DDPGController(object):
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # Reshape actions to (BATCH_SIZE, action_dim); needed when action_dim == 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate the TD targets: y_i = r_i + GAMMA * Q'(s_{i+1}, mu'(s_{i+1})),
        # or y_i = r_i for terminal transitions

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training only once the buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # Reshape actions to (BATCH_SIZE, action_dim); needed when action_dim == 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch,
                                       [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess,
                              path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'model.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded at " + path)
        pass
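
For comparison, a minimal outer training loop for this controller might look like the sketch below. It assumes an environment object exposing state_dim/action_dim attributes plus reset() and step() methods; the step() return signature and the episode/step counts are illustrative assumptions, not part of the original example.

# Hypothetical usage sketch; env, episodes and max_steps are assumptions.
def run_training(env, episodes=1000, max_steps=200):
    agent = DDPGController(env)
    for episode in range(episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state)            # policy output + OU noise
            next_state, reward, done = env.step(action)   # assumed step() signature
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
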
Example #2
class Agent_DDPG(object):
    def __init__(
        self,
        action_size,
        state_size,
        action_limit,
    ):
        self.memory_size = 10000
        self.replayBuffer = ReplayBuffer(self.memory_size)
        self.sess = tf.Session()

        self.discount_factor = 0.9
        self.action_variance = 3
        self.critic_learning_rate = 0.001
        self.actor_learning_rate = 0.002
        self.batch_size = 32

        self.action_size = action_size
        self.state_size = state_size
        self.action_limit = action_limit
        self.input_state = tf.placeholder(tf.float32, [None, state_size], 's')
        self.input_state_ = tf.placeholder(tf.float32, [None, state_size],
                                           's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.input_state,
                                              scope='eval',
                                              trainable=True)
            a_ = self.build_actor_network(self.input_state_,
                                          scope='tar',
                                          trainable=False)
        with tf.variable_scope('Critic'):
            q_eval = self.build_critic_network(self.input_state,
                                               self.a,
                                               scope='eval',
                                               trainable=True)
            # Scope 'tar' must match the 'Critic/tar' variable lookup below.
            q_target = self.build_critic_network(self.input_state_,
                                                 a_,
                                                 scope='tar',
                                                 trainable=False)

        self.actor_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.actor_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/tar')
        self.critic_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.critic_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/tar')

        # Soft (Polyak) update of the target networks: t <- (1 - tau) * t + tau * e, with tau = 0.01
        self.replace = [
            tf.assign(t, (1 - 0.01) * t + 0.01 * e) for t, e in zip(
                self.actor_target_params +
                self.critic_target_params, self.actor_evaluation_params +
                self.critic_evaluation_params)
        ]
        # Deterministic policy gradient (DDPG paper):
        #   dJ/dtheta = E[ dQ(s, a)/da * da/dtheta ]
        # The actor loss is the mean Q value of the evaluation critic (see the paper);
        # maximizing it is implemented here by minimizing with a negated learning rate.
        self.a_loss = tf.reduce_mean(q_eval)  # maximize the mean Q
        self.atrain = tf.train.AdamOptimizer(
            -self.actor_learning_rate).minimize(
                self.a_loss, var_list=self.actor_evaluation_params)

        # When self.ctrain is run, the batch actions are fed directly into self.a
        # (a graph tensor, not a placeholder), overriding the actor's output.
        # The critic is trained on (s, a, r, s_): the Bellman target is the "true"
        # label and the evaluation critic's output is the prediction.
        # True label:  y = r(s, a) + gamma * Q'(s_, mu'(s_))
        q_true = self.R + self.discount_factor * q_target

        # Prediction: Q = q_eval.
        # Computing the MSE loss requires feeding self.input_state (for q_eval)
        # as well as self.R and self.input_state_ (for q_true via q_target).
        self.mseloss = tf.losses.mean_squared_error(labels=q_true,
                                                    predictions=q_eval)
        # This loss updates only the critic, so var_list is restricted to the
        # critic evaluation variables.
        self.ctrain = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(
                self.mseloss, var_list=self.critic_evaluation_params)

        # Always initialize the variables after building the graph.
        self.sess.run(tf.global_variables_initializer())

        self.actor_loss_history = []
        self.critic_loss_history = []

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def choose_action(self, s):
        # Gaussian exploration around the deterministic action, clipped to the
        # action bounds (the original hardcoded [-2, 2], Pendulum's action range).
        return np.clip(
            np.random.normal(
                self.sess.run(self.a, {self.input_state: s[np.newaxis, :]})[0],
                self.action_variance), -self.action_limit, self.action_limit)

    def learn(self):
        if self.replayBuffer.count() > self.batch_size:
            # Decay exploration noise and soft-update the target networks.
            self.action_variance *= .9995
            self.sess.run(self.replace)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([[x[2]] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            actor_loss, _ = self.sess.run([self.a_loss, self.atrain],
                                          {self.input_state: batch_s})
            critic_loss, _ = self.sess.run(
                [self.mseloss, self.ctrain], {
                    self.input_state: batch_s,
                    self.a: batch_a,
                    self.R: batch_r,
                    self.input_state_: batch_s_
                })

            self.actor_loss_history.append(actor_loss)
            self.critic_loss_history.append(critic_loss)

    def build_actor_network(self, s, scope, trainable):
        actor_hidden_size = 30
        with tf.variable_scope(scope):
            hidden1 = tf.layers.dense(s,
                                      actor_hidden_size,
                                      activation=tf.nn.relu,
                                      name='l1',
                                      trainable=trainable)
            a = tf.layers.dense(hidden1,
                                self.action_size,
                                activation=tf.nn.tanh,
                                name='a',
                                trainable=trainable)
            return tf.multiply(a, self.action_limit, name='scaled_a')

    def build_critic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            critic_hidden_size = 30
            # Q(s, a): separate linear layers for state and action plus a shared bias.
            hidden1 = tf.nn.relu(
                tf.layers.dense(s, critic_hidden_size, name='s1', trainable=trainable)
                + tf.layers.dense(a, critic_hidden_size, name='a1', trainable=trainable)
                + tf.get_variable('b1', [1, critic_hidden_size], trainable=trainable))
            return tf.layers.dense(hidden1, 1, trainable=trainable)

    def plot_loss(self):
        plt.title('history', fontsize=25)
        ms = 0.1
        me = 1
        line_width = 0.1
        plt.ylabel('Loss')
        plt.xlabel('Training steps')

        # Normalize each curve by its mean so both losses fit on one scale
        # (convert to arrays first; in-place division is undefined for Python lists).
        self.actor_loss_history = np.asarray(self.actor_loss_history, dtype=np.float64)
        self.actor_loss_history /= self.actor_loss_history.mean()
        self.critic_loss_history = np.asarray(self.critic_loss_history, dtype=np.float64)
        self.critic_loss_history /= self.critic_loss_history.mean()

        plt.plot(np.arange(len(self.actor_loss_history)),
                 self.actor_loss_history,
                 '-p',
                 color='b',
                 markevery=me,
                 label=r'actor loss',
                 lw=line_width,
                 markersize=ms)
        plt.plot(np.arange(len(self.critic_loss_history)),
                 self.critic_loss_history,
                 '--^',
                 color='r',
                 markevery=me,
                 label=r'critic loss',
                 lw=line_width,
                 markersize=ms)

        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 10)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()
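
As with the first example, this agent is meant to be driven by an outer episode loop. The sketch below is a minimal illustration assuming the classic-control Pendulum environment and the pre-0.26 gym API, where reset() returns the observation and step() returns (obs, reward, done, info); the environment name and loop lengths are assumptions, not part of the original example.

# Hypothetical usage sketch; the environment and loop lengths are assumptions.
import gym

env = gym.make('Pendulum-v1')
agent = Agent_DDPG(action_size=env.action_space.shape[0],
                   state_size=env.observation_space.shape[0],
                   action_limit=env.action_space.high[0])

reward_history = []
for episode in range(200):
    s, episode_reward = env.reset(), 0.0
    for _ in range(200):
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)   # old gym API: (obs, reward, done, info)
        agent.store_transition(s, a, r, s_)
        agent.learn()
        s, episode_reward = s_, episode_reward + r
    reward_history.append(episode_reward)

agent.plot_reward(reward_history)
agent.plot_loss()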