Example 1
# Assumes TensorFlow 1.x-style APIs (tf.placeholder, tf.Session, tf.layers),
# plus a project-provided ReplayBuffer (a minimal sketch follows this example).
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


class Agent_DQN:
    def __init__(self,
                 action_size,
                 state_size,
                 learning_rate=0.01,
                 discount_factor=0.9,
                 epsilon_initial=1,
                 epsilon_decay=0.995,
                 batch_size=32):

        # Store the various settings on the instance.
        self.action_size = action_size
        self.state_size = state_size

        # Define the learning rate and the global step.
        self.global_step = tf.Variable(0, trainable=False)

        # decayed_learning_rate = learning_rate *
        #                         decay_rate ^ (global_step / decay_steps)
        self.learning_rate = tf.train.exponential_decay(learning_rate,
                                                        self.global_step,
                                                        100,
                                                        0.9999,
                                                        staircase=False,
                                                        name='learning_rate')

        # Set the discount factor.
        self.gamma = discount_factor

        # Exploration is epsilon-greedy, so set epsilon and its decay rate.
        self.epsilon = epsilon_initial
        self.epsilon_decay = epsilon_decay

        # Set the batch size.
        self.batch_size = batch_size
        self.learning_iteration = 0

        # Define the replay memory; it has to hold (s, a, r, s_) transitions.
        self.memory_size = 2000
        self.replayBuffer = ReplayBuffer(self.memory_size)

        # Build two networks; one of them is used as the fixed Q-target network.
        self.build_evaluation_network()
        self.build_target_network()

        # Gather the parameters of the target net and the eval net; the enclosing
        # variable scopes ('tn', 'en') make them easy to collect.
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='tn')
        self.t_params = t_params
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='en')

        # tf.assign writes one value into a variable; here it soft-updates the
        # target net: target <- 0.97 * target + 0.03 * eval.
        self.replace_target_op = [
            tf.assign(t, (1 - 0.03) * t + 0.03 * e)
            for t, e in zip(t_params, e_params)
        ]

        # Create the session.
        self.sess = tf.Session()

        # Run the variable initializer.
        self.sess.run(tf.global_variables_initializer())
        self.loss_history = []
        self.learning_rate_history = []

    def build_evaluation_network(self):
        '''
        Unlike the target net, the eval net also needs the ops that compute the loss.
        The target net exists only to provide the fixed Q-target and is never updated
        directly, so only the eval net is built with trainable=True.
        '''
        # Placeholder for the data fed into the evaluation net.
        self.eval_input = tf.placeholder(tf.float32, [None, self.state_size],
                                         name='eval_input')

        # self.y and self.a are placeholders used to compute the loss.
        self.y = tf.placeholder(tf.float32, [None], name='Q_target')
        self.a = tf.placeholder(tf.int64, [None], name='action')

        # The network itself
        with tf.variable_scope('en'):
            hidden1 = tf.layers.dense(
                self.eval_input,
                10,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer1',
                trainable=True)
            self.q_eval = tf.layers.dense(
                hidden1,
                self.action_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer2',
                trainable=True)

        # Compute the loss
        with tf.variable_scope('loss'):
            self.a_one_hot = tf.one_hot(self.a, depth=self.action_size)
            self.q_predict = tf.reduce_sum(tf.multiply(self.q_eval,
                                                       self.a_one_hot),
                                           axis=1)
            self.loss = tf.reduce_mean(
                tf.squared_difference(self.y, self.q_predict))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.learning_rate)\
                .minimize(self.loss, global_step=self.global_step)

    def build_target_network(self):
        self.target_input = tf.placeholder(tf.float32, [None, self.state_size],
                                           name='target_input')
        with tf.variable_scope('tn'):
            hidden1 = tf.layers.dense(
                self.target_input,
                10,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer1',
                trainable=False)
            self.get_q_target = tf.layers.dense(
                hidden1,
                self.action_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer2',
                trainable=False)

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def get_action(self, observation):
        '''
        Observation components (CartPole):
          x         : cart position
          dx/dt     : cart velocity
          theta     : pole angle
          dtheta/dt : pole angular velocity
        Depending on epsilon, return either the network's greedy action or a random one.
        '''
        if np.random.uniform() > self.epsilon:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.eval_input: [observation]})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.action_size)
        return action

    def learn(self):
        '''
        Performs one update step of the neural network.
        '''
        # Learn only once the memory has been sufficiently filled; otherwise skip the update.
        if self.learning_iteration >= self.memory_size:
            # Softly blend the eval net parameters into the fixed Q-target net.
            self.sess.run(self.replace_target_op)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([x[2] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            # q_eval gives the current Q-values; get_q_target gives the Q-values
            # inside the max of the target.
            get_q_target, q_eval = self.sess.run(
                [self.get_q_target, self.q_eval],
                feed_dict={
                    self.target_input: batch_s_,  # fixed params
                    self.eval_input: batch_s,  # newest params
                })

            # The actions are taken straight from the sampled batch.
            a = batch_a
            # The rewards likewise.
            reward = batch_r
            # Build the value fed into the self.y placeholder from the quantities above.
            _, self.loss_out = self.sess.run(
                [self._train_op, self.loss],
                feed_dict={
                    self.eval_input: batch_s,
                    self.y: reward + self.gamma * np.max(get_q_target, axis=1),
                    self.a: a
                })
            self.loss_history.append(self.loss_out)

            # Decay epsilon so that epsilon-greedy exploration gradually shrinks.
            self.epsilon = self.epsilon * self.epsilon_decay

        # Count the iterations and record the current learning rate for plotting later.
        self.learning_iteration += 1
        self.learning_rate_history.append(self.sess.run(self.learning_rate))

    def plot_loss(self):
        # Matplotlib can render this plot with the Times New Roman typeface.
        plt.title('History')
        ms = 0.1
        me = 1
        line_width = 0.5
        plt.ylabel('Loss')
        plt.xlabel('Training steps')
        plt.plot(np.arange(len(self.loss_history)),
                 self.loss_history,
                 '--^',
                 color='r',
                 markevery=me,
                 label=r'critic loss',
                 lw=line_width,
                 markersize=ms)
        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 2)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.grid()
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.show()
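
All three examples construct a ReplayBuffer that is not shown on this page. The minimal sketch below is an assumption reconstructed from how the buffer is used (add, get_batch, count), not the original implementation; note that Agent_DQN stores 4-tuples while the DDPG examples store 5-tuples, so the buffer simply keeps whatever transition it is given.

import random
from collections import deque


class ReplayBuffer:
    """Minimal FIFO replay buffer (sketch, not the original implementation)."""

    def __init__(self, buffer_size):
        # deque(maxlen=...) silently discards the oldest transition once full.
        self.buffer = deque(maxlen=buffer_size)

    def add(self, *transition):
        # Keeps whatever tuple the agent passes, e.g. (s, a, r, s_) or (s, a, r, s_, done).
        self.buffer.append(transition)

    def get_batch(self, batch_size):
        # Uniform random minibatch; callers only sample once enough transitions exist.
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)
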
Example 2
# Assumes TensorFlow 1.x and NumPy, plus project-provided helpers not shown
# here: ActorNetwork, CriticNetwork, ReplayBuffer, OUNoise, and the constants
# REPLAY_BUFFER_SIZE, BATCH_SIZE, GAMMA and REPLAY_START_SIZE.
import numpy as np
import tensorflow as tf


class DDPGController(object):
    """DDPG agent: actor and critic networks with targets, a replay buffer,
    and Ornstein-Uhlenbeck exploration noise."""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the buffer exceeds REPLAY_START_SIZE, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch,
                                       [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess,
                              path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'model.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded at " + path)
        pass
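
As a usage reference, here is a hypothetical driver loop for DDPGController. The environment wrapper (exposing state_dim, action_dim, reset() and a step() returning next_state, reward and done) and the episode/step counts are assumptions for illustration, not part of the original project.

def run_ddpg_controller(env, num_episodes=1000, max_steps=200):
    # 'env' is an assumed wrapper: reset() -> state, step(action) -> (next_state, reward, done).
    agent = DDPGController(env)
    for episode in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state)  # policy output plus OU exploration noise
            next_state, reward, done = env.step(action)
            # perceive() stores the transition and trains once REPLAY_START_SIZE is exceeded.
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
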
Example 3
# Assumes TensorFlow 1.x APIs and the same project-provided ReplayBuffer
# (see the minimal sketch after Example 1).
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


class Agent_DDPG(object):
    def __init__(
        self,
        action_size,
        state_size,
        action_limit,
    ):
        self.memory_size = 10000
        self.replayBuffer = ReplayBuffer(self.memory_size)
        self.sess = tf.Session()

        self.discount_factor = 0.9
        self.action_variance = 3
        self.critic_learning_rate = 0.001
        self.actor_learning_rate = 0.002
        self.batch_size = 32

        self.action_size, self.state_size, self.action_limit = \
            action_size, state_size, action_limit
        self.input_state = tf.placeholder(tf.float32, [None, state_size], 's')
        self.input_state_ = tf.placeholder(tf.float32, [None, state_size],
                                           's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.input_state,
                                              scope='eval',
                                              trainable=True)
            a_ = self.build_actor_network(self.input_state_,
                                          scope='tar',
                                          trainable=False)
        with tf.variable_scope('Critic'):
            q_eval = self.build_critic_network(self.input_state,
                                               self.a,
                                               scope='eval',
                                               trainable=True)
            # NOTE: this scope must match the 'Critic/tar' collection gathered below.
            q_target = self.build_critic_network(self.input_state_,
                                                 a_,
                                                 scope='tar',
                                                 trainable=False)

        self.actor_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.actor_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/tar')
        self.critic_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.critic_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/tar')

        self.replace = [
            tf.assign(t, (1 - 0.01) * t + 0.01 * e) for t, e in zip(
                self.actor_target_params +
                self.critic_target_params, self.actor_evaluation_params +
                self.critic_evaluation_params)
        ]
        '''
        Deterministic policy gradient:  dJ/dtheta = E[ dQ(s, mu(s)) / dtheta ]
        '''
        # The actor loss is the Q-value coming down from the critic, which should be
        # maximized (see the DDPG paper).
        self.a_loss = tf.reduce_mean(q_eval)  # maximize the Q-value
        # Because we maximize Q, the learning rate is negated so that minimize()
        # effectively performs gradient ascent.
        self.atrain = tf.train.AdamOptimizer(
            -self.actor_learning_rate).minimize(
                tf.reduce_mean(q_eval), var_list=self.actor_evaluation_params)

        # When self.ctrain is run, the batch actions are fed into self.a directly;
        # self.a is the actor output rather than a placeholder, but feed_dict overrides it.
        # The critic is updated from (s, a, r, s_): the bootstrapped target y is the
        # "true" label, and the value produced by the network is our prediction.
        # True label:  y = r(s, u_t(s)) + gamma * Q(s_, u_t(s_))
        q_true = self.R + self.discount_factor * q_target

        # Prediction: Q = q_eval.
        # Computing the MSE loss needs q_eval, so self.input_state must be fed;
        # it also needs q_true, so self.R and self.input_state_ (for q_target) are fed too.
        self.mseloss = tf.losses.mean_squared_error(labels=q_true,
                                                    predictions=q_eval)
        # This loss updates only the critic, so var_list is restricted to the
        # critic evaluation network.
        self.ctrain = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(
                self.mseloss, var_list=self.critic_evaluation_params)

        # Always initialize the variables after building the networks.
        self.sess.run(tf.global_variables_initializer())

        self.actor_loss_history = []
        self.critic_loss_history = []

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def choose_action(self, s):
        # Add Gaussian exploration noise and clip to the valid action range.
        return np.clip(
            np.random.normal(
                self.sess.run(self.a, {self.input_state: s[np.newaxis, :]})[0],
                self.action_variance), -self.action_limit, self.action_limit)

    def learn(self):
        if self.replayBuffer.count() > self.batch_size:
            self.action_variance *= .9995
            self.sess.run(self.replace)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([[x[2]] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            actor_loss, _ = self.sess.run([self.a_loss, self.atrain],
                                          {self.input_state: batch_s})
            critic_loss, _ = self.sess.run(
                [self.mseloss, self.ctrain], {
                    self.input_state: batch_s,
                    self.a: batch_a,
                    self.R: batch_r,
                    self.input_state_: batch_s_
                })

            self.actor_loss_history.append(actor_loss)
            self.critic_loss_history.append(critic_loss)

    def build_actor_network(self, s, scope, trainable):
        actor_hidden_size = 30
        with tf.variable_scope(scope):
            hidden1 = tf.layers.dense(s,
                                      actor_hidden_size,
                                      activation=tf.nn.relu,
                                      name='l1',
                                      trainable=trainable)
            a = tf.layers.dense(hidden1,
                                self.action_size,
                                activation=tf.nn.tanh,
                                name='a',
                                trainable=trainable)
            return tf.multiply(a, self.action_limit, name='scaled_a')

    def build_critic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            critic_hidden_size = 30
            # Project the state branch and the action branch separately, then sum them.
            hidden1 = tf.nn.relu(
                tf.layers.dense(s, critic_hidden_size, name='s1', trainable=trainable) +
                tf.layers.dense(a, critic_hidden_size, name='a1', trainable=trainable) +
                tf.get_variable('b1', [1, critic_hidden_size], trainable=trainable))
            return tf.layers.dense(hidden1, 1, trainable=trainable)

    def plot_loss(self):
        plt.title('history', fontsize=25)
        ms = 0.1
        me = 1
        line_width = 0.1
        plt.ylabel('Loss')
        plt.xlabel('Training steps')

        # Normalize each history by its mean so both curves fit on the same axis.
        actor_loss_mean = sum(self.actor_loss_history) / len(self.actor_loss_history)
        self.actor_loss_history = np.asarray(self.actor_loss_history) / actor_loss_mean
        critic_loss_mean = sum(self.critic_loss_history) / len(self.critic_loss_history)
        self.critic_loss_history = np.asarray(self.critic_loss_history) / critic_loss_mean

        plt.plot(np.arange(len(self.actor_loss_history)),
                 self.actor_loss_history,
                 '-p',
                 color='b',
                 markevery=me,
                 label=r'actor loss',
                 lw=line_width,
                 markersize=ms)
        plt.plot(np.arange(len(self.critic_loss_history)),
                 self.critic_loss_history,
                 '--^',
                 color='r',
                 markevery=me,
                 label=r'critic loss',
                 lw=line_width,
                 markersize=ms)

        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 10)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()
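
Finally, a minimal training loop for Agent_DDPG, sketched under the assumption of Gym's classic Pendulum task (3-dimensional state, 1-dimensional action bounded at ±2, matching the action_limit above, and the old 4-tuple step API); the environment id and episode counts are illustrative only.

import gym  # assumed dependency for this sketch only

env = gym.make('Pendulum-v0')  # assumed environment; matches action_limit = 2
agent = Agent_DDPG(action_size=1, state_size=3, action_limit=2.0)

reward_history = []
for episode in range(200):
    s = env.reset()
    episode_reward = 0.0
    for _ in range(200):
        a = agent.choose_action(s)   # noisy action, clipped to +/- action_limit
        s_, r, done, _ = env.step(a)
        agent.store_transition(s, a, r, s_)
        agent.learn()                # no-op until the buffer holds more than one batch
        s = s_
        episode_reward += r
        if done:
            break
    reward_history.append(episode_reward)

agent.plot_loss()
agent.plot_reward(reward_history)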