import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# ActorNetwork, CriticNetwork, ReplayBuffer, OUNoise and the hyperparameters
# REPLAY_BUFFER_SIZE, REPLAY_START_SIZE, BATCH_SIZE and GAMMA are assumed to be
# defined elsewhere in this repo; the code targets the TensorFlow 1.x API.


class DDPGController(object):
    """DDPG controller: actor-critic networks, replay buffer and OU exploration noise."""

    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # together with their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # Initialize the replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch from the target networks
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update the critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled policy gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        # Supervised pre-training from labeled (state, action, action_label, value_label, done) tuples
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch,
                                       [BATCH_SIZE, self.action_dim])

        # Calculate y_batch from the value labels
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update the critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess,
                              path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'DDPGControllerModel.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded from " + path)
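

# Usage sketch (illustrative, not part of the original repo): a minimal episode loop
# driving DDPGController. It assumes a gym-style environment wrapper that, like the
# one this class expects, exposes `state_dim`/`action_dim` attributes and the classic
# reset()/step() interface; the function name and episode/step counts are hypothetical.
def run_ddpg_controller(env, num_episodes=1000, max_steps=200):
    agent = DDPGController(env)
    for _ in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state)                        # exploratory action
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)   # store transition + train
            state = next_state
            if done:
                break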
class Agent_DDPG(object):
    def __init__(self, action_size, state_size, action_limit):
        self.memory_size = 10000
        self.replayBuffer = ReplayBuffer(self.memory_size)
        self.sess = tf.Session()
        self.discount_factor = 0.9
        self.action_variance = 3
        self.critic_learning_rate = 0.001
        self.actor_learning_rate = 0.002
        self.batch_size = 32
        self.action_size, self.state_size, self.action_limit = \
            action_size, state_size, action_limit

        self.input_state = tf.placeholder(tf.float32, [None, state_size], 's')
        self.input_state_ = tf.placeholder(tf.float32, [None, state_size], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.input_state,
                                              scope='eval',
                                              trainable=True)
            a_ = self.build_actor_network(self.input_state_,
                                          scope='tar',
                                          trainable=False)
        with tf.variable_scope('Critic'):
            q_eval = self.build_critic_network(self.input_state,
                                               self.a,
                                               scope='eval',
                                               trainable=True)
            q_target = self.build_critic_network(self.input_state_,
                                                 a_,
                                                 scope='tar',
                                                 trainable=False)

        self.actor_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.actor_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/tar')
        self.critic_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.critic_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/tar')

        # Soft target update: theta_target <- (1 - tau) * theta_target + tau * theta_eval, tau = 0.01
        self.replace = [
            tf.assign(t, (1 - 0.01) * t + 0.01 * e) for t, e in zip(
                self.actor_target_params + self.critic_target_params,
                self.actor_evaluation_params + self.critic_evaluation_params)
        ]

        # dJ/dtheta = E[ dQ/dtheta ]
        # The actor loss simply maximizes the value coming down from Q (see the DDPG paper).
        self.a_loss = tf.reduce_mean(q_eval)  # maximize the Q-value
        # Because we want to maximize Q, the learning rate is negated so that
        # Adam's "minimize" step performs gradient ascent.
        self.atrain = tf.train.AdamOptimizer(
            -self.actor_learning_rate).minimize(
                self.a_loss, var_list=self.actor_evaluation_params)

        # When self.ctrain is run, the batch actions are fed directly into self.a
        # rather than into a separate placeholder.
        # The critic is updated from (s, a, r, s_): the bootstrapped target y is the
        # true label and the network output is our prediction.
        # True label: y = r(s, u_t(s)) + gamma * Q(s_, u_t(s_))
        q_true = self.R + self.discount_factor * q_target
        # Prediction: Q = q_eval
        # Computing the MSE loss needs q_eval, so self.input_state must be fed;
        # computing q_true also needs self.R and self.input_state_ (which feeds q_target).
        self.mseloss = tf.losses.mean_squared_error(labels=q_true,
                                                    predictions=q_eval)
        # This loss only updates the critic network, so var_list is restricted to
        # the critic evaluation parameters.
        self.ctrain = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(
                self.mseloss, var_list=self.critic_evaluation_params)

        # After building the graph, always initialize the variables.
        self.sess.run(tf.global_variables_initializer())

        self.actor_loss_history = []
        self.critic_loss_history = []

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def choose_action(self, s):
        # Gaussian exploration noise around the deterministic policy output,
        # clipped to the action bounds
        return np.clip(
            np.random.normal(
                self.sess.run(self.a, {self.input_state: s[np.newaxis, :]})[0],
                self.action_variance), -2, 2)

    def learn(self):
        if self.replayBuffer.count() > self.batch_size:
            # Decay the exploration noise and softly update the target networks
            self.action_variance *= .9995
            self.sess.run(self.replace)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([[x[2]] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            actor_loss, _ = self.sess.run([self.a_loss, self.atrain],
                                          {self.input_state: batch_s})
            critic_loss, _ = self.sess.run(
                [self.mseloss, self.ctrain], {
                    self.input_state: batch_s,
                    self.a: batch_a,
                    self.R: batch_r,
                    self.input_state_: batch_s_
                })
            self.actor_loss_history.append(actor_loss)
            self.critic_loss_history.append(critic_loss)

    def build_actor_network(self, s, scope, trainable):
        actor_hidden_size = 30
        with tf.variable_scope(scope):
            hidden1 = tf.layers.dense(s,
                                      actor_hidden_size,
                                      activation=tf.nn.relu,
                                      name='l1',
                                      trainable=trainable)
            a = tf.layers.dense(hidden1,
                                self.action_size,
                                activation=tf.nn.tanh,
                                name='a',
                                trainable=trainable)
            # Scale the tanh output to the environment's action range
            return tf.multiply(a, self.action_limit, name='scaled_a')

    def build_critic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            critic_hidden_size = 30
            hidden1 = tf.layers.dense(s, critic_hidden_size, name='s1', trainable=trainable) \
                + tf.layers.dense(a, critic_hidden_size, name='a1', trainable=trainable) \
                + tf.get_variable('b1', [1, critic_hidden_size], trainable=trainable)
            hidden1 = tf.nn.relu(hidden1)
            return tf.layers.dense(hidden1, 1, trainable=trainable)

    def plot_loss(self):
        plt.title('history', fontsize=25)
        ms = 0.1
        me = 1
        line_width = 0.1
        plt.ylabel('Loss')
        plt.xlabel('Training steps')

        # Normalize each loss history by its mean so both curves share a scale
        actor_loss_history = np.asarray(self.actor_loss_history)
        actor_loss_history = actor_loss_history / actor_loss_history.mean()
        critic_loss_history = np.asarray(self.critic_loss_history)
        critic_loss_history = critic_loss_history / critic_loss_history.mean()

        plt.plot(np.arange(len(actor_loss_history)),
                 actor_loss_history,
                 '-p',
                 color='b',
                 markevery=me,
                 label=r'actor loss',
                 lw=line_width,
                 markersize=ms)
        plt.plot(np.arange(len(critic_loss_history)),
                 critic_loss_history,
                 '--^',
                 color='r',
                 markevery=me,
                 label=r'critic loss',
                 lw=line_width,
                 markersize=ms)
        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 10)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()
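

# Usage sketch (illustrative, not part of the original file): training Agent_DDPG on
# gym's Pendulum-v0, whose 3-dimensional observation, 1-dimensional action and [-2, 2]
# action bound match the clipping hard-coded in choose_action(). The classic gym API
# (reset() returning only the observation) is assumed, and the episode/step counts
# are arbitrary choices rather than values from this repo.
def run_agent_ddpg(num_episodes=200, max_steps=200):
    import gym
    env = gym.make('Pendulum-v0')
    agent = Agent_DDPG(action_size=env.action_space.shape[0],
                       state_size=env.observation_space.shape[0],
                       action_limit=env.action_space.high[0])
    reward_history = []
    for _ in range(num_episodes):
        s = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            a = agent.choose_action(s)
            s_, r, done, _ = env.step(a)
            agent.store_transition(s, a, r, s_)  # the buffer keeps (s, a, r, s_) tuples
            agent.learn()                        # trains once the buffer exceeds batch_size
            s = s_
            episode_reward += r
            if done:
                break
        reward_history.append(episode_reward)
    agent.plot_reward(reward_history)
    agent.plot_loss()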