def train(sess, env, actor, critic, noise, reward, discrete, saver,
          checkpoint_path):
    # Set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    actor.update()
    critic.update()

    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):

        if i % 100 == 0:
            saver.save(sess, checkpoint_path)

        # s is the initial state returned by the environment.
        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        # Initialize the per-episode transition buffer
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):

            # print(critic.w1.eval()[0,0])

            env.render()

            # a is the action predicted by the actor's current policy, shape [None x action_dim]
            a = actor.predict(np.reshape(s, (1, actor.state_dim)))

            # Add exploration noise to the action (the analogue of e-greedy for a
            # stochastic, continuous setting). The Ornstein-Uhlenbeck process below is a
            # common way to generate temporally correlated noise for continuous action
            # spaces; a sketch of such a noise helper appears after this function.
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            # Step the environment with the chosen action and collect the
            # resulting transition.
            s2, r, terminal, info = env.step(action)

            # Accumulate the episode's total reward.
            ep_reward += r

            # ==========================================================================[Important]==============
            # Store this transition for the replay buffer:
            # append [s, a, r, terminal, s2] to the episode_buffer numpy array.
            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]],
                                       axis=0)
            # ===================================================================================================

            # Once the replay buffer holds more than a minibatch of transitions,
            # sample a batch and train.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Compute the target action-value Q(s2, a') with the target critic,
                # feeding it the target actor's predicted actions a' (the critic
                # evaluates the current policy and learns the action-value function).
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Train the critic's action-value function on the state batch,
                # action batch, and targets y_i. As in the DQN paper (DeepMind),
                # the target is the final reward itself for terminal transitions;
                # otherwise it is the reward at s plus the discounted target Q value
                # at s2.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                # Track the maximum predicted Q value; the DQN paper cites the
                # average Q value as a useful indicator of training progress.
                ep_ave_max_q += np.amax(predicted_q_value)

                # Use the sampled states to get the actions given by the actor's
                # current policy, then ask the critic for the gradient of Q(s, a)
                # with respect to those actions.
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)

                # grads has shape (1, BATCH_SIZE, ACTION_DIM), so take grads[0] to match
                # the (BATCH_SIZE, ACTION_DIM) input expected by actor.train.
                # Train the actor with the critic's action gradients to update the policy.
                actor.train(s_batch, grads[0])

                # Update the actor and critic target networks.
                actor.update()
                critic.update()

            # Move on: s2 becomes the current state.
            s = s2

            if terminal:

                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(
                        np.reshape(step[0], (actor.state_dim, )),
                        np.reshape(step[1], (actor.action_dim, )), step[2],
                        step[3], np.reshape(step[4], (actor.state_dim, )))

                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward',
                                  simple_value=float(ep_reward))
                summary.value.add(tag='Perf/Qmax',
                                  simple_value=float(ep_ave_max_q / float(j)))
                summary_writer.add_summary(summary, i)

                summary_writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
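
The train() loop above calls noise.ornstein_uhlenbeck_level(ou_level), but the noise helper itself is not shown. The sketch below is a minimal, assumed implementation of such a helper; the class name OUNoise and the parameters mu, theta, sigma, and dt are illustrative choices, not part of the original code.

import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck noise sketch (assumed interface).

    The train loop only needs ornstein_uhlenbeck_level(previous_level), which
    returns the next sample of a temporally correlated noise process.
    """
    def __init__(self, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu          # long-run mean the process reverts to
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # scale of the random fluctuations
        self.dt = dt          # time step

    def ornstein_uhlenbeck_level(self, x):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn())
        return x + dx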
Example #2
class DDQNAgent():
    """
        A Double DQN agent has two networks: a local network and a target network.
        The local network is trained every iteration and is used to select actions.
        The target network is periodically updated to a copy of the local network.

        Without a separate target network, the Bellman targets used to compute the loss
        would be produced by the same network that is being trained, which makes learning
        unstable. Separating training from target evaluation helps the agent learn.
    """
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                    mem_size, batch_size, eps_min = 0.01, eps_dec = 5e-7,
                    replace = 10_000):
        self.gamma = gamma #used to discount future rewards
        self.epsilon = epsilon #used for epsilon-greedy action choosing algo.
        self.lr = lr #learning rate: how big a step the optimizer takes
        self.n_actions = n_actions #number of actions available to our agent in its environment
        self.action_space = [i for i in range(n_actions)]#list comprehension to create array of indices of possible actions to choose from
        self.input_dims = input_dims #the dimensions of our input as defined by the agent's environment
        self.mem_size = mem_size #maximum amount of memories to store
        self.batch_size = batch_size #mini-batch size to sample from memory.
        self.eps_min = eps_min #smallest possible epsilon value for our agent
        self.eps_dec = eps_dec #how much to decrease epsilon each iteration
        self.replace_after = replace #how many iterations between copying the local network's weights into the target network
        self.steps = 0 #iteration counter for use with replace_after

        #create a ReplayBuffer to store our memories, also used to sample a mini-batch
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.Q_local = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims = self.input_dims)
        self.Q_target = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims = self.input_dims)

    def store_memory(self, state, action, reward, state_, done):
        """
            Save a new memory to our ReplayBuffer
        """
        self.memory.store_memory(state, action, reward, state_, done)

    def sample_batch(self):
        """
            Pull a stochastic mini-batch from our ReplayBuffer
        """
        state, action, reward, state_, done = \
                            self.memory.sample_batch(self.batch_size)

        states = T.tensor(state).to(self.Q_local.device)
        actions = T.tensor(action).to(self.Q_local.device)
        rewards = T.tensor(reward).to(self.Q_local.device)
        states_ = T.tensor(state_).to(self.Q_local.device)
        dones = T.tensor(done).to(self.Q_local.device)

        return states, actions, rewards, states_, dones


    def choose_action(self, observation):
        """
            Choose an action from our action space using an epsilon-greedy algorithm.
            We can either EXPLOIT, or EXPLORE based on a random probability.

            Exploiting chooses the best known action (confidence).

            Exploring tries a random action, which may expose the agent to new
            information to learn from.
        """
        if np.random.random() > self.epsilon:#epsilon-greedy (EXPLOIT)
            state = T.tensor([observation], dtype = T.float).to(self.Q_local.device)
            actions = self.Q_local.forward(state)
            action = T.argmax(actions).item()#.item() gets index from tensor
        else:#(EXPLORE)
            action = np.random.choice(self.action_space)#choose random action from our action space

        return action

    def replace_target_network(self):
        """
            every replace_after iterations we update our target network
            to be a copy of our local network
        """
        if self.replace_after is not None and \
                    self.steps % self.replace_after == 0:
            self.Q_target.load_state_dict(self.Q_local.state_dict())

    def decrement_epsilon(self):
        """
            decrease epsilon, but not below eps_min
        """
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self):
        """
            Main part of our agent.

            First we zero the optimizer's gradients so they do not accumulate across batches.
            Then we sample a stochastic mini-batch from our ReplayBuffer.

            Then we make predictions and evaluations on this mini-batch, compute the loss,
            back-propagate, and step the optimizer.

            Finally, we decrement our epsilon and begin the cycle of (SEE->DO->LEARN) once
            again; a minimal sketch of that driver loop appears after the class.
        """
        if self.memory.mem_cntr < self.batch_size:#if we don't have a full batch of memories, don't learn yet
            return

        self.Q_local.optimizer.zero_grad()#zero out the optimizer's gradients so they don't accumulate across batches

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_batch()

        indices = np.arange(self.batch_size)

        q_pred = self.Q_local.forward(states)[indices, actions]#local network's Q values for the actions actually taken
        q_next = self.Q_target.forward(states_)#target network's Q values for the next states
        q_eval = self.Q_local.forward(states_)#local network's Q values for the next states, used only to select actions

        max_actions = T.argmax(q_eval, dim = 1)#Double DQN: local network selects, target network evaluates
        q_next[dones] = 0.0#zero out Q values for terminal next states

        q_target = rewards + self.gamma*q_next[indices, max_actions]#Bellman target
        loss = self.Q_local.loss(q_target, q_pred).to(self.Q_local.device)
        loss.backward()#back-propagation

        self.Q_local.optimizer.step()
        self.steps += 1

        self.decrement_epsilon()

    def save_agent(self):
        self.Q_local.save_model('local')
        self.Q_target.save_model('target')

    def load_agent(self):
        self.Q_local.load_model('local')
        self.Q_target.load_model('target')