Example No. 1
# Method of the agent class; assumes numpy as np, a Gym-style self.env,
# three policy networks in self.nns, and the action list self.a.
def play_one_game(self):
    replay = Replay()
    s = self.env.reset()
    count = 0
    while True:
        # Batch of one stacked observation (four 84x84 frames).
        conv_s = np.reshape(s, [1, 84, 84, 4])
        p_g = self.nns["good"].predict(conv_s)
        p_n = self.nns["normal"].predict(conv_s)
        p_b = self.nns["bad"].predict(conv_s)
        # Mix the three policy heads: favour "good", subtract "bad".
        p = 2 * p_g["pi"][0] + p_n["pi"][0] - p_b["pi"][0]
        # Shift by one and renormalise so p is a valid probability distribution.
        p += np.ones_like(self.a)
        p /= np.sum(p)
        a = np.random.choice(self.a, p=p)
        s_, r, t, _ = self.env.step(a)
        replay.add(s, a)
        replay.score += r
        s = s_
        count += 1
        if count % 10 == 0:
            print(".", end="", flush=True)   # progress dots every 10 steps
        if t:
            print()
            break
    return replay
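
Because the "bad" head is subtracted, the mixed vector 2 * p_g + p_n - p_b can contain negative entries; each head is a probability vector, so every entry is at least -1, and adding a vector of ones followed by renormalisation always yields a valid distribution (the shifted vector sums to n_actions + 2). Below is a minimal, self-contained sketch of just that mixing step, with made-up softmax outputs standing in for the three predict() calls:

import numpy as np

def mix_policies(p_good, p_normal, p_bad):
    # Same combination as in play_one_game above: favour "good", subtract "bad".
    p = 2 * p_good + p_normal - p_bad     # entries can be as low as -1
    p += np.ones_like(p)                  # shift so every entry is non-negative
    return p / np.sum(p)                  # renormalise to a probability distribution

# Hypothetical softmax outputs over 4 actions, standing in for the predict() results.
p_good   = np.array([0.70, 0.10, 0.10, 0.10])
p_normal = np.array([0.25, 0.25, 0.25, 0.25])
p_bad    = np.array([0.05, 0.05, 0.05, 0.85])

p = mix_policies(p_good, p_normal, p_bad)       # -> [0.433, 0.233, 0.233, 0.1]
action = np.random.choice(np.arange(4), p=p)    # same sampling call as in the excerpt
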
import numpy as np
import tensorflow as tf  # TF1-style API (tf.Session)

# ActorNet, CriticNet, OUNoise and Replay are helper classes defined elsewhere in the project.
class DDPG:
    def __init__(self, task):
        # Hyperparameters
        self.learning_rate_actor = 1e-4
        self.learning_rate_critic = 1e-3
        self.gamma = 0.99
        self.tau = 0.001

        # Define net
        self.sess = tf.Session()
        self.task = task
        self.actor = ActorNet(self.sess, self.task.state_size, self.task.action_size,
                              self.learning_rate_actor, self.task.action_low,
                              self.task.action_high, self.tau)
        self.critic = CriticNet(self.sess, self.task.state_size, self.task.action_size,
                                self.learning_rate_critic, self.tau)

        # Define noise
        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.20
        self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma)

        # Define memory replay
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = Replay(self.buffer_size, self.batch_size)

        # Score
        self.best_score = -np.inf
        self.best_reward = -np.inf

    def reset(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def learn(self, experience):
        # Unpack the sampled experiences into separate numpy arrays
        state_batch = np.vstack([e[0] for e in experience])
        action_batch = np.vstack([e[1] for e in experience])
        reward_batch = np.vstack([e[2] for e in experience])
        next_state_batch = np.vstack([e[3] for e in experience])
        done_batch = np.vstack([e[4] for e in experience])

        # Compute next-state Q values from the target actor and target critic
        next_action_batch = self.actor.target_actions(next_state_batch)
        next_q_targets = self.critic.targetQ(next_state_batch, next_action_batch)

        # Train critic net on the Bellman targets: r + gamma * Q'(s', mu'(s')), zeroed at terminal states
        q_targets = reward_batch + self.gamma * next_q_targets * (1 - done_batch)
        self.critic.train(state_batch, action_batch, q_targets)

        # Train actor net using the critic's action gradients (deterministic policy gradient)
        action_gradients = self.critic.gradients(state_batch, action_batch)
        self.actor.train(action_gradients, state_batch)

        # Soft-update the target networks (actor and critic)
        self.actor.update_target(False)
        self.critic.update_target(False)

    def step(self, action, reward, next_state, done):
        # Save the transition, track the running score, and learn once enough samples are buffered
        self.memory.add([self.last_state, action, reward, next_state, done])
        self.total_reward += reward
        self.count += 1
        if done:
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            self.best_score = max(self.best_score, self.score)
            self.best_reward = max(self.total_reward, self.best_reward)

        if len(self.memory.buffer) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        states = np.reshape(states, [-1, self.task.state_size])
        action = self.actor.actions(states)[0]
        # Add OU exploration noise to the deterministic policy output
        return list(action + self.noise.sample())
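
For context, here is a minimal sketch of how this agent could be driven by an episode loop. The task interface (reset() returning a state and step(action) returning next_state, reward, done) is inferred from how reset(), act() and step() use self.task above; Task and num_episodes are placeholder names for this sketch, not part of the original code.

# Minimal driving loop, assuming task.reset() -> state and
# task.step(action) -> (next_state, reward, done).
task = Task()                       # hypothetical task / environment object
agent = DDPG(task)

num_episodes = 500                  # arbitrary choice for the sketch
for episode in range(1, num_episodes + 1):
    state = agent.reset()           # resets the OU noise and the task
    while True:
        action = agent.act(state)                      # actor output + exploration noise
        next_state, reward, done = task.step(action)   # assumed task interface
        agent.step(action, reward, next_state, done)   # store transition and learn
        state = next_state
        if done:
            break
    print("episode {:4d}  reward {:8.2f}  best {:8.2f}".format(
        episode, agent.total_reward, agent.best_reward))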