Example #1
            if (t > 0 and episode > 200):
                # Obtain TD-error
                _, base_v = ppo.evaluate_state(s_old)
                _, target_v = ppo.evaluate_state(s)
                lambda_mix = lambda_max * (1 - np.exp(-factor * np.abs(
                    r + GAMMA * np.squeeze(target_v) - np.squeeze(base_v))))
            else:
                lambda_mix = 10.
                lambda_actual = 10.
            lambda_store[t] = lambda_mix

            # Update ppo
            if t == BATCH:  # or (terminal and t < BATCH):
                # Normalise rewards
                rewards = np.array(buffer_r)
                rolling_r.update(rewards)
                rewards = np.clip(rewards / rolling_r.std, -10, 10)

                v_final = [
                    v * (1 - terminal)
                ]  # v = 0 if terminal, otherwise use the predicted v
                values = np.array(buffer_v + v_final)
                terminals = np.array(buffer_terminal + [terminal])

                # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                delta = rewards + GAMMA * values[1:] * (
                    1 - terminals[1:]) - values[:-1]
                advantage = discount(delta, GAMMA * LAMBDA, terminals)
                buffer_v = np.squeeze(np.array(buffer_v))[:, np.newaxis]
                returns = np.squeeze(advantage)[:, np.newaxis] + buffer_v
                advantage = (advantage - advantage.mean()) / np.maximum(
                    advantage.std(), 1e-6)
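
The discount() call above performs the Generalized Advantage Estimation step: a discounted cumulative sum of the TD residuals delta, reset at episode boundaries. The helper itself is not included in these snippets; below is a minimal sketch of what such a function could look like, assuming terminals is one element longer than deltas (as built above from buffer_terminal + [terminal]). The name discount_sketch and the explicit reverse loop are assumptions; the original helper may instead use something like scipy.signal.lfilter.

import numpy as np

def discount_sketch(deltas, gamma_lambda, terminals):
    # Reverse discounted cumulative sum of TD residuals (GAE),
    # zeroed across episode boundaries so advantages from one
    # episode do not leak into the previous one.
    advantages = np.zeros_like(deltas, dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(deltas))):
        # terminals[i + 1] flags whether the transition at step i ended an episode
        running = deltas[i] + gamma_lambda * running * (1 - terminals[i + 1])
        advantages[i] = running
    return advantages

Called as discount_sketch(delta, GAMMA * LAMBDA, terminals), this returns one advantage per buffered step, which is then normalised before the PPO update.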
Example #2
                bs, ba, br, badv = np.reshape(buffer_s, (len(buffer_s),) + ppo.s_dim), np.vstack(buffer_a), \
                                   np.vstack(returns), np.vstack(advantage)
                experience.append([bs, ba, br, badv])

                buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []

                # Update ppo
                if t >= BATCH:
                    # Per batch normalisation of advantages
                    advs = np.concatenate(list(zip(*experience))[3])
                    for x in experience:
                        x[3] = (x[3] - np.mean(advs)) / np.maximum(
                            np.std(advs), 1e-6)

                    # Update rolling reward stats
                    rolling_r.update(np.array(batch_rewards))

                    print("Training using %i episodes and %i steps..." %
                          (len(experience), t))
                    graph_summary = ppo.update(experience)
                    t, experience, batch_rewards = 0, [], []

            buffer_s.append(s)
            buffer_a.append(a)
            buffer_v.append(v)
            buffer_terminal.append(terminal)
            ep_a.append(a)

            if not ppo.discrete:
                a = np.clip(a, env.action_space.low, env.action_space.high)
            s, r, terminal, _ = env.step(a)
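
Both batch-update blocks scale the rewards by a running estimate of their standard deviation before clipping to [-10, 10], via rolling_r.update(...) and rolling_r.std. The RunningStats class is not shown in these snippets; the following is a minimal sketch of one way to provide that interface with Welford's online algorithm. The class name and the fallback std of 1.0 are assumptions; the class in the original repository may track additional statistics.

import numpy as np

class RunningStatsSketch:
    # Welford-style online mean/variance; only update() and .std
    # are needed for the reward scaling above.
    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.m2 = 0.0  # running sum of squared deviations from the mean

    def update(self, x):
        for value in np.asarray(x, dtype=np.float64).ravel():
            self.n += 1
            delta = value - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (value - self.mean)

    @property
    def std(self):
        # Fall back to 1.0 until enough samples exist, so the
        # caller's division never produces NaN or inf.
        return float(np.sqrt(self.m2 / self.n)) if self.n > 1 else 1.0

With a sketch like this, rewards / rolling_r.std stays well-defined even on the very first batch.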
Example #3
    def work(self):
        hooks = [self.ppo.sync_replicas_hook]
        sess = tf.train.MonitoredTrainingSession(master=self.server.target,
                                                 is_chief=(self.wid == 0),
                                                 checkpoint_dir=SUMMARY_DIR,
                                                 save_summaries_steps=None,
                                                 save_summaries_secs=None,
                                                 hooks=hooks)
        if self.wid == 0:
            writer = SummaryWriterCache.get(SUMMARY_DIR)

        t, episode, terminal = 0, 0, False
        buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
        rolling_r = RunningStats()

        while not sess.should_stop() and not (episode > EP_MAX
                                              and self.wid == 0):

            s = self.env.reset()
            ep_r, ep_t, ep_a = 0, 0, []

            while True:
                a, v = self.ppo.evaluate_state(s, sess)

                # Update ppo
                if t == BATCH:  # or (terminal and t < BATCH):
                    # Normalise rewards
                    rewards = np.array(buffer_r)
                    rolling_r.update(rewards)
                    rewards = np.clip(rewards / rolling_r.std, -10, 10)

                    v_final = [
                        v * (1 - terminal)
                    ]  # v = 0 if terminal, otherwise use the predicted v
                    values = np.array(buffer_v + v_final)
                    terminals = np.array(buffer_terminal + [terminal])

                    # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                    delta = rewards + GAMMA * values[1:] * (
                        1 - terminals[1:]) - values[:-1]
                    advantage = discount(delta, GAMMA * LAMBDA, terminals)
                    returns = advantage + np.array(buffer_v)
                    advantage = (advantage - advantage.mean()) / np.maximum(
                        advantage.std(), 1e-6)

                    bs, ba, br, badv = np.reshape(buffer_s, (t,) + self.ppo.s_dim), np.vstack(buffer_a), \
                                       np.vstack(returns), np.vstack(advantage)

                    graph_summary = self.ppo.update(bs, ba, br, badv, sess)
                    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
                    t = 0

                buffer_s.append(s)
                buffer_a.append(a)
                buffer_v.append(v)
                buffer_terminal.append(terminal)
                ep_a.append(a)

                if not self.ppo.discrete:
                    a = np.clip(a, self.env.action_space.low,
                                self.env.action_space.high)
                s, r, terminal, _ = self.env.step(a)
                buffer_r.append(r)

                ep_r += r
                ep_t += 1
                t += 1

                if terminal:
                    # End of episode summary
                    print('Worker_%i' % self.wid, '| Episode: %i' % episode,
                          "| Reward: %.2f" % ep_r, '| Steps: %i' % ep_t)

                    if self.wid == 0:
                        worker_summary = tf.Summary()
                        worker_summary.value.add(tag="Reward",
                                                 simple_value=ep_r)

                        # Create Action histograms for each dimension
                        actions = np.array(ep_a)
                        if self.ppo.discrete:
                            add_histogram(writer,
                                          "Action",
                                          actions,
                                          episode,
                                          bins=self.ppo.a_dim)
                        else:
                            for a in range(self.ppo.a_dim):
                                add_histogram(writer, "Action/Dim" + str(a),
                                              actions[:, a], episode)

                        try:
                            writer.add_summary(graph_summary, episode)
                        except NameError:
                            pass
                        writer.add_summary(worker_summary, episode)
                        writer.flush()

                    episode += 1
                    break

        self.env.close()
        print("Worker_%i finished" % self.wid)