Example #1
    def __call__(self):
        total_steps = 0
        n_episode = 0

        # TODO: clean codes
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(
            self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(
            size=self._episode_max_steps, env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            n_episode, total_rewards = self._collect_sample(n_episode, total_steps)
            total_steps += self._policy.horizon
            tf.summary.experimental.set_step(total_steps)

            if len(total_rewards) > 0:
                avg_training_return = sum(total_rewards) / len(total_rewards)
                tf.summary.scalar(
                    name="Common/training_return", data=avg_training_return)

            # Train actor critic
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer.sample(self._policy.horizon)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(samples["adv"])
                else:
                    adv = samples["adv"]
                for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx*self._policy.batch_size,
                                   (idx+1)*self._policy.batch_size)
                    self._policy.train(
                        states=samples["obs"][target],
                        actions=samples["act"][target],
                        advantages=adv[target],
                        logp_olds=samples["logp"][target],
                        returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info("Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes".format(
                    total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(
                    name="Common/average_test_return", data=avg_test_return)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
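
A note on the minibatch loop above: each epoch draws `horizon` samples and then walks over them in contiguous `batch_size` slices. The following standalone sketch (hypothetical `horizon` and `batch_size` values, NumPy arrays standing in for the sampled batch) illustrates the slice arithmetic; it is not code from the project.

import numpy as np

# Hypothetical values; in the trainer they come from the policy object.
horizon, batch_size = 2048, 64
obs = np.random.randn(horizon, 4).astype(np.float32)  # stand-in for samples["obs"]
adv = np.random.randn(horizon).astype(np.float32)     # stand-in for samples["adv"]

# Same normalization as in the loop above.
adv = (adv - np.mean(adv)) / np.std(adv)

for idx in range(horizon // batch_size):
    target = slice(idx * batch_size, (idx + 1) * batch_size)
    minibatch_obs, minibatch_adv = obs[target], adv[target]
    # self._policy.train(...) would consume these slices here.

# The slices tile the horizon exactly, so horizon should be divisible by batch_size.
assert (horizon // batch_size) * batch_size == horizon
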
Example #2
def get_replay_buffer(policy, env, use_prioritized_rb=False,
                      use_nstep_rb=False, n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        raise NotImplementedError
        # return NstepPrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        raise NotImplementedError
        # return NstepReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
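
`get_default_rb_dict` itself is not shown in these examples. The sketch below reconstructs what it plausibly returns, inferred only from how the callers above mutate its result (a kwargs dict for cpprb's ReplayBuffer whose env_dict describes obs/act/next_obs/rew/done); treat the exact field layout as an assumption, not the project's implementation.

import numpy as np
from cpprb import ReplayBuffer


def get_default_rb_dict_sketch(size, env):
    """Hypothetical reconstruction of get_default_rb_dict, inferred from usage."""
    obs_shape = env.observation_space.shape  # the real code goes through get_space_size()
    act_shape = env.action_space.shape
    return {
        "size": size,
        "default_dtype": np.float32,
        "env_dict": {
            "obs": {"shape": obs_shape},
            "next_obs": {"shape": obs_shape},
            "act": {"shape": act_shape},
            "rew": {},
            "done": {}}}


# Usage mirroring the on-policy branch of get_replay_buffer above:
#   kwargs = get_default_rb_dict_sketch(policy.horizon, env)
#   kwargs["env_dict"]["logp"] = {}
#   kwargs["env_dict"]["ret"] = {}
#   kwargs["env_dict"]["adv"] = {}
#   buffer = ReplayBuffer(**kwargs)
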
Example #3
File: apex.py Project: xunyiljg/tf2rl
def explorer(global_rb,
             queue,
             trained_steps,
             is_training_done,
             lock,
             env_fn,
             policy_fn,
             set_weights_fn,
             noise_level,
             n_env=64,
             n_thread=4,
             buffer_size=1024,
             episode_max_steps=1000,
             gpu=0):
    """
    Collect transitions and store them in the prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared among multiple explorers and a single learner.
        Because this object is shared across processes, any operation on it must be
        guarded with the `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` for receiving the latest
        network weights. It is process safe, so no locking is needed when using it.
    :param trained_steps (multiprocessing.Value):
        Shared counter of gradient steps applied so far.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object used to share whether training has finished.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock used to serialize access to the shared replay buffer.
    :param env_fn (function):
        Function that creates an environment.
    :param policy_fn (function):
        Function that creates an explorer policy.
    :param set_weights_fn (function):
        Function that sets network weights received from the queue.
    :param noise_level (float):
        Noise level for exploration. For epsilon-greedy policies such as DQN variants
        this is epsilon; for DDPG variants it is the variance of the Normal
        exploration noise.
    :param n_env (int):
        Number of environments to distribute. If set to more than 1,
        `MultiThreadEnv` is used.
    :param n_thread (int):
        Number of threads used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of the local buffer. Once it is filled with transitions, they are
        added to `global_rb`.
    :param episode_max_steps (int):
        Maximum number of steps per episode.
    :param gpu (int):
        GPU id. If set to -1, this process uses only the CPU.
    """
    import_tf()

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn,
                              batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env,
                       name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level,
                       gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []
    else:
        obses = envs.py_reset()
    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses,
                                                actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards,
                                                dones=dones)
            local_rb.add(obs=obses,
                         act=actions,
                         next_obs=next_obses,
                         rew=rewards,
                         done=dones,
                         priorities=np.abs(td_errors) + 1e-6)

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb.sample(local_rb.get_stored_size())
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"],
                    actions=samples["act"],
                    next_states=samples["next_obs"],
                    rewards=samples["rew"],
                    dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"],
                          act=samples["act"],
                          rew=samples["rew"],
                          next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = 0 if len(total_rewards) == 0 else \
                    sum(total_rewards) / len(total_rewards)
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logging.info(msg)

            start = time.time()
            n_sample_old = n_sample
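
The docstring above lists the shared objects `explorer` expects, but not how they are created. The sketch below shows one hypothetical way to wire them up with standard multiprocessing primitives; `make_env`, `make_policy`, and `set_weights` are placeholder factories and the noise level of 0.3 is arbitrary, so this is an illustration of the required plumbing rather than the project's actual Ape-X launcher.

import multiprocessing as mp
from multiprocessing.managers import BaseManager

from cpprb import PrioritizedReplayBuffer


class RBManager(BaseManager):
    """Manager that serves a single PrioritizedReplayBuffer to all processes."""


RBManager.register("PrioritizedReplayBuffer", PrioritizedReplayBuffer)


def launch_explorers(make_env, make_policy, set_weights, n_explorers=2):
    # Placeholder factories (assumptions): make_env() -> gym.Env,
    # make_policy(env=..., name=..., memory_capacity=..., noise_level=..., gpu=...) -> policy,
    # set_weights(policy, weights) -> None.
    env = make_env()
    manager = RBManager()
    manager.start()
    global_rb = manager.PrioritizedReplayBuffer(
        int(1e6),
        env_dict={"obs": {"shape": env.observation_space.shape},
                  "act": {"shape": env.action_space.shape},
                  "next_obs": {"shape": env.observation_space.shape},
                  "rew": {},
                  "done": {}})

    queues = [mp.Queue() for _ in range(n_explorers)]  # learner -> explorer weights
    trained_steps = mp.Value("i", 0)                   # shared gradient-step counter
    is_training_done = mp.Event()                      # set by the learner on exit
    lock = mp.Lock()                                   # guards global_rb access

    processes = [
        mp.Process(target=explorer,
                   args=(global_rb, queues[i], trained_steps, is_training_done,
                         lock, make_env, make_policy, set_weights, 0.3),
                   kwargs={"n_env": 1, "gpu": -1})
        for i in range(n_explorers)]
    for p in processes:
        p.start()
    return global_rb, queues, is_training_done, processes
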
Example #4
class OnPolicyTrainer(Trainer):
    def __call__(self):
        total_steps = 0
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.time()
        n_episode = 0
        test_step_threshold = self._test_interval

        # TODO: clean codes
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        obs = self._env.reset()
        while total_steps < self._max_steps:
            for _ in range(self._policy.horizon):
                action, log_pi, val = self._policy.get_action_and_val(obs)
                next_obs, reward, done, _ = self._env.step(action)
                if self._show_progress:
                    self._env.render()
                episode_steps += 1
                episode_return += reward
                total_steps += 1

                done_flag = done
                if hasattr(self._env, "_max_episode_steps") and \
                        episode_steps == self._env._max_episode_steps:
                    done_flag = False
                self.local_buffer.add(obs=obs,
                                      act=action,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done_flag,
                                      logp=log_pi,
                                      val=val)
                obs = next_obs

                if done or episode_steps == self._episode_max_steps:
                    self.finish_horizon()
                    obs = self._env.reset()
                    n_episode += 1
                    fps = episode_steps / (time.time() - episode_start_time)
                    self.logger.info(
                        "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                        .format(n_episode, int(total_steps), episode_steps,
                                episode_return, fps))

                    episode_steps = 0
                    episode_return = 0
                    episode_start_time = time.time()

            self.finish_horizon(last_val=val)
            tf.summary.experimental.set_step(total_steps)
            samples = self.replay_buffer.sample(self._policy.horizon)
            # Normalize advantages
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                    samples["adv"])
            else:
                adv = samples["adv"]
            # Train Actor
            for _ in range(1):
                self._policy.train_actor(samples["obs"], samples["act"], adv,
                                         samples["logp"])
            # Train Critic
            for _ in range(5):
                self._policy.train_critic(samples["obs"], samples["ret"])
            if total_steps > test_step_threshold:
                test_step_threshold += self._test_interval
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                tf.summary.scalar(name="Common/fps", data=fps)

                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()

    def finish_horizon(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.
        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """
        samples = self.local_buffer._encode_sample(
            np.arange(self.local_buffer.get_stored_size()))
        rews = np.append(samples["rew"], last_val)
        vals = np.append(samples["val"], last_val)

        # GAE-Lambda advantage calculation
        deltas = rews[:-1] + self._policy.discount * vals[1:] - vals[:-1]
        if self._policy.enable_gae:
            advs = discount_cumsum(deltas,
                                   self._policy.discount * self._policy.lam)
        else:
            advs = deltas

        # Rewards-to-go, to be targets for the value function
        rets = discount_cumsum(rews, self._policy.discount)[:-1]
        self.replay_buffer.add(obs=samples["obs"],
                               act=samples["act"],
                               done=samples["done"],
                               ret=rets,
                               adv=advs,
                               logp=np.squeeze(samples["logp"]))
        self.local_buffer.clear()

    def evaluate_policy(self, total_steps):
        avg_test_return = 0.
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            done = False
            for _ in range(self._episode_max_steps):
                action, _ = self._policy.get_action(obs, test=True)
                next_obs, reward, done, _ = self._test_env.step(action)
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=action,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes
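
The GAE-Lambda and rewards-to-go computation documented in finish_horizon can be checked with a small standalone example. The discount_cumsum below is the standard reverse discounted cumulative sum, written out explicitly rather than imported from the project; the rewards, values, gamma, and lambda are made-up numbers for illustration.

import numpy as np


def discount_cumsum(x, discount):
    """Reverse discounted cumulative sum: out[t] = sum_k discount**k * x[t + k]."""
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


gamma, lam = 0.99, 0.95
last_val = 0.0                               # trajectory ended at a terminal state
rews = np.append([1.0, 1.0, 1.0], last_val)  # rewards with bootstrap value appended
vals = np.append([0.5, 0.6, 0.4], last_val)  # V(s_t) with bootstrap value appended

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]

advs = discount_cumsum(deltas, gamma * lam)  # GAE-Lambda advantages
rets = discount_cumsum(rews, gamma)[:-1]     # rewards-to-go, the critic's targets

print("advantages:", advs)
print("returns:   ", rets)
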