Example #1
    def _learn_loop(self, multi_env, step_callback=None):
        """
        Main training loop.
        :param step_callback: a hacky callback that takes dictionaries of local and global variables as arguments.
        Allows you to look inside the training process.
        """
        step = initial_step = tf.train.global_step(self.session, tf.train.get_global_step())
        env_steps = self.total_env_steps.eval(session=self.session)
        batch_size = self.params.rollout * self.params.num_envs

        img_obs, timer_obs = extract_keys(multi_env.initial_obs(), 'obs', 'timer')

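        # running mean/std of recent advantages, used below when params.normalize_adv is set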
        adv_running_mean_std = RunningMeanStd(max_past_samples=10000)

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = AttrDict({'experience': time.time(), 'batch': time.time()})
            experience_start = time.time()

            env_steps_before_batch = env_steps
            batch_obs, batch_timer = [img_obs], [timer_obs]
            env_steps += len(img_obs)
            batch_actions, batch_values, batch_rewards, batch_dones, batch_next_obs = [], [], [], [], []
            for rollout_step in range(self.params.rollout):
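                # query the policy for actions and value estimates for the current observations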
                actions, values = self._policy_step_timer(img_obs, timer_obs)
                batch_actions.append(actions)
                batch_values.append(values)

                # wait for all the workers to complete an environment step
                next_obs, rewards, dones, infos = multi_env.step(actions)
                next_img_obs, next_timer = extract_keys(next_obs, 'obs', 'timer')

                # calculate curiosity bonus
                bonuses = self._prediction_curiosity_bonus(img_obs, actions, next_img_obs)
                rewards += bonuses

                batch_rewards.append(rewards)
                batch_dones.append(dones)
                batch_next_obs.append(next_img_obs)

                img_obs = next_img_obs
                timer_obs = next_timer

                if infos is not None and 'num_frames' in infos[0]:
                    env_steps += sum((info['num_frames'] for info in infos))
                else:
                    env_steps += multi_env.num_envs

                if rollout_step != self.params.rollout - 1:
                    # we don't need the newest observation in the training batch, already have enough
                    batch_obs.append(img_obs)
                    batch_timer.append(timer_obs)

            assert len(batch_obs) == len(batch_rewards)
            assert len(batch_obs) == len(batch_next_obs)

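            # transpose rollout-major lists [rollout, num_envs] into env-major arrays [num_envs, rollout]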
            batch_rewards = np.asarray(batch_rewards, np.float32).swapaxes(0, 1)
            batch_dones = np.asarray(batch_dones, bool).swapaxes(0, 1)  # plain bool: np.bool is removed in recent NumPy
            batch_values = np.asarray(batch_values, np.float32).swapaxes(0, 1)

            # Last value won't be valid for envs with done=True (because the env automatically resets and shows the 1st
            # observation of the next episode). That's okay, because we should never use last_value in this case.
            last_values = self._estimate_values_timer(img_obs, timer_obs)

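            # compute bootstrapped n-step discounted returns separately for each environment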
            gamma = self.params.gamma
            disc_rewards = []
            for i in range(len(batch_rewards)):
                env_rewards = self._calc_discounted_rewards(gamma, batch_rewards[i], batch_dones[i], last_values[i])
                disc_rewards.extend(env_rewards)
            disc_rewards = np.asarray(disc_rewards, np.float32)

            # convert observations and estimations to meaningful n-step batches
            batch_obs_shape = (self.params.rollout * multi_env.num_envs,) + img_obs[0].shape
            batch_obs = np.asarray(batch_obs, np.float32).swapaxes(0, 1).reshape(batch_obs_shape)
            batch_next_obs = np.asarray(batch_next_obs, np.float32).swapaxes(0, 1).reshape(batch_obs_shape)
            batch_actions = np.asarray(batch_actions, np.int32).swapaxes(0, 1).flatten()
            batch_timer = np.asarray(batch_timer, np.float32).swapaxes(0, 1).flatten()
            batch_values = batch_values.flatten()

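            # advantages = n-step returns minus the value baseline, optionally normalized, then clipped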
            advantages = disc_rewards - batch_values
            if self.params.normalize_adv:
                adv_running_mean_std.update(advantages)
                advantages = (advantages - adv_running_mean_std.mean) / (np.sqrt(adv_running_mean_std.var) + EPS)
            advantages = np.clip(advantages, -self.params.clip_advantage, self.params.clip_advantage)

            timing.experience = time.time() - timing.experience
            timing.train = time.time()

            step = self._curious_train_step(
                step,
                env_steps,
                batch_obs,
                batch_timer,
                batch_actions,
                batch_values,
                disc_rewards,
                advantages,
                batch_next_obs,
            )
            self._maybe_save(step, env_steps)

            timing.train = time.time() - timing.train

            avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
            fps = (env_steps - env_steps_before_batch) / (time.time() - timing.batch)

            self._maybe_print(step, avg_reward, avg_length, fps, timing)
            self._maybe_aux_summaries(step, env_steps, avg_reward, avg_length)
            self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())

            if step_callback is not None:
                step_callback(locals(), globals())
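
Both examples delegate the return computation to _calc_discounted_rewards, which is not shown on this page. A minimal sketch of the assumed behavior (standard bootstrapped n-step discounting for a single environment; the name and body below are illustrative, not the original implementation):

def calc_discounted_rewards_sketch(gamma, rewards, dones, last_value):
    # Walk the rollout back-to-front, starting from the value estimate of the
    # state that follows the rollout (the bootstrap value).
    cumulative = last_value
    discounted = []
    for reward, done in zip(reversed(rewards), reversed(dones)):
        if done:
            # episode boundary: don't propagate the next episode's returns
            # (or the bootstrap value) across it
            cumulative = 0.0
        cumulative = reward + gamma * cumulative
        discounted.append(cumulative)
    discounted.reverse()
    return discounted  # one n-step return per rollout step, oldest first

Example #1 then uses these returns both as value-function targets and, minus the value estimates, as advantages.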
Example #2
    def learn(self, step_callback=None):
        """
        Main training loop.
        :param step_callback: a hacky callback that takes dictionaries of local and global variables as arguments.
        Allows you to look inside the training process.
        """
        step = initial_step = tf.train.global_step(self.session,
                                                   tf.train.get_global_step())
        env_steps = env_steps_initial = self.total_env_steps.eval(
            session=self.session)
        batch_size = self.params.rollout * self.params.num_envs

        multi_env = MultiEnv(
            self.params.num_envs,
            self.params.num_workers,
            make_env_func=self.make_env_func,
            stats_episodes=self.params.stats_episodes,
        )
        observations = extract_key(multi_env.initial_obs(), 'obs')

        def end_of_training(s):
            return s >= self.params.train_for_steps

        while not end_of_training(step):
            timing = AttrDict({
                'experience': time.time(),
                'batch': time.time()
            })
            experience_start = time.time()

            env_steps_before_batch = env_steps
            batch_obs = [observations]
            env_steps += len(observations)
            batch_actions, batch_values, batch_rewards, batch_dones = [], [], [], []
            for rollout_step in range(self.params.rollout):
                actions, values = self._policy_step(observations)
                batch_actions.append(actions)
                batch_values.append(values)

                # wait for all the workers to complete an environment step
                observations, rewards, dones, infos = multi_env.step(actions)
                observations = extract_key(observations, 'obs')

                batch_rewards.append(rewards)
                batch_dones.append(dones)
                if infos is not None and 'num_frames' in infos[0]:
                    env_steps += sum((info['num_frames'] for info in infos))
                else:
                    env_steps += multi_env.num_envs

                if rollout_step != self.params.rollout - 1:
                    # we don't need the newest observation in the training batch, already have enough
                    batch_obs.append(observations)

            assert len(batch_obs) == len(batch_rewards)

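            # transpose rollout-major lists [rollout, num_envs] into env-major arrays [num_envs, rollout]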
            batch_rewards = np.asarray(batch_rewards,
                                       np.float32).swapaxes(0, 1)
            batch_dones = np.asarray(batch_dones, bool).swapaxes(0, 1)  # plain bool: np.bool is removed in recent NumPy
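            # value estimates for the observations after the last rollout step, used to bootstrap the returns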
            last_values = self._estimate_values(observations)

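            # compute bootstrapped n-step discounted returns separately for each environment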
            gamma = self.params.gamma
            discounted_rewards = []
            for env_rewards, env_dones, last_value in zip(
                    batch_rewards, batch_dones, last_values):
                discounted_rewards.extend(
                    self._calc_discounted_rewards(gamma, env_rewards,
                                                  env_dones, last_value))

            # convert observations and estimations to meaningful n-step batches
            batch_obs_shape = (self.params.rollout *
                               multi_env.num_envs, ) + observations[0].shape
            batch_obs = np.asarray(batch_obs, np.float32).swapaxes(
                0, 1).reshape(batch_obs_shape)
            batch_actions = np.asarray(batch_actions,
                                       np.int32).swapaxes(0, 1).flatten()
            batch_values = np.asarray(batch_values,
                                      np.float32).swapaxes(0, 1).flatten()

            timing.experience = time.time() - timing.experience
            timing.train = time.time()

            step = self._train_step(step, env_steps, batch_obs, batch_actions,
                                    batch_values, discounted_rewards)
            self._maybe_save(step, env_steps)

            timing.train = time.time() - timing.train

            avg_reward = multi_env.calc_avg_rewards(
                n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(
                n=self.params.stats_episodes)
            fps = (env_steps - env_steps_before_batch) / (time.time() -
                                                          timing.batch)

            self._maybe_print(step, avg_reward, avg_length, fps, timing)
            self._maybe_aux_summaries(step, env_steps, avg_reward, avg_length)
            self._maybe_update_avg_reward(avg_reward,
                                          env_steps - env_steps_initial)

            if step_callback is not None:
                step_callback(locals(), globals())

        log.info('Done!')
        multi_env.close()
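
A note on the array bookkeeping shared by both loops: the per-step lists are collected rollout-major, i.e. with shape [rollout, num_envs, ...], and swapaxes(0, 1) followed by flatten()/reshape() turns them into env-major batches, so each environment's trajectory is contiguous and lines up with the per-environment discounted returns. A tiny standalone NumPy check of that ordering:

import numpy as np

rollout, num_envs = 3, 2
# toy rollout-major data: element [t, env] is the reward at step t in environment env
rewards = np.arange(rollout * num_envs).reshape(rollout, num_envs)

flat = rewards.swapaxes(0, 1).flatten()
print(flat)  # [0 2 4 1 3 5] -> all of env 0's steps first, then env 1's

For observations, the same transpose is followed by reshape((rollout * num_envs,) + obs_shape), which preserves this ordering because NumPy reshapes in C order.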