Example #1
    def collect_samples(self):
        """
        Collect one full rollout, as determined by the nstep parameter, and add it to the buffer
        """
        assert self.last_obs is not None

        rollout_step = 0
        self.rollout.reset()

        # For logging
        test_int_rewards = []

        while rollout_step < self.nstep:

            with torch.no_grad():
                # Query the current policy for actions, values and log-probs (no grad needed while collecting)
                actions, values, log_probs = self.policy.act(self.last_obs)

            obs, rewards, dones, infos = self.env.step(actions.numpy())

            if any(dones):
                self.num_episodes += sum(dones)
            rollout_step += 1
            self.num_timesteps += self.num_envs
            self.update_info_buffer(infos)

            int_rewards = self.intrinsic_module.int_reward(
                torch.Tensor(self.last_obs), torch.Tensor(obs), actions)
            rewards = ((1 - self.int_rew_integration) * rewards
                       + self.int_rew_integration * int_rewards.detach().numpy())

            # For logging
            test_int_rewards.append(int_rewards.mean().item())

            actions = actions.reshape(self.num_envs,
                                      self.action_converter.action_output)
            log_probs = log_probs.reshape(self.num_envs,
                                          self.action_converter.action_output)

            self.rollout.add(self.last_obs, actions, rewards, values, dones,
                             log_probs)

            self.last_obs = obs
        logger.record("rollout/mean_int_reward",
                      np.round(np.mean(np.array(test_int_rewards)), 10))
        self.rollout.compute_returns_and_advantages(values, dones=dones)

        return True
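
The interesting step above is the blend of extrinsic and intrinsic rewards controlled by int_rew_integration. A minimal, self-contained sketch of that mixing (mix_rewards and alpha are illustrative names, not attributes of the class):

import numpy as np
import torch

def mix_rewards(ext_rewards, int_rewards, alpha):
    # Blend: (1 - alpha) * extrinsic + alpha * intrinsic, detached from the graph
    return (1 - alpha) * ext_rewards + alpha * int_rewards.detach().numpy()

# Toy data for 4 parallel environments
ext = np.array([1.0, 0.0, 0.5, 1.0])
intr = torch.tensor([0.2, 0.8, 0.1, 0.4])
print(mix_rewards(ext, intr, alpha=0.1))  # still dominated by the extrinsic signal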
Example #2
    def compute_returns_and_advantages(self, last_value, last_int_value,
                                       dones):
        """
        Post-processing step: compute the returns (sum of discounted rewards)
        and GAE advantage.
        Adapted from Stable-Baselines PPO2.
        Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
        to compute the advantage. To obtain the vanilla advantage (A(s) = R - V(s))
        where R is the discounted reward with value bootstrap,
        set ``gae_lambda=1.0`` during initialization.

        :param last_value: (th.Tensor) extrinsic value estimate for the last observation (bootstrap)
        :param last_int_value: (th.Tensor) intrinsic value estimate for the last observation (bootstrap)
        :param dones: (np.ndarray) episode-termination flags for the last step
        """

        logger.record("rollout/mean_int_reward", np.mean(self.int_rewards))

        last_value = last_value.clone().cpu().numpy().flatten()
        last_int_value = last_int_value.clone().cpu().numpy().flatten()

        last_gae_lam = 0
        int_last_gae_lam = 0

        for step in reversed(range(self.buffer_size)):
            if step == self.buffer_size - 1:
                next_non_terminal = 1.0 - dones
                next_value = last_value
                next_int_values = last_int_value
            else:
                next_non_terminal = 1.0 - self.masks[step + 1]
                next_value = self.values[step + 1]
                next_int_values = self.int_values[step + 1]

            # Extrinsic stream: GAE with termination masking
            delta = (self.rewards[step]
                     + self.gamma * next_value * next_non_terminal
                     - self.values[step])
            last_gae_lam = delta + self.gamma * self.gae_lam * next_non_terminal * last_gae_lam
            self.advantages[step] = last_gae_lam

            # Intrinsic stream: no termination masking (treated as non-episodic)
            int_delta = (self.int_rewards[step]
                         + self.int_gamma * next_int_values
                         - self.int_values[step])
            int_last_gae_lam = int_delta + self.int_gamma * self.gae_lam * int_last_gae_lam
            self.int_advantages[step] = int_last_gae_lam

        self.returns = self.advantages + self.values
        self.int_returns = self.int_advantages + self.int_values
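
The backward pass above can be isolated into a small numpy sketch (extrinsic stream only, simplified names, with masks holding the per-step done flags as in the buffer). With gae_lam=1.0 it reduces to the bootstrapped discounted return minus the value baseline, as the docstring notes:

import numpy as np

def gae(rewards, values, last_value, masks, dones, gamma=0.99, gae_lam=0.95):
    # delta_t = r_t + gamma * V(s_t+1) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lambda * (1 - done_t) * A_t+1
    T = len(rewards)
    advantages = np.zeros(T)
    last_gae_lam = 0.0
    for step in reversed(range(T)):
        if step == T - 1:
            next_non_terminal = 1.0 - dones
            next_value = last_value
        else:
            next_non_terminal = 1.0 - masks[step + 1]
            next_value = values[step + 1]
        delta = rewards[step] + gamma * next_value * next_non_terminal - values[step]
        last_gae_lam = delta + gamma * gae_lam * next_non_terminal * last_gae_lam
        advantages[step] = last_gae_lam
    return advantages, advantages + values    # (advantages, returns)

adv, ret = gae(rewards=np.array([1.0, 0.0, 1.0]),
               values=np.array([0.5, 0.4, 0.6]),
               last_value=0.3,
               masks=np.array([0.0, 1.0, 0.0]),
               dones=0.0)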
Example #3
def interact_dict(menu, logFlag=0, secFlag=0):     # menu + extension flags
    import os, security, logger                    # utility modules
    user = os.environ['USER']
    if secFlag:                                    # any selection allowed?
        for name in menu.keys():
            if security.allow(name, user):
                break
        else:
            print("You're not authorized for any menu selections")
            return
    while True:
        for name in menu.keys():                   # show legal selections
            if (not secFlag) or security.allow(name, user):
                print('\t' + name)
        tool = input('?')
        if logFlag:
            logger.record(user, tool)              # log it, validate it
        if secFlag and not security.allow(tool, user):
            print("You're not authorized for this selection - try again")
        else:
            try:
                menu[tool]()                       # run function
            except KeyError:
                print('what? - try again')         # key not found
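
A hypothetical menu for trying interact_dict; security and logger are the example's own helper modules (assumed to expose allow(name, user) and record(user, tool)), so the interactive calls are left commented out:

def spam():
    print('spam, spam, spam')

def eggs():
    print('eggs and spam')

menu = {'spam': spam, 'eggs': eggs}
# interact_dict(menu)                # plain menu loop
# interact_dict(menu, logFlag=1)     # log every selection via logger.record
# interact_dict(menu, secFlag=1)     # restrict selections via security.allow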
Example #4
    def run(self,
            total_timesteps,
            reward_target=None,
            log_interval=1,
            log_to_file=False):
        """
        Run the algorithm

        :param total_timesteps: (int) total timesteps to run the environment for
        :param reward_target: (int) the reward target indicating termination of the algorithm
        :param log_interval: (int) logging frequency
        :param log_to_file: (bool) log to file or not
        """
        logger.configure("ES", self.env_id, log_to_file)

        MPS = 2  # meta-population size
        meta_population = [
            FeedForwardNetwork(self.env, hidden_sizes=self.hidden_sizes)
            for _ in range(MPS)
        ]

        pool = mp.Pool(self.num_threads) if self.num_threads > 1 else None
        start_time = time.time()

        archive = []
        delta_reward_buffer = deque(maxlen=10)

        novelties = []
        for iteration in range(int(total_timesteps)):
            population = self._get_population()
            if len(archive) > 0:
                novelties = []
                S = np.minimum(self.K, len(archive))
                for model in meta_population:
                    b_pi_theta = self.get_behavior_char(
                        model.get_weights(), self.env)
                    distance = self.get_kNN(archive, b_pi_theta, S)
                    novelty = distance / S
                    if novelty <= 1e-3:
                        novelty = 5e-3
                    novelties.append(novelty)

                probs = self.calc_noveltiy_distribution(novelties)

                # Normalize so the probabilities sum to exactly one;
                # np.random.choice rejects weights that are off by rounding error
                probs = np.array(probs)
                probs /= probs.sum()
                # Select the next brain based on the novelty probabilities
                brain_idx = np.random.choice(list(range(MPS)), p=probs)
                model = meta_population[brain_idx]
                novelty = novelties[brain_idx]

                self.model.set_weights(model.get_weights())
                rewards = self._get_rewards(pool, population)
                self._update_weights(rewards, population, novelty)
                meta_population[brain_idx].set_weights(
                    self.model.get_weights())
            else:
                brain_idx = np.random.randint(0, MPS)
                model = meta_population[brain_idx]
                novelty = 1

                self.model.set_weights(model.get_weights())
                rewards = self._get_rewards(pool, population)
                self._update_weights(rewards, population, novelty)
                meta_population[brain_idx].set_weights(
                    self.model.get_weights())

            mean_reward_batch = np.mean(rewards)
            reward_gradient_mean = np.mean(delta_reward_buffer)

            r_koeff = abs(mean_reward_batch - reward_gradient_mean)

            if iteration % 5 == 0:
                if r_koeff < self.nsr_plateu:
                    self.novelty_param = np.minimum(
                        self.nsr_range[1],
                        self.novelty_param + self.nsr_update)
                else:
                    self.novelty_param = np.maximum(
                        self.nsr_range[0],
                        self.novelty_param - self.nsr_update)

            delta_reward_buffer.append(mean_reward_batch)

            b_pix = self.get_behavior_char(self.weights, self.env)
            # Append the new behavior characterization to the shared archive
            archive.append(b_pix)

            self.rewards.append(self.evaluate(self.weights, self.env))

            if (iteration + 1) % log_interval == 0:
                logger.record("iteration", iteration + 1)
                logger.record("reward", np.mean(self.rewards))
                logger.record("novelty", np.mean(novelties))
                logger.record("n_koeff", self.novelty_param)
                logger.record("total_time", time.time() - start_time)
                logger.dump(step=iteration + 1)
            if reward_target is not None and np.mean(
                    self.rewards) > reward_target:
                print("Solved!")
                logger.record("iteration", iteration + 1)
                logger.record("reward", np.mean(self.rewards))
                logger.record("total_time", time.time() - start_time)
                logger.dump(step=iteration + 1)
                break
        if pool is not None:
            pool.close()
            pool.join()
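
The selection step inside the loop (novelty scores -> sampling distribution -> brain_idx) fits in a few lines. A sketch under the assumption that calc_noveltiy_distribution returns unnormalized weights; select_by_novelty is an illustrative name:

import numpy as np

def select_by_novelty(novelties, rng=np.random):
    # Normalize raw novelty scores into probabilities and sample one index;
    # np.random.choice requires the weights to sum exactly to 1.
    probs = np.asarray(novelties, dtype=np.float64)
    probs = probs / probs.sum()
    return rng.choice(len(probs), p=probs)

idx = select_by_novelty([0.02, 0.06])   # index 1 is drawn ~75% of the time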
Example #5
    def runCommand(self, cmd):
        logger.record(self.user, cmd)            # add pre-logging
        ListMenu.runCommand(self, cmd)           # do normal list runCommand
Example #6
    def runCommand(self, cmd):
        logger.record(self.user, cmd)  # add pre-logging
        ListMenu.runCommand(self, cmd)  # do normal list runCommand
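
Examples #5 and #6 are the same pattern: subclass a menu class and wrap runCommand with a logging call before delegating to the parent. A self-contained sketch with stand-in classes (ListMenu and the print-based log below are illustrative stubs, not the original modules):

class ListMenu:
    def __init__(self, user, menu):
        self.user, self.menu = user, menu

    def runCommand(self, cmd):
        self.menu[cmd]()                      # normal dispatch to a menu function

class LoggedListMenu(ListMenu):
    def runCommand(self, cmd):
        print('LOG:', self.user, cmd)         # pre-logging hook (logger.record above)
        ListMenu.runCommand(self, cmd)        # then the normal dispatch

LoggedListMenu('bob', {'hello': lambda: print('hi')}).runCommand('hello')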
Example #7
    def learn(self,
              total_timesteps,
              log_interval=5,
              reward_target=None,
              log_to_file=False):
        """
        Initiate the training of the algorithm.

        :param total_timesteps: (int)   total number of timesteps the agent is to run for
        :param log_interval: (int)      how often to perform logging
        :param reward_target: (int)     reaching the reward target stops training early
        :param log_to_file: (bool)      specify whether output ought to be logged
        """
        logger.configure("ICM", self.env_id, log_to_file)
        start_time = time.time()
        iteration = 0

        while self.num_timesteps < total_timesteps:
            progress = round(self.num_timesteps / total_timesteps * 100, 2)
            self.collect_samples()

            iteration += 1
            if log_interval is not None and iteration % log_interval == 0:
                logger.record("Progress", str(progress) + '%')
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/num_episodes", self.num_episodes)
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/fps", fps)
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)

            self.train()

            if reward_target is not None and len(self.ep_info_buffer) > 0 and np.mean(
                    [ep_info["r"]
                     for ep_info in self.ep_info_buffer]) > reward_target:
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/num_episodes", self.num_episodes)
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/fps", fps)
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)
                break
        return self
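
Stripped of the logging detail, the control flow of learn() is a simple alternation of collection and training. A condensed sketch (agent stands in for the class these methods live on, which is assumed to expose collect_samples(), train() and num_timesteps as in the examples above):

import time

def training_loop(agent, total_timesteps, log_interval=5):
    start, iteration = time.time(), 0
    while agent.num_timesteps < total_timesteps:
        agent.collect_samples()        # fill the rollout buffer (Example #1)
        agent.train()                  # policy + intrinsic-module update
        iteration += 1
        if iteration % log_interval == 0:
            fps = int(agent.num_timesteps / (time.time() - start))
            print(f'steps={agent.num_timesteps} fps={fps}')
    return agent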
Example #8
    def train(self):
        """
        Use the collected data from the buffer to train the policy network
        """
        total_losses, policy_losses, value_losses, entropy_losses, icm_losses = [], [], [], [], []

        inv_criterion = self.action_converter.get_loss()

        for epoch in range(self.n_epochs):
            for batch in self.rollout.get(self.batch_size):
                observations = batch.observations
                actions = batch.actions
                old_log_probs = batch.old_log_probs
                old_values = batch.old_values
                advantages = batch.advantages
                returns = batch.returns

                state_values, action_log_probs, entropy = self.policy.evaluate(
                    observations, actions)

                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)
                ratio = torch.exp(action_log_probs - old_log_probs)

                # Surrogate loss
                surr_loss_1 = advantages * ratio
                surr_loss_2 = advantages * torch.clamp(
                    ratio, 1 - self.clip_range, 1 + self.clip_range)
                policy_loss = -torch.min(surr_loss_1, surr_loss_2).mean()

                # Clipped value loss
                state_values_clipped = old_values + (
                    state_values - old_values).clamp(-self.clip_range,
                                                     self.clip_range)
                value_loss = F.mse_loss(returns, state_values).mean()
                value_loss_clipped = F.mse_loss(returns,
                                                state_values_clipped).mean()
                value_loss = torch.max(value_loss, value_loss_clipped).mean()
                # ICM loss: forward-model error + inverse-model error
                actions_hat, next_features, next_features_hat = self.intrinsic_module(
                    observations[:-1], observations[1:], actions[:-1])

                forward_loss = F.mse_loss(next_features, next_features_hat)
                inverse_loss = inv_criterion(
                    actions_hat, self.action_converter.action(actions[:-1]))
                icm_loss = (
                    1 - self.beta) * inverse_loss + self.beta * forward_loss

                entropy_loss = -torch.mean(entropy)

                loss = self.policy_weight * (
                    policy_loss + self.vf_coef * value_loss +
                    self.ent_coef * entropy_loss) + icm_loss

                self.optimizer.zero_grad()
                self.icm_optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.net.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()
                self.icm_optimizer.step()

                total_losses.append(loss.item())
                policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())
                entropy_losses.append(entropy_loss.item())
                icm_losses.append(icm_loss.item())

        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(policy_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/total_loss", np.mean(total_losses))
        logger.record("train/icm_loss", np.mean(icm_losses))

        self._n_updates += self.n_epochs
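
The ICM term combines an inverse-model loss and a forward-model loss weighted by beta. A standalone sketch, assuming a discrete action space so cross-entropy serves as the inverse criterion (the example instead takes the criterion from action_converter.get_loss()):

import torch
import torch.nn.functional as F

def icm_loss(actions_hat, actions_true, next_features, next_features_hat,
             beta=0.2, inv_criterion=F.cross_entropy):
    # Curiosity loss as combined above: (1 - beta) * inverse + beta * forward
    forward_loss = F.mse_loss(next_features_hat, next_features)   # forward-model error
    inverse_loss = inv_criterion(actions_hat, actions_true)       # inverse-model error
    return (1 - beta) * inverse_loss + beta * forward_loss

# Toy shapes: batch of 8 transitions, 4 discrete actions, 16-dim feature embedding
logits = torch.randn(8, 4, requires_grad=True)
true_actions = torch.randint(0, 4, (8,))
feats = torch.randn(8, 16)
feats_hat = torch.randn(8, 16, requires_grad=True)
icm_loss(logits, true_actions, feats, feats_hat).backward()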
Example #9
    def train(self):
        """
        Use the collected data from the buffer to train the policy network
        """
        total_losses, policy_losses, value_losses, entropy_losses, intrinsic_losses = [], [], [], [], []
        rnd_trained = False
        for epoch in range(self.n_epochs):
            for batch in self.rollout.get(self.batch_size):
                observations = batch.observations
                actions = batch.actions
                old_log_probs = batch.old_log_probs
                old_values = batch.old_values
                old_int_values = batch.int_values
                advantages = batch.advantages
                int_advantages = batch.int_advantages
                returns = batch.returns
                int_returns = batch.int_returns

                # Get values and action probabilities using the updated policy on gathered observations
                state_values, int_values, action_log_probs, entropy = self.policy.evaluate(
                    observations, actions)

                # Normalize batch advantages
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)
                int_advantages = (int_advantages - int_advantages.mean()) / (
                    int_advantages.std() + 1e-8)

                advantages = advantages + int_advantages

                # Compute the policy-gradient ratio of current action probs over previous ones
                ratio = torch.exp(action_log_probs - old_log_probs)

                # Compute surrogate loss
                surr_loss_1 = advantages * ratio
                surr_loss_2 = advantages * torch.clamp(
                    ratio, 1 - self.clip_range, 1 + self.clip_range)
                policy_loss = -torch.min(surr_loss_1, surr_loss_2).mean()

                # Clip state values for stability
                state_values_clipped = old_values + (
                    state_values - old_values).clamp(-self.clip_range,
                                                     self.clip_range)
                value_loss = F.mse_loss(returns, state_values).mean()
                value_loss_clipped = F.mse_loss(returns,
                                                state_values_clipped).mean()
                value_loss = torch.max(value_loss, value_loss_clipped).mean()

                # Clip intrinsic state values for stability
                int_values_clipped = old_int_values + (
                    int_values - old_int_values).clamp(-self.clip_range,
                                                       self.clip_range)
                int_value_loss = F.mse_loss(int_returns, int_values).mean()
                int_value_loss_clipped = F.mse_loss(int_returns,
                                                    int_values_clipped).mean()
                int_value_loss = torch.max(int_value_loss,
                                           int_value_loss_clipped).mean()

                # Compute entropy loss
                entropy_loss = -torch.mean(entropy)

                # Total loss
                loss = (policy_loss + self.ent_coef * entropy_loss
                        + self.vf_coef * value_loss
                        + self.int_vf_coef * int_value_loss)

                # Perform optimization
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.net.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()

                # Train the RND predictor on roughly a quarter of the minibatches
                if np.random.rand() < 0.25:
                    self.train_rnd(batch)

                total_losses.append(loss.item())
                policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())
                entropy_losses.append(entropy_loss.item())
                intrinsic_losses.append(int_value_loss.item())
            rnd_trained = True

        logger.record("train/intrinsic_loss", np.mean(intrinsic_losses))
        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(policy_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/total_loss", np.mean(total_losses))

        self._n_updates += self.n_epochs
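
Both value heads above use the same clipping trick. Pulled out as a helper (illustrative name, shapes and reduction simplified):

import torch
import torch.nn.functional as F

def clipped_value_loss(values, old_values, returns, clip_range=0.2):
    # Take the max of clipped and unclipped MSE so a single update cannot move
    # the value head far from its previous prediction.
    values_clipped = old_values + (values - old_values).clamp(-clip_range, clip_range)
    return torch.max(F.mse_loss(values, returns),
                     F.mse_loss(values_clipped, returns))

v_old = torch.tensor([0.5, 0.1, 0.9])
v_new = torch.tensor([0.9, 0.2, 0.3], requires_grad=True)
ret = torch.tensor([1.0, 0.0, 1.0])
clipped_value_loss(v_new, v_old, ret).backward()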