def collect_samples(self):
    """
    Collect one full rollout, as determined by the nstep parameter, and add it to the buffer
    """
    assert self.last_obs is not None
    rollout_step = 0
    self.rollout.reset()
    # For logging
    test_int_rewards = []

    while rollout_step < self.nstep:
        with torch.no_grad():
            # Convert to pytorch tensor
            actions, values, log_probs = self.policy.act(self.last_obs)
        obs, rewards, dones, infos = self.env.step(actions.numpy())
        if any(dones):
            self.num_episodes += sum(dones)
        rollout_step += 1
        self.num_timesteps += self.num_envs
        self.update_info_buffer(infos)

        int_rewards = self.intrinsic_module.int_reward(
            torch.Tensor(self.last_obs), torch.Tensor(obs), actions)
        rewards = (1 - self.int_rew_integration) * rewards \
            + self.int_rew_integration * int_rewards.detach().numpy()
        # For logging
        test_int_rewards.append(int_rewards.mean().item())

        actions = actions.reshape(self.num_envs,
                                  self.action_converter.action_output)
        log_probs = log_probs.reshape(self.num_envs,
                                      self.action_converter.action_output)
        self.rollout.add(self.last_obs, actions, rewards, values, dones,
                         log_probs)
        self.last_obs = obs

    logger.record("rollout/mean_int_reward",
                  np.round(np.mean(np.array(test_int_rewards)), 10))
    self.rollout.compute_returns_and_advantages(values, dones=dones)
    return True
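# Illustrative sketch (hypothetical helper, not part of the class above) of the
# reward blending performed in collect_samples: with an int_rew_integration of
# 0.1, a step with extrinsic reward 1.0 and intrinsic reward 0.5 yields
# 0.9 * 1.0 + 0.1 * 0.5 = 0.95. Works on scalars or numpy arrays alike.
def blend_rewards(extrinsic, intrinsic, int_rew_integration=0.1):
    return (1 - int_rew_integration) * extrinsic + int_rew_integration * intrinsic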
def compute_returns_and_advantages(self, last_value, last_int_value, dones):
    """
    Post-processing step: compute the returns (sum of discounted rewards) and GAE advantage.
    Adapted from Stable-Baselines PPO2.

    Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
    to compute the advantage. To obtain the vanilla advantage (A(s) = R - V(s)),
    where R is the discounted reward with value bootstrap,
    set ``gae_lambda=1.0`` during initialization.

    :param last_value: (th.Tensor) extrinsic value estimate for the last observation
    :param last_int_value: (th.Tensor) intrinsic value estimate for the last observation
    :param dones: (np.ndarray)
    """
    logger.record("rollout/mean_int_reward", np.mean(self.int_rewards))
    last_value = last_value.clone().cpu().numpy().flatten()
    last_int_value = last_int_value.clone().cpu().numpy().flatten()

    last_gae_lam = 0
    int_last_gae_lam = 0
    for step in reversed(range(self.buffer_size)):
        if step == self.buffer_size - 1:
            next_non_terminal = 1.0 - dones
            next_value = last_value
            next_int_values = last_int_value
        else:
            next_non_terminal = 1.0 - self.masks[step + 1]
            next_value = self.values[step + 1]
            next_int_values = self.int_values[step + 1]
        # Extrinsic advantage: episodic, masked at terminal states
        delta = self.rewards[step] + self.gamma * next_value * next_non_terminal \
            - self.values[step]
        last_gae_lam = delta + self.gamma * self.gae_lam * next_non_terminal * last_gae_lam
        self.advantages[step] = last_gae_lam
        # Intrinsic advantage: non-episodic, so it is not masked at terminal states
        int_delta = self.int_rewards[step] + self.int_gamma * next_int_values \
            - self.int_values[step]
        int_last_gae_lam = int_delta + self.int_gamma * self.gae_lam * int_last_gae_lam
        self.int_advantages[step] = int_last_gae_lam

    self.returns = self.advantages + self.values
    self.int_returns = self.int_advantages + self.int_values
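# Illustrative sketch (hypothetical helper, not part of the buffer class): a
# minimal standalone GAE computation over 1-D numpy arrays, following the same
# recursion as the extrinsic branch above. Here `dones[t]` flags that the
# transition at step t ended the episode, which cuts off the bootstrap from
# step t + 1 and the propagation of later advantages.
def gae_sketch(rewards, values, dones, last_value, gamma=0.99, gae_lam=0.95):
    advantages = np.zeros(len(rewards))
    last_gae_lam = 0.0
    for step in reversed(range(len(rewards))):
        next_value = last_value if step == len(rewards) - 1 else values[step + 1]
        next_non_terminal = 1.0 - dones[step]
        delta = rewards[step] + gamma * next_value * next_non_terminal - values[step]
        last_gae_lam = delta + gamma * gae_lam * next_non_terminal * last_gae_lam
        advantages[step] = last_gae_lam
    returns = advantages + np.asarray(values)
    return advantages, returns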
def interact_dict(menu, logFlag=0, secFlag=0):            # menu + extension flags
    import os, security, logger                           # utility modules
    user = os.environ['USER']
    if secFlag:                                           # any allowed?
        for name in menu.keys():
            if security.allow(name, user):
                break
        else:
            print("You're not authorized for any menu selections")
            return
    while True:
        for name in menu.keys():                          # show legal selections
            if (not secFlag) or security.allow(name, user):
                print('\t' + name)
        tool = input('?')
        if logFlag:
            logger.record(user, tool)                     # log it, validate it
        if secFlag and not security.allow(tool, user):
            print("You're not authorized for this selection - try again")
        else:
            try:
                menu[tool]()                              # run function
            except KeyError:
                print('what? - try again')                # key not found
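# Hypothetical usage sketch for interact_dict: the menu maps selection names to
# zero-argument callables. The handlers below are made up for illustration, and
# the security/logger modules imported inside interact_dict are assumed to be
# importable even when their checks are switched off via the flags.
import sys

def open_file():
    print('opening file...')

def change_dir():
    print('changing directory...')

if __name__ == '__main__':
    menu = {'open': open_file, 'cd': change_dir, 'quit': sys.exit}
    interact_dict(menu, logFlag=0, secFlag=0)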
def run(self, total_timesteps, reward_target=None, log_interval=1, log_to_file=False):
    """
    Run the algorithm

    :param total_timesteps: (int) total timesteps to run the environment for
    :param reward_target: (int) the reward target indicating termination of the algorithm
    :param log_interval: (int) logging frequency
    :param log_to_file: (bool) log to file or not
    """
    logger.configure("ES", self.env_id, log_to_file)
    MPS = 2  # meta-population size
    meta_population = [
        FeedForwardNetwork(self.env, hidden_sizes=self.hidden_sizes)
        for _ in range(MPS)
    ]
    pool = mp.Pool(self.num_threads) if self.num_threads > 1 else None
    start_time = time.time()
    archive = []
    delta_reward_buffer = deque(maxlen=10)
    novelties = []

    for iteration in range(int(total_timesteps)):
        population = self._get_population()
        if len(archive) > 0:
            novelties = []
            S = np.minimum(self.K, len(archive))
            for model in meta_population:
                b_pi_theta = self.get_behavior_char(model.get_weights(), self.env)
                distance = self.get_kNN(archive, b_pi_theta, S)
                novelty = distance / S
                if novelty <= 1e-3:
                    novelty = 5e-3
                novelties.append(novelty)

            probs = self.calc_noveltiy_distribution(novelties)
            probs = np.array(probs)
            # Normalize so the probabilities sum exactly to one;
            # np.random.choice raises an error on small rounding drift otherwise.
            probs /= probs.sum()
            # Select the next brain to train based on the novelty probabilities
            brain_idx = np.random.choice(list(range(MPS)), p=probs)
            model = meta_population[brain_idx]
            novelty = novelties[brain_idx]

            self.model.set_weights(model.get_weights())
            rewards = self._get_rewards(pool, population)
            self._update_weights(rewards, population, novelty)
            meta_population[brain_idx].set_weights(self.model.get_weights())
        else:
            brain_idx = np.random.randint(0, MPS)
            model = meta_population[brain_idx]
            novelty = 1
            self.model.set_weights(model.get_weights())
            rewards = self._get_rewards(pool, population)
            self._update_weights(rewards, population, novelty)
            meta_population[brain_idx].set_weights(self.model.get_weights())

        mean_reward_batch = np.mean(rewards)
        reward_gradient_mean = np.mean(delta_reward_buffer)
        r_koeff = abs(mean_reward_batch - reward_gradient_mean)
        if iteration % 5 == 0:
            if r_koeff < self.nsr_plateu:
                self.novelty_param = np.minimum(
                    self.nsr_range[1], self.novelty_param + self.nsr_update)
            else:
                self.novelty_param = np.maximum(
                    self.nsr_range[0], self.novelty_param - self.nsr_update)
        delta_reward_buffer.append(mean_reward_batch)

        # Append the new behavior characterization of the selected brain to the archive
        b_pix = self.get_behavior_char(self.weights, self.env)
        archive.append(b_pix)

        self.rewards.extend([self.evaluate(self.weights, self.env)])

        if (iteration + 1) % log_interval == 0:
            logger.record("iteration", iteration + 1)
            logger.record("reward", np.mean(self.rewards))
            logger.record("novelty", np.mean(novelties))
            logger.record("n_koeff", self.novelty_param)
            logger.record("total_time", time.time() - start_time)
            logger.dump(step=iteration + 1)
        if reward_target is not None and np.mean(self.rewards) > reward_target:
            print("Solved!")
            logger.record("iteration", iteration + 1)
            logger.record("reward", np.mean(self.rewards))
            logger.record("total_time", time.time() - start_time)
            logger.dump(step=iteration + 1)
            break

    if pool is not None:
        pool.close()
        pool.join()
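# Hedged sketch of the kNN novelty score used above (an assumption about what
# get_kNN returns, not the repository's implementation): the novelty of a
# behavior characterization is the mean Euclidean distance to its k nearest
# neighbours in the archive, which matches `distance / S` above when get_kNN
# returns the summed distance to the S closest archive entries.
def knn_novelty(archive, behavior, k):
    dists = np.sort([np.linalg.norm(np.asarray(behavior) - np.asarray(b))
                     for b in archive])
    k = min(k, len(dists))
    return float(dists[:k].sum() / k)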
def runCommand(self, cmd):
    logger.record(self.user, cmd)        # add pre-logging
    ListMenu.runCommand(self, cmd)       # do the normal ListMenu runCommand
def learn(self, total_timesteps, log_interval=5, reward_target=None, log_to_file=False):
    """
    Initiate the training of the algorithm.

    :param total_timesteps: (int) total number of timesteps the agent is to run for
    :param log_interval: (int) how often to perform logging
    :param reward_target: (int) reaching the reward target stops training early
    :param log_to_file: (bool) specify whether output ought to be logged
    """
    logger.configure("ICM", self.env_id, log_to_file)
    start_time = time.time()
    iteration = 0

    while self.num_timesteps < total_timesteps:
        progress = round(self.num_timesteps / total_timesteps * 100, 2)
        self.collect_samples()
        iteration += 1
        if log_interval is not None and iteration % log_interval == 0:
            logger.record("Progress", str(progress) + '%')
            logger.record("time/total timesteps", self.num_timesteps)
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record(
                    "rollout/ep_rew_mean",
                    np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/num_episodes", self.num_episodes)
            fps = int(self.num_timesteps / (time.time() - start_time))
            logger.record("time/fps", fps)
            logger.record("time/total_time", (time.time() - start_time))
            logger.dump(step=self.num_timesteps)

        self.train()

        if reward_target is not None and np.mean(
                [ep_info["r"] for ep_info in self.ep_info_buffer]) > reward_target:
            logger.record("time/total timesteps", self.num_timesteps)
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record(
                    "rollout/ep_rew_mean",
                    np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/num_episodes", self.num_episodes)
            fps = int(self.num_timesteps / (time.time() - start_time))
            logger.record("time/fps", fps)
            logger.record("time/total_time", (time.time() - start_time))
            logger.dump(step=self.num_timesteps)
            break

    return self
def train(self):
    """
    Use the collected data from the buffer to train the policy network
    """
    total_losses, policy_losses, value_losses, entropy_losses, icm_losses = [], [], [], [], []
    inv_criterion = self.action_converter.get_loss()
    for epoch in range(self.n_epochs):
        for batch in self.rollout.get(self.batch_size):
            observations = batch.observations
            actions = batch.actions
            old_log_probs = batch.old_log_probs
            old_values = batch.old_values
            advantages = batch.advantages
            returns = batch.returns

            # Evaluate the current policy on the gathered observations
            state_values, action_log_probs, entropy = self.policy.evaluate(
                observations, actions)

            # Normalize batch advantages
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Ratio of current action probabilities over the old ones
            ratio = torch.exp(action_log_probs - old_log_probs)

            # Surrogate loss
            surr_loss_1 = advantages * ratio
            surr_loss_2 = advantages * torch.clamp(ratio, 1 - self.clip_range,
                                                   1 + self.clip_range)
            policy_loss = -torch.min(surr_loss_1, surr_loss_2).mean()

            # Clipped value loss
            state_values_clipped = old_values + (
                state_values - old_values).clamp(-self.clip_range, self.clip_range)
            value_loss = F.mse_loss(returns, state_values).mean()
            value_loss_clipped = F.mse_loss(returns, state_values_clipped).mean()
            value_loss = torch.max(value_loss, value_loss_clipped).mean()

            # ICM loss: the inverse model predicts the taken action,
            # the forward model predicts the next state's features
            actions_hat, next_features, next_features_hat = self.intrinsic_module(
                observations[:-1], observations[1:], actions[:-1])
            forward_loss = F.mse_loss(next_features, next_features_hat)
            inverse_loss = inv_criterion(
                actions_hat, self.action_converter.action(actions[:-1]))
            icm_loss = (1 - self.beta) * inverse_loss + self.beta * forward_loss

            entropy_loss = -torch.mean(entropy)

            loss = self.policy_weight * (policy_loss + self.vf_coef * value_loss +
                                         self.ent_coef * entropy_loss) + icm_loss

            self.optimizer.zero_grad()
            self.icm_optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.policy.net.parameters(),
                                           self.max_grad_norm)
            self.optimizer.step()
            self.icm_optimizer.step()

            total_losses.append(loss.item())
            policy_losses.append(policy_loss.item())
            value_losses.append(value_loss.item())
            entropy_losses.append(entropy_loss.item())
            icm_losses.append(icm_loss.item())

    logger.record("train/entropy_loss", np.mean(entropy_losses))
    logger.record("train/policy_gradient_loss", np.mean(policy_losses))
    logger.record("train/value_loss", np.mean(value_losses))
    logger.record("train/total_loss", np.mean(total_losses))
    logger.record("train/icm_loss", np.mean(icm_losses))
    self._n_updates += self.n_epochs
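# Hedged sketch (an assumption, not the repository's intrinsic_module code) of
# how an ICM-style intrinsic reward is typically derived from the same forward
# model used in the loss above: the bonus is proportional to the forward
# model's prediction error on the next state's features, so poorly predicted
# (novel) transitions receive larger rewards. `eta` is a hypothetical scale.
def icm_intrinsic_reward(next_features, next_features_hat, eta=0.5):
    # per-sample squared prediction error of the forward model
    return eta * 0.5 * (next_features_hat - next_features).pow(2).sum(dim=-1)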
def train(self):
    """
    Use the collected data from the buffer to train the policy network
    """
    total_losses, policy_losses, value_losses, entropy_losses, intrinsic_losses = [], [], [], [], []
    rnd_trained = False
    for epoch in range(self.n_epochs):
        for batch in self.rollout.get(self.batch_size):
            observations = batch.observations
            actions = batch.actions
            old_log_probs = batch.old_log_probs
            old_values = batch.old_values
            old_int_values = batch.int_values
            advantages = batch.advantages
            int_advantages = batch.int_advantages
            returns = batch.returns
            int_returns = batch.int_returns

            # Get values and action probabilities using the updated policy on gathered observations
            state_values, int_values, action_log_probs, entropy = self.policy.evaluate(
                observations, actions)

            # Normalize batch advantages and combine extrinsic and intrinsic streams
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
            int_advantages = (int_advantages - int_advantages.mean()) / (
                int_advantages.std() + 1e-8)
            advantages = advantages + int_advantages

            # Compute policy gradient ratio of current action probabilities over previous ones
            ratio = torch.exp(action_log_probs - old_log_probs)

            # Compute surrogate loss
            surr_loss_1 = advantages * ratio
            surr_loss_2 = advantages * torch.clamp(ratio, 1 - self.clip_range,
                                                   1 + self.clip_range)
            policy_loss = -torch.min(surr_loss_1, surr_loss_2).mean()

            # Clip extrinsic state values for stability
            state_values_clipped = old_values + (
                state_values - old_values).clamp(-self.clip_range, self.clip_range)
            value_loss = F.mse_loss(returns, state_values).mean()
            value_loss_clipped = F.mse_loss(returns, state_values_clipped).mean()
            value_loss = torch.max(value_loss, value_loss_clipped).mean()

            # Clip intrinsic state values for stability
            int_values_clipped = old_int_values + (
                int_values - old_int_values).clamp(-self.clip_range, self.clip_range)
            int_value_loss = F.mse_loss(int_returns, int_values).mean()
            int_value_loss_clipped = F.mse_loss(int_returns, int_values_clipped).mean()
            int_value_loss = torch.max(int_value_loss, int_value_loss_clipped).mean()

            # Compute entropy loss
            entropy_loss = -torch.mean(entropy)

            # Total loss
            loss = policy_loss + self.ent_coef * entropy_loss \
                + self.vf_coef * value_loss + self.int_vf_coef * int_value_loss

            # Perform optimization
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.policy.net.parameters(),
                                           self.max_grad_norm)
            self.optimizer.step()

            # Train the RND predictor on roughly a quarter of the minibatches
            if np.random.rand() < 0.25:
                self.train_rnd(batch)

            total_losses.append(loss.item())
            policy_losses.append(policy_loss.item())
            value_losses.append(value_loss.item())
            entropy_losses.append(entropy_loss.item())
            intrinsic_losses.append(int_value_loss.item())
            rnd_trained = True

    logger.record("train/intrinsic_loss", np.mean(intrinsic_losses))
    logger.record("train/entropy_loss", np.mean(entropy_losses))
    logger.record("train/policy_gradient_loss", np.mean(policy_losses))
    logger.record("train/value_loss", np.mean(value_losses))
    logger.record("train/total_loss", np.mean(total_losses))
    self._n_updates += self.n_epochs
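# Hedged sketch (an assumption, not the repository's train_rnd) of how the RND
# predictor is typically updated on a minibatch: minimize the MSE between the
# trainable predictor's features and the features of the frozen, randomly
# initialized target network for the batch observations. The `predictor`,
# `target` and `rnd_optimizer` arguments are hypothetical.
def train_rnd_sketch(predictor, target, rnd_optimizer, observations):
    with torch.no_grad():
        target_features = target(observations)      # fixed random network
    predictor_features = predictor(observations)    # trainable network
    rnd_loss = F.mse_loss(predictor_features, target_features)
    rnd_optimizer.zero_grad()
    rnd_loss.backward()
    rnd_optimizer.step()
    return rnd_loss.item()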