def step(self) -> Tuple[List[float], List[float]]:
    """
    Optimizes the entire population with antithetic sampling.
    Returns training population rewards and timesteps for the current step.
    """
    perturbs = self._sample_pertrubations(self.num_agents)
    perturbs_rew = []
    perturbs_timesteps = []
    centroid_parameters = unroll_parameters(self.centroid.parameters())
    report_rew = []

    for ind in range(self.num_agents):
        perturb = perturbs[ind]

        # Initialize the agent with positively perturbed parameters.
        self.agent.policy.init_from_parameters(centroid_parameters + perturb)
        reward, timesteps = self.agent.train_rollout(
            self.num_trials, self.env_name, self.seed)

        # Initialize the agent with the mirrored (anti-perturbed) parameters.
        self.agent.policy.init_from_parameters(centroid_parameters - perturb)
        reward_anti, timesteps_anti = self.agent.train_rollout(
            self.num_trials, self.env_name, self.seed)

        perturbs_rew.append(reward - reward_anti)
        perturbs_timesteps.append(timesteps)
        perturbs_timesteps.append(timesteps_anti)
        report_rew.append(reward)
        report_rew.append(reward_anti)

    # Transform rewards as in Salimans et al. (2017).
    transformed_rews = compute_centered_ranks(np.array(perturbs_rew))

    # Gradient ascent.
    perturbs = np.stack(perturbs)
    total_grad = torch.zeros(self.num_parameters)
    for ind in range(self.num_agents):
        grad = torch.tensor(transformed_rews[ind] * perturbs[ind])
        total_grad += grad * self.lr / (2 * self.num_agents * self.weights_std**2)
    self.grads.append(total_grad)
    self.grads.update_orthogonal()
    centroid_parameters += total_grad

    # Update the centroid.
    self.centroid.init_from_parameters(centroid_parameters)
    # Report the norm of the full update, not the last per-agent term.
    print("Gradient norm: {}".format(torch.norm(total_grad, p=2)))
    return report_rew, perturbs_timesteps
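# For reference, a minimal sketch of the rank transform called above. The
# snippets in this section credit OpenAI's es-starter utilities for it, and
# this version follows that code: raw returns are replaced by their ranks,
# scaled into [-0.5, 0.5], which makes the update invariant to reward scale
# and robust to outlier returns.
import numpy as np

def compute_ranks(x):
    """Ranks in [0, len(x)) for a 1-D array (no averaging of ties)."""
    assert x.ndim == 1
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks

def compute_centered_ranks(x):
    """Map an array of returns (any shape) to centered ranks in [-0.5, 0.5]."""
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y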
def run_es(self):
    """ Runs Evolution Strategies.

    Tricks used:
        - Antithetic (i.e. mirrored) sampling.
        - Rank transformation, using OpenAI's code.

    Tricks avoided:
        - Fixed Gaussian block. I like to just regenerate here.
        - Virtual batch normalization, seems to be only for Atari games.
        - Weight decay. Not sure how to do this.
        - Action discretization. For now, it adds extra complexity.

    Final weights are saved and can be pre-loaded elsewhere.
    """
    args = self.args
    t_start = time.time()

    for i in range(args.es_iters):
        if (i % args.log_every_t_iter == 0):
            print("\n************ Iteration %i ************" % i)
        stats = defaultdict(list)

        # Set stuff up for perturbing weights and determining fitness.
        weights_old = self.sess.run(self.weights_v)  # Shape (numw,)
        eps_nw = np.random.randn(args.npop // 2, self.num_ws)
        scores_n2 = []

        for j in range(args.npop // 2):
            # Mirrored sampling, positive case, +eps_j.
            weights_new_pos = weights_old + args.sigma * eps_nw[j]
            self.sess.run(self.set_params_op,
                          feed_dict={self.new_weights_v: weights_new_pos})
            rews_pos = self._compute_return()

            # Mirrored sampling, negative case, -eps_j.
            weights_new_neg = weights_old - args.sigma * eps_nw[j]
            self.sess.run(self.set_params_op,
                          feed_dict={self.new_weights_v: weights_new_neg})
            rews_neg = self._compute_return()

            scores_n2.append([rews_pos, rews_neg])

        # Determine the new weights based on OpenAI's rank updating.
        proc_returns_n2 = utils.compute_centered_ranks(np.array(scores_n2))
        F_n = proc_returns_n2[:, 0] - proc_returns_n2[:, 1]
        grad = np.dot(eps_nw.T, F_n)

        # Apply the gradient update. TODO: Change this to Adam.
        alpha = (args.lrate_es / (args.sigma * args.npop))
        next_weights = weights_old + alpha * grad
        self.sess.run(self.set_params_op,
                      feed_dict={self.new_weights_v: next_weights})

        # Report relevant logs.
        if (i % args.log_every_t_iter == 0):
            hours = (time.time() - t_start) / (60 * 60.)
            # Test roll-outs with these new weights.
            returns = []
            for _ in range(args.test_trajs):
                returns.append(self._compute_return(test=True))
            logz.log_tabular("FinalAvgReturns", np.mean(returns))
            logz.log_tabular("FinalStdReturns", np.std(returns))
            logz.log_tabular("FinalMaxReturns", np.max(returns))
            logz.log_tabular("FinalMinReturns", np.min(returns))
            logz.log_tabular("ScoresAvg", np.mean(scores_n2))
            logz.log_tabular("ScoresStd", np.std(scores_n2))
            logz.log_tabular("ScoresMax", np.max(scores_n2))
            logz.log_tabular("ScoresMin", np.min(scores_n2))
            logz.log_tabular("TotalTimeHours", hours)
            logz.log_tabular("TotalIterations", i)
            logz.dump_tabular()

        # Save the weights so I can test them later.
        if (i % args.snapshot_every_t_iter == 0):
            itr = str(i).zfill(len(str(abs(args.es_iters))))
            with open(self.log_dir + '/snapshots/weights_' + itr + '.pkl', 'wb') as f:
                pickle.dump(next_weights, f)

    # Save the *final* weights.
    itr = str(i).zfill(len(str(abs(args.es_iters))))
    with open(self.log_dir + '/snapshots/weights_' + itr + '.pkl', 'wb') as f:
        pickle.dump(next_weights, f)
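# The TODO above asks for Adam in place of the plain SGD step. A minimal
# sketch of Adam on a flat weight vector (the FlatAdam name and interface are
# hypothetical, not part of this codebase), which could replace the
# `weights_old + alpha * grad` line:
import numpy as np

class FlatAdam:
    """Adam (Kingma & Ba, 2015) for a single flat parameter vector."""
    def __init__(self, dim, stepsize, beta1=0.9, beta2=0.999, eps=1e-8):
        self.m = np.zeros(dim, dtype=np.float32)  # first-moment estimate
        self.v = np.zeros(dim, dtype=np.float32)  # second-moment estimate
        self.t = 0
        self.stepsize, self.beta1, self.beta2, self.eps = \
            stepsize, beta1, beta2, eps

    def step(self, grad):
        """Return the increment to *add* to the weights for an ascent
        direction `grad` (so callers keep the `weights + ...` form)."""
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * grad * grad
        a = self.stepsize * np.sqrt(1 - self.beta2**self.t) \
            / (1 - self.beta1**self.t)
        return a * self.m / (np.sqrt(self.v) + self.eps)

# Usage inside the loop would then look like:
#   next_weights = weights_old + adam.step(grad / (args.sigma * args.npop))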
def step(self):
    """Run one step of ES.

    1. Kick off all actors to synchronize weights and sample data;
    2. Update the parameters of the model based on the sampled data;
    3. Update the global observation filter based on the local filters of
       all actors, and synchronize the global filter back to all actors.
    """
    num_episodes, num_timesteps = 0, 0
    results = []

    while num_episodes < self.config['min_episodes_per_batch'] or \
            num_timesteps < self.config['min_steps_per_batch']:
        # Send the sample signal to all actors.
        for q in self.actors_signal_input_queues:
            q.put({'signal': 'sample'})

        # Collect results from all actors.
        for q in self.actors_output_queues:
            result = q.get()
            results.append(result)
            # result['noisy_lengths'] is a list of lists, where each inner
            # list has length 2 (one rollout per antithetic pair member).
            num_episodes += sum(
                len(pair) for pair in result['noisy_lengths'])
            num_timesteps += sum(
                sum(pair) for pair in result['noisy_lengths'])

    all_noise_indices = []
    all_training_rewards = []
    all_training_lengths = []
    all_eval_rewards = []
    all_eval_lengths = []

    for result in results:
        all_eval_rewards.extend(result['eval_rewards'])
        all_eval_lengths.extend(result['eval_lengths'])
        all_noise_indices.extend(result['noise_indices'])
        all_training_rewards.extend(result['noisy_rewards'])
        all_training_lengths.extend(result['noisy_lengths'])

    assert len(all_eval_rewards) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_rewards) ==
            len(all_training_lengths))

    self.sample_total_episodes += num_episodes
    self.sample_total_steps += num_timesteps

    eval_rewards = np.array(all_eval_rewards)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_rewards = np.array(all_training_rewards)
    noisy_lengths = np.array(all_training_lengths)

    # Normalize rewards to centered ranks in (-0.5, 0.5).
    proc_noisy_rewards = utils.compute_centered_ranks(noisy_rewards)
    noises = [
        self.noise.get(index, self.agent.weights_total_size)
        for index in noise_indices
    ]

    # Update the parameters of the model.
    self.agent.learn(proc_noisy_rewards, noises)
    self.latest_flat_weights = self.agent.get_flat_weights()

    # Update the observation filter.
    self._update_filter()

    # Store the evaluation rewards.
    if len(all_eval_rewards) > 0:
        self.eval_rewards_stat.add(np.mean(eval_rewards))
        self.eval_lengths_stat.add(np.mean(eval_lengths))

    metrics = {
        "episodes_this_iter": noisy_lengths.size,
        "sample_total_episodes": self.sample_total_episodes,
        "sample_total_steps": self.sample_total_steps,
        "evaluate_rewards_mean": self.eval_rewards_stat.mean,
        "evaluate_steps_mean": self.eval_lengths_stat.mean,
        "timesteps_this_iter": noisy_lengths.sum(),
    }
    self.log_metrics(metrics)
    return metrics
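# The `self.noise.get(index, size)` calls above imply a shared noise table:
# actors report integer offsets instead of shipping full perturbation vectors
# over the wire. A minimal sketch in the style of OpenAI's SharedNoiseTable
# (the table size and seed below are illustrative, not this repo's values):
import numpy as np

class SharedNoiseTable:
    """One large, fixed block of Gaussian noise shared by learner and actors."""
    def __init__(self, count=250_000_000, seed=123):
        # A fixed seed lets every process rebuild the identical table, so an
        # integer index uniquely identifies a perturbation (~1 GB at float32).
        self.noise = np.random.RandomState(seed).randn(count).astype(np.float32)

    def get(self, index, dim):
        """The perturbation: a slice of length `dim` starting at `index`."""
        return self.noise[index:index + dim]

    def sample_index(self, rng, dim):
        """A random valid starting offset for a perturbation of length `dim`."""
        return rng.randint(0, len(self.noise) - dim + 1)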
def run():
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    gym.logger.set_level(40)
    env = gym.make(args.env_name)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    state_stat = RunningStat(env.observation_space.shape, eps=1e-2)
    action_space = env.action_space
    policy = Policy(state_size, action_size, args.hidden_size,
                    action_space.low, action_space.high)
    num_params = policy.num_params
    optim = Adam(num_params, args.lr)
    ray.init(num_cpus=args.num_parallel)
    return_list = []

    for epoch in range(100000):
        #####################################
        ### Rollout and Update State Stat ###
        #####################################
        policy.set_state_stat(state_stat.mean, state_stat.std)

        # Sample parameter perturbations in antithetic (mirrored) pairs.
        assert args.episodes_per_batch % 2 == 0
        diff_params = torch.empty((args.episodes_per_batch, num_params),
                                  dtype=torch.float)
        diff_params_pos = torch.randn(args.episodes_per_batch // 2,
                                      num_params) * args.noise_std
        diff_params[::2] = diff_params_pos
        diff_params[1::2] = -diff_params_pos

        rets = []
        num_episodes_popped = 0
        num_timesteps_popped = 0
        while num_episodes_popped < args.episodes_per_batch \
                and num_timesteps_popped < args.timesteps_per_batch:
            results = []
            for i in range(min(args.episodes_per_batch, 500)):
                # Perturb the policy.
                randomized_policy = deepcopy(policy)
                randomized_policy.add_params(
                    diff_params[num_episodes_popped + i])
                # Rollout.
                results.append(
                    rollout.remote(randomized_policy, args.env_name,
                                   seed=np.random.randint(0, 10000000)))
            for result in results:
                ret, timesteps, states = ray.get(result)
                rets.append(ret)
                # Update the state statistics.
                if states is not None:
                    state_stat.increment(states.sum(axis=0),
                                         np.square(states).sum(axis=0),
                                         states.shape[0])
                num_timesteps_popped += timesteps
                num_episodes_popped += 1

        rets = np.array(rets, dtype=np.float32)
        diff_params = diff_params[:num_episodes_popped]

        # Evaluate the best perturbation of this batch on test rollouts.
        best_policy_idx = np.argmax(rets)
        best_policy = deepcopy(policy)
        best_policy.add_params(diff_params[best_policy_idx])
        best_rets = [
            rollout.remote(best_policy, args.env_name,
                           seed=np.random.randint(0, 10000000),
                           calc_state_stat_prob=0.0, test=True)
            for _ in range(10)
        ]
        best_rets = np.average(ray.get(best_rets))
        print('epoch:', epoch, 'mean:', np.average(rets),
              'max:', np.max(rets), 'best:', best_rets)

        return_list.append([epoch, np.max(rets), np.average(rets), best_rets])
        with open(args.outdir + '/return.csv', 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(return_list)

        plt.figure()
        sns.lineplot(data=np.array(return_list)[:, 1:])
        plt.savefig(args.outdir + '/return.png')
        plt.close('all')

        #############
        ### Train ###
        #############
        fitness = compute_centered_ranks(rets).reshape(-1, 1)
        if args.weight_decay > 0:
            # l2_decay = args.weight_decay * ((policy.get_params() + diff_params)**2).mean(dim=1, keepdim=True).numpy()
            # Note: this is a signed mean of the parameters, not an
            # absolute-value L1 penalty; kept as in the original.
            l1_decay = args.weight_decay * (
                policy.get_params() + diff_params).mean(
                    dim=1, keepdim=True).numpy()
            fitness += l1_decay
        grad = (fitness * diff_params.numpy()).mean(axis=0)
        # The optimizer minimizes, so pass the negated ascent direction.
        policy = optim.update(policy, -grad)
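# The `state_stat.increment(sum, sumsq, count)` calls above assume a running
# observation normalizer. A minimal sketch of such a RunningStat (this is an
# assumption about its interface, not necessarily the repo's actual class):
import numpy as np

class RunningStat:
    """Running mean/std over streamed per-rollout observation sums."""
    def __init__(self, shape, eps):
        # `eps` acts as a tiny prior count so mean/std are defined from step 0.
        self.sum = np.zeros(shape, dtype=np.float64)
        self.sumsq = np.full(shape, eps, dtype=np.float64)
        self.count = eps

    def increment(self, s, ssq, c):
        self.sum += s
        self.sumsq += ssq
        self.count += c

    @property
    def mean(self):
        return self.sum / self.count

    @property
    def std(self):
        # Var[x] = E[x^2] - E[x]^2, floored to keep the normalizer stable.
        return np.sqrt(np.maximum(
            self.sumsq / self.count - self.mean**2, 1e-2))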
def run():
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    gym.logger.set_level(40)
    env = gym.make(args.env_name)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    state_stat = RunningStat(env.observation_space.shape, eps=1e-2)
    action_space = env.action_space
    policy = Policy(state_size, action_size, args.hidden_size,
                    action_space.low, action_space.high)
    num_params = policy.num_params
    es = cma.CMAEvolutionStrategy([0] * num_params, args.sigma_init,
                                  {'popsize': args.popsize})
    ray.init(num_cpus=args.num_parallel)
    return_list = []

    for epoch in range(100000):
        #####################################
        ### Rollout and Update State Stat ###
        #####################################
        solutions = np.array(es.ask(), dtype=np.float32)
        policy.set_state_stat(state_stat.mean, state_stat.std)

        rets = []
        results = []
        for i in range(args.popsize):
            # Set the policy parameters to the sampled solution.
            randomized_policy = deepcopy(policy)
            randomized_policy.set_params(solutions[i])
            # Rollout.
            results.append(
                rollout.remote(randomized_policy, args.env_name,
                               seed=np.random.randint(0, 10000000)))
        for result in results:
            ret, timesteps, states = ray.get(result)
            rets.append(ret)
            # Update the state statistics.
            if states is not None:
                state_stat.increment(states.sum(axis=0),
                                     np.square(states).sum(axis=0),
                                     states.shape[0])

        rets = np.array(rets, dtype=np.float32)

        # Evaluate the best solution of this batch on test rollouts.
        best_policy_idx = np.argmax(rets)
        best_policy = deepcopy(policy)
        best_policy.set_params(solutions[best_policy_idx])
        best_rets = [
            rollout.remote(best_policy, args.env_name,
                           seed=np.random.randint(0, 10000000),
                           calc_state_stat_prob=0.0, test=True)
            for _ in range(10)
        ]
        best_rets = np.average(ray.get(best_rets))
        print('epoch:', epoch, 'mean:', np.average(rets),
              'max:', np.max(rets), 'best:', best_rets)

        return_list.append([epoch, np.max(rets), np.average(rets), best_rets])
        with open(args.outdir + '/return.csv', 'w') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(return_list)

        plt.figure()
        sns.lineplot(data=np.array(return_list)[:, 1:])
        plt.savefig(args.outdir + '/return.png')
        plt.close('all')

        #############
        ### Train ###
        #############
        ranks = compute_centered_ranks(rets)
        fitness = ranks
        if args.weight_decay > 0:
            l2_decay = compute_weight_decay(args.weight_decay, solutions)
            fitness -= l2_decay
        # cma minimizes, so negate the fitness to convert maximize to minimize.
        es.tell(solutions, -fitness)
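# A minimal sketch of the `compute_weight_decay` helper used above, assuming
# it returns a positive per-candidate L2 penalty (consistent with the
# `fitness -= l2_decay` line; cf. the estool implementation):
import numpy as np

def compute_weight_decay(weight_decay, solutions):
    """L2 penalty for each candidate parameter vector, shape (popsize,)."""
    solutions = np.asarray(solutions)
    return weight_decay * np.mean(solutions * solutions, axis=1)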
# Update ob stats.
if policy.needs_ob_stat and result.ob_count > 0:
    ob_stat.increment(result.ob_sum, result.ob_sumsq, result.ob_count)
    ob_count_this_batch += result.ob_count

# Assemble the results.
noise_inds_n = np.concatenate(
    [r.noise_inds_n for r in curr_task_results])
returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
assert noise_inds_n.shape[0] == returns_n2.shape[0] == lengths_n2.shape[0]

# Process the returns.
if config.return_proc_mode == "centered_rank":
    proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
else:
    raise NotImplementedError(config.return_proc_mode)

# Compute and take a step.
g, count = utils.batched_weighted_sum(
    proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
    (noise.get(idx, policy.num_params) for idx in noise_inds_n),
    batch_size=500)
g /= returns_n2.size
assert (g.shape == (policy.num_params, ) and g.dtype == np.float32
        and count == len(noise_inds_n))
update_ratio = optimizer.update(-g + config.l2coeff * theta)

# Update ob stat (we're never running the policy in the master, but we
# might be snapshotting the policy).
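# `utils.batched_weighted_sum` above combines the centered-rank differences
# with their noise vectors without materializing the full (n, num_params)
# noise matrix at once. A sketch following OpenAI's es-starter utilities:
import numpy as np

def itergroups(items, group_size):
    """Yield successive chunks of `items` with at most `group_size` elements."""
    assert group_size >= 1
    group = []
    for x in items:
        group.append(x)
        if len(group) == group_size:
            yield tuple(group)
            group = []
    if group:
        yield tuple(group)

def batched_weighted_sum(weights, vecs, batch_size):
    """Compute sum_i weights[i] * vecs[i] in batches of `batch_size`,
    returning the summed vector and the number of items consumed."""
    total = 0.0
    num_items_summed = 0
    for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
                                         itergroups(vecs, batch_size)):
        assert len(batch_weights) == len(batch_vecs) <= batch_size
        # (k,) dot (k, num_params) -> (num_params,) partial sum.
        total += np.dot(np.asarray(batch_weights, dtype=np.float32),
                        np.asarray(batch_vecs, dtype=np.float32))
        num_items_summed += len(batch_weights)
    return total, num_items_summed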