def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_trainable_flat()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results = self._collect_results(theta_id,
                                    config["episodes_per_batch"],
                                    config["timesteps_per_batch"])

    curr_task_results = []
    ob_count_this_batch = 0
    # Loop over the results.
    for result in results:
        assert result.eval_length is None, "We aren't doing eval rollouts."
        assert result.noise_inds_n.ndim == 1
        assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
        assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
        assert result.returns_n2.dtype == np.float32

        result_num_eps = result.lengths_n2.size
        result_num_timesteps = result.lengths_n2.sum()
        self.episodes_so_far += result_num_eps
        self.timesteps_so_far += result_num_timesteps

        curr_task_results.append(result)
        # Update ob stats.
        if self.policy.needs_ob_stat and result.ob_count > 0:
            self.ob_stat.increment(result.ob_sum, result.ob_sumsq,
                                   result.ob_count)
            ob_count_this_batch += result.ob_count

    # Assemble the results.
    noise_inds_n = np.concatenate(
        [r.noise_inds_n for r in curr_task_results])
    returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
    lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
    assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
            lengths_n2.shape[0])

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
        (self.noise.get(idx, self.policy.num_params)
         for idx in noise_inds_n),
        batch_size=500)
    g /= returns_n2.size
    assert (g.shape == (self.policy.num_params, ) and
            g.dtype == np.float32 and
            count == len(noise_inds_n))
    update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)

    # Update ob stat (we're never running the policy in the master, but we
    # might be snapshotting the policy).
    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

    step_tend = time.time()
    tlogger.record_tabular("EpRewMean", returns_n2.mean())
    tlogger.record_tabular("EpRewStd", returns_n2.std())
    tlogger.record_tabular("EpLenMean", lengths_n2.mean())
    tlogger.record_tabular(
        "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))
    tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
    tlogger.record_tabular("ObCount", ob_count_this_batch)
    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": lengths_n2.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": lengths_n2.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "ob_count": ob_count_this_batch,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(episode_reward_mean=returns_n2.mean(),
                            episode_len_mean=lengths_n2.mean(),
                            timesteps_this_iter=lengths_n2.sum(),
                            info=info)

    return result
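
# The "centered_rank" branch above maps raw returns to ranks rescaled into
# [-0.5, 0.5], which makes the update invariant to the scale of rewards.
# The helper below is a minimal sketch of such a transform for illustration
# only; the actual utils.compute_centered_ranks may differ. It assumes numpy
# is imported as np, as elsewhere in this file.
def _centered_ranks_sketch(x):
    # Rank the flattened returns: the smallest value gets rank 0,
    # the largest gets rank x.size - 1.
    ranks = np.empty(x.size, dtype=np.float32)
    ranks[x.ravel().argsort()] = np.arange(x.size, dtype=np.float32)
    # Rescale ranks to [0, 1], then shift so they are centered at zero.
    return ranks.reshape(x.shape) / (x.size - 1) - 0.5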
def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id,
        config["episodes_per_batch"],
        config["timesteps_per_batch"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes
    self.timesteps_so_far += num_timesteps

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_indices),
        batch_size=500)
    g /= noisy_returns.size
    assert (g.shape == (self.policy.num_params, ) and
            g.dtype == np.float32 and
            count == len(noise_indices))
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(
        -g + config["l2_coeff"] * theta)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)

    step_tend = time.time()
    tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
    tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
    tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())
    tlogger.record_tabular("EpRewMean", noisy_returns.mean())
    tlogger.record_tabular("EpRewStd", noisy_returns.std())
    tlogger.record_tabular("EpLenMean", noisy_lengths.mean())
    tlogger.record_tabular("Norm", float(np.square(theta).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))
    tlogger.record_tabular("EpisodesThisIter", noisy_lengths.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", noisy_lengths.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(theta).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": noisy_lengths.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(episode_reward_mean=eval_returns.mean(),
                            episode_len_mean=eval_lengths.mean(),
                            timesteps_this_iter=noisy_lengths.sum(),
                            info=info)

    return result
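
# Both versions of _train call utils.batched_weighted_sum with a generator of
# perturbation vectors. The reduction below is a minimal sketch of that step:
# it consumes the vectors in fixed-size batches so the full noise matrix is
# never materialized at once. Illustrative only; the actual utils helper may
# differ in details.
def _batched_weighted_sum_sketch(weights, vecs, batch_size):
    total = 0.0
    num_items = 0
    weights = list(weights)
    vec_iter = iter(vecs)
    for start in range(0, len(weights), batch_size):
        batch_weights = weights[start:start + batch_size]
        batch_vecs = [next(vec_iter) for _ in batch_weights]
        # Each batch contributes a weighted sum of its vectors, computed as a
        # (batch,) . (batch, num_params) matrix product.
        total += np.dot(np.asarray(batch_weights, dtype=np.float32),
                        np.asarray(batch_vecs, dtype=np.float32))
        num_items += len(batch_weights)
    return total, num_items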
def aggregate_rollouts(self, num_rollouts=None, evaluate=False):
    """Aggregate update step from rollouts generated in parallel."""
    if num_rollouts is None:
        num_deltas = self.num_deltas
    else:
        num_deltas = num_rollouts

    # put policy weights in the object store
    policy_id = ray.put(self.w_policy)

    t1 = time.time()
    num_rollouts = int(num_deltas / self.num_workers)

    # parallel generation of rollouts
    rollout_ids_one = [
        worker.do_rollouts.remote(policy_id,
                                  num_rollouts=num_rollouts,
                                  shift=self.shift,
                                  evaluate=evaluate)
        for worker in self.workers
    ]

    # handle the remainder of num_deltas / num_workers
    remainder_workers = self.workers[:(num_deltas % self.num_workers)]
    rollout_ids_two = [
        worker.do_rollouts.remote(policy_id,
                                  num_rollouts=1,
                                  shift=self.shift,
                                  evaluate=evaluate)
        for worker in remainder_workers
    ]

    # gather results
    results_one = ray.get(rollout_ids_one)
    results_two = ray.get(rollout_ids_two)

    rollout_rewards, deltas_idx, steps = [], [], []

    for result in results_one:
        if not evaluate:
            self.timesteps += np.sum(result["steps"])
        deltas_idx += result["deltas_idx"]
        rollout_rewards += result["rollout_rewards"]
        steps += [result["steps"]]

    for result in results_two:
        if not evaluate:
            self.timesteps += np.sum(result["steps"])
        deltas_idx += result["deltas_idx"]
        rollout_rewards += result["rollout_rewards"]
        steps += [result["steps"]]

    info_dict = {
        "deltas_idx": deltas_idx,
        "rollout_rewards": rollout_rewards,
        "steps": steps
    }

    deltas_idx = np.array(deltas_idx)
    rollout_rewards = np.array(rollout_rewards, dtype=np.float64)

    t2 = time.time()
    print("Time to generate rollouts:", t2 - t1)

    if evaluate:
        return rollout_rewards

    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(rollout_rewards, axis=1)
    if self.deltas_used > self.num_deltas:
        self.deltas_used = self.num_deltas

    percentage = 1 - (self.deltas_used / self.num_deltas)
    idx = np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, 100 * percentage)]
    deltas_idx = deltas_idx[idx]
    rollout_rewards = rollout_rewards[idx, :]

    # normalize rewards by their standard deviation
    rollout_rewards /= np.std(rollout_rewards)

    t1 = time.time()
    # aggregate rollouts to form the gradient used to compute the SGD step
    reward_diff = rollout_rewards[:, 0] - rollout_rewards[:, 1]
    deltas_tuple = (self.deltas.get(idx, self.w_policy.size)
                    for idx in deltas_idx)
    g_hat, count = utils.batched_weighted_sum(reward_diff,
                                              deltas_tuple,
                                              batch_size=500)
    g_hat /= deltas_idx.size
    t2 = time.time()
    print("time to aggregate rollouts", t2 - t1)

    return g_hat, info_dict
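
# Both the ES code above (self.noise.get) and aggregate_rollouts
# (self.deltas.get) reconstruct perturbations by slicing a fixed-length
# window out of one large block of pre-generated Gaussian noise shared by the
# driver and the workers, so only integer offsets travel between processes.
# The class below is a minimal sketch of such a table; the name, default
# size, and seed handling are assumptions for illustration, not the actual
# implementation (real tables are typically far larger).
class _SharedNoiseTableSketch:
    def __init__(self, size=25 * 1000 * 1000, seed=123):
        # One flat float32 buffer of standard normal noise, created once.
        self.noise = np.random.RandomState(seed).randn(size).astype(np.float32)

    def get(self, i, dim):
        # A perturbation of length dim is just the slice starting at offset i.
        return self.noise[i:i + dim]

    def sample_index(self, rng, dim):
        # Pick a random valid starting offset for a perturbation of length dim.
        return rng.randint(0, len(self.noise) - dim + 1)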