def _train(self):
    # Perform the training step and time it.
    t1 = time.time()
    g_hat, info_dict = self.train_step()
    t2 = time.time()
    print('total time of one step', t2 - t1)

    self.episodes_so_far += len(info_dict['steps'])
    self.timesteps_so_far += np.sum(info_dict['steps'])

    # Evaluate the reward with the unperturbed params.
    rewards = self.aggregate_rollouts(
        num_rollouts=self.config['eval_rollouts'], evaluate=True)
    w = ray.get(self.workers[0].get_weights.remote())

    tlogger.record_tabular("AverageReward", np.mean(rewards))
    tlogger.record_tabular("StdRewards", np.std(rewards))
    tlogger.record_tabular("WeightNorm", float(np.square(w).sum()))
    tlogger.record_tabular("WeightStd", float(np.std(w)))
    tlogger.record_tabular("GradNorm", float(np.square(g_hat).sum()))
    tlogger.record_tabular("MaxRewardRollout", np.max(rewards))
    tlogger.record_tabular("MinRewardRollout", np.min(rewards))
    tlogger.dump_tabular()

    result = ray.tune.result.TrainingResult(
        episode_reward_mean=np.mean(rewards),
        episode_len_mean=np.mean(info_dict['steps']),
        timesteps_this_iter=np.sum(info_dict['steps']))

    return result
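# NOTE: A minimal NumPy sketch of the kind of antithetic gradient estimate that
# `train_step()` above is expected to return as `g_hat`. The helper name, the
# top-`num_top` direction selection, and the 1 / (num_top * sigma_r) scaling are
# assumptions in the style of ARS, not the actual implementation.
import numpy as np

def _ars_gradient_sketch(deltas, pos_rewards, neg_rewards, num_top):
    """deltas: (n, num_params) perturbations; pos/neg_rewards: (n,) returns."""
    # Rank directions by the better of their two antithetic returns.
    scores = np.maximum(pos_rewards, neg_rewards)
    top = np.argsort(scores)[-num_top:]
    # Normalize by the std of the returns actually used in the update
    # (small epsilon guards against a zero std).
    sigma_r = np.concatenate([pos_rewards[top], neg_rewards[top]]).std() + 1e-8
    g_hat = (pos_rewards[top] - neg_rewards[top]).dot(deltas[top])
    return g_hat / (num_top * sigma_r)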
def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id,
        config["episodes_per_batch"],
        config["timesteps_per_batch"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes
    self.timesteps_so_far += num_timesteps

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_indices),
        batch_size=500)
    g /= noisy_returns.size
    assert (g.shape == (self.policy.num_params, ) and
            g.dtype == np.float32 and
            count == len(noise_indices))
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(
        -g + config["l2_coeff"] * theta)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)

    step_tend = time.time()
    tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
    tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
    tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())

    tlogger.record_tabular("EpRewMean", noisy_returns.mean())
    tlogger.record_tabular("EpRewStd", noisy_returns.std())
    tlogger.record_tabular("EpLenMean", noisy_lengths.mean())

    tlogger.record_tabular("Norm", float(np.square(theta).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))

    tlogger.record_tabular("EpisodesThisIter", noisy_lengths.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", noisy_lengths.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(theta).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": noisy_lengths.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(episode_reward_mean=eval_returns.mean(),
                            episode_len_mean=eval_lengths.mean(),
                            timesteps_this_iter=noisy_lengths.sum(),
                            info=info)

    return result
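# NOTE: A self-contained sketch of what `utils.compute_centered_ranks` and
# `utils.batched_weighted_sum` compute in the method above, following the
# conventions of the OpenAI ES reference implementation. Treat the helper names
# and details as illustrative assumptions, not the actual `utils` module.
import itertools
import numpy as np

def _compute_centered_ranks_sketch(x):
    """Map returns to ranks scaled into [-0.5, 0.5]; this makes the update
    invariant to the absolute scale of the rewards."""
    flat = x.ravel()
    ranks = np.empty(flat.size, dtype=np.float32)
    ranks[flat.argsort()] = np.arange(flat.size, dtype=np.float32)
    return ranks.reshape(x.shape) / (x.size - 1) - 0.5

def _batched_weighted_sum_sketch(weights, vecs, batch_size):
    """Accumulate sum_i weights[i] * vecs[i] in batches so the noise vectors
    never have to be materialized all at once."""
    total, count = 0.0, 0
    weights_it, vecs_it = iter(weights), iter(vecs)
    while True:
        w = np.asarray(list(itertools.islice(weights_it, batch_size)),
                       dtype=np.float32)
        if w.size == 0:
            break
        v = np.asarray(list(itertools.islice(vecs_it, batch_size)),
                       dtype=np.float32)
        total += np.dot(w, v)
        count += len(w)
    return total, count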
def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_trainable_flat()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results = self._collect_results(theta_id,
                                    config["episodes_per_batch"],
                                    config["timesteps_per_batch"])

    curr_task_results = []
    ob_count_this_batch = 0
    # Loop over the results.
    for result in results:
        assert result.eval_length is None, "We aren't doing eval rollouts."
        assert result.noise_inds_n.ndim == 1
        assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
        assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
        assert result.returns_n2.dtype == np.float32

        result_num_eps = result.lengths_n2.size
        result_num_timesteps = result.lengths_n2.sum()
        self.episodes_so_far += result_num_eps
        self.timesteps_so_far += result_num_timesteps

        curr_task_results.append(result)
        # Update ob stats.
        if self.policy.needs_ob_stat and result.ob_count > 0:
            self.ob_stat.increment(result.ob_sum, result.ob_sumsq,
                                   result.ob_count)
            ob_count_this_batch += result.ob_count

    # Assemble the results.
    noise_inds_n = np.concatenate(
        [r.noise_inds_n for r in curr_task_results])
    returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
    lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
    assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
            lengths_n2.shape[0])

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
        (self.noise.get(idx, self.policy.num_params)
         for idx in noise_inds_n),
        batch_size=500)
    g /= returns_n2.size
    assert (g.shape == (self.policy.num_params, ) and
            g.dtype == np.float32 and
            count == len(noise_inds_n))
    update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)

    # Update ob stat (we're never running the policy in the master, but we
    # might be snapshotting the policy).
    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

    step_tend = time.time()
    tlogger.record_tabular("EpRewMean", returns_n2.mean())
    tlogger.record_tabular("EpRewStd", returns_n2.std())
    tlogger.record_tabular("EpLenMean", lengths_n2.mean())

    tlogger.record_tabular(
        "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))

    tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

    tlogger.record_tabular("ObCount", ob_count_this_batch)

    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": lengths_n2.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": lengths_n2.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "ob_count": ob_count_this_batch,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(episode_reward_mean=returns_n2.mean(),
                            episode_len_mean=lengths_n2.mean(),
                            timesteps_this_iter=lengths_n2.sum(),
                            info=info)

    return result
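# NOTE: A minimal sketch of a running observation-statistics accumulator that
# is compatible with the `self.ob_stat.increment(sum, sumsq, count)`, `.mean`,
# and `.std` calls used above. The `eps` pseudo-count initialization and the
# variance floor follow the OpenAI ES reference implementation and should be
# treated as assumptions here.
import numpy as np

class _RunningStatSketch(object):
    def __init__(self, shape, eps=1e-2):
        # Start with a small pseudo-count so mean/std are defined before any
        # observations have been seen.
        self.sum = np.zeros(shape, dtype=np.float32)
        self.sumsq = np.full(shape, eps, dtype=np.float32)
        self.count = eps

    def increment(self, s, ssq, c):
        # Workers report per-batch sums and sums of squares, so aggregation
        # on the driver is a cheap elementwise add.
        self.sum += s
        self.sumsq += ssq
        self.count += c

    @property
    def mean(self):
        return self.sum / self.count

    @property
    def std(self):
        # Floor the variance so observation whitening never divides by ~0.
        return np.sqrt(np.maximum(
            self.sumsq / self.count - np.square(self.mean), 1e-2))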