def fetch_metrics_from_workers(self):
    episode_rewards = []
    episode_lengths = []
    metric_lists = [
        a.get_completed_rollout_metrics.remote() for a in self.agents]
    for metrics in metric_lists:
        for episode in ray.get(metrics):
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
    res = TrainingResult(
        self.iteration, np.mean(episode_rewards), np.mean(episode_lengths))
    return res

def fetch_metrics_from_workers(self):
    episode_rewards = []
    episode_lengths = []
    metric_lists = [
        a.get_completed_rollout_metrics.remote() for a in self.agents]
    for metrics in metric_lists:
        for episode in ray.get(metrics):
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
    avg_reward = np.mean(episode_rewards) if episode_rewards else None
    avg_length = np.mean(episode_lengths) if episode_lengths else None
    res = TrainingResult(
        self.experiment_id.hex, self.iteration, avg_reward, avg_length,
        dict())
    return res

def _fetch_metrics_from_workers(self):
    episode_rewards = []
    episode_lengths = []
    metric_lists = [
        a.get_completed_rollout_metrics.remote() for a in self.agents]
    for metrics in metric_lists:
        for episode in ray.get(metrics):
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
    avg_reward = (
        np.mean(episode_rewards) if episode_rewards else float('nan'))
    avg_length = (
        np.mean(episode_lengths) if episode_lengths else float('nan'))
    timesteps = np.sum(episode_lengths) if episode_lengths else 0
    result = TrainingResult(
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        timesteps_this_iter=timesteps,
        info={})
    return result

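# For reference, a minimal sketch of the TrainingResult container these
# methods build. This is an assumption based on the keyword arguments used
# above, not the actual definition; the older variants in this file pass
# extra positional fields (experiment_id, iteration) that a fuller
# definition would carry as well.
from collections import namedtuple

TrainingResult = namedtuple("TrainingResult", [
    "episode_reward_mean",   # mean reward over recently completed episodes
    "episode_len_mean",      # mean length of those episodes
    "timesteps_this_iter",   # environment steps taken during this iteration
    "info",                  # free-form dict of algorithm-specific metrics
])
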
def train(self):
    config = self.config
    sample_time, learn_time = 0, 0

    for _ in range(config["timesteps_per_iteration"]):
        self.num_timesteps += 1
        dt = time.time()
        # Take action and update exploration to the newest value.
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(self.num_timesteps))[0]
        new_obs, rew, done, _ = self.env.step(action)
        # Store transition in the replay buffer.
        self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        sample_time += time.time() - dt

        if self.num_timesteps > config["learning_starts"] and \
                self.num_timesteps % config["train_freq"] == 0:
            dt = time.time()
            # Minimize the error in Bellman's equation on a batch sampled
            # from the replay buffer.
            if config["prioritized_replay"]:
                experience = self.replay_buffer.sample(
                    config["batch_size"],
                    beta=self.beta_schedule.value(self.num_timesteps))
                (obses_t, actions, rewards, obses_tp1,
                 dones, _, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = (
                    self.replay_buffer.sample(config["batch_size"]))
                batch_idxes = None
            td_errors = self.dqn_graph.train(
                self.sess, obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards))
            if config["prioritized_replay"]:
                new_priorities = (
                    np.abs(td_errors) + config["prioritized_replay_eps"])
                self.replay_buffer.update_priorities(
                    batch_idxes, new_priorities)
            learn_time += time.time() - dt

        if self.num_timesteps > config["learning_starts"] and (
                self.num_timesteps %
                config["target_network_update_freq"] == 0):
            # Update target network periodically.
            self.dqn_graph.update_target(self.sess)

    mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
    mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
    num_episodes = len(self.episode_rewards)
    info = {
        "sample_time": sample_time,
        "learn_time": learn_time,
        "steps": self.num_timesteps,
        "episodes": num_episodes,
        "exploration": int(
            100 * self.exploration.value(self.num_timesteps)),
    }

    logger.record_tabular("sample_time", sample_time)
    logger.record_tabular("learn_time", learn_time)
    logger.record_tabular("steps", self.num_timesteps)
    logger.record_tabular("buffer_size", len(self.replay_buffer))
    logger.record_tabular("episodes", num_episodes)
    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
    logger.record_tabular(
        "% time spent exploring",
        int(100 * self.exploration.value(self.num_timesteps)))
    logger.dump_tabular()

    res = TrainingResult(
        self.experiment_id.hex, self.num_iterations, mean_100ep_reward,
        mean_100ep_length, info)
    self.num_iterations += 1
    return res

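# self.exploration.value(t) above implies an annealed exploration schedule.
# A minimal sketch, assuming the common linear schedule (the class name and
# constructor here are hypothetical, not taken from the code above):
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Interpolate from initial_p to final_p over schedule_timesteps,
        # then hold final_p for the rest of training.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
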
def _train(self):
    config = self.config
    sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
    iter_init_timesteps = self.cur_timestep
    num_loop_iters = 0
    steps_per_iter = config["sample_batch_size"] * len(self.workers)

    while (self.cur_timestep - iter_init_timesteps <
           config["timesteps_per_iteration"]):
        dt = time.time()
        ray.get([
            w.do_steps.remote(config["sample_batch_size"],
                              self.cur_timestep)
            for w in self.workers])
        num_loop_iters += 1
        self.cur_timestep += steps_per_iter
        self.steps_since_update += steps_per_iter
        sample_time += time.time() - dt

        if self.cur_timestep > config["learning_starts"]:
            # Minimize the error in Bellman's equation on batches sampled
            # from the replay buffers.
            dt = time.time()
            self._update_worker_weights()
            sync_time += time.time() - dt
            dt = time.time()
            gradients = ray.get([
                w.get_gradient.remote(self.cur_timestep)
                for w in self.workers])
            learn_time += time.time() - dt
            dt = time.time()
            for grad in gradients:
                self.actor.apply_gradients(grad)
            apply_time += time.time() - dt

        if (self.cur_timestep > config["learning_starts"] and
                self.steps_since_update >
                config["target_network_update_freq"]):
            # Update target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self._update_worker_weights()
            self.steps_since_update -= config["target_network_update_freq"]
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
        mean_100ep_reward += mean_rew
        mean_100ep_length += mean_len
        num_episodes += episodes
        buffer_size_sum += buf_sz
    mean_100ep_reward /= len(self.workers)
    mean_100ep_length /= len(self.workers)

    info = [
        # Note: `exploration` holds the value from the last worker polled.
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("sample_time", sample_time),
        ("weight_sync_time", sync_time),
        ("apply_time", apply_time),
        ("learn_time", learn_time),
        ("samples_per_s",
         num_loop_iters * np.float64(steps_per_iter) / sample_time),
        ("learn_samples_per_s",
         num_loop_iters * np.float64(config["train_batch_size"]) *
         np.float64(config["num_workers"]) / learn_time),
    ]

    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)
    return result

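# The fan-out/fan-in pattern used throughout _train() above, shown in
# isolation. This is a generic sketch (the Worker actor below is
# hypothetical, not the DQN worker class):
import ray

@ray.remote
class Worker:
    def get_gradient(self, step):
        # Placeholder: a real worker would sample a batch and return grads.
        return step

ray.init()
workers = [Worker.remote() for _ in range(4)]
# Launch one task per worker, then block on all of them with a single get;
# results come back in worker order.
gradients = ray.get([w.get_gradient.remote(0) for w in workers])
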
def _train(self):
    config = self.config
    step_tstart = time.time()
    theta = self.policy.get_trainable_flat()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts; note that we pass in the ID of the
    # policy weights.
    results = self._collect_results(
        theta_id, config["episodes_per_batch"],
        config["timesteps_per_batch"])

    curr_task_results = []
    ob_count_this_batch = 0
    # Loop over the results.
    for result in results:
        assert result.eval_length is None, "We aren't doing eval rollouts."
        assert result.noise_inds_n.ndim == 1
        assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
        assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
        assert result.returns_n2.dtype == np.float32
        result_num_eps = result.lengths_n2.size
        result_num_timesteps = result.lengths_n2.sum()
        self.episodes_so_far += result_num_eps
        self.timesteps_so_far += result_num_timesteps
        curr_task_results.append(result)
        # Update ob stats.
        if self.policy.needs_ob_stat and result.ob_count > 0:
            self.ob_stat.increment(
                result.ob_sum, result.ob_sumsq, result.ob_count)
            ob_count_this_batch += result.ob_count

    # Assemble the results.
    noise_inds_n = np.concatenate(
        [r.noise_inds_n for r in curr_task_results])
    returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
    lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
    assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
            lengths_n2.shape[0])

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
        (self.noise.get(idx, self.policy.num_params)
         for idx in noise_inds_n),
        batch_size=500)
    g /= returns_n2.size
    assert (g.shape == (self.policy.num_params,) and
            g.dtype == np.float32 and count == len(noise_inds_n))
    update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)

    # Update ob stat (we're never running the policy in the master, but
    # we might be snapshotting the policy).
    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

    step_tend = time.time()
    tlogger.record_tabular("EpRewMean", returns_n2.mean())
    tlogger.record_tabular("EpRewStd", returns_n2.std())
    tlogger.record_tabular("EpLenMean", lengths_n2.mean())
    tlogger.record_tabular(
        "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))
    tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
    tlogger.record_tabular("ObCount", ob_count_this_batch)
    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": lengths_n2.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": lengths_n2.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "ob_count": ob_count_this_batch,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(
        episode_reward_mean=returns_n2.mean(),
        episode_len_mean=lengths_n2.mean(),
        timesteps_this_iter=lengths_n2.sum(),
        info=info)
    return result

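# utils.compute_centered_ranks above is assumed to follow the OpenAI ES
# convention: replace raw returns with their ranks, rescaled to the range
# [-0.5, 0.5], which makes the update invariant to reward scale. A sketch
# under that assumption:
import numpy as np

def compute_ranks(x):
    # ranks[i] is the rank of x[i], counting from 0.
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks

def compute_centered_ranks(x):
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y
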
def _train(self):
    agents = self.agents
    config = self.config
    model = self.model
    print("===> iteration", self.iteration)

    iter_start = time.time()
    weights = ray.put(model.get_weights())
    [a.load_weights.remote(weights) for a in agents]
    trajectory, total_reward, traj_len_mean = collect_samples(
        agents, config, self.model.observation_filter,
        self.model.reward_filter)
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps:", trajectory["dones"].shape[0])
    if self.file_writer:
        traj_stats = tf.Summary(value=[
            tf.Summary.Value(
                tag="ppo/rollouts/mean_reward",
                simple_value=total_reward),
            tf.Summary.Value(
                tag="ppo/rollouts/traj_len_mean",
                simple_value=traj_len_mean)])
        self.file_writer.add_summary(traj_stats, self.global_step)
    self.global_step += 1

    def standardized(value):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal.
        return (value - value.mean()) / max(1e-4, value.std())

    if config["use_gae"]:
        trajectory["advantages"] = standardized(trajectory["advantages"])
    else:
        trajectory["returns"] = standardized(trajectory["returns"])

    rollouts_end = time.time()
    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = [
        "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"]
    print(("{:>15}" * len(names)).format(*names))
    trajectory = shuffle(trajectory)
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        trajectory, self.iteration == 0 and config["full_trace_data_load"])
    load_end = time.time()
    rollouts_time = rollouts_end - iter_start
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end
    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
        sgd_start = time.time()
        batch_index = 0
        num_batches = (
            int(tuples_per_device) // int(model.per_device_batch_size))
        loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
        permutation = np.random.permutation(num_batches)
        # Prepare to drop into the debugger.
        if self.iteration == config["tf_debug_iteration"]:
            model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)
        while batch_index < num_batches:
            full_trace = (
                i == 0 and self.iteration == 0 and
                batch_index == config["full_trace_nth_sgd_batch"])
            batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                batch_entropy = model.run_sgd_minibatch(
                    permutation[batch_index] * model.per_device_batch_size,
                    self.kl_coeff, full_trace, self.file_writer)
            loss.append(batch_loss)
            policy_loss.append(batch_policy_loss)
            vf_loss.append(batch_vf_loss)
            kl.append(batch_kl)
            entropy.append(batch_entropy)
            batch_index += 1
        loss = np.mean(loss)
        policy_loss = np.mean(policy_loss)
        vf_loss = np.mean(vf_loss)
        kl = np.mean(kl)
        entropy = np.mean(entropy)
        sgd_end = time.time()
        print("{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
            i, loss, policy_loss, vf_loss, kl, entropy))

        values = []
        if i == config["num_sgd_iter"] - 1:
            metric_prefix = "ppo/sgd/final_iter/"
            values.append(tf.Summary.Value(
                tag=metric_prefix + "kl_coeff",
                simple_value=self.kl_coeff))
            values.extend([
                tf.Summary.Value(
                    tag=metric_prefix + "mean_entropy",
                    simple_value=entropy),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_loss",
                    simple_value=loss),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_kl",
                    simple_value=kl)])
            if self.file_writer:
                sgd_stats = tf.Summary(value=values)
                self.file_writer.add_summary(sgd_stats, self.global_step)
                self.global_step += 1
        sgd_time += sgd_end - sgd_start
    if kl > 2.0 * config["kl_target"]:
        self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
        self.kl_coeff *= 0.5

    info = {
        "kl_divergence": kl,
        "kl_coefficient": self.kl_coeff,
        "rollouts_time": rollouts_time,
        "shuffle_time": shuffle_time,
        "load_time": load_time,
        "sgd_time": sgd_time,
        "sample_throughput": len(trajectory["observations"]) / sgd_time
    }

    print("kl div:", kl)
    print("kl coeff:", self.kl_coeff)
    print("rollouts time:", rollouts_time)
    print("shuffle time:", shuffle_time)
    print("load time:", load_time)
    print("sgd time:", sgd_time)
    print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)
    print("total time so far:", time.time() - self.start_time)

    result = TrainingResult(
        episode_reward_mean=total_reward,
        episode_len_mean=traj_len_mean,
        timesteps_this_iter=trajectory["dones"].shape[0],
        info=info)
    return result

def train(self):
    agents = self.agents
    config = self.config
    model = self.model
    j = self.j
    self.j += 1
    print("===> iteration", self.j)

    saver = tf.train.Saver(max_to_keep=None)
    if "load_checkpoint" in config:
        saver.restore(model.sess, config["load_checkpoint"])

    # TF does not support writing logs to S3 at the moment.
    write_tf_logs = config["write_logs"] and self.logdir.startswith("file")

    iter_start = time.time()
    if write_tf_logs:
        file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
    if config["model_checkpoint_file"]:
        checkpoint_path = saver.save(
            model.sess,
            os.path.join(self.logdir, config["model_checkpoint_file"] % j))
        print("Checkpoint saved in file: %s" % checkpoint_path)
    checkpointing_end = time.time()

    weights = ray.put(model.get_weights())
    [a.load_weights.remote(weights) for a in agents]
    trajectory, total_reward, traj_len_mean = collect_samples(
        agents, config)
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps:", trajectory["dones"].shape[0])
    if write_tf_logs:
        traj_stats = tf.Summary(value=[
            tf.Summary.Value(
                tag="policy_gradient/rollouts/mean_reward",
                simple_value=total_reward),
            tf.Summary.Value(
                tag="policy_gradient/rollouts/traj_len_mean",
                simple_value=traj_len_mean)])
        file_writer.add_summary(traj_stats, self.global_step)
    self.global_step += 1
    trajectory["advantages"] = (
        (trajectory["advantages"] - trajectory["advantages"].mean()) /
        trajectory["advantages"].std())
    rollouts_end = time.time()

    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = ["iter", "loss", "kl", "entropy"]
    print(("{:>15}" * len(names)).format(*names))
    trajectory = shuffle(trajectory)
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        trajectory, j == 0 and config["full_trace_data_load"])
    load_end = time.time()
    checkpointing_time = checkpointing_end - iter_start
    rollouts_time = rollouts_end - checkpointing_end
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end
    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
        sgd_start = time.time()
        batch_index = 0
        num_batches = (
            int(tuples_per_device) // int(model.per_device_batch_size))
        loss, kl, entropy = [], [], []
        permutation = np.random.permutation(num_batches)
        while batch_index < num_batches:
            full_trace = (
                i == 0 and j == 0 and
                batch_index == config["full_trace_nth_sgd_batch"])
            batch_loss, batch_kl, batch_entropy = model.run_sgd_minibatch(
                permutation[batch_index] * model.per_device_batch_size,
                self.kl_coeff, full_trace,
                file_writer if write_tf_logs else None)
            loss.append(batch_loss)
            kl.append(batch_kl)
            entropy.append(batch_entropy)
            batch_index += 1
        loss = np.mean(loss)
        kl = np.mean(kl)
        entropy = np.mean(entropy)
        sgd_end = time.time()
        print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))

        values = []
        if i == config["num_sgd_iter"] - 1:
            metric_prefix = "policy_gradient/sgd/final_iter/"
            values.append(tf.Summary.Value(
                tag=metric_prefix + "kl_coeff",
                simple_value=self.kl_coeff))
        else:
            metric_prefix = "policy_gradient/sgd/intermediate_iters/"
        values.extend([
            tf.Summary.Value(
                tag=metric_prefix + "mean_entropy", simple_value=entropy),
            tf.Summary.Value(
                tag=metric_prefix + "mean_loss", simple_value=loss),
            tf.Summary.Value(
                tag=metric_prefix + "mean_kl", simple_value=kl)])
        if write_tf_logs:
            sgd_stats = tf.Summary(value=values)
            file_writer.add_summary(sgd_stats, self.global_step)
            self.global_step += 1
        sgd_time += sgd_end - sgd_start
    if kl > 2.0 * config["kl_target"]:
        self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
        self.kl_coeff *= 0.5

    info = {
        "kl_divergence": kl,
        "kl_coefficient": self.kl_coeff,
        "checkpointing_time": checkpointing_time,
        "rollouts_time": rollouts_time,
        "shuffle_time": shuffle_time,
        "load_time": load_time,
        "sgd_time": sgd_time,
        "sample_throughput": len(trajectory["observations"]) / sgd_time
    }

    print("kl div:", kl)
    print("kl coeff:", self.kl_coeff)
    print("checkpointing time:", checkpointing_time)
    print("rollouts time:", rollouts_time)
    print("shuffle time:", shuffle_time)
    print("load time:", load_time)
    print("sgd time:", sgd_time)
    print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)
    print("total time so far:", time.time() - self.start_time)

    result = TrainingResult(
        self.experiment_id.hex, j, total_reward, traj_len_mean, info)
    return result

def train(self):
    agents = self.agents
    config = self.config
    model = self.model
    j = self.j
    self.j += 1

    saver = tf.train.Saver(max_to_keep=None)
    if "load_checkpoint" in config:
        saver.restore(model.sess, config["load_checkpoint"])

    file_writer = tf.summary.FileWriter(
        "{}/trpo_{}_{}".format(
            config["tensorboard_log_dir"], self.env_name,
            str(datetime.today()).replace(" ", "_")),
        model.sess.graph)

    iter_start = time.time()
    if config["model_checkpoint_file"]:
        checkpoint_path = saver.save(
            model.sess, config["model_checkpoint_file"] % j)
        print("Checkpoint saved in file: %s" % checkpoint_path)
    checkpointing_end = time.time()

    weights = ray.put(model.get_weights())
    [a.load_weights.remote(weights) for a in agents]
    trajectory, total_reward, traj_len_mean = collect_samples(
        agents, config["timesteps_per_batch"], 0.995, 1.0, 2000)
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps:", trajectory["dones"].shape[0])
    traj_stats = tf.Summary(value=[
        tf.Summary.Value(
            tag="policy_gradient/rollouts/mean_reward",
            simple_value=total_reward),
        tf.Summary.Value(
            tag="policy_gradient/rollouts/traj_len_mean",
            simple_value=traj_len_mean)])
    file_writer.add_summary(traj_stats, self.global_step)
    self.global_step += 1
    trajectory["advantages"] = (
        (trajectory["advantages"] - trajectory["advantages"].mean()) /
        trajectory["advantages"].std())
    rollouts_end = time.time()

    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = ["iter", "loss", "kl", "entropy"]
    print(("{:>15}" * len(names)).format(*names))
    trajectory = shuffle(trajectory)
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        trajectory, j == 0 and config["full_trace_data_load"])
    load_end = time.time()
    checkpointing_time = checkpointing_end - iter_start
    rollouts_time = rollouts_end - checkpointing_end
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end
    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
        sgd_start = time.time()
        batch_index = 0
        num_batches = (
            int(tuples_per_device) // int(model.per_device_batch_size))
        loss, kl, entropy = [], [], []
        permutation = np.random.permutation(num_batches)
        while batch_index < num_batches:
            full_trace = (
                i == 0 and j == 0 and
                batch_index == config["full_trace_nth_sgd_batch"])
            batch_loss, batch_kl, batch_entropy = model.run_sgd_minibatch(
                permutation[batch_index] * model.per_device_batch_size,
                self.kl_coeff, full_trace, file_writer)
            loss.append(batch_loss)
            kl.append(batch_kl)
            entropy.append(batch_entropy)
            batch_index += 1
        loss = np.mean(loss)
        kl = np.mean(kl)
        entropy = np.mean(entropy)
        sgd_end = time.time()
        print("{:>15}{:15.5e}{:15.5e}{:15.5e}".format(i, loss, kl, entropy))

        values = []
        if i == config["num_sgd_iter"] - 1:
            metric_prefix = "policy_gradient/sgd/final_iter/"
            values.append(tf.Summary.Value(
                tag=metric_prefix + "kl_coeff",
                simple_value=self.kl_coeff))
        else:
            metric_prefix = "policy_gradient/sgd/intermediate_iters/"
        values.extend([
            tf.Summary.Value(
                tag=metric_prefix + "mean_entropy", simple_value=entropy),
            tf.Summary.Value(
                tag=metric_prefix + "mean_loss", simple_value=loss),
            tf.Summary.Value(
                tag=metric_prefix + "mean_kl", simple_value=kl)])
        sgd_stats = tf.Summary(value=values)
        file_writer.add_summary(sgd_stats, self.global_step)
        self.global_step += 1
        sgd_time += sgd_end - sgd_start
    if kl > 2.0 * config["kl_target"]:
        self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
        self.kl_coeff *= 0.5

    print("kl div:", kl)
    print("kl coeff:", self.kl_coeff)
    print("checkpointing time:", checkpointing_time)
    print("rollouts time:", rollouts_time)
    print("shuffle time:", shuffle_time)
    print("load time:", load_time)
    print("sgd time:", sgd_time)
    print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)

    return TrainingResult(j, total_reward, traj_len_mean)

def _train(self):
    return TrainingResult(
        episode_reward_mean=10, episode_len_mean=10,
        timesteps_this_iter=10, info={})

def train(self):
    config = self.config
    step_tstart = time.time()
    theta = self.policy.get_trainable_flat()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts; note that we pass in the ID of the
    # policy weights.
    rollout_ids = [
        worker.do_rollouts.remote(
            theta_id,
            self.ob_stat.mean if self.policy.needs_ob_stat else None,
            self.ob_stat.std if self.policy.needs_ob_stat else None)
        for worker in self.workers]

    # Get the results of the rollouts.
    results = ray.get(rollout_ids)

    curr_task_results = []
    ob_count_this_batch = 0
    # Loop over the results.
    for result in results:
        assert result.eval_length is None, "We aren't doing eval rollouts."
        assert result.noise_inds_n.ndim == 1
        assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
        assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
        assert result.returns_n2.dtype == np.float32
        result_num_eps = result.lengths_n2.size
        result_num_timesteps = result.lengths_n2.sum()
        self.episodes_so_far += result_num_eps
        self.timesteps_so_far += result_num_timesteps
        curr_task_results.append(result)
        # Update ob stats.
        if self.policy.needs_ob_stat and result.ob_count > 0:
            self.ob_stat.increment(
                result.ob_sum, result.ob_sumsq, result.ob_count)
            ob_count_this_batch += result.ob_count

    # Assemble the results.
    noise_inds_n = np.concatenate(
        [r.noise_inds_n for r in curr_task_results])
    returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
    lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
    assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
            lengths_n2.shape[0])

    # Process the returns.
    if config.return_proc_mode == "centered_rank":
        proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
    else:
        raise NotImplementedError(config.return_proc_mode)

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
        (self.noise.get(idx, self.policy.num_params)
         for idx in noise_inds_n),
        batch_size=500)
    g /= returns_n2.size
    assert (g.shape == (self.policy.num_params,) and
            g.dtype == np.float32 and count == len(noise_inds_n))
    update_ratio = self.optimizer.update(-g + config.l2coeff * theta)

    # Update ob stat (we're never running the policy in the master, but
    # we might be snapshotting the policy).
    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

    step_tend = time.time()
    tlogger.record_tabular("EpRewMean", returns_n2.mean())
    tlogger.record_tabular("EpRewStd", returns_n2.std())
    tlogger.record_tabular("EpLenMean", lengths_n2.mean())
    tlogger.record_tabular(
        "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))
    tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
    tlogger.record_tabular("ObCount", ob_count_this_batch)
    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    if (config.snapshot_freq != 0 and
            self.iteration % config.snapshot_freq == 0):
        filename = os.path.join(
            "/tmp", "snapshot_iter{:05d}.h5".format(self.iteration))
        assert not os.path.exists(filename)
        self.policy.save(filename)
        tlogger.log("Saved snapshot {}".format(filename))

    res = TrainingResult(
        self.iteration, returns_n2.mean(), lengths_n2.mean())
    self.iteration += 1
    return res