def _train(self):
    i = max(0, self.iteration - self.config["offset"])
    v = np.tanh(float(i) / self.config["width"])
    v *= self.config["height"]
    return TrainingResult(
        episode_reward_mean=v,
        episode_len_mean=v,
        timesteps_this_iter=self.config["iter_timesteps"],
        time_this_iter_s=self.config["iter_time"],
        info={})
def _train(self): if self.config["mock_error"] and self.iteration == 1 \ and (self.config["persistent_error"] or not self.restored): raise Exception("mock error") return TrainingResult(episode_reward_mean=10, episode_len_mean=10, timesteps_this_iter=10, info={})
def __call__(self, **kwargs):
    """Report updated training status.

    Args:
        kwargs: Latest training result fields, passed as keyword
            arguments and used to construct a TrainingResult. You must
            at least define `timesteps_total`, but probably want to
            report some of the other metrics as well.
    """
    with self._lock:
        self._latest_result = self._last_result = TrainingResult(**kwargs)
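# A minimal sketch of how a user-defined training function might call the
# reporter above; the function name and the accuracy values here are
# hypothetical, for illustration only.
def my_trainable(config, reporter):
    accuracy = 0.0
    for step in range(100):
        accuracy = min(1.0, accuracy + 0.01)  # stand-in for real training
        # `timesteps_total` is required; other TrainingResult fields
        # such as `mean_accuracy` are optional.
        reporter(timesteps_total=step, mean_accuracy=accuracy)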
def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    # Build the graph for the deep net
    y_conv, keep_prob = deepnn(x)

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_, logits=y_conv)
    cross_entropy = tf.reduce_mean(cross_entropy)

    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(
            tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        correct_prediction = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)

    graph_location = tempfile.mkdtemp()
    print('Saving graph to: %s' % graph_location)
    train_writer = tf.summary.FileWriter(graph_location)
    train_writer.add_graph(tf.get_default_graph())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(20000):
            batch = mnist.train.next_batch(50)
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0], y_: batch[1], keep_prob: 1.0})
                # !!! Report status to ray.tune !!!
                if status_reporter:
                    status_reporter.report(TrainingResult(
                        timesteps_total=i,
                        mean_accuracy=train_accuracy))
                print('step %d, training accuracy %g' % (i, train_accuracy))
            train_step.run(
                feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

        print('test accuracy %g' % accuracy.eval(feed_dict={
            x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
def _train(self):
    self.classifier.train(
        input_fn=lambda: iris_data.train_input_fn(
            self.train_x, self.train_y, 10),
        steps=100)
    self.steps = self.steps + 100
    eval_result = self.classifier.evaluate(
        input_fn=lambda: iris_data.eval_input_fn(
            self.test_x, self.test_y, 10))
    return TrainingResult(
        timesteps_this_iter=100,
        timesteps_total=self.steps,
        mean_validation_accuracy=eval_result['accuracy'])
def _train(self):
    self.optimizer.step()
    metric_lists = [
        re.get_metrics.remote() for re in self.remote_evaluators]
    total_samples = 0
    total_loss = 0
    for metrics in metric_lists:
        for m in ray.get(metrics):
            total_samples += m["num_samples"]
            total_loss += m["loss"]
    result = TrainingResult(
        mean_loss=total_loss / total_samples,
        timesteps_this_iter=total_samples,
    )
    return result
def _train(self):
    start_timestep = self.global_timestep

    self.optimizer.step()
    self.local_evaluator.update_target()
    self.last_target_update_ts = self.global_timestep
    self.num_target_updates += 1

    self.local_evaluator.set_global_timestep(self.global_timestep)
    for e in self.remote_evaluators:
        e.set_global_timestep.remote(self.global_timestep)

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0

    if self.remote_evaluators:
        stats = ray.get(
            [e.stats.remote() for e in self.remote_evaluators])
    else:
        stats = self.local_evaluator.stats()
        if not isinstance(stats, list):
            stats = [stats]

    if self.config["per_worker_exploration"]:
        # Return stats from workers with the lowest 20% of exploration.
        test_stats = stats[-int(max(1, len(stats) * 0.2)):]
    else:
        test_stats = stats

    for s in test_stats:
        mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
        mean_100ep_length += s["mean_100ep_length"] / len(test_stats)

    for s in stats:
        num_episodes += s["num_episodes"]

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        episodes_total=num_episodes,
        timesteps_this_iter=self.global_timestep - start_timestep,
        info={})
    return result
def _train(self):
    start_timestep = self.global_timestep

    while (self.global_timestep - start_timestep <
           self.config["timesteps_per_iteration"]):
        if self.global_timestep < self.config["learning_starts"]:
            self._populate_replay_buffer()
        else:
            self.optimizer.step()

        stats = self._update_global_stats()

        if self.global_timestep - self.last_target_update_ts > \
                self.config["target_network_update_freq"]:
            self.local_evaluator.update_target()
            self.last_target_update_ts = self.global_timestep
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    exploration = -1

    for s in stats:
        mean_100ep_reward += s["mean_100ep_reward"] / len(stats)
        mean_100ep_length += s["mean_100ep_length"] / len(stats)
        num_episodes += s["num_episodes"]
        exploration = s["exploration"]

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        episodes_total=num_episodes,
        timesteps_this_iter=self.global_timestep - start_timestep,
        info=dict({
            "exploration": exploration,
            "num_target_updates": self.num_target_updates,
        }, **self.optimizer.stats()))
    return result
def _train_stats(self, start_timestep):
    if self.remote_evaluators:
        stats = ray.get(
            [e.stats.remote() for e in self.remote_evaluators])
    else:
        stats = self.local_evaluator.stats()
        if not isinstance(stats, list):
            stats = [stats]

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    explorations = []

    if self.config["per_worker_exploration"]:
        # Return stats from workers with the lowest 20% of exploration.
        test_stats = stats[-int(max(1, len(stats) * 0.2)):]
    else:
        test_stats = stats

    for s in test_stats:
        mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
        mean_100ep_length += s["mean_100ep_length"] / len(test_stats)

    for s in stats:
        num_episodes += s["num_episodes"]
        explorations.append(s["exploration"])

    opt_stats = self.optimizer.stats()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        episodes_total=num_episodes,
        timesteps_this_iter=self.global_timestep - start_timestep,
        info=dict({
            "min_exploration": min(explorations),
            "max_exploration": max(explorations),
            "num_target_updates": self.num_target_updates,
        }, **opt_stats))
    return result
def _fetch_metrics_from_remote_evaluators(self):
    episode_rewards = []
    episode_lengths = []
    metric_lists = [
        a.get_completed_rollout_metrics.remote()
        for a in self.remote_evaluators]
    for metrics in metric_lists:
        for episode in ray.get(metrics):
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
    avg_reward = (
        np.mean(episode_rewards) if episode_rewards else float('nan'))
    avg_length = (
        np.mean(episode_lengths) if episode_lengths else float('nan'))
    timesteps = np.sum(episode_lengths) if episode_lengths else 0

    result = TrainingResult(
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        timesteps_this_iter=timesteps)
    return result
def _train(self):
    start_timestep = self.global_timestep
    num_steps = 0

    while (self.global_timestep - start_timestep <
           self.config["timesteps_per_iteration"]):
        self.global_timestep += self.optimizer.step()
        num_steps += 1
        if self.global_timestep - self.last_target_update_ts > \
                self.config["target_network_update_freq"]:
            self.local_evaluator.update_target()
            self.last_target_update_ts = self.global_timestep
            self.num_target_updates += 1

    test_stats = self._update_global_stats()

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    explorations = []

    for s in test_stats:
        mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
        mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
        num_episodes += s["num_episodes"]

    opt_stats = self.optimizer.stats()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        episodes_total=num_episodes,
        timesteps_this_iter=self.global_timestep - start_timestep,
        info=dict({
            "num_target_updates": self.num_target_updates,
        }, **opt_stats))
    return result
def _train(self):
    self.optimizer.step()

    episode_rewards = []
    episode_lengths = []
    metric_lists = [
        a.get_completed_rollout_metrics.remote()
        for a in self.remote_evaluators]
    for metrics in metric_lists:
        for episode in ray.get(metrics):
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
    avg_reward = np.mean(episode_rewards)
    avg_length = np.mean(episode_lengths)
    timesteps = np.sum(episode_lengths)

    result = TrainingResult(
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        timesteps_this_iter=timesteps,
        info={})
    return result
def collect_metrics(local_evaluator, remote_evaluators=None):
    """Gathers episode metrics from CommonPolicyEvaluator instances."""

    episode_rewards = []
    episode_lengths = []
    policy_rewards = collections.defaultdict(list)
    metric_lists = ray.get([
        a.apply.remote(lambda ev: ev.sampler.get_metrics())
        for a in remote_evaluators or []
    ])
    metric_lists.append(local_evaluator.sampler.get_metrics())
    for metrics in metric_lists:
        for episode in metrics:
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)
            for (_, policy_id), reward in episode.agent_rewards.items():
                policy_rewards[policy_id].append(reward)
    if episode_rewards:
        min_reward = min(episode_rewards)
        max_reward = max(episode_rewards)
    else:
        min_reward = float('nan')
        max_reward = float('nan')
    avg_reward = np.mean(episode_rewards)
    avg_length = np.mean(episode_lengths)
    timesteps = np.sum(episode_lengths)

    for policy_id, rewards in policy_rewards.copy().items():
        policy_rewards[policy_id] = np.mean(rewards)

    return TrainingResult(
        episode_reward_max=max_reward,
        episode_reward_min=min_reward,
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        episodes_total=len(episode_lengths),
        timesteps_this_iter=timesteps,
        policy_reward_mean=dict(policy_rewards))
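# A hedged usage sketch: an agent's _train() step delegating metric
# gathering to collect_metrics() above. The `self.optimizer`,
# `self.local_evaluator`, and `self.remote_evaluators` attributes are
# assumed to exist as in the other _train() snippets in this section.
def _train(self):
    self.optimizer.step()
    return collect_metrics(self.local_evaluator, self.remote_evaluators)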
def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["episodes_per_batch"],
        config["timesteps_per_batch"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes
    self.timesteps_so_far += num_timesteps

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_indices),
        batch_size=500)
    g /= noisy_returns.size
    assert (g.shape == (self.policy.num_params, ) and
            g.dtype == np.float32 and
            count == len(noise_indices))
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(
        -g + config["l2_coeff"] * theta)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)

    step_tend = time.time()
    tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
    tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
    tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())

    tlogger.record_tabular("EpRewMean", noisy_returns.mean())
    tlogger.record_tabular("EpRewStd", noisy_returns.std())
    tlogger.record_tabular("EpLenMean", noisy_lengths.mean())

    tlogger.record_tabular("Norm", float(np.square(theta).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))

    tlogger.record_tabular("EpisodesThisIter", noisy_lengths.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", noisy_lengths.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(theta).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": noisy_lengths.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(
        episode_reward_mean=eval_returns.mean(),
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
def _train(self): return TrainingResult( episode_reward_mean=self.config["reward_amt"] * self.iteration, episode_len_mean=self.config["reward_amt"], timesteps_this_iter=self.config["iter_timesteps"], time_this_iter_s=self.config["iter_time"], info={})
def result2(t, rew):
    return TrainingResult(time_total_s=t, neg_mean_loss=rew)
def result(t, rew):
    return TrainingResult(
        time_total_s=t, episode_reward_mean=rew, training_iteration=int(t))
def result2(t, rew):
    return TrainingResult(training_iteration=t, neg_mean_loss=rew)
def _train(self):
    config = self.config
    sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
    iter_init_timesteps = self.cur_timestep

    num_loop_iters = 0
    steps_per_iter = config["sample_batch_size"] * len(self.workers)
    while (self.cur_timestep - iter_init_timesteps <
           config["timesteps_per_iteration"]):
        dt = time.time()
        ray.get([
            w.do_steps.remote(
                config["sample_batch_size"], self.cur_timestep)
            for w in self.workers
        ])
        num_loop_iters += 1
        self.cur_timestep += steps_per_iter
        self.steps_since_update += steps_per_iter
        sample_time += time.time() - dt

        if self.cur_timestep > config["learning_starts"]:
            dt = time.time()
            self._update_worker_weights()
            sync_time += (time.time() - dt)
            dt = time.time()
            # Minimize the error in Bellman's equation on a batch sampled
            # from the replay buffer.
            gradients = ray.get([
                w.get_gradient.remote(self.cur_timestep)
                for w in self.workers
            ])
            learn_time += (time.time() - dt)
            dt = time.time()
            for grad in gradients:
                self.actor.apply_gradients(grad)
            apply_time += (time.time() - dt)

        if (self.cur_timestep > config["learning_starts"] and
                self.steps_since_update >
                config["target_network_update_freq"]):
            # Update the target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self._update_worker_weights()
            self.steps_since_update -= config["target_network_update_freq"]
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
        mean_100ep_reward += mean_rew
        mean_100ep_length += mean_len
        num_episodes += episodes
        buffer_size_sum += buf_sz
    mean_100ep_reward /= len(self.workers)
    mean_100ep_length /= len(self.workers)

    info = [
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("sample_time", sample_time),
        ("weight_sync_time", sync_time),
        ("apply_time", apply_time),
        ("learn_time", learn_time),
        ("samples_per_s",
         num_loop_iters * np.float64(steps_per_iter) / sample_time),
        ("learn_samples_per_s",
         num_loop_iters * np.float64(config["train_batch_size"]) *
         np.float64(config["num_workers"]) / learn_time),
    ]

    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)
    return result
def _train(self):
    return TrainingResult(
        episode_reward_mean=10,
        episode_len_mean=10,
        timesteps_this_iter=10,
        info={})
def _train(self):
    agents = self.agents
    config = self.config
    model = self.model

    print("===> iteration", self.iteration)

    iter_start = time.time()
    weights = ray.put(model.get_weights())
    [a.load_weights.remote(weights) for a in agents]
    trajectory, total_reward, traj_len_mean = collect_samples(
        agents, config, self.model.observation_filter,
        self.model.reward_filter)
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps:", trajectory["dones"].shape[0])
    if self.file_writer:
        traj_stats = tf.Summary(value=[
            tf.Summary.Value(
                tag="ppo/rollouts/mean_reward",
                simple_value=total_reward),
            tf.Summary.Value(
                tag="ppo/rollouts/traj_len_mean",
                simple_value=traj_len_mean)
        ])
        self.file_writer.add_summary(traj_stats, self.global_step)
    self.global_step += 1

    def standardized(value):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal.
        return (value - value.mean()) / max(1e-4, value.std())

    if config["use_gae"]:
        trajectory["advantages"] = standardized(trajectory["advantages"])
    else:
        trajectory["returns"] = standardized(trajectory["returns"])

    rollouts_end = time.time()
    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = [
        "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"
    ]
    print(("{:>15}" * len(names)).format(*names))
    trajectory = shuffle(trajectory)
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        trajectory, self.iteration == 0 and config["full_trace_data_load"])
    load_end = time.time()
    rollouts_time = rollouts_end - iter_start
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end
    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
        sgd_start = time.time()
        batch_index = 0
        num_batches = (
            int(tuples_per_device) // int(model.per_device_batch_size))
        loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
        permutation = np.random.permutation(num_batches)
        # Prepare to drop into the debugger.
        if self.iteration == config["tf_debug_iteration"]:
            model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)
        while batch_index < num_batches:
            full_trace = (
                i == 0 and self.iteration == 0 and
                batch_index == config["full_trace_nth_sgd_batch"])
            batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                batch_entropy = model.run_sgd_minibatch(
                    permutation[batch_index] * model.per_device_batch_size,
                    self.kl_coeff, full_trace, self.file_writer)
            loss.append(batch_loss)
            policy_loss.append(batch_policy_loss)
            vf_loss.append(batch_vf_loss)
            kl.append(batch_kl)
            entropy.append(batch_entropy)
            batch_index += 1
        loss = np.mean(loss)
        policy_loss = np.mean(policy_loss)
        vf_loss = np.mean(vf_loss)
        kl = np.mean(kl)
        entropy = np.mean(entropy)
        sgd_end = time.time()
        print("{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
            i, loss, policy_loss, vf_loss, kl, entropy))

        values = []
        if i == config["num_sgd_iter"] - 1:
            metric_prefix = "ppo/sgd/final_iter/"
            values.append(
                tf.Summary.Value(
                    tag=metric_prefix + "kl_coeff",
                    simple_value=self.kl_coeff))
            values.extend([
                tf.Summary.Value(
                    tag=metric_prefix + "mean_entropy",
                    simple_value=entropy),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_loss",
                    simple_value=loss),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_kl",
                    simple_value=kl)
            ])
            if self.file_writer:
                sgd_stats = tf.Summary(value=values)
                self.file_writer.add_summary(sgd_stats, self.global_step)
                self.global_step += 1
        sgd_time += sgd_end - sgd_start

    if kl > 2.0 * config["kl_target"]:
        self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
        self.kl_coeff *= 0.5

    info = {
        "kl_divergence": kl,
        "kl_coefficient": self.kl_coeff,
        "rollouts_time": rollouts_time,
        "shuffle_time": shuffle_time,
        "load_time": load_time,
        "sgd_time": sgd_time,
        "sample_throughput": len(trajectory["observations"]) / sgd_time
    }

    print("kl div:", kl)
    print("kl coeff:", self.kl_coeff)
    print("rollouts time:", rollouts_time)
    print("shuffle time:", shuffle_time)
    print("load time:", load_time)
    print("sgd time:", sgd_time)
    print("sgd examples/s:", len(trajectory["observations"]) / sgd_time)
    print("total time so far:", time.time() - self.start_time)

    result = TrainingResult(
        episode_reward_mean=total_reward,
        episode_len_mean=traj_len_mean,
        timesteps_this_iter=trajectory["dones"].shape[0],
        info=info)

    return result
def result(t, rew):
    return TrainingResult(time_total_s=t, episode_reward_mean=rew)
def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_trainable_flat()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results = self._collect_results(
        theta_id, config["episodes_per_batch"],
        config["timesteps_per_batch"])

    curr_task_results = []
    ob_count_this_batch = 0
    # Loop over the results.
    for result in results:
        assert result.eval_length is None, "We aren't doing eval rollouts."
        assert result.noise_inds_n.ndim == 1
        assert result.returns_n2.shape == (len(result.noise_inds_n), 2)
        assert result.lengths_n2.shape == (len(result.noise_inds_n), 2)
        assert result.returns_n2.dtype == np.float32

        result_num_eps = result.lengths_n2.size
        result_num_timesteps = result.lengths_n2.sum()
        self.episodes_so_far += result_num_eps
        self.timesteps_so_far += result_num_timesteps

        curr_task_results.append(result)
        # Update ob stats.
        if self.policy.needs_ob_stat and result.ob_count > 0:
            self.ob_stat.increment(
                result.ob_sum, result.ob_sumsq, result.ob_count)
            ob_count_this_batch += result.ob_count

    # Assemble the results.
    noise_inds_n = np.concatenate(
        [r.noise_inds_n for r in curr_task_results])
    returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
    lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
    assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
            lengths_n2.shape[0])

    # Process the returns.
    if config["return_proc_mode"] == "centered_rank":
        proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
    else:
        raise NotImplementedError(config["return_proc_mode"])

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
        (self.noise.get(idx, self.policy.num_params)
         for idx in noise_inds_n),
        batch_size=500)
    g /= returns_n2.size
    assert (g.shape == (self.policy.num_params, ) and
            g.dtype == np.float32 and
            count == len(noise_inds_n))
    update_ratio = self.optimizer.update(-g + config["l2coeff"] * theta)

    # Update ob stat (we're never running the policy in the master, but
    # we might be snapshotting the policy).
    if self.policy.needs_ob_stat:
        self.policy.set_ob_stat(self.ob_stat.mean, self.ob_stat.std)

    step_tend = time.time()
    tlogger.record_tabular("EpRewMean", returns_n2.mean())
    tlogger.record_tabular("EpRewStd", returns_n2.std())
    tlogger.record_tabular("EpLenMean", lengths_n2.mean())

    tlogger.record_tabular(
        "Norm", float(np.square(self.policy.get_trainable_flat()).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))

    tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
    tlogger.record_tabular("EpisodesSoFar", self.episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
    tlogger.record_tabular("TimestepsSoFar", self.timesteps_so_far)

    tlogger.record_tabular("ObCount", ob_count_this_batch)

    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - self.tstart)
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(self.policy.get_trainable_flat()).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": lengths_n2.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_this_iter": lengths_n2.sum(),
        "timesteps_so_far": self.timesteps_so_far,
        "ob_count": ob_count_this_batch,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart
    }

    result = TrainingResult(
        episode_reward_mean=returns_n2.mean(),
        episode_len_mean=lengths_n2.mean(),
        timesteps_this_iter=lengths_n2.sum(),
        info=info)

    return result
def _train_async(self):
    apply_time = RunningStat(())
    wait_time = RunningStat(())
    gradient_lag = RunningStat(())
    iter_init_timesteps = self.cur_timestep
    num_gradients_applied = 0
    gradient_list = [
        worker.do_async_step.remote(
            i, self.cur_timestep, self.actor.get_weights(),
            num_gradients_applied)
        for i, worker in enumerate(self.workers)
    ]
    steps = self.config["sample_batch_size"] * len(gradient_list)
    self.cur_timestep += steps
    self.steps_since_update += steps

    while gradient_list:
        dt = time.time()
        gradient, info = ray.get(gradient_list[0])
        gradient_list = gradient_list[1:]
        wait_time.push(time.time() - dt)

        if gradient is not None:
            dt = time.time()
            self.actor.apply_gradients(gradient)
            apply_time.push(time.time() - dt)
            gradient_lag.push(num_gradients_applied - info["gradient_id"])
            num_gradients_applied += 1

        if (self.cur_timestep - iter_init_timesteps <
                self.config["timesteps_per_iteration"]):
            worker_id = info["id"]
            gradient_list.append(
                self.workers[info["id"]].do_async_step.remote(
                    worker_id, self.cur_timestep,
                    self.actor.get_weights(), num_gradients_applied))
            self.cur_timestep += self.config["sample_batch_size"]
            self.steps_since_update += self.config["sample_batch_size"]

        if (self.cur_timestep > self.config["learning_starts"] and
                self.steps_since_update >
                self.config["target_network_update_freq"]):
            # Update the target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self.steps_since_update -= (
                self.config["target_network_update_freq"])
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    stats = ray.get(
        [w.stats.remote(self.cur_timestep) for w in self.workers])
    for stat in stats:
        mean_100ep_reward += stat[0]
        mean_100ep_length += stat[1]
        num_episodes += stat[2]
        exploration = stat[3]
        buffer_size_sum += stat[4]
        set_weights_time = stat[5]
        sample_time = stat[6]
        grad_time = stat[7]
    mean_100ep_reward /= self.config["num_workers"]
    mean_100ep_length /= self.config["num_workers"]

    info = [
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("mean_set_weights_time", set_weights_time),
        ("mean_sample_time", sample_time),
        ("mean_grad_time", grad_time),
        ("mean_apply_time", float(apply_time.mean)),
        ("mean_ray_wait_time", float(wait_time.mean)),
        ("gradient_lag_mean", float(gradient_lag.mean)),
        ("gradient_lag_stdev", float(gradient_lag.std)),
    ]
    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)
    return result
def _train_sync(self):
    config = self.config
    sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
    iter_init_timesteps = self.cur_timestep

    num_loop_iters = 0
    while (self.cur_timestep - iter_init_timesteps <
           config["timesteps_per_iteration"]):
        dt = time.time()
        if self.workers:
            worker_steps = ray.get([
                w.do_steps.remote(
                    config["sample_batch_size"] // len(self.workers),
                    self.cur_timestep, store=False)
                for w in self.workers
            ])
            for steps in worker_steps:
                for obs, action, rew, new_obs, done in steps:
                    self.actor.replay_buffer.add(
                        obs, action, rew, new_obs, done)
        else:
            self.actor.do_steps(
                config["sample_batch_size"], self.cur_timestep, store=True)
        num_loop_iters += 1
        self.cur_timestep += config["sample_batch_size"]
        self.steps_since_update += config["sample_batch_size"]
        sample_time += time.time() - dt

        if self.cur_timestep > config["learning_starts"]:
            if config["multi_gpu_optimize"]:
                dt = time.time()
                times = self.actor.do_multi_gpu_optimize(self.cur_timestep)
                if num_loop_iters <= 1:
                    print("Multi-GPU times", times)
                learn_time += (time.time() - dt)
            else:
                # Minimize the error in Bellman's equation on a batch
                # sampled from the replay buffer.
                for _ in range(
                        max(1, config["train_batch_size"] //
                            config["sgd_batch_size"])):
                    dt = time.time()
                    gradients = [
                        self.actor.sample_buffer_gradient(
                            self.cur_timestep)
                    ]
                    learn_time += (time.time() - dt)
                    dt = time.time()
                    for grad in gradients:
                        self.actor.apply_gradients(grad)
                    apply_time += (time.time() - dt)
            dt = time.time()
            self._update_worker_weights()
            sync_time += (time.time() - dt)

        if (self.cur_timestep > config["learning_starts"] and
                self.steps_since_update >
                config["target_network_update_freq"]):
            # Update the target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self.steps_since_update -= config["target_network_update_freq"]
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    if not self.workers:
        stats = self.actor.stats(self.cur_timestep)
        mean_100ep_reward += stats[0]
        mean_100ep_length += stats[1]
        num_episodes += stats[2]
        exploration = stats[3]
        buffer_size_sum += stats[4]
    for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
        mean_100ep_reward += mean_rew
        mean_100ep_length += mean_len
        num_episodes += episodes
        buffer_size_sum += buf_sz
    mean_100ep_reward /= config["num_workers"]
    mean_100ep_length /= config["num_workers"]

    info = [
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("sample_time", sample_time),
        ("weight_sync_time", sync_time),
        ("apply_time", apply_time),
        ("learn_time", learn_time),
        ("samples_per_s",
         num_loop_iters * np.float64(config["sample_batch_size"]) /
         sample_time),
        ("learn_samples_per_s",
         num_loop_iters * np.float64(config["train_batch_size"]) /
         learn_time),
    ]

    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)
    return result
def _train(self):
    return TrainingResult(timesteps_this_iter=1, done=True)