def train(self):
    config = self.config
    sample_time, learn_time = 0, 0

    for _ in range(config["timesteps_per_iteration"]):
        self.num_timesteps += 1
        dt = time.time()

        # Take action and update exploration to the newest value
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(self.num_timesteps))[0]
        new_obs, rew, done, _ = self.env.step(action)

        # Store transition in the replay buffer.
        self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs

        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        sample_time += time.time() - dt

        if self.num_timesteps > config["learning_starts"] and \
                self.num_timesteps % config["train_freq"] == 0:
            dt = time.time()
            # Minimize the error in Bellman's equation on a batch sampled
            # from replay buffer.
            if config["prioritized_replay"]:
                experience = self.replay_buffer.sample(
                    config["batch_size"],
                    beta=self.beta_schedule.value(self.num_timesteps))
                (obses_t, actions, rewards, obses_tp1, dones, _,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = (
                    self.replay_buffer.sample(config["batch_size"]))
                batch_idxes = None
            td_errors = self.dqn_graph.train(
                self.sess, obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards))
            if config["prioritized_replay"]:
                new_priorities = np.abs(td_errors) + (
                    config["prioritized_replay_eps"])
                self.replay_buffer.update_priorities(
                    batch_idxes, new_priorities)
            learn_time += (time.time() - dt)

        if self.num_timesteps > config["learning_starts"] and (
                self.num_timesteps %
                config["target_network_update_freq"] == 0):
            # Update target network periodically.
            self.dqn_graph.update_target(self.sess)

    mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
    mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
    num_episodes = len(self.episode_rewards)
    info = {
        "sample_time": sample_time,
        "learn_time": learn_time,
        "steps": self.num_timesteps,
        "episodes": num_episodes,
        "exploration": int(
            100 * self.exploration.value(self.num_timesteps)),
    }

    logger.record_tabular("sample_time", sample_time)
    logger.record_tabular("learn_time", learn_time)
    logger.record_tabular("steps", self.num_timesteps)
    logger.record_tabular("buffer_size", len(self.replay_buffer))
    logger.record_tabular("episodes", num_episodes)
    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
    logger.record_tabular(
        "% time spent exploring",
        int(100 * self.exploration.value(self.num_timesteps)))
    logger.dump_tabular()

    res = TrainingResult(
        self.experiment_id.hex, self.num_iterations,
        mean_100ep_reward, mean_100ep_length, info)
    self.num_iterations += 1
    return res

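# Note on the distributed variants below (descriptions derived from the code):
#   * _train_sync collects experience from the remote workers (or from the
#     local actor when there are no workers), adds it to the driver-side
#     replay buffer, and runs the optimization step on the driver.
#   * _train has each worker sample into its own replay buffer and compute a
#     gradient; the driver applies the gradients and broadcasts the updated
#     weights back each loop iteration.
#   * _train_async overlaps sampling and learning by applying worker
#     gradients as they arrive and immediately resubmitting work, tracking
#     how stale each applied gradient is (gradient_lag).
# In every variant the graph's train op is expected to minimize the usual
# one-step TD error, td = Q(s, a) - (r + gamma * (1 - done) * max_a'
# Q_target(s', a')); the exact loss applied to td (e.g. Huber vs. squared)
# depends on the dqn_graph definition.
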
def _train_sync(self):
    config = self.config
    sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
    iter_init_timesteps = self.cur_timestep
    num_loop_iters = 0

    while (self.cur_timestep - iter_init_timesteps <
           config["timesteps_per_iteration"]):
        dt = time.time()
        if self.workers:
            worker_steps = ray.get([
                w.do_steps.remote(
                    config["sample_batch_size"] // len(self.workers),
                    self.cur_timestep, store=False)
                for w in self.workers
            ])
            for steps in worker_steps:
                for obs, action, rew, new_obs, done in steps:
                    self.actor.replay_buffer.add(
                        obs, action, rew, new_obs, done)
        else:
            self.actor.do_steps(
                config["sample_batch_size"], self.cur_timestep, store=True)
        num_loop_iters += 1
        self.cur_timestep += config["sample_batch_size"]
        self.steps_since_update += config["sample_batch_size"]
        sample_time += time.time() - dt

        if self.cur_timestep > config["learning_starts"]:
            if config["multi_gpu_optimize"]:
                dt = time.time()
                times = self.actor.do_multi_gpu_optimize(self.cur_timestep)
                if num_loop_iters <= 1:
                    print("Multi-GPU times", times)
                learn_time += (time.time() - dt)
            else:
                # Minimize the error in Bellman's equation on a batch
                # sampled from replay buffer.
                for _ in range(
                        max(1, config["train_batch_size"] //
                            config["sgd_batch_size"])):
                    dt = time.time()
                    gradients = [
                        self.actor.sample_buffer_gradient(
                            self.cur_timestep)
                    ]
                    learn_time += (time.time() - dt)
                    dt = time.time()
                    for grad in gradients:
                        self.actor.apply_gradients(grad)
                    apply_time += (time.time() - dt)
            dt = time.time()
            self._update_worker_weights()
            sync_time += (time.time() - dt)

        if (self.cur_timestep > config["learning_starts"] and
                self.steps_since_update >
                config["target_network_update_freq"]):
            # Update target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self.steps_since_update -= config["target_network_update_freq"]
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    if not self.workers:
        stats = self.actor.stats(self.cur_timestep)
        mean_100ep_reward += stats[0]
        mean_100ep_length += stats[1]
        num_episodes += stats[2]
        exploration = stats[3]
        buffer_size_sum += stats[4]
    for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
        mean_100ep_reward += mean_rew
        mean_100ep_length += mean_len
        num_episodes += episodes
        buffer_size_sum += buf_sz
    mean_100ep_reward /= config["num_workers"]
    mean_100ep_length /= config["num_workers"]

    info = [
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("sample_time", sample_time),
        ("weight_sync_time", sync_time),
        ("apply_time", apply_time),
        ("learn_time", learn_time),
        ("samples_per_s",
         num_loop_iters * np.float64(config["sample_batch_size"]) /
         sample_time),
        ("learn_samples_per_s",
         num_loop_iters * np.float64(config["train_batch_size"]) /
         learn_time),
    ]

    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)

    return result

def _train(self):
    config = self.config
    sample_time, sync_time, learn_time, apply_time = 0, 0, 0, 0
    iter_init_timesteps = self.cur_timestep
    num_loop_iters = 0
    steps_per_iter = config["sample_batch_size"] * len(self.workers)

    while (self.cur_timestep - iter_init_timesteps <
           config["timesteps_per_iteration"]):
        dt = time.time()
        ray.get([
            w.do_steps.remote(config["sample_batch_size"], self.cur_timestep)
            for w in self.workers
        ])
        num_loop_iters += 1
        self.cur_timestep += steps_per_iter
        self.steps_since_update += steps_per_iter
        sample_time += time.time() - dt

        if self.cur_timestep > config["learning_starts"]:
            dt = time.time()
            self._update_worker_weights()
            sync_time += (time.time() - dt)
            dt = time.time()
            # Minimize the error in Bellman's equation on a batch sampled
            # from replay buffer.
            gradients = ray.get([
                w.get_gradient.remote(self.cur_timestep)
                for w in self.workers
            ])
            learn_time += (time.time() - dt)
            dt = time.time()
            for grad in gradients:
                self.actor.apply_gradients(grad)
            apply_time += (time.time() - dt)

        if (self.cur_timestep > config["learning_starts"] and
                self.steps_since_update >
                config["target_network_update_freq"]):
            # Update target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self._update_worker_weights()
            self.steps_since_update -= config["target_network_update_freq"]
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    for mean_rew, mean_len, episodes, exploration, buf_sz in ray.get(
            [w.stats.remote(self.cur_timestep) for w in self.workers]):
        mean_100ep_reward += mean_rew
        mean_100ep_length += mean_len
        num_episodes += episodes
        buffer_size_sum += buf_sz
    mean_100ep_reward /= len(self.workers)
    mean_100ep_length /= len(self.workers)

    info = [
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("sample_time", sample_time),
        ("weight_sync_time", sync_time),
        ("apply_time", apply_time),
        ("learn_time", learn_time),
        ("samples_per_s",
         num_loop_iters * np.float64(steps_per_iter) / sample_time),
        ("learn_samples_per_s",
         num_loop_iters * np.float64(config["train_batch_size"]) *
         np.float64(config["num_workers"]) / learn_time),
    ]

    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)

    return result

def _train_async(self):
    apply_time = RunningStat(())
    wait_time = RunningStat(())
    gradient_lag = RunningStat(())
    iter_init_timesteps = self.cur_timestep
    num_gradients_applied = 0
    gradient_list = [
        worker.do_async_step.remote(
            i, self.cur_timestep, self.actor.get_weights(),
            num_gradients_applied)
        for i, worker in enumerate(self.workers)
    ]
    steps = self.config["sample_batch_size"] * len(gradient_list)
    self.cur_timestep += steps
    self.steps_since_update += steps

    while gradient_list:
        dt = time.time()
        gradient, info = ray.get(gradient_list[0])
        gradient_list = gradient_list[1:]
        wait_time.push(time.time() - dt)

        if gradient is not None:
            dt = time.time()
            self.actor.apply_gradients(gradient)
            apply_time.push(time.time() - dt)
            gradient_lag.push(num_gradients_applied - info["gradient_id"])
            num_gradients_applied += 1

        if (self.cur_timestep - iter_init_timesteps <
                self.config["timesteps_per_iteration"]):
            worker_id = info["id"]
            gradient_list.append(
                self.workers[info["id"]].do_async_step.remote(
                    worker_id, self.cur_timestep, self.actor.get_weights(),
                    num_gradients_applied))
            self.cur_timestep += self.config["sample_batch_size"]
            self.steps_since_update += self.config["sample_batch_size"]

        if (self.cur_timestep > self.config["learning_starts"] and
                self.steps_since_update >
                self.config["target_network_update_freq"]):
            # Update target network periodically.
            self.actor.dqn_graph.update_target(self.actor.sess)
            self.steps_since_update -= (
                self.config["target_network_update_freq"])
            self.num_target_updates += 1

    mean_100ep_reward = 0.0
    mean_100ep_length = 0.0
    num_episodes = 0
    buffer_size_sum = 0
    stats = ray.get(
        [w.stats.remote(self.cur_timestep) for w in self.workers])
    for stat in stats:
        mean_100ep_reward += stat[0]
        mean_100ep_length += stat[1]
        num_episodes += stat[2]
        exploration = stat[3]
        buffer_size_sum += stat[4]
        set_weights_time = stat[5]
        sample_time = stat[6]
        grad_time = stat[7]
    mean_100ep_reward /= self.config["num_workers"]
    mean_100ep_length /= self.config["num_workers"]

    info = [
        ("mean_100ep_reward", mean_100ep_reward),
        ("exploration_frac", exploration),
        ("steps", self.cur_timestep),
        ("episodes", num_episodes),
        ("buffer_sizes_sum", buffer_size_sum),
        ("target_updates", self.num_target_updates),
        ("mean_set_weights_time", set_weights_time),
        ("mean_sample_time", sample_time),
        ("mean_grad_time", grad_time),
        ("mean_apply_time", float(apply_time.mean)),
        ("mean_ray_wait_time", float(wait_time.mean)),
        ("gradient_lag_mean", float(gradient_lag.mean)),
        ("gradient_lag_stdev", float(gradient_lag.std)),
    ]

    for k, v in info:
        logger.record_tabular(k, v)
    logger.dump_tabular()

    result = TrainingResult(
        episode_reward_mean=mean_100ep_reward,
        episode_len_mean=mean_100ep_length,
        timesteps_this_iter=self.cur_timestep - iter_init_timesteps,
        info=info)

    return result