def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) t_start = time.time() callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # unpack obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, self.num_timesteps // self.n_batch, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.dump_tabular() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope( "kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter( "kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f ] self.train_op, self.q_runner = self.optim.apply_gradients( list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars ] if len(new_uninitialized_vars) != 0: self.sess.run( tf.variables_initializer(new_uninitialized_vars)) self.trained = True t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # pytype:disable=bad-unpacking # true_reward is the reward without discount if isinstance(self.runner, PPO2Runner): # We are using GAE rollout = self.runner.run(callback) obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout else: rollout = self.runner.run(callback) obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout # pytype:enable=bad-unpacking callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step( obs, states, returns, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.dump_tabular() coord.request_stop() coord.join(enqueue_threads) callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = max( self.n_batch // self.nminibatches // self.noptepochs, 1) inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = max( self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1) assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False callback.update_locals(locals()) if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) episode_stats = EpisodeStats(self.n_steps, self.n_envs) if self.replay_ratio > 0: buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size) else: buffer = None t_start = time.time() callback.on_training_start(locals(), globals()) # n_batch samples, 1 on_policy call and multiple off-policy calls for steps in range(0, total_timesteps, self.n_batch): callback.on_rollout_start() enc_obs, obs, actions, rewards, mus, dones, masks = self.runner.run(callback) callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break episode_stats.feed(rewards, dones) if buffer is not None: buffer.put(enc_obs, actions, rewards, mus, dones, masks) if writer is not None: total_episode_reward_logger(self.episode_reward, rewards.reshape((self.n_envs, self.n_steps)), dones.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) # reshape stuff correctly obs = obs.reshape(self.runner.batch_ob_shape) actions = actions.reshape([self.n_batch]) rewards = rewards.reshape([self.n_batch]) mus = mus.reshape([self.n_batch, self.n_act]) dones = dones.reshape([self.n_batch]) masks = masks.reshape([self.runner.batch_ob_shape[0]]) names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, self.num_timesteps, writer) if self.verbose >= 1 and (int(steps / self.n_batch) % log_interval == 0): logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", int(steps / (time.time() - t_start))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, # not just at the terminal state. Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. logger.record_tabular("mean_episode_length", episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", episode_stats.mean_reward()) for name, val in zip(names_ops, values_ops): logger.record_tabular(name, float(val)) logger.dump_tabular() if (self.replay_ratio > 0 and buffer is not None and buffer.has_atleast(self.replay_start)): samples_number = np.random.poisson(self.replay_ratio) for _ in range(samples_number): # get obs, actions, rewards, mus, dones from buffer. obs, actions, rewards, mus, dones, masks = buffer.get() # reshape stuff correctly obs = obs.reshape(self.runner.batch_ob_shape) actions = actions.reshape([self.n_batch]) rewards = rewards.reshape([self.n_batch]) mus = mus.reshape([self.n_batch, self.n_act]) dones = dones.reshape([self.n_batch]) masks = masks.reshape([self.runner.batch_ob_shape[0]]) self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, self.num_timesteps) callback.on_training_end() return self