def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs, )) t_start = time.time() for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, true_reward = runner.run( ) _, value_loss, policy_entropy = self._train_step( obs, states, rewards, masks, actions, values, update, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) == False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * self.n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) t_start = time.time() callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # unpack obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, self.num_timesteps // self.n_batch, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.dump_tabular() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs,)) # Training stats (when using Monitor wrapper) ep_info_buf = deque(maxlen=100) t_start = time.time() for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run() ep_info_buf.extend(ep_infos) _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, self.num_timesteps // self.n_batch, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) self.num_timesteps += self.n_batch if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() info = {"cte": 0.0} else: obs = self.env.reset() info = {"cte": 0.0} file1 = open("cte.log", "a") file2 = open("obs.log", "a") for step in range(total_timesteps): if callback is not None: if callback(locals(), globals()) is False: break action = self.env.action_space.sample() assert action.shape == self.env.action_space.shape new_obs, reward, done, new_info = self.env.step(action) print (obs) print (info["cte"], action, new_info["cte"]) if math.fabs(info["cte"] - new_info["cte"]) < 1.5: file1.write("%f\t %f\t %f\t %f\n" %(info["cte"], action[0], action[1], new_info["cte"])) obs_flatten = obs.flatten() file2.write(' '.join(str(item) for item in obs_flatten) + "\n") #file2.write("\n") obs = new_obs info = new_info if done: if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() if is_teleop_env: self.env.is_training = False print("Final optimization before saving") self.env.reset() file1.close() file2.close() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="SIL_A2C"): with SetVerbosity(self.verbose), \ TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: # type: tf.summary.FileWriter self._setup_learn(seed) self.save_directory = Path(writer.get_logdir()) self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) runner = SuccessorFeatureA2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs,)) t_start = time.time() for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, true_reward, raw_rewards, features, reward_bonuses = runner.run() _, value_loss, policy_entropy, sf_loss = self._train_step(obs, states, rewards, masks, actions, values, update, writer, features=features, rewards_bonuses=reward_bonuses) sil_loss, sil_adv, sil_samples, sil_nlogp = self._train_sil() n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, raw_rewards.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) summary = tf.Summary(value=[tf.Summary.Value( tag="episode_reward/best_reward", simple_value=self.sil.get_best_reward())]) writer.add_summary(summary, update * (self.n_batch + 1)) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * self.n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular('sf_loss', float(sf_loss)) logger.record_tabular("explained_variance", float(explained_var)) logger.record_tabular("best_episode_reward", float(self.sil.get_best_reward())) if self.sil_update > 0: logger.record_tabular("sil_num_episodes", float(self.sil.num_episodes())) logger.record_tabular("sil_valid_samples", float(sil_samples)) logger.record_tabular("sil_steps", float(self.sil.num_steps())) logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PlannedPPO", reset_num_timesteps=True): self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) return NotImplementedError
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for step in range(total_timesteps): if callback is not None: callback(locals(), globals()) # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(step) + self.exploration.value(step) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, step) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if step > self.learning_starts and step % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + step) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % step) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, step) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) if step > self.learning_starts and step % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(step))) logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", \ reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # a list for tensorboard logging, to prevent logging with the same step number, if it already occured self.tb_seen_steps = [] rank = MPI.COMM_WORLD.Get_rank() # we assume symmetric actions. assert np.all( np.abs(self.env.action_space.low) == self.env.action_space.high) if self.verbose >= 2: logger.log('Using agent with the following configuration:') logger.log(str(self.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) self.episode_reward = np.zeros((1, )) episode_successes = [] with self.sess.as_default(), self.graph.as_default(): # Prepare everything. self._reset() obs = self.env.reset() eval_obs = None if self.eval_env is not None: eval_obs = self.eval_env.reset() episode_reward = 0. episode_step = 0 episodes = 0 step = 0 total_steps = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] eval_episode_rewards = [] eval_qs = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 epoch = 0 while True: for _ in range(log_interval): # Perform rollouts. for _ in range(self.nb_rollout_steps): if total_steps >= total_timesteps: return self # Predict next action. action, q_value = self._policy(obs, apply_noise=True, compute_q=True) assert action.shape == self.env.action_space.shape # Execute next action. if rank == 0 and self.render: self.env.render() # Randomly sample actions from a uniform distribution # with a probabilty self.random_exploration (used in HER + DDPG) if np.random.rand() < self.random_exploration: rescaled_action = action = self.action_space.sample( ) else: rescaled_action = action * np.abs( self.action_space.low) rescaled_action = np.where(action)[0][0] new_obs, reward, done, info = self.env.step( rescaled_action) if writer is not None: ep_rew = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) step += 1 total_steps += 1 self.num_timesteps += 1 if rank == 0 and self.render: self.env.render() episode_reward += reward episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q_value) self._store_transition(obs, action, reward, new_obs, done) obs = new_obs if callback is not None: # Only stop training if return value is False, not when it is None. # This is for backwards compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: return self if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append( float(maybe_is_success)) self._reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(self.nb_train_steps): # Not enough samples in the replay buffer if not self.replay_buffer.can_sample( self.batch_size): break # Adapt param noise, if necessary. if len(self.replay_buffer) >= self.batch_size and \ t_train % self.param_noise_adaption_interval == 0: distance = self._adapt_param_noise() epoch_adaptive_distances.append(distance) # weird equation to deal with the fact the nb_train_steps will be different # to nb_rollout_steps step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) + self.num_timesteps - self.nb_rollout_steps) critic_loss, actor_loss = self._train_step( step, writer, log=t_train == 0) epoch_critic_losses.append(critic_loss) epoch_actor_losses.append(actor_loss) self._update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if self.eval_env is not None: eval_episode_reward = 0. for _ in range(self.nb_eval_steps): if total_steps >= total_timesteps: return self eval_action, eval_q = self._policy( eval_obs, apply_noise=False, compute_q=True) eval_obs, eval_r, eval_done, _ = self.eval_env.step( eval_action * np.abs(self.action_space.low)) if self.render_eval: self.eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: if not isinstance(self.env, VecEnv): eval_obs = self.eval_env.reset() eval_episode_rewards.append( eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = self._get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean( epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean( epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean( epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean( epoch_critic_losses) if len(epoch_adaptive_distances) != 0: combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float( step) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std( epoch_actions) # Evaluation statistics. if self.eval_env is not None: combined_stats['eval/return'] = np.mean( eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len( eval_episode_rewards) def as_scalar(scalar): """ check and return the input if it is a scalar, otherwise raise ValueError :param scalar: (Any) the object to check :return: (Number) the scalar if x is a scalar """ if isinstance(scalar, np.ndarray): assert scalar.size == 1 return scalar[0] elif np.isscalar(scalar): return scalar else: raise ValueError('expected scalar, got %s' % scalar) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = step for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(self.env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: pickle.dump(self.env.get_state(), file_handler) if self.eval_env and hasattr(self.eval_env, 'get_state'): with open( os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: pickle.dump(self.eval_env.get_state(), file_handler)
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs().squeeze() reward_ = self._vec_normalize_env.get_original_reward().squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) if self.expert_exp is not None: self.add_expert_exp() obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs(td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities(batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] obs = self.env.reset() for i in range(128): action = self.env.action_space.sample() new_obs, reward, done, info = self.env.step(action) # print(new_obs) # self.env.render() self.iiayn.update_history([obs]) obs = new_obs for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action #* np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) print(step, action) # self.env.render() self.iiayn.update_history([obs]) if step % 2048 == 0: self.iiayn.activate_buffer() # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done or step%1024 == 0: obs = self.env.reset() # if not isinstance(self.env, VecEnv): # obs = self.env.reset() episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, env_eval, callback=None, seed=None, path=None, dis_path=None, score_path=None, dis_eval_interval=100, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): self.eval_env = env_eval new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] dis_eval_array = [] # (total_step % eval_intervel) x 2 x n_batch self.ep_length = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() noise = np.zeros(self.noise_dim) else: noise = self.policy_tf.gen_noise(obs[None]).flatten() action = self.policy_tf.step(obs[None],noise[None] ,deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) self.ep_length += 1 # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done), noise) episode_rewards[-1] += reward reset_flag = done or self.ep_length >= self.max_ep_length if reset_flag: if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() episode_rewards.append(0.0) self.ep_length = 0 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) else: obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr, dis_eval_array, dis_eval_interval, dis_path)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if self.num_timesteps % 2000 == 0: eval_ob = self.eval_env.reset() eval_epi_rewards = 0 eval_epis = 0 eval_performance = [] eval_ep_step = 0 while True: eval_noise = self.policy_tf.gen_noise(eval_ob[None]).flatten() eval_action = self.policy_tf.step(eval_ob[None], eval_noise[None], deterministic=True).flatten() eval_rescaled_action = eval_action * np.abs(self.action_space.low) eval_new_obs, eval_reward, eval_done, eval_info = self.eval_env.step(eval_rescaled_action) eval_epi_rewards += eval_reward eval_ob = eval_new_obs eval_ep_step += 1 if eval_done or eval_ep_step >= self.max_ep_length: eval_ob = self.eval_env.reset() eval_performance.append(eval_epi_rewards) eval_epi_rewards = 0 eval_epis += 1 eval_ep_step = 0 if eval_epis > 5: break with open(score_path, 'a') as f2: f2.write("%i %f\n" % (self.num_timesteps, np.mean(eval_performance))) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and reset_flag and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) with open(path,'a') as f1: f1.write("%f " % step) f1.write("%f " % mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) with open(path,'a') as f1: f1.write("%f " % safe_mean([ep_info['r'] for ep_info in ep_info_buf])) f1.write("%f " % safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) with open(path,'a') as f1: f1.write("%f " % n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="PPO1", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ "an instance of common.policies.ActorCriticPolicy." with self.sess.as_default(): self.adam.sync() callback.on_training_start(locals(), globals()) # Prepare for rollouts seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch, callback=callback) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() # rolling buffer for episode lengths len_buffer = deque(maxlen=100) # rolling buffer for episode rewards reward_buffer = deque(maxlen=100) while True: if timesteps_so_far >= total_timesteps: break if self.schedule == 'constant': cur_lrmult = 1.0 elif self.schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / total_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() # Stop training early (triggered by the callback) if not seg.get('continue_training', True): # pytype: disable=attribute-error break add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observations, actions = seg["observations"], seg["actions"] atarg, tdlamret = seg["adv"], seg["tdlamret"] # true_rew is the reward without discount if writer is not None: total_episode_reward_logger( self.episode_reward, seg["true_rewards"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, self.num_timesteps) # predicted value function before udpate vpredbefore = seg["vpred"] # standardized advantage function estimate atarg = (atarg - atarg.mean()) / atarg.std() dataset = Dataset(dict(ob=observations, ac=actions, atarg=atarg, vtarg=tdlamret), shuffle=not self.policy.recurrent) optim_batchsize = self.optim_batchsize or observations.shape[ 0] # set old parameter values to new parameter values self.assign_old_eq_new(sess=self.sess) logger.log("Optimizing...") logger.log(fmt_row(13, self.loss_names)) # Here we do a bunch of optimization epochs over the data for k in range(self.optim_epochs): # list of tuples, each of which gives the loss for a minibatch losses = [] for i, batch in enumerate( dataset.iterate_once(optim_batchsize)): steps = ( self.num_timesteps + k * optim_batchsize + int(i * (optim_batchsize / len(dataset.data_map)))) if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata # (memory, compute time, ...) if self.full_tensorboard_log and (1 + k) % 10 == 0: run_options = tf.compat.v1.RunOptions( trace_level=tf.compat.v1.RunOptions. FULL_TRACE) run_metadata = tf.compat.v1.RunMetadata() summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % steps) else: summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) writer.add_summary(summary, steps) else: _, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) self.adam.update(grad, self.optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in dataset.iterate_once(optim_batchsize): newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) losses.append(newlosses) mean_losses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, mean_losses)) for (loss_val, name) in zipsame(mean_losses, self.loss_names): logger.record_tabular("loss_" + name, loss_val) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # local values lrlocal = (seg["ep_lens"], seg["ep_rets"]) # list of tuples listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, rews = map(flatten_lists, zip(*listoflrpairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce( seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) episode_stats = EpisodeStats(self.n_steps, self.n_envs) runner = _Runner(env=self.env, model=self, n_steps=self.n_steps) self.episode_reward = np.zeros((self.n_envs, )) if self.replay_ratio > 0: buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size) else: buffer = None t_start = time.time() # n_batch samples, 1 on_policy call and multiple off-policy calls for steps in range(0, total_timesteps, self.n_batch): enc_obs, obs, actions, rewards, mus, dones, masks = runner.run( ) episode_stats.feed(rewards, dones) if buffer is not None: buffer.put(enc_obs, actions, rewards, mus, dones, masks) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, rewards.reshape((self.n_envs, self.n_steps)), dones.reshape((self.n_envs, self.n_steps)), writer, steps) # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.n_batch]) rewards = rewards.reshape([runner.n_batch]) mus = mus.reshape([runner.n_batch, runner.n_act]) dones = dones.reshape([runner.n_batch]) masks = masks.reshape([runner.batch_ob_shape[0]]) names_ops, values_ops = self._train_step( obs, actions, rewards, dones, mus, self.initial_state, masks, steps, writer) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) == False: break if self.verbose >= 1 and (int(steps / runner.n_batch) % log_interval == 0): logger.record_tabular("total_timesteps", steps) logger.record_tabular("fps", int(steps / (time.time() - t_start))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, # not just at the terminal state. Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. logger.record_tabular("mean_episode_length", episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", episode_stats.mean_reward()) for name, val in zip(names_ops, values_ops): logger.record_tabular(name, float(val)) logger.dump_tabular() if self.replay_ratio > 0 and buffer.has_atleast( self.replay_start): samples_number = np.random.poisson(self.replay_ratio) for _ in range(samples_number): # get obs, actions, rewards, mus, dones from buffer. obs, actions, rewards, mus, dones, masks = buffer.get() # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.n_batch]) rewards = rewards.reshape([runner.n_batch]) mus = mus.reshape([runner.n_batch, runner.n_act]) dones = dones.reshape([runner.n_batch]) masks = masks.reshape([runner.batch_ob_shape[0]]) self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, steps) return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() reset = True ############################################################ # MODIFICATION: # Track list of actions taken each episode. This is # intentionally not a set so that we can use np.isin. action_list = list() ############################################################ for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): #################################################### # MODIFICATION: # Rename variable from original, since it's now # going to come back as an array due to the # modified build_act function being used to # construct everything. action_arr = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] #################################################### # ORIGINAL: # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] ######################################################## # MODIFICATION: # Get the best action that has not yet been taken this # episode. action = \ action_arr[np.argmin(np.isin(action_arr, action_list))] # Add this action to the list. action_list.append(action) ######################################################## env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: #################################################### # MODIFICATION: # Clear the list. action_list.clear() #################################################### maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self
def learn(self, total_timesteps, seed=None, tb_log_name='DQN', test_interval=1, reset_num_timesteps=True): if reset_num_timesteps: self.num_timesteps = 0 with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.replay_buffer = ReplayBuffer(size=self.buffer_size) self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=1.0, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset(train=True) best_train_score = None best_test_score = None self.reward_curve = [] for _ in range(total_timesteps): update_eps = self.exploration.value(self.num_timesteps) with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps)[0] new_obs, rew, done, _ = self.env.step(action) self.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if self.num_timesteps > self.learning_starts: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights = np.ones_like(rewards) if writer is not None: if (1 + self.num_timesteps) % 100 == 0: summary, td_errors = self.train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self.train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.num_timesteps > self.learning_starts and self.num_timesteps % self.target_network_update_freq == 0: self.update_target(sess=self.sess) if done: print('-------------------------------------') print('steps | {}'.format( self.num_timesteps)) print('episodes | {}'.format( len(episode_rewards))) epsilon = int(100 * self.exploration.value(self.num_timesteps)) print('% time spent exploring | {}'.format(epsilon)) print('--') mean_100ep_reward = -np.inf if len( episode_rewards[-16:-1]) == 0 else round( float(np.mean(episode_rewards[-16:-1])), 1) self.reward_curve.append(mean_100ep_reward) print('mean 10 episode reward | {:.1f}'.format( mean_100ep_reward)) journal = self.env.sim.journal print('Total operations | {}'.format( len(self.env.sim.journal))) longs = [x for x in journal if x['Type'] == 'LONG'] shorts = [x for x in journal if x['Type'] == 'SHORT'] print('Long/Short | {}/{}'.format( len(longs), len(shorts))) print('Avg duration trades | {:.2f}'.format( np.mean([j['Trade Duration'] for j in journal]))) total_profit = sum([j['Profit'] for j in journal]) print('Total profit | {:.2f}'.format( total_profit)) print('Avg profit per trade | {:.3f}'.format( total_profit / self.env.sim.total_trades)) if epsilon <= self.exploration_final_eps * 100: if best_train_score is None or total_profit > best_train_score: self.save('saves/best_model_train.pkl') best_train_score = total_profit if self.num_timesteps % test_interval == 0: print('--') test_episode_rewards, test_longs, test_shorts, test_ave_profit_per_trade = self.test( ) print('Total profit test > {:.2f}'.format( test_episode_rewards)) print('Long/Short test > {}/{}'.format( test_longs, test_shorts)) print('Avg profit per trade test > {:.3f}'.format( test_ave_profit_per_trade)) if epsilon <= self.exploration_final_eps * 100: if best_test_score is None or test_episode_rewards > best_test_score: self.save('saves/best_model_test.pkl') best_test_score = test_episode_rewards print('-------------------------------------') obs = self.env.reset() episode_rewards.append(0.0) if self.num_timesteps + ( self.num_timesteps / len(episode_rewards)) >= total_timesteps: self.save('saves/final_model.pkl') break self.num_timesteps += 1 return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope( "kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter( "kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f ] self.train_op, self.q_runner = self.optim.apply_gradients( list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars ] if len(new_uninitialized_vars) != 0: self.sess.run( tf.variables_initializer(new_uninitialized_vars)) self.trained = True # Use GAE if self.gae_lambda is not None: runner = PPO2Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.gae_lambda) else: runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs, )) t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] # Training stats (when using Monitor wrapper) ep_info_buf = deque(maxlen=100) for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount if isinstance(runner, PPO2Runner): # We are using GAE obs, returns, masks, actions, values, _, states, ep_infos, true_reward = runner.run( ) else: obs, states, returns, masks, actions, values, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step( obs, states, returns, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.dump_tabular() self.num_timesteps += self.n_batch + 1 coord.request_stop() coord.join(enqueue_threads) return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) episode_stats = EpisodeStats(self.n_steps, self.n_envs) if self.replay_ratio > 0: buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size) else: buffer = None t_start = time.time() callback.on_training_start(locals(), globals()) # n_batch samples, 1 on_policy call and multiple off-policy calls for steps in range(0, total_timesteps, self.n_batch): callback.on_rollout_start() enc_obs, obs, actions, rewards, mus, dones, masks = self.runner.run(callback) callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break episode_stats.feed(rewards, dones) if buffer is not None: buffer.put(enc_obs, actions, rewards, mus, dones, masks) if writer is not None: total_episode_reward_logger(self.episode_reward, rewards.reshape((self.n_envs, self.n_steps)), dones.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) # reshape stuff correctly obs = obs.reshape(self.runner.batch_ob_shape) actions = actions.reshape([self.n_batch]) rewards = rewards.reshape([self.n_batch]) mus = mus.reshape([self.n_batch, self.n_act]) dones = dones.reshape([self.n_batch]) masks = masks.reshape([self.runner.batch_ob_shape[0]]) names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, self.num_timesteps, writer) if self.verbose >= 1 and (int(steps / self.n_batch) % log_interval == 0): logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", int(steps / (time.time() - t_start))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, # not just at the terminal state. Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. logger.record_tabular("mean_episode_length", episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", episode_stats.mean_reward()) for name, val in zip(names_ops, values_ops): logger.record_tabular(name, float(val)) logger.dump_tabular() if (self.replay_ratio > 0 and buffer is not None and buffer.has_atleast(self.replay_start)): samples_number = np.random.poisson(self.replay_ratio) for _ in range(samples_number): # get obs, actions, rewards, mus, dones from buffer. obs, actions, rewards, mus, dones, masks = buffer.get() # reshape stuff correctly obs = obs.reshape(self.runner.batch_ob_shape) actions = actions.reshape([self.n_batch]) rewards = rewards.reshape([self.n_batch]) mus = mus.reshape([self.n_batch, self.n_act]) dones = dones.reshape([self.n_batch]) masks = masks.reshape([self.runner.batch_ob_shape[0]]) self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, self.num_timesteps) callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", eval_every_n=5, reset_num_timesteps=True, record_video=False, log_dir=""): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(1, nupdates + 1): if update % eval_every_n == 1: print("[RAISIM_GYM] Visualizing in RaiSimOgre") obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = \ runner.run(test_mode=True, record_video=record_video, video_name=log_dir+"/"+str(update-1)+".mp4") print("Average rewards in this test episode ", ep_infos[0]['r']) # tensorboard_log(logger, ep_infos, self.sess) assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) bestscore = 0 new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) fig = plt.figure() ax = fig.add_subplot(111) x, y = [0], [0] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() episode_stats = EpisodeStats(self.n_steps, self.n_envs) t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ( "The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used.") batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # Unpack obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout callback.update_locals(locals()) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break episode_stats.feed(true_reward, masks) self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = max( self.n_batch // self.nminibatches // self.noptepochs, 1) inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = max( self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1) assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose == 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) #if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: #logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) #logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv("mean_episode_length", episode_stats.mean_length()) logger.logkv("mean_episode_reward", episode_stats.mean_reward()) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if self.verbose == 2 and ( update % log_interval == 0 or update == 1) and episode_stats.mean_reward() > bestscore: bestscore = episode_stats.mean_reward() logger.logkv('time_elapsed', t_start - t_first_start) logger.logkv("mean_episode_reward", bestscore) logger.dumpkvs() x.append(self.num_timesteps) y.append(bestscore) ax.plot(x, y, marker='.', color='b') fig.canvas.draw() ax.set_xlim(left=0, right=total_timesteps) ax.set(title='Street Fighter 2 AI - PPO2 Algorithm', ylabel='Fitness score', xlabel='Timesteps') fig.show() plt.pause(0.001) callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) with self.sess.as_default(): seg_gen = traj_segment_generator( self.policy_pi, self.env, self.timesteps_per_batch, reward_giver=self.reward_giver, gail=self.using_gail) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() len_buffer = deque( maxlen=40) # rolling buffer for episode lengths reward_buffer = deque( maxlen=40) # rolling buffer for episode rewards self.episode_reward = np.zeros((self.n_envs, )) true_reward_buffer = None if self.using_gail: true_reward_buffer = deque(maxlen=40) # Initialize dataloader batchsize = self.timesteps_per_batch // self.d_step self.expert_dataset.init_dataloader(batchsize) # Stats not used for now # TODO: replace with normal tb logging # g_loss_stats = Stats(loss_names) # d_loss_stats = Stats(reward_giver.loss_name) # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) while True: if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if total_timesteps and timesteps_so_far >= total_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(vec): return self.allmean( self.compute_fvp( vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") # g_step = 1 when not using GAIL mean_losses = None vpredbefore = None tdlamret = None observation = None action = None seg = None for k in range(self.g_step): with self.timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observation, action, atarg, tdlamret = seg["ob"], seg[ "ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate # true_rew is the reward without discount if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, seg["true_rew"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, self.num_timesteps) args = seg["ob"], seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] self.assign_old_eq_new(sess=self.sess) with self.timed("computegrad"): steps = self.num_timesteps + (k + 1) * ( seg["total_timestep"] / self.g_step) run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata( ) if self.full_tensorboard_log else None # run loss backprop with summary, and save the metadata (memory, compute time, ...) if writer is not None: summary, grad, *lossbefore = self.compute_lossandgrad( *args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) if self.full_tensorboard_log: writer.add_run_metadata( run_metadata, 'step%d' % steps) writer.add_summary(summary, steps) else: _, grad, *lossbefore = self.compute_lossandgrad( *args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) lossbefore = self.allmean(np.array(lossbefore)) grad = self.allmean(grad) if np.allclose(grad, 0): logger.log("Got zero gradient. not updating") else: with self.timed("conjugate_gradient"): stepdir = conjugate_gradient( fisher_vector_product, grad, cg_iters=self.cg_iters, verbose=self.rank == 0 and self.verbose >= 1) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot( fisher_vector_product(stepdir)) # abs(shs) to avoid taking square root of negative values lagrange_multiplier = np.sqrt( abs(shs) / self.max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lagrange_multiplier expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = self.get_flat() thnew = None for _ in range(10): thnew = thbefore + fullstep * stepsize self.set_from_flat(thnew) mean_losses = surr, kl_loss, *_ = self.allmean( np.array( self.compute_losses(*args, sess=self.sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(mean_losses).all(): logger.log( "Got non-finite value of losses -- bad!" ) elif kl_loss > self.max_kl * 1.5: logger.log( "violated KL constraint. shrinking step." ) elif improve < 0: logger.log( "surrogate didn't improve. shrinking step." ) else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") self.set_from_flat(thbefore) if self.nworkers > 1 and iters_so_far % 20 == 0: # list of tuples paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), self.vfadam.getflat().sum())) assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with self.timed("vf"): for _ in range(self.vf_iters): # NOTE: for recurrent policies, use shuffle=False? for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128, shuffle=True): grad = self.allmean( self.compute_vflossandgrad( mbob, mbob, mbret, sess=self.sess)) self.vfadam.update(grad, self.vf_stepsize) for (loss_name, loss_val) in zip(self.loss_names, mean_losses): logger.record_tabular(loss_name, loss_val) logger.record_tabular( "explained_variance_tdlam_before", explained_variance(vpredbefore, tdlamret)) if self.using_gail: # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, self.reward_giver.loss_name)) assert len(observation) == self.timesteps_per_batch batch_size = self.timesteps_per_batch // self.d_step # NOTE: uses only the last g step for observation d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch # NOTE: for recurrent policies, use shuffle=False? for ob_batch, ac_batch in dataset.iterbatches( (observation, action), include_final_partial_batch=False, batch_size=batch_size, shuffle=True): ob_expert, ac_expert = self.expert_dataset.get_next_batch( ) # update running mean/std for reward_giver if self.reward_giver.normalize: self.reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) # Reshape actions if needed when using discrete actions if isinstance(self.action_space, gym.spaces.Discrete): if len(ac_batch.shape) == 2: ac_batch = ac_batch[:, 0] if len(ac_expert.shape) == 2: ac_expert = ac_expert[:, 0] *newlosses, grad = self.reward_giver.lossandgrad( ob_batch, ac_batch, ob_expert, ac_expert) self.d_adam.update(self.allmean(grad), self.d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values list_lr_pairs = MPI.COMM_WORLD.allgather( lr_local) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs)) true_reward_buffer.extend(true_rets) else: # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"] ) # local values list_lr_pairs = MPI.COMM_WORLD.allgather( lr_local) # list of tuples lens, rews = map(flatten_lists, zip(*list_lr_pairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) if self.using_gail: logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce( seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and self.rank == 0: logger.dump_tabular() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() # define the victim_model if self.use_explanation: if self.pretrained_mimic: exp_test = GradientExp(self.mimic_model) else: exp_test = None else: exp_test = None nupdates = total_timesteps // self.n_batch obs_list = [] act_list = [] for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward, \ obs_oppo, actions_oppo, o_next, o_opp_next, a_opp_next = runner.run() obs_opp_ph = obs_oppo action_oppo_ph = actions_oppo if update % 100 == 0 and self.save_victim_traj: obs_list.append(obs_oppo) act_list.append(actions_oppo) # todo calculate the attention paid on opponent attention = self.calculate_attention(obs_oppo=obs_opp_ph, action_oppo=action_oppo_ph, \ exp_test=exp_test, black_box_att=self.black_box_att, exp_method=self.exp_method) is_stochastic = False ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) slices_hua = (arr[mbinds] for arr in (a_opp_next, o_opp_next, attention)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, *slices_hua, is_stochastic=is_stochastic, ratio=self.mix_ratio, writer=writer, update=timestep)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) slices_hua = (arr[mb_flat_inds] for arr in (a_opp_next, o_opp_next, is_stochastic, attention)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, *slices_hua, update=timestep, writer=writer, states=mb_states)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) # print the attention weights if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break model_file_name = "{0}agent_{1}.pkl".format( self.model_saved_loc, update * self.n_batch) if self.black_box_att: if update % 1000 == 0: print("Model saved at: {}".format(model_file_name)) self.save(model_file_name) else: if update % 1000 == 0: print("Model saved at: {}".format(model_file_name)) self.save(model_file_name) obs_numpy = np.vstack(obs_list) act_numpy = np.vstack(act_list) with open('../saved/trajectory.pkl', 'ab+') as f: pkl.dump([obs_numpy, act_numpy], f, protocol=2) return self
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. callback.update_locals(locals()) if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter mb_infos_vals.append( self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) # substract 1 as we appended a new term just now num_episodes = len(episode_rewards) - 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope("kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter("kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f] self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars] if len(new_uninitialized_vars) != 0: self.sess.run(tf.variables_initializer(new_uninitialized_vars)) self.trained = True t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # pytype:disable=bad-unpacking # true_reward is the reward without discount if isinstance(self.runner, PPO2Runner): # We are using GAE rollout = self.runner.run(callback) obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout else: rollout = self.runner.run(callback) obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout # pytype:enable=bad-unpacking callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step(obs, states, returns, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.dump_tabular() coord.request_stop() coord.join(enqueue_threads) callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO2_SH"): with SetVerbosity(self.verbose), TensorboardWriter( self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam, visualize=self.visualize, snapshot_details=self.snapshot_details) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch for update in range(nupdates + 1): assert self.n_batch % self.nminibatches == 0 n_batch_train = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update / (nupdates + 1)) lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, n_batch_train): timestep = ( (update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // n_batch_train) end = start + n_batch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 envinds = np.arange(self.n_envs) flatinds = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envsperbatch = n_batch_train // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(envinds) for start in range(0, self.n_envs, envsperbatch): timestep = ( (update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envsperbatch) end = start + envsperbatch mb_env_inds = envinds[start:end] mb_flat_inds = flatinds[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) all_env_episode_rewards = calculate_total_episode_reward( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps))) average_episode_reward = safe_mean(all_env_episode_rewards) ep_info = {'r': average_episode_reward, 'l': np.nan} ep_info_buf.append(ep_info) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and ( (update + 1) % log_interval // 100 == 0 or update == 0): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", (update + 1) * self.n_steps) logger.logkv("nupdates", (update + 1)) logger.logkv("total_timesteps", (update + 1) * self.n_batch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): """ Just copied from the stable_baselines.ppo2 implementation. Goal is to change some parts of it later. """ # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch callback.on_training_start(locals(), globals()) for update in range(1, n_updates + 1): minibatch_size = cfg.minibatch_size # self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) callback.on_rollout_start() # try getting rollout 3 times tried_rollouts = 0 while tried_rollouts < 1: try: # true_reward is the reward without discount rollout = self.runner.run(callback) break except BrokenPipeError as bpe: raise BrokenPipeError(f'Catched Broken Pipe Error.') except Exception as ex: # tried_rollouts += 1 # obs, returns, masks, actions, values, neglogpacs, \ # states, ep_infos, true_reward = rollout # log(f'Rollout failed {tried_rollouts} times!', # [f'Catched exception: {ex}', # f'obs.shape: {obs.shape}', # f'ret.shape: {returns.shape}']) traceback.print_exc() # if isinstance(ex, BrokenPipeError): # # copy-pasted from the old blog here: # # http://newbebweb.blogspot.com/2012/02/python-head-ioerror-errno-32-broken.html # from signal import signal, SIGPIPE, SIG_DFL # signal(SIGPIPE, SIG_DFL) # print('Executing fix: Importing signal and disabling BrokenPipeError.') # for _ in range(10000): # print('', end='') # reset count once, rollout was successful tried_rollouts = 0 # Unpack if self.mirror_experiences: obs, returns, masks, actions, values, neglogpacs, \ states, ep_infos, true_reward = mirror_experiences(rollout, self) elif cfg.is_mod(cfg.MOD_EXP_REPLAY): obs, returns, masks, actions, values, neglogpacs, \ states, ep_infos, true_reward = self.exp_replay(rollout) else: obs, returns, masks, actions, values, neglogpacs, \ states, ep_infos, true_reward = rollout self.last_actions = actions if np.random.randint(low=1, high=20) == 7: log(f'Values and Returns of collected experiences: ', [ f'min returns:\t{np.min(returns)}', f'min values:\t\t{np.min(values)}', f'mean returns:\t{np.mean(returns)}', f'mean values:\t{np.mean(values)}', f'max returns:\t{np.max(returns)}', f'max values:\t\t{np.max(values)}' ]) if cfg.is_mod(cfg.MOD_REFS_REPLAY): # load ref experiences and treat them as real experiences obs, actions, returns, masks, values, neglogpacs = \ generate_experiences_from_refs(rollout, self.ref_obs, self.ref_acts) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] self.n_batch = obs.shape[0] self.nminibatches = self.n_batch / minibatch_size if self.n_batch % minibatch_size != 0: log("CAUTION!", [ 'Last minibatch might be too small!', f'Batch Size: \t{self.n_batch}', f'Minibatch Size:\t{minibatch_size}', f'Modulo: \t\t {self.n_batch % minibatch_size}' ]) if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) n_epochs = self.noptepochs for epoch_num in range(n_epochs): np.random.shuffle(inds) for start in range(0, self.n_batch, minibatch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // minibatch_size) end = start + minibatch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = minibatch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step( lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2"): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn() runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_timesteps = 0 # nupdates = total_timesteps // self.n_batch for timestep in range(1, total_timesteps + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - timestep / total_timesteps lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() n_timesteps += len(obs) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): # timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // # batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=n_timesteps)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for stan_timestepsrt in range(0, self.n_envs, envs_per_batch): # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // # envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=n_timesteps, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, n_timesteps) if self.verbose >= 1 and (timestep % log_interval == 0 or timestep == 1): explained_var = explained_variance(values, returns) logger.logkv("total_timesteps", n_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if n_timesteps > total_timesteps: break return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv("episode reward", episode_rewards[-2]) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, initial_p=1.0): self.actions_weights = [] self.actions_container = [] new_tb_log = self._init_num_timesteps(reset_num_timesteps) cnt = 0 ds_rewards = [[0, 0]] with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=initial_p, final_p=self.exploration_final_eps) episode_rewards = [0.0] obs = self.env.reset() reset = True self.episode_reward = np.zeros((1, )) for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True ''' Hierarchical Step (Start) ''' obs, new_obs, rew, action, done, reset = self.hierarchical_step( obs, ds_rewards, cnt, kwargs, update_eps) ''' Hierarchical Step (End) ''' if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += rew if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: try: new_priorities = np.array([ abs(x) for x in td_errors.tolist() ]) + self.prioritized_replay_eps self.replay_buffer.update_priorities( batch_idxes, new_priorities) except AssertionError: print(td_errors) if self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() self.num_timesteps += 1 return self, ds_rewards