def __init__(self,
             env_maker: Callable,
             ac_maker=core.MLPActorCritic,
             ac_kwargs={},
             seed: int = 0,
             epochs: int = 50,
             steps_per_epoch: int = 4000,
             gamma: float = 0.99,
             actor_lr: float = 3e-4,
             critic_lr: float = 1e-3,
             num_iter_train_critic: int = 80,
             lam: float = 0.97,
             max_episode_len: int = 1000,
             logger_kwargs=dict(),
             save_freq: int = 10):
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.epochs = epochs
    self.steps_per_epoch = steps_per_epoch
    self.num_iter_train_critic = num_iter_train_critic
    self.max_episode_len = max_episode_len
    self.save_freq = save_freq

    # Make env
    self.env = env_maker()
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape

    # Make actor-critic
    self.ac = ac_maker(self.env.observation_space, self.env.action_space,
                       **ac_kwargs)

    # Make buffer
    self.local_steps_per_epoch = int(steps_per_epoch / num_procs())
    self.buffer = Buffer(self.obs_dim, self.act_dim,
                         self.local_steps_per_epoch, gamma, lam)

    # Make optimizers
    self.actor_optimizer = Adam(self.ac.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = Adam(self.ac.critic.parameters(), lr=critic_lr)

    # Sync params across processes
    sync_params(self.ac)

    # Count variables
    var_counts = tuple(core.count_vars(module)
                       for module in [self.ac.actor, self.ac.critic])
    self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up model saving
    self.logger.setup_pytorch_saver(self.ac)
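# The Buffer constructed above is not shown in this section. The sketch below
# is an assumption about what such a GAE buffer typically computes when a
# trajectory is finished: GAE-lambda advantages and discounted rewards-to-go.
# The helper names (discount_cumsum, gae_advantages_and_returns) are
# illustrative, not necessarily the actual Buffer API.
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # [x0 + d*x1 + d^2*x2, x1 + d*x2, x2] for input [x0, x1, x2]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


def gae_advantages_and_returns(rewards, values, last_value, gamma=0.99, lam=0.97):
    # Append the bootstrap value so a TD residual exists for the final step.
    rews = np.append(rewards, last_value)
    vals = np.append(values, last_value)

    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]

    # GAE-lambda advantages and rewards-to-go (regression targets for the critic).
    advantages = discount_cumsum(deltas, gamma * lam)
    returns = discount_cumsum(rews, gamma)[:-1]
    return advantages, returns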
def a2c(env_fn, agent: Agent, seed=0, num_cpu=1, device=torch.device("cpu"), epochs=1000, steps_per_epoch=100, gamma=0.99, use_gae=True, tau=0.95, max_grad_norm=0.5, polyak=0.995, learning_rate=1e-3, value_loss_coef=0.5, policy_loss_coef=1, entropy_loss_coef=0.1, grid_layer_weight_reg_loss_coef=1e-4, save_every=100, log_every=10, logger_kwargs=dict(), test_every=100, num_test_episodes=5, deterministic=False, save_freq=1, solved_score=None, render=False, ): use_MPI = num_cpu > 1 if use_MPI: # Special function to avoid certain slowdowns from PyTorch + MPI combo. mpi_pytorch.setup_pytorch_for_mpi() else: torch.set_num_threads(torch.get_num_threads()) # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) config = locals() del config['env_fn'] del config['agent'] del config['logger'] logger.save_config(config) test_logger_kwargs = deepcopy(logger_kwargs) test_logger_kwargs['output_dir'] = pathlib.Path(test_logger_kwargs['output_dir']) / 'evaluation' test_logger = EpochLogger(**test_logger_kwargs) # Random seed if use_MPI: seed += 10000 * mpi_tools.proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() assert env.max_episode_steps > 0 obs_shape = env.observation_space.shape act_dim = env.action_space.n # training model and target model target_agent = deepcopy(agent) if use_MPI: # Sync params across processes mpi_pytorch.sync_params(agent) mpi_pytorch.sync_params(target_agent) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in target_agent.parameters(): p.requires_grad = False # Utilize GPU agent.to(device) target_agent.to(device) # Set up optimizers for policy and q-function optimizer = Adam(agent.parameters(), lr=learning_rate) # Set up model saving logger.setup_pytorch_saver(agent, name='model') def update(episode_buffer): # Update if episode_buffer.dones[-1]: next_value = 0.0 else: last_obs = episode_buffer.next_observations[-1] previous_reward = episode_buffer.rewards[-1] last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0) previous_reward_tensor = torch.tensor([previous_reward], dtype=torch.float32).unsqueeze(0) context = agent.get_context() next_value = target_agent.predict_value(obs_tensor=last_obs_tensor, previous_reward_tensor=previous_reward_tensor, goal_grid_code_tensor=goal_grid_code_tensor, context=context).cpu().item() # Super critical!! optimizer.zero_grad() # Compute value and policy losses loss, info = agent.compute_loss(rewards=np.array(episode_buffer.rewards), dones=np.array(episode_buffer.dones), next_value=next_value, discount_factor=gamma, use_gae=use_gae, tau=tau, value_loss_coef=value_loss_coef, policy_loss_coef=policy_loss_coef, entropy_reg_coef=entropy_loss_coef, grid_layer_wreg_loss_coef=grid_layer_weight_reg_loss_coef) loss.backward() if use_MPI: mpi_pytorch.mpi_avg_grads(agent) # Optimize if max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm) optimizer.step() # Log losses and info logger.store(**info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(agent.parameters(), target_agent.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) if use_MPI: mpi_pytorch.sync_params(target_agent) # Prepare for interaction with environment start_time = time.time() # Main loop: collect experience in env and update/log each epoch total_steps = 0 # Reset env obs = env.reset() reward = 0 goal_grid_code_tensor = None # Reset episode stats episode_return = 0 episode_length = 0 for epoch in range(1, epochs + 1): agent.reset_for_training() epoch_history = EpisodeHistory() for t in range(steps_per_epoch): total_steps += 1 # Get action from the model obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) previous_reward_tensor = torch.tensor([reward], dtype=torch.float32).unsqueeze(0) action = agent.step(obs_tensor, previous_reward_tensor, goal_grid_code_tensor).squeeze(0) # Step the env obs2, reward, done, _ = env.step(action.detach().cpu().item()) if render and mpi_tools.proc_id() == 0: env.render('human', view='top') time.sleep(1e-3) episode_return += reward episode_length += 1 # Store transition to history epoch_history.store(observation=None, action=None, reward=reward, done=done, next_observation=obs2) # Super critical, easy to overlook step: make sure to update # most recent observation! obs = obs2 # End of trajectory handling if done: if reward > 0: goal_grid_code_tensor = agent.current_grid_code.detach() break update(epoch_history) # if done if epoch_history.dones[-1]: logger.store(EpRet=episode_return, EpLen=episode_length) # Reset env obs = env.reset() agent.reset() # Reset episode stats episode_return = 0 episode_length = 0 # End of epoch handling if epoch % log_every == 0: total_interactions = mpi_tools.mpi_sum(total_steps) if use_MPI else total_steps # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('Value', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossEntropy', average_only=True) logger.log_tabular('LossGridL2', average_only=True) logger.log_tabular('LossPIM', average_only=True) logger.log_tabular('TotalEnvInteracts', total_interactions) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Test agent solved = False if epoch % test_every == 0: video_dir = pathlib.Path(logger.output_dir) / 'test_videos' / f'epoch-{epoch:d}' test_env_fn = lambda: Monitor(env_fn(), directory=video_dir) # Test the performance of the deterministic version of the agent. context = agent.get_context() agent.eval() episode_info = evaluate_agent(env_fn=test_env_fn, agent=agent, deterministic=deterministic, num_episodes=num_test_episodes, render=False, logger=test_logger) agent.train() agent.set_context(context) if solved_score is not None: solved = all(r >= solved_score for (t, r) in episode_info) # Save model if (epoch % save_every == 0) or (epoch == epochs) or solved: logger.save_state({'env': env}) # Check environment is solved if solved: plog = lambda msg: logger.log(msg, color='green') plog("=" * 40) plog(f"ENVIRONMENT SOLVED!") plog("=" * 40) plog(f' TotalEnvInteracts {total_steps}') plog(f' Time {time.time() - start_time}') plog(f' Epoch {epoch}') break torch.save(agent, str(logger.output_dir / 'agent.pt')) env.close()
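# The target network above is updated in place by Polyak averaging. A minimal,
# self-contained illustration of that update rule on toy tensors (not the Agent
# class used by a2c):
def _polyak_update_demo(polyak=0.995):
    import torch

    param = torch.tensor([1.0, 2.0, 3.0])         # current network parameter
    target_param = torch.tensor([0.0, 0.0, 0.0])  # target network parameter

    with torch.no_grad():
        # In-place mul_/add_ avoid allocating new tensors for the target params.
        target_param.mul_(polyak)
        target_param.add_((1 - polyak) * param)

    # target_param is now polyak * old_target + (1 - polyak) * param
    return target_param  # tensor([0.0050, 0.0100, 0.0150]) with the defaults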
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80,
        lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    # Special function to avoid certain slowdowns from PyTorch + MPI combination
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync parameters across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up a function for computing the PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, log_p = ac.pi(obs, act)
        ratio = torch.exp(log_p - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra information
        approx_kl = (logp_old - log_p).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clip_fraction = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clip_fraction)

        return loss_pi, pi_info

    # Set up a function for computing the value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value functions
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with the environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
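# For reference, a small self-contained demonstration of the clipped surrogate
# objective and the diagnostics computed in compute_loss_pi above. The numbers
# are made up purely for illustration.
def _clipped_surrogate_demo(clip_ratio=0.2):
    import torch

    logp_old = torch.tensor([-1.0, -0.5, -2.0])  # log-probs under the old policy
    log_p = torch.tensor([-0.8, -0.9, -1.5])     # log-probs under the current policy
    adv = torch.tensor([1.0, -0.5, 2.0])         # advantage estimates

    ratio = torch.exp(log_p - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

    # Diagnostics: sample-based KL estimate and fraction of clipped ratios.
    approx_kl = (logp_old - log_p).mean().item()
    clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
    clip_fraction = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
    return loss_pi.item(), approx_kl, clip_fraction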
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, beta=0.01, clip_ratio=0.2, pi_lr=3e-4, vf_lr=3e-4, train_pi_iters=80, train_v_iters=80, lam=0.95, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, use_rnn=False, reward_factor=1, spectrum_repr=False): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) comm = MPI.COMM_WORLD rank = comm.Get_rank() if rank == 0: print(ac) # udpate env config # env.scalar_thick = ac_kwargs['scalar_thick'] env.update_with_ac(**ac_kwargs) # For Tuple spaces obs_dim = ac.obs_dim if isinstance(env.action_space, spaces.Tuple): act_dim = core.tuple_space_dim(env.action_space, action=True) else: act_dim = env.action_space.shape # Create actor-critic module # print(ac) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, cell_size=ac_kwargs['cell_size']) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old, hid = data['obs'], data['act'], data[ 'adv'], data['logp'], data['hid'] # for i in range(len(obs)-1): # if torch.eq(obs[i], torch.zeros(12)).sum()==12 and torch.eq(obs[i+1], torch.zeros(12)).sum()==12: # print(obs[i], obs[i+1], act[i], act[i+1]) # Policy loss pis = [] logp = 0 if len(ac.pi) > 1: # tuple actions for i, actor_i in enumerate(ac.pi): pi, logp_i = actor_i(obs, act[:, i][:, None]) logp += logp_i pis.append(pi) else: pi, logp_i = ac.pi[0](obs, act) logp += logp_i pis.append(pi) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info # sample estimation policy KL approx_kl = (logp_old - logp).mean().item() ent = sum([pi.entropy().mean().item() for pi in pis]) clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return 0.5 * ((ac.v(obs) - ret)**2).mean() def compute_loss_pi_v_rnn(data): obs, act, adv, logp_old, ret = data['obs'], data['act'], data[ 'adv'], data['logp'], data['ret'] hid = torch.zeros(ac_kwargs['cell_size']) v = [] logp = [] ent = [] num_traj = 0 #todo: test for i in range(len(obs)): v_i, logp_i, hid, ent_i = ac.evaluate(obs[i], act[i], hid) if i < len(obs) - 1 and obs[i + 1].sum() == 0: num_traj += 1 # print('Reinitialize #{}'.format(num_traj), flush=True) hid = torch.zeros(ac_kwargs['cell_size']) v.append(v_i) logp.append(logp_i) ent.append(ent_i) 
logp = torch.cat(logp) v = torch.cat(v) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # print(logp_old - logp) approx_kl = (logp_old - logp).mean().item() ent = torch.stack(ent).mean() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) loss_v = 0.5 * ((v - ret)**2).mean() # import pdb; pdb.set_trace() loss_pi = loss_pi - beta * ent logger.store(RetBuf=ret.clone().detach().numpy()) # import pdb; pdb.set_trace() return loss_pi, pi_info, loss_v # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) if use_rnn: optimizer = Adam(ac.parameters(), lr=pi_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # import pdb; pdb.set_trace() if not use_rnn: pi_l_old, pi_info_old = compute_loss_pi(data) v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() if not use_rnn: loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() else: pi_l_old, pi_info_old, v_l_old = compute_loss_pi_v_rnn(data) pi_l_old = pi_l_old.item() for i in range(train_pi_iters): optimizer.zero_grad() loss_pi, pi_info, loss_v = compute_loss_pi_v_rnn(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss = loss_pi + loss_v loss.backward() mpi_avg_grads(ac) optimizer.step() logger.store(StopIter=i) # import pdb; pdb.set_trace() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() obs, ep_ret, ep_len = env.reset(), 0, 0 # import pdb; pdb.set_trace() # if ac_kwargs['scalar_thick']: # thick= obs[env.num_materials:env.num_materials+env.num_thicknesses].argmax() / env.num_thicknesses # obs = np.concatenate((obs[:env.num_materials+1], np.array([thick]))) # if ac_kwargs['scalar_thick']: # thick= obs[env.num_materials:env.num_materials+env.num_thicknesses].argmax() / env.num_thicknesses # obs = np.concatenate((obs[:env.num_materials+1], np.array([thick]))) hid = np.zeros( ac_kwargs['cell_size']) if ac_kwargs['cell_size'] else np.zeros(1) # import pdb; pdb.set_trace() design_tracker = DesignTracker(epochs, **logger_kwargs) total_env_time = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): epoch_start_time = time.time() for t in range(local_steps_per_epoch): #TODO: only evaluate act, v, logp, hid = ac.step( torch.as_tensor(obs, dtype=torch.float32), torch.as_tensor(hid, dtype=torch.float32)) # nv_start = time.time() next_obs, r, d, _ = env.step(act) # env_end = time.time() # env_time = env_end - env_start # total_env_time += env_time r = r * reward_factor # scale the rewards, possibly match the reward scale of atari ep_ret += r if not d: ep_len += 1 # save and log if use_rnn: buf.store(obs, act, r, v, logp, hid) else: buf.store(obs, act, r, v, logp) logger.store(VVals=v) # Update obs (critical!) obs = next_obs timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: # print(t) # if epoch_ended and not(terminal): # print('Warning: trajectory cut off by epoch at %d steps.' # % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target # if timeout or epoch_ended: if not terminal: _, v, _, _ = ac.step( torch.as_tensor(obs, dtype=torch.float32), torch.as_tensor(hid, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if hasattr(env, 'layers') and hasattr(env, 'thicknesses'): design_tracker.store(env.layers, env.thicknesses, ep_ret, epoch) if rank == 0: print(env.layers, env.thicknesses) obs, ep_ret, ep_len = env.reset(), 0, 0 # reinitilize hidden state hid = np.zeros(ac_kwargs['cell_size']) if hasattr(env, "layers"): logger.store(Act=act[1]) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) design_tracker.save_state() # Perform PPO update! 
update() elapsed = time.time() - start_time epoch_time = time.time() - epoch_start_time # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) if hasattr(env, 'layers'): logger.log_tabular('Act', with_min_and_max=True) logger.log_tabular('RetBuf', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', elapsed) logger.log_tabular('FPS', int(steps_per_epoch / epoch_time)) logger.dump_tabular()
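# compute_loss_pi_v_rnn above re-runs the recurrent actor-critic over the whole
# batch and reinitializes the hidden state whenever the next observation is all
# zeros, which is how trajectory boundaries are marked in the buffer. A minimal
# sketch of that pattern with a plain GRUCell (ac.evaluate itself is not shown
# in this section, so this is an illustration, not the actual module):
def _recurrent_eval_sketch(cell_size=16, obs_dim=4):
    import torch
    import torch.nn as nn

    rnn = nn.GRUCell(obs_dim, cell_size)
    obs = torch.randn(10, obs_dim)
    obs[5] = 0.0  # an all-zero observation marks the start of a new trajectory

    hid = torch.zeros(cell_size)
    outputs = []
    for i in range(len(obs)):
        hid = rnn(obs[i].unsqueeze(0), hid.unsqueeze(0)).squeeze(0)
        outputs.append(hid)
        if i < len(obs) - 1 and obs[i + 1].sum() == 0:
            hid = torch.zeros(cell_size)  # reinitialize between trajectories
    return torch.stack(outputs)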
def vpg(env, hidden_sizes, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
    setup_pytorch_for_mpi()

    # Logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seeds
    seed += 1000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Environment
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create the model
    ac = core.MLPActorCritic(env.observation_space, env.action_space, hidden_sizes)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer. With multiple processes, each process's buffer
    # holds local_steps_per_epoch transitions.
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, size=local_steps_per_epoch, gamma=gamma, lam=lam)

    # Optimizers
    pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    # logger.setup_pytorch_for_mpi()

    # Interaction
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))  # (act_dim,), (), ()

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # update obs
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                # timeout=True, terminal=True, epoch_ended=True/False
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0  # re-initialize

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
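# The update helper called at the end of each epoch above is not included in
# this section. The sketch below is an assumption about what a standard VPG
# update with that call signature might do (one policy-gradient step on
# -logp * advantage, several regression steps on rewards-to-go); it is not
# necessarily the author's actual implementation, and it reuses the same
# mpi_avg_grads helper imported elsewhere in this file.
def update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger):
    data = buf.get()
    obs, act, adv, logp_old, ret = (data['obs'], data['act'], data['adv'],
                                    data['logp'], data['ret'])

    # One policy-gradient step: maximize E[logp(a|s) * advantage].
    pi_optimizer.zero_grad()
    pi, logp = ac.pi(obs, act)
    loss_pi = -(logp * adv).mean()
    pi_l_old = loss_pi.item()
    loss_pi.backward()
    mpi_avg_grads(ac.pi)   # average grads across MPI processes
    pi_optimizer.step()

    # Several regression steps fitting the value function to rewards-to-go.
    v_l_old = ((ac.v(obs) - ret) ** 2).mean().item()
    for _ in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = ((ac.v(obs) - ret) ** 2).mean()
        loss_v.backward()
        mpi_avg_grads(ac.v)  # average grads across MPI processes
        vf_optimizer.step()

    # Diagnostics matching the quantities logged in the main loop.
    with torch.no_grad():
        pi_new, logp_new = ac.pi(obs, act)
        kl = (logp_old - logp_new).mean().item()
        pi_l_new = -(logp_new * adv).mean().item()
    logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                 Entropy=pi_new.entropy().mean().item(),
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(loss_v.item() - v_l_old))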
def ppo(env, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=2048, epochs=250, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=100, train_v_iters=70, lam=0.95, max_ep_len=512, target_kl=0.005, logger_kwargs=dict(), save_freq=5): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
        (Always between 0 and 1, close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env(
        "PandaPegIn",
        has_offscreen_renderer=True,
        # has_renderer=True,
        use_camera_obs=False,
        control_freq=100,
    )

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Load a pre-trained model
    # fname = "data/ppo_peg_in_add_delta_pos_plus_plus/ppo_peg_in_add_delta_pos_plus_plus_s0/pyt_save/model24.pt"
    # pre_model = torch.load(fname)
    # ac.pi = pre_model.pi
    # ac.v = pre_model.v

    # Use TensorboardX
    writer = logger.create_writer()

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        # Only the pi network changes between gradient steps; data['obs'],
        # data['act'], data['adv'] and data['logp'] stay fixed within an update.
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        # loss_pi is the quantity to minimize; pi_info carries the KL estimate,
        # entropy and clip fraction (all computed on the current batch).
        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()  # data is refreshed once per update

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent,
        # minimizing the loss as long as the KL divergence stays within bounds.
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()
            # print(i, ':', loss_v)
            # print('=' * 20)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Move to the initial position
    pre_action = [0, 0, 0]
    for i in range(4):
        o, _, _, _ = env.step(pre_action)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        print("epoch:", epoch)
        for t in range(local_steps_per_epoch):
            # if t == steps_per_epoch / 2:
            #     print("Half finished!")

            # The policy and value networks produce the action, the value
            # estimate and the log-probability of the chosen action.
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r  # return of the current episode
            ep_len += 1  # length of the current episode

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            # episode finished; episode timed out; end of epoch
            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)  # compute GAE and rewards-to-go
                # print("steps:", t)
                # print("done", d)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        update()

        # Write data to TensorboardX
        stats_to_write = logger.get_stats('EpRet')
        writer.add_scalar('AverageEpRet', stats_to_write[0],
                          global_step=(epoch + 1) * 2048)

        # Log info about this epoch
        logger.log_tabular('Epoch', epoch)                                      # epoch index
        logger.log_tabular('EpRet', with_min_and_max=True)                      # max / min / mean episode return
        logger.log_tabular('EpLen', average_only=True)                          # mean episode length
        logger.log_tabular('VVals', with_min_and_max=True)                      # max / min / mean value estimates
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)  # total steps so far
        logger.log_tabular('LossPi', average_only=True)                         # policy loss at the start of the update
        logger.log_tabular('LossV', average_only=True)                          # value loss at the start of the update
        logger.log_tabular('DeltaLossPi', average_only=True)                    # policy loss after the update minus before
        logger.log_tabular('DeltaLossV', average_only=True)                     # value loss after the update minus before
        logger.log_tabular('Entropy', average_only=True)                        # policy entropy
        logger.log_tabular('KL', average_only=True)                             # approximate KL divergence
        logger.log_tabular('ClipFrac', average_only=True)                       # fraction of clipped ratios
        logger.log_tabular('StopIter', average_only=True)                       # number of PPO policy iterations taken
        logger.log_tabular('Time', time.time() - start_time)                    # wall-clock time
        logger.dump_tabular()


# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--env', type=str, default='HalfCheetah-v2')
#     parser.add_argument('--hid', type=int, default=64)
#     parser.add_argument('--l', type=int, default=2)
#     parser.add_argument('--gamma', type=float, default=0.99)
#     parser.add_argument('--seed', '-s', type=int, default=0)
#     parser.add_argument('--cpu', type=int, default=1)
#     parser.add_argument('--steps', type=int, default=4000)
#     parser.add_argument('--epochs', type=int, default=50)
#     parser.add_argument('--exp_name', type=str, default='ppo')
#     args = parser.parse_args()
#
#     mpi_fork(args.cpu)  # run parallel code with mpi
#
#     from spinup.utils.run_utils import setup_logger_kwargs
#     logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
#
#     ppo(lambda: gym.make(args.env), actor_critic=core.MLPActorCritic,
#         ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), gamma=args.gamma,
#         seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
#         logger_kwargs=logger_kwargs)
def trpo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Trust Region Policy Optimization (with support for Natural Policy Gradient) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. 
Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss _, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) loss_pi = -(ratio * adv).mean() return loss_pi # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() def compute_kl(data, old_pi): obs, act = data['obs'], data['act'] pi, _ = ac.pi(obs, act) kl_loss = torch.distributions.kl_divergence(pi, old_pi).mean() return kl_loss @torch.no_grad() def compute_kl_loss_pi(data, old_pi): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) loss_pi = -(ratio * adv).mean() kl_loss = torch.distributions.kl_divergence(pi, old_pi).mean() return loss_pi, kl_loss def hessian_vector_product(data, old_pi, v): kl = compute_kl(data, old_pi) grads = torch.autograd.grad(kl, ac.pi.parameters(), create_graph=True) flat_grad_kl = core.flat_grads(grads) kl_v = (flat_grad_kl * v).sum() grads = torch.autograd.grad(kl_v, ac.pi.parameters()) flat_grad_grad_kl = core.flat_grads(grads) return flat_grad_grad_kl + v * damping_coeff # Set up optimizers for policy and value function vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # compute old pi distribution obs, act = data['obs'], data['act'] with torch.no_grad(): old_pi, _ = ac.pi(obs, act) pi_loss = compute_loss_pi(data) pi_l_old = pi_loss.item() v_l_old = compute_loss_v(data).item() grads = core.flat_grads( torch.autograd.grad(pi_loss, ac.pi.parameters())) # Core calculations for TRPO or NPG 
Hx = lambda v: hessian_vector_product(data, old_pi, v) x = core.conjugate_gradients(Hx, grads, cg_iters) alpha = torch.sqrt(2 * delta / (torch.matmul(x, Hx(x)) + EPS)) old_params = core.get_flat_params_from(ac.pi) def set_and_eval(step): new_params = old_params - alpha * x * step core.set_flat_params_to(ac.pi, new_params) loss_pi, kl_loss = compute_kl_loss_pi(data, old_pi) return kl_loss.item(), loss_pi.item() if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
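# core.conjugate_gradients is called in the TRPO update above but not shown in
# this section. The sketch below is a standard conjugate-gradient solver of the
# kind such a helper typically implements: it approximately solves H x = g
# using only Hessian-vector products Hx(v). It is an illustration, not
# necessarily the exact core implementation.
def conjugate_gradients(Hx, g, iters=10, residual_tol=1e-10):
    import torch

    x = torch.zeros_like(g)
    r = g.clone()   # residual g - H x, with x = 0 initially
    p = g.clone()   # search direction
    r_dot_old = torch.dot(r, r)
    for _ in range(iters):
        z = Hx(p)
        alpha = r_dot_old / (torch.dot(p, z) + 1e-8)
        x += alpha * p
        r -= alpha * z
        r_dot_new = torch.dot(r, r)
        if r_dot_new < residual_tol:
            break
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x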
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, resume=None, reinitialize_optimizer_on_resume=True, render=False, notes='', env_config=None, boost_explore=0, partial_net_load=False, num_inputs_to_add=0, episode_cull_ratio=0, try_rollouts=0, steps_per_try_rollout=0, take_worst_rollout=False, shift_advs_pct=0, **kwargs): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. 
train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. resume (str): Path to directory with simple_save model info you wish to resume from reinitialize_optimizer_on_resume: (bool) Whether to initialize training state in the optimizers, i.e. the individual learning rates for weights in Adam render: (bool) Whether to render the env during training. Useful for checking that resumption of training caused visual performance to carry over notes: (str) Experimental notes on what this run is testing env_config (dict): Environment configuration pass through boost_explore (float): Amount to increase std of actions in order to reinvigorate exploration. partial_net_load (bool): Whether to partially load the network when resuming. https://pytorch.org/tutorials/beginner/saving_loading_models.html#id4 num_inputs_to_add (int): Number of new inputs to add, if resuming and partially loading a new network. episode_cull_ratio (float): Ratio of bad episodes to cull from epoch try_rollouts (int): Number of times to sample actions steps_per_try_rollout (int): Number of steps per attempted rollout take_worst_rollout (bool): Use worst rollout in training shift_advs_pct (float): Action should be better than this pct of actions to be considered advantageous. """ config = deepcopy(locals()) # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) import_custom_envs() # Instantiate environment env = env_fn() if hasattr(env.unwrapped, 'configure_env'): env.unwrapped.configure_env(env_config) obs_dim = env.observation_space.shape act_dim = env.action_space.shape num_agents = getattr(env, 'num_agents', 1) if hasattr(env.unwrapped, 'logger'): print('Logger set by environment') logger_kwargs['logger'] = env.unwrapped.logger logger = EpochLogger(**logger_kwargs) logger.add_key_stat('won') logger.add_key_stat('trip_pct') logger.add_key_stat('HorizonReturn') logger.save_config(config) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, num_inputs_to_add=num_inputs_to_add, **ac_kwargs) # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Resume if resume is not None: ac, pi_optimizer, vf_optimizer = get_model_to_resume( resume, ac, pi_lr, vf_lr, reinitialize_optimizer_on_resume, actor_critic, partial_net_load, num_inputs_to_add) if num_inputs_to_add: add_inputs(ac, ac_kwargs, num_inputs_to_add) if boost_explore: boost_exploration(ac, boost_explore) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = ppo_buffer_factory(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, num_agents, shift_advs_pct, cull_ratio=episode_cull_ratio) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Sync params across processes sync_params(ac) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for 
interaction with environment start_time = time.time() o, r, d = reset(env) effective_horizon = round(1 / (1 - gamma)) effective_horizon_rewards = [] for _ in range(num_agents): effective_horizon_rewards.append(deque(maxlen=effective_horizon)) if hasattr(env, 'agent_index'): agent_index = env.agent_index agent = env.agents[agent_index] is_multi_agent = True else: agent_index = 0 agent = None is_multi_agent = False def get_action_fn(_obz): return ac.step(torch.as_tensor(_obz, dtype=torch.float32)) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): epoch_episode = 0 info = {} epoch_ended = False step_num = 0 ep_len = 0 ep_ret = 0 while not epoch_ended: if try_rollouts != 0: # a, v, logp, next_o, r, d, info # a, v, logp, obs, r, done, info rollout = do_rollouts( get_action_fn, env, o, steps_per_try_rollout, try_rollouts, take_worst_rollout) else: a, v, logp = get_action_fn(o) # NOTE: For multi-agent, steps current agent, # but returns values for next agent (from its previous action)! # TODO: Just return multiple agents observations next_o, r, d, info = env.step(a) if render: env.render() curr_reward = env.curr_reward if is_multi_agent else r # save and log buf.store(o, a, curr_reward, v, logp, agent_index) logger.store(VVals=v) # Update obs (critical!) o = next_o if 'stats' in info and info['stats']: # TODO: Optimize this logger.store(**info['stats']) if is_multi_agent: agent_index = env.agent_index agent = env.agents[agent_index] # TODO: Store vector of these for each agent when changing step API ep_len = agent.episode_steps ep_ret = agent.episode_reward else: ep_len += 1 ep_ret += r calc_effective_horizon_reward( agent_index, effective_horizon_rewards, logger, r) timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = buf.epoch_ended(step_num) if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(agent_index, v) if terminal: buf.record_episode(ep_len=ep_len, ep_ret=ep_ret, step_num=step_num) # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if 'stats' in info and info['stats'] and info['stats']['done_only']: logger.store(**info['stats']['done_only']) o, r, d = reset(env) if not is_multi_agent: ep_len = 0 ep_ret = 0 step_num += 1 buf.prepare_for_update() # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('DateTime', get_date_str()) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('HorizonReturn', with_min_and_max=True) if getattr(env.unwrapped, 'is_deepdrive', False): logger.log_tabular('trip_pct', with_min_and_max=True) logger.log_tabular('collided') logger.log_tabular('harmful_gs') logger.log_tabular('timeup') logger.log_tabular('exited_lane') logger.log_tabular('circles') logger.log_tabular('skipped') logger.log_tabular('backwards') logger.log_tabular('won') if 'stats' in info and info['stats']: for stat, value in info['stats'].items(): logger.log_tabular(stat, with_min_and_max=True) if logger.best_category or (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(dict(env=env), pytorch_save=dict( ac=ac.state_dict(), pi_optimizer=pi_optimizer.state_dict(), vf_optimizer=vf_optimizer.state_dict(), epoch=epoch, stats=logger.epoch_dict, ), itr=None, best_category=logger.best_category) logger.dump_tabular()
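# The experience buffers used throughout (store, finish_path, get) are not
# shown in this file. A minimal sketch of the standard GAE-lambda bookkeeping
# finish_path is assumed to perform on each finished trajectory segment
# (illustrative only; these function names are not the project's buffer code):
import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def finish_path_gae(rewards, values, last_val, gamma=0.99, lam=0.97):
    rews = np.append(rewards, last_val)
    vals = np.append(values, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # one-step TD residuals
    advantages = discount_cumsum(deltas, gamma * lam)   # GAE-lambda advantages
    returns = discount_cumsum(rews, gamma)[:-1]         # rewards-to-go, targets for V
    return advantages, returns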
def ppo(task, actor_critic=model.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, lr=3e-4, v_loss_coeff=0.5, train_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, wrapper_type="continuous_absolute", log_wandb=False): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. 
lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = dm_construction.get_environment(task, wrapper_type=wrapper_type) obs_dim = env.observation_spec().shape if wrapper_type == "continuous_absolute": act_dim = 4 # for continuous absolute action space else: raise NotImplementedError # Create actor-critic module ac = actor_critic(env.observation_spec(), env.action_spec(), **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = count_vars(ac.ac) logger.log(f"\nNumber of parameters: \t {var_counts}") # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) def compute_loss(data): obs, act, adv, logp_old, ret = data['obs'], data['act'], data[ 'adv'], data['logp'], data['ret'] pi, v, logp = ac.ac(obs, act) # value loss (just MSE) loss_v = ((v - ret)**2).mean() # policy loss ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # useful extra info re: policy approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_v, loss_pi, pi_info # Set up optimizers for policy and value function optimizer = Adam(ac.ac.parameters(), lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() v_l_old, pi_l_old, pi_info_old = compute_loss(data) pi_l_old = pi_l_old.item() vl_l_old = v_l_old.item() # Train policy with multiple steps of gradient descent for i in range(train_iters): optimizer.zero_grad() loss_v, loss_pi, pi_info = compute_loss(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( f'Early stopping at step {i} due to reaching max kl.') break loss = loss_pi + loss_v * v_loss_coeff loss.backward() mpi_avg_grads(ac.ac) # average grads across MPI processes optimizer.step() logger.store(StopIter=i) # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() timestep, ep_ret, ep_len = env.reset(difficulty=0), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): encountered_terminal = False for t in range(local_steps_per_epoch): # assumes obs is an rgb array: rescale to [0, 1] o = timestep.observation / 255.0 a, v, logp = ac.step(o) next_timestep = env.step(ac.action_to_dict(a, 
rescale=True)) r = timestep.reward d = next_timestep.last( ) # TODO: check if r, d are assoc w/ correct timestep ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # TODO debugging logger.store(AHor=a[0]) logger.store(AVer=a[1]) logger.store(ASel=a[3]) # Update obs (critical!) timestep = next_timestep timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print( f'Warning: trajectory cut off by epoch at {ep_len} steps.', flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(timestep.observation / 255.0) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished. logger.store(EpRet=ep_ret, EpLen=ep_len) encountered_terminal = True timestep, ep_ret, ep_len = env.reset(difficulty=0), 0, 0 # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) if encountered_terminal: # Note, if local_steps_per_epoch is too small so no terminal state # has been encountered, then ep_ret and ep_len will not # be stored before call to log_tabular, resulting in error. logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) # TODO debugging logger.log_tabular('AHor', with_min_and_max=True) logger.log_tabular('AVer', with_min_and_max=True) logger.log_tabular('ASel', with_min_and_max=True) # Save model if (epoch % save_freq == 0 and epoch > 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) if proc_id() == 0 and log_wandb: # Save the model parameters to wandb every save_freq epoch # instead of waiting till the end state = { 'epoch': epoch, 'ac_state_dict': ac.ac.state_dict(), 'optimizer': optimizer.state_dict(), } # output the model in the wandb.run.dir to avoid problems # syncing the model in the cloud with wandb's files state_fname = os.path.join(wandb.run.dir, "state_dict.pt") torch.save(state, state_fname) if proc_id() == 0 and log_wandb: wandb.log(logger.log_current_row, step=epoch) logger.dump_tabular()
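# A tiny numeric illustration of the clipped surrogate objective computed in
# compute_loss above (the probabilities and advantages are made up, not taken
# from a real rollout):
import torch

logp_old = torch.log(torch.tensor([0.25, 0.10, 0.40]))
logp_new = torch.log(torch.tensor([0.50, 0.05, 0.40]))
adv = torch.tensor([1.0, 1.0, -1.0])
clip_ratio = 0.2

ratio = torch.exp(logp_new - logp_old)                          # ~[2.0, 0.5, 1.0]
clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
# sample 1: ratio 2.0 with positive adv is clipped to 1.2, so there is no extra
#           credit for moving far beyond the old policy
# sample 2: ratio 0.5 with positive adv stays at 0.5; the pessimistic min keeps
#           the penalty for lowering a good action's probability
# sample 3: ratio 1.0 is unaffected by clipping
print(loss_pi)  # approximately -0.2333 = -(1.2 + 0.5 - 1.0) / 3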
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): # Special function to avoid certain slowdowns from PyTorch + MPI combo setup_pytorch_for_mpi() # Setup logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Create random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate Environment # Get dimensions of action and observation space env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync parameters across processes sync_params(ac) # Count variables and add the information to the logger var_counts = tuple( core.count_variables(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = TRPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # TODO: set up a function for computing the TRPO policy loss and the useful extra info it returns # Setup function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizer for the value function (TRPO updates the policy directly, so no policy optimizer is needed) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) # TODO: Create the update function for the TRPO policy def update(): return # Prepare for the interaction with the environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0
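# One possible shape for the pieces flagged as TODO above: a surrogate policy
# loss and an approximate KL term, mirroring the finished trpo code earlier in
# this file. This is a hedged sketch (it takes ac explicitly instead of closing
# over it inside trpo), not the author's missing implementation.
import torch

def compute_loss_pi(data, ac):
    obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']
    pi, logp = ac.pi(obs, act)
    ratio = torch.exp(logp - logp_old)
    loss_pi = -(ratio * adv).mean()          # surrogate objective; no clipping in TRPO
    approx_kl = (logp_old - logp).mean()     # crude estimate of KL(old || new)
    return loss_pi, approx_kl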
def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=None, save_freq=10, train_graph_path='/home/visgean/', train_graph_name='return.svg', model=None): self.actor_critic = actor_critic self.ac_kwargs = ac_kwargs self.seed = seed self.steps_per_epoch = steps_per_epoch self.epochs = epochs self.gamma = gamma self.clip_ratio = clip_ratio self.pi_lr = pi_lr self.vf_lr = vf_lr self.train_pi_iters = train_pi_iters self.train_v_iters = train_v_iters self.lam = lam self.max_ep_len = max_ep_len self.target_kl = target_kl self.logger_kwargs = logger_kwargs if logger_kwargs else {} self.save_freq = save_freq # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration self.logger = EpochLoggerFixed(**self.logger_kwargs) self.logger.save_config(locals()) # Random seed self.seed += 10000 * proc_id() torch.manual_seed(self.seed) np.random.seed(self.seed) # Instantiate environment self.env = env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape # Create actor-critic module if model: self.ac = model else: self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) # Sync params across processes sync_params(self.ac) # Count variables self.var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.v]) self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % self.var_counts) # Set up experience buffer self.local_steps_per_epoch = int(steps_per_epoch / num_procs()) self.buf = PPOBuffer(self.obs_dim, self.act_dim, self.local_steps_per_epoch, gamma, lam) # Set up optimizers for policy and value function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr) self.vf_optimizer = Adam(self.ac.v.parameters(), lr=vf_lr) # Set up model saving self.logger.setup_pytorch_saver(self.ac) # Prepare for interaction with environment self.start_time = time.time() self.obs = self.env.reset() self.ep_ret = 0 self.ep_len = 0 self.test_returns = [] self.train_returns = [] self.max_return = 0 self.test_lengths = [] self.train_graph_path = train_graph_path + f'{proc_id()}_{train_graph_name}'
def ppo(env_fn, actor_critic=MLPActorCritic, ac_kwargs={}, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ torch.manual_seed(10) np.random.seed(10) random.seed(10) # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Instantiate environment env = env_fn() # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).cuda() # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOTrajectory(gamma, lam) training_queue = TrainingQueue(200) # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 num_training = 15 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): training_data = training_queue.get_batch(num_training) num_training = len(training_data['act']) o = torch.tensor(o).float().unsqueeze(0) if num_training > 0: o = torch.cat([o, training_data['obs']]) o = o.cuda() pi = ac.pi._distribution(o) a = pi.sample() if num_training > 0: a[-num_training:] = training_data['act'] logp = ac.pi._log_prob_from_distribution(pi, a) v = ac.v(o) if num_training > 0: run_update(logp[-num_training:], v[-num_training:], training_data['ret'].cuda(), training_data['adv'].cuda(), training_data['logp'].cuda(), pi_optimizer, vf_optimizer, clip_ratio, logger) a = a[:len(a) - num_training].cpu().item() o = o[:len(o) - num_training].cpu().numpy().squeeze() v = v[:len(v) - num_training].cpu().item() logp = logp[:len(logp) - num_training].cpu().item() next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o num_training = 15 timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: v = ac.v( torch.as_tensor(o, dtype=torch.float32).unsqueeze( 0).cuda()).cpu().detach().item() else: v = 0 trajectory = buf.finish_path(v) training_queue.put_batch(trajectory) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) # logger.log_tabular('LossPi', average_only=True) # logger.log_tabular('LossV', average_only=True) # logger.log_tabular('DeltaLossPi', average_only=True) # logger.log_tabular('DeltaLossV', average_only=True) # logger.log_tabular('Entropy', average_only=True) # logger.log_tabular('KL', average_only=True) # logger.log_tabular('ClipFrac', average_only=True) # logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape # obs_dim = env.observation_space.n act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) bayes_kl_loss = 0. if isinstance(ac.v, BayesMLPCritic): bayes_kl_loss = ac.v.compute_kl() total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0] total_loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old), BayesKL=bayes_kl_loss) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 epoch_reward = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) 
o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t==local_steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished epoch_reward.append(ep_ret) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform VPG update! update() if epoch % 10 == 0: # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('BayesKL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() return epoch_reward
r, = torch.autograd.grad(g, w, grad_outputs=y, create_graph=False) x.grad = y + epsilon * r # Test PyTorch layer schema for manually interfering with parameters' gradients observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(8, ), dtype=np.float32) action_space = gym.spaces.Discrete(4) ac_kwargs = dict() ac = core.MLPActorCritic(observation_space, action_space, **ac_kwargs) sync_params(ac) #obs = 3* torch.rand(1,8) #act = 3* torch.rand(1) #torch.save(obs, 'obs.pt') #torch.save(act, 'act.pt') obs = torch.load('obs.pt') act = torch.load('act.pt') pi_optimizer = Adam(ac.pi.parameters(), lr=0.001) pi_optimizer.zero_grad() pi, logp = ac.pi(obs, act) loss_pi = logp.mean() print('Before Backward Propagation')
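# hessian_vector_product is used by the trpo code above but not shown; its
# actual signature there is (data, old_pi, v). For reference, the standard
# double-backward recipe such a helper typically wraps: H @ v for the mean-KL
# Hessian, with damping for better conditioning (a sketch, not the author's helper).
import torch

def kl_hessian_vector_product(mean_kl, params, v, damping=0.1):
    grads = torch.autograd.grad(mean_kl, params, create_graph=True)
    flat_grad = torch.cat([g.reshape(-1) for g in grads])
    grad_v = (flat_grad * v).sum()
    hvp = torch.autograd.grad(grad_v, params, retain_graph=True)
    flat_hvp = torch.cat([h.reshape(-1) for h in hvp])
    return flat_hvp + damping * v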
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
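# Hedged usage sketch for the ppo() defined above. The environment name, the
# hidden_sizes kwarg (assuming the default MLPActorCritic accepts it), and the
# output directory are illustrative, not from the original project.
import gym

ppo(lambda: gym.make('CartPole-v1'),
    ac_kwargs=dict(hidden_sizes=(64, 64)),
    steps_per_epoch=4000, epochs=50, seed=0,
    logger_kwargs=dict(output_dir='./ppo_cartpole', exp_name='ppo_cartpole'))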
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): # Special function to avoid certain slowdowns from PyTorch + MPI combo. # setup_pytorch_for_mpi() # Setup logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random Seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate Environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create Actor-Critic Module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count the number of variables var_counts = tuple(core.count_variables(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for calculating VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy Loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra information approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up a function for calculating Value Function loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret) ** 2).mean() # Set up optimizers for policy and value functions pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) vf_optimizer.step() # Log changes from the update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with the environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main Loop: Collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical !) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.'
% ep_len, flush=True) if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform vpg update update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
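# Hedged evaluation sketch: after training, roll out the policy with the
# actor-critic's act method described in the docstrings above. The environment
# name is illustrative and the loop uses the same old-style 4-tuple gym step
# API as the rest of this file.
import gym
import torch

def evaluate(ac, env_name='CartPole-v1', episodes=10, max_ep_len=1000):
    env = gym.make(env_name)
    returns = []
    for _ in range(episodes):
        o, ep_ret, ep_len, d = env.reset(), 0, 0, False
        while not (d or ep_len == max_ep_len):
            a = ac.act(torch.as_tensor(o, dtype=torch.float32))
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
        returns.append(ep_ret)
    return returns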