def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=None, target_kl=0.01, logger_kwargs=dict(), save_freq=10, TensorBoard=True, save_nn=True, save_every=1000, load_latest=False, load_custom=False, LoadPath=None, RTA_type=None): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. TensorBoard (bool): True plots to TensorBoard, False does not save_nn (bool): True saves neural network data, False does not save_every (int): How often to save neural network load_latest (bool): Load last saved neural network data before training load_custom (bool): Load custom neural network data file before training LoadPath (str): Path for custom neural network data file RTA_type (str): RTA framework, either 'CBF', 'SVL', 'ASIF', or 'SBSF' """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Random seed for each cpu seed += 1 * proc_id() env.seed(seed) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Load model if True if load_latest: models = glob.glob(f"{PATH}/models/PPO/*") LoadPath = max(models, key=os.path.getctime) ac.load_state_dict(torch.load(LoadPath)) elif load_custom: ac.load_state_dict(torch.load(LoadPath)) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Import RTA if RTA_type == 'CBF': from CBF_for_speed_limit import RTA elif RTA_type == 'SVL': from Simple_velocity_limit import RTA elif RTA_type == 'ASIF': from IASIF import RTA elif RTA_type == 'SBSF': from ISimplex import RTA # Call RTA, define action conversion if RTA_type != 'off': env.RTA_reward = RTA_type rta = RTA(env) def RTA_act(obs, act): act = np.clip(act, -env.force_magnitude, env.force_magnitude) x0 = [obs[0], obs[1], 0, obs[2], obs[3], 0] u_des = np.array([[act[0]], [act[1]], [0]]) u = rta.main(x0, u_des) new_act = [u[0, 0], u[1, 0]] if np.sqrt((act[0] - new_act[0])**2 + (act[1] - new_act[1])**2) < 0.0001: env.RTA_on = False else: env.RTA_on = True return new_act # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 total_episodes = 0 RTA_percent = 0 # Create TensorBoard file if True if TensorBoard and proc_id() == 0: if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name = f"{PATH}/runs/Spacecraft-docking-" + current_time elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0': Name = f"{PATH}/runs/Dubins-aircraft-" + current_time writer = SummaryWriter(Name) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): batch_ret = [] # Track episode returns batch_len = [] # Track episode lengths batch_RTA_percent = [] # Track percentage of time RTA is on env.success = 0 # Track episode success rate env.failure = 0 # Track episode failure rate env.crash = 0 # Track episode crash rate env.overtime = 0 # Track episode over max time/control rate episodes = 0 # Track episodes delta_v = [] # Track episode total delta v for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) if RTA_type != 'off': # If RTA is on, get RTA action RTA_a = RTA_act(o, a) if env.RTA_on: RTA_percent += 1 next_o, r, d, _ = env.step(RTA_a) else: # If RTA is off, pass through desired action next_o, r, d, _ = env.step(a) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': over_max_vel, _, _ = env.check_velocity(a[0], a[1]) if over_max_vel: RTA_percent += 1 ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.'
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) batch_ret.append(ep_ret) batch_len.append(ep_len) episodes += 1 if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': delta_v.append(env.control_input / env.mass_deputy) batch_RTA_percent.append(RTA_percent / ep_len * 100) RTA_percent = 0 o, ep_ret, ep_len = env.reset(), 0, 0 total_episodes += episodes # Track success, failure, crash, overtime rates if episodes != 0: success_rate = env.success / episodes failure_rate = env.failure / episodes crash_rate = env.crash / episodes overtime_rate = env.overtime / episodes else: success_rate = 0 failure_rate = 0 crash_rate = 0 overtime_rate = 0 raise RuntimeError( "No completed episodes; logging will break (increase steps_per_epoch)" ) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Average data over all cpus avg_batch_ret = mpi_avg(np.mean(batch_ret)) avg_batch_len = mpi_avg(np.mean(batch_len)) avg_success_rate = mpi_avg(success_rate) avg_failure_rate = mpi_avg(failure_rate) avg_crash_rate = mpi_avg(crash_rate) avg_overtime_rate = mpi_avg(overtime_rate) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': avg_delta_v = mpi_avg(np.mean(delta_v)) avg_RTA_percent = mpi_avg(np.mean(batch_RTA_percent)) if proc_id() == 0: # Only on one cpu # Plot to TensorBoard if True, only on one cpu if TensorBoard: writer.add_scalar('Return', avg_batch_ret, epoch) writer.add_scalar('Episode-Length', avg_batch_len * env.tau, epoch) writer.add_scalar('Success-Rate', avg_success_rate * 100, epoch) writer.add_scalar('Failure-Rate', avg_failure_rate * 100, epoch) writer.add_scalar('Crash-Rate', avg_crash_rate * 100, epoch) writer.add_scalar('Overtime-Rate', avg_overtime_rate * 100, epoch) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': writer.add_scalar('Delta-V', avg_delta_v, epoch) writer.add_scalar('RTA-on-percent', avg_RTA_percent, epoch) # Save neural network if true, can change to desired location if save_nn and epoch % save_every == 0 and epoch != 0: if not os.path.isdir(f"{PATH}/models"): os.mkdir(f"{PATH}/models") if not os.path.isdir(f"{PATH}/models/PPO"): os.mkdir(f"{PATH}/models/PPO") if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + f"-epoch{epoch}.dat" elif env_name ==
'dubins-aircraft-continuous-v0': Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + f"-epoch{epoch}.dat" torch.save(ac.state_dict(), Name2) # Average episodes per hour, episode per epoch ep_hr = mpi_avg(total_episodes) * args.cpu / (time.time() - start_time) * 3600 ep_Ep = mpi_avg(total_episodes) * args.cpu / (epoch + 1) # Plot on one cpu if proc_id() == 0: # Save neural network if save_nn: if not os.path.isdir(f"{PATH}/models"): os.mkdir(f"{PATH}/models") if not os.path.isdir(f"{PATH}/models/PPO"): os.mkdir(f"{PATH}/models/PPO") if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + "-final.dat" elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0': Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + "-final.dat" torch.save(ac.state_dict(), Name2) # Print statistics on episodes print( f"Episodes per hour: {ep_hr:.0f}, Episodes per epoch: {ep_Ep:.0f}, Epochs per hour: {(epoch+1)/(time.time()-start_time)*3600:.0f}" )
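# ---------------------------------------------------------------------------
# Hedged sketch: what PPOBuffer.store / finish_path / get are assumed to do.
# The PPO loop above relies on a PPOBuffer defined elsewhere in the repo; the
# class below is an illustrative, Spinning Up-style reconstruction, not the
# repo's actual implementation (which, for example, may normalize advantages
# with MPI-averaged statistics). The names MiniPPOBuffer and discount_cumsum
# are illustrative only.
import numpy as np
import scipy.signal
import torch


def discount_cumsum(x, discount):
    # [x0 + d*x1 + d^2*x2, x1 + d*x2, x2] for input [x0, x1, x2]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class MiniPPOBuffer:
    """Stores one epoch of trajectories and computes GAE-lambda targets."""

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.97):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, *act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        # last_val bootstraps the return when a trajectory is cut off.
        path = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path], last_val)
        vals = np.append(self.val_buf[path], last_val)
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path] = discount_cumsum(deltas, self.gamma * self.lam)
        self.ret_buf[path] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        # Called once per epoch after the buffer is full.
        assert self.ptr == self.max_size
        self.ptr, self.path_start_idx = 0, 0
        adv = self.adv_buf
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)  # advantage normalization
        data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                    adv=adv, logp=self.logp_buf)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}
# ---------------------------------------------------------------------------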
class td3_agent: def __init__(self, args, env, env_params): self.args = args # path to save the model self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) # build up the target network self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) # load the weights into the target networks self.actor_target_network.load_state_dict( self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) else: device = 'cpu' self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) # her sampler self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward) # create the replay buffer self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions) # create the normalizer self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range) self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range) self.logger.setup_pytorch_saver(self.actor_network) def learn(self): """ train the network """ # start to collect samples for epoch in range(self.args.n_epochs): for _ in range(self.args.n_cycles): mb_obs, mb_ag, mb_g, mb_actions = [], [], [], [] for _ in range(self.args.num_rollouts_per_mpi): # reset the rollouts ep_obs, ep_ag, ep_g, ep_actions = [], [], [], [] # reset the environment observation = self.env.reset() obs = observation['observation'] ag = observation['achieved_goal'] g = observation['desired_goal'] # start to collect samples for t in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) pi = self.actor_network(input_tensor) action = self._select_actions(pi) # feed the actions into the environment observation_new, _, _, info = self.env.step(action) obs_new = observation_new['observation'] ag_new = observation_new['achieved_goal'] # append rollouts ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) ep_g.append(g.copy()) ep_actions.append(action.copy()) # re-assign the observation obs = obs_new ag = ag_new ep_obs.append(obs.copy()) 
ep_ag.append(ag.copy()) mb_obs.append(ep_obs) mb_ag.append(ep_ag) mb_g.append(ep_g) mb_actions.append(ep_actions) # convert them into arrays mb_obs = np.array(mb_obs) mb_ag = np.array(mb_ag) mb_g = np.array(mb_g) mb_actions = np.array(mb_actions) # store the episodes self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions]) self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions]) for _ in range(self.args.n_batches): # train the network self._update_network() # soft update self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network(self.critic_target_network1, self.critic_network1) self._soft_update_target_network(self.critic_target_network2, self.critic_network2) # start to do the evaluation success_rate = self._eval_agent() # save some necessary objects # self.logger.save_state will also save pytorch's model implicitly. # self.logger.save_state({'env':self.env, 'o_norm':self.o_norm, 'g_norm':self.g_norm}, None) state = { 'env': self.env, 'o_norm': self.o_norm.get(), 'g_norm': self.g_norm.get() } self.logger.save_state(state, None) t = ((epoch + 1) * self.args.n_cycles * self.args.num_rollouts_per_mpi * MPI.COMM_WORLD.Get_size() * self.env_params['max_timesteps']) self.logger.log_tabular('Epoch', epoch + 1) self.logger.log_tabular('SuccessRate', success_rate) self.logger.log_tabular('LossPi') self.logger.log_tabular('LossQ') self.logger.log_tabular('TotalEnvInteracts', t) self.logger.dump_tabular() # pre_process the inputs def _preproc_inputs(self, obs, g): obs_norm = self.o_norm.normalize(obs) g_norm = self.g_norm.normalize(g) # concatenate the stuffs inputs = np.concatenate([obs_norm, g_norm]) inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0) if self.args.cuda: inputs = inputs.cuda(self.device) return inputs # this function will choose action for the agent and do the exploration def _select_actions(self, pi): action = pi.cpu().numpy().squeeze() # add the gaussian action += self.args.noise_eps * self.env_params[ 'action_max'] * np.random.randn(*action.shape) action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max']) # random actions... 
random_actions = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \ size=self.env_params['action']) # choose if use the random actions action += np.random.binomial(1, self.args.random_eps, 1)[0] * (random_actions - action) return action # update the normalizer def _update_normalizer(self, episode_batch): mb_obs, mb_ag, mb_g, mb_actions = episode_batch mb_obs_next = mb_obs[:, 1:, :] mb_ag_next = mb_ag[:, 1:, :] # get the number of normalization transitions num_transitions = mb_actions.shape[1] # create the new buffer to store them buffer_temp = { 'obs': mb_obs, 'ag': mb_ag, 'g': mb_g, 'actions': mb_actions, 'obs_next': mb_obs_next, 'ag_next': mb_ag_next, } transitions = self.her_module.sample_her_transitions( buffer_temp, num_transitions) obs, g = transitions['obs'], transitions['g'] # pre process the obs and g transitions['obs'], transitions['g'] = self._preproc_og(obs, g) # update self.o_norm.update(transitions['obs']) self.g_norm.update(transitions['g']) # recompute the stats self.o_norm.recompute_stats() self.g_norm.recompute_stats() def _preproc_og(self, o, g): o = np.clip(o, -self.args.clip_obs, self.args.clip_obs) g = np.clip(g, -self.args.clip_obs, self.args.clip_obs) return o, g # soft update def _soft_update_target_network(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data) # update the network def _update_network(self): # sample the episodes transitions = self.buffer.sample(self.args.batch_size) # pre-process the observation and goal o, o_next, g = transitions['obs'], transitions[ 'obs_next'], transitions['g'] transitions['obs'], transitions['g'] = self._preproc_og(o, g) transitions['obs_next'], transitions['g_next'] = self._preproc_og( o_next, g) # start to do the update obs_norm = self.o_norm.normalize(transitions['obs']) g_norm = self.g_norm.normalize(transitions['g']) inputs_norm = np.concatenate([obs_norm, g_norm], axis=1) obs_next_norm = self.o_norm.normalize(transitions['obs_next']) g_next_norm = self.g_norm.normalize(transitions['g_next']) inputs_next_norm = np.concatenate([obs_next_norm, g_next_norm], axis=1) # transfer them into the tensor inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32) inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32) actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32) r_tensor = torch.tensor(transitions['r'], dtype=torch.float32) if self.args.cuda: inputs_norm_tensor = inputs_norm_tensor.cuda(self.device) inputs_next_norm_tensor = inputs_next_norm_tensor.cuda(self.device) actions_tensor = actions_tensor.cuda(self.device) r_tensor = r_tensor.cuda(self.device) # calculate the target Q value function with torch.no_grad(): # do the normalization # concatenate the stuffs actions_next = self.actor_target_network(inputs_next_norm_tensor) actions_next += self.args.noise_eps * self.env_params[ 'action_max'] * torch.randn(actions_next.shape).cuda( self.device) actions_next = torch.clamp(actions_next, -self.env_params['action_max'], self.env_params['action_max']) q_next_value1 = self.critic_target_network1( inputs_next_norm_tensor, actions_next) q_next_value2 = self.critic_target_network2( inputs_next_norm_tensor, actions_next) target_q_value = r_tensor + self.args.gamma * torch.min( q_next_value1, q_next_value2) # clip the q value clip_return = 1 / (1 - self.args.gamma) target_q_value = 
torch.clamp(target_q_value, -clip_return, 0) target_q_value = target_q_value.detach() # the q loss real_q_value1 = self.critic_network1(inputs_norm_tensor, actions_tensor) critic_loss1 = (target_q_value - real_q_value1).pow(2).mean() real_q_value2 = self.critic_network2(inputs_norm_tensor, actions_tensor) critic_loss2 = (target_q_value - real_q_value2).pow(2).mean() # the actor loss actions_real = self.actor_network(inputs_norm_tensor) actor_loss = -torch.min( self.critic_network1(inputs_norm_tensor, actions_real), self.critic_network2(inputs_norm_tensor, actions_real)).mean() actor_loss += self.args.action_l2 * ( actions_real / self.env_params['action_max']).pow(2).mean() # start to update the network self.actor_optim.zero_grad() actor_loss.backward() sync_grads(self.actor_network) self.actor_optim.step() # update the critic_network self.critic_optim1.zero_grad() critic_loss1.backward() sync_grads(self.critic_network1) self.critic_optim1.step() self.critic_optim2.zero_grad() critic_loss2.backward() sync_grads(self.critic_network2) self.critic_optim2.step() self.logger.store(LossPi=actor_loss.detach().cpu().numpy()) self.logger.store(LossQ=(critic_loss1 + critic_loss2).detach().cpu().numpy()) # do the evaluation def _eval_agent(self): total_success_rate = [] for _ in range(self.args.n_test_rollouts): per_success_rate = [] observation = self.env.reset() obs = observation['observation'] g = observation['desired_goal'] for _ in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) pi = self.actor_network(input_tensor) # convert the actions actions = pi.detach().cpu().numpy().squeeze() observation_new, _, _, info = self.env.step(actions) obs = observation_new['observation'] g = observation_new['desired_goal'] per_success_rate.append(info['is_success']) total_success_rate.append(per_success_rate) total_success_rate = np.array(total_success_rate) local_success_rate = np.mean(total_success_rate[:, -1]) global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM) return global_success_rate / MPI.COMM_WORLD.Get_size()
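# ---------------------------------------------------------------------------
# Hedged sketch: the 'future' goal relabeling assumed inside her_sampler.
# td3_agent above delegates hindsight relabeling to
# her_sampler(replay_strategy, replay_k, env.compute_reward), which is defined
# elsewhere. The function below is an illustrative reconstruction of the usual
# 'future' strategy; its name and signature are not the repo's API.
import numpy as np


def sample_future_her_transitions(episode_batch, batch_size, replay_k,
                                  compute_reward):
    # episode_batch values have shape (n_episodes, T or T+1, dim);
    # 'obs'/'ag' carry one extra timestep, 'g'/'actions' do not.
    n_episodes, T = episode_batch['actions'].shape[:2]
    future_p = 1 - 1.0 / (1 + replay_k)  # fraction of samples that get relabeled

    ep_idx = np.random.randint(0, n_episodes, batch_size)
    t_idx = np.random.randint(0, T, batch_size)
    transitions = {k: episode_batch[k][ep_idx, t_idx].copy()
                   for k in episode_batch.keys()}

    # Pick which samples are relabeled with a goal achieved later in the episode.
    her_mask = np.random.uniform(size=batch_size) < future_p
    future_offset = (np.random.uniform(size=batch_size) * (T - t_idx)).astype(int)
    future_t = t_idx + 1 + future_offset
    transitions['g'][her_mask] = episode_batch['ag'][ep_idx[her_mask],
                                                     future_t[her_mask]]

    # Recompute the (sparse) reward under the substituted goals.
    transitions['r'] = np.expand_dims(
        compute_reward(transitions['ag_next'], transitions['g'], None), 1)
    return transitions
# ---------------------------------------------------------------------------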
class gac_agent: def __init__(self, args, env, test_env, env_params): self.args = args # path to save the model if self.args.mmd: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd), 's' + str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd))), self.exp_name) else: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.test_env = test_env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) self.advice_network1 = critic(env_params) self.advice_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) sync_networks(self.advice_network1) sync_networks(self.advice_network2) # build up the target network # self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) self.advice_target_network1 = critic(env_params) self.advice_target_network2 = critic(env_params) # load the weights into the target networks # self.actor_target_network.load_state_dict(self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) self.advice_target_network1.load_state_dict( self.advice_network1.state_dict()) self.advice_target_network2.load_state_dict( self.advice_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() self.mpi_size = MPI.COMM_WORLD.Get_size() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) # self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) self.advice_network1.cuda(self.device) self.advice_network2.cuda(self.device) self.advice_target_network1.cuda(self.device) self.advice_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) self.advice_optim1 = torch.optim.Adam( self.advice_network1.parameters(), lr=self.args.lr_critic) self.advice_optim2 = torch.optim.Adam( self.advice_network2.parameters(), lr=self.args.lr_critic) # create the replay buffer self.buffer = ReplayBuffer(self.env_params['obs'], self.env_params['action'], self.args.buffer_size) self.logger.setup_pytorch_saver(self.actor_network) self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std def learn(self): """ train the network """ # start to collect samples obs, ep_rew, ep_cost, ep_len, done = self.env.reset(), 0, 0, 0, False for epoch in 
range(self.args.n_epochs): for _ in range(self.args.n_train_rollouts): for t in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs) action = self.actor_network(input_tensor) action = action.detach().cpu().numpy().squeeze() # feed the actions into the environment next_obs, reward, done, info = self.env.step( action * self.env_params['action_max']) ep_rew += reward ep_cost += info['cost'] ep_len += 1 self.buffer.store(obs, action, reward, info['cost'], next_obs, done) obs = next_obs if done or (ep_len == self.env_params['max_timesteps'] ) or (t % self.args.n_batches == 0): self.buffer.obs_mean = MPI.COMM_WORLD.allreduce( self.buffer.obs_mean, op=MPI.SUM) / self.mpi_size self.buffer.obs_std = MPI.COMM_WORLD.allreduce( self.buffer.obs_std, op=MPI.SUM) / self.mpi_size self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std self.buffer.rew_mean = MPI.COMM_WORLD.allreduce( self.buffer.rew_mean, op=MPI.SUM) / self.mpi_size self.buffer.rew_std = MPI.COMM_WORLD.allreduce( self.buffer.rew_std, op=MPI.SUM) / self.mpi_size self.buffer.cost_mean = MPI.COMM_WORLD.allreduce( self.buffer.cost_mean, op=MPI.SUM) / self.mpi_size self.buffer.cost_std = MPI.COMM_WORLD.allreduce( self.buffer.cost_std, op=MPI.SUM) / self.mpi_size for _ in range(self.args.n_batches): # train the network self._update_network() # soft update # self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network( self.critic_target_network1, self.critic_network1, self.args.polyak) self._soft_update_target_network( self.critic_target_network2, self.critic_network2, self.args.polyak) if done or (ep_len == self.env_params['max_timesteps']): self.logger.store(EpReward=ep_rew, EpCost=ep_cost, EpLen=ep_len) obs, ep_rew, ep_cost, ep_len, done = self.env.reset( ), 0, 0, 0, False # start to do the evaluation self._test_policy() # save some necessary objects state = { 'observation_mean': self.buffer.obs_mean, 'observation_std': self.buffer.obs_std } self.logger.save_state(state, None) t = ((epoch + 1) * self.mpi_size * self.env_params['max_timesteps']) * self.args.n_train_rollouts self.logger.log_tabular('Epoch', epoch + 1) self.logger.log_tabular('EpReward', with_min_and_max=True) self.logger.log_tabular('EpCost', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestReward', with_min_and_max=True) self.logger.log_tabular('TestCost', with_min_and_max=True) self.logger.log_tabular('TestLen', average_only=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('MMDEntropy', average_only=True) self.logger.log_tabular('TotalEnvInteracts', t) self.logger.dump_tabular() if MPI.COMM_WORLD.Get_rank() == 0: print("obs_mean=", self.buffer.obs_mean) print("obs_std=", self.buffer.obs_std) print("reward_mean=", self.buffer.rew_mean) print("reward_std=", self.buffer.rew_std) print("cost_mean=", self.buffer.cost_mean) print("cost_std=", self.buffer.cost_std) # pre_process the inputs def _preproc_inputs(self, obs): inputs = ((np.array(obs) - self.obs_mean) / (self.obs_std + 1e-8)).clip(-self.args.clip_range, self.args.clip_range) inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0) if self.args.cuda: inputs = inputs.cuda(self.device) return inputs # soft update def _soft_update_target_network(self, target, source, polyak): for target_param, param in zip(target.parameters(), source.parameters()): 
target_param.data.copy_((1 - polyak) * param.data + polyak * target_param.data) # update the network def _update_network(self): # sample the episodes batches = self.buffer.sample(self.args.batch_size) o = torch.FloatTensor(batches['obs']).to(self.device) o2 = torch.FloatTensor(batches['obs2']).to(self.device) a = torch.FloatTensor(batches['act']).to(self.device) r = torch.FloatTensor(batches['rew']).to(self.device) c = torch.FloatTensor(batches['cost']).to(self.device) d = torch.FloatTensor(batches['done']).to(self.device) # calculate the target Q value function with torch.no_grad(): # do the normalization # concatenate the stuffs a2 = self.actor_network(o2) q_next_value1 = self.critic_target_network1(o2, a2).detach() q_next_value2 = self.critic_target_network2(o2, a2).detach() target_q_value = r + self.args.gamma * (1 - d) * torch.min( q_next_value1, q_next_value2) target_q_value = target_q_value.detach() p_next_value1 = self.advice_target_network1(o2, a2).detach() p_next_value2 = self.advice_target_network2(o2, a2).detach() target_p_value = -c + self.args.gamma * (1 - d) * torch.min( p_next_value1, p_next_value2) target_p_value = target_p_value.detach() # the q loss real_q_value1 = self.critic_network1(o, a) real_q_value2 = self.critic_network2(o, a) critic_loss1 = (target_q_value - real_q_value1).pow(2).mean() critic_loss2 = (target_q_value - real_q_value2).pow(2).mean() # the p loss real_p_value1 = self.advice_network1(o, a) real_p_value2 = self.advice_network2(o, a) advice_loss1 = (target_p_value - real_p_value1).pow(2).mean() advice_loss2 = (target_p_value - real_p_value2).pow(2).mean() # the actor loss o_exp = o.repeat(self.args.expand_batch, 1) a_exp = self.actor_network(o_exp) actor_loss = -torch.min(self.critic_network1(o_exp, a_exp), self.critic_network2(o_exp, a_exp)).mean() actor_loss -= self.args.advice * torch.min( self.advice_network1(o_exp, a_exp), self.advice_network2(o_exp, a_exp)).mean() mmd_entropy = torch.tensor(0.0) if self.args.mmd: # mmd is computationally expensive a_exp_reshape = a_exp.view(self.args.expand_batch, -1, a_exp.shape[-1]).transpose(0, 1) with torch.no_grad(): uniform_actions = (2 * torch.rand_like(a_exp_reshape) - 1) mmd_entropy = mmd(a_exp_reshape, uniform_actions) if self.args.beta_mmd <= 0.0: mmd_entropy.detach_() else: actor_loss += self.args.beta_mmd * mmd_entropy # start to update the network self.actor_optim.zero_grad() actor_loss.backward() sync_grads(self.actor_network) self.actor_optim.step() # update the critic_network self.critic_optim1.zero_grad() critic_loss1.backward() sync_grads(self.critic_network1) self.critic_optim1.step() self.critic_optim2.zero_grad() critic_loss2.backward() sync_grads(self.critic_network2) self.critic_optim2.step() self.logger.store(LossPi=actor_loss.detach().cpu().numpy()) self.logger.store(LossQ=(critic_loss1 + critic_loss2).detach().cpu().numpy()) self.logger.store(MMDEntropy=mmd_entropy.detach().cpu().numpy()) # do the evaluation def _test_policy(self): for _ in range(self.args.n_test_rollouts): obs, ep_rew, ep_cost, ep_len, done = self.test_env.reset( ), 0, 0, 0, False while (not done and ep_len < self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs) action = self.actor_network(input_tensor, std=0.5) action = action.detach().cpu().numpy().squeeze() obs_next, reward, done, info = self.test_env.step(action) obs = obs_next ep_rew += reward ep_cost += info['cost'] ep_len += 1 self.logger.store(TestReward=ep_rew, TestCost=ep_cost, TestLen=ep_len)
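# ---------------------------------------------------------------------------
# Hedged sketch: the mmd(...) helper referenced by gac_agent._update_network.
# That method calls mmd(a_exp_reshape, uniform_actions) on tensors of shape
# (batch, expand_batch, act_dim), but the helper lives elsewhere. A plain
# Gaussian-kernel MMD^2 estimator is sketched below; the repo's kernel choice
# and bandwidth may differ, and the name and signature here are illustrative.
import torch


def mmd(x, y, bandwidth=1.0):
    # x, y: (B, N, D) -- N action samples per state from each distribution.
    def gaussian_kernel(a, b):
        d2 = torch.cdist(a, b, p=2).pow(2)  # pairwise squared distances, (B, N, N)
        return torch.exp(-d2 / (2.0 * bandwidth ** 2))

    k_xx = gaussian_kernel(x, x).mean(dim=(1, 2))
    k_yy = gaussian_kernel(y, y).mean(dim=(1, 2))
    k_xy = gaussian_kernel(x, y).mean(dim=(1, 2))
    # Biased MMD^2 estimate per state, averaged over the batch.
    return (k_xx + k_yy - 2.0 * k_xy).mean()

# Usage in the update above (unchanged): mmd_entropy = mmd(a_exp_reshape, uniform_actions)
# ---------------------------------------------------------------------------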
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000, update_after=1000, update_every=1, num_test_episodes=10, max_ep_len=150, logger_kwargs=dict(), save_freq=1): logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) torch.set_num_threads(torch.get_num_threads()) actor_critic = core.MLPActorCritic ac_kwargs = dict(hidden_sizes=[args.hid] * args.l) gamma = args.gamma seed = args.seed epochs = args.epochs logger_tensor = Logger(logdir=args.logdir, run_name="{}-{}".format(args.model_name, time.ctime())) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env = ML1.get_train_tasks('reach-v1') # Create an environment with task `pick_place` tasks = env.sample_tasks(1) # Sample a task (in this case, a goal variation) env.set_task(tasks[0]) # Set task test_env = ML1.get_train_tasks('reach-v1') # Create an environment with task `pick_place` tasks = env.sample_tasks(1) # Sample a task (in this case, a goal variation) test_env.set_task(tasks[0]) # Set task obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) 
var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup) ** 2).mean() loss_q2 = ((q2 - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4) q_optimizer = Adam(q_params, lr=3e-4) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, logger_tensor, t): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) logger_tensor.log_value(t, loss_q.item(), "loss q") # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) logger_tensor.log_value(t, loss_pi.item(), "loss pi") # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) logger_tensor.log_value(t, ep_ret, "test ep reward") logger_tensor.log_value(t, ep_len, "test ep length") # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. 
Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger_tensor.log_value(t, ep_ret, "reward") logging.info("> total_steps={} | reward={}".format(t, ep_ret)) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, logger_tensor = logger_tensor, t = t) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger_tensor.log_value(t, epoch, "epoch") logger.dump_tabular(logger_tensor=logger_tensor,epoch = epoch) ac.save(args.save_model_dir, args.model_name)
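# ---------------------------------------------------------------------------
# Hedged sketch: the ReplayBuffer interface assumed by sac() above.
# sac() constructs ReplayBuffer(obs_dim=..., act_dim=..., size=...) and uses
# .store(o, a, r, o2, d) and .sample_batch(batch_size); the class itself is
# defined elsewhere. Below is a minimal Spinning Up-style ring buffer for
# reference (the class name is illustrative, not the repo's).
import numpy as np
import torch


class ReplayBufferSketch:
    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Ring buffer: overwrite the oldest entries once full.
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=128):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs], obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs], rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}
# ---------------------------------------------------------------------------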
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # GAedit # Special function to avoid certain slowdowns from PyTorch + MPI combo. # setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # GAedit # Seed seed = 333 torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() #GAedit # obs_dim = env.observation_space.shape # act_dim = env.action_space.shape # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) # size of each action act_dim = brain.vector_action_space_size # examine the state space obs_dim = env_info.vector_observations.shape[1] #GAedit # Create actor-critic module # ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac = actor_critic(obs_dim, act_dim, **ac_kwargs) # GAedit - don't think we need to sync # Sync params across processes # sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer # GAedit # local_steps_per_epoch = int(steps_per_epoch / num_procs()) local_steps_per_epoch = int(steps_per_epoch / num_agents) #GAedit buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch * num_agents, gamma, lam) # buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) #GAedit # kl = mpi_avg(pi_info['kl']) kl = pi_info['kl'] if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() #GAedit # mpi_avg_grads(ac.pi) # average grads across MPI processes # ac.pi.mean() pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() #GAedit # mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() #GAedit # o, ep_ret, ep_len = env.reset(), 0, 0 ep_ret, ep_len = 0, 0 env_info = env.reset(train_mode=True)[brain_name] o = env_info.vector_observations # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) # GAedit # next_o, r, d, _ = env.step(a) env_info = env.step(a)[brain_name] next_o, r, d = env_info.vector_observations, env_info.rewards, env_info.local_done #GAedit # ep_ret += r ep_ret += np.mean(r) ep_len += 1 # save and log #GAedit # buf.store(o, a, r, v, logp) for i in range(20): buf.store(o[i], a[i], r[i], v[i], logp[i]) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len # GAedit # terminal = d or timeout terminal = any(d) or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) # GAedit # o, ep_ret, ep_len = env.reset(), 0, 0 ep_ret, ep_len = 0, 0 env_info = env.reset(train_mode=True)[brain_name] o = env_info.vector_observations # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
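# ---------------------------------------------------------------------------
# Hedged sketch: a thin Gym-like wrapper around the Unity brain used above.
# The GAedit PPO inlines env.reset(train_mode=True)[brain_name] and
# env.step(a)[brain_name] throughout, and hard-codes 20 parallel agents in the
# buf.store loop. The adapter below (illustrative, not part of this repo) only
# uses attributes the code above already touches and exposes the usual
# reset()/step() tuple plus num_agents, so the literal 20 can be derived.
import numpy as np


class UnityGymAdapter:
    def __init__(self, unity_env, train_mode=True):
        self.env = unity_env
        self.brain_name = unity_env.brain_names[0]
        self.train_mode = train_mode
        info = unity_env.reset(train_mode=train_mode)[self.brain_name]
        self.num_agents = len(info.agents)

    def reset(self):
        info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        return np.asarray(info.vector_observations)   # (num_agents, obs_dim)

    def step(self, actions):
        info = self.env.step(actions)[self.brain_name]
        obs = np.asarray(info.vector_observations)    # (num_agents, obs_dim)
        rews = np.asarray(info.rewards)               # (num_agents,)
        dones = np.asarray(info.local_done)           # (num_agents,)
        return obs, rews, dones, info

# With such a wrapper, the per-agent store loop could iterate over
# range(wrapper.num_agents) instead of the hard-coded range(20).
# ---------------------------------------------------------------------------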
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80,
        lam=0.97, max_ep_len=2000, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), with early stopping based on
    approximate KL.

    Variant of the ``ppo`` routine above, augmented with an episodic-curiosity
    reward bonus computed from a reachability network (``R_Network``) and an
    ``EpisodicMemory`` of state embeddings. The arguments match the first
    implementation (without its TensorBoard / model-loading / RTA options, and
    with ``max_ep_len`` defaulting to 2000); see that docstring for the full
    argument descriptions.
    """
    global RENDER, BONUS

    # Reachability Trainer
    r_network = R_Network().to(device)
    trainer = R_Network_Trainer(r_network=r_network, exp_name="random1")
    episodic_memory = EpisodicMemory(embedding_shape=[EMBEDDING_DIM])

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment (observations are hard-coded as 3x64x64 images
    # in [0, 1] with 3 discrete actions, overriding the env's own spaces)
    env = env_fn()
    observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(3, 64, 64))
    action_space = gym.spaces.Discrete(3)
    obs_dim = observation_space.shape
    act_dim = action_space.shape

    # Create actor-critic module
    ac = actor_critic(observation_space, action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            # Entropy bonus (note: pi_info['ent'] is a detached Python float,
            # so this shifts the reported loss but contributes no gradient)
            loss_pi += pi_info['ent'] * 0.0021
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.'
                           % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)    # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, _ = env.reset()
    env.render()
    o = o.astype(np.float32) / 255.
    o = o.transpose(2, 0, 1)          # HWC -> CHW for the convolutional actor-critic
    ep_ret, ep_len = 0, 0
    indices = []                      # (x, y) map positions of stored memory entries

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32)
            a, v, logp = ac.step(state)

            next_o, r, d, info = env.step(a)
            next_o = next_o.astype(np.float32) / 255.
            # Episodes are truncated at max_ep_len regardless of the env's done flag
            d = ep_len == max_ep_len

            trainer.store_new_state([next_o], [r], [d], [None])

            # Episodic-curiosity bonus: embed the current observation, compare it
            # with the episodic memory, and reward novelty
            r_network.eval()
            with torch.no_grad():
                state_embedding = r_network.embed_observation(
                    torch.FloatTensor([o]).to(device)).cpu().numpy()[0]
                aggregated, _, _ = similarity_to_memory(
                    state_embedding, episodic_memory, r_network)
                curiosity_bonus = 0.03 * (0.5 - aggregated)
                if BONUS:
                    print(f'{curiosity_bonus:.3f}')
                if curiosity_bonus > 0 or len(episodic_memory) == 0:
                    idx = episodic_memory.store_new_state(state_embedding)
                    x = int(env.map_scale * info['pose']['x'])
                    y = int(env.map_scale * info['pose']['y'])
                    if idx == len(indices):
                        indices.append((x, y))
                    else:
                        indices[idx] = (x, y)
            r_network.train()

            next_o = next_o.transpose(2, 0, 1)
            ep_ret += r + curiosity_bonus
            ep_len += 1

            # save and log (the buffer stores the extrinsic reward only;
            # ep_ret tracks the shaped return including the curiosity bonus)
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Keyboard toggles: 's' switches map rendering, 'b' switches bonus printing
            k = cv2.waitKey(1)
            if k == ord('s'):
                RENDER = 1 - RENDER
            elif k == ord('b'):
                BONUS = 1 - BONUS
            if RENDER:
                env.info['map'] = cv2.flip(env.info['map'], 0)
                for index in indices:
                    cv2.circle(env.info['map'], index, 3, (0, 0, 255), -1)
                env.info['map'] = cv2.flip(env.info['map'], 0)
                env.render()

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32)
                    _, v, _ = ac.step(state)
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    print(ep_ret, ep_len, len(episodic_memory))
                ep_ret, ep_len = 0, 0
                o, _ = env.reset()
                o = o.astype(np.float32) / 255.
                o = o.transpose(2, 0, 1)
                episodic_memory.reset()
                indices = []

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # Skip the PPO update for the first five epochs (warm-up, presumably to
        # let the reachability network collect training data first); the buffer
        # is still drained via buf.get() so it starts empty next epoch.
        if epoch > 4:
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
        else:
            buf.get()
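# ---------------------------------------------------------------------------
# Minimal standalone sketch (not used by the code above) of the episodic-
# curiosity reward shaping applied in the second ppo() variant. The loop above
# computes 0.03 * (0.5 - aggregated), where `aggregated` is the similarity of
# the current state embedding to the episodic memory: novel states (low
# similarity) receive a positive bonus, while states close to already-visited
# ones are mildly penalised. The helper name and defaults are illustrative only.
def curiosity_bonus_sketch(aggregated_similarity, scale=0.03, threshold=0.5):
    """Return the shaped reward bonus for a given memory-similarity score."""
    return scale * (threshold - aggregated_similarity)
# Example: curiosity_bonus_sketch(0.1) ->  0.012 (novel state, positive bonus)
#          curiosity_bonus_sketch(0.9) -> -0.012 (familiar state, small penalty)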