def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module if inspect.isclass(actor_critic): ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) else: ac = actor_critic # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # current state logger.save_state({'env': env}, epoch) # for rendering # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() logger.output_file.close()
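

# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not called anywhere in this
# file): the PPO clipped surrogate objective from compute_loss_pi, evaluated
# on small made-up tensors. Only `torch` is assumed to be installed; the
# function name and all numbers below are hypothetical.
# ---------------------------------------------------------------------------
def _ppo_clip_loss_demo():
    """Minimal sketch of the clipped surrogate loss on dummy data."""
    import torch

    clip_ratio = 0.2
    logp_old = torch.tensor([-1.0, -0.5, -2.0])   # log-probs under the data-collecting policy
    logp = torch.tensor([-0.8, -0.9, -1.5])       # log-probs under the current policy
    adv = torch.tensor([1.0, -0.5, 2.0])          # advantage estimates (e.g. GAE-lambda)

    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    # Clamping removes any incentive to push the ratio outside
    # [1 - clip_ratio, 1 + clip_ratio] in the direction that improves the objective.
    loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
    return loss_pi
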
class AWAC: def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-4, alpha=0.0, batch_size=1024, start_steps=10000, update_after=0, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='SAC'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ.load_state_dict(self.ac.state_dict()) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.p_lr = p_lr self.lr = lr self.alpha = 0 # # Algorithm specific hyperparams # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr, weight_decay=1e-4) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self, env_name): data_envs = { 'HalfCheetah-v2': ( "awac_data/hc_action_noise_15.npy", "awac_data/hc_off_policy_15_demos_100.npy"), 'Ant-v2': ( "awac_data/ant_action_noise_15.npy", "awac_data/ant_off_policy_15_demos_100.npy"), 'Walker2d-v2': ( "awac_data/walker_action_noise_15.npy", "awac_data/walker_off_policy_15_demos_100.npy"), } if env_name in data_envs: print('Loading saved data') for file in data_envs[env_name]: if not os.path.exists(file): warnings.warn(colored('Offline data not found. Follow awac_data/instructions.txt to download. 
Running without offline data.', 'red')) break data = np.load(file, allow_pickle=True) for demo in data: for transition in list(zip(demo['observations'], demo['actions'], demo['rewards'], demo['next_observations'], demo['terminals'])): self.replay_buffer.store(*transition) else: dataset = d4rl.qlearning_dataset(self.env) N = dataset['rewards'].shape[0] for i in range(N): self.replay_buffer.store(dataset['observations'][i], dataset['actions'][i], dataset['rewards'][i], dataset['next_observations'][i], float(dataset['terminals'][i])) print("Loaded dataset") # Set up function for computing SAC Q-losses def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = self.ac.q1(o, a) q2 = self.ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = self.ac.pi(o2) # Target Q-values q1_pi_targ = self.ac_targ.q1(o2, a2) q2_pi_targ = self.ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup) ** 2).mean() loss_q2 = ((q2 - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(self, data): o = data['obs'] pi, logp_pi = self.ac.pi(o) q1_pi = self.ac.q1(o, pi) q2_pi = self.ac.q2(o, pi) v_pi = torch.min(q1_pi, q2_pi) beta = 2 q1_old_actions = self.ac.q1(o, data['act']) q2_old_actions = self.ac.q2(o, data['act']) q_old_actions = torch.min(q1_old_actions, q2_old_actions) adv_pi = q_old_actions - v_pi weights = F.softmax(adv_pi / beta, dim=0) policy_logpp = self.ac.pi.get_logprob(o, data['act']) loss_pi = (-policy_logpp * len(weights) * weights.detach()).mean() # Useful info for logging pi_info = dict(LogPi=policy_logpp.detach().numpy()) return loss_pi, pi_info def update(self, data, update_timestep): # First run one gradient descent step for Q1 and Q2 self.q_optimizer.zero_grad() loss_q, q_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Record things self.logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in self.q_params: p.requires_grad = False # Next run one gradient descent step for pi. self.pi_optimizer.zero_grad() loss_pi, pi_info = self.compute_loss_pi(data) loss_pi.backward() self.pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in self.q_params: p.requires_grad = True # Record things self.logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, o, deterministic=False): return self.ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0 while not (d or (ep_len == self.max_ep_len)): # Take deterministic actions at test time o, r, d, _ = self.test_env.step(self.get_action(o, True)) ep_ret += r ep_len += 1 self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Get unnormalized score # self.logger.store(TestEpRet=100*self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len) # Get normalized score def run(self): # Prepare for interaction with environment total_steps = self.epochs * self.steps_per_epoch start_time = time.time() obs, ep_ret, ep_len = self.env.reset(), 0, 0 done = True num_train_episodes = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Reset stuff if necessary if done and t > 0: self.logger.store(ExplEpRet=ep_ret, ExplEpLen=ep_len) obs, ep_ret, ep_len = self.env.reset(), 0, 0 num_train_episodes += 1 # Collect experience act = self.get_action(obs, deterministic=False) next_obs, rew, done, info = self.env.step(act) self.replay_buffer.store(obs, act, rew, next_obs, done) obs = next_obs # Update handling if t > self.update_after and t % self.update_every == 0: for _ in range(self.update_every): batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch, update_timestep=t) # End of epoch handling if (t + 1) % self.steps_per_epoch == 0: epoch = (t + 1) // self.steps_per_epoch # Save model if (epoch % self.save_freq == 0) or (epoch == self.epochs): self.logger.save_state({'env': self.env}, None) # Test the performance of the deterministic version of the agent. self.test_agent() # Log info about epoch self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('TestEpRet', with_min_and_max=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalUpdates', t) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('Time', time.time() - start_time) self.logger.dump_tabular()
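

# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not used by the AWAC class
# above): the advantage-weighted policy loss from AWAC.compute_loss_pi on
# made-up tensors. Only `torch` is assumed; the function name, beta value,
# and numbers are hypothetical stand-ins for the quantities computed above.
# ---------------------------------------------------------------------------
def _awac_policy_weight_demo():
    """Minimal sketch of advantage-weighted regression weights on dummy data."""
    import torch
    import torch.nn.functional as F

    beta = 2.0
    q_data = torch.tensor([1.0, 3.0, 0.5])        # min of twin Qs at dataset actions
    v_pi = torch.tensor([1.5, 2.0, 1.0])          # min of twin Qs at policy actions (value proxy)
    logp_data = torch.tensor([-1.2, -0.7, -2.1])  # log pi(a_data | s) under the current policy

    adv = q_data - v_pi
    weights = F.softmax(adv / beta, dim=0)
    # Weighted behavior cloning: dataset actions with higher advantage get
    # larger weights; the weights are detached so gradients flow only
    # through the log-probabilities.
    loss_pi = (-logp_data * len(weights) * weights.detach()).mean()
    return loss_pi
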
class CQL: def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=1000, epochs=10000, replay_size=int(2e6), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=1e-4, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1,policy_eval_start=0, algo='CQL',min_q_weight=5, automatic_alpha_tuning=False): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) self.ac_targ = deepcopy(self.ac) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts) self.algo = algo self.lagrange_threshold = 10 self.penalty_lr = lr self.tune_lambda = True if 'lagrange' in self.algo else False if self.tune_lambda: print("Tuning Lambda") self.target_action_gap = self.lagrange_threshold self.log_lamda = torch.zeros(1, requires_grad=True, device=device) self.lamda_optimizer = torch.optim.Adam([self.log_lamda],lr=self.penalty_lr) self.lamda = self.log_lamda.exp() self.min_q_weight = 1.0 else: # self.lamda = min_q_weight self.min_q_weight = min_q_weight self.automatic_alpha_tuning = automatic_alpha_tuning if self.automatic_alpha_tuning is True: self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=device) self.alpha_optim = Adam([self.log_alpha], lr=p_lr) self.alpha = self.log_alpha.exp() else: self.alpha = alpha # self.alpha = alpha # CWR does not require entropy in Q evaluation self.target_update_freq = 1 self.p_lr = p_lr self.lr=lr # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs= epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak self.softmax = torch.nn.Softmax(dim=1) self.softplus = torch.nn.Softplus(beta=1, threshold=20) self.policy_eval_start=policy_eval_start self._current_epoch=0 # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self): dataset = d4rl.qlearning_dataset(self.env) self.replay_buffer.obs_buf[:dataset['observations'].shape[0],:] = dataset['observations'] self.replay_buffer.act_buf[:dataset['actions'].shape[0],:] = dataset['actions'] 
self.replay_buffer.obs2_buf[:dataset['next_observations'].shape[0],:] = dataset['next_observations'] self.replay_buffer.rew_buf[:dataset['rewards'].shape[0]] = dataset['rewards'] self.replay_buffer.done_buf[:dataset['terminals'].shape[0]] = dataset['terminals'] self.replay_buffer.size = dataset['observations'].shape[0] self.replay_buffer.ptr = (self.replay_buffer.size+1)%(self.replay_buffer.max_size) # Set up function for computing SAC Q-losses def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = self.ac.q1(o,a) q2 = self.ac.q2(o,a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = self.ac.pi(o2) # Target Q-values q1_pi_targ = self.ac_targ.q1(o2, a2) q2_pi_targ = self.ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 self.logger.store(CQLalpha=self.lamda) if 'rho' in self.algo: samples = 10 # Sample from previous policy (10 samples) o_rep = o.repeat_interleave(repeats=samples,dim=0) sample_actions, _ = self.ac.pi(o_rep) cql_loss_q1 = self.ac.q1(o_rep,sample_actions).reshape(-1,1) cql_loss_q2 = self.ac.q2(o_rep,sample_actions).reshape(-1,1) cql_loss_q1 = cql_loss_q1-np.log(samples) cql_loss_q2 = cql_loss_q2-np.log(samples) cql_loss_q1 = torch.logsumexp(cql_loss_q1,dim=1).mean()*self.min_q_weight cql_loss_q2 = torch.logsumexp(cql_loss_q2,dim=1).mean()*self.min_q_weight # Sample from dataset cql_loss_q1 -= self.ac.q1(o, a).mean()*self.min_q_weight cql_loss_q2 -= self.ac.q2(o, a).mean()*self.min_q_weight else: samples = 10 q1_pi_samples = None q2_pi_samples = None # Add samples from previous policy o_rep = o.repeat_interleave(repeats=samples,dim=0) o2_rep = o2.repeat_interleave(repeats=samples,dim=0) # o_rep = o.repeat_interleave(samples,1) # Samples from current policy sample_action, logpi = self.ac.pi(o_rep) q1_pi_samples = self.ac.q1(o_rep,sample_action).view(-1,1) - logpi.view(-1,1).detach() q2_pi_samples = self.ac.q2(o_rep,sample_action).view(-1,1) - logpi.view(-1,1).detach() q1_pi_samples = q1_pi_samples.view((o.shape[0],-1)) q2_pi_samples = q2_pi_samples.view((o.shape[0],-1)) sample_next_action, logpi_n = self.ac.pi(o2_rep) q1_next_pi_samples = self.ac.q1(o2_rep,sample_next_action).view(-1,1) - logpi_n.view(-1,1).detach() q2_next_pi_samples = self.ac.q2(o2_rep,sample_next_action).view(-1,1) - logpi_n.view(-1,1).detach() q1_next_pi_samples = q1_next_pi_samples.view((o2.shape[0],-1)) q2_next_pi_samples = q2_next_pi_samples.view((o2.shape[0],-1)) # Add samples from uniform sampling sample_action = np.random.uniform(low=self.env.action_space.low,high=self.env.action_space.high,size=(q1_pi_samples.shape[0]*10,self.env.action_space.high.shape[0])) sample_action = torch.FloatTensor(sample_action).to(device) log_pi = torch.FloatTensor([np.log(1/np.prod(self.env.action_space.high-self.env.action_space.low))]).to(device) q1_rand_samples = self.ac.q1(o_rep,sample_action).view(-1,1) - log_pi.view(-1,1).detach() q2_rand_samples = self.ac.q2(o_rep,sample_action).view(-1,1) - log_pi.view(-1,1).detach() q1_rand_samples = q1_rand_samples.view((o.shape[0],-1)) q2_rand_samples = q2_rand_samples.view((o.shape[0],-1)) cql_loss_q1 = torch.logsumexp(torch.cat([q1_pi_samples,q1_next_pi_samples,q1_rand_samples],dim=1),dim=1).mean()*self.min_q_weight cql_loss_q2 = 
torch.logsumexp(torch.cat((q2_pi_samples,q2_next_pi_samples,q2_rand_samples),dim=1),dim=1).mean()*self.min_q_weight # Sample from dataset cql_loss_q1 -= self.ac.q1(o, a).mean()*self.min_q_weight cql_loss_q2 -= self.ac.q2(o, a).mean()*self.min_q_weight # Update the cql-alpha if 'lagrange' in self.algo: cql_alpha = torch.clamp(self.log_lamda.exp(), min=0.0, max=1000000.0) self.lamda = cql_alpha.item() cql_loss_q1 = cql_alpha*(cql_loss_q1-self.target_action_gap) cql_loss_q2 = cql_alpha*(cql_loss_q2-self.target_action_gap) self.lamda_optimizer.zero_grad() lamda_loss = (-cql_loss_q1-cql_loss_q2)*0.5 lamda_loss.backward(retain_graph=True) self.lamda_optimizer.step() # print(self.log_lamda.exp()) avg_q = 0.5*(cql_loss_q1.mean() + cql_loss_q2.mean()).detach().cpu() loss_q += (cql_loss_q1.mean() + cql_loss_q2.mean()) # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy(), AvgQ = avg_q) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(self,data): o = data['obs'] a = data['act'] pi, logp_pi = self.ac.pi(o) q1_pi = self.ac.q1(o, pi) q2_pi = self.ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) loss_pi = (self.alpha * logp_pi - q_pi).mean() # TODO: Verify if this is needed if self._current_epoch<self.policy_eval_start: policy_log_prob = self.ac.pi.get_logprob(o, a) loss_pi = (self.alpha * logp_pi - policy_log_prob).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info, logp_pi def update(self,data, update_timestep): self._current_epoch+=1 # First run one gradient descent step for Q1 and Q2 self.q_optimizer.zero_grad() loss_q, q_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Record things self.logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in self.q_params: p.requires_grad = False # Next run one gradient descent step for pi. self.pi_optimizer.zero_grad() loss_pi, pi_info, log_pi = self.compute_loss_pi(data) loss_pi.backward() self.pi_optimizer.step() if self.automatic_alpha_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in self.q_params: p.requires_grad = True # Record things self.logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. if update_timestep%self.target_update_freq==0: with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, o, deterministic=False): return self.ac.act(torch.as_tensor(o, dtype=torch.float32).to(device), deterministic) def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0 while not(d or (ep_len == self.max_ep_len)): # Take deterministic actions at test time o, r, d, _ = self.test_env.step(self.get_action(o, True)) ep_ret += r ep_len += 1 self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # self.logger.store(TestEpRet=100*self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len) def run(self): # Prepare for interaction with environment total_steps = self.epochs * self.steps_per_epoch start_time = time.time() o, ep_ret, ep_len = self.env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # # Update handling batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch, update_timestep = t) # End of epoch handling if (t+1) % self.steps_per_epoch == 0: epoch = (t+1) // self.steps_per_epoch # # Save model # if (epoch % self.save_freq == 0) or (epoch == self.epochs): # self.logger.save_state({'env': self.env}, None) # Test the performance of the deterministic version of the agent. self.test_agent() # Log info about epoch self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('TestEpRet', with_min_and_max=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalUpdates', t) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('CQLalpha', average_only=True) self.logger.log_tabular('Time', time.time()-start_time) self.logger.dump_tabular() def train(self, training_epochs): # Main loop: collect experience in env and update/log each epoch for t in range(training_epochs): # # Update handling batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch, update_timestep = t) self.test_agent() def collect_episodes(self, num_episodes): env_steps = 0 for j in range(num_episodes): o, d, ep_ret, ep_len = self.env.reset(), False, 0, 0 while not(d or (ep_len == self.max_ep_len)): # Take deterministic actions at test time act = self.get_action(o) no, r, d, _ = self.env.step(act) self.replay_buffer.store(o,act,r,no,d) env_steps+=1 return env_steps def log_and_dump(self): # Log info about epoch self.logger.log_tabular('TestEpRet', with_min_and_max=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('CQLalpha', average_only=True) self.logger.dump_tabular()
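

# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not used by the CQL class
# above): the core conservative penalty that CQL.compute_loss_q adds to the
# standard Bellman error, in its simplest form and without the importance
# weights or Lagrange tuning used above. Only `torch` is assumed; the
# function name and shapes are hypothetical.
# ---------------------------------------------------------------------------
def _cql_penalty_demo():
    """Minimal sketch of the CQL logsumexp penalty on dummy Q-values."""
    import torch

    min_q_weight = 5.0
    batch, n_samples = 4, 10
    # Q-values at actions sampled from the policy / uniform proposals (made up).
    q_samples = torch.randn(batch, n_samples)
    # Q-values at the dataset actions for the same states (made up).
    q_data = torch.randn(batch)

    # Push down a soft maximum of Q over sampled actions while pushing up Q
    # at the actions actually present in the dataset.
    penalty = (torch.logsumexp(q_samples, dim=1).mean() - q_data.mean()) * min_q_weight
    return penalty
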
class EMAQ: def __init__(self, env_fn, env_name=None, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-5, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='CQL'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) self.ac_targ = deepcopy(self.ac) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.lagrange_threshold = 10 self.penalty_lr = 5e-2 self.lamda = Variable(torch.log(torch.exp(torch.Tensor([5])) - 1), requires_grad=True) self.lamda_optimizer = torch.optim.Adam([self.lamda], lr=self.penalty_lr) self.tune_lambda = True if 'lagrange' in self.algo else False self.alpha = 0 self.target_update_freq = 1 self.p_lr = 3e-5 self.lr = 3e-4 self.n_samples = 100 self.env_name = env_name # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self): dataset = d4rl.qlearning_dataset(self.env) self.replay_buffer.obs_buf[:dataset['observations']. shape[0], :] = dataset['observations'] self.replay_buffer.act_buf[:dataset['actions']. shape[0], :] = dataset['actions'] self.replay_buffer.obs2_buf[:dataset['next_observations']. shape[0], :] = dataset['next_observations'] self.replay_buffer.rew_buf[:dataset['rewards']. shape[0]] = dataset['rewards'] self.replay_buffer.done_buf[:dataset['terminals']. 
shape[0]] = dataset['terminals'] self.replay_buffer.size = dataset['observations'].shape[0] self.replay_buffer.ptr = (self.replay_buffer.size + 1) % (self.replay_buffer.max_size) # Set up function for computing SAC Q-losses def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] sampled_actions_q1 = None sampled_actions_q2 = None for i in range(self.n_samples): z = np.random.randn(a.shape[0], a.shape[1]) z = torch.FloatTensor(z) actions, _ = self.sampling_policy.inverse(z, y=o2) if sampled_actions_q1 is None: sampled_actions_q1 = self.ac_targ.q1(o2, actions).view(-1, 1) sampled_actions_q2 = self.ac_targ.q2(o2, actions).view(-1, 1) else: sampled_actions_q1 = torch.cat( (sampled_actions_q1, self.ac_targ.q1(o2, actions).view( -1, 1)), dim=1) sampled_actions_q2 = torch.cat( (sampled_actions_q2, self.ac_targ.q2(o2, actions).view( -1, 1)), dim=1) q1 = self.ac.q1(o, a) q2 = self.ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = self.ac.pi(o2) # Target Q-values q1_pi_targ = torch.max(sampled_actions_q1, dim=1).values q2_pi_targ = torch.max(sampled_actions_q2, dim=1).values q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info def update(self, data, update_timestep): # First run one gradient descent step for Q1 and Q2 self.q_optimizer.zero_grad() loss_q, q_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Record things self.logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. if update_timestep % self.target_update_freq == 0: with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, o, deterministic=False): sampled_actions_q1 = None sampled_actions_q2 = None sampled_actions = [] o = torch.FloatTensor(o).view(1, -1) for i in range(self.n_samples): z = np.random.randn(1, self.act_dim) z = torch.FloatTensor(z) actions, _ = self.sampling_policy.inverse(z, y=o) sampled_actions.append(actions) if sampled_actions_q1 is None: sampled_actions_q1 = self.ac.q1(o, actions).view(-1, 1) sampled_actions_q2 = self.ac.q2(o, actions).view(-1, 1) else: sampled_actions_q1 = torch.cat( (sampled_actions_q1, self.ac.q1(o, actions).view(-1, 1)), dim=1) sampled_actions_q2 = torch.cat( (sampled_actions_q2, self.ac.q2(o, actions).view(-1, 1)), dim=1) q_values = torch.min(sampled_actions_q1, sampled_actions_q2) max_idx = torch.argmax(q_values.view(-1)) return sampled_actions[max_idx].detach().cpu().numpy() def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0 while not (d or (ep_len == self.max_ep_len)): # Take deterministic actions at test time o, r, d, _ = self.test_env.step(self.get_action(o, True)) ep_ret += r ep_len += 1 self.logger.store(TestEpRet=100 * self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len) def run(self): # Learn a generative model for data # density_epochs = 50 # self.sampling_policy = core.MADE(self.act_dim, 256, 2 , cond_label_size = self.obs_dim[0]) # density_optimizer = torch.optim.Adam(self.sampling_policy.parameters(), lr=1e-4, weight_decay=1e-6) # for i in range(density_epochs): # sample_indices = np.random.choice( # self.replay_buffer.size, self.replay_buffer.size) # np.random.shuffle(sample_indices) # ctr = 0 # total_loss = 0 # for j in range(0, self.replay_buffer.size, self.batch_size): # actions = self.replay_buffer.act_buf[sample_indices[ctr * self.batch_size:( # ctr + 1) * self.batch_size],:] # actions = torch.FloatTensor(actions) # obs = self.replay_buffer.obs_buf[sample_indices[ctr * self.batch_size:( # ctr + 1) * self.batch_size],:] # obs = torch.FloatTensor(obs) # density_optimizer.zero_grad() # loss = -self.sampling_policy.log_prob(actions,y=obs).mean() # loss.backward() # total_loss+=loss.data * self.batch_size # density_optimizer.step() # ctr+=1 # print("Density training loss: {}".format(total_loss/self.replay_buffer.size)) self.sampling_policy = core.MADE(self.act_dim, 256, 3, cond_label_size=self.obs_dim[0]) self.sampling_policy.load_state_dict( torch.load("behavior_policies/" + self.env_name + ".pt")) # self.sampling_policy = torch.load("marginals/"+self.env_name+".pt") # Prepare for interaction with environment total_steps = self.epochs * self.steps_per_epoch start_time = time.time() o, ep_ret, ep_len = self.env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # # Update handling batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch, update_timestep=t) # End of epoch handling if (t + 1) % self.steps_per_epoch == 0: epoch = (t + 1) // self.steps_per_epoch # Save model if (epoch % self.save_freq == 0) or (epoch == self.epochs): self.logger.save_state({'env': self.env}, None) # Test the performance of the deterministic version of the agent. 
                self.test_agent()

                # Log info about epoch
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpLen', average_only=True)
                self.logger.log_tabular('TotalUpdates', t)
                self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('Time', time.time() - start_time)
                self.logger.dump_tabular()
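

# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not used by the EMAQ class
# above): the expected-max Bellman backup from EMAQ.compute_loss_q, with the
# sampled behavior-model Q-values replaced by made-up tensors. Only `torch`
# is assumed; the function name and shapes are hypothetical.
# ---------------------------------------------------------------------------
def _emaq_backup_demo():
    """Minimal sketch of the EMAQ target: max over sampled actions, min over twin critics."""
    import torch

    gamma = 0.99
    batch, n_samples = 4, 8
    # Target-critic Q-values at n_samples actions drawn from a (pre-trained)
    # behavior model for each next state (made up here).
    q1_samples = torch.randn(batch, n_samples)
    q2_samples = torch.randn(batch, n_samples)
    r = torch.randn(batch)          # rewards
    d = torch.zeros(batch)          # done flags

    # Expected-max backup: max over sampled behavior actions per critic,
    # then the usual min over the twin critics.
    q1_targ = torch.max(q1_samples, dim=1).values
    q2_targ = torch.max(q2_samples, dim=1).values
    q_pi_targ = torch.min(q1_targ, q2_targ)
    backup = r + gamma * (1 - d) * q_pi_targ
    return backup
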
def ddpg(env_fn, env_name, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) episode_rewards.append(ep_ret) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 rewards_log = [] episode_rewards = deque(maxlen=10) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). 
if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model # if (epoch % save_freq == 0) or (epoch == epochs): # logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() rewards_log.append(np.mean(episode_rewards)) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rewards_log = np.array(rewards_log) save_path = '../../log/ddpg/' + env_name + '/' + str(seed) + '.npy' np.save(save_path, rewards_log)
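
# A minimal standalone sketch of the polyak (soft) target-network update performed
# inline in update() above, i.e. theta_targ <- rho * theta_targ + (1 - rho) * theta
# from the docstring. `soft_update` is a hypothetical helper introduced only for
# illustration; it is not used anywhere in this file. Relies on the module-level
# torch import.
def soft_update(net, net_targ, polyak=0.995):
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), net_targ.parameters()):
            # In-place mul_/add_ avoid allocating new tensors for the target params.
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)
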
class EMAQ_LP: def __init__(self, env_fn, env_name=None, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-5, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='CQL'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) self.ac_targ = deepcopy(self.ac) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.lagrange_threshold = 10 self.penalty_lr = 5e-2 self.lamda = Variable(torch.log(torch.exp(torch.Tensor([5])) - 1), requires_grad=True) self.lamda_optimizer = torch.optim.Adam([self.lamda], lr=self.penalty_lr) self.tune_lambda = True if 'lagrange' in self.algo else False self.alpha = alpha # CWR does not require entropy in Q evaluation self.target_update_freq = 1 self.p_lr = 3e-5 self.lr = 1e-4 self.n_samples = 100 self.env_name = env_name # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self): dataset = d4rl.qlearning_dataset(self.env) self.replay_buffer.obs_buf[:dataset['observations']. shape[0], :] = dataset['observations'] self.replay_buffer.act_buf[:dataset['actions']. shape[0], :] = dataset['actions'] self.replay_buffer.obs2_buf[:dataset['next_observations']. shape[0], :] = dataset['next_observations'] self.replay_buffer.rew_buf[:dataset['rewards']. shape[0]] = dataset['rewards'] self.replay_buffer.done_buf[:dataset['terminals']. 
                shape[0]] = dataset['terminals']
        self.replay_buffer.size = dataset['observations'].shape[0]
        self.replay_buffer.ptr = (self.replay_buffer.size + 1) % (self.replay_buffer.max_size)

    def run(self):
        # Learn a generative model for data
        density_epochs = 1000
        self.sampling_policy = core.MADE(self.act_dim, 256, 3,
                                         cond_label_size=self.obs_dim[0])
        density_optimizer = torch.optim.Adam(self.sampling_policy.parameters(),
                                             lr=5e-4, weight_decay=1e-6)
        for i in range(density_epochs):
            sample_indices = np.random.choice(self.replay_buffer.size,
                                              self.replay_buffer.size)
            np.random.shuffle(sample_indices)
            ctr = 0
            total_loss = 0
            for j in range(0, self.replay_buffer.size, self.batch_size):
                actions = self.replay_buffer.act_buf[
                    sample_indices[ctr * self.batch_size:(ctr + 1) * self.batch_size], :]
                actions = torch.FloatTensor(actions)
                obs = self.replay_buffer.obs_buf[
                    sample_indices[ctr * self.batch_size:(ctr + 1) * self.batch_size], :]
                obs = torch.FloatTensor(obs)
                density_optimizer.zero_grad()
                loss = -self.sampling_policy.log_prob(actions, y=obs).mean()
                loss.backward()
                total_loss += loss.data * self.batch_size
                density_optimizer.step()
                ctr += 1
            print("Density training loss: {}".format(total_loss / self.replay_buffer.size))

        # Save the behavior policy
        print("Saving the behavior policy model to {}".format(
            'behavior_policies/' + self.env_name))
        torch.save(self.sampling_policy.state_dict(),
                   'behavior_policies/' + self.env_name + '.pt')
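
# The run() method above fits a conditional density model (core.MADE) to the
# dataset's (observation, action) pairs by minimizing negative log-likelihood.
# The sketch below shows the same maximum-likelihood idea with a plain
# diagonal-Gaussian behavior-cloning policy on toy data. `ToyGaussianPolicy`
# and the toy dimensions in the usage note are hypothetical and are not used
# anywhere in this module.
class ToyGaussianPolicy(torch.nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=64):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(obs_dim, hidden), torch.nn.ReLU(),
            torch.nn.Linear(hidden, 2 * act_dim))
        self.act_dim = act_dim

    def log_prob(self, act, obs):
        # Split the network output into mean and log-std of a diagonal Gaussian.
        mu, log_std = self.net(obs).split(self.act_dim, dim=-1)
        dist = torch.distributions.Normal(mu, log_std.clamp(-5, 2).exp())
        return dist.log_prob(act).sum(dim=-1)

# Usage sketch (one maximum-likelihood step on a random toy batch):
# _toy = ToyGaussianPolicy(obs_dim=11, act_dim=3)
# _opt = torch.optim.Adam(_toy.parameters(), lr=5e-4)
# _obs, _act = torch.randn(100, 11), torch.randn(100, 3)
# _loss = -_toy.log_prob(_act, _obs).mean()
# _opt.zero_grad(); _loss.backward(); _opt.step()
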
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. 
with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
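
# Numerical sketch of the entropy-regularized Bellman target computed inside
# compute_loss_q() above:
#     backup = r + gamma * (1 - d) * (min(Q1_targ, Q2_targ) - alpha * logp_a2).
# `sac_backup` is a hypothetical helper for illustration only; the real code
# computes this inline under torch.no_grad().
def sac_backup(r, d, q1_pi_targ, q2_pi_targ, logp_a2, gamma=0.99, alpha=0.2):
    q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)  # clipped double-Q target
    return r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

# Example:
# sac_backup(torch.tensor([1.0]), torch.tensor([0.0]),
#            torch.tensor([5.0]), torch.tensor([4.0]), torch.tensor([-1.2]))
# -> 1.0 + 0.99 * (4.0 - 0.2 * (-1.2)) = tensor([5.1976]) (approximately)
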
def train(env_fn, env_name, ac_kwargs=dict(), seed=0, steps_per_epoch=1000, epochs=3000, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, batch_size=64, start_steps=10000, update_after=10000, update_every=1, num_test_episodes=10, value_coef=0.5, entropy_coef=0.02, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, device=torch.device('cpu')): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() env.seed(seed) test_env.seed(seed) obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] actor_critic = MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs).to(device) sql = SQL(actor_critic, lr, batch_size, update_every, gamma, polyak, value_coef, entropy_coef) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) rewards_log = [] episode_rewards = deque(maxlen=10) # Set up model saving logger.setup_pytorch_saver(sql.actor_critic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): action = sql.actor_critic.act( torch.as_tensor(o, dtype=torch.float32).to(device)) o, r, d, _ = test_env.step(action) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) episode_rewards.append(ep_ret) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = sql.actor_critic.act( torch.as_tensor(o, dtype=torch.float32).to(device)) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) loss = sql.update(data=batch) logger.store(Loss=loss) else: logger.store(Loss=0.) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if save_freq != 0 and ((epoch % save_freq == 0) or (epoch == epochs)): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
            test_agent()
            rewards_log.append(np.mean(episode_rewards))

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Time', time.time() - start_time)
            logger.log_tabular('Loss', average_only=True)
            logger.dump_tabular()

    rewards_log = np.array(rewards_log)
    save_path = '../../log/modified_sql/' + env_name + '/' + str(seed) + '.npy'
    np.save(save_path, rewards_log)
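
# The off-policy loops in this file (ddpg, sac, and the train() above) all push
# transitions into a ReplayBuffer via store(o, a, r, o2, d) and draw minibatches
# via sample_batch(batch_size). `SimpleReplayBuffer` below is a minimal sketch of
# that interface under those assumptions; the actual ReplayBuffer used above is
# defined elsewhere in this repo and may differ.
class SimpleReplayBuffer:
    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Ring buffer: overwrite the oldest transition once full.
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs], obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs], rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}
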
def train(env_fn, seed=0, ppo_epoch=10, steps_per_epoch=2048, mini_batch_size=64, num_epoch=1500, gamma=0.99, clip_ratio=0.2, value_clip_ratio=10, value_loss_coef=0.5, entropy_loss_coef=0, use_value_clipped_loss=True, lr=3e-4, eps=1e-5, lam=0.95, max_grad_norm=0.5, max_ep_len=1000, save_freq=10, device=torch.device('cpu'), ac_kwargs=dict(), logger_kwargs=dict()): # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() env.seed(seed) obs_dim = env.observation_space.shape act_dim = env.action_space.shape actor_critic = MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs).to(device) ppo = PPO(actor_critic, clip_ratio, value_clip_ratio, ppo_epoch, mini_batch_size, value_loss_coef, entropy_loss_coef, lr, eps, max_grad_norm, use_value_clipped_loss) # Set up experience buffer buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam, device) # Set up model saving logger.setup_pytorch_saver(ppo.actor_critic) # Prepare for interaction with environment start_time = time.time() running_state = ZFilter((obs_dim[0], ), clip=10) # running_reward = ZFilter((1,), demean=False, clip=10) obs, ep_ret, ep_len = env.reset(), 0, 0 obs = running_state(obs) # Main loop: collect experience in env and update/log each epoch for epoch in range(num_epoch): for t in range(steps_per_epoch): action, value, logp = ppo.actor_critic.step( torch.as_tensor(obs, dtype=torch.float32).to(device)) next_obs, rew, done, _ = env.step(action) next_obs = running_state(next_obs) # rew = running_reward([rew])[0] ep_ret += rew ep_len += 1 # save and log buf.store(obs, action, rew, value, logp) # Update obs (critical!) obs = next_obs timeout = ep_len == max_ep_len terminal = done or timeout epoch_ended = t == steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, value, _ = ppo.actor_critic.step( torch.as_tensor(obs, dtype=torch.float32).to(device)) else: value = 0 buf.finish_path(value) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) obs, ep_ret, ep_len = env.reset(), 0, 0 obs = running_state(obs) # Save model if save_freq != 0 and ((epoch % save_freq == 0) or (epoch == num_epoch - 1)): logger.save_state({'env': env}, None) # perform update data = buf.get() policy_loss, value_loss, entropy, kl = ppo.update(data) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', policy_loss) logger.log_tabular('LossV', value_loss) logger.log_tabular('Entropy', entropy) logger.log_tabular('KL', kl) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
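
# The PPO loop above calls buf.finish_path(value) at trajectory boundaries to
# bootstrap and, per the `lam` argument, compute GAE-lambda advantages:
#     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
#     A_t = sum_l (gamma * lam)^l * delta_{t+l}.
# `gae_advantages` is a hypothetical standalone sketch of that computation; the
# repo's PPOBuffer may implement it differently (e.g. with a vectorized
# discounted cumulative sum).
def gae_advantages(rews, vals, last_val, gamma=0.99, lam=0.95):
    rews = np.asarray(rews, dtype=np.float32)                        # length T
    vals = np.append(np.asarray(vals, dtype=np.float32), last_val)   # length T+1
    deltas = rews + gamma * vals[1:] - vals[:-1]                     # TD residuals
    adv = np.zeros_like(deltas)
    running = 0.0
    # Backward pass: A_t = delta_t + gamma * lam * A_{t+1}
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

# Example: gae_advantages([1.0, 1.0], [0.5, 0.6], last_val=0.7)
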