def trpo( env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo="trpo", ): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's main model which for state ``x`` and action, ``a`` returns the following outputs: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | The mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info``) over the batch of | states given in ``x``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ setup_pytorch_for_mpi() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs["action_space"] = env.action_space # Main model actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) if isinstance(env.action_space, Box): info_shapes = { "old_mu": [env.action_space.shape[-1]], "old_log_std": [env.action_space.shape[-1]], } else: info_shapes = {"old_logits": [env.action_space.n]} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple( core.count_vars(module) for module in [actor_critic.policy, actor_critic.value_function]) logger.log("\nNumber of parameters: \t pi: %d, \t v: %d\n" % var_counts) # Optimizer for value function train_vf = torch.optim.Adam(actor_critic.value_function.parameters(), lr=vf_lr) # Sync params across processes sync_all_params(actor_critic.parameters()) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = torch.zeros_like(b) r = b # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. p = b r_dot_old = torch.dot(r, r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (torch.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = torch.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): inputs = [torch.Tensor(x) for x in buf.get()] obs, act, adv, ret, logp_old = inputs[:-len(buf.sorted_info_keys)] policy_args = dict( zip(buf.sorted_info_keys, inputs[-len(buf.sorted_info_keys):])) # Main outputs from computation graph _, logp, _, _, d_kl, v = actor_critic(obs, act, **policy_args) # Prepare hessian func, gradient eval ratio = (logp - logp_old).exp() # pi(a|s) / pi_old(a|s) pi_l_old = -(ratio * adv).mean() v_l_old = F.mse_loss(v, ret) g = core.flat_grad(pi_l_old, actor_critic.policy.parameters(), retain_graph=True) g = torch.from_numpy(mpi_avg(g.numpy())) pi_l_old = mpi_avg(pi_l_old.item()) def Hx(x): hvp = core.hessian_vector_product(d_kl, actor_critic.policy, x) if damping_coeff > 0: hvp += damping_coeff * x return torch.from_numpy(mpi_avg(hvp.numpy())) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = torch.sqrt(2 * delta / (torch.dot(x, Hx(x)) + EPS)) old_params = parameters_to_vector(actor_critic.policy.parameters()) def set_and_eval(step): vector_to_parameters(old_params - alpha * x * step, actor_critic.policy.parameters()) _, logp, _, _, d_kl = actor_critic.policy(obs, act, **policy_args) ratio = (logp - logp_old).exp() pi_loss = -(ratio * adv).mean() return mpi_avg(d_kl.item()), mpi_avg(pi_loss.item()) if algo == "npg": kl, pi_l_new = set_and_eval(step=1.0) elif algo == "trpo": for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( "Accepting new params at step %d of line search." % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log("Line search failed! 
Keeping old params.") logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.0) # Value function updates for _ in range(train_v_iters): v = actor_critic.value_function(obs) v_loss = F.mse_loss(v, ret) # Value function gradient step train_vf.zero_grad() v_loss.backward() average_gradients(train_vf.param_groups) train_vf.step() v = actor_critic.value_function(obs) v_l_new = F.mse_loss(v, ret) # Log changes from update logger.store( LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old), ) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): actor_critic.eval() for t in range(local_steps_per_epoch): a, _, logp_t, info_t, _, v_t = actor_critic( torch.Tensor(o.reshape(1, -1))) # save and log buf.store( o, a.detach().numpy(), r, v_t.item(), logp_t.detach().numpy(), core.values_as_sorted_list(info_t), ) logger.store(VVals=v_t) o, r, d, _ = env.step(a.detach().numpy()[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print("Warning: trajectory cut off by epoch at %d steps." % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = (r if d else actor_critic.value_function( torch.Tensor(o.reshape(1, -1))).item()) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({"env": env}, actor_critic, None) # Perform TRPO or NPG update! actor_critic.train() update() # Log info about epoch logger.log_tabular("Epoch", epoch) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("VVals", with_min_and_max=True) logger.log_tabular("TotalEnvInteracts", (epoch + 1) * steps_per_epoch) logger.log_tabular("LossPi", average_only=True) logger.log_tabular("LossV", average_only=True) logger.log_tabular("DeltaLossPi", average_only=True) logger.log_tabular("DeltaLossV", average_only=True) logger.log_tabular("KL", average_only=True) if algo == "trpo": logger.log_tabular("BacktrackIters", average_only=True) logger.log_tabular("Time", time.time() - start_time) logger.dump_tabular()
def ddpg( env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, ): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's main model which takes some states ``x`` and and actions ``a`` and returns a tuple of: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states ``x`` and actions in | ``a``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic class you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs["action_space"] = env.action_space # Main outputs from computation graph main = actor_critic(in_features=obs_dim, **ac_kwargs) # Target networks target = actor_critic(in_features=obs_dim, **ac_kwargs) target.eval() # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(module) for module in [main.policy, main.q, main]) print("\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n" % var_counts) # Separate train ops for pi, q pi_optimizer = torch.optim.Adam(main.policy.parameters(), lr=pi_lr) q_optimizer = torch.optim.Adam(main.q.parameters(), lr=q_lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) def get_action(o, noise_scale): pi = main.policy(torch.Tensor(o.reshape(1, -1))) a = pi.detach().numpy()[0] + noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): main.eval() """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): main.train() """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts, rews, done) = ( torch.Tensor(batch["obs1"]), torch.Tensor(batch["obs2"]), torch.Tensor(batch["acts"]), torch.Tensor(batch["rews"]), torch.Tensor(batch["done"]), ) _, q, q_pi = main(obs1, acts) _, _, q_pi_targ = target(obs2, acts) # Bellman backup for Q function backup = (rews + gamma * (1 - done) * q_pi_targ).detach() # DDPG losses pi_loss = -q_pi.mean() q_loss = F.mse_loss(q, backup) # Q-learning update q_optimizer.zero_grad() q_loss.backward() q_optimizer.step() logger.store(LossQ=q_loss, QVals=q.data.numpy()) # Policy update pi_optimizer.zero_grad() pi_loss.backward() pi_optimizer.step() logger.store(LossPi=pi_loss) # Polyak averaging for target parameters for p_main, p_target in zip(main.parameters(), target.parameters()): p_target.data.copy_(polyak * p_target.data + (1 - polyak) * p_main.data) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({"env": env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular("Epoch", epoch) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("TestEpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("TestEpLen", average_only=True) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular("QVals", with_min_and_max=True) logger.log_tabular("LossPi", average_only=True) logger.log_tabular("LossQ", average_only=True) logger.log_tabular("Time", time.time() - start_time) logger.dump_tabular()
def dqn( env_fn, dqnetwork=core.DQNetwork, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, min_replay_history=20000, epsilon_decay_period=250000, epsilon_train=0.01, epsilon_eval=0.001, lr=1e-3, max_ep_len=1000, update_period=4, target_update_period=8000, batch_size=100, logger_kwargs=dict(), save_freq=1, ): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = 1 # env.action_space.shape # Share information about action space with policy architecture ac_kwargs["action_space"] = env.action_space # Main computation graph main = dqnetwork(in_features=obs_dim, **ac_kwargs) # Target network target = dqnetwork(in_features=obs_dim, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(module) for module in [main.q, main]) print(("\nNumber of parameters: \t q: %d, \t total: %d\n") % var_counts) # Value train op value_params = main.q.parameters() value_optimizer = torch.optim.Adam(value_params, lr=lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) def get_action(o, epsilon): """Select an action from the set of available actions. Chooses an action randomly with probability epsilon otherwise act greedily according to the current Q-value estimates. """ if np.random.random() <= epsilon: return env.action_space.sample() else: q_values = main(torch.Tensor(o.reshape(1, -1))) # return the action with highest Q-value for this observation return torch.argmax(q_values, dim=1).item() def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # epsilon_eval used when evaluating the agent o, r, d, _ = test_env.step(get_action(o, epsilon_eval)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): main.eval() # the epsilon value used for exploration during training epsilon = core.linearly_decaying_epsilon( epsilon_decay_period, t, min_replay_history, epsilon_train ) a = get_action(o, epsilon) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # train at the rate of update_period if enough training steps have been run if replay_buffer.size > min_replay_history and t % update_period == 0: main.train() batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts, rews, done) = ( torch.Tensor(batch["obs1"]), torch.Tensor(batch["obs2"]), torch.Tensor(batch["acts"]), torch.Tensor(batch["rews"]), torch.Tensor(batch["done"]), ) q_pi = main(obs1).gather(1, acts.long()).squeeze() q_pi_targ, _ = target(obs2).max(1) # Bellman backup for Q function backup = (rews + gamma * (1 - done) * q_pi_targ).detach() # DQN loss value_loss = F.smooth_l1_loss(q_pi, backup) # Q-learning update value_optimizer.zero_grad() value_loss.backward() value_optimizer.step() logger.store(LossQ=value_loss.item(), QVals=q_pi.data.numpy()) # syncs weights from online to target network if t % target_update_period == 0: target.load_state_dict(main.state_dict()) # End of epoch wrap-up if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({"env": env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular("Epoch", epoch) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("TestEpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("TestEpLen", average_only=True) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular("QVals", with_min_and_max=True) logger.log_tabular("LossQ", average_only=True) logger.log_tabular("Time", time.time() - start_time) logger.dump_tabular()
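# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): how the DQN backup above
# pairs gather() on the online network with max() on the target network;
# tensor shapes are illustrative (batch of 4 transitions, 3 discrete actions)
# and `_dqn_backup_toy` is not part of this module.
def _dqn_backup_toy(gamma=0.99):
    import torch
    q_online = torch.randn(4, 3)                    # stand-in for main(obs1)
    q_target = torch.randn(4, 3)                    # stand-in for target(obs2)
    acts = torch.tensor([[0], [2], [1], [0]])       # stored actions, (bsz, 1)
    rews = torch.randn(4)
    done = torch.tensor([0.0, 0.0, 1.0, 0.0])
    q_pi = q_online.gather(1, acts).squeeze(1)      # Q(s, a) for taken actions
    q_pi_targ, _ = q_target.max(dim=1)              # max_a' Q_targ(s', a')
    backup = rews + gamma * (1 - done) * q_pi_targ  # terminal states do not bootstrap
    return q_pi, backup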
def ppo( env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, use_gpu=False, gpu_parallel=False, ): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's main model which is composed of the policy and value function model, where the policy takes some state, ``x`` and action ``a``, and value function takes the state ``x``. The model returns a tuple of: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure | to flatten this via .squeeze()!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic class you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Main model actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs) # actor_critic = torch.nn.DataParallel(actor_critic).to(device) # gpu是否使用 # device = torch.device("cpu" if USE_DEVICE=="cpu" else "cuda") if torch.cuda.is_available(): device = torch.device("cuda" if use_gpu else "cpu") if gpu_parallel: actor_critic = torch.nn.DataParallel(actor_critic) else: use_gpu = False use_parallel = False device = torch.device("cpu") actor_critic = actor_critic.to(device) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple( core.count_vars(module) for module in [actor_critic.policy, actor_critic.value_function]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Optimizers train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) train_v = torch.optim.Adam(actor_critic.value_function.parameters(), lr=vf_lr) # Sync params across processes sync_all_params(actor_critic.parameters()) def update(): temp_get = buf.get() obs, act, adv, ret, logp_old = [ torch.Tensor(x).to(device) for x in temp_get ] # Training policy _, logp, _ = actor_critic.policy(obs, act) ratio = (logp - logp_old).exp() min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv) pi_l_old = -(torch.min(ratio * adv, min_adv)).mean() ent = (-logp).mean() # a sample estimate for entropy for i in range(train_pi_iters): # Output from policy function graph _, logp, _ = actor_critic.policy(obs, act) # PPO policy objective ratio = (logp - logp_old).exp() min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv) pi_loss = -(torch.min(ratio * adv, min_adv)).mean() # Policy gradient step train_pi.zero_grad() pi_loss.backward() average_gradients(train_pi.param_groups) train_pi.step() _, logp, _ = actor_critic.policy(obs, act) kl = (logp_old - logp).mean() kl = mpi_avg(kl.item()) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) # Training value function v = actor_critic.value_function(obs) v_l_old = F.mse_loss(v, ret) for _ in range(train_v_iters): # Output from value function graph v = actor_critic.value_function(obs) # PPO value function objective v_loss = F.mse_loss(v, ret) # Value function gradient step train_v.zero_grad() v_loss.backward() average_gradients(train_v.param_groups) train_v.step() # Log changes from update _, logp, _, v = actor_critic(obs, act) ratio = (logp - logp_old).exp() min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv) pi_l_new = -(torch.min(ratio * adv, min_adv)).mean() v_l_new = F.mse_loss(v, ret) kl = (logp_old - logp).mean() # a sample estimate for KL-divergence clipped = (ratio > (1 + clip_ratio)) | (ratio < (1 - clip_ratio)) cf = (clipped.float()).mean() logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): actor_critic.eval() for t in range(local_steps_per_epoch): a, _, logp_t, v_t = actor_critic( torch.Tensor(o.reshape(1, -1)).to(device)) # save and log buf.store(o, a.cpu().detach().numpy(), r, v_t.item(), logp_t.cpu().detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.cpu().detach().numpy()[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else actor_critic.value_function( torch.Tensor(o.reshape(1, -1)).to(device)).item() buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, actor_critic, None) # Perform PPO update! actor_critic.train() update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
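# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): the PPO-clip surrogate used
# in update() above, on toy ratio/advantage tensors, matching the
# torch.where() formulation of the clipped term; the numbers and the name
# `_ppo_clip_toy` are illustrative only.
def _ppo_clip_toy(clip_ratio=0.2):
    import torch
    ratio = torch.tensor([0.5, 0.9, 1.0, 1.1, 1.5])   # pi(a|s) / pi_old(a|s)
    adv = torch.tensor([1.0, -1.0, 2.0, -2.0, 1.0])
    min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv,
                          (1 - clip_ratio) * adv)
    pi_loss = -(torch.min(ratio * adv, min_adv)).mean()
    # fraction of samples whose ratio was clipped (logged as ClipFrac above)
    clip_frac = ((ratio > 1 + clip_ratio) |
                 (ratio < 1 - clip_ratio)).float().mean()
    return pi_loss, clip_frac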
def sac(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's model which takes the state ``x`` and action, ``a`` and returns a tuple of: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x`` and actions in | ``a``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x`` and actions in | ``a``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic class you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Main computation graph main = actor_critic(in_features=obs_dim, **ac_kwargs) # Target value network target = actor_critic(in_features=obs_dim, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer( obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(module) for module in [main.policy, main.q1, main.q2, main.vf_mlp, main]) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = torch.optim.Adam(main.policy.parameters(), lr=lr) # Value train op value_params = list(main.vf_mlp.parameters()) + list( main.q1.parameters()) + list(main.q2.parameters()) value_optimizer = torch.optim.Adam(value_params, lr=lr) # Initializing targets to match main variables target.vf_mlp.load_state_dict(main.vf_mlp.state_dict()) def get_action(o, deterministic=False): pi, mu, _ = main.policy(torch.Tensor(o.reshape(1, -1))) return mu.detach().numpy()[0] if deterministic else pi.detach().numpy()[0] def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts, rews, done) = (torch.Tensor(batch['obs1']), torch.Tensor(batch['obs2']), torch.Tensor(batch['acts']), torch.Tensor(batch['rews']), torch.Tensor(batch['done'])) _, _, logp_pi, q1, q2, q1_pi, q2_pi, v = main(obs1, acts) v_targ = target.vf_mlp(obs2) # Min Double-Q: min_q_pi = torch.min(q1_pi, q2_pi) # Targets for Q and V regression q_backup = (rews + gamma * (1 - done) * v_targ).detach() v_backup = (min_q_pi - alpha * logp_pi).detach() # Soft actor-critic losses pi_loss = (alpha * logp_pi - min_q_pi).mean() q1_loss = 0.5 * F.mse_loss(q1, q_backup) q2_loss = 0.5 * F.mse_loss(q2, q_backup) v_loss = 0.5 * F.mse_loss(v, v_backup) value_loss = q1_loss + q2_loss + v_loss # Policy train op pi_optimizer.zero_grad() pi_loss.backward() pi_optimizer.step() # Value train op value_optimizer.zero_grad() value_loss.backward() value_optimizer.step() # Polyak averaging for target parameters for p_main, p_target in zip(main.vf_mlp.parameters(), target.vf_mlp.parameters()): p_target.data.copy_(polyak * p_target.data + (1 - polyak) * p_main.data) logger.store( LossPi=pi_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossV=v_loss.item(), Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy(), VVals=v.detach().numpy(), LogPi=logp_pi.detach().numpy()) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def td3(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's main model which for state ``x`` and action, ``a`` returns the following outputs: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x`` and actions in | ``a``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x`` and actions in | ``a``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Main outputs from computation graph main = actor_critic(in_features=obs_dim, **ac_kwargs) # Target policy network target = actor_critic(in_features=obs_dim, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(module) for module in [main.policy, main.q1, main.q2, main]) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Separate train ops for pi, q pi_optimizer = torch.optim.Adam(main.policy.parameters(), lr=pi_lr) q_params = list(main.q1.parameters()) + list(main.q2.parameters()) q_optimizer = torch.optim.Adam(q_params, lr=q_lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) def get_action(o, noise_scale): pi = main.policy(torch.Tensor(o.reshape(1, -1))) a = pi.detach().numpy()[0] + noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts, rews, done) = (torch.tensor(batch['obs1']), torch.tensor(batch['obs2']), torch.tensor(batch['acts']), torch.tensor(batch['rews']), torch.tensor(batch['done'])) q1 = main.q1(torch.cat((obs1, acts), dim=1)) q2 = main.q2(torch.cat((obs1, acts), dim=1)) pi_targ = target.policy(obs2) # Target policy smoothing, by adding clipped noise to target actions epsilon = torch.normal(torch.zeros_like(pi_targ), target_noise * torch.ones_like(pi_targ)) epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = torch.clamp(pi_targ + epsilon, -act_limit, act_limit) # Target Q-values, using action from target policy q1_targ = target.q1(torch.cat((obs2, a2), dim=1)) q2_targ = target.q2(torch.cat((obs2, a2), dim=1)) # Bellman backup for Q functions, using Clipped Double-Q targets min_q_targ = torch.min(q1_targ, q2_targ) backup = (rews + gamma * (1 - done) * min_q_targ).detach() # TD3 Q losses q1_loss = F.mse_loss(q1, backup) q2_loss = F.mse_loss(q2, backup) q_loss = q1_loss + q2_loss q_optimizer.zero_grad() q_loss.backward() q_optimizer.step() logger.store(LossQ=q_loss.item(), Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) if j % policy_delay == 0: q1_pi = main.q1(torch.cat((obs1, main.policy(obs1)), dim=1)) # TD3 policy loss pi_loss = -q1_pi.mean() # Delayed policy update pi_optimizer.zero_grad() pi_loss.backward() pi_optimizer.step() # Polyak averaging for target variables for p_main, p_target in zip(main.parameters(), target.parameters()): p_target.data.copy_(polyak * p_target.data + (1 - polyak) * p_main.data) logger.store(LossPi=pi_loss.item()) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def iqn( env_fn, dqnetwork=core.DQNetwork, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), quantile_embedding_dim=64, # n in equation 4 in IQN paper num_tau_samples=16, # N in equation 3 in IQN paper num_tau_prime_samples=8, # N' in equation 3 in IQN paper num_quantile_samples=32, # K in equation 3 in IQN paper kappa=1.0, # kappa for Huber Loss in IQN gamma=0.99, min_replay_history=20000, epsilon_decay_period=250000, epsilon_train=0.01, epsilon_eval=0.001, lr=1e-3, max_ep_len=1000, update_period=4, target_update_period=8000, batch_size=100, logger_kwargs=dict(), save_freq=1, ): """ quantile_embedding_dim : # n in equation 4 in IQN paper num_tau_samples : N in equation 3 in IQN paper num_tau_prime_samples : N' in equation 3 in IQN paper num_quantile_samples : K in equation 3 in IQN paper kappa : kappa for Huber Loss in IQN """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = 1 # env.action_space.shape # Share information about action space with policy architecture ac_kwargs["action_space"] = env.action_space ac_kwargs["quantile_embedding_dim"] = quantile_embedding_dim # Main computation graph main = dqnetwork(in_features=obs_dim, **ac_kwargs) # Target network target = dqnetwork(in_features=obs_dim, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(module) for module in [main.z, main]) print(("\nNumber of parameters: \t z: %d, \t total: %d\n") % var_counts) # Value train op params = main.parameters() optimizer = torch.optim.Adam(params, lr=lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) def get_action(o, epsilon): """Select an action from the set of available actions. Chooses an action randomly with probability epsilon otherwise act greedily according to the current Q-value estimates. 
""" if np.random.random() <= epsilon: return env.action_space.sample() else: return main.policy(torch.Tensor(o.reshape(1, -1)), num_tau_samples).item() def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # epsilon_eval used when evaluating the agent o, r, d, _ = test_env.step(get_action(o, epsilon_eval)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def update(): """ref: https://github.com/google/dopamine/blob/master/dopamine/agents/implicit_quantile/implicit_quantile_agent.py """ main.train() batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts1, rews, done) = ( torch.Tensor(batch["obs1"]), torch.Tensor(batch["obs2"]), torch.LongTensor(batch["acts"]), # (bsz, 1) torch.Tensor(batch["rews"]), # (bsz) torch.Tensor(batch["done"]), # (bsz) ) action_dim = env.action_space.n bsz = obs1.size(0) with torch.no_grad(): z2, _ = target(obs2, num_tau_prime_samples) assert z2.size() == (bsz, action_dim, num_tau_prime_samples) # acts2 = main(obs2, num_quantile_samples)[0].mean(dim=-1).argmax(dim=-1) # double dqn acts2 = z2.mean(dim=-1).argmax(dim=-1) # (bsz) rews = rews.unsqueeze(1) done = done.unsqueeze(1) backups = rews + (1 - done) * gamma * z2[range(bsz), acts2] assert backups.size() == (bsz, num_tau_prime_samples) z1, replay_tau = main(obs1, num_tau_samples) acts1 = acts1.squeeze(1) # (bsz) z1 = z1[range(bsz), acts1] # (bsz, num_tau_samples) bellman_errors = backups.unsqueeze(-1) - z1.unsqueeze(1) assert bellman_errors.size() == (bsz, num_tau_prime_samples, num_tau_samples) huber_loss1 = (abs(bellman_errors) <= kappa).float() * 0.5 * bellman_errors ** 2 huber_loss2 = ( (abs(bellman_errors) > kappa).float() * kappa * (abs(bellman_errors) - kappa / 2) ) huber_loss = huber_loss1 + huber_loss2 replay_tau = replay_tau.view(bsz, num_tau_samples).unsqueeze( 1 ) # (bsz, 1, num_tau_samples) replay_tau = replay_tau.repeat(1, num_tau_prime_samples, 1) assert replay_tau.size() == (bsz, num_tau_prime_samples, num_tau_samples) tau_huber_loss = abs(replay_tau - ((bellman_errors < 0).float()).detach()) tau_huber_loss = tau_huber_loss * huber_loss / kappa loss = tau_huber_loss.sum(dim=2).mean(dim=1) # (bsz) loss = loss.mean() optimizer.zero_grad() loss.backward() optimizer.step() return loss.item(), None start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): main.eval() # the epsilon value used for exploration during training epsilon = core.linearly_decaying_epsilon( epsilon_decay_period, t, min_replay_history, epsilon_train ) a = get_action(o, epsilon) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # train at the rate of update_period if enough training steps have been run if replay_buffer.size > min_replay_history and t % update_period == 0: loss, QDist = update() logger.store(LossQ=loss) # , QVals=QDist.mean(-1)) # syncs weights from online to target network if t % target_update_period == 0: target.load_state_dict(main.state_dict()) # End of epoch wrap-up if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({"env": env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular("Epoch", epoch) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("TestEpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("TestEpLen", average_only=True) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular("LossQ", average_only=True) # logger.log_tabular("QVals", with_min_and_max=True) logger.log_tabular("Time", time.time() - start_time) logger.dump_tabular()
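# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): the tau-weighted Huber loss
# used in iqn's update() above, reduced to a single (N', N) error matrix;
# kappa, the sample counts, and the name `_quantile_huber_toy` are
# illustrative only.
def _quantile_huber_toy(kappa=1.0, num_tau_prime=8, num_tau=16):
    import torch
    bellman_errors = torch.randn(1, num_tau_prime, num_tau)
    tau = torch.rand(1, 1, num_tau).repeat(1, num_tau_prime, 1)
    # elementwise Huber loss with threshold kappa
    huber = torch.where(bellman_errors.abs() <= kappa,
                        0.5 * bellman_errors ** 2,
                        kappa * (bellman_errors.abs() - kappa / 2))
    # quantile weighting: |tau - 1{bellman_error < 0}|
    loss = (tau - (bellman_errors < 0).float()).abs() * huber / kappa
    return loss.sum(dim=2).mean(dim=1).mean()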
def rainbow( env_fn, dueling_dqn=False, double_dqn=False, noisy=False, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), Vmin=-100.0, # hyperparameters for not-atari env Vmax=100.0, # hyperparameters for not-atari env num_atoms=50, # hyperparameters for not-atari env gamma=0.99, min_replay_history=20000, prioritized_replay_alpha=0.6, beta_start=0.4, beta_frames=10000, epsilon_decay_period=250000, epsilon_train=0.01, epsilon_eval=0.001, lr=1e-3, clip_grad_norm=5.0, max_ep_len=1000, update_period=4, target_update_period=8000, batch_size=100, logger_kwargs=dict(), save_freq=1, ): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = 1 # env.action_space.shape # Share information with policy architecture ac_kwargs['use_noisy_layer'] = noisy ac_kwargs['action_space'] = env.action_space ac_kwargs['num_atoms'] = num_atoms ac_kwargs['Vmin'] = Vmin ac_kwargs['Vmax'] = Vmax dqnetwork = core.CategoricalDuelingDQNetwork if dueling_dqn else core.CategoricalDQNetwork # Main computation graph main = dqnetwork(in_features=obs_dim, **ac_kwargs) # Target network target = dqnetwork(in_features=obs_dim, **ac_kwargs) # C51 stuffs supports = torch.linspace(Vmin, Vmax, num_atoms) delta_z = (Vmax - Vmin) / (num_atoms - 1) # Experience buffer replay_buffer = PrioritizedReplayBuffer(replay_size, prioritized_replay_alpha) # Count variables if dueling_dqn: var_counts = tuple( core.count_vars(module) for module in [main.enc, main.v, main.a, main]) print(( '\nNumber of parameters: \t encoder: %d, \t value head: %d \t advantage head: %d \t total: %d\n' ) % var_counts) else: var_counts = tuple( core.count_vars(module) for module in [main.q, main]) print( ('\nNumber of parameters: \t q: %d, \t total: %d\n') % var_counts) # Value train op value_params = main.parameters() value_optimizer = torch.optim.Adam(value_params, lr=lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) def get_action(o, epsilon=None): """Select an action from the set of available actions. Chooses an action randomly with probability epsilon otherwise act greedily according to the current Q-value estimates. 
""" if epsilon is not None and np.random.random() <= epsilon: return env.action_space.sample() else: return main.policy(torch.Tensor(o.reshape(1, -1))).item() def test_agent(n=10): epsilon_eval = None if noisy else epsilon_eval main.eval() for _ in range(n): o, r, done, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (done or (ep_len == max_ep_len)): # epsilon_eval used when evaluating the agent o, r, done, _ = test_env.step(get_action(o, epsilon_eval)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def update(t): main.train() beta = core.beta_by_frame(t, beta_start, beta_frames) batch = replay_buffer.sample_batch(batch_size, beta) (obs1, obs2, acts, rews, done, weights, idxes) = (torch.Tensor(batch['obs1']), torch.Tensor(batch['obs2']), torch.LongTensor(batch['acts']), torch.Tensor(batch['rews']), torch.Tensor(batch['done']), torch.Tensor(batch['weights']), batch['idxes']) # compute target distribution bsz = obs1.size(0) with torch.no_grad(): if noisy: target.reset_noise() pns = target(obs2) # (bsz, act_dim, num_atoms) if double_dqn: next_act_idx = (main(obs2) * supports.expand_as(pns)).sum(-1).argmax(-1) else: dist = supports.expand_as(pns) * pns next_act_idx = dist.sum(-1).argmax(-1) # (bsz) pns_a = pns[range(bsz), next_act_idx] # (bsz, num_atoms) rews = rews.unsqueeze(1) # (bsz, 1) done = done.unsqueeze(1) # (bsz, 1) # (bsz, num_atoms) for all in this block Tz = rews + (1 - done) * gamma * supports.unsqueeze(0) Tz = Tz.clamp(min=Vmin, max=Vmax) b = (Tz - Vmin) / delta_z l, u = b.floor().long(), b.ceil().long() # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (num_atoms - 1)) * (l == u)] += 1 offset = torch.linspace(0, (bsz - 1) * num_atoms, bsz) offset = offset.unsqueeze(1).expand(bsz, num_atoms).long() m = torch.zeros([bsz, num_atoms]) m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) log_dist1 = main(obs1, log=True) # (bsz, action_dim, num_atoms) acts = acts.squeeze(1) # (bsz) log_dist1 = log_dist1[range(bsz), acts] # (bsz, num_atoms) loss = -(m * log_dist1).sum(-1) * weights # (bsz) priorities = loss.detach().numpy() + 1e-5 loss = loss.mean() value_optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(main.parameters(), clip_grad_norm) value_optimizer.step() # replay buffer update replay_buffer.update_priorities(idxes, priorities) return loss.item(), pns_a.numpy() start_time = time.time() o, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): main.train() # enable NoisyNet exploration # the epsilon value used for exploration during training epsilon = core.linearly_decaying_epsilon( epsilon_decay_period, t, min_replay_history, epsilon_train) if noisy else None with torch.no_grad(): a = get_action(o, epsilon) # Step the env o2, r, done, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) done = False if ep_len == max_ep_len else done # Store experience to replay buffer replay_buffer.store(o, a, r, o2, done) # Super critical, easy to overlook step: make sure to update # most recent observation! 
        o = o2

        if done or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # resample the NoisyNet parameter noise at the update rate
        if noisy and t % update_period == 0:
            main.reset_noise()

        # train at the rate of update_period if enough training steps have been run
        if len(replay_buffer) > min_replay_history and t % update_period == 0:
            value_loss, QDist = update(t)
            logger.store(LossQ=value_loss, QVals=QDist.sum(-1))

        # sync weights from the online to the target network
        if t % target_update_period == 0:
            target.load_state_dict(main.state_dict())

        # End of epoch wrap-up
        if len(replay_buffer) > min_replay_history and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
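# The DQN-family loops above call two schedule helpers, core.linearly_decaying_epsilon
# and core.beta_by_frame, whose bodies are not shown in this file. The sketch below is
# a plausible, Dopamine-style implementation consistent with how they are called here;
# the exact bodies in `core` may differ.
import numpy as np


def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    """Linearly anneal exploration from 1.0 down to `epsilon`.

    Returns 1.0 for the first `warmup_steps` steps, then decays linearly to
    `epsilon` over `decay_period` steps and stays there.
    """
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = float(np.clip(bonus, 0.0, 1.0 - epsilon))
    return epsilon + bonus


def beta_by_frame(frame_idx, beta_start, beta_frames):
    """Anneal the prioritized-replay importance-sampling exponent from
    `beta_start` to 1.0 over `beta_frames` environment steps."""
    return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)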
def c51( env_fn, dqnetwork=core.CategoricalDQNetwork, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), Vmin=-100.0, # hyperparameters for not-atari env Vmax=100.0, # hyperparameters for not-atari env num_atoms=50, # hyperparameters for not-atari env gamma=0.99, min_replay_history=20000, epsilon_decay_period=250000, epsilon_train=0.01, epsilon_eval=0.001, lr=1e-3, grad_clip=5.0, max_ep_len=1000, update_period=4, target_update_period=8000, batch_size=100, logger_kwargs=dict(), save_freq=1, ): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = 1 # env.action_space.shape # Share information with policy architecture ac_kwargs["action_space"] = env.action_space ac_kwargs["num_atoms"] = num_atoms ac_kwargs["Vmin"] = Vmin ac_kwargs["Vmax"] = Vmax # Main computation graph main = dqnetwork(in_features=obs_dim, **ac_kwargs) # Target network target = dqnetwork(in_features=obs_dim, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # C51 stuffs supports = torch.linspace(Vmin, Vmax, num_atoms) delta_z = (Vmax - Vmin) / (num_atoms - 1) # Count variables var_counts = tuple(core.count_vars(module) for module in [main.q, main]) print(("\nNumber of parameters: \t q: %d, \t total: %d\n") % var_counts) # Value train op value_params = main.q.parameters() value_optimizer = torch.optim.Adam(value_params, lr=lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) def get_action(o, epsilon): """Select an action from the set of available actions. Chooses an action randomly with probability epsilon otherwise act greedily according to the current Q-value estimates. 
""" if np.random.random() <= epsilon: return env.action_space.sample() else: return main.policy(torch.Tensor(o.reshape(1, -1))).item() def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # epsilon_eval used when evaluating the agent o, r, d, _ = test_env.step(get_action(o, epsilon_eval)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def update(): main.train() batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts, rews, done) = ( torch.Tensor(batch["obs1"]), torch.Tensor(batch["obs2"]), torch.LongTensor(batch["acts"]), # (bsz, 1) torch.Tensor(batch["rews"]), # (bsz) torch.Tensor(batch["done"]), # (bsz) ) # compute target distribution bsz = obs1.size(0) with torch.no_grad(): out2 = target(obs2) # (bsz, act_dim, num_atoms) dist2 = supports.expand_as(out2) * out2 act2_idx = dist2.sum(-1).argmax(-1) # (bsz) dist2_a = out2[range(bsz), act2_idx] # (bsz, num_atoms) rews = rews.unsqueeze(1) # (bsz, 1) done = done.unsqueeze(1) # (bsz, 1) # (bsz, num_atoms) for all in this block Tz = rews + (1 - done) * gamma * supports.unsqueeze(0) Tz = Tz.clamp(min=Vmin, max=Vmax) b = (Tz - Vmin) / delta_z l, u = b.floor().long(), b.ceil().long() # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (num_atoms - 1)) * (l == u)] += 1 offset = torch.linspace(0, (bsz - 1) * num_atoms, bsz) offset = offset.unsqueeze(1).expand(bsz, num_atoms).long() m = torch.zeros([bsz, num_atoms]) m.view(-1).index_add_(0, (l + offset).view(-1), (dist2_a * (u.float() - b)).view(-1)) m.view(-1).index_add_(0, (u + offset).view(-1), (dist2_a * (b - l.float())).view(-1)) log_dist1 = main(obs1, log=True) # (bsz, action_dim, num_atoms) acts = acts.squeeze(1) # (bsz) log_dist1 = log_dist1[range(bsz), acts] # (bsz, num_atoms) loss = -(m * log_dist1).sum(-1) # (bsz) loss = loss.mean() value_optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(main.parameters(), grad_clip) value_optimizer.step() return loss.item(), (dist2_a * supports.expand_as(dist2_a)).numpy() start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): main.eval() # the epsilon value used for exploration during training epsilon = core.linearly_decaying_epsilon(epsilon_decay_period, t, min_replay_history, epsilon_train) a = get_action(o, epsilon) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # train at the rate of update_period if enough training steps have been run if replay_buffer.size > min_replay_history and t % update_period == 0: loss, QDist = update() logger.store(LossQ=loss, QVals=QDist.sum(-1)) # syncs weights from online to target network if t % target_update_period == 0: target.load_state_dict(main.state_dict()) # End of epoch wrap-up if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({"env": env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular("Epoch", epoch) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("TestEpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("TestEpLen", average_only=True) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular("LossQ", average_only=True) logger.log_tabular("QVals", with_min_and_max=True) logger.log_tabular("Time", time.time() - start_time) logger.dump_tabular()
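# The core of the c51 (and rainbow) update above is the projection of the Bellman-shifted
# support r + gamma * (1 - d) * z back onto the fixed atoms. For reference, here is that
# step factored out into a standalone helper; `project_distribution` is a hypothetical
# name (not part of `core`), but the body mirrors the index_add_ computation in update().
import torch


def project_distribution(next_dist, rews, done, supports, gamma, Vmin, Vmax):
    """Categorical projection (C51). Shapes: next_dist (bsz, N), rews/done (bsz,),
    supports (N,). Returns the projected target distribution m of shape (bsz, N)."""
    bsz, num_atoms = next_dist.shape
    delta_z = (Vmax - Vmin) / (num_atoms - 1)

    Tz = rews.unsqueeze(1) + (1 - done.unsqueeze(1)) * gamma * supports.unsqueeze(0)
    Tz = Tz.clamp(min=Vmin, max=Vmax)      # keep the shifted atoms inside [Vmin, Vmax]
    b = (Tz - Vmin) / delta_z              # fractional atom index in [0, N - 1]
    l, u = b.floor().long(), b.ceil().long()
    # keep the probability mass when Tz lands exactly on an atom (l == b == u)
    l[(u > 0) & (l == u)] -= 1
    u[(l < (num_atoms - 1)) & (l == u)] += 1

    # distribute each atom's probability onto its two neighbouring target atoms
    m = torch.zeros(bsz, num_atoms)
    offset = torch.arange(bsz).unsqueeze(1) * num_atoms  # flat index of each batch row
    m.view(-1).index_add_(0, (l + offset).view(-1),
                          (next_dist * (u.float() - b)).view(-1))
    m.view(-1).index_add_(0, (u + offset).view(-1),
                          (next_dist * (b - l.float())).view(-1))
    return m

# Inside update(), the no_grad target block would then reduce to
# m = project_distribution(dist2_a, rews.squeeze(1), done.squeeze(1), supports, gamma, Vmin, Vmax).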
def vpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's main model which is composed of the policy and value function model, where the policy takes some state, ``x``, and action, ``a``, and value function takes the state ``x`` and returns a tuple of: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure | to flatten this via .item()!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic class you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Main model actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple( core.count_vars(module) for module in [actor_critic.policy, actor_critic.value_function]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Optimizers train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) train_v = torch.optim.Adam(actor_critic.value_function.parameters(), lr=vf_lr) # Sync params across processes sync_all_params(actor_critic.parameters()) def update(): obs, act, adv, ret, logp_old = [torch.Tensor(x) for x in buf.get()] # Policy gradient step _, logp, _ = actor_critic.policy(obs, act) ent = (-logp).mean() # a sample estimate for entropy # VPG policy objective pi_loss = -(logp * adv).mean() # Policy gradient step train_pi.zero_grad() pi_loss.backward() average_gradients(train_pi.param_groups) train_pi.step() # Value function learning v = actor_critic.value_function(obs) v_l_old = F.mse_loss(v, ret) for _ in range(train_v_iters): # Output from value function graph v = actor_critic.value_function(obs) # VPG value objective v_loss = F.mse_loss(v, ret) # Value function gradient step train_v.zero_grad() v_loss.backward() average_gradients(train_v.param_groups) train_v.step() # Log changes from update _, logp, _, v = actor_critic(obs, act) pi_l_new = -(logp * adv).mean() v_l_new = F.mse_loss(v, ret) kl = (logp_old - logp).mean() # a sample estimate for KL-divergence logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_loss), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): actor_critic.eval() for t in range(local_steps_per_epoch): a, _, logp_t, v_t = actor_critic(torch.Tensor(o.reshape(1, -1))) # save and log buf.store(o, a.detach().numpy(), r, v_t.item(), logp_t.detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.detach().numpy()[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else actor_critic.value_function( torch.Tensor(o.reshape(1, -1))).item() buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, actor_critic, None) # Perform VPG update! 
actor_critic.train() update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
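# VPGBuffer.finish_path (defined elsewhere in this repo) is where the GAE-lambda
# advantages and the rewards-to-go value targets are computed. The sketch below shows
# the computation it is expected to perform for a single finished trajectory, assuming
# the usual spinning-up convention of appending the bootstrap value `last_val` to both
# the reward and value sequences; the actual buffer code may differ in details.
import numpy as np


def discount_cumsum(x, discount):
    """Reverse discounted cumulative sum:
    out[i] = x[i] + discount * x[i + 1] + discount**2 * x[i + 2] + ..."""
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + discount * running
        out[i] = running
    return out


def gae_advantages_and_returns(rews, vals, gamma, lam):
    """rews, vals: arrays of length T + 1 (trajectory plus appended bootstrap value).
    Returns GAE-lambda advantages and discounted rewards-to-go, both of length T."""
    rews = np.asarray(rews, dtype=np.float64)
    vals = np.asarray(vals, dtype=np.float64)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # one-step TD residuals
    adv = discount_cumsum(deltas, gamma * lam)          # GAE-lambda advantage estimates
    ret = discount_cumsum(rews, gamma)[:-1]             # targets for the value function
    return adv, ret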
def qr_dqn( env_fn, dqnetwork=core.DQNetwork, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), Vmin=-10.0, # hyperparameters for not-atari env Vmax=10.0, # hyperparameters for not-atari env num_quantiles=50, # hyperparameters for not-atari env gamma=0.99, min_replay_history=20000, epsilon_decay_period=250000, epsilon_train=0.01, epsilon_eval=0.001, lr=1e-3, max_ep_len=1000, update_period=4, target_update_period=8000, batch_size=100, logger_kwargs=dict(), save_freq=1, ): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = 1 # env.action_space.shape # Share information about action space with policy architecture ac_kwargs["action_space"] = env.action_space # Main computation graph main = dqnetwork(in_features=obs_dim, **ac_kwargs) # Target network target = dqnetwork(in_features=obs_dim, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(module) for module in [main.q, main]) print(("\nNumber of parameters: \t q: %d, \t total: %d\n") % var_counts) # Value train op value_params = main.q.parameters() value_optimizer = torch.optim.Adam(value_params, lr=lr) # Initializing targets to match main variables target.load_state_dict(main.state_dict()) # Quantile regression stuffs k = 1.0 def huber(x): return torch.where(x.abs() < k, x**2 / 2, k * (x.abs() - k / 2)) tau = torch.Tensor( (2 * np.arange(num_quantiles) + 1) / (2.0 * num_quantiles)).view( 1, -1) def get_action(o, epsilon): """Select an action from the set of available actions. Chooses an action randomly with probability epsilon otherwise act greedily according to the current Q-value estimates. 
""" if np.random.random() <= epsilon: return env.action_space.sample() else: return main.policy(torch.Tensor(o.reshape(1, -1))).item() def test_agent(n=10): for _ in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # epsilon_eval used when evaluating the agent o, r, d, _ = test_env.step(get_action(o, epsilon_eval)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def update(): main.train() batch = replay_buffer.sample_batch(batch_size) (obs1, obs2, acts, rews, done) = ( torch.Tensor(batch["obs1"]), torch.Tensor(batch["obs2"]), torch.LongTensor(batch["acts"]), # (bsz, 1) torch.Tensor(batch["rews"]), # (bsz) torch.Tensor(batch["done"]), # (bsz) ) bsz = obs1.size(0) q_dist1 = main(obs1) # (bsz, action_dim, num_quantiles) acts = acts.squeeze(1) # (bsz) q_dist1 = q_dist1[range(bsz), acts] # (bsz, num_atoms) q_dist2 = target(obs2).detach() act_idx2 = q_dist2.mean(-1).argmax(-1) # (bsz) # act_idx2 = main(obs2).mean(-1).argmax(-1) # double dqn q_dist2 = q_dist2[range(bsz), act_idx2] # (bsz, num_quantiles) rews = rews.unsqueeze(1) # (bsz, 1) done = done.unsqueeze(1) # (bsz, 1) T_theta = rews + (1 - done) * gamma * q_dist2 diff = T_theta.t().unsqueeze(-1) - q_dist1 loss = huber(diff) * (tau - (diff.detach() < 0).float()).abs() loss = loss.mean() value_optimizer.zero_grad() loss.backward() value_optimizer.step() return loss.item(), q_dist2.detach().numpy() start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): main.eval() # the epsilon value used for exploration during training epsilon = core.linearly_decaying_epsilon(epsilon_decay_period, t, min_replay_history, epsilon_train) a = get_action(o, epsilon) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # train at the rate of update_period if enough training steps have been run if replay_buffer.size > min_replay_history and t % update_period == 0: loss, QDist = update() logger.store(LossQ=loss, QVals=QDist.mean(-1)) # syncs weights from online to target network if t % target_update_period == 0: target.load_state_dict(main.state_dict()) # End of epoch wrap-up if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({"env": env}, main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular("Epoch", epoch) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("TestEpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("TestEpLen", average_only=True) logger.log_tabular("TotalEnvInteracts", t) logger.log_tabular("LossQ", average_only=True) logger.log_tabular("QVals", with_min_and_max=True) logger.log_tabular("Time", time.time() - start_time) logger.dump_tabular()