def ppo(task, actor_critic=model.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, lr=3e-4, v_loss_coeff=0.5, train_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, wrapper_type="continuous_absolute", log_wandb=False): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. 
lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = dm_construction.get_environment(task, wrapper_type=wrapper_type) obs_dim = env.observation_spec().shape if wrapper_type == "continuous_absolute": act_dim = 4 # for continuous absolute action space else: raise NotImplementedError # Create actor-critic module ac = actor_critic(env.observation_spec(), env.action_spec(), **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = count_vars(ac.ac) logger.log(f"\nNumber of parameters: \t {var_counts}") # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) def compute_loss(data): obs, act, adv, logp_old, ret = data['obs'], data['act'], data[ 'adv'], data['logp'], data['ret'] pi, v, logp = ac.ac(obs, act) # value loss (just MSE) loss_v = ((v - ret)**2).mean() # policy loss ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # useful extra info re: policy approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_v, loss_pi, pi_info # Set up optimizers for policy and value function optimizer = Adam(ac.ac.parameters(), lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() v_l_old, pi_l_old, pi_info_old = compute_loss(data) pi_l_old = pi_l_old.item() vl_l_old = v_l_old.item() # Train policy with multiple steps of gradient descent for i in range(train_iters): optimizer.zero_grad() loss_v, loss_pi, pi_info = compute_loss(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( f'Early stopping at step {i} due to reaching max kl.') break loss = loss_pi + loss_v * v_loss_coeff loss.backward() mpi_avg_grads(ac.ac) # average grads across MPI processes optimizer.step() logger.store(StopIter=i) # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() timestep, ep_ret, ep_len = env.reset(difficulty=0), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): encountered_terminal = False for t in range(local_steps_per_epoch): # assumes obs is an rgb array: rescale to [0, 1] o = timestep.observation / 255.0 a, v, logp = ac.step(o) next_timestep = env.step(ac.action_to_dict(a, 
rescale=True)) r = timestep.reward d = next_timestep.last( ) # TODO: check if r, d are assoc w/ correct timestep ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # TODO debugging logger.store(AHor=a[0]) logger.store(AVer=a[1]) logger.store(ASel=a[3]) # Update obs (critical!) timestep = next_timestep timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print( f'Warning: trajectory cut off by epoch at {ep_len} steps.', flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(timestep.observation / 255.0) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished. logger.store(EpRet=ep_ret, EpLen=ep_len) encountered_terminal = True timestep, ep_ret, ep_len = env.reset(difficulty=0), 0, 0 # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) if encountered_terminal: # Note, if local_steps_per_epoch is too small so no terminal state # has been encountered, then ep_ret and ep_len will not # be stored before call to log_tabular, resulting in error. logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) # TODO debugging logger.log_tabular('AHor', with_min_and_max=True) logger.log_tabular('AVer', with_min_and_max=True) logger.log_tabular('ASel', with_min_and_max=True) # Save model if (epoch % save_freq == 0 and epoch > 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) if proc_id() == 0 and log_wandb: # Save the model parameters to wandb every save_freq epoch # instead of waiting till the end state = { 'epoch': epoch, 'ac_state_dict': ac.ac.state_dict(), 'optimizer': optimizer.state_dict(), } # output the model in the wandb.run.dir to avoid problems # syncing the model in the cloud with wandb's files state_fname = os.path.join(wandb.run.dir, "state_dict.pt") torch.save(state, state_fname) if proc_id() == 0 and log_wandb: wandb.log(logger.log_current_row, step=epoch) logger.dump_tabular()
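# ---------------------------------------------------------------------------
# The ppo() trainer above assumes a PPOBuffer class (constructed as
# PPOBuffer(obs_dim, act_dim, size, gamma, lam)) with store/finish_path/get
# methods, whose definition is not shown in this file.  The sketch below is a
# minimal, illustrative version in the style of the Spinning Up buffer: it
# computes GAE-lambda advantages and reward-to-go returns, and normalizes
# advantages with plain numpy statistics (the real class may instead use
# MPI-averaged statistics when running with multiple processes).
# ---------------------------------------------------------------------------
import numpy as np
import scipy.signal
import torch


def discount_cumsum(x, discount):
    # Discounted cumulative sum along a trajectory:
    # out[t] = x[t] + discount * x[t+1] + discount^2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class PPOBuffer:
    """Fixed-size rollout buffer with GAE-lambda advantage estimation (sketch)."""

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.97):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size  # buffer has to have room
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        # last_val bootstraps the return when a trajectory is cut off early.
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # GAE-lambda advantage estimates and rewards-to-go value targets.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        assert self.ptr == self.max_size  # buffer must be full before an update
        self.ptr, self.path_start_idx = 0, 0
        # Advantage normalization (plain numpy here; MPI statistics in practice).
        adv_mean, adv_std = self.adv_buf.mean(), self.adv_buf.std()
        self.adv_buf = (self.adv_buf - adv_mean) / (adv_std + 1e-8)
        data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                    adv=self.adv_buf, logp=self.logp_buf)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}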
def td3(env_fn: Callable, actor_critic: torch.nn.Module = core.MLPActorCritic, ac_kwargs: Dict = None, seed: int = 0, steps_per_epoch: int = 4000, epochs: int = 2000, replay_size: int = int(1e6), gamma: float = 0.99, polyak: float = 0.995, pi_lr: Union[Callable, float] = 1e-3, q_lr: Union[Callable, float] = 1e-3, batch_size: int = 100, start_steps: int = 10000, update_after: int = 1000, update_every: int = 100, act_noise: Union[Callable, float] = 0.1, target_noise: float = 0.2, noise_clip: float = 0.5, policy_delay: int = 2, num_test_episodes: int = 3, max_ep_len: int = 1000, logger_kwargs: Dict = None, save_freq: int = 1, random_exploration: Union[Callable, float] = 0.0, save_checkpoint_path: str = None, load_checkpoint_path: str = None, load_model_file: str = None): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float or callable): Learning rate for policy. q_lr (float or callable): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float or callable): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) 
target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. random_exploration (float or callable): Probability to randomly select an action instead of selecting from policy. save_checkpoint_path (str): Path to save the model. If not set, no model will be saved load_checkpoint_path (str): Path to load the model. Cannot be set if save_model_path is set. """ if logger_kwargs is None: logger_kwargs = dict() if ac_kwargs is None: ac_kwargs = dict() if save_checkpoint_path is not None: assert load_checkpoint_path is None, "load_model_path cannot be set when save_model_path is already set" if not os.path.exists(save_checkpoint_path): print(f"Folder {save_checkpoint_path} does not exist, creating...") os.makedirs(save_checkpoint_path) if load_checkpoint_path is not None: assert load_model_file is None, "load_checkpoint_path cannot be set when load_model_file is already set" # ------------ Initialisation begin ------------ loaded_state_dict = None if load_checkpoint_path is not None: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) loaded_state_dict = load_latest_state_dict(load_checkpoint_path) logger.epoch_dict = loaded_state_dict['logger_epoch_dict'] q_learning_rate_fn = loaded_state_dict['q_learning_rate_fn'] pi_learning_rate_fn = loaded_state_dict['pi_learning_rate_fn'] epsilon_fn = loaded_state_dict['epsilon_fn'] act_noise_fn = loaded_state_dict['act_noise_fn'] replay_buffer = loaded_state_dict['replay_buffer'] env, test_env = loaded_state_dict['env'], loaded_state_dict['test_env'] ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) ac.load_state_dict(loaded_state_dict['ac']) ac_targ.load_state_dict(loaded_state_dict['ac_targ']) obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] env.action_space.np_random.set_state( loaded_state_dict['action_space_state']) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) t_ori = loaded_state_dict['t'] pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(t_ori)) pi_optimizer.load_state_dict(loaded_state_dict['pi_optimizer']) q_optimizer = Adam(q_params, lr=q_learning_rate_fn(t_ori)) q_optimizer.load_state_dict(loaded_state_dict['q_optimizer']) np.random.set_state(loaded_state_dict['np_rng_state']) torch.set_rng_state(loaded_state_dict['torch_rng_state']) else: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) q_learning_rate_fn = get_schedule_fn(q_lr) pi_learning_rate_fn = get_schedule_fn(pi_lr) act_noise_fn = get_schedule_fn(act_noise) epsilon_fn = get_schedule_fn(random_exploration) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] env.action_space.seed(seed) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Create 
actor-critic module and target networks if load_model_file is not None: assert os.path.exists( load_model_file ), f"Model file path does not exist: {load_model_file}" ac = torch.load(load_model_file) else: ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(0)) q_optimizer = Adam(q_params, lr=q_learning_rate_fn(0)) t_ori = 0 act_limit = 1.0 # ------------ Initialisation end ------------ # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) torch.set_printoptions(profile="default") # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): pi_targ = ac_targ.pi(o2) # Target policy smoothing epsilon = torch.randn_like(pi_targ) * target_noise epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = torch.clamp(a2, -act_limit, act_limit) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = data['obs'] q1_pi = ac.q1(o, ac.pi(o)) return -q1_pi.mean() # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for _ in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) scaled_action = get_action(o, 0) o, r, d, _ = test_env.step( unscale_action(env.action_space, scaled_action)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() if loaded_state_dict is not None: o = loaded_state_dict['o'] ep_ret = loaded_state_dict['ep_ret'] ep_len = loaded_state_dict['ep_len'] else: o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): t += t_ori # printMemUsage(f"start of step {t}") # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps and np.random.rand() > epsilon_fn(t): a = get_action(o, act_noise_fn(t)) unscaled_action = unscale_action(env.action_space, a) else: unscaled_action = env.action_space.sample() a = scale_action(env.action_space, unscaled_action) # Step the env o2, r, d, _ = env.step(unscaled_action) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: # Perform LR decay update_learning_rate(q_optimizer, q_learning_rate_fn(t)) update_learning_rate(pi_optimizer, pi_learning_rate_fn(t)) epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Save model and checkpoint save_checkpoint = False checkpoint_path = "" if save_checkpoint_path is not None: save_checkpoint = True checkpoint_path = save_checkpoint_path if load_checkpoint_path is not None: save_checkpoint = True checkpoint_path = load_checkpoint_path if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({}, None) if save_checkpoint: checkpoint_file = os.path.join(checkpoint_path, f'save_{epoch}.pt') torch.save( { 'ac': ac.state_dict(), 'ac_targ': ac_targ.state_dict(), 'replay_buffer': replay_buffer, 'pi_optimizer': pi_optimizer.state_dict(), 'q_optimizer': q_optimizer.state_dict(), 'logger_epoch_dict': logger.epoch_dict, 'q_learning_rate_fn': q_learning_rate_fn, 'pi_learning_rate_fn': pi_learning_rate_fn, 'epsilon_fn': epsilon_fn, 'act_noise_fn': act_noise_fn, 'torch_rng_state': torch.get_rng_state(), 'np_rng_state': np.random.get_state(), 'action_space_state': env.action_space.np_random.get_state(), 'env': env, 'test_env': test_env, 'ep_ret': ep_ret, 'ep_len': ep_len, 'o': o, 't': t + 1 }, checkpoint_file) delete_old_files(checkpoint_path, 10)
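# ---------------------------------------------------------------------------
# The checkpointable td3() above treats pi_lr, q_lr, act_noise and
# random_exploration as either constants or schedules via get_schedule_fn, and
# applies learning-rate decay through update_learning_rate.  Those helpers are
# defined elsewhere; the sketch below captures the behaviour assumed by the
# calls above (constants wrapped into callables of the step count, callables
# passed through unchanged).  It is illustrative, not the original code.
# ---------------------------------------------------------------------------
def get_schedule_fn(value_or_fn):
    """Wrap a constant into a schedule; pass callables through unchanged."""
    if callable(value_or_fn):
        return value_or_fn
    return lambda step: float(value_or_fn)


def update_learning_rate(optimizer, new_lr):
    """Set the learning rate of every parameter group of a torch optimizer."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr


# Example: a linearly decaying exploration-noise schedule could be passed as
#   td3(env_fn, act_noise=lambda t: max(0.01, 0.1 * (1 - t / 1e6)), ...)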
def egl(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.4, n_explore=32, device='cuda', architecture='mlp', sample='on_policy'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ if architecture == 'mlp': actor_critic = core.MLPActorCritic elif architecture == 'spline': actor_critic = core.SplineActorCritic else: raise NotImplementedError device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps]) logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n' % var_counts) n_samples = 100 cmin = 0.25 cmax = 1.75 greed = 0.01 rand = 0.01 def max_reroute(o): b, _ = o.shape o = repeat_and_reshape(o, n_samples) with torch.no_grad(): ai, _ = ac.pi(o) q1 = ac.q1(o, ai) q2 = ac.q2(o, ai) qi = torch.min(q1, q2).unsqueeze(-1) qi = qi.view(n_samples, b, 1) ai = ai.view(n_samples, b, act_dim) rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False) w = cmin * torch.ones_like(ai) m = int((1 - cmin) * n_samples / (cmax - cmin)) w += (cmax - cmin) * (rank < m).float() w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float() w -= greed w += greed * n_samples * (rank == 0).float() w = w * (1 - rand) + rand w = w / w.sum(dim=0, keepdim=True) prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0)) a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2) return a, (ai, w.mean(-1)) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # # Set up function for computing EGL mean-gradient-losses # def compute_loss_g(data): # # o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # # a2 = ball_explore(a1, 
n_explore, eps) # # a2 = a2.view(n_explore * len(r), act_dim) # o_expand = repeat_and_reshape(o, n_explore) # # # Bellman backup for Q functions # with torch.no_grad(): # # q1 = ac.q1(o_expand, a2) # q2 = ac.q2(o_expand, a2) # q_dither = torch.min(q1, q2) # # # Target actions come from *current* policy # a_tag, logp_a_tag = ac.pi(o_tag) # # # Target Q-values # q1_pi_targ = ac_targ.q1(o_tag, a_tag) # q2_pi_targ = ac_targ.q2(o_tag, a_tag) # q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) # q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag) # # q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) # # geps = ac.geps(o, a1) # geps = repeat_and_reshape(geps, n_explore) # a1 = repeat_and_reshape(a1, n_explore) # # geps = (geps * (a2 - a1)).sum(-1) # # l1 loss against Bellman backup # # loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor) # # # Useful info for logging # g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) # # return loss_g, g_info # Set up function for computing EGL mean-gradient-losses def compute_loss_g(data): o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] a2 = ball_explore(a1, n_explore, eps) a2 = a2.view(n_explore * len(r), act_dim) o_expand = repeat_and_reshape(o, n_explore) # Bellman backup for Q functions with torch.no_grad(): q1 = ac.q1(o_expand, a2) q2 = ac.q2(o_expand, a2) q_dither = torch.min(q1, q2) # Target actions come from *current* policy # Target Q-values q1 = ac.q1(o, a1) q2 = ac.q2(o, a1) q_anchor = torch.min(q1, q2) q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) geps = ac.geps(o, a1) geps = repeat_and_reshape(geps, n_explore) a1 = repeat_and_reshape(a1, n_explore) geps = (geps * (a2 - a1)).sum(-1) # l1 loss against Bellman backup loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor) # Useful info for logging g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) return loss_g, g_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) geps_pi = ac.geps(o, pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean() beta = autograd.Variable(pi.detach().clone(), requires_grad=True) q1_pi = ac.q1(o, beta) q2_pi = ac.q2(o, beta) qa = torch.min(q1_pi, q2_pi).unsqueeze(-1) grad_q = autograd.grad(outputs=qa, inputs=beta, grad_outputs=torch.cuda.FloatTensor( qa.size()).fill_(1.), create_graph=False, retain_graph=False, only_inputs=True)[0] # Useful info for logging pi_info = dict( LogPi=logp_pi.detach().cpu().numpy(), GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(), GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(), GradDelta=torch.norm(geps_pi - grad_q, dim=-1).detach().cpu().numpy(), GradSim=F.cosine_similarity(geps_pi, grad_q, dim=-1).detach().cpu().numpy(), ) return loss_pi, pi_info if architecture == 'mlp': # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) g_optimizer = Adam(ac.geps.parameters(), lr=lr) elif architecture == 'spline': # Set up optimizers for policy and q-function pi_optimizer = SparseDenseAdamOptimizer(ac.pi, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2], dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) g_optimizer = SparseDenseAdamOptimizer(ac.geps, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) else: raise NotImplementedError # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # 
First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Next run one gradient descent step for the mean-gradient g_optimizer.zero_grad() loss_g, g_info = compute_loss_g(data) loss_g.backward() g_optimizer.step() # Record things logger.store(LossG=loss_g.item(), **g_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in ac.geps.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in ac.geps.parameters(): p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action_on_policy(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def get_action_rbi(o, deterministic=False): o = torch.as_tensor(o, dtype=torch.float32, device=device) if deterministic: a = ac.act(o, deterministic) else: o = o.unsqueeze(0) a, _ = max_reroute(o) a = a.flatten().cpu().numpy() return a if sample == 'on_policy': get_action = get_action_on_policy elif sample == 'rbi': get_action = get_action_rbi else: raise NotImplementedError def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('GVals', with_min_and_max=True) logger.log_tabular('LossG', with_min_and_max=True) logger.log_tabular('GradGAmp', with_min_and_max=True) logger.log_tabular('GradQAmp', with_min_and_max=True) logger.log_tabular('GradDelta', with_min_and_max=True) logger.log_tabular('GradSim', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
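# ---------------------------------------------------------------------------
# egl() above relies on two helpers that are not defined in this file:
# repeat_and_reshape, which tiles a batch n times and folds the copies into the
# batch dimension, and ball_explore, which draws n perturbed copies of each
# action inside an eps-ball to probe the Q-surface locally.  The versions below
# are sketches whose shapes are inferred from how the functions are called
# above; the noise distribution (uniform here) and the absence of clamping to
# the action bounds are assumptions, not the original implementation.
# ---------------------------------------------------------------------------
import torch


def repeat_and_reshape(x, n):
    """Tile x n times along a new leading dim, then fold it into the batch dim.

    Shape: (batch, ...) -> (n * batch, ...), copy-major ordering, so the result
    can be recovered with .view(n, batch, ...).
    """
    expanded = x.unsqueeze(0).expand(n, *x.shape)
    return expanded.reshape(n * x.shape[0], *x.shape[1:])


def ball_explore(a, n_explore, eps):
    """Sample n_explore perturbations of each action inside an eps-ball.

    Returns a tensor of shape (n_explore, batch, act_dim); the caller flattens
    it to (n_explore * batch, act_dim).
    """
    noise = eps * (2.0 * torch.rand(n_explore, *a.shape, device=a.device) - 1.0)
    return a.unsqueeze(0) + noise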
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. 
logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # Compute target actions a_next = ac_targ.pi(torch.as_tensor(o2, dtype=torch.float32)) a_next += torch.clamp(target_noise * torch.randn(act_dim), -noise_clip, noise_clip) a_next = torch.clamp(a_next, -act_limit, act_limit) # Compute targets q1 = ac_targ.q1(o2, a_next) q2 = ac_targ.q2(o2, a_next) y = r + gamma * (1 - d) * torch.min(q1, q2) # Loss function loss_q1 = ((ac.q1(o, a) - y) ** 2).mean() loss_q2 = ((ac.q2(o, a) - y) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = torch.as_tensor(data['obs'], dtype=torch.float32) loss_pi = -ac.q1(o, ac.pi(o)).mean() # Gradient ascent return loss_pi #=========================================================================# # # # All of your code goes in the space above. # # # #=========================================================================# # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(q_params, lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. 
for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
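# ---------------------------------------------------------------------------
# All of the off-policy trainers in this file (both td3 variants, egl, and the
# ddpg below) depend on a ReplayBuffer with store() and sample_batch() methods
# that is defined elsewhere.  The sketch below shows the assumed interface:
# fixed-size FIFO storage with uniform sampling.  Details such as the optional
# device argument used by egl() are omitted here.
# ---------------------------------------------------------------------------
import numpy as np
import torch


def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)


class ReplayBuffer:
    """Simple FIFO experience replay buffer for off-policy agents (sketch)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size  # overwrite oldest when full
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}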
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=1, steps_per_epoch=2000, epochs=10000, replay_size=int(1e5), gamma=0.99, polyak=0.995, pi_lr=1e-4, q_lr=1e-4, batch_size=128, start_steps=2000, update_after=1000, update_every=1000, act_noise=0.05, num_test_episodes=1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) rospy.init_node('DDPG_Train') env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] print(f"[DDPG] obs dim: {obs_dim} action dim: {act_dim}") # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # ac.apply(init_weights) ac_targ = deepcopy(ac) ac.eval() # in-active training BN print(f"[MODEL] Actor_Critic: {ac}") # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # import ipdb # ipdb.set_trace() q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.cpu().detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) def soft_target_update(): # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): o = torch.as_tensor(o, dtype=torch.float32) if o.dim() == 1: o = o.unsqueeze(0) a = ac.act(o)[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, env.act_limit_min, env.act_limit_max) def test_agent(): print("[DDPG] eval......") for j in range(num_test_episodes): o, d, ep_ret, ep_len = env.reset(), False, 0, 0 # while not(d or (ep_len == max_ep_len)): while not(d or (ep_len == 100)): # Take deterministic actions at test time (noise_scale=0) a = get_action(o, 0) print(f"[Eval] a: {a}") o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). print(f"O {o[-4]:.3f} {o[-3]:.3f} {o[-2]:.3f} {o[-1]:.3f} ") if t > start_steps: # if np.random.rand() > 0.3: a = get_action(o, act_noise) # else: # a = env.action_space.sample() else: a = env.action_space.sample() print(f't {t:7.0f} | a [{a[0]:.3f},{a[1]:.3f}]') # Step the env o2, r, d, info = env.step(a) # print(f"O {o[-4:]} |A {a} |O2 {o2[-4:]} |R {r} |D {d} |Info {info}") print(f" ------------------> R: {r:.3f}") ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): env.pause_pedsim() logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 env.unpause_pedsim() # Update handling if t >= update_after and t % update_every == 0: env.pause_pedsim() ac.train() # active training BN ac_targ.train() if torch.cuda.is_available(): ac.cuda() ac_targ.cuda() for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) if torch.cuda.is_available(): for key, value in batch.items(): batch[key] = value.cuda() update(data=batch) soft_target_update() ac.eval() ac_targ.eval() if torch.cuda.is_available(): ac.cpu() ac_targ.cpu() env.unpause_pedsim() # End of epoch handling if (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() o, d, ep_ret, ep_len = env.reset(), False, 0, 0 sec = time.time() - start_time elapsed_time = str(datetime.timedelta(seconds=sec)).split('.')[0] # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) # logger.log_tabular('Time', time.time()-start_time) logger.log_tabular('Time', elapsed_time) logger.dump_tabular()
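# The soft_target_update above applies the standard polyak rule,
#     theta_targ <- polyak * theta_targ + (1 - polyak) * theta,
# using in-place tensor ops. A minimal standalone sketch of the same rule
# (illustrative only; assumes the online and target networks share an
# architecture, as ac / ac_targ do above):
import torch
import torch.nn as nn


@torch.no_grad()
def polyak_update(net: nn.Module, target_net: nn.Module, polyak: float = 0.995):
    """Move target_net towards net by a factor (1 - polyak), in place."""
    for p, p_targ in zip(net.parameters(), target_net.parameters()):
        p_targ.data.mul_(polyak)
        p_targ.data.add_((1 - polyak) * p.data)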
def a2c(env_fn, agent: Agent, seed=0, num_cpu=1, device=torch.device("cpu"), epochs=1000, steps_per_epoch=100, gamma=0.99, use_gae=True, tau=0.95, max_grad_norm=0.5, polyak=0.995, learning_rate=1e-3, value_loss_coef=0.5, policy_loss_coef=1, entropy_loss_coef=0.1, grid_layer_weight_reg_loss_coef=1e-4, save_every=100, log_every=10, logger_kwargs=dict(), test_every=100, num_test_episodes=5, deterministic=False, save_freq=1, solved_score=None, render=False, ): use_MPI = num_cpu > 1 if use_MPI: # Special function to avoid certain slowdowns from PyTorch + MPI combo. mpi_pytorch.setup_pytorch_for_mpi() else: torch.set_num_threads(torch.get_num_threads()) # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) config = locals() del config['env_fn'] del config['agent'] del config['logger'] logger.save_config(config) test_logger_kwargs = deepcopy(logger_kwargs) test_logger_kwargs['output_dir'] = pathlib.Path(test_logger_kwargs['output_dir']) / 'evaluation' test_logger = EpochLogger(**test_logger_kwargs) # Random seed if use_MPI: seed += 10000 * mpi_tools.proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() assert env.max_episode_steps > 0 obs_shape = env.observation_space.shape act_dim = env.action_space.n # training model and target model target_agent = deepcopy(agent) if use_MPI: # Sync params across processes mpi_pytorch.sync_params(agent) mpi_pytorch.sync_params(target_agent) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in target_agent.parameters(): p.requires_grad = False # Utilize GPU agent.to(device) target_agent.to(device) # Set up optimizers for policy and q-function optimizer = Adam(agent.parameters(), lr=learning_rate) # Set up model saving logger.setup_pytorch_saver(agent, name='model') def update(episode_buffer): # Update if episode_buffer.dones[-1]: next_value = 0.0 else: last_obs = episode_buffer.next_observations[-1] previous_reward = episode_buffer.rewards[-1] last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0) previous_reward_tensor = torch.tensor([previous_reward], dtype=torch.float32).unsqueeze(0) context = agent.get_context() next_value = target_agent.predict_value(obs_tensor=last_obs_tensor, previous_reward_tensor=previous_reward_tensor, goal_grid_code_tensor=goal_grid_code_tensor, context=context).cpu().item() # Super critical!! optimizer.zero_grad() # Compute value and policy losses loss, info = agent.compute_loss(rewards=np.array(episode_buffer.rewards), dones=np.array(episode_buffer.dones), next_value=next_value, discount_factor=gamma, use_gae=use_gae, tau=tau, value_loss_coef=value_loss_coef, policy_loss_coef=policy_loss_coef, entropy_reg_coef=entropy_loss_coef, grid_layer_wreg_loss_coef=grid_layer_weight_reg_loss_coef) loss.backward() if use_MPI: mpi_pytorch.mpi_avg_grads(agent) # Optimize if max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm) optimizer.step() # Log losses and info logger.store(**info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(agent.parameters(), target_agent.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) if use_MPI: mpi_pytorch.sync_params(target_agent) # Prepare for interaction with environment start_time = time.time() # Main loop: collect experience in env and update/log each epoch total_steps = 0 # Reset env obs = env.reset() reward = 0 goal_grid_code_tensor = None # Reset episode stats episode_return = 0 episode_length = 0 for epoch in range(1, epochs + 1): agent.reset_for_training() epoch_history = EpisodeHistory() for t in range(steps_per_epoch): total_steps += 1 # Get action from the model obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) previous_reward_tensor = torch.tensor([reward], dtype=torch.float32).unsqueeze(0) action = agent.step(obs_tensor, previous_reward_tensor, goal_grid_code_tensor).squeeze(0) # Step the env obs2, reward, done, _ = env.step(action.detach().cpu().item()) if render and mpi_tools.proc_id() == 0: env.render('human', view='top') time.sleep(1e-3) episode_return += reward episode_length += 1 # Store transition to history epoch_history.store(observation=None, action=None, reward=reward, done=done, next_observation=obs2) # Super critical, easy to overlook step: make sure to update # most recent observation! obs = obs2 # End of trajectory handling if done: if reward > 0: goal_grid_code_tensor = agent.current_grid_code.detach() break update(epoch_history) # if done if epoch_history.dones[-1]: logger.store(EpRet=episode_return, EpLen=episode_length) # Reset env obs = env.reset() agent.reset() # Reset episode stats episode_return = 0 episode_length = 0 # End of epoch handling if epoch % log_every == 0: total_interactions = mpi_tools.mpi_sum(total_steps) if use_MPI else total_steps # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('Value', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossEntropy', average_only=True) logger.log_tabular('LossGridL2', average_only=True) logger.log_tabular('LossPIM', average_only=True) logger.log_tabular('TotalEnvInteracts', total_interactions) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Test agent solved = False if epoch % test_every == 0: video_dir = pathlib.Path(logger.output_dir) / 'test_videos' / f'epoch-{epoch:d}' test_env_fn = lambda: Monitor(env_fn(), directory=video_dir) # Test the performance of the deterministic version of the agent. context = agent.get_context() agent.eval() episode_info = evaluate_agent(env_fn=test_env_fn, agent=agent, deterministic=deterministic, num_episodes=num_test_episodes, render=False, logger=test_logger) agent.train() agent.set_context(context) if solved_score is not None: solved = all(r >= solved_score for (t, r) in episode_info) # Save model if (epoch % save_every == 0) or (epoch == epochs) or solved: logger.save_state({'env': env}) # Check environment is solved if solved: plog = lambda msg: logger.log(msg, color='green') plog("=" * 40) plog(f"ENVIRONMENT SOLVED!") plog("=" * 40) plog(f' TotalEnvInteracts {total_steps}') plog(f' Time {time.time() - start_time}') plog(f' Epoch {epoch}') break torch.save(agent, str(logger.output_dir / 'agent.pt')) env.close()
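# a2c above delegates advantage estimation to agent.compute_loss(use_gae=...,
# tau=...). For reference, a self-contained sketch of GAE(lambda) in its
# standard textbook form; this illustrates the recursion, it is not the
# agent's actual implementation:
import numpy as np


def gae_advantages(rewards, values, next_value, dones, gamma=0.99, tau=0.95):
    """Standard GAE(lambda) recursion, computed backwards over one rollout:
        delta_t = r_t + gamma * V(s_{t+1}) * (1 - d_t) - V(s_t)
        A_t     = delta_t + gamma * tau * (1 - d_t) * A_{t+1}
    """
    values = np.append(np.asarray(values, dtype=np.float64), next_value)
    advantages = np.zeros(len(rewards), dtype=np.float64)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        gae = delta + gamma * tau * nonterminal * gae
        advantages[t] = gae
    return advantages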
def vpg(env, actor_critic=MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE 0 for advantage estimation) Args: env : An environment that satisfies the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) obs_dim = env.observation_space.shape act_dim = env.action_space.n # assumes Discrete space ac = actor_critic(env.observation_space, env.action_space) ac.to(device) # buffer size equals number of steps in an epoch buff = VPGBuffer(steps_per_epoch, gamma, obs_dim, act_dim) def compute_loss_pi(data): obs = torch.as_tensor(data.obs_buf, dtype=torch.float32, device=device) act = torch.as_tensor(data.act_buf, dtype=torch.int32, device=device) adv = torch.as_tensor(data.advantage_buf, dtype=torch.float32, device=device) logpa = ac.pi(obs, act) return -1 * (logpa * adv).mean() def compute_loss_v(data): obs = torch.as_tensor(data.obs_buf, dtype=torch.float32, device=device) rew2go = torch.as_tensor(data.rew2go_buf, dtype=torch.float32, device=device) values = ac.v(obs) return F.mse_loss(values, rew2go) pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr) v_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update_pi(data): pi_optimizer.zero_grad() pi_loss = compute_loss_pi(data) pi_loss.backward() pi_optimizer.step() logger.store(LossPi=pi_loss.item()) #TODO: log policy entropy def update_v(data): for s in range(train_v_iters): v_optimizer.zero_grad() v_loss = compute_loss_v(data) v_loss.backward() v_optimizer.step() logger.store(LossV=v_loss.item()) total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 t = 0 # total environment interactions # Update policy once per epoch for epoch in range(epochs): for t_epoch in range(steps_per_epoch): t += 1 a, v, logpa = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) o2, r, d, info = env.step(a.cpu().numpy()) buff.store(o, a, v, r, logpa) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d o = o2 # If trajectory is finished, calculate rewards to go, # then calculate the Advantage. if d is True or (ep_len == max_ep_len) or (t_epoch + 1 == steps_per_epoch): buff.finish_trajectory() logger.store( EpRet=ep_ret, EpLen=ep_len, ) o, ep_ret, ep_len = env.reset(), 0, 0 # Calculate policy gradient when we've collected t_epoch time steps. if t_epoch + 1 == steps_per_epoch: pylogger.debug('*** epoch ***', epoch) pylogger.debug('*** t_epoch ***', t_epoch) pylogger.debug('values', buff.val_buf) pylogger.debug('rewards', buff.rew_buf) pylogger.debug('rew2go', buff.rew2go_buf) pylogger.debug('advantage', buff.advantage_buf) # Update the policy using policy gradient update_pi(buff) # Re-fit the value function on the MSE. Note, this is # gradient descent starting from the previous parameters. update_v(buff) # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # note, this includes full model pickle # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Time', time.time() - start_time) if hasattr(env, 'episode_id'): logger.log_tabular('EpisodeId', env.episode_id) # If a quantity has not been calculated/stored yet, do not log it. This can # happen, e.g. if NN update length or episode length exceeds num steps in epoch. 
to_log = [{ 'key': 'LossV', 'average_only': True }, { 'key': 'LossPi', 'average_only': True }, { 'key': 'EpRet', 'with_min_and_max': True }, { 'key': 'EpLen', 'average_only': True }, { 'key': 'RawRet', 'with_min_and_max': True }, { 'key': 'RawLen', 'average_only': True }] for log_tabular_kwargs in to_log: key = log_tabular_kwargs['key'] if key in logger.epoch_dict and len(logger.epoch_dict[key]) > 0: logger.log_tabular(**log_tabular_kwargs) wandb.log(logger.log_current_row, step=epoch) logger.dump_tabular() # reset buffer buff = VPGBuffer(steps_per_epoch, gamma, obs_dim, act_dim) # Save final model as a state dict state = { 'epoch': epoch, 'pi_state_dict': ac.pi.state_dict(), 'v_state_dict': ac.v.state_dict(), 'pi_optimizer': pi_optimizer.state_dict(), 'v_optimizer': v_optimizer.state_dict(), } # hack for wandb: should output the model in the wandb.run.dir to avoid # problems syncing the model in the cloud with wandb's files state_fname = os.path.join(logger_kwargs['output_dir'], f"state_dict.pt") torch.save(state, state_fname) wandb.save(state_fname) pylogger.info(f"Saved state dict to {state_fname}") env.close()
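# compute_loss_v above regresses the value network onto data.rew2go_buf. A
# standalone sketch of the reward-to-go computation that VPGBuffer presumably
# performs in finish_trajectory (illustrative; the actual buffer is defined
# elsewhere):
import numpy as np


def rewards_to_go(rewards, gamma=0.99):
    """R_t = r_t + gamma * R_{t+1}, computed backwards over one trajectory."""
    rtg = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg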
class VPG: """ VPG w/ GAE-Lambda """ def __init__(self, env_maker: Callable, ac_maker=core.MLPActorCritic, ac_kwargs={}, seed: int = 0, epochs: int = 50, steps_per_epoch: int = 4000, gamma: float = 0.99, actor_lr: float = 3e-4, critic_lr: float = 1e-3, num_iter_train_critic: int = 80, lam: float = 0.97, max_episode_len: int = 1000, logger_kwargs=dict(), save_freq: int = 10): # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.num_iter_train_critic = num_iter_train_critic self.max_episode_len = max_episode_len self.save_freq = save_freq # make env self.env = env_maker() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape # make actor-critic self.ac = ac_maker(self.env.observation_space, self.env.action_space, **ac_kwargs) # make buffer self.local_steps_per_epoch = int(steps_per_epoch / num_procs()) self.buffer = Buffer(self.obs_dim, self.act_dim, self.local_steps_per_epoch, gamma, lam) # make optimizers self.actor_optimizer = Adam(self.ac.actor.parameters(), lr=actor_lr) self.critic_optimizer = Adam(self.ac.critic.parameters(), lr=critic_lr) # Sync params across processes sync_params(self.ac) # Count variables var_counts = tuple( core.count_vars(module) for module in [self.ac.actor, self.ac.critic]) self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up model saving self.logger.setup_pytorch_saver(self.ac) def compute_actor_loss(self, data): obs, act, adv, logprob_old = data['obs'], data['act'], data[ 'adv'], data['logprob'] # policy loss pi, logprob = self.ac.actor(obs, act) loss_actor = -(logprob * adv).mean() # extra info approx_kl = (logprob_old - logprob).mean().item() entropy = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, entropy=entropy) return loss_actor, pi_info def compute_critic_loss(self, data): obs, ret = data['obs'], data['ret'] return ((self.ac.critic(obs) - ret)**2).mean() def update(self): data = self.buffer.get() actor_loss_old, actor_info_old = self.compute_actor_loss(data) actor_loss_old = actor_loss_old.item() critic_loss_old = self.compute_critic_loss(data).item() # train policy self.actor_optimizer.zero_grad() actor_loss, actor_info = self.compute_actor_loss(data) actor_loss.backward() mpi_avg_grads(self.ac.actor) self.actor_optimizer.step() # train critic for i in range(self.num_iter_train_critic): self.critic_optimizer.zero_grad() critic_loss = self.compute_critic_loss(data) critic_loss.backward() mpi_avg_grads(self.ac.critic) self.critic_optimizer.step() #log kl, entropy = actor_info['kl'], actor_info['entropy'] self.logger.store(LossPi=actor_loss_old, LossV=critic_loss_old, KL=kl, Entropy=entropy, DeltaLossV=(critic_loss.item() - critic_loss_old), DeltaLossPi=(actor_loss.item() - actor_loss_old)) def train(self): start_time = time.time() obs, episode_ret, episode_len = self.env.reset(), 0, 0 for epoch in range(self.epochs): for t in range(self.local_steps_per_epoch): act, v, logprob = self.ac.step( torch.as_tensor(obs, dtype=torch.float32)) # print(f"act: {act}") # print(f"v: {v}") # print(f"logprob: {logprob}") obs_next, reward, done, _ = self.env.step(act) episode_ret += reward episode_len += 1 self.buffer.store(obs, act, reward, v, logprob) self.logger.store(VVals=v) obs = 
obs_next # episode end/timeout logic timeout = (episode_len == self.max_episode_len) terminal = (done or timeout) epoch_ended = (t == self.local_steps_per_epoch - 1) if terminal or epoch_ended: if epoch_ended and not terminal: # print(f"Warning: trajectory cut off by epoch at {episode_len} steps") pass if timeout or epoch_ended: _, v, _ = self.ac.step( torch.as_tensor(obs, dtype=torch.float32)) else: v = 0 self.buffer.finish_path(v) if terminal: self.logger.store(EpRet=episode_ret, EpLen=episode_len) obs, episode_ret, episode_len = self.env.reset(), 0, 0 if (epoch % self.save_freq == 0) or (epoch == self.epochs - 1): self.logger.save_state({"env": self.env}, None) self.update() # Log info about epoch self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('VVals', with_min_and_max=True) self.logger.log_tabular('TotalEnvInteracts', (epoch + 1) * self.steps_per_epoch) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossV', average_only=True) self.logger.log_tabular('DeltaLossPi', average_only=True) self.logger.log_tabular('DeltaLossV', average_only=True) self.logger.log_tabular('Entropy', average_only=True) self.logger.log_tabular('KL', average_only=True) self.logger.log_tabular('Time', time.time() - start_time) self.logger.dump_tabular()
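# Hypothetical usage of the VPG class above. "Pendulum-v1" and the
# hyperparameters are placeholders, not values prescribed by this code; any
# Gym environment compatible with core.MLPActorCritic should work.
import gym

if __name__ == "__main__":
    algo = VPG(env_maker=lambda: gym.make("Pendulum-v1"),
               epochs=10,
               steps_per_epoch=1000,
               logger_kwargs=dict(output_dir="out/vpg_pendulum"))
    algo.train()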
def sqn(env_fn, env_init, ego_agent, opp_agent, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-2, alpha=0.2, batch_size=100, start_steps=10000, update_after=4000, update_every=1, num_test_episodes=10, max_ep_len=4000, logger_kwargs=dict(), save_freq=1, lr_period=0.7): """ Soft Q-Network, based on SAC and clipped Double Q-learning Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.n # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, alpha, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer if isinstance(env.action_space, Box): a_dim = act_dim elif isinstance(env.action_space, Discrete): a_dim = 1 replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy v1 = ac.q1.values(o2) v2 = ac.q2.values(o2) a2, logp_a2 = ac.pi(v1 + v2, action_mask=ego_agent.aval_paths) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) # Unsqueeze adds another dim, necessary to be column vectors backup = r.unsqueeze(1) + gamma * (1 - d).unsqueeze(1) * ( q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, lr_iter): # Update learning rate with cosine schedule (np.clip takes bounds as (min, max)) lr = np.clip( 0.005 * np.cos(np.pi * lr_iter / (total_steps * lr_period)) + 0.00501, 1e-5, 1e-2) q_optimizer.param_groups[0]['lr'] = lr # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, action_mask, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), action_mask, deterministic) def test_agent(): for j in range(num_test_episodes): d, ep_ret, ep_len = False, 0, 0 init_positions = np.random.random_integers(0, 1) o = test_env.reset({ 'x': env_init['initial_x'][init_positions], 'y': env_init['initial_y'], 'theta': env_init['initial_theta'] }) #Convert o to RL obs RLobs = ego_agent.process_obs(o) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = get_action(RLobs, action_mask=ego_agent.aval_paths, deterministic=True) #RL action to drive control actions ego_speed, ego_steer, a = ego_agent.plan(o, a) #Opponent decision opp_speed, opp_steer = opp_agent.plan(o) action = { 'ego_idx': 0, 'speed': [ego_speed, opp_speed], 'steer': [ego_steer, opp_steer] } o, r, d, _ = test_env.step(action) #Convert o to RL obs RLobs = ego_agent.process_obs(o) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() init_positions = np.random.random_integers(0, 1) o, ep_ret, ep_len = env.reset({ 'x': env_init['initial_x'][init_positions], 'y': env_init['initial_y'], 'theta': env_init['initial_theta'] }), 0, 0 #Convert o to RL obs RLobs = ego_agent.process_obs(o) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(RLobs, action_mask=ego_agent.aval_paths, deterministic=False) else: try: a = random.choice(tuple(ego_agent.aval_paths)) except: #happens when there are no paths available a = 15 #RL action to drive control actions ego_speed, ego_steer, a = ego_agent.plan(o, a) #Opponent decision opp_speed, opp_steer = opp_agent.plan(o) action = { 'ego_idx': 0, 'speed': [ego_speed, opp_speed], 'steer': [ego_steer, opp_steer] } # Step the env o2, r, d, _ = env.step(action) ep_ret += r ep_len += 1 #Convert o2 to RLobs2 RLobs2 = ego_agent.process_obs(o2) # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer # replay_buffer.store(o, a, r, o2, d) replay_buffer.store(RLobs, a, r, RLobs2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
RLobs = RLobs2 o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) init_positions = np.random.random_integers(0, 1) o, ep_ret, ep_len = env.reset({ 'x': env_init['initial_x'][init_positions], 'y': env_init['initial_y'], 'theta': env_init['initial_theta'] }), 0, 0 #Convert o to RL obs RLobs = ego_agent.process_obs(o) # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): #Cosine learning rate schedule if t < total_steps * (1 - lr_period): lr_iter = 0 else: lr_iter = t - total_steps * (1 - lr_period) batch = replay_buffer.sample_batch(batch_size) update(data=batch, lr_iter=lr_iter) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): if epoch == epochs: logger.save_state({'env': env}, None) else: #SpinningUp saving style logger.save_state({'env': env}, epoch) #Standard pytorch way of saving fpath = logger_kwargs['output_dir'] + '/state_dict/' os.makedirs(fpath, exist_ok=True) torch.save(ac.state_dict(), fpath + 'model%d.pt' % epoch) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
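# update() above anneals the Q learning rate with a cosine schedule over the
# final `lr_period` fraction of training. A standalone sketch of that schedule
# (illustrative; the formula mirrors the one in update(), with np.clip bounds
# in their (min, max) order):
import numpy as np


def cosine_lr(step, total_steps, lr_period=0.7, lr_min=1e-5, lr_max=1e-2):
    """Roughly flat until the last lr_period fraction of training, then cosine
    decay from ~lr_max down to lr_min."""
    warmup_steps = total_steps * (1 - lr_period)
    lr_iter = max(0.0, step - warmup_steps)
    lr = 0.005 * np.cos(np.pi * lr_iter / (total_steps * lr_period)) + 0.00501
    return float(np.clip(lr, lr_min, lr_max))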
def dqn(env, actor_critic=MLPCritic, replay_size=500, seed=0, steps_per_epoch=3000, epochs=5, gamma=0.99, lr=0.00025, batch_size=32, start_steps=100, update_after=50, update_every=5, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=1e6, target_update_every=1000, num_test_episodes=10, max_ep_len=200, record_video=False, record_video_every=100, save_freq=50, wandb_model_name=None, wandb_restore_run_path=None): """ Args: env : An environment that satisfies the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method and a ``q`` module. The ``act`` method module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, ``act`` and ``q`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== global_steps (int): Number of steps / frames for training (should be greater than update_after!) seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) lr (float): Learning rate. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. epsilon_start (float): Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end (float): The final minimum value of epsilon after decaying is done. epsilon_decay_steps (int): Number of steps over which to linearly decrement from epsilon_start to epsilon_end. target_update_every (int): Number of steps between updating target network parameters, i.e. resetting Q_hat to Q. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. (Imposed by the environment.) record_video (bool): Record a video record_video_every (int): Record a video every N episodes save_freq (int): How often (in terms of gap between epochs) to save the current model (value function). wandb_model_name (str): (optional) if not None, use a pretrained serialized torch model stored in wandb wandb_restore_run_path (str): (optional) if wandb_model_name is specified, then this should specify path e.g. 
'$USER_NAME/$PROJECT_NAME/$RUN_ID' """ logger_out_dir = wandb.run.dir logger = EpochLogger(exp_name='dqn', output_dir=logger_out_dir) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) obs_dim = env.observation_space.shape act_dim = env.action_space.n # assumes Discrete space if wandb_model_name is not None: # note, we can't use load_state_dict. The entire model # was serialized by EpochLogger's save_state rather # than just its weights assert wandb_restore_run_path is not None ac = torch.load(wandb.restore(wandb_model_name, run_path=wandb_restore_run_path).name, map_location=device) else: # Create critic module and network ac = actor_critic(env.observation_space, env.action_space) # Set target Q-network parameters theta_tar = theta target_q_network = deepcopy(ac.q) if torch.cuda.device_count() > 1 and wandb_model_name is None: # hack: last post in this thread https://discuss.pytorch.org/t/bug-in-dataparallel-only-works-if-the-dataset-device-is-cuda-0/28634/24 # advises skipping DataParallel on a pretrained model ac.q = nn.DataParallel(ac.q) target_q_network = nn.DataParallel(target_q_network) ac.to(device) target_q_network.to(device) # Freeze target network w.r.t. optimizers for p in target_q_network.parameters(): p.requires_grad = False # function to compute Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] # Pick out q-values associated with / indexed by the action that was taken # for that observation: https://pytorch.org/docs/stable/torch.html#torch.gather. # Note index must be of type LongTensor. q = torch.gather(ac.q(o), dim=1, index=a.view(-1, 1).long()) # Bellman backup for Q function with torch.no_grad(): # Targets come from frozen target Q-network q_target = torch.max(target_q_network(o2), dim=1).values backup = r + (1 - d) * gamma * q_target # MSE loss against Bellman backup # loss_q = ((q - backup)**2).mean() loss_q = F.smooth_l1_loss(q[:, 0], backup).mean() # Useful info for logging loss_info = dict(QVals=q.detach().cpu().numpy()) return loss_q, loss_info # Set up optimizer for Q-function q_optimizer = torch.optim.Adam(ac.q.parameters(), lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) # function to update parameters in Q def update(data): q_optimizer.zero_grad() loss, loss_info = compute_loss_q(data) loss.backward() q_optimizer.step() logger.store(LossQ=loss.item(), **loss_info) def get_action(o, epsilon): # greedy epsilon strategy if np.random.sample() < epsilon: a = env.action_space.sample() else: a = ac.act(torch.as_tensor(o, dtype=torch.float32, device=device)) return a # main loop: collect experience in env # Initialize experience replay buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) total_steps = steps_per_epoch * epochs start_time = time.time() epsilon_decrement = (epsilon_start - epsilon_end) / epsilon_decay_steps epsilon = epsilon_start o, ep_ret, ep_len = env.reset(), 0, 0 for t in range(total_steps): if t > start_steps and epsilon > epsilon_end: # linearly reduce epsilon epsilon -= epsilon_decrement if t > start_steps: # epsilon greedy a = get_action(o, epsilon) else: # randomly sample for better exploration before start_steps a = env.action_space.sample() # Step the env o2, r, d, info = env.step(a) # TODO: clip rewards b/w -1 and 1 ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False 
if ep_len == max_ep_len else d # Store transition to replay buffer replay_buffer.store(o, a, r, o2, d) # Update the most recent observation. o = o2 # End of episode handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) # End of multi-life game handling lives = info.get('ale.lives') if lives is not None and lives == 0: # Assumes env has been wrapped by Monitor. logger.store(RawRet=env.get_episode_rewards()[-1]) logger.store(RawLen=env.get_episode_lengths()[-1]) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t > update_after and t % update_every == 0: minibatch = replay_buffer.sample_batch(batch_size) update(data=minibatch) # Refresh target Q network if t % target_update_every == 0: target_q_network.load_state_dict(ac.q.state_dict()) for p in target_q_network.parameters(): p.requires_grad = False # End of epoch handling if (t + 1) % steps_per_epoch == 0 and (t + 1) > start_steps and ( t + 1) > update_after: epoch = (t + 1) // steps_per_epoch print(f"epsilon: {epsilon}") # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state( {'env': env}, None) # note, this includes full model pickle # Save the model parameters to wandb every save_freq epoch # instead of waiting till the end state = { 'epoch': epoch, 'ac_state_dict': ac.state_dict(), 'ac_q_state_dict': ac.q.state_dict(), # not sure this is necessary 'q_optimizer': q_optimizer.state_dict(), 'q_loss': logger.epoch_dict['LossQ'][-1], } # hack for wandb: should output the model in the wandb.run.dir to avoid # problems syncing the model in the cloud with wandb's files state_fname = os.path.join(wandb.run.dir, f"state_dict.pt") torch.save(state, state_fname) wandb.save(state_fname) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('Epsilon', epsilon) if hasattr(env, 'episode_id'): logger.log_tabular('EpisodeId', env.episode_id) # If a quantity has not been calculated/stored yet, do not log it. This can # happen, e.g. if NN update length or episode length exceeds num steps in epoch. to_log = [{ 'key': 'QVals', 'with_min_and_max': True }, { 'key': 'LossQ', 'average_only': True }, { 'key': 'EpRet', 'with_min_and_max': True }, { 'key': 'EpLen', 'average_only': True }, { 'key': 'RawRet', 'with_min_and_max': True }, { 'key': 'RawLen', 'average_only': True }] for log_tabular_kwargs in to_log: key = log_tabular_kwargs['key'] if key in logger.epoch_dict and len( logger.epoch_dict[key]) > 0: logger.log_tabular(**log_tabular_kwargs) wandb.log(logger.log_current_row, step=epoch) logger.dump_tabular() env.close()
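# compute_loss_q above uses torch.gather to pick, per row, the Q-value of the
# action that was actually taken (the index tensor must be a LongTensor).
# A tiny worked example of that indexing pattern:
import torch


def _gather_example():
    q_all = torch.tensor([[1.0, 2.0, 3.0],
                          [4.0, 5.0, 6.0]])   # (batch, act_dim) Q-values
    acts = torch.tensor([2, 0])               # action index taken per row
    q_taken = torch.gather(q_all, dim=1, index=acts.view(-1, 1).long())
    return q_taken.squeeze(1)                 # tensor([3., 4.])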
def my_vpg(env_fn, seed=0, steps_per_epoch=4000, epochs=50, max_ep_len=1000, hidden_sizes=[32], lr=1e-2, logger_kwargs=dict(), save_freq=10): """ My VPG implementation Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. """ # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() print("env.observation_space", env.observation_space) print("env.observation_space.shape", env.observation_space.shape) print("env.action_space", env.action_space) # Prepare for interaction with environment start_time = time.time() # Instantiate policy if isinstance(env.action_space, gym.spaces.Box): policy = GaussianPolicy(env.action_space, env.observation_space, hidden_sizes) elif isinstance(env.action_space, gym.spaces.Discrete): policy = CategoricalPolicy(env.action_space, env.observation_space, hidden_sizes) policy_optimizer = torch.optim.Adam(policy.actor_net.parameters(), lr=lr) # value_net = mlp(sizes = [obs_dim] + hidden_sizes + [1]) # value_optimizer = torch.optim.Adam(value_net.parameters(), lr=lr) # print("value_net") # print(value_net) # def get_value(o): # return value_net(torch.as_tensor(o, dtype=torch.float32)) # Set up model saving logger.setup_pytorch_saver(policy) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): o, ep_ret, ep_len = env.reset(), 0, 0 buffer = Buffer() for t in range(steps_per_epoch): a = policy.act(torch.tensor(o, dtype=float).unsqueeze(0)) a = a.numpy()[0] # Remove batch dimension next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 buffer.append(o, a, r, next_o) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t==steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) buffer.fill_episode_returns(ep_ret) o, ep_ret, ep_len = env.reset(), 0, 0 # Update o, a, r, next_o, R = buffer.get() # baseline = get_value(o) # R = r + get_value(next_o) # advantage = R - baseline # # Value function update # value_optimizer.zero_grad() # criterion = torch.nn.MSELoss() # value_loss = criterion(R, baseline) # value_loss.backward() # value_optimizer.step() # Policy function update policy_optimizer.zero_grad() logp_a = policy.get_logp(o, a) policy_loss = -(logp_a * R).mean() policy_loss.backward() policy_optimizer.step() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None)
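# Hypothetical invocation of my_vpg above; "CartPole-v1" and the chosen
# hyperparameters are placeholders, not values prescribed by this code. Any
# Gym environment with a Discrete or Box action space should match the two
# policy classes used above.
import gym

if __name__ == "__main__":
    my_vpg(lambda: gym.make("CartPole-v1"),
           epochs=20,
           steps_per_epoch=2000,
           hidden_sizes=[32, 32],
           logger_kwargs=dict(output_dir="out/my_vpg"))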
def dqn(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.n print(obs_dim, act_dim) # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o) q2 = ac.q2(o) q1_a = q1.gather(1, a.long()).squeeze(-1) q2_a = q2.gather(1, a.long()).squeeze(-1) # Bellman backup for Q functions with torch.no_grad(): # Target Q-values q1_targ = torch.max(ac_targ.q1(o2), dim=1)[0] q2_targ = torch.max(ac_targ.q2(o2), dim=1)[0] q_targ = torch.min(q1_targ, q2_targ) backup = r + gamma * (1 - d) * q_targ # MSE loss against Bellman backup loss_q1 = ((q1_a - backup) ** 2).mean() loss_q2 = ((q2_a - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up optimizers for policy and q-function q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps or np.random.random() > 0.05: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
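# The target in compute_loss_q above is a clipped double-Q backup for discrete
# actions: a greedy max over each target network, then an elementwise min.
# Written out on its own (illustrative; mirrors the code above):
import torch


def clipped_double_q_target(r, d, q1_targ_all, q2_targ_all, gamma=0.99):
    """r, d: (batch,); q*_targ_all: (batch, act_dim) target-network Q-values."""
    v1 = torch.max(q1_targ_all, dim=1)[0]   # greedy value under target q1
    v2 = torch.max(q2_targ_all, dim=1)[0]   # greedy value under target q2
    return r + gamma * (1 - d) * torch.min(v1, v2)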
def dqn(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e5), batch_size=100, gamma=0.99, q_lr=1e-4, start_steps=10000, update_after=1000, update_targ_every=50, num_test_episodes=10, max_ep_len=1000, epsilon=0.01, epsilon_decay=0.99995, logger_kwargs=dict(), writer_kwargs=dict(), save_freq=1): """ DQN (Deep Q-Networks). Reproduce the original paper from Minh et al. """ # Instantiate env env = env_fn() test_env = env_fn() # TODO: might have to assert discrete, or otherwise take only first index of shape or so obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Set up actor (pi) & critic (Q), and data buffer ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) q_targ = copy.deepcopy(ac.q) for p in q_targ.parameters(): p.requires_grad = False q_optimizer = torch.optim.Adam(ac.q.parameters(), lr=q_lr) replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Set RNG seeds torch.manual_seed(seed) np.random.seed(seed) env.seed(seed) env.action_space.seed(seed) # Set up logging logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) logger.setup_pytorch_saver(ac) writer = SummaryWriter(**writer_kwargs) start_time = time.time() total_steps = epochs * steps_per_epoch o = env.reset() op = preprocess_obs(o) # "op" = "observation_preprocessed" ep_return = 0 # episode return, counter ep_length = 0 # episode length, counter for step in range(total_steps): # Take an env step, then store data in replay buffer if step > start_steps: ac.pi.epsilon = max(epsilon, epsilon_decay**step) a = ac.act(torch.as_tensor(op, dtype=torch.float32)) else: a = env.action_space.sample() o2, r, d, _ = env.step(a) o2p = preprocess_obs(o2) replay_buffer.store(op, a, r, o2p, d) # TODO: does DQN paper say to do 1 GD update with mean of minibatch, or many 1-data-point updates? # Sample a random batch from replay buffer and perform one GD step q_optimizer.zero_grad() batch_data = replay_buffer.sample_batch(batch_size) loss_q = compute_loss_q(batch_data, ac, q_targ, gamma) loss_q.backward() q_optimizer.step() # Update target network every so often if (step % update_targ_every == 0) and (step >= update_after): q_targ = copy.deepcopy(ac.q) for p in q_targ.parameters(): p.requires_grad = False # Keep track of episode return and length (for logging purposes) ep_return += r ep_length += 1 # If episode done, reset env if d: o = env.reset() op = preprocess_obs(o) logger.store(EpRet=ep_return, EpLen=ep_length) ep_return = 0 ep_length = 0 else: op = o2p # TODO: confirm: no need for test set if test agent & env are same as training agent & env (e.g. 
would need # test set if algo added noise to training but not test # If epoch end, then do a test to see average return thus far if step % steps_per_epoch == steps_per_epoch - 1: for ep_i in range(num_test_episodes): # turn off epsilon exploration: old_epsilon = ac.pi.epsilon ac.pi.epsilon = 0 test_ep_return, test_ep_length = run_test_episode(test_env, ac) logger.store(TestEpRet=test_ep_return, TestEpLen=test_ep_length) # turn it back on ac.pi.epsilon = old_epsilon # If epoch end, save models and show logged data if step % steps_per_epoch == steps_per_epoch - 1: epoch_i = int(step // steps_per_epoch) writer.add_scalar("EpRet_mean", logger.get_stats("EpRet")[0], epoch_i) # first item in `get_stats` is mean writer.add_scalar("EpRet_std", logger.get_stats("EpRet")[1], epoch_i) # 2nd item in `get_stats` is std writer.add_scalar("TestEpRet_mean", logger.get_stats("TestEpRet")[0], epoch_i) writer.add_scalar("TestEpRet_std", logger.get_stats("TestEpRet")[1], epoch_i) writer.add_scalar("epsilon", ac.pi.epsilon, epoch_i) logger.save_state({'env': env}, None) # saves both ac and env logger.log_tabular("Epoch", epoch_i) logger.log_tabular("EpRet", with_min_and_max=True) logger.log_tabular("EpLen", average_only=True) logger.log_tabular("TestEpRet", with_min_and_max=True) logger.log_tabular("TestEpLen", average_only=True) logger.log_tabular("TimeFromStart", time.time() - start_time) logger.dump_tabular() # Save model at end logger.save_state({'env': env}, None) writer.close()
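# Exploration above sets ac.pi.epsilon = max(epsilon, epsilon_decay ** step)
# once step > start_steps, i.e. an exponential decay with a floor. A quick
# illustration with the defaults epsilon=0.01 and epsilon_decay=0.99995
# (printed values rounded):
def _epsilon_schedule_example(eps_min=0.01, decay=0.99995):
    for step in (0, 10_000, 50_000, 100_000):
        print(step, round(max(eps_min, decay ** step), 4))
    # 0 -> 1.0, 10000 -> ~0.6065, 50000 -> ~0.0821, 100000 -> clipped to 0.01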
def cegl(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.4, n_explore=32, update_factor=1, device='cuda', architecture='mlp', sample='on_policy', method='egl'): if architecture == 'mlp': actor_critic = core.MLPActorCritic elif architecture == 'spline': actor_critic = core.SplineActorCritic else: raise NotImplementedError device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n' % var_counts) n_samples = 100 cmin = 0.25 cmax = 1.75 greed = 0.01 rand = 0.01 def max_reroute(o): b, _ = o.shape o = repeat_and_reshape(o, n_samples) with torch.no_grad(): ai, _ = ac.pi(o) q1 = ac.q1(o, ai) q2 = ac.q2(o, ai) qi = torch.min(q1, q2).unsqueeze(-1) qi = qi.view(n_samples, b, 1) ai = ai.view(n_samples, b, act_dim) rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False) w = cmin * torch.ones_like(ai) m = int((1 - cmin) * n_samples / (cmax - cmin)) w += (cmax - cmin) * (rank < m).float() w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float() w -= greed w += greed * n_samples * (rank == 0).float() w = w * (1 - rand) + rand w = w / w.sum(dim=0, keepdim=True) prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0)) a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2) return a, (ai, w.mean(-1)) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup) ** 2).mean() loss_q2 = ((q2 - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing EGL mean-gradient-losses def compute_loss_g(data): o = data['obs'] # Bellman backup for Q functions with 
torch.no_grad(): a1, _ = ac.pi(o) std = ac.pi.distribution.scale std = torch.clamp_max(std, max=eps) a2 = explore(a1, n_explore, std) a2 = a2.reshape(n_explore * len(o), act_dim) o_expand = repeat_and_reshape(o, n_explore) # q_dither = ac.q1(o_expand, a2) # q_anchor = ac.q1(o, a1) q1_dither = ac.q1(o_expand, a2) q2_dither = ac.q2(o_expand, a2) q_dither = torch.min(q1_dither, q2_dither) q1_anchor = ac.q1(o, a1) q2_anchor = ac.q2(o, a1) q_anchor = torch.min(q1_anchor, q2_anchor) q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) geps = ac.geps.forward_tag(o, a1) geps = repeat_and_reshape(geps, n_explore) a1 = repeat_and_reshape(a1, n_explore) geps = (geps * (a2 - a1)).sum(-1) # mse loss against Bellman backup loss_g = F.smooth_l1_loss(geps, (q_dither - q_anchor), reduction='mean') * n_explore / eps / act_dim # delta = torch.norm(a2 - a1, dim=-1) # n = (a2 - a1) / delta.unsqueeze(1) # target = torch.clamp((q_dither - q_anchor) / delta, min=-100, max=100) # geps = (geps * n).sum(-1) # # mse loss against Bellman backup # loss_g = F.smooth_l1_loss(geps, target, reduction='mean') # Useful info for logging g_info = dict(GVals=geps.flatten().detach().cpu().numpy(), GEps=std.flatten().detach().cpu().numpy()) # return loss_g, g_info, {'delta': delta, 'n': n, 'a1': a1, 'a2': a2, 'target': target, # 'geps': geps, 'q_dither': q_dither, 'q_anchor': q_anchor} return loss_g, g_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) geps_pi = ac.geps.forward_tag(o, pi, no_grad=True) if method == 'egl': # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean() elif method == 'sac': q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() else: raise NotImplementedError beta = autograd.Variable(pi.detach().clone(), requires_grad=True) q1_pi = ac.q1(o, beta) q2_pi = ac.q2(o, beta) qa = torch.min(q1_pi, q2_pi).unsqueeze(-1) grad_q = autograd.grad(outputs=qa, inputs=beta, grad_outputs=torch.cuda.FloatTensor(qa.size()).fill_(1.), create_graph=False, retain_graph=False, only_inputs=True)[0] # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy(), GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(), GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(), GradDelta=torch.norm(geps_pi - grad_q, dim=-1).detach().cpu().numpy(), GradSim=F.cosine_similarity(geps_pi, grad_q, dim=-1).detach().cpu().numpy(), ActionsNorm=torch.norm(pi, dim=-1).detach().cpu().numpy(), ActionsAbs=torch.abs(pi).flatten().detach().cpu().numpy(), ) return loss_pi, pi_info if architecture == 'mlp': # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) g_optimizer = Adam(ac.geps.parameters(), lr=lr) elif architecture == 'spline': # Set up optimizers for policy and q-function pi_optimizer = SparseDenseAdamOptimizer(ac.pi, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2], dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) g_optimizer = SparseDenseAdamOptimizer(ac.geps, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) else: raise NotImplementedError # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() # if any([torch.isnan(p.grad).any() for p 
in ac.parameters() if p.grad is not None]): # print('nan') q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Next run one gradient descent step for the mean-gradient g_optimizer.zero_grad() # loss_g, g_info, g_dict = compute_loss_g(data) loss_g, g_info = compute_loss_g(data) loss_g.backward() # if any([torch.isnan(p.grad).any() for p in ac.parameters() if p.grad is not None]): # # print('nan') # print(len(g_dict)) g_optimizer.step() # Record things logger.store(LossG=loss_g.item(), **g_info) # Freeze the mean-gradient network (geps) so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.geps.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() # if any([torch.isnan(p.grad).any() for p in ac.parameters() if p.grad is not None]): # print('nan') pi_optimizer.step() # Unfreeze the geps network so you can optimize it at the next update step. for p in ac.geps.parameters(): p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action_on_policy(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def get_action_rbi(o, deterministic=False): o = torch.as_tensor(o, dtype=torch.float32, device=device) if deterministic: a = ac.act(o, deterministic) else: o = o.unsqueeze(0) a, _ = max_reroute(o) a = a.flatten().cpu().numpy() return a if sample == 'on_policy': get_action = get_action_on_policy elif sample == 'rbi': get_action = get_action_rbi else: raise NotImplementedError def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # eps = math.exp(math.log(1.) + (math.log(0.03) - math.log(1.)) * math.sin(2 * math.pi * t / 200e3)) # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation!
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every * update_factor): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('GVals', with_min_and_max=True) logger.log_tabular('LossG', with_min_and_max=True) logger.log_tabular('GradGAmp', with_min_and_max=True) logger.log_tabular('GradQAmp', with_min_and_max=True) logger.log_tabular('GradDelta', with_min_and_max=True) logger.log_tabular('GradSim', with_min_and_max=True) logger.log_tabular('GEps', with_min_and_max=True) logger.log_tabular('ActionsNorm', with_min_and_max=True) logger.log_tabular('ActionsAbs', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
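# A minimal, self-contained sketch of the mean-gradient regression target that
# compute_loss_g above fits: for perturbed actions a' near the anchor action a,
# the network geps(o, a) is trained so its directional derivative
# geps . (a' - a) matches the finite difference Q(o, a') - Q(o, a).
# For simplicity the perturbation scale here is just `eps`; the real code uses
# the policy std clamped to eps, and smooth L1 rather than plain regression.
import torch

def mean_gradient_targets(q_fn, o, a, n_explore=32, eps=0.4):
    # o: (B, obs_dim), a: (B, act_dim); q_fn(o, a) -> (N,) Q-values
    o_rep = o.repeat(n_explore, 1)                  # (n_explore*B, obs_dim)
    a_rep = a.repeat(n_explore, 1)                  # anchor actions, repeated
    a_pert = a_rep + eps * torch.randn_like(a_rep)  # dithered actions a'
    with torch.no_grad():
        q_anchor = q_fn(o_rep, a_rep)               # Q(o, a)
        q_dither = q_fn(o_rep, a_pert)              # Q(o, a')
    # geps(o_rep, a_rep) . (a_pert - a_rep) is regressed onto this difference,
    # as in compute_loss_g above.
    return a_pert - a_rep, q_dither - q_anchor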
def ddpg(env_name, partially_observable=False, pomdp_type='remove_velocity', flicker_prob=0.2, random_noise_sigma=0.1, random_sensor_missing_prob=0.1, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) Args: env_name : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. partially_observable: actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) # Wrapper environment if using POMDP if partially_observable: env = POMDPWrapper(env_name, pomdp_type, flicker_prob, random_noise_sigma, random_sensor_missing_prob) test_env = POMDPWrapper(env_name, pomdp_type, flicker_prob, random_noise_sigma, random_sensor_missing_prob) else: env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") ac.to(device) ac_targ.to(device) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data, batch_hist, t): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] # batch_hist['pred_q_hist'] # batch_hist['targ_q_hist'] # batch_hist['targ_next_q_hist'] # batch_hist['sampled_time_hist'] q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) # if t < 50000: # Average over historically predicted q-values window_size = 10 mean_targ_next_q_hist = [] tuned_indicator = np.zeros(q_pi_targ.shape) batch_change_rate = [] for i in range(len(batch_hist['targ_next_q_hist'])): tmp_batch_hist = np.asarray(batch_hist['targ_next_q_hist'][i]) tmp_batch_hist = np.append( tmp_batch_hist, q_pi_targ[i].item()) # add new prediction change_rate = tmp_batch_hist[1:] - tmp_batch_hist[:-1] if len(tmp_batch_hist) == 1: batch_change_rate.append(None) else: batch_change_rate.append(change_rate[-1]) batch_change_rate = np.asarray(batch_change_rate).astype(float) not_nan_idxs = np.argwhere(~np.isnan(batch_change_rate)) sorted_not_nan_idxs = np.argsort( batch_change_rate[not_nan_idxs.flatten()]) threshold_percentile = 75 # 25, 50, 75 if len(sorted_not_nan_idxs) != 0: threshold = np.percentile( batch_change_rate[not_nan_idxs[sorted_not_nan_idxs]], threshold_percentile) if threshold < 0: threshold = 0 else: threshold = 1 # threshold = 1 # thresold=1 works for HalfCheetahBulletEnv-v0 # New threshold for i in range(len(batch_hist['targ_next_q_hist'])): tmp_batch_hist = np.asarray(batch_hist['targ_next_q_hist'][i]) tmp_batch_hist = np.append( tmp_batch_hist, q_pi_targ[i].item()) # add new prediction change_rate = tmp_batch_hist[1:] - tmp_batch_hist[:-1] if len(tmp_batch_hist) == 1: avg_window = tmp_batch_hist[-1] else: if change_rate[-1] > threshold: avg_window = tmp_batch_hist[-2] + threshold # avg_window = tmp_batch_hist[-2] tuned_indicator[i] = 1 else: avg_window = tmp_batch_hist[-1] mean_targ_next_q_hist.append(avg_window) # print(batch_change_rate[not_nan_idxs[sorted_not_nan_idxs]]) # import pdb; pdb.set_trace() # if t>10000: # import pdb; pdb.set_trace() avg_q_pi_targ = torch.as_tensor(mean_targ_next_q_hist, 
dtype=torch.float32).to(device) # else: # avg_q_pi_targ = q_pi_targ # tuned_indicator = np.zeros(q_pi_targ.shape) backup = r + gamma * (1 - d) * avg_q_pi_targ # backup = r + gamma * (1 - d) * q_pi_targ # import pdb; # pdb.set_trace() # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.cpu().detach().numpy(), TunedNum=tuned_indicator.sum(), THLD=threshold) return loss_q, loss_info, q, backup, avg_q_pi_targ, tuned_indicator # Crucial log shapped q_pi_targ to history # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, batch_hist, t): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info, q, backup, q_pi_targ, tuned_indicator = compute_loss_q( data, batch_hist, t) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. (Common choice: 0.995) # # TODO: remove later # polyak = 0.4 with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) return q.cpu().detach().numpy(), backup.cpu().detach().numpy( ), q_pi_targ.cpu().detach().numpy(), tuned_indicator def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32).to(device)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). 
if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): sample_type = 'pseudo_random' # 'pseudo_random' genuine_random batch, batch_hist, batch_idxs = replay_buffer.sample_batch( batch_size, device=device, sample_type=sample_type) q, backup, q_pi_targ, tuned_indicator = update( data=batch, batch_hist=batch_hist, t=t) replay_buffer.add_sample_hist(batch_idxs, q, backup, q_pi_targ, tuned_indicator, t) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # # Save model # fpath = osp.join(logger.output_dir, 'pyt_save') # os.makedirs(fpath, exist_ok=True) # context_fname = 'checkpoint-context-' + ( # 'Step-%d' % t if t is not None else '') + '.pt' # context_fname = osp.join(fpath, context_fname) # if (epoch % save_freq == 0) or (epoch == epochs): # logger.save_state({'env': env}, None) # torch.save({'replay_buffer': replay_buffer}, context_fname) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('TunedNum', with_min_and_max=True) logger.log_tabular('THLD', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
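# A simplified numpy sketch of the target-Q clamping performed in compute_loss_q
# above: the per-sample growth of the target Q estimate relative to its stored
# history is compared against a percentile threshold, and samples that grow
# faster than the threshold are clamped. Names and edge-case handling here are
# a paraphrase of that logic, not a drop-in replacement for the original.
import numpy as np

def clamp_target_q(targ_q_hist, q_pi_targ_new, threshold_percentile=75):
    # targ_q_hist: list of per-sample histories of previous target-Q values
    # q_pi_targ_new: (B,) freshly computed target-Q values for the same samples
    change = np.array([q_pi_targ_new[i] - targ_q_hist[i][-1]
                       if len(targ_q_hist[i]) > 0 else np.nan
                       for i in range(len(q_pi_targ_new))])
    valid = change[~np.isnan(change)]
    # Threshold on how fast the target is allowed to grow (floored at zero).
    threshold = max(np.percentile(valid, threshold_percentile), 0.0) if len(valid) else 1.0
    clamped = np.array(q_pi_targ_new, dtype=float)
    tuned_indicator = np.zeros(len(q_pi_targ_new))
    for i, c in enumerate(change):
        if not np.isnan(c) and c > threshold:
            clamped[i] = targ_q_hist[i][-1] + threshold
            tuned_indicator[i] = 1
    return clamped, tuned_indicator, threshold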
def randwalk(env_fn, seed=0, steps_per_epoch=4000, epochs=50, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Random Walk (This is simply a uniform random walk!) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. """ # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 policy = Policy(env.action_space) # Set up model saving logger.setup_pytorch_saver(policy) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(steps_per_epoch): # Pick a random action within the action space a = policy.act(o) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None)
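# `randwalk` above relies on a `Policy` object with an `act` method; its
# definition is not shown here, so the following is only a plausible minimal
# sketch of a uniform-random policy over the action space.
class Policy:
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, obs):
        # The observation is ignored: every action is sampled uniformly.
        return self.action_space.sample()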
def mbfq(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=2000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=128, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, max_ep_len_ppo=50, logger_kwargs=dict(), save_freq=1, update_factor=1, device='cuda', lam=0.97, steps_per_ppo_update=1000, n_ppo_updates=1, train_pi_iters=80, target_kl=0.01, clip_ratio=0.2): device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] state_dim = {376: 144, 111: 64, 17: 12, 11: 8}[obs_dim[0]] # Create actor-critic module and target networks ac = core.MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs).to(device) model = core.FlowWorldModel(obs_dim[0], state_dim, act_dim + int(act_dim % 2)).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) ppo_buffer = PPOBuffer(obs_dim, act_dim, steps_per_ppo_update, gamma=gamma, lam=lam, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, model]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d, \t model: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_model(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] if act_dim % 2: a = torch.cat([a, torch.zeros(len(a), 1, device=a.device)], dim=1) loss, info, _ = model(o, a, r, o2, d) return loss, info # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) model_optimizer = SparseDenseAdamOptimizer(model, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) # Set up model 
saving logger.setup_pytorch_saver(ac) def update(data): loss_model, model_info = compute_loss_model(data) model_optimizer.zero_grad() loss_model.backward() core.clip_grad_norm(model.parameters(), 1000) model_optimizer.step() # Record things logger.store(LossModel=loss_model.item(), **model_info) # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): # s = model.get_state(torch.as_tensor(o, dtype=torch.float32, device=device).unsqueeze(0), batch_size=batch_size).squeeze(0) # return ac.act(o, deterministic) return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) # Set up function for computing PPO policy loss def ppo_compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss ac.pi(obs) logp = ac.pi.log_prob(act, desquash=True) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, cf=clipfrac) return loss_pi, pi_info def ppo_step(o): with torch.no_grad(): o = torch.as_tensor(o, dtype=torch.float32, device=device) a, log_pi = ac_targ.pi(o) q1_pi = ac.q1(o, a) q2_pi = ac.q2(o, a) q_pi = torch.min(q1_pi, q2_pi) v = (alpha * log_pi - q_pi).squeeze(0).cpu().numpy() return a.squeeze(0).cpu().numpy(), v, log_pi.squeeze(0).cpu().numpy() def virtual_ppo(): venv = VirtualEnv(replay_buffer, model) ac_targ.pi.load_state_dict(ac.pi.state_dict()) # Main loop: collect experience in env and update/log each epoch for epoch in range(n_ppo_updates): o, ep_ret, ep_len = venv.reset(), 0, 0 for t in tqdm(range(steps_per_ppo_update)): a, v, log_pi = ppo_step(o) next_o, r, d, _ = venv.step(a) ep_ret += r ep_len += 1 # save and log ppo_buffer.store(o, a, r, v, log_pi) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len_ppo terminal = d or timeout epoch_ended = t == steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not terminal: print( 'Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ppo_step(o) else: v = 0 ppo_buffer.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(VirtualEpRet=ep_ret, VirtualEpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! data = ppo_buffer.get() pi_l_old, pi_info_old = ppo_compute_loss_pi(data) pi_l_old = pi_l_old.item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): loss_pi, pi_info = ppo_compute_loss_pi(data) # kl = mpi_avg(pi_info['kl']) kl = pi_info['kl'] if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break pi_optimizer.zero_grad() loss_pi.backward() # mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Log changes from update kl, cf = pi_info['kl'], pi_info['cf'] logger.store(LossPi=pi_l_old, KL=kl, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old)) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every * update_factor): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() virtual_ppo() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VirtualEpRet', with_min_and_max=True) logger.log_tabular('VirtualEpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossModel', average_only=True) logger.log_tabular('reg', average_only=True) logger.log_tabular('rec', average_only=True) logger.log_tabular('loss_d', average_only=True) logger.log_tabular('loss_r', average_only=True) logger.log_tabular('kl', average_only=True) logger.log_tabular('prior_logprob', average_only=True) logger.log_tabular('log_det', average_only=True) logger.log_tabular('conditional_log_det', average_only=True) logger.log_tabular('conditional_logprob', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
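# A minimal sketch of GAE(lambda) advantage estimation, which the PPOBuffer's
# finish_path(v) call in virtual_ppo above is assumed to implement when a
# virtual trajectory terminates or is bootstrapped with a value estimate.
import numpy as np

def gae_advantages(rewards, values, last_val, gamma=0.99, lam=0.97):
    rews = np.append(rewards, last_val)
    vals = np.append(values, last_val)
    # One-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    # One common choice of value-function targets: advantages plus baselines.
    returns = adv + vals[:-1]
    return adv, returns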
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, resume=None, reinitialize_optimizer_on_resume=True, render=False, notes='', env_config=None, boost_explore=0, partial_net_load=False, num_inputs_to_add=0, episode_cull_ratio=0, try_rollouts=0, steps_per_try_rollout=0, take_worst_rollout=False, shift_advs_pct=0, **kwargs): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. 
train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. resume (str): Path to directory with simple_save model info you wish to resume from reinitialize_optimizer_on_resume: (bool) Whether to initialize training state in the optimizers, i.e. the individual learning rates for weights in Adam render: (bool) Whether to render the env during training. Useful for checking that resumption of training caused visual performance to carry over notes: (str) Experimental notes on what this run is testing env_config (dict): Environment configuration pass through boost_explore (float): Amount to increase std of actions in order to reinvigorate exploration. partial_net_load (bool): Whether to partially load the network when resuming. https://pytorch.org/tutorials/beginner/saving_loading_models.html#id4 num_inputs_to_add (int): Number of new inputs to add, if resuming and partially loading a new network. episode_cull_ratio (float): Ratio of bad episodes to cull from epoch try_rollouts (int): Number of times to sample actions steps_per_try_rollout (int): Number of steps per attempted rollout take_worst_rollout (bool): Use worst rollout in training shift_advs_pct (float): Action should be better than this pct of actions to be considered advantageous. """ config = deepcopy(locals()) # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) import_custom_envs() # Instantiate environment env = env_fn() if hasattr(env.unwrapped, 'configure_env'): env.unwrapped.configure_env(env_config) obs_dim = env.observation_space.shape act_dim = env.action_space.shape num_agents = getattr(env, 'num_agents', 1) if hasattr(env.unwrapped, 'logger'): print('Logger set by environment') logger_kwargs['logger'] = env.unwrapped.logger logger = EpochLogger(**logger_kwargs) logger.add_key_stat('won') logger.add_key_stat('trip_pct') logger.add_key_stat('HorizonReturn') logger.save_config(config) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, num_inputs_to_add=num_inputs_to_add, **ac_kwargs) # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Resume if resume is not None: ac, pi_optimizer, vf_optimizer = get_model_to_resume( resume, ac, pi_lr, vf_lr, reinitialize_optimizer_on_resume, actor_critic, partial_net_load, num_inputs_to_add) if num_inputs_to_add: add_inputs(ac, ac_kwargs, num_inputs_to_add) if boost_explore: boost_exploration(ac, boost_explore) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = ppo_buffer_factory(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, num_agents, shift_advs_pct, cull_ratio=episode_cull_ratio) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Sync params across processes sync_params(ac) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for 
interaction with environment start_time = time.time() o, r, d = reset(env) effective_horizon = round(1 / (1 - gamma)) effective_horizon_rewards = [] for _ in range(num_agents): effective_horizon_rewards.append(deque(maxlen=effective_horizon)) if hasattr(env, 'agent_index'): agent_index = env.agent_index agent = env.agents[agent_index] is_multi_agent = True else: agent_index = 0 agent = None is_multi_agent = False def get_action_fn(_obz): return ac.step(torch.as_tensor(_obz, dtype=torch.float32)) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): epoch_episode = 0 info = {} epoch_ended = False step_num = 0 ep_len = 0 ep_ret = 0 while not epoch_ended: if try_rollouts != 0: # a, v, logp, next_o, r, d, info # a, v, logp, obs, r, done, info rollout = do_rollouts( get_action_fn, env, o, steps_per_try_rollout, try_rollouts, take_worst_rollout) else: a, v, logp = get_action_fn(o) # NOTE: For multi-agent, steps current agent, # but returns values for next agent (from its previous action)! # TODO: Just return multiple agents observations next_o, r, d, info = env.step(a) if render: env.render() curr_reward = env.curr_reward if is_multi_agent else r # save and log buf.store(o, a, curr_reward, v, logp, agent_index) logger.store(VVals=v) # Update obs (critical!) o = next_o if 'stats' in info and info['stats']: # TODO: Optimize this logger.store(**info['stats']) if is_multi_agent: agent_index = env.agent_index agent = env.agents[agent_index] # TODO: Store vector of these for each agent when changing step API ep_len = agent.episode_steps ep_ret = agent.episode_reward else: ep_len += 1 ep_ret += r calc_effective_horizon_reward( agent_index, effective_horizon_rewards, logger, r) timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = buf.epoch_ended(step_num) if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(agent_index, v) if terminal: buf.record_episode(ep_len=ep_len, ep_ret=ep_ret, step_num=step_num) # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if 'stats' in info and info['stats'] and info['stats']['done_only']: logger.store(**info['stats']['done_only']) o, r, d = reset(env) if not is_multi_agent: ep_len = 0 ep_ret = 0 step_num += 1 buf.prepare_for_update() # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('DateTime', get_date_str()) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('HorizonReturn', with_min_and_max=True) if getattr(env.unwrapped, 'is_deepdrive', False): logger.log_tabular('trip_pct', with_min_and_max=True) logger.log_tabular('collided') logger.log_tabular('harmful_gs') logger.log_tabular('timeup') logger.log_tabular('exited_lane') logger.log_tabular('circles') logger.log_tabular('skipped') logger.log_tabular('backwards') logger.log_tabular('won') if 'stats' in info and info['stats']: for stat, value in info['stats'].items(): logger.log_tabular(stat, with_min_and_max=True) if logger.best_category or (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(dict(env=env), pytorch_save=dict( ac=ac.state_dict(), pi_optimizer=pi_optimizer.state_dict(), vf_optimizer=vf_optimizer.state_dict(), epoch=epoch, stats=logger.epoch_dict, ), itr=None, best_category=logger.best_category) logger.dump_tabular()
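# A hedged sketch of the resume path described in the docstring above: the
# checkpoint keys mirror what save_state stores (ac, pi_optimizer, vf_optimizer,
# epoch), but `load_checkpoint` and its exact file layout are assumptions, not
# the project's get_model_to_resume implementation.
import torch

def load_checkpoint(path, ac, pi_optimizer, vf_optimizer,
                    reinitialize_optimizer_on_resume=True):
    ckpt = torch.load(path, map_location='cpu')
    ac.load_state_dict(ckpt['ac'])
    if not reinitialize_optimizer_on_resume:
        # Also restore per-parameter Adam state (moments, step counters),
        # i.e. the "individual learning rates" mentioned in the docstring.
        pi_optimizer.load_state_dict(ckpt['pi_optimizer'])
        vf_optimizer.load_state_dict(ckpt['vf_optimizer'])
    return ckpt.get('epoch', 0)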
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, device='cuda', override=True): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. 
with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): # logger.save_state({'env': env, 'rb': replay_buffer.get_state()}, None) logger.save_state({'env': env}, None if override else epoch) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
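# The polyak-averaged target update appears verbatim in several of the
# functions above; factored out here for clarity as a small helper (a sketch,
# not a utility the original code defines).
import torch

def soft_update(net, targ_net, polyak=0.995):
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), targ_net.parameters()):
            # theta_targ <- polyak * theta_targ + (1 - polyak) * theta
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)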
def memb(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), model=core.MLPModel, seed=0, steps_per_epoch=1000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, model_lr=3e-4, value_lr=1e-3, pi_lr=3e-4, alpha=0.4, batch_size=100, start_steps=1000, max_ep_len=1000, save_freq=1, train_model_epoch=5, test_freq=5, save_epoch=100, exp_name='', env_name='', logger_kwargs=dict()): ## Added by Rami >> ## logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) ## << Added by Rami ## torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape #@! act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks # [pi, q1, q2, v or v')] = MLPActorCritic(obs_space, act_space) ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # we need a separate target network; bc it's optmz differnetly # [_, _, _, v_targ] = MLPActorCritic(obs2_space, act_space) ac_targ = deepcopy(ac) # Create model module # [transiton, reward] = MLPModel(obs_space, act_space) md = model(env.observation_space, env.action_space) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for all Value-networks (save this for convenience) val_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters(), ac.v.parameters()) # List of parameters for all Model-networks (save this for convenience) md_params = itertools.chain(md.transition.parameters(), md.reward.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) ## Added by Rami >> ## # Count variables # var_counts = tuple(core.count_vars(scope) for scope in ['main/dm', 'main/rm', 'main/pi', 'main/v', 'main/q1', 'main/q2', 'main']) var_counts = tuple( core.count_vars(module) for module in [md.transition, md.reward, ac.pi, ac.q1, ac.q2, ac.v, md, ac]) # print('\nNumber of parameters: \t dm: %d, \t rm: %d, \t pi: %d, \t v: %d, \t q1: %d, \t q2: %d, \t total: %d\n'%var_counts) logger.log( '\nNumber of parameters: \t dm: %d, \t rm: %d, \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d+%d\n' % var_counts) ## << Added by Rami ## # TD3 style Q function updates # ## Optimized costs\losses ## # o, a, r, o2, d = data['obs'], # data['act'], # data['rew'], # data['obs2'], # data['done'] # Set up function for computing Rew/Dyn model-losses ### Model/Reward losses (supervised learning): # loss = 0.5*(actual-prediction)^2 } # Jp(omega) = 0.5 Expt_D[(f(s,a)-s')^2] --> eq#4.a # Jr(ph) = Expt_D[(r(s,a)-r)^2] --> eq#4.b # min_omeg,ph{ Jp(omeg), Jr(ph) } def compute_loss_model(data): # Rami (Done) o, a, r, o2 = data['obs'], data['act'], data['rew'], data['obs2'] transition = md.transition(o, a) r_rm = md.reward(o, a) transition_backup = o2 r_backup = r loss_transition = ((transition_backup - transition)**2).mean() loss_r = ((r_backup - r_rm)**2).mean() loss_model = loss_transition + loss_r # Useful info for logging model_info = dict(Dyn=transition.detach().numpy(), Rew=r_rm.detach().numpy()) return loss_model, model_info # Set up function for computing pi loss ### Policy loss ### # State value-function of st: # V(st) = Expt_pi[Q(st,at) - log pi(at|st)] --> eq#3.b, # Policy learning's Soft Bellman eq (Reparameterization): # V(s) = Expt_pi[Expt_rm[r_hat(s,pi] # - alpha*log pi(a|s) # + 
gamma*Expt_f[V'(f(s,pi))]] --> eq#8 # Optz pi--> max_pi{ Expt_s~D[V(s)] } def compute_loss_pi(data): # Rami (Done) o, d = data['obs'], data['done'] # use the batch done flags (not the loop-level d from the enclosing scope) pi, logp_pi = ac.pi(o) transition_pi = md.transition(o, pi) r_rm_pi = md.reward(o, pi) v_prime = ac.v(transition_pi) # Entropy-regularized policy loss loss_pi = -(r_rm_pi - alpha * logp_pi + gamma * (1 - d) * v_prime).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up function for computing Q,V value-losses ### Value functions losses ### # Optz--> min_phi,psi{ Jq(phi),Jv(psi) } def compute_loss_val(data): # Rami (Done) o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] # pi, logp_pi = ac.pi(o) # Optimized functions q1 = ac.q1(o, a) q2 = ac.q2(o, a) v = ac.v(o) # q1_pi = ac.q1(o,pi) # q2_pi = ac.q2(o,pi) # min_q_pi = torch.min(q1_pi, q2_pi) # Bellman backup for Value functions with torch.no_grad(): # Target value function pi, logp_pi = ac.pi(o) v_targ = ac_targ.v(o2) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) min_q_pi = torch.min(q1_pi, q2_pi) q_backup = r + gamma * (1 - d) * v_targ # By Rami v_backup = min_q_pi - alpha * logp_pi # By Rami # MSE loss against Bellman backup loss_q1 = ((q_backup - q1)**2).mean() loss_q2 = ((q_backup - q2)**2).mean() loss_v = ((v_backup - v)**2).mean() loss_val = loss_q1 + loss_q2 + loss_v # Useful info for logging val_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy(), V_Vals=v.detach().numpy()) return loss_val, val_info # Set up optimizers for model, policy and value-functions model_optimizer = Adam(md_params, lr=model_lr) # Rami pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) val_optimizer = Adam(val_params, lr=value_lr) # Rami # Set up model saving logger.setup_pytorch_saver(ac) def updateAC(data): # Rami (Done) # print("AC updating..") # First run one gradient descent step for Q1, Q2, and V val_optimizer.zero_grad() loss_val, val_info = compute_loss_val(data) loss_val.backward() # Descent val_optimizer.step() # Record things logger.store(LossVal=loss_val.item(), **val_info) # Freeze Value-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in val_params: p.requires_grad = False # Freeze Model-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in md_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) (loss_pi).backward() # Descent on loss_pi, i.e. ascent on the soft value pi_optimizer.step() # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Unfreeze Value-networks so you can optimize them at the next update step. for p in val_params: p.requires_grad = True # Unfreeze Model-networks so you can optimize them at the Model update step. for p in md_params: p.requires_grad = True # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors.
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) # print("..AC updated") def updateModel(data): # Rami (Done) # print("Model updating..") # Run one gradient descent step for model model_optimizer.zero_grad() loss_model, model_info = compute_loss_model(data) loss_model.backward() # Descent model_optimizer.step() # Record things logger.store(LossModel=loss_model.item(), **model_info) # logger.store(LossRew=loss_model.item(), **model_info) # print("..Model updated") def get_action(o, deterministic=False): # Rami (Done) return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(epoch, n=1): # (Done) global mu, pi, q1, q2, q1_pi, q2_pi total_reward = 0 for j in range(n): # repeat n=5 times o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 total_reward += ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) ## By Rami # print('The '+str(epoch)+' epoch is finished!') # print('The test reward is '+str(total_reward/n)) return total_reward / n start_time = time.time() ## Rami o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs reward_recorder = [] # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ The algorithm would take total_steps totally in the training """ # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Random for 1k (epoch 1) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Don't let the env done if just reach max_ep_length # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical!, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if [(env is done) or (max_ep_legth reached)] if d or (ep_len == max_ep_len): ## Added by Rami >> ## logger.store(EpRet=ep_ret, EpLen=ep_len) ## << Added by Rami ## o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Learning/Training # Train pi, Q, and V after 5 epochs for 5 times, # Train dyn/rew models from start: if t // steps_per_epoch > train_model_epoch: # if epoch > 5 # Train 5 steps of Q, V, and pi, # then train 1 step of model. 
for j in range(5): batch = replay_buffer.sample_batch(batch_size) updateAC(data=batch) updateModel(data=batch) # Rami else: # pretrain the model batch = replay_buffer.sample_batch(batch_size) updateModel(data=batch) # Rami # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch ## Added by Rami >> ## # Save model after each epoch: if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) ## << Added by Rami ## # if epoch > 5 and epoch % 10 == 0 start to test the agent: if epoch > train_model_epoch and epoch % test_freq == 0: # if epoch > train_model_epoch and epoch % 1 == 0: # test the agent when we reach the test_freq: reward_test = test_agent(epoch) # save the experiment result: # reward_recorder.append(reward_test) # reward_nparray = np.asarray(reward_recorder) # np.save(str(exp_name)+'_'+str(env_name)+'_'+str(save_freq)+'.npy',reward_nparray) ## Added by Rami >> ## logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpRet', with_min_and_max=True) # if n=1 no variance logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('LossQ1', average_only=True) # logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('V_Vals', with_min_and_max=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossVal', average_only=True) ## Added by Rami >> ## # logger.log_tabular('DynM', with_min_and_max=True) # logger.log_tabular('RewM', with_min_and_max=True) # logger.log_tabular('LossModel', average_only=True) logger.log_tabular('LossDyn', average_only=True) logger.log_tabular('LossRew', average_only=True) ## << Added by Rami ## logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
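# --- Illustrative sketch (not part of the original MEMB code) --------------
# The one-step, model-based objective that compute_loss_pi above optimizes:
# maximize r_hat(s, a) - alpha*logp(a|s) + gamma*(1 - d)*V'(f_hat(s, a)).
# The tiny linear layers below are toy stand-ins for md.reward, md.transition
# and ac.v; shapes and hyperparameters are illustrative only.
import torch
import torch.nn as nn

_obs_dim, _act_dim, _batch = 3, 2, 5
_reward_model = nn.Linear(_obs_dim + _act_dim, 1)
_transition_model = nn.Linear(_obs_dim + _act_dim, _obs_dim)
_value_fn = nn.Linear(_obs_dim, 1)

def model_based_pi_loss(o, a, logp_a, d, alpha=0.4, gamma=0.99):
    oa = torch.cat([o, a], dim=-1)
    r_hat = _reward_model(oa).squeeze(-1)        # predicted reward r_hat(s, a)
    o2_hat = _transition_model(oa)               # predicted next state f_hat(s, a)
    v_prime = _value_fn(o2_hat).squeeze(-1)      # V'(f_hat(s, a))
    # minimizing this loss is gradient ascent on the soft value estimate
    return -(r_hat - alpha * logp_a + gamma * (1 - d) * v_prime).mean()

print(model_based_pi_loss(torch.randn(_batch, _obs_dim), torch.randn(_batch, _act_dim),
                          torch.randn(_batch), torch.zeros(_batch)))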
def my_td3(env_fn, seed=0, steps_per_epoch=4000, epochs=100, max_ep_len=1000, hidden_sizes=[256, 256], logger_kwargs=dict(), save_freq=1, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, gamma=0.99, polyak=0.995, act_noise=0.1, pi_lr=1e-3, q_lr=1e-3, buffer_size=int(1e6), target_noise=0.2, noise_clip=0.5, policy_delay=2): """ My TD3 implementation """ # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() test_env = env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] print("env.observation_space", env.observation_space) print("env.observation_space.shape", env.observation_space.shape) print("env.action_space", env.action_space) action_min = env.action_space.low[0] action_max = env.action_space.high[0] if isinstance(env.action_space, gym.spaces.Discrete): print("Discrete action space not supported for my_td3!") return # Set up experience buffer buf = ReplayBuffer(obs_dim, act_dim, buffer_size) # Instantiate models assert action_max == abs(action_min) policy = DeterministicPolicyNet(obs_dim, act_dim, hidden_sizes, action_max) policy_target = copy.deepcopy(policy) policy_optimizer = torch.optim.Adam(policy.mu_net.parameters(), lr=pi_lr) # Two Q-functions for Double Q Learning q_function_1 = QNet(obs_dim, act_dim, hidden_sizes) q_function_target_1 = copy.deepcopy(q_function_1) q_optimizer_1 = torch.optim.Adam(q_function_1.q_net.parameters(), lr=q_lr) q_function_2 = QNet(obs_dim, act_dim, hidden_sizes) q_function_target_2 = copy.deepcopy(q_function_2) q_optimizer_2 = torch.optim.Adam(q_function_2.q_net.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(policy) # TODO: Save value network as well # Freeze target networks with respect to optimizers (only update via polyak averaging) for p_targ in policy_target.parameters(): p_targ.requires_grad = False for q_targ in q_function_target_1.parameters(): q_targ.requires_grad = False for q_targ in q_function_target_2.parameters(): q_targ.requires_grad = False # Prepare for interaction with environment num_steps = epochs * steps_per_epoch start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for step in range( num_steps ): # TODO: Change to for loop over range(epochs) and range(steps_per_epoch) with torch.no_grad(): if step < start_steps: # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). a = env.action_space.sample() else: assert o.shape == (obs_dim, ) a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0)) assert a.shape == (1, act_dim) a = a[0] # Remove batch dimension a = torch.clamp(a + act_noise * torch.randn(act_dim), action_min, action_max) # Add exploration noise a = a.numpy() # Convert to numpy next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d buf.store(o, a, r, next_o, d) # Update obs (critical!) 
o = next_o # Trajectory finished if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 if step >= update_after and step % update_every == 0: for j in range(update_every): def update(): o, a, r, next_o, d = buf.sample_batch(batch_size) # Compute targets with torch.no_grad(): next_a_targ = policy_target(next_o) # TD3 modification 1: Target policy smoothing eps = torch.clamp( torch.randn_like(next_a_targ) * target_noise, -noise_clip, noise_clip) next_a_targ = torch.clamp(next_a_targ + eps, action_min, action_max) # Clipped Double Q-Learning next_q_targ_1 = q_function_target_1( next_o, next_a_targ) next_q_targ_2 = q_function_target_2( next_o, next_a_targ) next_q_targ = torch.min(next_q_targ_1, next_q_targ_2) q_targ_1 = r + gamma * (1 - d) * next_q_targ q_targ_2 = r + gamma * (1 - d) * next_q_targ # Update Q functions q_optimizer_1.zero_grad() q_loss_1 = ((q_function_1(o, a) - q_targ_1)**2).mean() q_loss_1.backward() q_optimizer_1.step() q_optimizer_2.zero_grad() q_loss_2 = ((q_function_2(o, a) - q_targ_2)**2).mean() q_loss_2.backward() q_optimizer_2.step() # Delayed policy updates if j % policy_delay == 0: # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in q_function_1.parameters(): p.requires_grad = False for p in q_function_2.parameters(): p.requires_grad = False # Policy function update policy_optimizer.zero_grad() policy_loss = -(q_function_1(o, policy(o))).mean() policy_loss.backward() policy_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in q_function_1.parameters(): p.requires_grad = True for p in q_function_2.parameters(): p.requires_grad = True # Update target networks with polyak with torch.no_grad(): for p, p_targ in zip(policy.parameters(), policy_target.parameters()): p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) for q, q_targ in zip( q_function_1.parameters(), q_function_target_1.parameters()): q_targ.data.mul_(polyak) q_targ.data.add_((1 - polyak) * q.data) for q, q_targ in zip( q_function_2.parameters(), q_function_target_2.parameters()): q_targ.data.mul_(polyak) q_targ.data.add_((1 - polyak) * q.data) update() if (step + 1) % steps_per_epoch == 0: epoch = (step + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. def test_agent(): with torch.no_grad(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy( torch.tensor(o, dtype=torch.float32).unsqueeze(0)) assert a.shape == (1, act_dim) a = a[0] # Remove batch dimension a = a.numpy() # Convert to numpy o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', step) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
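# --- Illustrative sketch (not part of the original my_td3 code) ------------
# The two TD3-specific pieces of the nested update() above, pulled out as a
# standalone target computation: clipped Gaussian noise on the target action
# (target policy smoothing) followed by a min over two target critics
# (clipped double-Q). The callables passed in stand in for the policy_target
# and q_function_target_* networks defined above.
import torch

def td3_backup(r, d, next_o, policy_target, q_targ_1, q_targ_2,
               gamma=0.99, target_noise=0.2, noise_clip=0.5,
               action_min=-1.0, action_max=1.0):
    with torch.no_grad():
        a2 = policy_target(next_o)
        eps = torch.clamp(torch.randn_like(a2) * target_noise, -noise_clip, noise_clip)
        a2 = torch.clamp(a2 + eps, action_min, action_max)
        q_next = torch.min(q_targ_1(next_o, a2), q_targ_2(next_o, a2))
        return r + gamma * (1 - d) * q_next

# toy usage with stand-in callables
print(td3_backup(torch.ones(4), torch.zeros(4), torch.randn(4, 3),
                 policy_target=lambda o: torch.tanh(o[:, :2]),
                 q_targ_1=lambda o, a: o.sum(-1) + a.sum(-1),
                 q_targ_2=lambda o, a: o.sum(-1) - a.sum(-1)).shape)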
def ppo(env, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=2048, epochs=250, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=100, train_v_iters=70, lam=0.95, max_ep_len=512, target_kl=0.005, logger_kwargs=dict(), save_freq=5): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env( "PandaPegIn", has_offscreen_renderer=True, # has_renderer=True, use_camera_obs=False, control_freq=100, ) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Load a pretrained model # fname = "data/ppo_peg_in_add_delta_pos_plus_plus/ppo_peg_in_add_delta_pos_plus_plus_s0/pyt_save/model24.pt" # pre_model = torch.load(fname) # ac.pi = pre_model.pi # ac.v =pre_model.v # Use TensorboardX writer = logger.create_writer() # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi( obs, act ) # only the pi network changes here; data['obs'], data['act'], data['adv'], data['logp'] stay fixed within one update ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # loss_pi is to be minimized; pi_info holds the KL divergence, entropy and clip fraction (all for this update) # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # data is refreshed once per update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): # reduce the loss as far as possible while the KL divergence stays within bounds pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.'
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # print(i,':',loss_v) # print('='*20) # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # move to the initial position pre_action = [0, 0, 0] for i in range(4): o, _, _, _ = env.step(pre_action) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): print("epoch:", epoch) for t in range(local_steps_per_epoch): # if( t == steps_per_epoch/2 ): # print("Half finished!") # the policy and value networks produce: the action, the value estimate, and the log-prob of taking that action a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r # per-episode return ep_len += 1 # per-episode length # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: # episode done; time limit hit; or epoch ended if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) # compute GAE and rewards-to-go # print("steps:",t) # print("done",d) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update! update() # Write data to TensorboardX stats_to_write = logger.get_stats('EpRet') writer.add_scalar('AverageEpRet', stats_to_write[0], global_step=(epoch + 1) * 2048) # Log info about epoch (one epoch's worth of data) logger.log_tabular('Epoch', epoch) # epoch index logger.log_tabular('EpRet', with_min_and_max=True) # max/min/mean of the episode return at episode end logger.log_tabular('EpLen', average_only=True) # mean episode length logger.log_tabular('VVals', with_min_and_max=True) # max/min/mean of the value estimates logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) # total steps so far logger.log_tabular('LossPi', average_only=True) # policy loss at the start of the update logger.log_tabular('LossV', average_only=True) # value loss at the start of the update logger.log_tabular('DeltaLossPi', average_only=True) # policy loss after the update minus before logger.log_tabular('DeltaLossV', average_only=True) # value loss after the update minus before logger.log_tabular('Entropy', average_only=True) # policy entropy
logger.log_tabular('KL', average_only=True) # KL divergence logger.log_tabular('ClipFrac', average_only=True) # clip fraction logger.log_tabular('StopIter', average_only=True) # number of PPO policy iterations actually taken logger.log_tabular('Time', time.time() - start_time) # elapsed time logger.dump_tabular() # if __name__ == '__main__': # import argparse # parser = argparse.ArgumentParser() # parser.add_argument('--env', type=str, default='HalfCheetah-v2') # parser.add_argument('--hid', type=int, default=64) # parser.add_argument('--l', type=int, default=2) # parser.add_argument('--gamma', type=float, default=0.99) # parser.add_argument('--seed', '-s', type=int, default=0) # parser.add_argument('--cpu', type=int, default=1) # parser.add_argument('--steps', type=int, default=4000) # parser.add_argument('--epochs', type=int, default=50) # parser.add_argument('--exp_name', type=str, default='ppo') # args = parser.parse_args() # mpi_fork(args.cpu) # run parallel code with mpi # from spinup.utils.run_utils import setup_logger_kwargs # logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) # ppo(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic, # ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, # seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs, # logger_kwargs=logger_kwargs)
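# --- Illustrative sketch (not part of the original PPO code) ---------------
# The clipped surrogate objective that compute_loss_pi above implements, on
# synthetic tensors; nothing here touches the real buffer or networks.
import torch

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    return -torch.min(ratio * adv, clip_adv).mean()

_logp_old = torch.randn(8)
_logp = _logp_old + 0.05 * torch.randn(8)   # a new policy still close to the old one
print(ppo_clip_loss(_logp, _logp_old, torch.randn(8)))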
def eglu(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.2, n_explore=32, device='cuda'): device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing EGL mean-gradient-losses def compute_loss_g(data): o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] a2 = ball_explore(a1, n_explore, eps) a2 = a2.view(n_explore * len(r), act_dim) o_expand = repeat_and_reshape(o, n_explore) # Bellman backup for Q functions with torch.no_grad(): q1 = ac.q1(o_expand, a2) q2 = ac.q2(o_expand, a2) q_dither = torch.min(q1, q2) # Target actions come from *current* policy a_tag, logp_a_tag = ac.pi(o_tag) # Target Q-values q1_pi_targ = ac_targ.q1(o_tag, a_tag) q2_pi_targ = ac_targ.q2(o_tag, a_tag) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag) q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) a1_in = autograd.Variable(a1.data, requires_grad=True) q1 = ac.q1(o, a1_in) q2 = ac.q2(o, a1_in) qa = torch.min(q1, q2).unsqueeze(-1) geps = autograd.grad(outputs=qa, inputs=a1_in, grad_outputs=torch.cuda.FloatTensor( qa.size()).fill_(1.), create_graph=False, retain_graph=True, only_inputs=True)[0] geps = repeat_and_reshape(geps, n_explore) a1 = repeat_and_reshape(a1, n_explore) geps = (geps * (a2 - a1)).sum(-1) # l1 loss against Bellman backup loss_g = 
F.smooth_l1_loss(geps, q_dither - q_anchor) # Useful info for logging g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) return loss_g, g_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() # Next run one gradient descent step for the mean-gradient loss_g, g_info = compute_loss_g(data) # Record things logger.store(LossG=loss_g.item(), **g_info) q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) # Record things logger.store(LossQ=loss_q.item(), **q_info) loss_q = loss_q + loss_g loss_q.backward() q_optimizer.step() # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in ac.geps.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in ac.geps.parameters(): p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
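# --- Illustrative sketch (not part of the original eglu code) --------------
# ball_explore is not defined in this file; the function below is one
# plausible, hypothetical reading of it, matching how compute_loss_g above
# uses it: draw n_explore perturbed copies of each action uniformly inside an
# eps-ball before querying the critics at those dithered actions.
import torch

def ball_explore_sketch(a, n_explore, eps):
    # (batch, act_dim) -> (n_explore, batch, act_dim), uniform noise in an L_inf eps-ball
    a_rep = a.unsqueeze(0).expand(n_explore, *a.shape)
    noise = (torch.rand_like(a_rep) * 2.0 - 1.0) * eps
    return (a_rep + noise).clamp(-1.0, 1.0)

print(ball_explore_sketch(torch.zeros(4, 2), n_explore=8, eps=0.2).shape)  # torch.Size([8, 4, 2])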
def dqn(env_fn, q_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e4), gamma=0.99, polyak=0.995, q_lr=1e-3, batch_size=128, start_steps=0, update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, tf_logger='logs/dqn/'): tf_logger = tf_logger[:-1] + datetime.now().strftime('%Y%m%d%H%M%S') + '/' # tt() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # set up tensorboard parameters: torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] # only be applicable for discrete action act_dim = env.action_space.n # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # q_func_hidden_size = (256, 128, 64) q_func = core.MLPQFunction(obs_dim, act_dim, **q_kwargs).to(device) with SummaryWriter(log_dir=tf_logger, comment="DQN_graph") as w: dummy_input =torch.rand((1, obs_dim),dtype=torch.float32).to(device) w.add_graph(q_func, dummy_input) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(m) for m in [q_func]) q_target = deepcopy(q_func) q_target.eval() logger.log('\nNumber of parameters: q: %d\n' % var_counts) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in q_target.parameters(): p.requires_grad = False # #test q_func # qv = q_func(torch.randn([128, 4])) # action = torch.randint(high=1, low=0, size = [128, 2]) # Set up function for computing double Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] r = Variable(r) o2 = Variable(o2) a = Variable(a) o = Variable(o) d = Variable(d) q = q_func(o).gather(1, a.unsqueeze(1)) # Bellman backup for Q function with torch.no_grad(): q_targ = q_target(o2).detach().max(1)[0] backup = r + gamma * (1 - d) * q_targ # MSE loss against Bellman backup loss_q = ((q - backup) ** 2).mean() # Useful info for logging loss_info = dict(QVals=q.cpu().detach().numpy()) return loss_q, loss_info # TODO: change learning rate q_optimizer = Adam(q_func.parameters()) # q_optimizer = RMSprop(q_func.parameters(), lr=0.00025, alpha=0.95, eps=0.01) # Set up model saving logger.setup_pytorch_saver(q_func) def update(data): q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() for param in q_func.parameters(): param.grad.data.clamp_(-1, 1) q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(q_func.parameters(), q_target.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) return loss_q.item() # TODO: the initial value for exploration? # exploration = core.LinearScheduler(steps_per_epoch * epochs, init_value=0.9, final_value=0.05) exploration = core.ExpScheduler(init_value=0.9, final_value=0.05, decay=200) def get_action(obs, t): sample = random.random() eps_threshold = exploration.value(t) writer.add_scalar(tag="epsilon", scalar_value=eps_threshold, global_step=t) if sample > eps_threshold: with torch.no_grad(): # t.max(1) will return largest column value of each row. 
# second column on max result is index of where max element was # found, so we pick action with the larger expected reward. # return policy_net(state).max(1)[1].view(1, 1) obs = torch.from_numpy(obs).unsqueeze(0).type(torch.float32) # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history action = q_func(Variable(obs)).data.max(1)[1].item() return action else: return env.action_space.sample() def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) obs = torch.from_numpy(o).unsqueeze(0).type(torch.float32) action = q_func(Variable(obs)).data.max(1)[1].cpu().numpy().squeeze() o, r, d, _ = test_env.step(action) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 writer = SummaryWriter(logdir=tf_logger) # Main loop: collect experience in env and update/log each epoch epoch_counter = 0 for t in range(total_steps): if t > start_steps: a = get_action(o, (t + 1) // steps_per_epoch) else: a = env.action_space.sample() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) o = o2 if d or ep_len == max_ep_len: logger.store(EpRet=ep_ret, EpLen=ep_len) epoch_counter += 1 writer.add_scalar('train_reward', ep_ret, global_step=epoch_counter) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling loss_q = 0 if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) loss_q += update(data=batch) loss_q /= update_every # core.tensorboard_logger(logdir=tf_logger, scalar=loss_q, step=t, tag='q_loss') writer.add_scalar('q_loss', loss_q, global_step=t) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # core.tensorboard_logger(logdir=tf_logger, scalar=ep_ret, step=epoch, tag='train_reward') if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) test_agent() logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() writer.close() return
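# --- Illustrative sketch (not part of the original DQN code) ---------------
# core.ExpScheduler is not shown in this file; the class below is a
# hypothetical stand-in with the same constructor arguments, decaying epsilon
# exponentially from init_value toward final_value with time constant `decay`.
import math

class ExpSchedulerSketch:
    def __init__(self, init_value=0.9, final_value=0.05, decay=200):
        self.init_value, self.final_value, self.decay = init_value, final_value, decay

    def value(self, t):
        # epsilon(t) = final + (init - final) * exp(-t / decay)
        return self.final_value + (self.init_value - self.final_value) * math.exp(-t / self.decay)

_sched = ExpSchedulerSketch()
print([round(_sched.value(t), 3) for t in (0, 200, 1000)])  # [0.9, 0.363, 0.056]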
class SingleTaskDDPG(Approach): def __init__(self, action_space, observation_space, rng, eps=0.9, discount_factor=0.99, alpha=1e-3): self.rng = rng logger_kwargs = setup_logger_kwargs('SingleTaskDDPG', self.rng) self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.actor_critic = MLPActorCritic # ac_kwargs=dict() ****?????***** # seed=0 self.replay_size = int(1e6) self.polyak = 0.995 self.gamma = discount_factor self.pi_lr = alpha self.q_lr = alpha self.batch_size = 100 self.start_steps = 10000 self.update_after = 1000 self.update_every = 50 self.act_noise = 0.1 self.step_count = 0 self.action_space = action_space self.observation_space = observation_space # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32) #fix # torch.manual_seed(seed) # np.random.seed(seed) # self.obs_dim = self.observation_space.shape self.act_dim = self.action_space.shape[0] # act_dim = self.action_space.n # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.action_space.high[0] self.net = False def init_net(self, state): self.obs_dim = state.shape # Create actor-critic module and target networks self.ac = self.actor_critic(self.obs_dim[0], self.action_space) #took out ac_kwargs self.ac_targ = deepcopy(self.ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=self.replay_size) # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.pi_lr) self.q_optimizer = Adam(self.ac.q.parameters(), lr=self.q_lr) self.logger.setup_pytorch_saver(self.ac) self.net = True def observe(self, state, action, next_state, reward, done): state = self.process_state(state) next_state = self.process_state(next_state) self.replay_buffer.store(state, action, reward, next_state, done) if self.step_count >= self.update_after and self.step_count % self.update_every == 0: for _ in range(self.update_every): batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch) # Set up function for computing DDPG Q-loss def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q = self.ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = self.ac_targ.q(o2, self.ac_targ.pi(o2)) backup = r + self.gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(self, data): o = data['obs'] q_pi = self.ac.q(o, self.ac.pi(o)) return -q_pi.mean() def update(self, data): # First run one gradient descent step for Q. self.q_optimizer.zero_grad() loss_q, loss_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in self.ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. self.pi_optimizer.zero_grad() loss_pi = self.compute_loss_pi(data) loss_pi.backward() self.pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. 
for p in self.ac.q.parameters(): p.requires_grad = True self.logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, state, exploit=False): processed_state = self.process_state(state) if not self.net: self.init_net(processed_state) # state is actually observation self.step_count += 1 if self.step_count <= self.start_steps: return self.action_space.sample() a = self.ac.act(torch.as_tensor(processed_state, dtype=torch.float32)) if not exploit: a += self.act_noise * np.random.randn(self.act_dim) return np.clip(a, -self.act_limit, self.act_limit) def reset(self, reward_function): self.reward_function = reward_function self.net = False # self.step_count = 0 def process_state(self, state): return state def log(self, returns, task): self.logger.store(EpRet=sum(returns), EpLen=len(returns)) self.logger.save_state({'env': task}, None) self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TotalEnvInteracts', self.step_count) self.logger.log_tabular('QVals', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.dump_tabular() def load(self, file, task): # model = torch.load(file) # s = () # for param_tensor in model.state_dict(): # s+=(param_tensor, "\t", model.state_dict()[param_tensor].size()) # return s # model = self.actor_critic(17, self.action_space) # model.load_state_dict(torch.load(file)) self.ac = torch.load(file) self.ac.eval() self.net = True state = task.reset(self.rng) self.reward_function = task.reward_function images = [] for i in range(100): action = self.get_action(state, True) state, reward, done, _ = task.step(action) im = task.render(mode='rgb_array') images.append(im) if done: break imageio.mimsave('figures/DDPG/oracle.mp4', images)
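# --- Illustrative sketch (not part of the original SingleTaskDDPG code) ----
# A hypothetical driver loop for an Approach-style agent like the class above:
# act, step the task, then feed the transition back through observe(). The
# `task` object and its gym-like reset()/step() API are assumptions here, not
# something defined in this file.
def run_episode(agent, task, max_steps=1000):
    state, ep_ret = task.reset(), 0.0
    for _ in range(max_steps):
        action = agent.get_action(state)                         # exploratory action
        next_state, reward, done, _ = task.step(action)
        agent.observe(state, action, next_state, reward, done)   # store transition, maybe update
        state, ep_ret = next_state, ep_ret + reward
        if done:
            break
    return ep_ret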
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, beta=0.01, clip_ratio=0.2, pi_lr=3e-4, vf_lr=3e-4, train_pi_iters=80, train_v_iters=80, lam=0.95, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, use_rnn=False, reward_factor=1, spectrum_repr=False): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) comm = MPI.COMM_WORLD rank = comm.Get_rank() if rank == 0: print(ac) # udpate env config # env.scalar_thick = ac_kwargs['scalar_thick'] env.update_with_ac(**ac_kwargs) # For Tuple spaces obs_dim = ac.obs_dim if isinstance(env.action_space, spaces.Tuple): act_dim = core.tuple_space_dim(env.action_space, action=True) else: act_dim = env.action_space.shape # Create actor-critic module # print(ac) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, cell_size=ac_kwargs['cell_size']) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old, hid = data['obs'], data['act'], data[ 'adv'], data['logp'], data['hid'] # for i in range(len(obs)-1): # if torch.eq(obs[i], torch.zeros(12)).sum()==12 and torch.eq(obs[i+1], torch.zeros(12)).sum()==12: # print(obs[i], obs[i+1], act[i], act[i+1]) # Policy loss pis = [] logp = 0 if len(ac.pi) > 1: # tuple actions for i, actor_i in enumerate(ac.pi): pi, logp_i = actor_i(obs, act[:, i][:, None]) logp += logp_i pis.append(pi) else: pi, logp_i = ac.pi[0](obs, act) logp += logp_i pis.append(pi) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info # sample estimation policy KL approx_kl = (logp_old - logp).mean().item() ent = sum([pi.entropy().mean().item() for pi in pis]) clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return 0.5 * ((ac.v(obs) - ret)**2).mean() def compute_loss_pi_v_rnn(data): obs, act, adv, logp_old, ret = data['obs'], data['act'], data[ 'adv'], data['logp'], data['ret'] hid = torch.zeros(ac_kwargs['cell_size']) v = [] logp = [] ent = [] num_traj = 0 #todo: test for i in range(len(obs)): v_i, logp_i, hid, ent_i = ac.evaluate(obs[i], act[i], hid) if i < len(obs) - 1 and obs[i + 1].sum() == 0: num_traj += 1 # print('Reinitialize #{}'.format(num_traj), flush=True) hid = torch.zeros(ac_kwargs['cell_size']) v.append(v_i) logp.append(logp_i) ent.append(ent_i) 
        logp = torch.cat(logp)
        v = torch.cat(v)

        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = torch.stack(ent).mean()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        loss_v = 0.5 * ((v - ret)**2).mean()
        # entropy bonus: discourage premature collapse of the policy
        loss_pi = loss_pi - beta * ent
        logger.store(RetBuf=ret.clone().detach().numpy())

        return loss_pi, pi_info, loss_v

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)
    if use_rnn:
        # a single optimizer updates the shared recurrent actor-critic
        optimizer = Adam(ac.parameters(), lr=pi_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        if not use_rnn:
            pi_l_old, pi_info_old = compute_loss_pi(data)
            pi_l_old = pi_l_old.item()
            v_l_old = compute_loss_v(data).item()

            # Train policy with multiple steps of gradient descent
            for i in range(train_pi_iters):
                pi_optimizer.zero_grad()
                loss_pi, pi_info = compute_loss_pi(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' % i)
                    break
                loss_pi.backward()
                mpi_avg_grads(ac.pi)  # average grads across MPI processes
                pi_optimizer.step()
            logger.store(StopIter=i)

            # Value function learning
            for i in range(train_v_iters):
                vf_optimizer.zero_grad()
                if not use_rnn:
                    loss_v = compute_loss_v(data)
                    loss_v.backward()
                    mpi_avg_grads(ac.v)  # average grads across MPI processes
                    vf_optimizer.step()
        else:
            pi_l_old, pi_info_old, v_l_old = compute_loss_pi_v_rnn(data)
            pi_l_old = pi_l_old.item()
            v_l_old = v_l_old.item()

            # Train policy and value function jointly
            for i in range(train_pi_iters):
                optimizer.zero_grad()
                loss_pi, pi_info, loss_v = compute_loss_pi_v_rnn(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' % i)
                    break
                loss = loss_pi + loss_v
                loss.backward()
                mpi_avg_grads(ac)  # average grads across MPI processes
                optimizer.step()
            logger.store(StopIter=i)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    obs, ep_ret, ep_len = env.reset(), 0, 0
    # if ac_kwargs['scalar_thick']:
    #     thick = obs[env.num_materials:env.num_materials+env.num_thicknesses].argmax() / env.num_thicknesses
    #     obs = np.concatenate((obs[:env.num_materials+1], np.array([thick])))
    hid = np.zeros(
        ac_kwargs['cell_size']) if ac_kwargs['cell_size'] else np.zeros(1)
    design_tracker = DesignTracker(epochs, **logger_kwargs)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        epoch_start_time = time.time()
        for t in range(local_steps_per_epoch):
            # TODO: only evaluate
            act, v, logp, hid = ac.step(
                torch.as_tensor(obs, dtype=torch.float32),
                torch.as_tensor(hid, dtype=torch.float32))

            next_obs, r, d, _ = env.step(act)
            r = r * reward_factor  # scale the rewards, e.g. to match the reward scale of Atari
            ep_ret += r
            if not d:
                ep_len += 1

            # save and log
            if use_rnn:
                buf.store(obs, act, r, v, logp, hid)
            else:
                buf.store(obs, act, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            obs = next_obs

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                # if trajectory didn't reach terminal state, bootstrap value target
                if not terminal:
                    _, v, _, _ = ac.step(
                        torch.as_tensor(obs, dtype=torch.float32),
                        torch.as_tensor(hid, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    if hasattr(env, 'layers') and hasattr(env, 'thicknesses'):
                        design_tracker.store(env.layers, env.thicknesses,
                                             ep_ret, epoch)
                        if rank == 0:
                            print(env.layers, env.thicknesses)
                obs, ep_ret, ep_len = env.reset(), 0, 0
                # reinitialize hidden state
                hid = np.zeros(ac_kwargs['cell_size'])
            if hasattr(env, "layers"):
                logger.store(Act=act[1])

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)
            design_tracker.save_state()

        # Perform PPO update!
update() elapsed = time.time() - start_time epoch_time = time.time() - epoch_start_time # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) if hasattr(env, 'layers'): logger.log_tabular('Act', with_min_and_max=True) logger.log_tabular('RetBuf', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', elapsed) logger.log_tabular('FPS', int(steps_per_epoch / epoch_time)) logger.dump_tabular()
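# The buf.finish_path(v) calls above assume a PPOBuffer that turns stored rewards
# and value estimates into GAE-lambda advantages and rewards-to-go whenever a
# trajectory ends (bootstrapping with v if the trajectory was cut off). The helper
# below is only an illustrative, self-contained sketch of that computation; the
# name _gae_sketch and its arguments are hypothetical and it is not used by the
# buffer implementation elsewhere in this codebase.
def _gae_sketch(rews, vals, gamma=0.99, lam=0.97):
    """Toy GAE-lambda advantages / returns for one finished trajectory.

    rews: rewards r_0..r_{T-1}.
    vals: value estimates V(s_0)..V(s_{T-1}) plus a bootstrap value V(s_T)
          appended at the end (0 if the trajectory ended in a terminal state).
    """
    import numpy as np
    rews = np.asarray(rews, dtype=np.float64)
    vals = np.asarray(vals, dtype=np.float64)
    T = len(rews)
    adv = np.zeros(T)
    last = 0.0
    for t in reversed(range(T)):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rews[t] + gamma * vals[t + 1] - vals[t]
        # GAE recursion: A_t = delta_t + gamma * lam * A_{t+1}
        last = delta + gamma * lam * last
        adv[t] = last
    ret = adv + vals[:-1]  # lambda-returns, used as value-function targets
    return adv, ret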
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape # obs_dim = env.observation_space.n act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) bayes_kl_loss = 0. if isinstance(ac.v, BayesMLPCritic): bayes_kl_loss = ac.v.compute_kl() total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0] total_loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old), BayesKL=bayes_kl_loss) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 epoch_reward = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) 
o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t==local_steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished epoch_reward.append(ep_ret) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform VPG update! update() if epoch % 10 == 0: # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('BayesKL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() return epoch_reward
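# A minimal usage sketch for the vpg() routine above. The CartPole environment,
# hidden layer sizes, and output directory are illustrative assumptions; only the
# vpg() signature itself comes from the code above (it returns the list of episode
# returns it collected, one entry per finished episode).
def _vpg_cartpole_demo():
    import gym
    rewards = vpg(lambda: gym.make('CartPole-v1'),
                  actor_critic=core.MLPActorCritic,
                  ac_kwargs=dict(hidden_sizes=(64, 64)),
                  steps_per_epoch=4000,
                  epochs=50,
                  logger_kwargs=dict(output_dir='/tmp/vpg_cartpole'))
    return rewards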
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):

    # Special function to avoid certain slowdowns from PyTorch + MPI combination
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync parameters across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, log_p = ac.pi(obs, act)
        ratio = torch.exp(log_p - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - log_p).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clip_fraction = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clip_fraction)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value functions
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()
        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with the environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
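# The update() loop above stops early once the MPI-averaged approximate KL
# exceeds 1.5 * target_kl. The function below is a toy, self-contained
# illustration of the clipped surrogate and that early-stopping check on made-up
# numbers; the name _clip_and_kl_sketch and the example tensors are hypothetical
# and nothing here touches the real buffer or actor-critic.
def _clip_and_kl_sketch(clip_ratio=0.2, target_kl=0.01):
    import torch
    logp_old = torch.tensor([-1.10, -0.90, -1.40])  # log-probs under the data-collecting policy
    logp_new = torch.tensor([-1.00, -1.05, -1.20])  # log-probs under the current policy
    adv = torch.tensor([0.5, -0.3, 1.2])            # advantage estimates

    ratio = torch.exp(logp_new - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    loss_pi = -torch.min(ratio * adv, clip_adv).mean()

    # sample-based KL estimate, same criterion as in update()
    approx_kl = (logp_old - logp_new).mean().item()
    stop_early = approx_kl > 1.5 * target_kl
    return loss_pi.item(), approx_kl, stop_early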
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
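# A minimal usage sketch for the ppo() routine above. The gym environment,
# network sizes, and output directory are illustrative assumptions; for
# multi-process training the script would normally be launched under MPI so that
# proc_id()/num_procs() split steps_per_epoch across workers.
def _ppo_cartpole_demo():
    import gym
    ppo(lambda: gym.make('CartPole-v1'),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        steps_per_epoch=4000,
        epochs=50,
        clip_ratio=0.2,
        target_kl=0.01,
        logger_kwargs=dict(output_dir='/tmp/ppo_cartpole'))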
def rpg(env_fn=(lambda: gym.make("CartPole-v1")), max_traj_length=500, batch_size=100, num_epochs=100, hidden_sizes=(32, 32), activation=nn.Tanh, pi_lr=0.0003, logger_kwargs=dict(), writer_kwargs=dict()): """ Assumes env is CartPole-v1; if want to cleanup later, just make the dimensions general to any env """ env = env_fn() # Assume obs space is a Box obs_dim = env.observation_space.shape[0] # Assume action space is Discrete (categorical), which is why we evaluate .n (rather than .shape[0]) act_dim = env.action_space.n pi = MLPCategoricalActor(obs_dim, act_dim, hidden_sizes, activation) pi_optimizer = Adam(pi.parameters(), lr=pi_lr) logger = EpochLogger(**logger_kwargs) logger.setup_pytorch_saver(pi) writer = SummaryWriter(**writer_kwargs) for ep in range(num_epochs): print(f"Epoch num: {ep}") # batch arrays; contains relevant data for batch of trajectories batch_rewards = torch.zeros(batch_size) batch_log_prob = torch.zeros(batch_size) for i in range(batch_size): o = env.reset() d = False # bool for "done" # buffers for o, a, r; contains all values for one trajectory # assumes cartpole dimensions buffer_o = np.zeros((max_traj_length, obs_dim)) buffer_a = np.zeros(max_traj_length) buffer_r = np.zeros(max_traj_length) ptr = 0 # pointer to the position in the buffer # take data for one entire trajectory while not d: o = torch.as_tensor(o, dtype=torch.float32) a = pi._distribution( o).sample() # sample Categorical policy to get an action o2, r, d, _ = env.step(a.numpy()) o2 = torch.as_tensor(o2, dtype=torch.float32) buffer_o[ptr] = o.numpy() buffer_a[ptr] = a buffer_r[ptr] = r o = o2 ptr += 1 if ptr >= max_traj_length: break # save traj data into batch arrays batch_rewards[i] = buffer_r[:ptr].sum() log_probs = pi._log_prob_from_distribution( pi._distribution( torch.as_tensor(buffer_o[:ptr], dtype=torch.float32)), torch.as_tensor(buffer_a[:ptr], dtype=torch.float32)) batch_log_prob[i] = log_probs.sum() # run one step of gradient descent optimizer pi_optimizer.zero_grad() loss = -1 * (batch_log_prob * batch_rewards).mean() loss.backward() pi_optimizer.step() # logging writer.add_scalar("pi loss", float(loss), ep) writer.add_scalar("avg return", float(batch_rewards.mean()), ep) if ep % 10 == 0: logger.save_state({'env': env}, None) # also saves pi print("Done training the agent.") logger.save_state({'env': env}, None) # also saves pi writer.close()
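# Minimal usage sketch for rpg(): the defaults already build CartPole-v1, so only
# the logging destinations need to be supplied. The log directories below are
# illustrative assumptions, not paths used elsewhere in this code.
def _rpg_demo():
    rpg(num_epochs=100,
        batch_size=100,
        logger_kwargs=dict(output_dir='/tmp/rpg_cartpole'),
        writer_kwargs=dict(log_dir='/tmp/rpg_cartpole/tb'))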