def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions 
from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
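# --- Illustrative sketch (not part of the original code) -------------------
# The variants above and below all rely on a ReplayBuffer exposing store() and
# sample_batch(), with batches keyed by 'obs1', 'obs2', 'acts', 'rews', 'done'.
# A minimal NumPy sketch of that interface, assuming flat observation and
# action vectors (not the exact class used by this code):

import numpy as np

class SimpleReplayBuffer:
    """FIFO experience replay buffer matching the interface used above."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest entry once the buffer is full (FIFO).
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        # Uniformly sample transitions that have actually been written.
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])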
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, explorer=None, eps=.03, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, 
-1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 pretrain_steps = steps_per_epoch * pretrain_epochs total_epochs = epochs + pretrain_epochs total_steps = steps_per_epoch * total_epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): if t > start_steps: a = get_action(o) elif pretrain_steps == 0: # only explore if not pretraining with MaxEnt a = env.action_space.sample() # use MaxEnt exploration if you are in a pretrain epoch or if eps-greedy pre = t < pretrain_steps during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
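# --- Illustrative sketch (not part of the original code) -------------------
# The variant above assumes an `explorer` object exposing sample_action(state),
# where `state` comes from env.env.state_vector() (a MuJoCo-style accessor).
# The class and helper below are hypothetical stand-ins that mirror that
# interface and the action-selection branching; unlike the loop above, the
# helper always returns an action, even during pretrain epochs before
# start_steps have elapsed.

import random
import numpy as np

class UniformStateExplorer:
    """Hypothetical explorer: ignores the state and samples uniformly."""

    def __init__(self, act_low, act_high):
        self.act_low = np.asarray(act_low, dtype=np.float32)
        self.act_high = np.asarray(act_high, dtype=np.float32)

    def sample_action(self, state):
        return np.random.uniform(self.act_low, self.act_high)

def select_exploration_action(t, o, get_action, env, explorer,
                              start_steps, pretrain_steps, eps):
    """Pick an action the way the training loop above does:
    MaxEnt explorer during pretrain epochs or with probability eps,
    the learned policy after start_steps, uniform-random otherwise."""
    pre = t < pretrain_steps
    during = random.random() < eps
    if pre or during:
        if explorer is None:
            raise ValueError('Trying to explore but explorer is None')
        return explorer.sample_action(env.env.state_vector())
    if t > start_steps:
        return get_action(o)
    return env.action_space.sample()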
def sac(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A reference to ActorCritic class which after instantiation takes state, ``x``, and action, ``a``, and returns: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x`` and actions in | ``a``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x`` and actions in | ``a``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) # https://pytorch.org/docs/master/notes/randomness.html#cudnn torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) actor_critic_main = actor_critic(obs_dim, **ac_kwargs).to(device) actor_critic_target = actor_critic(obs_dim, **ac_kwargs).to(device) # Count variables var_counts = tuple( core.count_vars(model) for model in [ actor_critic_main.policy, actor_critic_main.q1, actor_critic_main.q2, actor_critic_main.v, actor_critic_main ]) logger.log(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Policy train op pi_optimizer = optim.Adam(actor_critic_main.policy.parameters(), lr=lr) # Value train op value_params = [ *actor_critic_main.v.parameters(), *actor_critic_main.q1.parameters(), *actor_critic_main.q2.parameters() ] value_optimizer = optim.Adam(value_params, lr=lr) # Initializing targets to match main variables actor_critic_target.load_state_dict(actor_critic_main.state_dict()) def get_action(o, deterministic=False): mu, pi, _ = actor_critic_main(Tensor(o.reshape(1, -1)).to(device)) a = mu if deterministic else pi return a.cpu().detach().numpy() def test_agent(n=10): global mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) x, x2, a, r, d = [ Tensor(batch[k]).to(device) for k in ['obs1', 'obs2', 'acts', 'rews', 'done'] ] mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic_main( x, a) v_targ = actor_critic_target.v(x2) # Min Double-Q: min_q_pi = torch.min(q1_pi, q2_pi) # Targets for Q and V regression q_backup = (r + gamma * (1 - d) * v_targ).detach() v_backup = (min_q_pi - alpha * logp_pi).detach() # Soft actor-critic losses pi_loss = (alpha * logp_pi - q1_pi).mean() q1_loss = 0.5 * ((q_backup - q1)**2).mean() q2_loss = 0.5 * ((q_backup - q2)**2).mean() v_loss = 0.5 * ((v_backup - v)**2).mean() value_loss = q1_loss + q2_loss + v_loss value_optimizer.zero_grad() value_loss.backward() value_optimizer.step() pi_optimizer.zero_grad() pi_loss.backward() pi_optimizer.step() # Polyak averaging for target Value network (only ) variables # Credits: https://github.com/ghliu/pytorch-ddpg/blob/master/util.py params = zip(actor_critic_target.v.parameters(), actor_critic_main.v.parameters()) for ac_target, ac_main in params: ac_target.data.copy_(ac_main.data * (1.0 - polyak) + ac_target.data * polyak) logger.store( LossPi=pi_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossV=v_loss.item(), Q1Vals=q1.cpu().detach().numpy(), Q2Vals=q2.cpu().detach().numpy(), VVals=v.cpu().detach().numpy(), LogPi=logp_pi.cpu().detach().numpy(), ) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, actor_critic_main, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def sac(env_fn, expert=None, policy_path=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=500, epochs=100000, replay_size=int(5e3), gamma=0.99, dagger_noise=0.02, polyak=0.995, lr=1e-4, alpha=0.2, batch_size=64, dagger_epochs=200, pretrain_epochs=50, max_ep_len=500, logger_kwargs=dict(), save_freq=50, update_steps=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) test_logger_kwargs = dict() test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test") test_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] test_logger = EpochLogger(**test_logger_kwargs) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] print(obs_dim) print(act_dim) # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space act_high_limit = env.action_space.high act_low_limit = env.action_space.low sess = tf.Session() if policy_path is None: # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) tfa_ph = core.placeholder(act_dim) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # sess.run(tf.global_variables_initializer()) else: # load pretrained model model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save')) x_ph, a_ph, x2_ph, r_ph, d_ph = model['x_ph'], model['a_ph'], model['x2_ph'], model['r_ph'], model['d_ph'] mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = model['mu'], model['pi'], model['logp_pi'], model['q1'], model['q2'], model['q1_pi'], model['q2_pi'], model['v'] # tfa_ph = core.placeholder(act_dim) tfa_ph = model['tfa_ph'] # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # print(obs_dim) # print(act_dim) # SAC objectives if policy_path is None: # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses dagger_pi_loss = tf.reduce_mean(tf.square(mu-tfa_ph)) pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_dagger_pi_op = dagger_pi_optimizer.minimize(dagger_pi_loss, name='train_dagger_pi_op') pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'), name='train_pi_op') # sess.run(tf.variables_initializer(pi_optimizer.variables())) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params, name='train_value_op') # sess.run(tf.variables_initializer(value_optimizer.variables())) # Polyak averaging for target variables # 
(control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess.run(tf.global_variables_initializer()) else: graph = tf.get_default_graph() dagger_pi_loss = model['dagger_pi_loss'] pi_loss = model['pi_loss'] q1_loss = model['q1_loss'] q2_loss = model['q2_loss'] v_loss = model['v_loss'] train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op') train_value_op = graph.get_operation_by_name('train_value_op') train_pi_op = graph.get_operation_by_name('train_pi_op') # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # sess = tf.Session() # sess.run(tf.global_variables_initializer()) dagger_step_ops = [q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_value_op, target_update] tf.summary.FileWriter("log/", sess.graph) # Setup model saving logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph}, \ outputs={'mu': mu, 'pi': pi, 'v': v, 'logp_pi': logp_pi, 'q1': q1, 'q2': q2, 'q1_pi': q1_pi, 'q2_pi': q2_pi, \ 'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'q1_loss': q1_loss, 'q2_loss': q2_loss}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi a = sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] return np.clip(a, act_low_limit, act_high_limit) def choose_action(s, add_noise=False): s = s[np.newaxis, :] a = sess.run(mu, {x_ph: s})[0] if add_noise: noise = dagger_noise * act_high_limit * np.random.normal(size=a.shape) a = a + noise return np.clip(a, act_low_limit, act_high_limit) def test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, info = env.step(choose_action(np.array(o), 0)) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store(arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() # time.sleep(10) if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) def ref_test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j 
in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a = call_ref_controller(env, expert) o, r, d, info = env.step(a) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store(arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) # ref_test_agent(test_num = -1) # test_logger.log_tabular('epoch', -1) # test_logger.log_tabular('TestEpRet', average_only=True) # test_logger.log_tabular('TestEpLen', average_only=True) # test_logger.log_tabular('arrive_des', average_only=True) # test_logger.log_tabular('arrive_des_appro', average_only=True) # test_logger.log_tabular('converge_dis', average_only=True) # test_logger.log_tabular('out_of_range', average_only=True) # test_logger.dump_tabular() start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 episode_steps = 500 total_env_t = 0 test_num = 0 print(colorize("begin dagger training", 'green', bold=True)) for epoch in range(1, dagger_epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) obs, acs, rewards = [], [], [] for t in range(steps_per_epoch): obs.append(o) ref_action = call_ref_controller(env, expert) if(epoch < pretrain_epochs): action = ref_action else: action = choose_action(np.array(o), True) o2, r, d, _ = env.step(action) o = o2 acs.append(ref_action) rewards.append(r) if (t == steps_per_epoch-1): # print ("reached the end") d = True # Store experience to replay buffer replay_buffer.store(o, action, r, o2, d) ep_ret += r ep_len += 1 total_env_t += 1 if d: # Perform partical sac update! 
for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(dagger_step_ops, feed_dict) logger.store(LossQ1=outs[0], LossQ2=outs[1], LossV=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], VVals=outs[5], LogPi=outs[6]) # Perform dagger policy update dagger_replay_buffer.stores(obs, acs, rewards) for _ in range(int(ep_len/5)): batch = dagger_replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']} q_step_ops = [dagger_pi_loss, train_dagger_pi_op] for j in range(10): outs = sess.run(q_step_ops, feed_dict) logger.store(LossPi = outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 break # Main loop: collect experience in env and update/log each epoch print(colorize("begin sac training", 'green', bold=True)) for epoch in range(1, epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) # test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) # logger.log_tabular('DeltaLossPi', average_only=True) # logger.log_tabular('DeltaLossV', average_only=True) # logger.log_tabular('Entropy', average_only=True) # logger.log_tabular('KL', average_only=True) # logger.log_tabular('ClipFrac', average_only=True) # logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) for t in range(steps_per_epoch): a = get_action(np.array(o)) o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 if (t == steps_per_epoch-1): # print ("reached the end") d = True replay_buffer.store(o, a, r, o2, d) o = o2 if d: """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
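# --- Illustrative sketch (not part of the original code) -------------------
# The DAgger phase above fits the policy mean to the reference controller's
# actions with a squared-error loss (dagger_pi_loss). A PyTorch sketch of that
# behavior-cloning step, with a hypothetical two-layer policy, is shown below;
# the TF1 code above builds the equivalent graph with tf.square(mu - tfa_ph).

import torch
import torch.nn as nn
import torch.optim as optim

def dagger_update(policy, observations, expert_actions, optimizer):
    """One behavior-cloning step: MSE between policy mean and expert action."""
    mu = policy(observations)
    loss = ((mu - expert_actions) ** 2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Tiny usage example with random data and a hypothetical policy network.
obs_dim, act_dim = 8, 2
bc_policy = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, act_dim))
bc_opt = optim.Adam(bc_policy.parameters(), lr=1e-4)
print(dagger_update(bc_policy, torch.randn(64, obs_dim), torch.randn(64, act_dim), bc_opt))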
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") # device = 'cpu' print(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac).to(device) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer( obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] o, a, r, o2, d = o.to(device), a.to(device), r.to( device), o2.to(device), d.to(device) q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2, _ = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'].to(device) pi, logp_pi, _ = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # if last_pi is None and last_logp_pi is None: # o = data['obs'].to(device) # pi, _, logp_pi = ac.pi(o) # return pi, logp_pi # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. 
pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) # return this_pi, this_logp_pi def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32).to(device), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
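# --- Illustrative sketch (not part of the original code) -------------------
# compute_loss_q above regresses both Q-networks onto the entropy-regularized
# Bellman backup with clipped double-Q targets. The standalone function below
# reproduces just that target computation, followed by a small worked example;
# note that terminal transitions (d = 1) bootstrap nothing.

import torch

def sac_q_backup(r, d, q1_pi_targ, q2_pi_targ, logp_a2, gamma=0.99, alpha=0.2):
    """backup = r + gamma * (1 - d) * (min(Q1', Q2') - alpha * log pi(a'|s'))."""
    q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
    return r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

# First transition is non-terminal, second is terminal.
r = torch.tensor([1.0, 1.0])
d = torch.tensor([0.0, 1.0])
q1_targ = torch.tensor([10.0, 10.0])
q2_targ = torch.tensor([12.0, 12.0])
logp_a2 = torch.tensor([-1.0, -1.0])
print(sac_q_backup(r, d, q1_targ, q2_targ, logp_a2))  # ~[11.098, 1.000]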
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=24, num_test_episodes=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ #custom values for MEME environment # steps_per_epoch=1200 #for SESE networks # start_steps=3600 #=====custom code. 
Say n iterations till which random policy is simulated nItr = 2; start_steps = steps_per_epoch * nItr; update_after = update_every*2; #================ # steps_per_epoch=240 #for DESE and LBJ networks # start_steps=7200 logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2 = actor_critic(x_ph, a_ph, **ac_kwargs) with tf.variable_scope('main', reuse=True): # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi = actor_critic(x_ph, pi, **ac_kwargs) # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): # target q values, using actions from *current* policy _, _, _, q1_targ, q2_targ = actor_critic(x2_ph, pi_next, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_targ = tf.minimum(q1_targ, q2_targ) # # Targets for Q and V regression # q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) # v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Entropy-regularized Bellman backup for Q functions, using Clipped Double-Q targets q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*(min_q_targ - alpha * logp_pi_next)) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = 
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent():
        for j in range(num_test_episodes):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                temp_a = get_action(o, True)
                # if j == 1:
                #     print("Observation=", o)
                #     print("--before mapping action=", temp_a)
                # Scale the action from the tanh policy's (-1, 1) range to the
                # environment's [tollMin, tollMax] range before stepping
                # (same mapping as in the main loop below).
                numpyFromA = np.array(temp_a)
                numpyFromA = ((numpyFromA + 1.0) * (test_env.state.tollMax - test_env.state.tollMin) / 2.0) + test_env.state.tollMin
                temp_a = np.ndarray.tolist(numpyFromA)
                o, r, d, _ = test_env.step(temp_a)
                # o, r, d, _ = test_env.step(get_action(o, True))   # original, unscaled version
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Bookkeeping for the best (maximum-revenue) episode seen during training
    maxRev = float("-inf")   # negative infinity in the beginning
    maxRevActionSeq = []
    maxRevTSTT = 0
    maxRevRevenue = 0
    maxRevThroughput = 0
    maxRevJAH = 0
    maxRevRemVeh = 0
    maxRevJAH2 = 0
    maxRevRMSE_MLvio = 0
    maxRevPerTimeVio = 0
    maxRevHOTDensity = pd.DataFrame()
    maxRevGPDensity = pd.DataFrame()
    maxtdJAHMax = 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # ====== custom code ======
        # Scale the noise hyperbolically so exploration is larger in earlier epochs
        # and smaller later. (Note: SAC samples from a stochastic policy, so
        # act_noise is not consumed anywhere below; it is kept for reference.)
        epochId = 1 + int(t / steps_per_epoch)
        par = 10   # scaling parameter for the hyperbolic schedule
        act_noise = par / (par + epochId - 1)

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
            # a = [a]   # custom value for MEME environments

        # Before stepping the environment, scale the tolls (for ML pricing problem):
        # sampled actions come from a tanh policy in (-1, 1), so map them to the
        # environment's [tollMin, tollMax] toll range.
        numpyFromA = np.array(a)
        numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin
        a = np.ndarray.tolist(numpyFromA)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End-of-trajectory handling: log episode statistics and reset the env.
        # (Unlike the original end-of-trajectory scheme, gradient updates happen
        # every update_every steps in the block further below.)
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            # get other stats and store them too
            otherStats = env.getAllOtherStats()
            # if np.any(np.isnan(np.array(otherStats))):
            #     sys.exit("Nan found in statistics! Error")
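            # Assumed layout of env.getAllOtherStats(), inferred from the indexing
            # below and the logger keys (not documented in the original source):
            #   [0] TSTT (hrs), [1] revenue ($), [2] throughput (veh), [3] JAH stat,
            #   [4] remaining vehicles, [5] JAH stat 2, [6] ML-violation RMSE,
            #   [7] % of time violated, [8] max time-dependent JAH.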
Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2= otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret> maxRev: maxRev=ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT=otherStats[0]; maxRevRevenue=otherStats[1]; maxRevThroughput=otherStats[2] maxRevJAH=otherStats[3] maxRevRemVeh=otherStats[4] maxRevJAH2= otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio= otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5]) # OLD--End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # # End of epoch wrap-up # if (t+1) % steps_per_epoch == 0: # epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) #extra info for MEME environment logger.log_tabular('EpTSTT', average_only=True) logger.log_tabular('EpRevenue', average_only=True) logger.log_tabular('EpThroughput', average_only=True) logger.log_tabular('EpJAH', average_only=True) logger.log_tabular('EpRemVeh', average_only=True) logger.log_tabular('EpJAH2', average_only=True) logger.log_tabular('EpMLViolRMSE', average_only=True) logger.log_tabular('EpPerTimeVio', average_only=True) logger.log_tabular('EptdJAHMax', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() print("Max cumulative reward obtained= %f "% maxRev) print("Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f" % (maxRevRevenue,maxRevTSTT,maxRevThroughput,maxRevJAH,maxRevRemVeh,maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio)) outputVector = [maxRev, maxRevRevenue,maxRevTSTT,maxRevThroughput,maxRevJAH,maxRevRemVeh,maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio] #print("\n===Max rev action sequence is\n",maxRevActionSeq) exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector) exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)