def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Main outputs from computation graph with tf.variable_scope('main'): ####################### # # # YOUR CODE HERE # # # ####################### # pi, q1, q2, q1_pi = pass # Target policy network with tf.variable_scope('target'): ####################### # # # YOUR CODE HERE # # # ####################### # pi_targ = pass # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions ####################### # # # YOUR CODE HERE # # # ####################### # Target Q-values, using action from smoothed target policy ####################### # # # YOUR CODE HERE # # # ####################### pass # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets ####################### # # # YOUR CODE HERE # # # ####################### # TD3 losses ####################### # # # YOUR CODE HERE # # # ####################### # pi_loss = # q1_loss = # q2_loss = # q_loss = #=========================================================================# # # # All of your code goes in the space above. # # # #=========================================================================# # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=200000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.005, batch_size=1000, start_steps=10000, max_ep_len=2000, logger_kwargs=dict(), save_freq=1, env_init=dict(), env_name='unknown', nb_test_episodes=50, train_freq=10, Teacher=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) hyperparams = locals() if Teacher: del hyperparams['Teacher'] # remove teacher to avoid serialization error logger.save_config(hyperparams) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() # initialize environment (choose between short, default or quadrupedal walker) if len(env_init.items()) > 0: env.env.my_init(env_init) test_env.env.my_init(env_init) if Teacher: Teacher.set_env_params(env) env.reset() obs_dim = env.env.observation_space.shape[0] print(obs_dim) act_dim = env.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): if Teacher: Teacher.set_test_env_params(test_env) o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) if Teacher: Teacher.record_test_episode(ep_ret, ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(np.ceil(ep_len/train_freq).astype('int')): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) # logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], # LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], # VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) if Teacher: Teacher.record_train_episode(ep_ret, ep_len) Teacher.set_env_params(env) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None)#itr=epoch) # Test the performance of the deterministic version of the agent. test_agent(n=nb_test_episodes) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t+1) #logger.log_tabular('Q1Vals', with_min_and_max=True) #logger.log_tabular('Q2Vals', with_min_and_max=True) #logger.log_tabular('VVals', with_min_and_max=True) #logger.log_tabular('LogPi', with_min_and_max=True) #logger.log_tabular('LossPi', average_only=True) #logger.log_tabular('LossQ1', average_only=True) #logger.log_tabular('LossQ2', average_only=True) #logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() # Pickle parameterized env data #print(logger.output_dir+'/env_params_save.pkl') if Teacher: Teacher.dump(logger.output_dir+'/env_params_save.pkl')
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO losses ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. p = r.copy() r_dot_old = np.dot(r, r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval inputs = {k: v for k, v in zip(all_phs, buf.get())} Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def train_mnist(steps_per_epoch=100, epochs=5, lr=1e-3, layers=2, hidden_size=64, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Load and preprocess MNIST data (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() x_train = x_train.reshape(-1, 28 * 28) / 255.0 # Define inputs & main outputs from computation graph x_ph = tf.placeholder(tf.float32, shape=(None, 28 * 28)) y_ph = tf.placeholder(tf.int32, shape=(None, )) logits = mlp(x_ph, hidden_sizes=[hidden_size] * layers + [10], activation=tf.nn.relu) predict = tf.argmax(logits, axis=1, output_type=tf.int32) # Define loss function, accuracy, and training op y = tf.one_hot(y_ph, 10) loss = tf.losses.softmax_cross_entropy(y, logits) acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) train_op = tf.train.AdamOptimizer().minimize(loss) # Prepare session sess = tf.Session() sess.run(tf.global_variables_initializer()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={ 'logits': logits, 'predict': predict }) start_time = time.time() # Run main training loop for epoch in range(epochs): for t in range(steps_per_epoch): idxs = np.random.randint(0, len(x_train), 32) feed_dict = {x_ph: x_train[idxs], y_ph: y_train[idxs]} outs = sess.run([loss, acc, train_op], feed_dict=feed_dict) logger.store(Loss=outs[0], Acc=outs[1]) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(state_dict=dict(), itr=None) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('Acc', with_min_and_max=True) logger.log_tabular('Loss', average_only=True) logger.log_tabular('TotalGradientSteps', (epoch + 1) * steps_per_epoch) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()