def train_mnist(steps_per_epoch=100, epochs=5, lr=1e-3, layers=2,
                hidden_size=64, logger_kwargs=dict(), save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Load and preprocess MNIST data
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train.reshape(-1, 28 * 28) / 255.0

    # Define inputs & main outputs from computation graph
    x_ph = tf.placeholder(tf.float32, shape=(None, 28 * 28))
    y_ph = tf.placeholder(tf.int32, shape=(None,))
    logits = mlp(x_ph, hidden_sizes=[hidden_size] * layers + [10],
                 activation=tf.nn.relu)
    predict = tf.argmax(logits, axis=1, output_type=tf.int32)

    # Define loss function, accuracy, and training op
    y = tf.one_hot(y_ph, 10)
    loss = tf.losses.softmax_cross_entropy(y, logits)
    acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32))
    # Use the lr argument (it was previously ignored by the optimizer)
    train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    # Prepare session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph},
                          outputs={'logits': logits, 'predict': predict})

    start_time = time.time()

    # Run main training loop
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            idxs = np.random.randint(0, len(x_train), 32)
            feed_dict = {x_ph: x_train[idxs], y_ph: y_train[idxs]}
            outs = sess.run([loss, acc, train_op], feed_dict=feed_dict)
            logger.store(Loss=outs[0], Acc=outs[1])

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state(state_dict=dict(), itr=None)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('Acc', with_min_and_max=True)
        logger.log_tabular('Loss', average_only=True)
        logger.log_tabular('TotalGradientSteps', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
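

# --- Hedged usage sketch (not part of the original script) -------------------
# A minimal driver showing how train_mnist() above might be invoked, assuming
# the Spinning Up EpochLogger accepts an output_dir kwarg. The directory and
# hyperparameter values below are illustrative, not tuned.
def _demo_train_mnist():
    train_mnist(steps_per_epoch=200, epochs=3, lr=1e-3,
                layers=2, hidden_size=64,
                logger_kwargs=dict(output_dir='/tmp/train_mnist_demo'))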
def tac(env_fn, actor_critic=core.mlp_q_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, alpha_schedule='constant', q=1.0, q_schedule='constant', pdf_type='gaussian', log_type='q-log', batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): alpha = Alpha(alpha_start=alpha, schedule=alpha_schedule) entropic_index = EntropicIndex(q_end=q, schedule=q_schedule) alpha_t = alpha() q_t = entropic_index() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['pdf_type'] = pdf_type ac_kwargs['log_type'] = log_type # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Coefficient for Tsallis entropy alpha_ph = core.scale_holder() # Place holder for entropic index q_ph = core.entropic_index_holder() # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, q_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, q_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, 
v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph, 'q': q_ph, 'alpha': alpha_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1), q_ph: q_t}) def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = get_action(o, True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t, q_ph: q_t } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) if np.isnan(outs[0]) or np.isnan(outs[7]).any(): print(outs) return logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EntIndex', q_t) logger.log_tabular('EntCoeff', alpha_t) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Update alpha and q value alpha_t = alpha() q_t = entropic_index()
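

# --- Hedged usage sketch (not part of the original script) -------------------
# How tac() above might be launched. The environment and the values chosen for
# alpha and the entropic index q are illustrative assumptions; only keyword
# arguments that appear in tac()'s signature are used.
def _demo_tac():
    import gym
    tac(lambda: gym.make('Pendulum-v0'),
        epochs=10, alpha=0.2, alpha_schedule='constant',
        q=2.0, q_schedule='constant',
        logger_kwargs=dict(output_dir='/tmp/tac_demo'))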
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space below.                     #
    #                                                                         #
    #=========================================================================#

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing, by adding clipped noise to target actions
        # (noise is applied to the target policy's action, pi_targ)
        pi_noise_targ = get_pi_noise_clipped(pi_targ,
                                             noise_scale=target_noise,
                                             noise_clip=noise_clip,
                                             act_limit=act_limit)

        # Target Q-values, using action from smoothed target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, pi_noise_targ, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
          % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    # (use the gamma argument rather than a hardcoded 0.99)
    q_targ = get_q_target(q1_targ, q2_targ, r_ph, d=d_ph, gamma=gamma)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.losses.mean_squared_error(q_targ, q1)
    q2_loss = tf.losses.mean_squared_error(q_targ, q2)
    q_loss = q1_loss + q2_loss

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space above.
# # # #=========================================================================# # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
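

# --- Conceptual sketch (hypothetical helper, not used above) -----------------
# The clipped double-Q Bellman backup that `get_q_target` is expected to
# compute, written out in plain NumPy for reference:
#   y = r + gamma * (1 - d) * min(Q1_targ, Q2_targ)
def td3_backup_example(r, d, q1_targ, q2_targ, gamma=0.99):
    import numpy as np
    return r + gamma * (1.0 - d) * np.minimum(q1_targ, q2_targ)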
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 maxRev = float("-inf") #negative infinity in the beginning #maxRevActionSeq=[] maxRevTSTT = 0 maxRevRevenue = 0 maxRevThroughput = 0 maxRevJAH = 0 maxRevRemVeh = 0 maxRevJAH2 = 0 maxRevRMSE_MLvio = 0 maxRevPerTimeVio = 0 maxRevHOTDensity = pd.DataFrame() maxRevGPDensity = pd.DataFrame() maxtdJAHMax = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu numpyFromA = np.array(a[0]) numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin a[0] = np.ndarray.tolist(numpyFromA) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) #get other stats and store them too otherStats = env.getAllOtherStats() if np.any(np.isnan(np.array(otherStats))): sys.exit("Nan found in statistics! Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2=otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret > maxRev: maxRev = ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT = otherStats[0] maxRevRevenue = otherStats[1] maxRevThroughput = otherStats[2] maxRevJAH = otherStats[3] maxRevRemVeh = otherStats[4] maxRevJAH2 = otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio = otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpTSTT', average_only=True)
        logger.log_tabular('EpRevenue', average_only=True)
        logger.log_tabular('EpThroughput', average_only=True)
        logger.log_tabular('EpJAH', average_only=True)
        logger.log_tabular('EpRemVeh', average_only=True)
        logger.log_tabular('EpJAH2', average_only=True)
        logger.log_tabular('EpMLViolRMSE', average_only=True)
        logger.log_tabular('EpPerTimeVio', average_only=True)
        logger.log_tabular('EptdJAHMax', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

    print("Max cumulative reward obtained= %f " % maxRev)
    print("Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f" %
          (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh,
           maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax))
    outputVector = [maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput,
                    maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio,
                    maxRevPerTimeVio, maxtdJAHMax]
    # print("\n===Max rev action sequence is\n", maxRevActionSeq)
    exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector)
    exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
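

# --- Conceptual sketch (hypothetical helper, not used above) -----------------
# The PPO-Clip policy objective for one batch in plain NumPy, mirroring the
# `ratio` / `min_adv` / `pi_loss` construction in the TensorFlow graph above.
def ppo_clip_loss_example(logp, logp_old, adv, clip_ratio=0.2):
    import numpy as np
    ratio = np.exp(logp - logp_old)  # pi(a|s) / pi_old(a|s)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return -np.mean(np.minimum(ratio * adv, min_adv))  # loss to minimize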
def sac1(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=3000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-4, alpha=0.2, batch_size=150, start_steps=9000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['obs_dim'] = obs_dim h_size = ac_kwargs["h_size"] # hidden size of rnn seq_length = ac_kwargs["seq"] # seq length of rnn # Inputs to computation graph seq = None # training and testing doesn't has to have the same seq length x_ph, a_ph, r_ph, d_ph = core.placeholders([seq, obs_dim], [seq, act_dim], [seq, 1], [seq, 1]) s_t_0 = tf.placeholder(shape=[None, h_size], name="pre_state", dtype="float32") # zero state # Main outputs from computation graph outputs, states = cudnn_rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"]) # outputs, _ = rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"]) # outputs = mlp(outputs, [ac_kwargs["h_size"], ac_kwargs["h_size"]], activation=tf.nn.elu) # states = outputs[:, -1, :] # if use model predict next state (obs) with tf.variable_scope("model"): """hidden size for mlp h_size for RNN """ s_predict = mlp(tf.concat([outputs, a_ph], axis=-1), list(ac_kwargs["hidden_sizes"]) + [ac_kwargs["h_size"]], activation=tf.nn.relu) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic( x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, q1_pi_, q2_pi_ = actor_critic(x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, h_size=h_size, seq_length=seq_length, flag="seq", normalize=ac_kwargs["norm"]) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', "model"]) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t model: %d \n' % var_counts) if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.shape)) target_entropy = -np.prod(env.action_space.shape) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=ac_kwargs["h0"]) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(logp_pi[:, :-1, :] + target_entropy)) # Use smaller learning rate to make alpha decay slower alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) # model train op # we can't use s_T to predict s_T+1 delta_x = tf.stop_gradient( outputs[:, 1:, :] - outputs[:, :-1, :]) # predict delta obs instead of obs # model_loss = tf.abs((1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x[:, :, :obs_dim-act_dim])) model_loss = tf.abs( (1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x)) # how about "done" state model_optimizer = tf.train.AdamOptimizer(learning_rate=lr) if "m" in ac_kwargs["opt"]: value_params_1 = get_vars('model') + get_vars('rnn') else: value_params_1 = get_vars('model') # opt for optimize model train_model_op = model_optimizer.minimize(tf.reduce_mean(model_loss), var_list=value_params_1) # Targets for Q and V regression v_backup = tf.stop_gradient(tf.minimum(q1_pi_, q2_pi_) - alpha * logp_pi) # clip curiosity in_r = tf.stop_gradient( tf.reduce_mean(tf.clip_by_value(model_loss, 0, 64), axis=-1, keepdims=True)) beta = tf.placeholder(dtype=tf.float32, shape=(), name="beta") # beta = ac_kwargs["beta"] # adjust internal reward # can we prove the optimal value of beta # I think beta should decrease with training going on # beta = alpha # adjust internal reward q_backup = r_ph[:, :-1, :] + beta * in_r + gamma * ( 1 - d_ph[:, :-1, :]) * 
v_backup[:, 1:, :] # Soft actor-critic losses # pi_loss = tf.reduce_mean(alpha * logp_pi[:, :-1, :] - q1_pi[:, :-1, :]) pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) # in some case, the last timestep Q function is super important so maybe we can use weight sum of loss # calculate last timestep separately for convince # q1_loss = 0.5 * tf.reduce_mean((q1[:, :-1, :] - q_backup) ** 2) q1_loss = tf.reduce_mean((q1[:, :-1, :] - q_backup)**2) q2_loss = tf.reduce_mean((q2[:, :-1, :] - q_backup)**2) # q2_loss = 0.5 * tf.reduce_mean((q2[:, :-1, :] - q_backup) ** 2) Q_loss = q1[:, :-1, :] - q_backup P_loss = alpha * logp_pi - q1_pi value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) # train model first pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) with tf.control_dependencies([train_model_op]): train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) if "q" in ac_kwargs["opt"]: value_params = get_vars('main/q') + get_vars('rnn') else: value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in non_deterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), model_loss, train_model_op, train_pi_op, train_value_op, target_update ] else: step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, model_loss, train_model_op, train_pi_op, train_value_op, target_update, train_alpha_op, Q_loss, P_loss ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, s_t_0_, mu, pi, states, deterministic=False): """s_t_0_ starting step for testing 1 H""" act_op = mu if deterministic else pi action, s_t_1_ = sess.run( [act_op, states], feed_dict={ x_ph: o.reshape(1, 1, obs_dim), a_ph: np.zeros([1, 1, act_dim]), s_t_0: s_t_0_ }) return action.reshape(act_dim), s_t_1_ def test_agent(mu, pi, states, n=10): # global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 s_0 = np.zeros([1, h_size]) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a, s_1 = get_action(o, s_0, mu, pi, states, deterministic=True) s_0 = s_1 o, r, d, _ = test_env.step(a) # test_env.render() ep_ret += r ep_len += 1 # replay_buffer.store(o.reshape([1, obs_dim]), a.reshape([1, act_dim]), r, d) logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # start = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch 
s_t_0_ = np.zeros([1, h_size]) episode = 0 for t in range(total_steps + 1): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t == 0: start = time.time() if t > start_steps: # s_t_0_store = s_t_0_ # hidden state stored in buffer a, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False) s_t_0_ = s_t_1_ else: # s_t_0_store = s_t_0_ # print(s_t_0_.shape) _, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False) s_t_0_ = s_t_1_ a = env.action_space.sample() # Step the env o2, r, d, _ = env.step( a ) # give back o_t_1 we need store o_t_0 because that is what cause a_t_0 # print(r) # env.render() ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o.reshape([1, obs_dim]), s_t_0_.reshape([1, h_size]), a.reshape([1, act_dim]), r, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ # fps = (time.time() - start)/200 # print("{} fps".format(200 / (time.time() - start))) print(ep_len) episode += 1 start = time.time() beta_ = ac_kwargs["beta"] * (1 - t / total_steps) # beta_ = ac_kwargs["beta"] * (1 / t ** 0.5) for j in range(int(ep_len)): batch = replay_buffer.sample_batch(batch_size) # maybe we can store starting state feed_dict = { x_ph: batch['obs1'], s_t_0: batch[ 's_t_0'], # all zero matrix for zero state in training a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], beta: beta_, } for _ in range(ac_kwargs["tm"] - 1): batch = replay_buffer.sample_batch(batch_size) # maybe we can store starting state feed_dict = { x_ph: batch['obs1'], s_t_0: batch['s_t_0'], # stored zero state for training a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], beta: beta_, } _ = sess.run(train_model_op, feed_dict) outs = sess.run(step_ops, feed_dict) # print(outs) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3].flatten(), Q2Vals=outs[4].flatten(), LogPi=outs[5].flatten(), Alpha=outs[6], beta=beta_, model_loss=outs[7].flatten()) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 s_t_0_ = np.zeros([1, h_size ]) # reset s_t_0_ when one episode is finished print("one episode duration:", time.time() - start) start = time.time() # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if epoch % 50 == 0: np.save("model__mq_loss_{}".format(epoch), outs[7]) np.save("Q_mq_loss_{}".format(epoch), outs[-2]) np.save("P_mq_loss_{}".format(epoch), outs[-1]) # Save model # if (epoch % save_freq == 0) or (epoch == epochs - 1): # logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
            test_agent(mu, pi, states)

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Episode', episode)
            logger.log_tabular('name', name)  # `name` is assumed to be a module-level experiment name
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('beta', average_only=True)
            logger.log_tabular('model_loss', with_min_and_max=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
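

# --- Hedged usage sketch (not part of the original script) -------------------
# How sac1() above might be launched. The ac_kwargs keys (h_size, seq,
# hidden_sizes, norm, opt, h0, beta, tm) are the ones read inside sac1(); the
# concrete values and the environment are illustrative assumptions. Note that
# the epoch logging above also expects a module-level `name` to exist.
def _demo_sac1():
    import gym
    sac1(lambda: gym.make('Pendulum-v0'),
         ac_kwargs=dict(hidden_sizes=(128, 128), h_size=128, seq=8,
                        norm=False, opt='q', h0=0.0, beta=0.1, tm=1),
         epochs=10, logger_kwargs=dict(output_dir='/tmp/sac1_demo'))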
def ppo(BASE_DIR, expert_density, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), steps_per_epoch=1000, epochs=10, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=50, train_v_iters=50, lam=0.97, max_ep_len=1000, target_kl=0.01, data_n=10): data = {} # ALL THE DATA logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # update rule def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) policy_distr = Gaussian_Density() policy = lambda s: np.random.uniform( -2.0, 2.0, size=env.action_space.shape) # random policy policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length) density = policy_distr.density() data[0] = { 'pol_s': policy_distr.num_samples, 'pol_t': policy_distr.num_trajects } dist_rewards = [] # repeat REIL for given number of rounds for i in range(args.rounds): message = "\nRound {} out of {}\n".format(i + 1, args.rounds) reward = lambda s: expert_density(s) / (density(s) + args.eps) dist_rewards.append(reward) start_time = time.time() o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 r = reward(o) # custom reward # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, old_r, d, _ = env.step(a[0]) r = reward(o) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = old_r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) last_val = reward(o) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 r = reward(o) # store model! if (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print(message) policy = lambda state: sess.run( get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0] data[i] = { 'pol_s': policy_distr.num_samples, 'pol_t': policy_distr.num_trajects } data[i]['rewards'] = evaluate_reward(env, policy, data_n) if i != args.rounds - 1: policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length) density = policy_distr.density() return data, dist_rewards
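

# --- Conceptual sketch (hypothetical helper, not used above) -----------------
# The density-ratio reward constructed inside the loop above,
# r(s) = expert_density(s) / (policy_density(s) + eps), as a standalone factory.
def make_density_ratio_reward(expert_density, policy_density, eps=1e-3):
    return lambda s: expert_density(s) / (policy_density(s) + eps)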
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, explorer=None, eps=.03, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_epochs = epochs + pretrain_epochs # Main loop: collect experience in env and update/log each epoch for epoch in range(total_epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # explore if you are in a pretrain epoch or if eps-greedy pre = epoch < pretrain_epochs during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
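

# --- Conceptual sketch (hypothetical helper, not used above) -----------------
# The exploration rule used in the loop above: always defer to the explorer
# during pretrain epochs, otherwise do so with probability eps (epsilon-greedy).
def should_explore_example(epoch, pretrain_epochs, eps):
    import random
    return (epoch < pretrain_epochs) or (random.random() < eps)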
def sac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # env params
    thresh = rl_params['thresh']

    # control params
    seed = rl_params['seed']
    epochs = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size = rl_params['replay_size']
    batch_size = rl_params['batch_size']
    start_steps = rl_params['start_steps']
    max_ep_len = rl_params['max_ep_len']
    max_noop = rl_params['max_noop']
    save_freq = rl_params['save_freq']
    render = rl_params['render']

    # rl params
    gamma = rl_params['gamma']
    polyak = rl_params['polyak']
    lr = rl_params['lr']
    grad_clip_val = rl_params['grad_clip_val']
    alpha = rl_params['alpha']
    target_entropy_start = rl_params['target_entropy_start']
    target_entropy_stop = rl_params['target_entropy_stop']
    target_entropy_steps = rl_params['target_entropy_steps']

    train_env, test_env = env_fn(), env_fn()
    # read spaces from the training env (there is no bare `env` in this scope)
    obs_space = train_env.observation_space
    act_space = train_env.action_space

    tf.set_random_seed(seed)
    np.random.seed(seed)
    train_env.seed(seed)
    train_env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    # get the size after resize
    obs_dim = network_params['input_dims']
    act_dim = act_space.n

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # init a state buffer for storing last m states
    train_state_buffer = StateBuffer(m=obs_dim[2])
    test_state_buffer = StateBuffer(m=obs_dim[2])

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None)

    # alpha and entropy setup
    max_target_entropy = tf.log(tf.cast(act_dim, tf.float32))
    target_entropy_prop_ph = tf.placeholder(dtype=tf.float32, shape=())
    target_entropy = max_target_entropy * target_entropy_prop_ph

    log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)

    if alpha == 'auto':  # auto tune alpha
        alpha = tf.exp(log_alpha)
    else:  # fixed alpha
        alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, pi_logits, q1_logits, q2_logits, q1_a, q2_a, q1_pi, q2_pi = \
            build_models(x_ph, a_ph, act_dim, network_params)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_targ, _, _, _, _, _, q1_pi_targ, q2_pi_targ = \
            build_models(x2_ph, a_ph, act_dim, network_params)

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t alpha: %d, \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t total: %d\n') % var_counts)

    # Min Double-Q: (check the logp_pi bit)
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ)

    # Targets for Q regression
    q_backup = r_ph + gamma * (1 - d_ph) * tf.stop_gradient(min_q_pi_targ - alpha * logp_pi_targ)

    # critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
    value_loss = q1_loss + q2_loss

    # actor loss
    pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi)

    # alpha loss for temperature parameter
    alpha_backup = tf.stop_gradient(logp_pi + target_entropy)
    alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup)

    # Policy train op
    # (has to be separate from value train op, because q1_logits appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    if grad_clip_val is not None:
        gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi'))
        capped_gvs = 
[(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q')) # Alpha train op alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1_a, 'q2': q2_a}) def get_action(state, deterministic=False): state = state.astype('float32') / 255. 
act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: [state]})[0] def reset(env, state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # fire to start game and perform no-op for some frames to randomise start o, _, _, _ = env.step(1) # Fire action to start game for _ in range(np.random.randint(1, max_noop)): o, _, _, _ = env.step(0) # Action 'NOOP' o = process_image_observation(o, obs_dim, thresh) r = process_reward(r) old_lives = env.ale.lives() state = state_buffer.init_state(init_obs=o) return o, r, d, ep_ret, ep_len, old_lives, state def test_agent(n=10, render=True): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len, test_old_lives, test_state = reset(test_env, test_state_buffer) terminal_life_lost_test = False if render: test_env.render() while not(d or (ep_len == max_ep_len)): # start by firing if terminal_life_lost_test: a = 1 else: a = get_action(test_state, True) # Take deterministic actions at test time o, r, d, _ = test_env.step(a) o = process_image_observation(o, obs_dim, thresh) r = process_reward(r) test_state = test_state_buffer.append_state(o) ep_ret += r ep_len += 1 if test_env.ale.lives() < test_old_lives: test_old_lives = test_env.ale.lives() terminal_life_lost_test = True else: terminal_life_lost_test = False if render: test_env.render() if render: test_env.close() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # ================== Main training Loop ================== start_time = time.time() o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer) total_steps = steps_per_epoch * epochs target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) save_iter = 0 terminal_life_lost = False # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # press fire to start if terminal_life_lost: a = 1 else: if t > start_steps: a = get_action(state) else: a = train_env.action_space.sample() # Step the env o2, r, d, _ = train_env.step(a) o2 = process_image_observation(o2, obs_dim, thresh) r = process_reward(r) one_hot_a = process_action(a, act_dim) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 if train_env.ale.lives() < old_lives: old_lives = train_env.ale.lives() terminal_life_lost = True else: terminal_life_lost = False # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(state, one_hot_a, r, next_state, terminal_life_lost) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 state = next_state if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper.
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], target_entropy_prop_ph: target_entropy_prop } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # update target entropy every epoch target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Save model if save_freq is not None: if (epoch % save_freq == 0) or (epoch == epochs-1): print('Saving...') logger.save_state({'env': env}, itr=save_iter) save_iter+=1 # Test the performance of the deterministic version of the agent. test_agent(n=5, render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) def set_seeds(seed=seed): os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) tf.set_random_seed(seed) np.random.seed(seed) """ Enable 100% reproducibility on operations related to tensor and randomness. Parameters: seed (int): seed value for global randomness fast_n_close (bool): whether to achieve efficient at the cost of determinism/reproducibility """ #set_seeds(seed=seed) #logging.warning("*******************************************************************************") #logging.warning("*** set_global_determinism is called,setting full determinism, will be slow ***") #logging.warning("*******************************************************************************") os.environ['TF_DETERMINISTIC_OPS'] = '1' os.environ['TF_CUDNN_DETERMINISTIC'] = '1' # https://www.tensorflow.org/api_docs/python/tf/config/threading/set_inter_op_parallelism_threads tf.config.threading.set_inter_op_parallelism_threads(1) tf.config.threading.set_intra_op_parallelism_threads(1) #from tfdeterminism import patch #patch() np.random.seed(seed) random.seed(seed) set_seeds(seed=2) env = env_fn() env.seed(seed) env.action_space.seed(seed) #env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = q1_loss + q2_loss global_step1 = tf.Variable(0, trainable=False) global_step2 = tf.Variable(0, trainable=False) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in 
zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step( get_action(o, 0) ) # no extra processing of the step outputs here; keep the raw return so the Spinning Up logger buffers it and Optuna can read it ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: # update_after=1000, update_every=50 for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. #test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) #logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) #logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
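# --- Illustrative sketch (editor addition): the TD3 target computation above, restated in
# plain NumPy for one sampled batch. `q_targ_fn` stands in for evaluating the two target
# critics at the smoothed action a2; the helper name and signature are hypothetical, but the
# arithmetic mirrors the graph built above.
import numpy as np

def td3_backup(rews, done, pi_targ_actions, q_targ_fn,
               gamma=0.99, target_noise=0.2, noise_clip=0.5, act_limit=1.0):
    # Target policy smoothing: clipped Gaussian noise on the target policy's action.
    eps = np.clip(target_noise * np.random.randn(*pi_targ_actions.shape),
                  -noise_clip, noise_clip)
    a2 = np.clip(pi_targ_actions + eps, -act_limit, act_limit)
    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    q1_targ, q2_targ = q_targ_fn(a2)
    min_q_targ = np.minimum(q1_targ, q2_targ)
    return rews + gamma * (1.0 - done) * min_q_targ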
def ppo(env_fn, ref_func=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=500, epochs=10000, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=500, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) t_a_ph = core.placeholder_from_space(env.action_space) ret_ph = core.placeholder(None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, t_a_ph, ret_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) print("---------------", local_steps_per_epoch) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # dagger objectives pi_loss = tf.reduce_mean(tf.square(pi - t_a_ph)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old = sess.run([pi_loss, v_loss], feed_dict=inputs) # Training for i in range(train_pi_iters): sess.run(train_pi, feed_dict=inputs) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new = sess.run([pi_loss, v_loss], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(1, epochs + 1, 1): for t in range(local_steps_per_epoch): a_s, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) a = a_s[0] ref_a = call_mpc(env, ref_func) if (epoch < 100): a = ref_a # save and log buf.store(o, a, ref_a, r) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
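# --- Illustrative sketch (editor addition): despite the name, the "PPO" update above is a
# DAgger-style imitation objective (mean squared error between the policy's action and the
# reference controller's action), with the reference action executed outright for the first
# 100 epochs. NumPy stand-ins for those two pieces; the warm-up length and the reference
# controller (call_mpc) are taken from the code above, the helper names are hypothetical.
import numpy as np

def imitation_loss(pi_actions, ref_actions):
    # Matches pi_loss = mean((pi - t_a)^2) in the graph above.
    return np.mean(np.square(np.asarray(pi_actions) - np.asarray(ref_actions)))

def executed_action(pi_action, ref_action, epoch, warmup_epochs=100):
    # During warm-up the reference (MPC) action is executed; afterwards the learned policy
    # acts, while the reference action is still stored as the regression target.
    return ref_action if epoch < warmup_epochs else pi_action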
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space print("---") print("obs_dim:", obs_dim) print("act_dim:", act_dim) print("act_limit:", act_limit) print("env.action_space", env.action_space) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2 = actor_critic(x_ph, a_ph, **ac_kwargs) # ty: placeholder to hold meta strategy param TODO: check meta_log_std dimension meta_mu = core.placeholder(act_dim) meta_log_std = core.placeholder(act_dim) meta_mu_next = core.placeholder(act_dim) meta_log_std_next = core.placeholder(act_dim) # ty: logp_phi logp_phi = core.gaussian_likelihood(a_ph, meta_mu, meta_log_std) _, _, logp_phi = core.apply_squashing_func(meta_mu, a_ph, logp_phi) with tf.variable_scope('main', reuse=True): # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi = actor_critic(x_ph, pi, **ac_kwargs) # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # ty: logp_phi_next, make sure the action is from the current policy logp_phi_next = core.gaussian_likelihood(pi_next, meta_mu_next, meta_log_std_next) _, _, logp_phi_next = core.apply_squashing_func( meta_mu_next, pi_next, logp_phi_next) # Target value network with tf.variable_scope('target'): # target q values, using actions from *current* policy _, _, _, q1_targ, q2_targ = actor_critic(x2_ph, pi_next, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_targ = tf.minimum(q1_targ, q2_targ) # Entropy-regularized Bellman backup for Q functions, using Clipped Double-Q targets q_backup = tf.stop_gradient( r_ph + gamma * (1 - d_ph) * (min_q_targ - alpha * logp_pi_next + alpha * logp_phi_next)) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - alpha * logp_phi - min_q_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables 
target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # ty: temporary values for meta_mu, ... # temp0s = np.ones((100,4)) * (-10) # ty: temporary variance for meta strategy temp1s = np.ones((100, 4)) # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], # ty: fill in correct values meta_mu: np.apply_along_axis(obs2mu, 1, batch['obs1']), meta_log_std: temp1s, meta_mu_next: np.apply_along_axis(obs2mu, 1, batch['obs2']), meta_log_std_next: temp1s } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5]) # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
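# --- Illustrative sketch (editor addition): core.gaussian_likelihood, which the meta-strategy
# terms above rely on, written in NumPy. This is the standard diagonal-Gaussian log-density;
# the EPS constant and exact signature are assumptions rather than taken from core itself.
import numpy as np

EPS = 1e-8

def gaussian_likelihood(x, mu, log_std):
    # log N(x | mu, diag(exp(log_std)^2)), summed over the action dimensions.
    pre_sum = -0.5 * (((x - mu) / (np.exp(log_std) + EPS)) ** 2
                      + 2.0 * log_std + np.log(2.0 * np.pi))
    return np.sum(pre_sum, axis=-1)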
def gail(env_fn,traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(),d_hidden_size =64,d_batch_size = 64,seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000,beta =1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100, r_env_ratio=0,gail_ratio =1, d_itr =20, reward_type = 'negative', pretrain_bc_itr =0): """ Additional args: d_hidden_size : hidden layer size of the discriminator d_batch_size : discriminator batch size r_env_ratio, gail_ratio : weights of the environment and GAIL rewards; total reward = gail_ratio * rew_gail + r_env_ratio * rew_from_environment d_itr : number of discriminator update iterations reward_type : the GAIL reward has three types ['negative', 'positive', 'AIRL'] trj_num : number of expert demonstration trajectories to use pretrain_bc_itr : number of behavior-cloning pretraining iterations """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D=Discriminator(env,hidden_size = d_hidden_size,reward_type =reward_type) e_obs = np.loadtxt(traj_dir + '/observations.csv',delimiter=',') e_act = np.loadtxt(traj_dir + '/actions.csv',delimiter= ',')# expert demonstration trajectories Sibuffer =SIBuffer(obs_dim, act_dim, e_obs,e_act,trj_num= 0, max_size =None) assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi,pi_std, entropy, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))- beta*entropy v_loss = tf.reduce_mean((ret_ph - v)**2)# ret_ph holds the buffered cumulative returns # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess,pi,logp,x_ph,a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving
logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k:v for k,v in zip(all_phs, buf.get())}# all_phs holds the placeholders matching each buffer field pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training (this part probably does not need to change) for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl:# if the KL of this update exceeds 1.5x the target, log it and break out of the training loop logger.log('Early stopping at step %d due to reaching max kl.'%i) break logger.store(StopIter=i) for _ in range(train_v_iters):# value-function updates sess.run(train_v, feed_dict=inputs) # Log changes from update (compute the new losses) pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std,entropy],feed_dict = inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old),# improvement from the update DeltaLossV=(v_l_new - v_l_old), Std = std) start_time = time.time() o, r, d, ep_ret_task,ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 if pretrain_bc_itr>0: BC.learn(Sibuffer.expert_obs,Sibuffer.expert_act ,max_itr =pretrain_bc_itr) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) buf.store_rew(r) ''' if t <150: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if d:# if the trajectory reached a terminal state, use the final reward; otherwise bootstrap the value target last_val = r else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})# was v_last = ... before, but this seems fine buf.store_rew(last_val)# if terminal, nothing changes; if cut off at the max length, last_val is used buf.finish_path() if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret_task, EpLen=ep_len)#,EpRet_Sum =ep_ret_sum,EpRet_Gail =ep_ret_gail) o, r, d, ep_ret_task,ep_ret_sum,ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, epoch) agent_obs , agent_act = buf.obs_buf, buf.act_buf d_batch_size = d_batch_size# or len(agent_obs)//d_itr # update discriminator for _t in range(d_itr): e_obs_batch ,e_act_batch =Sibuffer.get_random_batch(d_batch_size) a_obs_batch =sample_batch(agent_obs,batch_size = d_batch_size) a_act_batch= sample_batch(agent_act,batch_size = d_batch_size) D.train(sess, e_obs_batch,e_act_batch , a_obs_batch,a_act_batch ) js_d = D.get_js_div(sess,Sibuffer.main_obs_buf,Sibuffer.main_act_buf,agent_obs,agent_act) #---------------get_gail_reward------------------------------ rew_gail=D.get_reward(sess,agent_obs, agent_act).ravel() buf.rew_buf = gail_ratio *rew_gail+r_env_ratio*buf.rew_buf for path_slice in buf.slicelist[:-1]: ep_ret_gail = rew_gail[path_slice].sum() ep_ret_sum = buf.rew_buf[path_slice].sum() logger.store(EpRet_Sum=ep_ret_sum,EpRet_Gail=ep_ret_gail) buf.culculate_adv_buf() # -------------Perform PPO update!-------------------- update() logger.store(JS=js_d) # Log info about epoch # (could log only every 10 epochs: if epoch % 10 == 0) logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpRet_Sum', average_only=True) logger.log_tabular('EpRet_Gail', average_only=True) logger.log_tabular('EpLen',
average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.log_tabular('Std', average_only=True) logger.log_tabular('JS', average_only=True) #logger.log_tabular('JS_Ratio', average_only=True) logger.dump_tabular()
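# --- Illustrative sketch (editor addition): how the GAIL reward is mixed with the
# environment reward above (buf.rew_buf = gail_ratio * rew_gail + r_env_ratio * buf.rew_buf),
# plus one common reading of the three reward_type options. Here d is assumed to be the
# discriminator's probability that a (state, action) pair came from the expert; the exact
# formulas inside the Discriminator class may differ.
import numpy as np

def mixed_reward(rew_gail, rew_env, gail_ratio=1.0, r_env_ratio=0.0):
    return gail_ratio * np.asarray(rew_gail) + r_env_ratio * np.asarray(rew_env)

def gail_reward(d, reward_type='negative', eps=1e-8):
    d = np.clip(np.asarray(d), eps, 1.0 - eps)
    if reward_type == 'negative':
        return np.log(d)                    # always <= 0
    if reward_type == 'positive':
        return -np.log(1.0 - d)             # always >= 0
    if reward_type == 'AIRL':
        return np.log(d) - np.log(1.0 - d)  # AIRL-style log-odds
    raise ValueError(reward_type)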
def ppo(env_fn, actor_critic=core_2.mlp_actor_critic, beta=1, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() # game environment obs_dim = env.observation_space.shape # get the observe dimension from environment act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #print(env.action_space) x_ph, a_ph = core_2.placeholders_from_spaces(env.observation_space, env.action_space) # 构建神经网络的时候,a_ph还没有 adv_ph, ret_ph, logp_old_ph, log_old_ph_all = core_2.placeholders(None, None, None, 18) #print(logp_old_ph) #print(log_old_ph_all) # Main outputs from computation graph pi, logp, logp_pi, v, logp_all = actor_critic(x_ph, a_ph, **ac_kwargs) # 目前这里的状态和action都还是放的placeholder # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, log_old_ph_all] # Every step, get: action, value, and logprob # 每一步都需要得到action(这里的pi似乎表示action) get_action_ops = [pi, v, logp_pi, logp_all] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core_2.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) #print((tf.exp(log_old_ph_all) * (logp - logp_old_ph))) kl = tf.reduce_mean(tf.multiply(tf.exp(log_old_ph_all),tf.transpose([logp - logp_old_ph]))) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) #pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # 两部分的loss pi_loss = -tf.reduce_mean(ratio * adv_ph - beta * kl) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes # 同步参数 sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # 主循环 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, logp_all = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log # 把数据放进 buffer pool 里 buf.store(o, a, r, v_t, logp_t, logp_all) logger.store(VVals=v_t) # o 应该代表observation o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! # once an epoch of rollouts is finished, perform one update #update() inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kld = sess.run([train_pi, kl], feed_dict=inputs) kld = mpi_avg(kld) if kld > 1.5 * target_kl: beta = 2 * beta if kld < target_kl / 1.5: beta = beta / 2 # logger.log('Early stopping at step %d due to reaching max kl.' % i) # break logger.store(StopIter=i) # the loop above trains the policy; this part trains the value function for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl_new, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl_new, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
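# --- Illustrative sketch (editor addition): the adaptive KL-penalty rule applied in the
# update above, as a standalone helper. The doubling/halving thresholds (1.5x in either
# direction) mirror the code; they are the usual "PPO-penalty" heuristic rather than a
# tuned choice.
def adapt_beta(beta, kl, target_kl):
    if kl > 1.5 * target_kl:
        beta = 2.0 * beta      # penalise more when the update moved too far
    elif kl < target_kl / 1.5:
        beta = beta / 2.0      # penalise less when the update was too timid
    return beta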
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() config = Namespace(gamma=0.99, entropy_level=-1, lr=1e-3, batch_size=128, polyak=0.995, replay_size=100000) sess = tf.Session() sac = SAC(sess, config, env.action_space, env.observation_space) sac.initialize() # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': sac.x_ph['input'], 'a': sac.a_ph }, outputs={ 'mu': sac.mu, 'pi': sac.pi, 'q1': sac.q1, 'q2': sac.q2, 'v': sac.v }) def test_agent(n=10): for j in range(n): obs, reward, done, ep_ret, ep_len = test_env.reset( ), 0, False, 0, 0 while not (done or (ep_len == max_ep_len)): # Take deterministic actions at test time obs, reward, done, _ = test_env.step( sac.act({'input': obs[None]}, deterministic=True)) ep_ret += reward ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() obs, reward, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: action = sac.act({'input': obs[None]}) else: action = env.action_space.sample() # Step the env obs_next, reward, done, _ = env.step(action) ep_ret += reward ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) done = False if ep_len == max_ep_len else done # Store experience to replay buffer sac.observe({'input': obs}, action, reward, {'input': obs_next}, done) # Super critical, easy to overlook step: make sure to update # most recent observation! obs = obs_next if done or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): outs = sac.train() logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) obs, reward, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def ddpg(env, test_env, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) # Neta - logger is giving me some problems, so ignore # logger.save_config(locals()) # Neta - disable seeding # tf.set_random_seed(seed) # np.random.seed(seed) # Neta - we pass the gym environments to DDPG, instead of using a factory method # env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) msglogger.info('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q-backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q}) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] msglogger.info("spinup_ddpg: pi a={}".format(a)) a += noise_scale * np.random.randn(act_dim) msglogger.info("spinup_ddpg: pi a={} after adding noise".format(a)) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 msglogger.info("spinup_ddpg [after reset]: o={} r={} d={}".format(o, r, d)) total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) msglogger.info("spinup_ddpg: o2={} r={} d={}".format(o2, r, d)) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" # Neta # stats = ('Peformance/Validation/', # OrderedDict([('act_noise', act_noise)])) # # distiller.log_training_progress(stats, None, # self.episode, steps_completed=self.current_layer_id, # total_steps=self.amc_cfg.conv_cnt, log_freq=1, loggers=[self.tflogger]) # Neta: noise decay if t > start_steps: act_noise = act_noise * 0.97 # Neta: don't learn while in heatup if t > start_steps: for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
def simple_vpg(env_fn=lambda: gym.make('CartPole-v1'), actor_critic=None, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=10000, gamma=0.99, pi_lr=1e-5, logger_kwargs=dict(), save_freq=10):
    # Global variables
    num_of_train_iterations = epochs
    # `number_of_layers` hidden layers with `hidden_dim` units each
    hidden_dim = 32
    number_of_layers = 2
    learning_rate = pi_lr
    batch_size = steps_per_epoch
    discount_factor = gamma

    # init logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # make gym environment
    env = env_fn()
    obs_dim = env.observation_space.shape[0]

    # model - build computation graph
    with tf.variable_scope('model'):
        # input layer
        obs_ph = tf.placeholder(dtype=np.float32, shape=(None, obs_dim), name='obs_ph')

        # mlp (Multi Layer Perceptron) - hidden layers
        # define sizes for hidden layers
        hidden_sizes = [hidden_dim] * number_of_layers
        # build hidden layers
        x = obs_ph
        for h in hidden_sizes:
            x = tf.layers.dense(x, units=h, activation=tf.tanh)
        # logits
        logits = tf.layers.dense(x, units=env.action_space.n, activation=None)
        # actions - output layer
        actions = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)

        # define loss function
        rtgs_ph = tf.placeholder(shape=(None, ), dtype=tf.float32, name='rtgs_ph')
        acts_ph = tf.placeholder(shape=(None, ), dtype=tf.int32, name='acts_ph')
        action_mask = tf.one_hot(acts_ph, env.action_space.n)
        # log-probability of the selected action (log_softmax, not softmax,
        # so that the REINFORCE gradient is correct)
        log_probs = tf.reduce_sum(action_mask * tf.nn.log_softmax(logits=logits), axis=1)
        loss = -tf.reduce_mean(rtgs_ph * log_probs)

        # define optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    # session init
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    logger.setup_tf_saver(session, inputs={'x': obs_ph}, outputs={'pi': actions})

    # train
    for iter in range(num_of_train_iterations):
        # single train iteration
        # reset batch data
        batch_obs, batch_acts, batch_rtgs, batch_rets, batch_lens = [], [], [], [], []
        # reset episodic data
        obs, reward, done, ep_rews = env.reset(), 0, False, []
        while len(batch_obs) < batch_size:
            # record batch
            # env.render()
            batch_obs.append(obs.copy())
            selected_action = session.run(actions, {obs_ph: obs.reshape(1, -1)})[0]
            obs, reward, done, _ = env.step(selected_action)
            batch_acts.append(selected_action)
            ep_rews.append(reward)
            if done:
                # end of episode
                batch_rets.append(sum(ep_rews))
                batch_lens.append(len(ep_rews))
                # calc discounted reward to go
                n = len(ep_rews)
                x = np.array(ep_rews, dtype=np.float32)
                rtgs_array = np.zeros_like(x)
                for i in reversed(range(n)):
                    rtgs_array[i] = x[i] + discount_factor * (rtgs_array[i + 1] if i + 1 < n else 0)
                batch_rtgs += list(rtgs_array)
                # reset episodic data
                obs, reward, done, ep_rews = env.reset(), 0, False, []

        if (iter % save_freq == 0) or (iter >= num_of_train_iterations - 1):
            logger.save_state({'env': env}, None)

        # TODO: normalize advs trick:
        # batch_advs = np.array(batch_rtgs)
        # batch_advs = (batch_advs - np.mean(batch_advs))/(np.std(batch_advs) + 1e-8)

        feed_dict = {
            obs_ph: np.array(batch_obs[:len(batch_rtgs)]),
            acts_ph: np.array(batch_acts[:len(batch_rtgs)]),
            rtgs_ph: np.array(batch_rtgs)
        }
        batch_loss, _ = session.run([loss, optimizer], feed_dict=feed_dict)

        # log epoch results
        logger.log_tabular('Epoch', iter)
        logger.log_tabular('TotalEnvInteracts', (iter + 1) * batch_size)
        logger.log_tabular('loss', batch_loss)
        logger.log_tabular('AverageEpRet', np.mean(batch_rets))
        logger.log_tabular('AverageEpLen', np.mean(batch_lens))
        logger.dump_tabular()
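
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): discounted reward-to-go
# as a standalone NumPy helper, the quantity the per-episode loop above
# accumulates. `reward_to_go` is a hypothetical name.
import numpy as np

def reward_to_go(rews, gamma=0.99):
    """rtg[i] = rews[i] + gamma * rtg[i + 1], computed right to left."""
    rtgs = np.zeros(len(rews), dtype=np.float32)
    running = 0.0
    for i in reversed(range(len(rews))):
        running = rews[i] + gamma * running
        rtgs[i] = running
    return rtgs

# reward_to_go(np.array([1.0, 1.0, 1.0]), gamma=0.5) -> [1.75, 1.5, 1.0]
# ---------------------------------------------------------------------------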
def dd_dqn(env, logger_kwargs=dict(), network_params=dict(), rl_params=dict(), resume_training=False, resume_params=dict()): logger = EpochLogger(**logger_kwargs) if not resume_training: save_vars = locals().copy() save_vars.pop('env') logger.save_config(save_vars) # ==== control params ==== seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] update_freq = rl_params['update_freq'] n_updates = rl_params['n_updates'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] num_tests = rl_params['num_tests'] save_freq = rl_params['save_freq'] # ==== rl params ==== use_HER = rl_params['use_HER'] use_prev_a = rl_params['use_prev_a'] gamma = rl_params['gamma'] polyak = rl_params['polyak'] q_lr = rl_params['q_lr'] # ==== noise params ==== act_noise_min = rl_params['act_noise_min'] act_noise_max = rl_params['act_noise_max'] act_noise_max_steps = rl_params['act_noise_max_steps'] test_act_noise = rl_params['test_act_noise'] # if resuming sess is passed as a param if not resume_training: sess = tf.compat.v1.Session(config=tf_config) # set seeding (still not perfectly deterministic) tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) env.action_space.np_random.seed(seed) # get required gym spaces obs = env.observation_space act = env.action_space # get the size after resize obs_dim = network_params['input_dims'] act_dim = act.n goal_dim = len(env.goal_list) # add action dimension to network params network_params['output_dim'] = act_dim if not resume_training: # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=obs_dim[2]) test_state_buffer = StateBuffer(m=obs_dim[2]) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, goal_dim=goal_dim, size=replay_size) # Inputs to computation graph x_ph, a_ph, prev_a_ph, x2_ph, r_ph, d_ph, g_ph = placeholders( obs_dim, act_dim, act_dim, obs_dim, None, None, goal_dim) # Main outputs from computation graph with tf.variable_scope('main'): value_x1, advantage_x1, value_x2, advantage_x2 = action_value_networks( x_ph, x2_ph, use_prev_a, a_ph, prev_a_ph, g_ph, network_params) # Target networks with tf.variable_scope('target'): _, _, value_targ_x2, advantage_targ_x2 = action_value_networks( x_ph, x2_ph, use_prev_a, a_ph, prev_a_ph, g_ph, network_params) var_counts = tuple( count_vars(scope) for scope in ['main/q', 'target/q']) print("""\nNumber of parameters: main q: %d target q: %d \n""" % var_counts) # combine value and advantage functions q_x1 = value_x1 + tf.subtract( advantage_x1, tf.reduce_mean(advantage_x1, axis=1, keepdims=True)) q_x2 = value_x2 + tf.subtract( advantage_x2, tf.reduce_mean(advantage_x2, axis=1, keepdims=True)) q_targ = value_targ_x2 + tf.subtract( advantage_targ_x2, tf.reduce_mean(advantage_targ_x2, axis=1, keepdims=True)) # get the index of the maximum q value, corresponds with action taken pi = tf.argmax(q_x1, axis=1) # get q values for actions taken q_val = tf.reduce_sum(tf.multiply(q_x1, a_ph), axis=1) # Double QL uses maximum action from main network but q value from target max_q_x2 = tf.one_hot(tf.argmax(q_x2, axis=1), depth=act_dim) q_targ_val = tf.reduce_sum(tf.multiply(q_targ, max_q_x2), axis=1) # Bellman backup for Q function q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_targ_val) # mean squared error loss q_loss = 0.5 * tf.reduce_mean((q_val - q_backup)**2) # set up optimizer trainable_vars = get_vars('main/q') 
q_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=q_lr, epsilon=1e-04) train_q_op = q_optimizer.minimize(q_loss, var_list=trainable_vars, name='train_q_op') # Polyak averaging for target variables (polyak=0.00 for hard update) target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ], name='target_update') # Initializing targets to match main variables target_init = tf.group([ tf.compat.v1.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) else: # if resuming define all the ph and outputs from saved model # inputs x_ph = resume_params['model']['x_ph'] a_ph = resume_params['model']['a_ph'] prev_a_ph = resume_params['model']['prev_a_ph'] x2_ph = resume_params['model']['x2_ph'] r_ph = resume_params['model']['r_ph'] d_ph = resume_params['model']['d_ph'] g_ph = resume_params['model']['g_ph'] # outputs pi = resume_params['model']['pi'] q_loss = resume_params['model']['q_loss'] q_x1 = resume_params['model']['q_x1'] # small buffers replay_buffer = resume_params['resume_state']['replay_buffer'] train_state_buffer = resume_params['resume_state'][ 'train_state_buffer'] test_state_buffer = resume_params['resume_state']['test_state_buffer'] # get needed operations from graph by name (trouble saving these) train_q_op = tf.get_default_graph().get_operation_by_name("train_q_op") target_update = tf.get_default_graph().get_operation_by_name( "target_update") if not resume_training: sess.run(tf.compat.v1.global_variables_initializer()) sess.run(target_init) else: sess = resume_params['sess'] # Setup model saving if save_freq is not None: logger.setup_tf_saver(sess, inputs={ 'x_ph': x_ph, 'a_ph': a_ph, 'prev_a_ph': prev_a_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph, 'g_ph': g_ph }, outputs={ 'pi': pi, 'q_loss': q_loss, 'q_x1': q_x1 }) def get_action(state, one_hot_goal, prev_a, noise_scale): state = state.astype('float32') / 255. 
if np.random.random_sample() < noise_scale: a = env.action_space.sample() else: a = sess.run(pi, feed_dict={ x_ph: [state], g_ph: [one_hot_goal], prev_a_ph: [prev_a] })[0] return a def reset(state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o = process_image_observation(o, obs_dim) r = process_reward(r) state = state_buffer.init_state(init_obs=o) prev_a = np.zeros(act_dim) # new random goal when the env is reset goal_id = np.random.randint(goal_dim) one_hot_goal = np.eye(goal_dim)[goal_id] goal = env.goal_list[goal_id] env.goal_button = goal # print('Goal Button: {}'.format(goal)) return o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a def test_agent(n=1): print('Testing...') for j in range(n): test_o, test_r, test_d, test_ep_ret, test_ep_len, test_state, test_one_hot_goal, test_prev_a = reset( test_state_buffer) while not (test_d or (test_ep_len == max_ep_len)): test_a = get_action(test_state, test_one_hot_goal, test_prev_a, test_act_noise) test_o, test_r, test_d, _ = env.step(test_a) test_o = process_image_observation(test_o, obs_dim) test_r = process_reward(test_r) test_state = test_state_buffer.append_state(test_o) test_ep_ret += test_r test_ep_len += 1 test_one_hot_a = process_action(test_a, act_dim) test_prev_a = test_one_hot_a logger.store(TestEpRet=test_ep_ret, TestEpLen=test_ep_len) # ================== Main training Loop ================== if not resume_training: start_time = time.time() o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset( train_state_buffer) total_steps = steps_per_epoch * epochs act_noise = update_eps(current_step=0, min_eps=act_noise_min, max_eps=act_noise_max, max_steps=act_noise_max_steps) resume_t = 0 # array for storing states used with HER if use_HER: HER_buffer = HERBuffer(obs_dim=obs_dim, act_dim=act_dim, goal_dim=goal_dim, size=max_ep_len) # resuming training else: start_time = time.time() total_steps = steps_per_epoch * (epochs + resume_params['additional_epochs']) act_noise = resume_params['resume_state']['act_noise'] HER_buffer = resume_params['resume_state']['HER_buffer'] resume_t = resume_params['resume_state']['resume_t'] o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = resume_params[ 'resume_state']['rl_state'] # reset the environment to the state set before saving env.set_env_state(resume_params['resume_state']['env_state']) # Main loop: collect experience in env and update/log each epoch for t in range(resume_t, total_steps): if t > start_steps: a = get_action(state, one_hot_goal, prev_a, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) o2 = process_image_observation(o2, obs_dim) # thresholding done in env r = process_reward(r) one_hot_a = process_action(a, act_dim) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer # if life is lost then store done as true true replay_buffer.store(state, one_hot_a, prev_a, r, next_state, d, one_hot_goal) # append to HER buffer if use_HER: HER_buffer.store(state, one_hot_a, prev_a, r, next_state, d, one_hot_goal) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 state = next_state prev_a = one_hot_a # store additional states in replay buffer where the goal # is given by the final state, if the final state was incorrect if use_HER: if d and (ep_len != max_ep_len): # get actual goal achieved achieved_goal = np.eye(goal_dim)[env.goal_list.index( env.latest_button)] # if an incorrect goal was reached if (achieved_goal != one_hot_goal).any(): for j in range(ep_len): # pull data from HER buffer sample = HER_buffer.sample(j) # change this to calc_rew function in env if j == ep_len - 1: new_rew = env.max_rew else: new_rew = sample['rews'] # add to replay buffer replay_buffer.store(sample['obs1'], sample['acts'], sample['prev_acts'], new_rew, sample['obs2'], sample['done'], achieved_goal) # do a single update if t > 0 and t % update_freq == 0: for i in range(n_updates): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], prev_a_ph: batch['prev_acts'], r_ph: batch['rews'], d_ph: batch['done'], g_ph: batch['goal'] } # Q-learning update outs = sess.run([q_loss, q_x1, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) if d or (ep_len == max_ep_len): # store episode values logger.store(EpRet=ep_ret, EpLen=ep_len) # reset the environment o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset( train_state_buffer) if use_HER: # reset HER buffer HER_buffer.reset() # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # update target network outs = sess.run(target_update) # update actor noise every epoch act_noise = update_eps(current_step=t, min_eps=act_noise_min, max_eps=act_noise_max, max_steps=act_noise_max_steps) # save everything neccessary for restarting training from current position env_state = env.get_env_state() # Save model if save_freq is not None: if (epoch % save_freq == 0) or (epoch == epochs - 1): print('Saving...') rl_state = [ o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a ] logger.save_state( state_dict={ 'env_state': env_state, 'replay_buffer': replay_buffer, 'train_state_buffer': train_state_buffer, 'test_state_buffer': test_state_buffer, 'HER_buffer': HER_buffer, 'act_noise': act_noise, 'resume_t': t + 1, 'rl_state': rl_state }) # Test the performance of the deterministic version of the agent. (resets the env) test_agent(n=num_tests) # set params for resuming training env.set_env_state(env_state) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Actor Noise', act_noise) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'), show_plot=False)
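
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the dueling aggregation
# and Double-Q target construction used by dd_dqn above, applied to toy NumPy
# batches. Both helper names are hypothetical.
import numpy as np

def dueling_q(value, advantage):
    """Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))."""
    return value + (advantage - advantage.mean(axis=1, keepdims=True))

def double_q_target(rews, done, q_main_next, q_targ_next, gamma=0.99):
    """Select argmax actions with the main net, evaluate them with the target net."""
    best_a = np.argmax(q_main_next, axis=1)
    q_next = q_targ_next[np.arange(len(best_a)), best_a]
    return rews + gamma * (1.0 - done) * q_next

# dueling_q(np.array([[0.5], [1.0]]), np.array([[1.0, -1.0], [0.0, 2.0]]))
#   -> [[ 1.5, -0.5],
#       [ 0.0,  2.0]]
# ---------------------------------------------------------------------------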
def ppo(env_fn, GUI=True, actor_critic=my_mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, on_policy=True, prev_epochs=0): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. GUI : Whether or not display GUI during training. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) if GUI: env = env_fn("GUI", prev_epochs) else: env = env_fn("DIRECT", prev_epochs) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space sess = tf.Session() # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # Main outputs from computation graph pi, logp, logp_pi, v, mu, log_std = actor_critic(x_ph, a_ph, **ac_kwargs) # if load_path==None: # # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # # Main outputs from computation graph # pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # else: # fname = osp.join(load_path, 'tf1_save') # print('\n\nLoading old model from %s.\n\n' % fname) # # # load the things! # model = restore_tf_graph(sess, fname) # x_ph, a_ph = model['x'], model['a'] # pi, logp, logp_pi, v = model['pi'], model['logp'], model['logp_pi'], model['v'] # Calculated through one epoch, assigned by buf's methods adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi }) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # lllogp, mmmu, llog_std = sess.run([logp, mu, log_std], feed_dict=inputs) # logp is basically the same as logp_old_ph, the error starts from 1e-6, # and this error is a little strange... # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): last_noise_time = 0.0 noise = np.zeros(12) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape( 1, -1)}) # CHANGE THE feed_dict HERE! # aa = a.copy() # if 2.0 < env.t < 4.0: # # on_policy = False # if env.t - last_noise_time > 0.1: # noise = np.random.uniform(-0.5 * np.pi, 0.5 * np.pi, 12) # last_noise_time += 0.1 # a += noise # logp_t = sess.run(logp, feed_dict={x_ph: o.reshape(1, -1), a_ph: a}) # else: # # on_policy = True # pass # print("time:", env.t, a-aa) if not on_policy: a = np.array([get_action_from_target_policy(env.t)]) logp_t = sess.run(logp, feed_dict={ x_ph: o.reshape(1, -1), a_ph: a }) env.history_buffer['last_action'] = a[0] for i in range( 25): # Change the frequency of control from 500Hz to 20Hz o2, r, d, o2_dict = env.step(a[0]) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 # print(ep_len, d) terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target if d: last_val = 0 # print(o2_dict['position']) # print(np.alltrue(o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]) is True) # print(np.alltrue([o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]])) # print("I did it!!!") else: # last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)}) last_val = 0 buf.finish_path(last_val) print(ep_ret) # logger.store(EpRet=ep_ret+last_val, EpLen=ep_len) # if terminal: # o, ep_ret, ep_len = env.reset(), 0, 0 if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 last_noise_time = 0.0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() env.addEpoch() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # show the log if time.ctime()[-13:-11] == '09': break env.close()
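
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the PPO-clip surrogate
# that `pi_loss` above minimizes (negated), evaluated pointwise in NumPy. The
# `min_adv = tf.where(...)` construction in the graph amounts to the same
# thing as clip(ratio, 1 - eps, 1 + eps) * adv. Helper name is hypothetical.
import numpy as np

def ppo_clip_objective(ratio, adv, clip_ratio=0.2):
    """Mean of min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv)."""
    clipped = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    return np.minimum(ratio * adv, clipped).mean()

# ppo_clip_objective(np.array([0.5, 1.0, 1.5]), np.array([1.0, 1.0, -1.0]))
#   -> 0.0   (pointwise terms 0.5, 1.0 and -1.5 average to zero)
# ---------------------------------------------------------------------------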
def asac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001, delta=0.02, sample_step=2000): alpha = Alpha(alpha_start=alpha_start, delta=delta) alpha_t = alpha() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None) x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) alpha_ph = core.scale_holder() # Main outputs from computation graph #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi) Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ) R_backup = tf.stop_gradient(Q_pi) adv = Q_pi - R pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2) R_loss = 0.5*tf.reduce_mean((R_backup - R)**2) value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) """ R_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R')) """ # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), 
get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update, R_loss, Q_loss] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5) config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] total_steps = steps_per_epoch * epochs counter = 0 ret_epi = [] obs_epi = [] loss_old = 10000 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7], LossR=outs[11]) counter += 1 logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] logger.store(RetEst=ret_est) if counter >= 1000: loss_new, _ = logger.get_stats('LossPi') counter = 0 if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps: rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32) rho_ptr = 0 for sample_t in range(sample_step): a = get_action(o) o2, r, d, _ = env.step(a) ep_len += 1 d = False if ep_len == max_ep_len else d rho_s[rho_ptr] = o o = o2 if d or (ep_len == max_ep_len): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 advantages = sess.run(adv, feed_dict={x_ph: rho_s}) alpha.update_alpha(advantages) #alpha.update_alpha(rho_q-rho_v) alpha_t = alpha() print(alpha_t) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 loss_old = 10000 else: loss_old = loss_new # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EntCoeff', alpha_t) logger.log_tabular('RetEst', average_only=True) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossR', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
class DeterministicLearner: """ Learner for training Agents with deterministic policies, and thus have different behavior during training and testing """ def __init__(self, agent, env, steps_per_epoch=5000, epochs=50, seed=0, max_ep_len=1000, start_steps=10000, replay_size=int(1e6), batch_size=100, n_test_episodes=10, output_dir=None, output_fname='progress.txt', exp_name=None): self.epoch_len, self.n_epochs = steps_per_epoch, epochs self.max_ep_len, self.start_steps = max_ep_len, start_steps self.n_test_episodes = n_test_episodes self.logger = EpochLogger(output_dir=output_dir, output_fname=output_fname, exp_name=exp_name) print('locals') for key, val in locals().items(): print('{}: {}'.format(key, len(str(val)))) # self.logger.save_config(locals()) self.env, self.agent = env, agent self.buffer = OffPolicyBuffer(buffer_size=replay_size, epoch_size=steps_per_epoch, batch_size=batch_size) saver_kwargs = agent.build_graph(env.observation_space, env.action_space) self.logger.setup_tf_saver(**saver_kwargs) var_counts = tuple( tf_utils.trainable_count(scope) for scope in ['pi', 'q']) self.logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) np.random.seed(seed) tf.set_random_seed(seed) def episode_step(self, obs, rew, is_term, ep_len, ep_ret, epoch_ctr, testing=False): """ take a single step in the episode """ # environment variables to store in buffer env_to_buffer = dict(obs=obs, rew=rew, is_term=is_term) # Take agent step, return values to store in buffer, and in logs act = self.agent.step(obs, testing=testing) if not testing: self.buffer.store({**env_to_buffer, 'act': act}) epoch_ctr += 1 ep_len += 1 ep_ret += rew obs, rew, is_term, _ = self.env.step(act) return obs, rew, is_term, ep_len, ep_ret, epoch_ctr def play_episode(self, epoch_ctr=0, testing=False): """ play out an episode until one of these things happens: 1. episode ends 2. max episode length is reached 3. end of epoch is reached """ obs = self.env.reset() rew, ep_len, ep_ret, is_term_state = 0, 0, 0, False while ((ep_len < self.max_ep_len) and (not is_term_state) and (epoch_ctr < self.epoch_len)): step_ret = self.episode_step(obs, rew, is_term_state, ep_len, ep_ret, epoch_ctr, testing=testing) obs, rew, is_term_state, ep_len, ep_ret, epoch_ctr = step_ret ep_ret += rew # important! add the last reward to the return! 
log_prefix = 'Test' if testing else '' if (is_term_state) or (ep_len >= self.max_ep_len): self.logger.store(**{ log_prefix + 'EpRet': ep_ret, log_prefix + 'EpLen': ep_len }) if not testing: self.buffer.finish_path(last_obs=obs) return ep_len, ep_ret, epoch_ctr def train_episode(self, ep_len): """ train agent at the end of episode """ batches = self.buffer.batches(n_batches=ep_len) for train_iter, batch in enumerate(batches): to_logger = self.agent.train(train_iter, batch) self.logger.store(**to_logger) def run_epoch(self): """ run epoch of training + evaluation """ epoch_ctr = 0 while epoch_ctr < self.epoch_len: ep_len, _, epoch_ctr = self.play_episode(epoch_ctr=epoch_ctr, testing=False) self.train_episode(ep_len) self.test_epoch(self.n_test_episodes) def test_epoch(self, n_test_episodes): """ perform testing for an epoch """ for _ in range(n_test_episodes): self.play_episode(0, testing=True) def learn(self): """ Train the agent over n_epochs """ for epoch in range(self.n_epochs): start_time = time.time() self.run_epoch() self.log_epoch(epoch, start_time) self.logger.save_state({'env': self.env}, None) self.agent.sess.close() def log_epoch(self, epoch, start_time): """ Log info about epoch """ self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestEpRet', with_min_and_max=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalEnvInteracts', (epoch + 1) * self.epoch_len) self.logger.log_tabular('Time', time.time() - start_time) for column_name, kwargs in self.agent.log_tabular_kwargs.items(): self.logger.log_tabular(column_name, **kwargs) self.logger.dump_tabular()
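
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original code): how a
# DeterministicLearner is intended to be driven. `SomeDeterministicAgent` is a
# hypothetical placeholder for any agent exposing the build_graph / step /
# train / sess interface used by the class above.
#
#   env = gym.make('Pendulum-v0')
#   agent = SomeDeterministicAgent(hidden_sizes=(256, 256))
#   learner = DeterministicLearner(agent, env,
#                                  steps_per_epoch=5000, epochs=50,
#                                  output_dir='/tmp/deterministic_run')
#   learner.learn()   # alternates data collection, training, and test episodes
# ---------------------------------------------------------------------------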
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Trust Region Policy Optimization (with support for Natural Policy Gradient) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO losses ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
p = r.copy() r_dot_old = np.dot(r, r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval inputs = {k: v for k, v in zip(all_phs, buf.get())} Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
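
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the conjugate-gradient
# routine above, run against an explicit symmetric positive-definite matrix to
# check that it solves Ax = b. In TRPO, `Ax` is the (damped) Fisher/Hessian
# vector product instead of a dense matrix. `cg_dense_demo` is hypothetical.
import numpy as np

def cg_dense_demo():
    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    x = np.zeros_like(b)
    r = b.copy()          # residual for the x = 0 starting point
    p = r.copy()
    r_dot_old = r @ r
    for _ in range(10):
        z = A @ p
        alpha = r_dot_old / (p @ z + 1e-8)
        x += alpha * p
        r -= alpha * z
        r_dot_new = r @ r
        if r_dot_new < 1e-12:   # converged (the code above runs a fixed cg_iters)
            break
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return np.allclose(A @ x, b, atol=1e-6)

# cg_dense_demo() -> True
# ---------------------------------------------------------------------------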
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dims = env.action_space #[ choice.shape for choice in env.action_space.values() ] #act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholder(None), core.placeholder( None), {} for k in env.action_space: logp_old_ph[k] = core.placeholder(None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dims, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio, min_adv, pi_loss = {}, {}, {} for k in env.action_space: ratio[k] = tf.exp(logp[k] - logp_old_ph[k]) # pi(a|s) / pi_old(a|s) min_adv[k] = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss[k] = -tf.reduce_mean(tf.minimum(ratio[k] * adv_ph, min_adv[k])) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl, approx_ent, clipped, clipfrac = {}, {}, {}, {} for k in env.action_space: approx_kl[k] = tf.reduce_mean( logp_old_ph[k] - logp[k]) # a sample estimate for KL-divergence, easy to compute approx_ent[k] = tf.reduce_mean( -logp[k]) # a sample estimate for entropy, also easy to compute clipped[k] = tf.logical_or(ratio[k] > (1 + clip_ratio), ratio[k] < (1 - clip_ratio)) clipfrac[k] = tf.reduce_mean(tf.cast(clipped[k], tf.float32)) pi_loss_sum = tf.reduce_sum(list(pi_loss.values())) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss_sum) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving save_outputs = {'v': v} for k in env.action_space: save_outputs['pi_' + k] = pi[k] logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs=save_outputs) def update(): inputs = {} for k, v in zip(all_phs, buf.get()): if type(k) is not dict: inputs[k] = v else: for k_, v_ in zip(k.values(), v.values()): inputs[k_] = v_ pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) for k in kl: kl[k] = mpi_avg(kl[k]) if max(list(kl.values())) > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs) sum_dict = lambda x: x if type(x) is not dict else np.sum( list(x.values())) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=sum_dict(kl), Entropy=sum_dict(ent), ClipFrac=sum_dict(cf), DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o2, r, d, _ = env.step(**a) env.render() #force_realtime=True) ep_ret += r #print ("frame_return: %.4f sofar_EpRet: %.4f" % (r, ep_ret)) ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) print("EpRet:", ep_ret) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
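# Illustrative sketch (not part of the original code): the clipped surrogate
# objective that pi_loss above implements, written in plain NumPy so the clipping
# behavior can be inspected in isolation. Function and argument names are assumptions.
import numpy as np

def clipped_surrogate_loss(logp_new, logp_old, adv, clip_ratio=0.2):
    """Negative PPO objective: -E[min(ratio * adv, clipped_adv)]."""
    ratio = np.exp(logp_new - logp_old)              # pi(a|s) / pi_old(a|s)
    clipped_adv = np.where(adv > 0,
                           (1 + clip_ratio) * adv,   # cap the gain when adv > 0
                           (1 - clip_ratio) * adv)   # cap the gain when adv < 0
    return -np.mean(np.minimum(ratio * adv, clipped_adv))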
def sac1_carla(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(3e5), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_space = env.observation_space.spaces[0] act_space = env.action_space obs_dim = obs_space.shape act_dim = act_space.shape # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=list(obs_dim), act_dim=list(act_dim), size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/cnn_layer', 'main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t cnn_layer: %d, \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if alpha == 'auto': target_entropy = (-np.prod(env.action_space.shape)) log_alpha = tf.get_variable( 'log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) q_backup = r_ph + gamma*(1-d_ph)*v_backup # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - tf.stop_gradient(q1_pi)) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss cnn_params = get_vars('main/cnn_layer') # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) pi_params = get_vars('main/pi') train_pi_op = pi_optimizer.minimize(pi_loss, var_list = cnn_params + pi_params) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list = cnn_params + value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), train_pi_op, train_value_op, target_update] else: step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update, train_alpha_op] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: 
o[np.newaxis,...]})[0] def test_agent(n=1): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == 200)): # max_ep_len # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = [0.35, 0] # a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): print('EpRet: ',ep_ret, 'ep_len: ', ep_len) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], Alpha=outs[6]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha',average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
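# Illustrative sketch (not from the original source): the automatic temperature
# update used above when alpha == 'auto', rewritten as a single plain gradient
# step on log_alpha. The helper name and arguments are assumptions.
import numpy as np

def log_alpha_step(log_alpha, logp_pi_batch, target_entropy, lr=1e-3):
    """One descent step on loss = mean(-log_alpha * (logp_pi + target_entropy))."""
    grad = -np.mean(logp_pi_batch + target_entropy)  # logp_pi treated as constant (stop_gradient)
    # When policy entropy drops below target_entropy, the gradient is negative, so
    # log_alpha (and hence alpha = exp(log_alpha)) increases to push entropy back up.
    return log_alpha - lr * grad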
def sigail(env_fn, traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000, beta=1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100, r_env_ratio=0, d_itr=20, reward_type='negative', trj_num=20, buf_size=1000, si_update_ratio=0.02, js_smooth=5, buf_update_type='random', pretrain_bc_itr=0): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size, reward_type=reward_type) #!add Discriminator object D_js_m = JS_div_machine(env, hidden_size=d_hidden_size) e_obs = np.zeros((buf_size, obs_dim[0])) e_act = np.zeros((buf_size, act_dim[0])) Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=trj_num, max_size=buf_size, js_smooth_num=js_smooth) #!sibuf trj_full = False assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, pi_std, entropy, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum( ratio * adv_ph, min_adv)) - beta * entropy #add entropy v_loss = tf.reduce_mean((ret_ph - v)**2) #ret_phには累積報酬のバッファが入る # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Sync params across processes # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get()) } #all_phsは各バッファーに対応するプレースホルダー辞書 pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training#ここも変える必要あり? おそらく変えなくて良い for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: #更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): # update the value function sess.run(train_v, feed_dict=inputs) # Log changes from update (compute the new losses) pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs) logger.store( LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), # improvement from the update DeltaLossV=(v_l_new - v_l_old), Std=std) start_time = time.time() o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 if pretrain_bc_itr > 0: BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=pretrain_bc_itr) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ''' if t <150: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): ''' if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) ''' #!add discriminator train '''# optionally include the terminal step as well o_reshape = o.reshape(core.combined_shape(1,obs_dim)) a_reshape = a.reshape(core.combined_shape(1,act_dim)) agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#! reshape o from (obs_dim,) to (1, obs_dim) before appending agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)# train D including the terminal state-action pair ''' agent_obs = buf.obs_buf[buf.path_slice()] agent_act = buf.act_buf[buf.path_slice()] #D.train(sess,e_obs,e_act ,agent_obs,agent_act) #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()# add the reward produced by each state-action pair to the buffer (rewards are shifted by one) if trj_full: gail_r = 1 else: gail_r = 0 rew_gail = gail_r * D.get_reward( sess, agent_obs, agent_act).ravel() # add the reward produced by each state-action pair to the buffer (rewards are shifted by one) ep_ret_gail += rew_gail.sum() #!before gail_ratio ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail rew_gail_head = rew_gail[:-1] last_val_gail = rew_gail[-1] buf.rew_buf[slice( buf.path_start_idx + 1, buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[ slice(buf.path_start_idx + 1, buf.ptr)] #!add GAIL reward; the final reward is excluded, so it is one element shorter if d: # if trajectory didn't reach terminal state, bootstrap value target last_val = r_env_ratio * r + last_val_gail else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1) }) # was v_last=... before, but this seems fine buf.finish_path( last_val) # make sure buf.finish_add_r_v has been called before this if terminal: #only store trajectory to SIBUffer if trajectory finished if trj_full: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory else: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory logger.store(EpRet=ep_ret_task, EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail, EpLen=ep_len) o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset( ), 0, False, 0, 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update! 
if not (trj_full): M_obs_buf = Sibuffer.get_obs_trj() trj_full = (M_obs_buf.shape[0] >= buf_size) if trj_full: # when the replay buffer is larger than r_threshold Sibuffer.update_main_buf(ratio_update=si_update_ratio, update_type=buf_update_type) M_obs_buf = Sibuffer.get_obs_trj() M_act_buf = Sibuffer.get_act_trj() d_batch_size = len(agent_obs) for _t in range(d_itr): e_obs_batch, e_act_batch = Sibuffer.get_random_batch( d_batch_size) D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act) D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs, e_act) # train to measure the distance between the buffer and the expert js_d = D.get_js_div(sess, Sibuffer.main_obs_buf, Sibuffer.main_act_buf, agent_obs, agent_act) js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs, e_act) else: js_d, js_d_m = 0.5, 0.5 update() Sibuffer.store_js(js_d) logger.store(JS=js_d, JS_M=js_d_m, JS_Ratio=Sibuffer.js_ratio_with_random) # Log info about epoch #if epoch%10 == 0:#logger print each 10 epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpRet_Sum', average_only=True) logger.log_tabular('EpRet_Gail', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('Std', average_only=True) logger.log_tabular('buffer_r', Sibuffer.buffer_r_average) logger.log_tabular('JS', average_only=True) logger.log_tabular('JS_M', average_only=True) logger.log_tabular('JS_Ratio', average_only=True) logger.dump_tabular()
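# Illustrative sketch (not from the original source) of the reward mixing that
# sigail performs at the end of each trajectory: the discriminator reward is
# blended with the scaled environment reward before finish_path(). Names are assumptions.
import numpy as np

def mix_rewards(env_rews, gail_rews, r_env_ratio=0.0):
    """Blend environment and discriminator rewards elementwise, as in buf.rew_buf above."""
    env_rews = np.asarray(env_rews, dtype=np.float64)
    gail_rews = np.asarray(gail_rews, dtype=np.float64)
    return gail_rews + r_env_ratio * env_rews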
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-2, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! 
update() # Log info about epoch #logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', average_only=True) #logger.log_tabular('EpLen', average_only=True) #logger.log_tabular('VVals', with_min_and_max=True) #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) #logger.log_tabular('LossPi', average_only=True) #logger.log_tabular('LossV', average_only=True) #logger.log_tabular('DeltaLossPi', average_only=True) #logger.log_tabular('DeltaLossV', average_only=True) #logger.log_tabular('Entropy', average_only=True) #logger.log_tabular('KL', average_only=True) #logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
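# Illustrative sketch (not the buffer's actual code): GAE-lambda advantage
# estimation, which VPGBuffer above applies internally when finish_path() is called.
# gamma and lam match the function arguments; everything else is an assumption.
import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    """rews: r_0..r_{T-1}; vals: V(s_0)..V(s_T), where the last entry is the bootstrap value."""
    T = len(rews)
    adv = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        delta = rews[t] + gamma * vals[t + 1] - vals[t]  # one-step TD residual
        running = delta + gamma * lam * running          # exponentially weighted sum of residuals
        adv[t] = running
    return adv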
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, explorer=None, eps=.03, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, 
-1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 pretrain_steps = steps_per_epoch * pretrain_epochs total_epochs = epochs + pretrain_epochs total_steps = steps_per_epoch * total_epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): if t > start_steps: a = get_action(o) elif pretrain_steps == 0: # only explore if not pretraining with MaxEnt a = env.action_space.sample() # use MaxEnt exploration if you are in a pretrain epoch or if eps-greedy pre = t < pretrain_steps during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
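# Illustrative sketch (not from the original source): the polyak target update that
# target_update above builds with tf.assign, shown in plain NumPy so the formula
# theta_targ <- rho * theta_targ + (1 - rho) * theta is explicit. Names are assumptions.
import numpy as np

def polyak_update(target_params, main_params, polyak=0.995):
    """Return new target parameters moved slightly toward the main parameters."""
    return [polyak * np.asarray(t) + (1 - polyak) * np.asarray(m)
            for t, m in zip(target_params, main_params)]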
def cvi_ad(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, alp = 0.8, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, decay = None, squash = False): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) adv_ph = tf.placeholder(dtype = tf.float32, shape = (None,)) alp_ph = tf.placeholder(dtype = tf.float32) t_step = tf.placeholder(dtype = tf.float32) #adv_ph1 = tf.placeholder(dtype = tf.float32, shape = (None,)) #adv_ph2 = tf.placeholder(dtype = tf.float32, shape = (None,)) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, ad1, ad2, ad1_pi, ad2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, ad1_targ, ad2_targ, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) squash_eps = 1e-2 if squash: print("Squashed") squash_func = lambda x: tf.sign(x) * (tf.sqrt(tf.abs(x) + 1) - 1) + x * squash_eps squash_ifunc = lambda x: tf.sign(x) * ((tf.sqrt(1 + 4 * squash_eps * (tf.abs(x) + 1 + squash_eps)) - 1)** 2 * (1 / (2 * squash_eps))** 2 - 1) else: print ("Not Squashed") squash_func = lambda x: x squash_ifunc = lambda x: x # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) q1 = v + ad1 q2 = v + ad2 q1_pi = v + ad1_pi q2_pi = v + ad2_pi # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(squash_func(r_ph + gamma*(1-d_ph)*squash_ifunc(v_targ) + alp_ph * adv_ph)) #q_backup1 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph1) #q_backup2 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph2) v_backup = tf.stop_gradient(squash_func(squash_ifunc(min_q_pi) - alpha * logp_pi)) # Soft actor-critic losses #alp = tf.Variable(0.2,dtype=tf.float32) #q_min = tf.minimum(q1,q2) pi_loss = tf.reduce_mean(alpha * logp_pi - squash_ifunc(min_q_pi)) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main') , get_vars('target'))]) # target_update = tf.group([tf.assign(v_targ, tf.cond(tf.not_equal(t_step%1000,0), lambda: v_targ, lambda: v_main)) # for v_main, v_targ in zip(get_vars('main') , get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
train_pi_op, train_value_op, target_update] # adv_op = squash_ifunc(tf.minimum(q1_targ, q2_targ))-squash_ifunc(v_targ) adv_op = squash_ifunc(tf.minimum(ad1_targ, ad2_targ)) #adv_op1 = q1_targ-v_targ #adv_op2 = q2_targ-v_targ # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs if decay: alp_val = 0.2 else: alp_val = alp update_step = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): update_step+=1 batch = replay_buffer.sample_batch(batch_size) feed_dict = {x2_ph: batch['obs1'], a_ph: batch['acts'] } advantage = sess.run(adv_op , feed_dict) #advantage = sess.run([adv_op1, adv_op2] , feed_dict) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], t_step: update_step, adv_ph : advantage, alp_ph : alp_val #adv_ph1 : advantage[0], #adv_ph2 : advantage[1] } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if decay: alp_val = eval(decay)(t//steps_per_epoch) # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
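# Illustrative stand-alone version (not from the original source) of the value
# squashing pair used in cvi_ad above (squash_func / squash_ifunc), with a
# round-trip check that the inverse recovers the input.
import numpy as np

def squash(x, eps=1e-2):
    return np.sign(x) * (np.sqrt(np.abs(x) + 1) - 1) + eps * x

def unsquash(y, eps=1e-2):
    return np.sign(y) * (((np.sqrt(1 + 4 * eps * (np.abs(y) + 1 + eps)) - 1) / (2 * eps)) ** 2 - 1)

def check_squash_roundtrip():
    x = np.linspace(-100.0, 100.0, 21)
    assert np.allclose(unsquash(squash(x)), x, atol=1e-6)  # inverse recovers the original values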
def ddpg_lp(env_fn, render_env=False, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Hyper-parameters for NNs
    hidden_sizes = list(ac_kwargs['hidden_sizes'])
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear

    # Main actor-critic
    with tf.variable_scope('main'):
        actor = MLP(hidden_sizes + [act_dim],
                    hidden_activation=actor_hidden_activation,
                    output_activation=actor_output_activation)
        critic = MLP(hidden_sizes + [1],
                     hidden_activation=critic_hidden_activation,
                     output_activation=critic_output_activation)
        pi = act_limit * actor(x_ph)
        q = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1)), axis=1)
        q_pi = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1)), axis=1)

    # Target actor-critic
    with tf.variable_scope('target'):
        actor_targ = MLP(hidden_sizes + [act_dim],
                         hidden_activation=actor_hidden_activation,
                         output_activation=actor_output_activation)
        critic_targ = MLP(hidden_sizes + [1],
                          hidden_activation=critic_hidden_activation,
                          output_activation=critic_output_activation)
        # Note that we only need q_targ(s, pi_targ(s)).
        pi_targ = act_limit * actor_targ(x2_ph)
        q_pi_targ = tf.squeeze(critic_targ(tf.concat([x2_ph, pi_targ], axis=-1)), axis=1)

    # Learning Progress on Actor
    with tf.variable_scope('actor_learning_progress'):
        actor_lp = LearningProgress(memory_length=10,
                                    input_dim=obs_dim,
                                    output_dim=act_dim,
                                    hidden_sizes=hidden_sizes,
                                    hidden_activation=actor_hidden_activation,
                                    output_activation=actor_output_activation,
                                    logger_kwargs=logger_kwargs,
                                    loger_file_name='actor_lp_log.txt')

    # Learning Progress on Critic
    with tf.variable_scope('critic_learning_progress'):
        critic_lp = LearningProgress(memory_length=10,
                                     input_dim=obs_dim + act_dim,
                                     output_dim=1,
                                     hidden_sizes=hidden_sizes,
                                     hidden_activation=critic_hidden_activation,
                                     output_activation=critic_output_activation,
                                     logger_kwargs=logger_kwargs,
                                     loger_file_name='critic_lp_log.txt')

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables)
    train_q_op = q_optimizer.minimize(q_loss, var_list=critic.variables)

    # Polyak averaging for target variables
    target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                              for v_main, v_targ in zip(actor.variables + critic.variables,
                                                        actor_targ.variables + critic_targ.variables)])

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(actor.variables + critic.variables,
                                                      actor_targ.variables + critic_targ.variables)])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def get_learning_progress_based_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ep_actor_lp_norm, ep_actor_var_outputs, ep_actor_var_first_half_outputs, ep_actor_var_second_half_outputs = 0, 0, 0, 0
    ep_critic_lp_norm, ep_critic_var_outputs, ep_critic_var_first_half_outputs, ep_critic_var_second_half_outputs = 0, 0, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Update actor_learning_progress with latest weights
        update_latest_memory_every_n_steps = 1000
        if t % update_latest_memory_every_n_steps == 0:
            actor_lp.update_latest_memory(actor.get_weights())

        # Step the env
        if render_env:
            env.render()
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        actor_lp_norm, actor_var_outputs, \
            actor_var_first_half_outputs, \
            actor_var_second_half_outputs = actor_lp.compute_learning_progress(o, sess, t, start_time)
        ep_actor_lp_norm += actor_lp_norm
        ep_actor_var_outputs += actor_var_outputs
        ep_actor_var_first_half_outputs += actor_var_first_half_outputs
        ep_actor_var_second_half_outputs += actor_var_second_half_outputs

        critic_lp_norm, critic_var_outputs, \
            critic_var_first_half_outputs, \
            critic_var_second_half_outputs = critic_lp.compute_learning_progress(np.concatenate((o, a)), sess, t, start_time)
        ep_critic_lp_norm += critic_lp_norm
        ep_critic_var_outputs += critic_var_outputs
        ep_critic_var_first_half_outputs += critic_var_first_half_outputs
        ep_critic_var_second_half_outputs += critic_var_second_half_outputs

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done']
                             }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            logger.store(EpActorLPNorm=ep_actor_lp_norm)
            logger.store(EpActorVar=ep_actor_var_outputs)
            logger.store(EpActorVarFirst=ep_actor_var_first_half_outputs)
            logger.store(EpActorVarSecond=ep_actor_var_second_half_outputs)
            logger.store(EpCriticLPNorm=ep_critic_lp_norm)
            logger.store(EpCriticVar=ep_critic_var_outputs)
            logger.store(EpCriticVarFirst=ep_critic_var_first_half_outputs)
            logger.store(EpCriticVarSecond=ep_critic_var_second_half_outputs)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ep_actor_lp_norm, ep_actor_var_outputs, ep_actor_var_first_half_outputs, ep_actor_var_second_half_outputs = 0, 0, 0, 0
            ep_critic_lp_norm, ep_critic_var_outputs, ep_critic_var_first_half_outputs, ep_critic_var_second_half_outputs = 0, 0, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('EpActorLPNorm', with_min_and_max=True)
            logger.log_tabular('EpActorVar', with_min_and_max=True)
            logger.log_tabular('EpActorVarFirst', with_min_and_max=True)
            logger.log_tabular('EpActorVarSecond', with_min_and_max=True)
            logger.log_tabular('EpCriticLPNorm', with_min_and_max=True)
            logger.log_tabular('EpCriticVar', with_min_and_max=True)
            logger.log_tabular('EpCriticVarFirst', with_min_and_max=True)
            logger.log_tabular('EpCriticVarSecond', with_min_and_max=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def sppo(args,
         env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=200,
         target_kl=0.01,
         logger_kwargs=dict(),
         save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer (read from ``args.pi_lr``).

        vf_lr (float): Learning rate for value function optimizer (read from ``args.vf_lr``).

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    ###########
    if args.alpha == 'auto':
        target_entropy = 0.35
        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=tf.log(0.2))
        alpha = tf.exp(log_alpha)
    else:
        alpha = args.alpha
    ###########

    # Main outputs from computation graph
    mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, h]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    ######
    if args.alpha == 'auto':
        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(-h + target_entropy))  # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0)
        alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5)
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)

    # For PPO
    # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # ### Scheme1: SPPO NO.2: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp)
    # min_adv = tf.where(adv_logp > 0, (1 + clip_ratio) * adv_logp, (1 - clip_ratio) * adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    # ### Scheme3: SPPO NO.3: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph
    # min_adv = tf.where(adv_logp > 0, (1 + clip_ratio) * adv_logp, (1 - clip_ratio) * adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    ### Scheme2: SPPO NO.2: add entropy (see the NumPy restatement after this function)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h)

    v_loss = tf.reduce_mean((ret_ph - v)**2)  # + (ret_ph - q)**2) / 2.0

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(h)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss)
    # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            if args.alpha == 'auto':
                sess.run(train_alpha_op, feed_dict=inputs)
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        # for _ in range(train_v_iters):
        #     sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old),
                     Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, h_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})
            # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1, -1), a_ph: a})

            # SPPO NO.1: add entropy
            # rh = r - args.alpha * logp_t
            if args.alpha == 'auto':
                rh = r + sess.run(alpha) * h_t
            else:
                rh = r + alpha * h_t  # exact entropy

            # save and log
            buf.store(o, a, rh, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            # d = False if ep_len == max_ep_len else d

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # # Save model
        # if (epoch % save_freq == 0) or (epoch == epochs - 1):
        #     logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Alpha', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
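# For reference, the Scheme-2 policy objective assembled in sppo() above is the standard
# PPO clipped surrogate plus an alpha-weighted entropy bonus. The sketch below restates
# that per-batch loss in plain NumPy for clarity; all names are illustrative and the
# function is not used by the training code.
import numpy as np


def _sppo_scheme2_pi_loss(logp, logp_old, adv, entropy, alpha, clip_ratio=0.2):
    """L = -mean( min(ratio * adv, clipped_adv) + alpha * entropy ), matching pi_loss above."""
    ratio = np.exp(logp - logp_old)                       # pi(a|s) / pi_old(a|s)
    min_adv = np.where(adv > 0,
                       (1 + clip_ratio) * adv,            # same asymmetric clipping as the
                       (1 - clip_ratio) * adv)            # tf.where construction above
    return -np.mean(np.minimum(ratio * adv, min_adv) + alpha * entropy)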
def ppo(workload_file,
        model_path,
        ac_kwargs=dict(),
        seed=0,
        traj_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        pre_trained=0,
        trained_model=None,
        attn=False,
        shuffle=False,
        backfil=False,
        skip=False,
        score_type=0,
        batch_job_slice=0,
        sched_algo=4):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = HPCEnvSkip(shuffle=shuffle,
                     backfil=backfil,
                     skip=skip,
                     job_score_type=score_type,
                     batch_job_slice=batch_job_slice,
                     build_sjf=False,
                     sched_algo=sched_algo)
    env.seed(seed)
    env.my_init(workload_file=workload_file, sched_file=model_path)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['attn'] = attn

    # Inputs to computation graph
    buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE, gamma, lam)

    if pre_trained:
        sess = tf.Session()
        model = restore_tf_graph(sess, trained_model)
        logger.log('load pre-trained model')

        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

        x_ph = model['x']
        a_ph = model['a']
        mask_ph = model['mask']
        adv_ph = model['adv']
        ret_ph = model['ret']
        logp_old_ph = model['logp_old_ph']

        pi = model['pi']
        v = model['v']
        # logits = model['logits']
        out = model['out']
        logp = model['logp']
        logp_pi = model['logp_pi']
        pi_loss = model['pi_loss']
        v_loss = model['v_loss']
        approx_ent = model['approx_ent']
        approx_kl = model['approx_kl']
        clipfrac = model['clipfrac']
        clipped = model['clipped']

        # Optimizers
        # graph = tf.get_default_graph()
        # op = sess.graph.get_operations()
        # [print(m.values()) for m in op]
        # train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0')
        # train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0')
        train_pi = tf.get_collection("train_pi")[0]
        train_v = tf.get_collection("train_v")[0]
        # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad')
        # train_pi = train_pi_optimizer.minimize(pi_loss)
        # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad')
        # train_v = train_v_optimizer.minimize(v_loss)
        # sess.run(tf.variables_initializer(train_pi_optimizer.variables()))
        # sess.run(tf.variables_initializer(train_v_optimizer.variables()))

        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]

        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

    else:
        x_ph, a_ph = placeholders_from_spaces(env.observation_space, env.action_space)
        # y_ph = placeholder(JOB_SEQUENCE_SIZE * 3)  # 3 is the number of sequence features
        mask_ph = placeholder(env.action_space.n)
        adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None)

        # Main outputs from computation graph
        pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph, **ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]

        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

        # Experience buffer

        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

        # PPO objectives
        ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
        v_loss = tf.reduce_mean((ret_ph - v)**2)

        # Info (useful to watch during learning)
        approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
        approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
        clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
        clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

        # Optimizers
        train_pi = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
        train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        tf.add_to_collection("train_pi", train_pi)
        tf.add_to_collection("train_v", train_v)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v})
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph,
                              'adv': adv_ph,
                              'mask': mask_ph,
                              'ret': ret_ph,
                              'logp_old_ph': logp_old_ph
                          },
                          outputs={
                              'pi': pi,
                              'v': v,
                              'out': out,
                              'pi_loss': pi_loss,
                              'logp': logp,
                              'logp_pi': logp_pi,
                              'v_loss': v_loss,
                              'approx_ent': approx_ent,
                              'approx_kl': approx_kl,
                              'clipped': clipped,
                              'clipfrac': clipfrac
                          })

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset(), 0, False, 0, 0, 0, 0, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    start_time = time.time()
    for epoch in range(epochs):
        t = 0
        while True:
            # [no_skip, skip]
            lst = [1, 1]
            # for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES):
            #     job = o[i:i + JOB_FEATURES]
            #     # the skip time of will_skip job exceeds MAX_SKIP_TIME
            #     if job[-2] == 1.0:
            #         lst = [1, 0]

            a, v_t, logp_t, output = sess.run(get_action_ops,
                                              feed_dict={
                                                  x_ph: o.reshape(1, -1),
                                                  mask_ph: np.array(lst).reshape(1, -1)
                                              })
            # print(a, end=" ")
            '''
            action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs)
            log_action_prob = np.log(action_probs[action])
            '''

            # save and log
            buf.store(o, None, a, np.array(lst), r, v_t, logp_t)
            logger.store(VVals=v_t)
            if a[0] == 1:
                skip_count += 1
            o, r, d, r2, sjf_t, f1_t = env.step(a[0])
            ep_ret += r
            ep_len += 1
            show_ret += r2
            sjf += sjf_t
            f1 += f1_t

            if d:
                t += 1
                buf.finish_path(r)
                logger.store(EpRet=ep_ret, EpLen=ep_len, ShowRet=show_ret,
                             SJF=sjf, F1=f1, SkipRatio=skip_count / ep_len)
                [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset(), 0, False, 0, 0, 0, 0, 0, 0

                if t >= traj_per_epoch:
                    # print("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action)
                    break
        # print("Sample time:", (time.time() - start_time) / num_total, num_total)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # start_time = time.time()
        update()
        # print("Train time:", time.time() - start_time)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('ShowRet', average_only=True)
        logger.log_tabular('SJF', average_only=True)
        logger.log_tabular('F1', average_only=True)
        logger.log_tabular('SkipRatio', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
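# A minimal sketch of how this HPC-scheduling PPO trainer might be invoked. The workload
# trace, sched file, output directory, and hyperparameter values below are placeholders
# chosen for illustration only; they are not taken from this repo's experiments.
if __name__ == '__main__':
    example_logger_kwargs = dict(output_dir='./data/ppo_hpc_example',  # hypothetical output dir
                                 exp_name='ppo_hpc_example')
    ppo(workload_file='./data/example-workload.swf',   # hypothetical SWF workload trace
        model_path='./data/example-workload.schd',     # hypothetical sched file path
        ac_kwargs=dict(),
        seed=0,
        traj_per_epoch=100,
        epochs=10,
        logger_kwargs=example_logger_kwargs)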