def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        gru_units=256, trials_per_epoch=100, episodes_per_trial=2, n=100,
        epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3,
        train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        gru_units (int): Number of hidden units in the policy's GRU cell.

        trials_per_epoch (int): Number of trials collected in each epoch.

        episodes_per_trial (int): Number of episodes run back-to-back in each
            trial; the RNN state is carried across the episodes of a trial.

        n (int): Per-episode step cutoff; an episode is truncated after ``n``
            steps even if the environment has not signalled done.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy
            objective. Roughly: how far can the new policy go from the old
            policy while still profiting (improving the objective function)?
            The new policy can still go farther than the clip_ratio says, but
            it doesn't help on the objective anymore. (Usually small, 0.1 to
            0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer to
            take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close
            to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph\ raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph') rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40]) max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph') seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,)) # Because we pad zeros at the end of every sequence of length less than max length, we need to mask these zeros out # when computing loss seq_len_mask_ph = tf.placeholder(dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len)) # rescaled_image_ph This is a ph because we want to be able to pass in value to this node manually rescaled_image_in_ph = tf.placeholder(dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph') a_ph = core.placeholders_from_spaces( env.action_space)[0] conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph, num_outputs=16, kernel_size=[5,5], stride=2) image_out = slim.flatten(slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1, num_outputs=16, kernel_size=[5,5], stride=2)) rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph') # Main outputs from computation graph action_encoder_matrix = np.load(r'encoder.npy') pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic( image_out, a_ph, rew_ph, rnn_state_ph, gru_units, max_seq_len_ph, action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, rnn_state, logits] # Experience buffer buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim, buffer_size, trials_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph) # Need to mask out the padded zeros when computing loss sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial*max_ep_len) # Convert bool tensor to int tensor with 1 and 0 sequence_mask = tf.where(sequence_mask, np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)), np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len))) # need to reshape because ratio is a 1-D vector (it is a concatnation of all sequence) for masking and then reshape # it back pi_loss_vec = tf.multiply(sequence_mask, tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask))) pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio))) aaa = (ret_ph - v)**2 v_loss_vec = tf.multiply(sequence_mask, tf.reshape((ret_ph - v)**2, tf.shape(sequence_mask))) ccc = tf.reshape(v_loss_vec, tf.shape(v)) v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v))) # Info (useful to 
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)              # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
    # Alternative joint objective (policy + value + entropy bonus); currently
    # unused by update() below.
    train = MpiAdamOptimizer(learning_rate=1e-4).minimize(
        pi_loss + 0.01 * v_loss - 0.001 * approx_ent)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph},
                          outputs={'pi': pi, 'v': v})

    def update():
        print(f'Start updating at {datetime.now()}')
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32)
        inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len)
        inputs[seq_len_ph] = buf.seq_len_buf
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        buf.reset()

        # Training
        print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}')
        for i in range(train_pi_iters):
            _, kl, pi_loss_i, v_loss_i, ent = sess.run(
                [train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs)
            print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}')
        logger.store(StopIter=i)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old))
        print(f'Updating finished at {datetime.now()}')

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0

    def recenter_rgb(image, min=0.0, max=255.0):
        """Return the image with RGB values re-centered from [min, max] to [-1, 1]."""
        mid = (min + max) / 2.0
        return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image)

    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            # TODO: tweak settings to match the paper
            # TODO: find a way to generate mazes
            last_a = np.array(0)
            last_r = np.array(r)
            last_rnn_state = np.zeros((1, gru_units), np.float32)
            step_counter = 0
            for episode in range(episodes_per_trial):
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
                # Pre-populate the keys so the action counts print in order
                # (dirty hard-coding).
                action_dict = defaultdict(int)
                action_dict[0] = 0
                action_dict[1] = 0
                action_dict[2] = 0
                for step in range(max_ep_len):
                    a, v_t, logp_t, rnn_state_t, logits_t = sess.run(
                        get_action_ops,
                        feed_dict={
                            rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                            a_ph: last_a.reshape(-1,),
                            rew_ph: last_r.reshape(-1, 1),
                            rnn_state_ph: last_rnn_state,
                            # v_rnn_state_ph: last_v_rnn_state,
                            max_seq_len_ph: 1,
                            seq_len_ph: [1]})
                    action_dict[a[0]] += 1

                    # save and log
                    buf.store(o_rescaled, a, r, v_t, logp_t)
                    logger.store(VVals=v_t)

                    o, r, d, _ = env.step(a[0])
                    step_counter += 1
                    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
                    ep_ret += r
                    ep_len += 1

                    last_a = a[0]
                    last_r = np.array(r)
                    last_rnn_state = rnn_state_t
                    terminal = d or (ep_len == max_ep_len)
                    if terminal or (step == n - 1):
                        if not terminal:
                            print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                        # if trajectory didn't reach terminal state, bootstrap value target
                        last_val = r if d else sess.run(
                            v,
                            feed_dict={
                                rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                                a_ph: last_a.reshape(-1,),
                                rew_ph: last_r.reshape(-1, 1),
                                rnn_state_ph: last_rnn_state,
                                max_seq_len_ph: 1,
                                seq_len_ph: [1]})
                        buf.finish_path(last_val)
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                        print(f'episode terminated with {step} steps. '
                              f'epoch:{epoch} trial:{trial} episode:{episode}')
                        break
                print(action_dict)
            # Pad the buffer with zeros after each trial shorter than the max
            # sequence length, and record the trial's true length.
            if step_counter < episodes_per_trial * max_ep_len:
                buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter)
            buf.seq_len_buf[trial] = step_counter

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts',
                           (epoch + 1) * trials_per_epoch * episodes_per_trial * max_ep_len)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
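# The PPOBuffer used above is defined elsewhere in this repo; judging by the
# `lam` argument and the finish_path() calls, it is expected to compute
# GAE-Lambda advantages. As a reference, here is a minimal, self-contained
# numpy sketch of that computation (name and signature are ours, not the
# buffer's actual API; uses the module-level numpy import):

def _gae_advantages_sketch(rewards, values, last_val, gamma=0.99, lam=0.97):
    """Compute GAE-Lambda advantages for one trajectory.

    rewards: (T,) rewards; values: (T,) value estimates V(s_t);
    last_val: bootstrap value for the state after the final step.
    """
    values = np.append(values, last_val)
    # One-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    # Backward recursion: A_t = delta_t + gamma * lam * A_{t+1}
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages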
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        batch_size=250000, n=100, epochs=100, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80,
        lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        batch_size (int): Total number of environment steps collected in each
            epoch, across all trials.

        n (int): Number of episodes per trial; each trial spans a sequence of
            ``n * max_ep_len`` environment steps.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy
            objective. Roughly: how far can the new policy go from the old
            policy while still profiting (improving the objective function)?
            The new policy can still go farther than the clip_ratio says, but
            it doesn't help on the objective anymore. (Usually small, 0.1 to
            0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer to
            take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close
            to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape sequence_length = n * max_ep_len trials = batch_size // sequence_length # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) x_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='x_ph') t_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='t_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='a_ph') r_ph = tf.placeholder(dtype=tf.float32, shape=(None, sequence_length), name='r_ph') # input_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, n, None), name='rew_ph') adv_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='logp_old_ph') # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, t_ph, a_ph, r_ph, sequence_length, env.action_space.n, env.observation_space.shape[0]) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, t_ph, a_ph, r_ph, adv_ph, ret_ph, logp_old_ph] # for ph in all_phs: # print(ph.shape) # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer buf = PPOBuffer(obs_dim, act_dim, batch_size, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving model_inputs = {'x': x_ph, 't': t_ph, 'a': a_ph, 'r': r_ph} model_outputs = {'pi': pi} logger.setup_tf_saver(sess, inputs=model_inputs, outputs=model_outputs) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} # inputs[a_ph] = np.tril(np.transpose(np.repeat(inputs[a_ph], n).reshape(trials, n, n), [0, 2, 1])) # inputs[rew_ph] = np.tril(np.transpose(np.repeat(inputs[rew_ph], n).reshape(trials, n, n), [0, 2, 1])) # print(inputs[x_ph]) # print(inputs[t_ph]) # print(inputs[a_ph]) # print(inputs[r_ph]) inputs[x_ph] = inputs[x_ph].reshape(trials, sequence_length) inputs[t_ph] = inputs[t_ph].reshape(trials, sequence_length) inputs[a_ph] = inputs[a_ph].reshape(trials, sequence_length) inputs[r_ph] = inputs[r_ph].reshape(trials, 
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            # kl = mpi_avg(kl)
            # if kl > 1.5 * target_kl:
            #     logger.log('Early stopping at step %d due to reaching max kl.' % i)
            #     break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    save_itr = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials):
            print('trial:', trial)
            o_deque = deque(sequence_length * [0], sequence_length)
            t_deque = deque(sequence_length * [0], sequence_length)
            last_a = deque(sequence_length * [0], sequence_length)
            last_r = deque(sequence_length * [0], sequence_length)

            means = env.sample_tasks(1)[0]
            action_dict = defaultdict(int)
            total_reward = 0
            env.reset_task(means)
            o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0
            for episode in range(sequence_length):
                a, v_t, logp_t = sess.run(
                    get_action_ops,
                    feed_dict={
                        x_ph: np.array(o_deque).reshape(1, sequence_length),
                        t_ph: np.array(t_deque).reshape(1, sequence_length),
                        a_ph: np.array(last_a).reshape(1, sequence_length),
                        r_ph: np.array(last_r).reshape(1, sequence_length)
                    })
                # The network predicts over the whole sequence; act on the
                # output for the most recent timestep.
                chosen_a = a[-1]
                chosen_v_t = v_t[-1]
                chosen_logp_t = logp_t[-1]
                action_dict[chosen_a] += 1
                o, r, d, _ = env.step(chosen_a)
                ep_ret += r
                ep_len += 1
                t = ep_len == max_ep_len
                total_reward += r

                o_deque.append(o)
                t_deque.append(int(d))
                last_a.append(chosen_a)
                last_r.append(r)

                # save and log
                buf.store(o, int(t), chosen_a, r, chosen_v_t, chosen_logp_t)
                logger.store(VVals=v_t)

                terminal = d or t
                if terminal or (episode == sequence_length - 1):
                    if not terminal:
                        print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    if d:
                        last_val = r
                    else:
                        last_val = sess.run(
                            v,
                            feed_dict={
                                x_ph: np.array(o_deque).reshape(1, sequence_length),
                                t_ph: np.array(t_deque).reshape(1, sequence_length),
                                a_ph: np.array(last_a).reshape(1, sequence_length),
                                r_ph: np.array(last_r).reshape(1, sequence_length)
                            })
                        last_val = last_val[-1]
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    o_deque[-1] = 0
                    t_deque[-1] = 0
                    last_a[-1] = 0
                    last_r[-1] = 0
            print(action_dict)
            print('average reward:', total_reward / sequence_length)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, save_itr)
            save_itr += 1

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * batch_size)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
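# For reference, a standalone numpy sketch of the clipped surrogate objective
# built in the graphs above, mirroring min(ratio * adv, min_adv). Illustrative
# only; the name and signature are ours, and it relies on the module-level
# numpy import:

def _ppo_clip_objective_sketch(logp, logp_old, adv, clip_ratio=0.2):
    """Return the (unsigned) PPO clipped surrogate objective for one batch."""
    ratio = np.exp(logp - logp_old)  # pi(a|s) / pi_old(a|s)
    # For positive advantages the ratio is capped at 1 + clip_ratio; for
    # negative advantages it is floored at 1 - clip_ratio.
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return np.mean(np.minimum(ratio * adv, min_adv))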
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        trials_per_epoch=2500, steps_per_trial=100, epochs=50, gamma=0.99,
        clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000,
        train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01,
        logger_kwargs=dict(), save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        trials_per_epoch (int): Number of trials (one sampled task each) to
            collect in every epoch.

        steps_per_trial (int): Number of environment steps taken within each
            trial.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy
            objective. Roughly: how far can the new policy go from the old
            policy while still profiting (improving the objective function)?
            The new policy can still go farther than the clip_ratio says, but
            it doesn't help on the objective anymore. (Usually small, 0.1 to
            0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer to
            take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close
            to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph') # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1) adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='logp_old_ph') rew_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='rew_ph') pi_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='pi_state_ph') v_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='v_state_ph') # Initialize rnn states for pi and v # Main outputs from computation graph pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic( x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, NUM_GRU_UNITS, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob and reward get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state] # Experience buffer steps_per_epoch = trials_per_epoch * steps_per_trial local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # tf.reset_default_graph() # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save') def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) print(pi_l_old, v_l_old) # Training for i in range(train_pi_iters): # print(f'pi:{i}') _, kl = sess.run([train_pi, approx_kl], 
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        import datetime
        print(f'finish one batch training at {datetime.datetime.now()}')
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            print(f'trial: {trial}')
            old_a = np.array([0]).reshape(1, 1)
            old_r = np.array([0]).reshape((1, 1, 1))
            means = env.sample_tasks(1)[0]
            action_dict = defaultdict(int)
            for i in range(env.action_space.n):
                action_dict[i] = 0
            env.reset_task_simple(means)
            task_avg = 0.0
            pi_state_t = np.zeros((1, NUM_GRU_UNITS))
            v_state_t = np.zeros((1, NUM_GRU_UNITS))
            for step in range(steps_per_trial):
                a, v_t, logp_t, pi_state_t, v_state_t = sess.run(
                    get_action_ops,
                    feed_dict={
                        x_ph: o.reshape(1, 1, -1),
                        a_ph: old_a,
                        rew_ph: old_r,
                        pi_state_ph: pi_state_t,
                        v_state_ph: v_state_t
                    })

                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                try:
                    o, r, d, _ = env.step(a[0][0])
                except Exception:
                    print(a)
                    raise
                action_dict[a[0][0]] += 1
                old_a = np.array(a).reshape(1, 1)
                old_r = np.array([r]).reshape(1, 1, 1)
                ep_ret += r
                task_avg += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                # Cut off at the end of the trial, not the end of the epoch.
                if terminal or (step == steps_per_trial - 1):
                    if not terminal:
                        print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                    # If the trajectory didn't reach a terminal state, bootstrap
                    # the value target, feeding the recurrent inputs alongside
                    # the observation (v depends on the same placeholders as pi).
                    last_val = r if d else sess.run(
                        v,
                        feed_dict={
                            x_ph: o.reshape(1, 1, -1),
                            a_ph: old_a,
                            rew_ph: old_r,
                            pi_state_ph: pi_state_t,
                            v_state_ph: v_state_t
                        })
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            print(f'avg in trial {trial}: {task_avg / steps_per_trial}')
            print(f'Means in trial {trial}: {means}')
            print(action_dict)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
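# Hedged usage sketch for the variant above: it expects an environment exposing
# sample_tasks() and reset_task_simple() (e.g. a multi-armed bandit meta-RL
# env) plus a recurrent actor-critic with the six-output signature used above.
# `BanditEnv` and `gru_actor_critic` are hypothetical placeholders, not names
# defined in this repo:
#
# if __name__ == '__main__':
#     ppo(lambda: BanditEnv(num_arms=5),
#         actor_critic=gru_actor_critic,
#         trials_per_epoch=2500, steps_per_trial=100, epochs=50,
#         logger_kwargs=dict(output_dir='data/rl2_bandit'))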
def ppo(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=.99, clip_ratio=.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=.97,
        max_ep_len=1000, target_kl=.01, logger_kwargs=dict(), save_freq=10):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Share action space structure with the actor_critic
    ac_kwargs['action_space'] = env.action_space

    x_ph = tf.placeholder(name="x_ph", shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(name="a_ph", shape=[None, act_dim], dtype=tf.float32)
    adv_ph = tf.placeholder(name="adv_ph", shape=[None], dtype=tf.float32)
    ret_ph = tf.placeholder(name="ret_ph", shape=[None], dtype=tf.float32)
    logp_old_ph = tf.placeholder(name="logp_old_ph", shape=[None], dtype=tf.float32)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Helpers for variable counts
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO Objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Stats to watch
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)              # a sample estimate for entropy
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def mpi_avg(x):
        """Average a scalar or vector over MPI processes."""
        return mpi_sum(x) / num_procs()

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
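# Hedged usage sketch for this (non-recurrent) PPO: it builds continuous-action
# placeholders (act_dim = env.action_space.shape[0]), so it assumes a Box
# action space. 'Pendulum-v0' is just an illustrative Gym env id, and `a2c`
# must produce the standard four policy/value outputs:
#
# if __name__ == '__main__':
#     import gym
#     ppo(lambda: gym.make('Pendulum-v0'),
#         actor_critic=a2c,
#         steps_per_epoch=4000, epochs=50,
#         logger_kwargs=dict(output_dir='data/ppo'))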
def trpo(env_fn, actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000,
         epochs=50, gamma=.99, delta=.01, vf_lr=1e-3, train_v_iters=80,
         damping_coeff=.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=.8,
         lam=.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10,
         algo="trpo"):
    # Logger tools
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Seed inits
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Environment recreation
    env = env_fn()

    # Getting obs dims
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    ac_kwargs['action_space'] = env.action_space

    # Placeholders
    x_ph = tf.placeholder(name="x_ph", shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(name="a_ph", shape=[None, act_dim], dtype=tf.float32)
    adv_ph = tf.placeholder(name="adv_ph", shape=[None], dtype=tf.float32)
    ret_ph = tf.placeholder(name="ret_ph", shape=[None], dtype=tf.float32)
    logp_old_ph = tf.placeholder(name="logp_old_ph", shape=[None], dtype=tf.float32)

    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    def keys_as_sorted_list(dict):
        return sorted(list(dict.keys()))

    def values_as_sorted_list(dict):
        return [dict[k] for k in keys_as_sorted_list(dict)]

    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] + values_as_sorted_list(info_phs)
    get_action_ops = [pi, v, logp_pi] + values_as_sorted_list(info)

    # Experience buffer init
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam)

    # Count variables
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ["pi", "v"])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO losses
    ratio = tf.exp(logp - logp_old_ph)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # CG solver requirements
    pi_params = get_vars("pi")

    # Some helpers
    def flat_concat(xs):
        return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0)

    def flat_grad(f, params):
        return flat_concat(tf.gradients(xs=params, ys=f))

    def hessian_vector_product(f, params):
        # Computes Hx as grad(g^T x), where g = grad f; this avoids ever
        # materializing the Hessian H.
        g = flat_grad(f, params)
        x = tf.placeholder(tf.float32, shape=g.shape)
        return x, flat_grad(tf.reduce_sum(g * x), params)

    def assign_params_from_flat(x, params):
        flat_size = lambda p: int(np.prod(p.shape.as_list()))  # the 'int' is important for scalars
        splits = tf.split(x, [flat_size(p) for p in params])
        new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)]
        return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)])

    gradient = flat_grad(pi_loss, pi_params)
    v_ph, hvp = hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = flat_concat(pi_params)
    set_pi_params = assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """Conjugate-gradient solver for Ax = b, with A given implicitly by Ax."""
        x = np.zeros_like(b)
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        inputs = {k: v for k, v in zip(all_phs, buf.get())}

        def mpi_avg(x):
            """Average a scalar or vector over MPI processes."""
            return mpi_sum(x) / num_procs()

        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)
        elif algo == "trpo":
            # trpo shrinks the step until the KL constraint holds and the
            # surrogate loss improves
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log('Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break
                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})
            a, v_t, logp_t, info_t = (agent_outs[0][0], agent_outs[1],
                                      agent_outs[2], agent_outs[3:])

            # Save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
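# Quick standalone sanity check of the conjugate-gradient routine used in
# trpo's update(). The nested cg() above isn't importable, so this mirrors its
# logic under a name of our own choosing (illustrative only; relies on the
# module-level numpy import and a small eps in place of the module's EPS):

def _cg_sketch(Ax, b, iters=10, eps=1e-8):
    """Solve Ax = b for symmetric positive-definite A, given only x -> Ax."""
    x = np.zeros_like(b)
    r = b.copy()          # residual b - Ax, with x initialized to zero
    p = r.copy()          # search direction
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

# e.g. for a small SPD system, _cg_sketch should agree with np.linalg.solve:
# H = np.array([[4., 1.], [1., 3.]]); g = np.array([1., 2.])
# _cg_sketch(lambda p: H @ p, g)  # ~ np.linalg.solve(H, g)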