def load(exp_dir):
    # exp_dir = 'experiments/PPO-00_Pendulum-v0_25-10-2018_20-12-37'
    ckpt_path = path.join(exp_dir, '0/model349.ckpt')
    params_path = path.join(exp_dir, 'params.json')
    assert path.exists(params_path), \
        "params.json must exist at the root of the experiment folder >:v"

    with open(params_path) as f:
        params = json.load(f)

    env = gym.make(params["env_name"])
    continuous = isinstance(env.action_space, gym.spaces.Box)
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0] if continuous else env.action_space.n

    vero = Agent(continuous, ob_dim, ac_dim, n_layers=2)
    # regina = Sensei(vero, continuous, ob_dim, ac_dim,
    #                 epochs, batch_size,
    #                 learning_rate, epsilon)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, ckpt_path)
        render(vero, env, sess)
def load(exp_dir, file_name, env_name):
    # exp_dir = 'experiments/PPO-00_Pendulum-v0_25-10-2018_20-12-37'
    ckpt_path = path.join(exp_dir, '0/model400.ckpt')
    params_path = path.join(exp_dir, 'params.json')
    assert path.exists(params_path), \
        "params.json must exist at the root of the experiment folder >:v"

    with open(params_path) as f:
        params = json.load(f)

    env = gym.make(env_name or params["env_name"])
    continuous = isinstance(env.action_space, gym.spaces.Box)
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0] if continuous else env.action_space.n

    vero = Agent(continuous, ob_dim, ac_dim, n_layers=2)
    # regina = Sensei(vero, continuous, ob_dim, ac_dim,
    #                 epochs, batch_size,
    #                 learning_rate, epsilon)

    if file_name:
        fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
        fps = 50
        out = cv2.VideoWriter(file_name, fourcc, fps, (800, 800))

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, ckpt_path)
        render(vero, env, sess, recorder=out if file_name else None)

    if file_name:
        out.release()
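# --- Hedged sketch, not part of the original file --------------------------
# Both load() variants above depend on a render() helper defined elsewhere in
# the repo.  The function below is only a minimal sketch of what an
# implementation matching the call render(agent, env, sess, recorder=...)
# could look like: roll the policy out, display each frame, and optionally
# write frames to the cv2.VideoWriter created in load().  agent.get_action()
# is an assumed agent API, and the (800, 800) resize is only there to match
# the writer above; note that the training script further down calls
# render(sess, rla, env) with a different argument order, so the repo likely
# has more than one render variant.
def render_sketch(agent, env, sess=None, recorder=None, episodes=1):
    for _ in range(episodes):
        ob, done = env.reset(), False
        while not done:
            if recorder is not None:
                frame = env.render(mode='rgb_array')
                frame = cv2.resize(frame, (800, 800))  # match VideoWriter size
                recorder.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
            else:
                env.render()
            ac = agent.get_action(sess, ob)  # assumed agent API
            ob, _, done, _ = env.step(ac)
    env.close()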
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Render random actions in an environment')
    parser.add_argument('env_name', help='env name')
    args = parser.parse_args()

    env = gym.make(args.env_name)
    vero = RandomAgent(env)
    render(vero, env)
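# --- Hedged sketch, not part of the original file --------------------------
# RandomAgent is referenced above but not defined in this file.  A minimal
# version that fits the call render(vero, env) would simply sample the action
# space; the get_action name mirrors the assumption made in render_sketch.
class RandomAgentSketch:
    def __init__(self, env):
        self.action_space = env.action_space

    def get_action(self, sess, ob):
        # sess and ob are ignored: actions are drawn uniformly at random
        return self.action_space.sample()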
def main():
    env = gym.make('PendrogoneZero-v0')
    # env = gym.make('DroneZero-v0')
    continuous = isinstance(env.action_space, gym.spaces.Box)
    # print(continuous)
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0] if continuous else env.action_space.n
    # print('ob_dim', ob_dim)
    # print('ac_dim', ac_dim)

    veronika = Agent(continuous, ob_dim, ac_dim, n_layers=2)
    regina = Sensei(veronika, continuous, ob_dim, ac_dim,
                    epochs, batch_size,
                    learning_rate, epsilon)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        generator = rollouts_generator(sess, veronika, env, sample_horizon)

        for i in range(num_iterations):
            seg = generator.__next__()
            # print(seg["rew"])
            add_vtarg_adv(seg, lam, gamma)

            adv = seg["adv"]
            adv = (adv - adv.mean()) / (adv.std() + 1e-8)
            adv = adv[:, None]

            regina.train_samples(sess, seg["ob"], seg["ac"], adv,
                                 seg["vtarg"], seg["log_probs"])

            rewards = np.array(seg["ep_rets"])
            if i % 10 == 0 or i == num_iterations - 1:
                if rewards.shape[0] > 0:
                    mean, std = rewards.mean(), rewards.std()
                    print('Iteration {0:3d}: reward: m{1:6.3f}, std{2:4.2f}; '
                          'ep_len: {3:5.2f}; action: m:{4}, std:{5}'
                          .format(i, mean, std, np.mean(seg["ep_lens"]),
                                  np.mean(seg["ac"], axis=0),
                                  np.std(seg["ac"], axis=0)))

        render(veronika, env, sess)
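# --- Hedged sketch, not part of the original file --------------------------
# add_vtarg_adv(seg, lam, gamma) is defined elsewhere in the repo.  The
# sketch below shows the standard GAE(lambda) computation it most likely
# performs on a baselines-style rollout dict, producing the "adv" and "vtarg"
# keys that the training loops read.  The keys "new" (episode-start flags)
# and "nextvpred" (value of the state after the last collected step) are
# assumptions about what rollouts_generator stores in seg.
def add_vtarg_adv_sketch(seg, lam, gamma):
    new = np.append(seg["new"], 0)  # 1 marks the first step of an episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype='float32')
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    # TD(lambda) value targets = advantages + value predictions
    seg["vtarg"] = seg["adv"] + seg["vpred"]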
def main():
    # env = gym.make('Pendulum-v0')
    env = gym.make('CartPole-v1')
    continuous = isinstance(env.action_space, gym.spaces.Box)
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0] if continuous else env.action_space.n

    gamma, lam = 0.99, 0.95
    std = 0.1
    learning_rate = 5e-3
    epsilon = 0.2
    epochs = 1
    num_ite = 1
    sample_size = 5

    # Sampled variables
    ob_no = tf.placeholder(shape=[None, ob_dim], name="observations", dtype=tf.float32)
    ac_na = tf.placeholder(shape=[None, ac_dim], name="actions", dtype=tf.float32) \
        if continuous else \
        tf.placeholder(shape=[None], name="actions", dtype=tf.int32)
    adv_n = tf.placeholder(shape=[None], dtype=tf.float32)
    val_n = tf.placeholder(shape=[None], dtype=tf.float32)
    # Target value function
    t_val = tf.placeholder(shape=[None], dtype=tf.float32)

    # print(ac_dim)
    rla = Agent('veronika', ob_no, ac_dim, continuous, n_layers=2)

    # Gaussian policy loss operations
    # mean_na = rla.pi.logits
    # logprob_n = (ac_na - mean_na) / std**2
    # pg_loss = tf.reduce_mean(logprob_n)

    # with tf.variable_scope('losses'):
    #     log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=ac_na, logits=rla.pi.logits)
    #     pg_loss = tf.reduce_mean(adv_n * log_prob, name='pg_loss')
    #     # Value function loss operations
    #     v_loss = tf.reduce_mean(tf.losses.mean_squared_error(labels=t_val, predictions=val_n), name='v_loss')
    #     loss = pg_loss + v_loss

    # This may only work for the discrete case.
    # Could avoid doing these ops just by using a tf.nn.softmax activation in the last layer
    pi_probs = rla.pi.logits - tf.reduce_max(rla.pi.logits)
    pi_probs = tf.nn.softmax(pi_probs)
    pi_probs_old = rla.old_pi.logits - tf.reduce_max(rla.old_pi.logits)
    pi_probs_old = tf.nn.softmax(pi_probs_old)

    # probabilities of the actions the agent took under the current policy
    act_probs = pi_probs * tf.one_hot(indices=ac_na, depth=ac_dim)
    act_probs = tf.reduce_sum(act_probs, axis=1)
    # probabilities of the actions the agent took under the old policy
    act_probs_old = pi_probs_old * tf.one_hot(indices=ac_na, depth=ac_dim)
    act_probs_old = tf.reduce_sum(act_probs_old, axis=1)

    # PPO clipped surrogate: mean(min(r * A, clip(r, 1 - eps, 1 + eps) * A))
    with tf.variable_scope('loss/surrogate'):
        ratio = tf.exp(tf.log(act_probs) - tf.log(act_probs_old))
        # ratio = tf.divide(act_probs, act_probs_old)
        # ratio = tf.exp(act_probs - act_probs_old)
        clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
        surrogate = tf.minimum(ratio * adv_n, clipped_ratio * adv_n)
        surrogate = tf.reduce_mean(surrogate)

    with tf.variable_scope('loss/value_f'):
        # NOTE: both t_val and val_n are placeholders, so this term is constant
        # with respect to the trainable variables and contributes no gradient;
        # regressing the network's own value output against t_val would be
        # needed for the value function to actually learn.
        v_loss = tf.losses.mean_squared_error(labels=t_val, predictions=val_n)
        v_loss = tf.reduce_mean(v_loss)

    with tf.variable_scope('loss'):
        loss = -surrogate + v_loss

    gradient_clip = 40
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads = tf.gradients(loss, rla.pi_vars)
    print(rla.pi_vars)
    grads, _ = tf.clip_by_global_norm(grads, gradient_clip)
    grads_and_vars = list(zip(grads, rla.pi_vars))
    train_op = optimizer.apply_gradients(grads_and_vars)

    # optimizer = tf.train.AdamOptimizer(learning_rate)
    # train_op = optimizer.minimize(loss, var_list=rla.pi_vars)

    init = tf.global_variables_initializer()

    # gen = generator.__next__()
    with tf.Session() as sess:
        sess.run(init)
        generator = rollouts_generator(sess, rla, env, sample_size)
        # From the beginning, the old policy is equal to the current policy
        rla.save_policy(sess)

        for i in range(num_ite):
            seg = generator.__next__()
            add_vtarg_adv(seg, lam, gamma)

            adv = seg["adv"]
            adv = (adv - adv.mean()) / (adv.std() + 1e-8)

            feed_dict = {
                ob_no: seg["ob"],
                ac_na: seg["ac"],
                adv_n: adv,
                val_n: seg["vpred"],
                t_val: seg["vtarg"]
            }

            total_loss = 0
            for _ in range(epochs):
                _loss, _ = sess.run([loss, train_op], feed_dict=feed_dict)
                total_loss += _loss

                # _stuff = sess.run([ac_na, pi_probs, act_probs, ratio, loss, train_op], feed_dict=feed_dict)
                # print(_stuff[0])
                # print(_stuff[1])
                # print(_stuff[2])
                # print(_stuff[3])

            # Sync the old policy with the freshly updated one for the next iteration
            rla.save_policy(sess)

            returns = np.array(seg["ep_rets"])
            if i % 5 == 0 or i == num_ite - 1:
                print(total_loss / epochs)
                print(returns.mean(), returns.std())

        # _loss, _ = sess.run([loss, train_op], feed_dict=feed_dict)
        render(sess, rla, env)
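# --- Hedged sketch, not part of the original file --------------------------
# rollouts_generator(sess, agent, env, horizon) is defined elsewhere.  Below
# is a sketch of a baselines-style segment generator consistent with the keys
# the training loops above read from seg ("ob", "ac", "rew", "vpred",
# "log_probs", "ep_rets", "ep_lens") and with the "new"/"nextvpred" keys
# assumed by add_vtarg_adv_sketch.  agent.step(sess, ob) returning
# (action, value estimate, log-probability) is an assumption about the Agent
# API, not something confirmed by this file.
def rollouts_generator_sketch(sess, agent, env, horizon):
    ob = env.reset()
    ac = env.action_space.sample()  # only used to size the buffers
    new = True                      # ob starts a new episode
    cur_ep_ret, cur_ep_len = 0, 0
    ep_rets, ep_lens = [], []

    obs = np.array([ob for _ in range(horizon)])
    acs = np.array([ac for _ in range(horizon)])
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    logps = np.zeros(horizon, 'float32')

    t = 0
    while True:
        ac, vpred, logp = agent.step(sess, ob)  # assumed Agent API
        if t > 0 and t % horizon == 0:
            yield {"ob": obs.copy(), "ac": acs.copy(), "rew": rews.copy(),
                   "vpred": vpreds.copy(), "log_probs": logps.copy(),
                   "new": news.copy(), "nextvpred": vpred * (1 - new),
                   "ep_rets": ep_rets, "ep_lens": ep_lens}
            ep_rets, ep_lens = [], []
        i = t % horizon
        obs[i], acs[i], vpreds[i], logps[i], news[i] = ob, ac, vpred, logp, new

        ob, rew, done, _ = env.step(ac)
        rews[i] = rew
        cur_ep_ret += rew
        cur_ep_len += 1
        new = done
        if done:
            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret, cur_ep_len = 0, 0
            ob = env.reset()
        t += 1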