def ppo(BASE_DIR, expert_density, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), steps_per_epoch=1000, epochs=10, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=50, train_v_iters=50, lam=0.97, max_ep_len=1000, target_kl=0.01, data_n=10): data = {} # ALL THE DATA logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # update rule def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
                    % i)
                break
        logger.store(StopIter=i)

        # Value function training
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    # Fit an initial state-density model to a random policy
    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(
        -2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                       args.iter_length)
    density = policy_distr.density()

    data[0] = {
        'pol_s': policy_distr.num_samples,
        'pol_t': policy_distr.num_trajects
    }
    dist_rewards = []

    # Repeat REIL for the given number of rounds
    for i in range(args.rounds):
        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)

        # Density-ratio reward: expert state density over current policy state density
        reward = lambda s: expert_density(s) / (density(s) + args.eps)
        dist_rewards.append(reward)

        start_time = time.time()
        o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        r = reward(o)  # custom reward

        # Main loop: collect experience in env and update/log each epoch
        for epoch in range(epochs):
            for t in range(local_steps_per_epoch):
                a, v_t, logp_t = sess.run(get_action_ops,
                                          feed_dict={x_ph: o.reshape(1, -1)})

                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                o, old_r, d, _ = env.step(a[0])
                r = reward(o)
                ep_ret += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (t == local_steps_per_epoch - 1):
                    if not terminal:
                        print('Warning: trajectory cut off by epoch at %d steps.'
                              % ep_len)
                    # Use the custom reward at a true terminal state; otherwise
                    # bootstrap the value target from the critic.
                    last_val = reward(o) if d else sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    r = reward(o)

            # Store the model on the final epoch of the round
            if epoch == epochs - 1:
                logger.save_state({'env': env}, None)

            # Perform PPO update!
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

        print(message)

        # Evaluate the current policy and refit the density model for the next round
        policy = lambda state: sess.run(
            get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0]
        data[i] = {
            'pol_s': policy_distr.num_samples,
            'pol_t': policy_distr.num_trajects
        }
        data[i]['rewards'] = evaluate_reward(env, policy, data_n)
        if i != args.rounds - 1:
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    return data, dist_rewards
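# The round-based reward above is the ratio of the expert's state density to the current
# policy's state density, reward(s) = expert_density(s) / (density(s) + eps). The
# Gaussian_Density class used for that estimate is not shown here; the sketch below is a
# minimal, hypothetical stand-in (a diagonal-Gaussian state-density model) illustrating how
# such a density-ratio reward can be constructed.
import numpy as np


class DiagonalGaussianDensity:
    """Toy state-density model: fit a diagonal Gaussian to a batch of visited states."""

    def fit(self, states):
        states = np.asarray(states, dtype=np.float64)
        self.mean = states.mean(axis=0)
        self.var = states.var(axis=0) + 1e-6  # guard against zero variance

    def pdf(self, s):
        s = np.asarray(s, dtype=np.float64)
        z = (s - self.mean) ** 2 / self.var
        log_p = -0.5 * float(np.sum(z + np.log(2.0 * np.pi * self.var)))
        return np.exp(log_p)


def make_density_ratio_reward(expert_pdf, policy_pdf, eps=1e-3):
    """Reward r(s) = p_expert(s) / (p_policy(s) + eps), mirroring the lambda above."""
    return lambda s: expert_pdf(s) / (policy_pdf(s) + eps)


# usage sketch: refit the policy density on freshly collected states each round,
# then rebuild the reward before the next batch of PPO epochs.
# policy_model = DiagonalGaussianDensity(); policy_model.fit(collected_states)
# reward = make_density_ratio_reward(expert_density, policy_model.pdf, eps=1e-3)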
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 maxRev = float("-inf") #negative infinity in the beginning #maxRevActionSeq=[] maxRevTSTT = 0 maxRevRevenue = 0 maxRevThroughput = 0 maxRevJAH = 0 maxRevRemVeh = 0 maxRevJAH2 = 0 maxRevRMSE_MLvio = 0 maxRevPerTimeVio = 0 maxRevHOTDensity = pd.DataFrame() maxRevGPDensity = pd.DataFrame() maxtdJAHMax = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu numpyFromA = np.array(a[0]) numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin a[0] = np.ndarray.tolist(numpyFromA) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) #get other stats and store them too otherStats = env.getAllOtherStats() if np.any(np.isnan(np.array(otherStats))): sys.exit("Nan found in statistics! Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2=otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret > maxRev: maxRev = ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT = otherStats[0] maxRevRevenue = otherStats[1] maxRevThroughput = otherStats[2] maxRevJAH = otherStats[3] maxRevRemVeh = otherStats[4] maxRevJAH2 = otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio = otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpTSTT', average_only=True) logger.log_tabular('EpRevenue', average_only=True) logger.log_tabular('EpThroughput', average_only=True) logger.log_tabular('EpJAH', average_only=True) logger.log_tabular('EpRemVeh', average_only=True) logger.log_tabular('EpJAH2', average_only=True) logger.log_tabular('EpMLViolRMSE', average_only=True) logger.log_tabular('EpPerTimeVio', average_only=True) logger.log_tabular('EptdJAHMax', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print("Max cumulative reward obtained= %f " % maxRev) print( "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f" % (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax)) outputVector = [ maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax ] #print("\n===Max rev action sequence is\n",maxRevActionSeq) exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector) exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
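# The rollout loop above rescales actions sampled from a tanh-squashed Gaussian policy
# (which live in (-1, 1)) onto the toll range [env.state.tollMin, env.state.tollMax].
# A minimal sketch of that affine rescaling, with hypothetical toll bounds:
import numpy as np


def rescale_tanh_action(a, low, high):
    """Map an action in (-1, 1) onto [low, high] with an affine transform."""
    a = np.asarray(a, dtype=np.float64)
    return (a + 1.0) * (high - low) / 2.0 + low


# example: tolls bounded between $0.50 and $8.00
# rescale_tanh_action([-1.0, 0.0, 1.0], low=0.5, high=8.0) -> [0.5, 4.25, 8.0]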
def ppo(env_fn, ref_func=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=500, epochs=10000, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=500, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) t_a_ph = core.placeholder_from_space(env.action_space) ret_ph = core.placeholder(None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, t_a_ph, ret_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) print("---------------", local_steps_per_epoch) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # dagger objectives pi_loss = tf.reduce_mean(tf.square(pi - t_a_ph)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old = sess.run([pi_loss, v_loss], feed_dict=inputs) # Training for i in range(train_pi_iters): sess.run(train_pi, feed_dict=inputs) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new = sess.run([pi_loss, v_loss], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(1, epochs + 1, 1): for t in range(local_steps_per_epoch): a_s, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) a = a_s[0] ref_a = call_mpc(env, ref_func) if (epoch < 100): a = ref_a # save and log buf.store(o, a, ref_a, r) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({}, None) # Perform PPO update! 
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        # epoch is 1-based in this loop, so the interaction count is epoch * steps_per_epoch
        logger.log_tabular('TotalEnvInteracts', epoch * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
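# Despite its name, the variant above optimizes a DAgger-style imitation objective: the
# policy loss is the squared error between the policy's action and a reference (MPC)
# action, pi_loss = mean((pi - t_a_ph)^2), and the reference action is executed for the
# first 100 epochs before the learned policy takes over. A small NumPy sketch of both
# pieces (call_mpc and the TensorFlow graph are assumed to exist elsewhere):
import numpy as np


def imitation_loss(policy_actions, reference_actions):
    """Mean squared error between policy actions and reference-controller actions."""
    policy_actions = np.asarray(policy_actions, dtype=np.float64)
    reference_actions = np.asarray(reference_actions, dtype=np.float64)
    return float(np.mean((policy_actions - reference_actions) ** 2))


def choose_action(policy_action, reference_action, epoch, warmup_epochs=100):
    """Warm start: act with the reference controller early on, then with the policy."""
    return reference_action if epoch < warmup_epochs else policy_action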
def sigail(env_fn, traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000, beta=1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100, r_env_ratio=0, d_itr=20, reward_type='negative', trj_num=20, buf_size=1000, si_update_ratio=0.02, js_smooth=5, buf_update_type='random', pretrain_bc_itr=0): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size, reward_type=reward_type) #!add Discriminator object D_js_m = JS_div_machine(env, hidden_size=d_hidden_size) e_obs = np.zeros((buf_size, obs_dim[0])) e_act = np.zeros((buf_size, act_dim[0])) Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=trj_num, max_size=buf_size, js_smooth_num=js_smooth) #!sibuf trj_full = False assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, pi_std, entropy, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum( ratio * adv_ph, min_adv)) - beta * entropy #add entropy v_loss = tf.reduce_mean((ret_ph - v)**2) #ret_phには累積報酬のバッファが入る # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Sync params across processes # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get()) } #all_phsは各バッファーに対応するプレースホルダー辞書 pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training#ここも変える必要あり? おそらく変えなくて良い for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: #更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): #vの更新 sess.run(train_v, feed_dict=inputs) # Log changes from update(新しいロスの計算) pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs) logger.store( LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), #更新での改善量 DeltaLossV=(v_l_new - v_l_old), Std=std) start_time = time.time() o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 if pretrain_bc_itr > 0: BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=pretrain_bc_itr) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ''' if t <150: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): ''' if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) ''' #!add discriminator train '''#終端も加えるならアリッチャあり o_reshape = o.reshape(core.combined_shape(1,obs_dim)) a_reshape = a.reshape(core.combined_shape(1,act_dim)) agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#!o を(obspace,)→(1,obspace)に変換してからアペンド agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)#終端での状態行動対も加えてDを学習 ''' agent_obs = buf.obs_buf[buf.path_slice()] agent_act = buf.act_buf[buf.path_slice()] #D.train(sess,e_obs,e_act ,agent_obs,agent_act) #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()#状態行動対の結果としての報酬をbufferに追加(報酬は一個ずれる) if trj_full: gail_r = 1 else: gail_r = 0 rew_gail = gail_r * D.get_reward( sess, agent_obs, agent_act).ravel() #状態行動対の結果としての報酬をbufferに追加(報酬は一個ずれる) ep_ret_gail += rew_gail.sum() #!before gail_ratio ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail rew_gail_head = rew_gail[:-1] last_val_gail = rew_gail[-1] buf.rew_buf[slice( buf.path_start_idx + 1, buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[ slice(buf.path_start_idx + 1, buf.ptr)] #!add GAIL reward 最後の報酬は含まれないため長さが1短い if d: # if trajectory didn't reach terminal state, bootstrap value target last_val = r_env_ratio * r + last_val_gail else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1) }) #v_last=...だったけどこれで良さげ buf.finish_path( last_val) #これの前にbuf.finish_add_r_vがなされていることを確認すべし if terminal: #only store trajectory to SIBUffer if trajectory finished if trj_full: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory else: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory logger.store(EpRet=ep_ret_task, EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail, EpLen=ep_len) o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset( ), 0, False, 0, 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update! 
        if not trj_full:
            M_obs_buf = Sibuffer.get_obs_trj()
            trj_full = (M_obs_buf.shape[0] >= buf_size)

        if trj_full:  # when the replay buffer has grown larger than the threshold
            Sibuffer.update_main_buf(ratio_update=si_update_ratio,
                                     update_type=buf_update_type)
            M_obs_buf = Sibuffer.get_obs_trj()
            M_act_buf = Sibuffer.get_act_trj()
            d_batch_size = len(agent_obs)
            for _t in range(d_itr):
                e_obs_batch, e_act_batch = Sibuffer.get_random_batch(
                    d_batch_size)
                D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act)
                # train to track the distance between the buffer and the expert data
                D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs, e_act)
            js_d = D.get_js_div(sess, Sibuffer.main_obs_buf,
                                Sibuffer.main_act_buf, agent_obs, agent_act)
            js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs,
                                       e_act)
        else:
            js_d, js_d_m = 0.5, 0.5

        update()
        Sibuffer.store_js(js_d)
        logger.store(JS=js_d,
                     JS_M=js_d_m,
                     JS_Ratio=Sibuffer.js_ratio_with_random)

        # Log info about epoch
        # if epoch % 10 == 0:  # print the logger only every 10 epochs
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('buffer_r', Sibuffer.buffer_r_average)
        logger.log_tabular('JS', average_only=True)
        logger.log_tabular('JS_M', average_only=True)
        logger.log_tabular('JS_Ratio', average_only=True)
        logger.dump_tabular()
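# sigail only switches the discriminator ("GAIL") reward on once the self-imitation buffer
# (SIBuffer) holds enough trajectory data, and keeps feeding that buffer with the agent's
# own finished trajectories. The SIBuffer implementation is not part of this excerpt; the
# class below is a minimal, hypothetical sketch of the core idea, keeping the K
# highest-return trajectories as pseudo-expert data.
import numpy as np


class TopKTrajectoryBuffer:
    """Keep the K highest-return trajectories and expose their (obs, act) pairs."""

    def __init__(self, k):
        self.k = k
        self.trajectories = []  # list of (return, obs_array, act_array)

    def store(self, obs, act, sum_reward):
        self.trajectories.append((float(sum_reward), np.asarray(obs), np.asarray(act)))
        self.trajectories.sort(key=lambda t: t[0], reverse=True)
        del self.trajectories[self.k:]  # drop the lowest-return extras

    def full(self):
        return len(self.trajectories) == self.k

    def get_obs_act(self):
        obs = np.concatenate([t[1] for t in self.trajectories], axis=0)
        act = np.concatenate([t[2] for t in self.trajectories], axis=0)
        return obs, act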
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, custom_h=None, eval_episodes=50, do_checkpoint_eval=False, env_name=None, eval_temp=1.0, train_starting_temp=1.0, env_version=None, env_input=None, target_arcs=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # create logger for tensorboard tb_logdir = "{}/tb_logs/".format(logger.output_dir) tb_logger = Logger(log_dir=tb_logdir) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space if custom_h is not None: hidden_layers_str_list = custom_h.split('-') hidden_layers_int_list = [int(h) for h in hidden_layers_str_list] ac_kwargs['hidden_sizes'] = hidden_layers_int_list # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) temperature_ph = tf.placeholder(tf.float32, shape=(), name="init") # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, temperature_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, temperature_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the # whole GPU memory config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) # log tf graph tf.summary.FileWriter(tb_logdir, sess.graph) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'temperature': temperature_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, [] # initialize variables for keeping track of BEST eval performance best_eval_AverageEpRet = -0.05 # a negative value so that best model is saved at least once. best_eval_StdEpRet = 1.0e30 # save is used to only allow saving BEST models after half of training epochs save = True # below are used for early-stop. We early stop if # 1) a best model has been saved, and, # 2) 50 epochs have passed without a new save saved = False early_stop_count_started = False episode_count_after_saved = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): current_temp = _get_current_temperature(epoch, epochs, train_starting_temp) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1), temperature_ph: current_temp}) # save and log buf.store(o, a, r, v_t, logp_t, current_temp) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 if env_version >= 4 and env.action_is_dummy: # a is dummy action ep_dummy_action_count += 1 ep_dummy_steps_normalized.append(ep_len / env.allowed_steps) terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1), temperature_ph: current_temp}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if env_version >= 4: logger.store(EpDummyCount=ep_dummy_action_count) logger.store(EpTotalArcs=env.adjacency_matrix.sum()) if len(ep_dummy_steps_normalized) > 0: ep_dummy_steps_normalized = np.asarray(ep_dummy_steps_normalized, dtype=np.float32).mean() logger.store(EpDummyStepsNormalized=ep_dummy_steps_normalized) o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, [] # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save. # logger.save_state({'env_name': env_name}, epoch) # # Save a new model every save_freq and at the last epoch. Only keep one copy - the current model # logger.save_state({'env_name': env_name}) # Evaluate and save best model if do_checkpoint_eval and epoch > 0: # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999. # Doing this way, I can use test_policy and plot directly to test the best models. 
# saved best models includes: # 1) a copy of the env_name # 2) the best rl model with parameters # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch # note that 1) and 2) are spinningup defaults, and 3) is a custom save best_eval_AverageEpRet, best_eval_StdEpRet, saved = eval_and_save_best_model( best_eval_AverageEpRet, best_eval_StdEpRet, # a new logger is created and passed in so that the new logger can leverage the directory # structure without messing up the logger in the training loop eval_logger=EpochLogger(**dict( exp_name=logger_kwargs['exp_name'], output_dir=os.path.join(logger.output_dir, "simple_save999999"))), train_logger=logger, tb_logger=tb_logger, epoch=epoch, # the env_name is passed in so that to create an env when and where it is needed. This is to # logx.save_state() error where an env pointer cannot be pickled env_name="F{}x{}T{}_SP{}_v{}".format(env.n_plant, env.n_product, env.target_arcs, env.n_sample, env_version) if env_version >= 3 else env_name, env_version=env_version, env_input=env_input, render=False, # change this to True if you want to visualize how arcs are added during evaluation target_arcs=env.target_arcs, get_action=lambda x: sess.run(pi, feed_dict={x_ph: x[None, :], temperature_ph: eval_temp})[0], # number of samples to draw when simulate demand n_sample=5000, num_episodes=eval_episodes, save=save, seed=seed ) # Perform PPO update! update() # # # Log into tensorboard log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="ClipFrac", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="StopIter", with_min_and_max=False) tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch) tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch) tb_logger.log_scalar(tag="epoch_temp", value=current_temp, step=epoch) if env_version >= 4: log_key_to_tb(tb_logger, logger, epoch, key="EpDummyCount", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="EpTotalArcs", with_min_and_max=False) if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0: log_key_to_tb(tb_logger, logger, epoch, key="EpDummyStepsNormalized", with_min_and_max=False) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) 
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('EpochTemp', current_temp)
        if env_version >= 4:
            logger.log_tabular('EpDummyCount', with_min_and_max=True)
            if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0:
                logger.log_tabular('EpDummyStepsNormalized', average_only=True)
            logger.log_tabular('EpTotalArcs', average_only=True)
        logger.dump_tabular()

        # check for early stop
        if saved:
            # start to count the episodes elapsed after a "saved" event
            early_stop_count_started = True
            # reset the count to 0
            episode_count_after_saved = 0
        else:
            # check whether we should count this episode, i.e., whether early_stop_count_started == True
            if early_stop_count_started:
                episode_count_after_saved += 1
                if episode_count_after_saved > 60:
                    logger.log('Early Stopped at epoch {}.'.format(epoch),
                               color='cyan')
                    break
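# The variant above anneals a sampling temperature that is fed to the actor-critic through
# temperature_ph, starting at train_starting_temp and moving toward the evaluation
# temperature of 1.0. _get_current_temperature is defined outside this excerpt; the
# function below is a hypothetical linear-decay schedule with the same signature, shown
# only to make the mechanism concrete.
def _get_current_temperature_sketch(epoch, epochs, train_starting_temp):
    """Decay linearly from train_starting_temp to 1.0 over the first half of training."""
    if train_starting_temp <= 1.0:
        return 1.0
    decay_epochs = max(1, epochs // 2)
    frac = min(epoch / decay_epochs, 1.0)
    return train_starting_temp + frac * (1.0 - train_starting_temp)


# example with train_starting_temp=5.0 and epochs=100:
# epoch 0 -> 5.0, epoch 25 -> 3.0, epoch 50 onwards -> 1.0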
def gail(env_fn,traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(),d_hidden_size =64,d_batch_size = 64,seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000,beta =1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100, r_env_ratio=0,gail_ratio =1, d_itr =20, reward_type = 'negative', pretrain_bc_itr =0): """ additional args d_hidden_size : hidden layer size of Discriminator d_batch_size : Discriminator's batch size r_env_ratio,gail_ratio : the weight of rewards from envirionment and gail .Total reward = gail_ratio *rew_gail+r_env_ratio* rew_from_environment d_itr : The number of iteration of update discriminater reward_type : GAIL reward has three type ['negative','positive', 'AIRL'] trj_num :the number of trajectory for pretrain_bc_itr: the number of iteration of pretraining by behavior cloeing """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D=Discriminator(env,hidden_size = d_hidden_size,reward_type =reward_type) e_obs = np.loadtxt(traj_dir + '/observations.csv',delimiter=',') e_act = np.loadtxt(traj_dir + '/actions.csv',delimiter= ',')#Demo treajectory Sibuffer =SIBuffer(obs_dim, act_dim, e_obs,e_act,trj_num= 0, max_size =None)#!sibuf assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi,pi_std, entropy, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))- beta*entropy v_loss = tf.reduce_mean((ret_ph - v)**2)#ret_phには累積報酬のバッファが入る # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess,pi,logp,x_ph,a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Sync params across processes # Setup model saving 
logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k:v for k,v in zip(all_phs, buf.get())}#all_phsは各バッファーに対応するプレースホルダー辞書 pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training#ここも変える必要あり? おそらく変えなくて良い for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl:#更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る logger.log('Early stopping at step %d due to reaching max kl.'%i) break logger.store(StopIter=i) for _ in range(train_v_iters):#vの更新 sess.run(train_v, feed_dict=inputs) # Log changes from update(新しいロスの計算) pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std,entropy],feed_dict = inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old),#更新での改善量 DeltaLossV=(v_l_new - v_l_old), Std = std) start_time = time.time() o, r, d, ep_ret_task,ep_ret_gail, ep_len = env.reset(), 0, False, 0,0 , 0 if pretrain_bc_itr>0: BC.learn(Sibuffer.expert_obs,Sibuffer.expert_act ,max_itr =pretrain_bc_itr) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) buf.store_rew(r) ''' if t <150: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if d:# if trajectory didn't reach terminal state, bootstrap value target last_val = r else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})#v_last=...だったけどこれで良さげ buf.store_rew(last_val)#if its terminal ,nothing change and if its maxitr last_val is use buf.finish_path() if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret_task, EpLen=ep_len)#,EpRet_Sum =ep_ret_sum,EpRet_Gail =ep_ret_gail) o, r, d, ep_ret_task,ep_ret_sum,ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, epoch) agent_obs , agent_act = buf.obs_buf, buf.act_buf d_batch_size = d_batch_size#or len(agent_obs)//d_itr #update discreminator for _t in range(d_itr): e_obs_batch ,e_act_batch =Sibuffer.get_random_batch(d_batch_size) a_obs_batch =sample_batch(agent_obs,batch_size = d_batch_size) a_act_batch= sample_batch(agent_act,batch_size = d_batch_size) D.train(sess, e_obs_batch,e_act_batch , a_obs_batch,a_act_batch ) js_d = D.get_js_div(sess,Sibuffer.main_obs_buf,Sibuffer.main_act_buf,agent_obs,agent_act) #---------------get_gail_reward------------------------------ rew_gail=D.get_reward(sess,agent_obs, agent_act).ravel() buf.rew_buf = gail_ratio *rew_gail+r_env_ratio*buf.rew_buf for path_slice in buf.slicelist[:-1]: ep_ret_gail = rew_gail[path_slice].sum() ep_ret_sum = buf.rew_buf[path_slice].sum() logger.store(EpRet_Sum=ep_ret_sum,EpRet_Gail=ep_ret_gail) buf.culculate_adv_buf() # -------------Perform PPO update!-------------------- update() logger.store(JS=js_d) # Log info about epoch #if epoch%10 == 0:#logger print each 10 epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpRet_Sum', average_only=True) logger.log_tabular('EpRet_Gail', average_only=True) logger.log_tabular('EpLen', 
                           average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('JS', average_only=True)
        # logger.log_tabular('JS_Ratio', average_only=True)
        logger.dump_tabular()
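# After each epoch, gail overwrites the stored environment rewards with a weighted blend of
# discriminator rewards and environment rewards,
# buf.rew_buf = gail_ratio * rew_gail + r_env_ratio * buf.rew_buf, and then logs
# per-trajectory returns. A standalone sketch of that blending step (path_slices plays the
# role of buf.slicelist):
import numpy as np


def mix_gail_and_env_rewards(env_rewards, gail_rewards, gail_ratio=1.0, r_env_ratio=0.0):
    """Blend discriminator rewards with environment rewards elementwise."""
    env_rewards = np.asarray(env_rewards, dtype=np.float64)
    gail_rewards = np.asarray(gail_rewards, dtype=np.float64)
    return gail_ratio * gail_rewards + r_env_ratio * env_rewards


def path_returns(rewards, path_slices):
    """Sum of rewards within each trajectory slice of the flat epoch buffer."""
    return [float(np.sum(rewards[sl])) for sl in path_slices]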
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() # game environment obs_dim = env.observation_space.shape # get the observe dimension from environment act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces( env.observation_space, env.action_space) #构建神经网络的时候,a_ph还没有 adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) #目前这里的状态和action都还是放的placeholder # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob # 每一步都需要得到action(这里的pi似乎表示action) get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # 两部分的loss v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes # 同步参数 sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # 前面把计算图构建好了 def update(): # 把input形成字典,等下便于使用 # 通过搜集到的数据,进行梯度下降,更新参数 inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) # 上部分的train是policy,这部分是值函数 for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # 主循环 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log # 把数据放进 buffer pool 里 buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # o 应该代表observation o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! # 打完一局游戏,执行一次更新 update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
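# --- Illustrative usage sketch (added for clarity; not part of the original source). ---
# Shows how a ppo() function like the one above is typically invoked. The environment
# id 'Pendulum-v0' and the output directory are assumptions; `core` and `ppo` are the
# module-level names already used throughout this file.
import gym

if __name__ == '__main__':
    ppo(env_fn=lambda: gym.make('Pendulum-v0'),
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        steps_per_epoch=4000,
        epochs=50,
        logger_kwargs=dict(output_dir='data/ppo_pendulum', exp_name='ppo_pendulum'))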
def main(env_fn, traj_dir, actor_critic=core.mlp_actor_critic, bc_itr=1000, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=4000, target_kl=0.01, save_freq=100, r_env_ratio=0, reward_type='negative', trj_num=30, buf_size=None, si_update_ratio=0.02, js_threshold_ratio=0.5, js_smooth=5): """ Test behavior cloning. """ seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size) #!add Discriminator object D_js_m = JS_div_machine(env, hidden_size=d_hidden_size) e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',') e_act = np.loadtxt(traj_dir + '/actions.csv', delimiter=',') # Demo trajectory Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=trj_num, max_size=buf_size, js_smooth_num=js_smooth) #!sibuf assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=bc_itr) start_time = time.time() o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 # Setup model saving for epoch in range(1000000): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o, r, d, _ = env.step(a[0]) env.render() time.sleep(1e-3) ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal: print('EpRet {}, EpLen {}'.format(ep_ret_task, ep_len)) o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0
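# --- Illustrative sketch (added for clarity; not part of the original source). ---
# main() above pretrains the policy with BehavioralCloning, whose internals are not
# shown in this file. The sketch below is one minimal TF1-style behavior-cloning
# update that maximizes the policy's log-likelihood of the expert actions; the
# function names are hypothetical.
import numpy as np
import tensorflow as tf

def make_bc_ops(logp, pi_lr=1e-3):
    """logp = log pi(a_ph | x_ph) from the actor-critic graph above."""
    bc_loss = -tf.reduce_mean(logp)  # negative log-likelihood of expert actions
    train_op = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(bc_loss)
    return bc_loss, train_op  # the optimizer's slot variables still need initializing

def bc_learn(sess, x_ph, a_ph, bc_loss, train_op, expert_obs, expert_act, max_itr=1000, batch_size=128):
    n = expert_obs.shape[0]
    for _ in range(max_itr):
        idx = np.random.randint(0, n, size=batch_size)  # sample a minibatch of expert data
        sess.run(train_op, feed_dict={x_ph: expert_obs[idx], a_ph: expert_act[idx]})
    return sess.run(bc_loss, feed_dict={x_ph: expert_obs, a_ph: expert_act})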
def ppo(env_fn, # by default, use the neural network mlp we define in core actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ "Args: env_fn: A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ======================================" -OpenAI Okay, quick interruption to OpenAI documentation here. actor_critic is the function which interfaces with tensorflow. It takes in ``x_ph`` (x placeholder), ie. a representation of the current state, and ``a_ph``, a representation of some actions. (TODO: document *what* these actions are). actor_critic runs these inputs through the tensorflow graph and returns several pieces of information that are relevant to PPO; these are described above. Back to OpenAI: " ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1). max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function."
- OpenAI """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # modify the seed based on the process so if # we run this in multiple processes # simultaneously we don't do the # exact same thing seed += 10000 * proc_id() # set up our random stuff with this seed tf.set_random_seed(seed) np.random.seed(seed) # create the environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # tell the policy (implemented in actor_critic function) what the action space is ac_kwargs['action_space'] = env.action_space # "Inputs to computation graph" -OpenAI # create tensorflow placeholders for observations (x_ph), actions (a_ph), # advantages (adv_ph), returns (ret_ph), log probabilities # in the current state of the policy (logp_old_ph) # (old since this is used compared to the newer version of the policy # we are creating in the optimization step, comparing to this "old" version) x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # "Main outputs from computation graph" -OpenAI # essentially here we fill in the tensorflow graph so we can compute # the pi, logp, logp_pi, and v tensors based on the # x_ph and a_ph we created above pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # "Need all placeholders in *this* order later (to zip with data from buffer)" -OpenAI all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # "Every step, get: action, value, and logprob" -OpenAI # we later feed this list into tf.session.run() # to tell it to compute the value of pi, v, logp_pi # using the tensorflow graph we have created get_action_ops = [pi, v, logp_pi] # Experience buffer # number of steps per epoch per process local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count the number of parameters we are gonna be training, # both for the policy and for the value function var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives # ratio is the ratio of two probabilities: # pi(a|s) / pi_old(a|s) # where pi(a|s) is the probability of performing action a # given state s GIVEN THE POLICY WHOSE PARAMETERS WE ARE CHANGING # DURING THE OPTIMIZATION STEP # and pi_old(a|s) is the probability of the policy, # with fixed mlp parameters after the last update, # performing a given state s # we essentially use math to find the gradient of pi(a|s) with respect # to the parameters of the mlp, and this is the core of how we calculate # the gradient of the objective function for gradient descent ratio = tf.exp(logp - logp_old_ph) # "pi(a|s) / pi_old(a|s)"-OpenAI # this min_adv, along with the tf.minimum call in the next line of code, # implement the PPO-clip functionality # NOTE: calling this `min_adv` is a bit confusing; if advantage is negative # this is the min value we allow the gradient descent to consider as the advantage; # but it is the MAX value if advantage is positive. min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # create the functions whose gradients we wish to use for gradient descent # during optimization # for our policy optimization, it is the PPO objective; # for the value function it is simply an error-squared # note that reduce_mean just calculates the mean of the values in the tensor; # ie. 
this gives the expected value of the loss given the experimental values we have pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # "a sample estimate for KL-divergence, easy to compute" -OpenAI approx_ent = tf.reduce_mean(-logp) # "a sample estimate for entropy, also easy to compute" -OpenAI clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # what fraction of advantages are clipped # Optimizers # These use gradient descent with the gradient of the objective # functions we defined above to improve parameters for pi and v train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # initialize the tensorflow computation graph's parameters # with values sess = tf.Session() sess.run(tf.global_variables_initializer()) # "Sync params across processes" -OpenAI sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): # create a dictionary of values, which specify to tensorflow what # to input for the placeholders: tensors containing the data from # the trajectory we have stored in buf inputs = {k:v for k, v in zip(all_phs, buf.get())} # calculate these for logging later pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): # run a training step for the policy, and estimate the kl-divergence # (ie. how much the policy changed) on this step _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) # if the kl divergence is too high, stop training on this step # TODO: understand better why it is important to do this if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break logger.store(StopIter=i) # train our value function mlp for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # "Log changes from update" -OpenAI # TODO: This could be made a bit more computationally efficient by not recalculating pi_l_old each loop # after having calculated the same thing as pi_l_new the previous run through the loop! # Plus, does it really make the most sense to output pi_l_old and v_l_old as LossPi and LossV # instead of pi_l_new and v_l_new? pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() # initialize the variables we use while training # o = observation (env.reset() returns initial observation) # r = reward = (starts as 0) # d = done? 
(whether current episode in env is over) # ep_ret = episode return # ep_len = length of episode so far o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # "Main loop: collect experience in env and update/log each epoch" for epoch in range(epochs): for t in range(local_steps_per_epoch): # run the computation of the action, value function, and probability of the action # using the most recent observation in the x_ph slot a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # take the action we computed and advance the environment o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch - 1): if not terminal: print('Warning: trajectory cut off by epoch at %d steps'%ep_len) # "if trajectory didn't reach terminal state, bootstrap value target" -OpenAI # in other words, if we are stopping this trajectory due to a termination # signal from the env, last_val = the reward from the last step, r # otherwise we stopped because we reached the max episode length or max local_steps_per_epoch, # in which case we set last_val = estimate of the value of current state based on the v function # we are training last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) # "only store EpRet / EpLen if trajectory finished" -OpenAI if terminal: logger.store(EpRet=ep_ret, EpLen=ep_len) # reset our training variables and the training environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # every save_freq epochs, # save the state of the environment # also save the current state of our value function model # and policy # these are automatically saved by the save_state function # since we have already called logger.setup_tf_saver if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # "Perform PPO update!" update() # "Log info about epoch" logger.log_tabular('Epoch', epoch) try: logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) except: pass logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
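# --- Illustrative sketch (added for clarity; not part of the original source). ---
# The comments above describe the PPO-clip surrogate that the `ratio`, `min_adv`,
# and `pi_loss` tensors implement. Here is the same objective written out in plain
# numpy for a single batch of samples.
import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_ratio=0.2):
    """Negative clipped surrogate objective (the quantity being minimized)."""
    ratio = np.exp(logp_new - logp_old)  # pi(a|s) / pi_old(a|s)
    # Positive advantages are capped at (1 + clip_ratio) * adv;
    # negative advantages are floored at (1 - clip_ratio) * adv.
    clipped_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return -np.mean(np.minimum(ratio * adv, clipped_adv))

# Example: once the ratio drifts past 1 + clip_ratio, pushing it further no longer
# improves the objective, which is exactly what the clipping enforces.
print(ppo_clip_loss(np.log([2.0, 0.5]), np.log([1.0, 1.0]), np.array([1.0, -1.0])))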
def ppo(env_fn, expert=None, policy_path=None, actor_critic=core.mlp_actor_critic_m, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=10000, dagger_epochs=500, pretrain_epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=1e-4, dagger_noise=0.01, batch_size=64, replay_size=int(5e3), vf_lr=1e-4, train_pi_iters=80, train_v_iters=80, lam=0.999, max_ep_len=500, target_kl=0.01, logger_kwargs=dict(), save_freq=10, test_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) policy_path (str): path of pretrained policy model train from scratch if None logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) test_logger_kwargs = dict() test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test") test_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] test_logger = EpochLogger(**test_logger_kwargs) test_logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space act_high_limit = env.action_space.high act_low_limit = env.action_space.low sess = tf.Session() if policy_path is None: # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) tfa_ph = core.placeholder(act_dim) # Main outputs from computation graph mu, pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) sess.run(tf.global_variables_initializer()) else: # load pretrained model # sess, x_ph, a_ph, mu, pi, logp, logp_pi, v = load_policy(policy_path, itr='last', deterministic=False, act_high=env.action_space.high) # # get_action_2 = lambda x : sess.run(mu, feed_dict={x_ph: x[None,:]})[0] # adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save')) x_ph, a_ph, adv_ph, ret_ph, logp_old_ph = model['x_ph'], model[ 'a_ph'], model['adv_ph'], model['ret_ph'], model['logp_old_ph'] mu, pi, logp, logp_pi, v = model['mu'], model['pi'], model[ 'logp'], model['logp_pi'], model['v'] # tfa_ph = core.placeholder(act_dim) tfa_ph = model['tfa_ph'] # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) print("---------------", local_steps_per_epoch) buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam) # print(obs_dim) # print(act_dim) dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim[0], act_dim=act_dim[0], size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives if policy_path is None: ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) dagger_pi_loss = tf.reduce_mean(tf.square(mu - tfa_ph)) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) optimizer_pi = tf.train.AdamOptimizer(learning_rate=pi_lr) optimizer_v = tf.train.AdamOptimizer(learning_rate=vf_lr) train_dagger_pi_op = dagger_pi_optimizer.minimize( dagger_pi_loss, name='train_dagger_pi_op') train_pi = optimizer_pi.minimize(pi_loss, name='train_pi_op') train_v = optimizer_v.minimize(v_loss, name='train_v_op') 
sess.run(tf.variables_initializer(optimizer_pi.variables())) sess.run(tf.variables_initializer(optimizer_v.variables())) sess.run(tf.variables_initializer(dagger_pi_optimizer.variables())) else: graph = tf.get_default_graph() dagger_pi_loss = model['dagger_pi_loss'] pi_loss = model['pi_loss'] v_loss = model['v_loss'] approx_ent = model['approx_ent'] approx_kl = model['approx_kl'] clipfrac = model['clipfrac'] train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op') train_pi = graph.get_operation_by_name('train_pi_op') train_v = graph.get_operation_by_name('train_v_op') # sess = tf.Session() # sess.run(tf.global_variables_initializer()) # Sync params across processes # sess.run(sync_all_params()) tf.summary.FileWriter("log/", sess.graph) # Setup model saving logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'adv_ph': adv_ph, 'ret_ph': ret_ph, 'logp_old_ph': logp_old_ph}, \ outputs={'mu': mu, 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi, 'clipfrac': clipfrac, 'approx_kl': approx_kl, \ 'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'approx_ent': approx_ent}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) def choose_action(s, add_noise=False): s = s[np.newaxis, :] a = sess.run(mu, {x_ph: s})[0] if add_noise: noise = dagger_noise * act_high_limit * np.random.normal( size=a.shape) a = a + noise return np.clip(a, act_low_limit, act_high_limit) def test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, info = env.step(choose_action(np.array(o), 0)) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store( arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() # time.sleep(10) if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) def ref_test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a = call_ref_controller(env, expert) o, r, d, info = env.step(a) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store( arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True 
test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) ref_test_agent(test_num=-1) test_logger.log_tabular('epoch', -1) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 test_policy_epochs = 91 episode_steps = 500 total_env_t = 0 test_num = 0 print(colorize("begin dagger training", 'green', bold=True)) for epoch in range(1, dagger_epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) obs, acs, rewards = [], [], [] for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) # a = get_action_2(np.array(o)) # save and log obs.append(o) ref_action = call_ref_controller(env, expert) if (epoch < pretrain_epochs): action = ref_action else: action = choose_action(np.array(o), True) buf.store(o, action, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(action) acs.append(ref_action) rewards.append(r) ep_ret += r ep_len += 1 total_env_t += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Perform dagger and partical PPO update! 
inputs = {k: v for k, v in zip(all_phs, buf.get())} # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update max_step = len(np.array(rewards)) dagger_replay_buffer.stores(obs, acs, rewards) for _ in range(int(local_steps_per_epoch / 10)): batch = dagger_replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']} q_step_ops = [dagger_pi_loss, train_dagger_pi_op] for j in range(10): outs = sess.run(q_step_ops, feed_dict) logger.store(LossPi=outs[0]) c_v_loss = sess.run(v_loss, feed_dict=inputs) logger.store(LossV=c_v_loss, KL=0, Entropy=0, ClipFrac=0, DeltaLossPi=0, DeltaLossV=0, StopIter=0) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Main loop: collect experience in env and update/log each epoch print(colorize("begin ppo training", 'green', bold=True)) for epoch in range(1, epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs) or epoch == 1: # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) # a = a[0] # a = get_action_2(np.array(o)) # a = np.clip(a, act_low_limit, act_high_limit) # if epoch < pretrain_epochs: # a = env.action_space.sample() # a = np.clip(a, act_low_limit, act_high_limit) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
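# --- Illustrative sketch (added for clarity; not part of the original source). ---
# The function above interleaves DAgger pretraining with PPO: it rolls out the current
# policy, relabels the visited states with the reference controller's actions,
# aggregates them, and regresses the policy onto that dataset. The outline below shows
# that aggregation step in isolation; all names here are hypothetical stand-ins for
# DaggerReplayBuffer / call_ref_controller.
import numpy as np

def dagger_iteration(env, policy, expert_policy, dataset_obs, dataset_act, fit_policy, horizon=500):
    """One DAgger iteration: collect states with the learner, label them with the expert."""
    o = env.reset()
    for _ in range(horizon):
        a = policy(o)                         # act with the *learner's* policy
        dataset_obs.append(o)
        dataset_act.append(expert_policy(o))  # but record the expert's action for this state
        o, _, done, _ = env.step(a)
        if done:
            o = env.reset()
    fit_policy(np.array(dataset_obs), np.array(dataset_act))  # supervised regression, cf. dagger_pi_loss
    return dataset_obs, dataset_act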
def __init__(self, args={}): self.bot = None if "bot" in args: self.bot = args["bot"] self.epoch = 0 self.step = 0 self.actor_critic = core.mlp_actor_critic self.ac_kwargs = dict(hidden_sizes=[64] * 2) self.seed = 0 self.steps_per_epoch = 10000 self.epochs = 10 self.gamma = 0.99 self.clip_ratio = 0.2 self.pi_lr = 3e-4 self.vf_lr = 1e-3 self.train_pi_iters = 80 self.train_v_iters = 80 self.lam = 0.97 self.max_ep_len = 1000 self.target_kl = 0.01 self.logger_kwargs = {} self.save_freq = 1 map_name = "unknown" if self.bot is not None: map_name = self.bot.map_name self.logger_kwargs = { "output_dir": f".\\{map_name}\\ai_data", "exp_name": "builder_ai" } self.logger = EpochLogger(**self.logger_kwargs) #self.logger.save_config(locals()) self.logger.save_config(self.__dict__) seed = self.seed seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) #env = env_fn() self.env = BuilderEnv(args={"bot": self.bot}) obs_dim = self.env.observation_space.shape act_dim = self.env.action_space.shape # Share information about action space with policy architecture self.ac_kwargs['action_space'] = self.env.action_space print(str(self.env.observation_space)) print(str(self.env.action_space)) print(str(type(self.env.observation_space))) print(str(type(self.env.action_space))) # Inputs to computation graph self.x_ph, self.a_ph = core.placeholders_from_spaces( self.env.observation_space, self.env.action_space) self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders( None, None, None) # Main outputs from computation graph self.pi, self.logp, self.logp_pi, self.v = self.actor_critic( self.x_ph, self.a_ph, **self.ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) self.all_phs = [ self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph ] # Every step, get: action, value, and logprob self.get_action_ops = [self.pi, self.v, self.logp_pi] # Experience buffer self.local_steps_per_epoch = int(self.steps_per_epoch / num_procs()) self.buf = ppo.PPOBuffer( obs_dim, act_dim, self.local_steps_per_epoch, self.gamma, self.lam ) # *2 is to create a lot of extra space in the buffer, hopefully?
# Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives self.ratio = tf.exp(self.logp - self.logp_old_ph) # pi(a|s) / pi_old(a|s) self.min_adv = tf.where(self.adv_ph > 0, (1 + self.clip_ratio) * self.adv_ph, (1 - self.clip_ratio) * self.adv_ph) self.pi_loss = -tf.reduce_mean( tf.minimum(self.ratio * self.adv_ph, self.min_adv)) self.v_loss = tf.reduce_mean((self.ret_ph - self.v)**2) # Info (useful to watch during learning) self.approx_kl = tf.reduce_mean( self.logp_old_ph - self.logp) # a sample estimate for KL-divergence, easy to compute self.approx_ent = tf.reduce_mean( -self.logp) # a sample estimate for entropy, also easy to compute self.clipped = tf.logical_or(self.ratio > (1 + self.clip_ratio), self.ratio < (1 - self.clip_ratio)) self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32)) print(f"pi_lr:{self.pi_lr}, pi_loss:{self.pi_loss}") # Optimizers self.train_pi = MpiAdamOptimizer(learning_rate=self.pi_lr).minimize( self.pi_loss) self.train_v = MpiAdamOptimizer(learning_rate=self.vf_lr).minimize( self.v_loss) self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) # Sync params across processes self.sess.run(sync_all_params()) # Setup model saving self.logger.setup_tf_saver(self.sess, inputs={'x': self.x_ph}, outputs={ 'pi': self.pi, 'v': self.v }) self.start_time = time.time() self.o, self.r, self.d, self.ep_ret, self.ep_len = self.env.reset( args={}), 0, False, 0, 0 print(f"o:{self.o}, type:{type(self.o)}") self.epoch = 0 self.t = 0 self.load()
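# --- Illustrative sketch (added for clarity; not part of the original source). ---
# The buffer above is constructed with (gamma, lam); the PPOBuffer code itself is not
# shown in this section. Below is the standard GAE-lambda computation that such a
# buffer typically performs in finish_path(); the function name is hypothetical.
import numpy as np

def gae_advantages(rewards, values, last_val, gamma=0.99, lam=0.97):
    """rewards, values: arrays of length T for one trajectory;
    last_val: bootstrap value for the state after the final step."""
    values = np.append(values, last_val)
    deltas = rewards + gamma * values[1:] - values[:-1]  # one-step TD residuals
    adv = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running      # discounted sum of residuals
        adv[t] = running
    returns = adv + values[:-1]                          # targets for the value-function loss
    return adv, returns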