def main(args): env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) D = Discriminator(env) expert_observations = np.genfromtxt('trajectory/observations.csv') expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 # do NOT use rewards to update policy success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] rewards = [] v_preds = [] run_policy_steps = 0 while True: run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) rewards.append(reward) v_preds.append(v_pred) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) # train discriminator for i in range(2): D.train(expert_s=expert_observations, expert_a=expert_actions, agent_s=observations, agent_a=actions) # output of this discriminator is reward d_rewards = D.get_rewards(agent_s=observations, agent_a=actions) d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32) gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = np.array(gaes).astype(dtype=np.float32) # gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # train policy inp = [observations, actions, gaes, d_rewards, v_preds_next] PPO.assign_policy_parameters() for epoch in range(6): sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
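# The GAIL loop above uses D.get_rewards(agent_s=..., agent_a=...) as the surrogate reward, but the
# Discriminator class itself is not shown in this file. The function below is only a minimal sketch of
# the usual idea (score a (state, action) pair by how expert-like the discriminator believes it is);
# `d_agent` is an assumed array of D(s, a) probabilities in (0, 1), not the repository's actual output.
import numpy as np

def gail_reward_sketch(d_agent, eps=1e-10):
    # larger reward when the discriminator assigns the agent pair a high "expert" probability
    return np.log(np.clip(d_agent, eps, 1.0))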
def main(args): #init directories if not os.path.isdir(args.logdir): os.mkdir(args.logdir) if not os.path.isdir(args.logdir + '/' + args.env): os.mkdir(args.logdir + '/' + args.env) if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer) if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr)): os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr)) args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr) if not os.path.isdir(args.savedir): os.mkdir(args.savedir) if not os.path.isdir(args.savedir + '/' + args.env): os.mkdir(args.savedir + '/' + args.env) if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer) args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer #init classes env = gym.make(args.env) env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env, args.env) Old_Policy = Policy_net('old_policy', env, args.env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer, _lr=args.lr) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [0] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]) , iteration) writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) , iteration) if iteration == (args.iteration-1): saver.save(sess, args.savedir+'/model'+str(args.lr)+'.ckpt') print('Clear!! Model saved.')
break gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] print('iteration:', iteration, ',rewards:', sum(rewards)) # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
def main(args): #init directories if not os.path.isdir(args.logdir): os.mkdir(args.logdir) if not os.path.isdir(args.logdir + '/' + args.env): os.mkdir(args.logdir + '/' + args.env) if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer) args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer if not os.path.isdir(args.savedir): os.mkdir(args.savedir) if not os.path.isdir(args.savedir + '/' + args.env): os.mkdir(args.savedir + '/' + args.env) if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer) args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer #init classes env = gym.make(args.env) env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env, args.env) Old_Policy = Policy_net('old_policy', env, args.env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer, _lr=args.lr) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if iteration == (args.iteration - 1): saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.') break gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] print('iteration:', iteration, ',rewards:', sum(rewards)) # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
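# Every variant in this file relies on PPO.get_gaes(rewards, v_preds, v_preds_next) without defining it
# here. The helper below is a minimal sketch of a generalized advantage estimator of that shape, assuming
# discount gamma and GAE parameter lam; the actual PPOTrain class may use different defaults or reduce to
# plain discounted advantages (lam = 1.0).
def get_gaes_sketch(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # one-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = [r + gamma * v_next - v for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    # accumulate discounted residuals backwards in time
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes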
def main(args): print(date) energyPolicy_training_data.append("Energy poilcy training") energyPolicy_training_data.append( "Date: " + str(date)) energyPolicy_training_data.append( "Noise type: " + str(args.noise_type)) energyPolicy_training_data.append( "Policy Training max episodes: " + str(args.iteration)) energyPolicy_training_data.append( "Number of iterations the energy model have ben trained: " + str(args.model)) energyPolicy_training_data.append( "PPO gamma: " + str(args.gamma)) energyPolicy_training_data.append( "Do we add noise to sapair for calculating energy " + str(args.sanoise)) energyPolicy_training_data.append( "The noise we add to sapair " + str(args.noise_sigma)) energyPolicy_training_data.append( "h(energy) " + str(args.reward_function)) energyPolicy_training_data.append(" \n\n") env = gym.make('CartPole-v0') Energy = Energy_net('energy', 'CartPole-v0') energy_saver = tf.train.Saver() sapairs = np.genfromtxt('training_data/sapairs.csv') noise_sapairs = np.genfromtxt('training_data/noise_sapairs.csv') with tf.Session() as sess: # writer = tf.summary.FileWriter(args.logdir+'/'+args.alg, sess.graph) sess.run(tf.global_variables_initializer()) if args.model == '': energy_saver.restore( sess, args.modeldir + '/' + args.alg + '/' + args.noise_type + '/' + 'model.ckpt') else: energy_saver.restore( sess, args.modeldir + '/' + args.alg + '/' + args.noise_type + '/' + 'model.ckpt-' + args.model) print("As for model after ", args.model, "training iterations") print("Energy for expert sapairs looks like:", Energy.get_energy(sapairs)) print( "Energy for noise sapairs (not corresponding to the noise trained for Energy) looks like:", Energy.get_energy(noise_sapairs)) energyPolicy_training_data.append( ["As for model after ", args.model, "training iterations"]) energyPolicy_training_data.append( "Energy for expert sapairs looks like:\n" + str(Energy.get_energy(sapairs))) energyPolicy_training_data.append( "Energy for noise sapairs (not corresponding to the noise trained for Energy) looks like:\n" + str(Energy.get_energy(noise_sapairs))) energyPolicy_training_data.append(" \n\n\n\n\n\n\n\n\n") energyPolicy_training_data.append( "Done with reloading Energy. Start RL") # writer.close() open_file_and_save( args.logdir + '/' + args.model + "_iter_" + args.noise_type + '_Policy' + date, energyPolicy_training_data) print("Done with reloading Energy. 
Start RL") # Start RL env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) saver = tf.train.Saver() # writer = tf.summary.FileWriter(args.logdir+'/'+args.noise_type, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 alter_reward = 0 success_num = 0 render = False #ep_reward = [] # 用于记录每个trajectory的数据最后做总结 Summary_after_max_episodes_training = [] Trajectory_rewards = [] Trajectory_alter_rewards = [] Trajectory_success_num = 0 # 与success_num一样,只不过这个不会清零,这个用于评估这个energy对于训练的效果 plot_rewards = [] plot_alter_rewards = [] plot_iteration = [] for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] alter_rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) alter_rewards.append(alter_reward) rewards.append(reward) next_obs, reward, done, info = env.step(act) # alter reward sapair = np.append(obs, np.array([[act]]), axis=1) # print("sapair:",sapair) energy = Energy.get_energy(sapair)[0][0] print("Energy for this sapair", energy) if args.sanoise == True: # 定义 gauss noise 的均值和方差 mu, sigma = 0, args.noise_sigma # 一维guass # saNumber = sapairs.shape[0] saShape = sapair.shape[1] # sampleNo = saNumber * saShape # 采样sampleNo个gauss noise noise = np.random.normal(mu, sigma, saShape) noise_sapair = sapair + noise print("noise_sapair:", noise_sapair) # noise_sapairs = np.reshape(noise_sapairs, newshape=[saNumber, saShape]) noise_energy = Energy.get_energy(noise_sapair)[0][0] print("Noise Energy for this sapair", noise_energy) energy = noise_energy if args.reward_function == "-energy": alter_reward = -energy elif args.reward_function == "-energy+1": alter_reward = -energy + 1 elif args.reward_function == "exp(-energy-1)": alter_reward = np.exp(-energy - 1) elif args.reward_function == "exp(-energy)": alter_reward = np.exp(-energy) else: print("No such reward_function") #alter_reward = np.exp(-energy-1) #alter_reward = -energy+1 #alter_reward = reward #alter_reward = -energy # if render: # env.render() # pass if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() reward = -1 alter_reward = -1 break else: obs = next_obs # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]), iteration) # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration) # if sum(rewards) >= 195: # success_num += 1 # Trajectory_success_num +=1 # render = True # if success_num >= 100: # saver.save(sess, args.savedir + '/model.ckpt') # print('Clear!! 
Model saved.') # break # else: # success_num = 0 sum_rewards = sum(rewards) sum_alter_rewards = sum(alter_rewards) Trajectory_rewards.append(sum_rewards) Trajectory_alter_rewards.append(sum_alter_rewards) #画图 plot_rewards.append(sum_rewards) plot_alter_rewards.append(sum_alter_rewards) plot_iteration.append(iteration) #ep_reward.append(sum(rewards)) # print("Sample done in one traj.") energyPolicy_training_data_for_this_episode = [] energyPolicy_training_data_for_this_episode.append(" ") energyPolicy_training_data_for_this_episode.append( "Trajectory: " + str(iteration)) energyPolicy_training_data_for_this_episode.append( "episode_len: " + str(episode_length)) energyPolicy_training_data_for_this_episode.append( "True rewards: " + str(sum_rewards)) energyPolicy_training_data_for_this_episode.append( "alter_rewards: " + str(sum_alter_rewards)) open_file_and_save( args.logdir + '/' + args.model + "_iter_" + args.noise_type + '_Policy' + date, energyPolicy_training_data_for_this_episode) print() print("Trajectory", iteration, ":") print("episode_len: ", episode_length) print("rewards: ", sum(rewards)) print("alter_rewards: ", sum(alter_rewards)) # gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = PPO.get_gaes(rewards=alter_rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) alter_rewards = np.array(alter_rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, alter_rewards, v_preds_next] # inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) # writer.add_summary(summary, iteration) # writer.close() #开始画图 plt.title('Noise:' + str(args.sanoise)) plt.plot(plot_iteration, plot_rewards, color='red', label='True_rewards') plt.plot(plot_iteration, plot_alter_rewards, color='green', label='alter_rewards') plt.legend() #显示图例 plt.xlabel('Episodes') plt.ylabel('Rewards') plt.show()
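# The episode loop above maps args.reward_function to one of four transforms of the energy value with an
# if/elif chain. The table-driven helper below restates the same four options, purely as an illustration
# of that mapping (it is not code from the original script), and fails loudly on an unknown name.
import numpy as np

_REWARD_FUNCTIONS = {
    '-energy': lambda e: -e,
    '-energy+1': lambda e: -e + 1,
    'exp(-energy-1)': lambda e: np.exp(-e - 1),
    'exp(-energy)': lambda e: np.exp(-e),
}

def energy_to_reward(energy, name):
    # same four reward shapes as the if/elif chain in the loop above
    if name not in _REWARD_FUNCTIONS:
        raise ValueError('No such reward_function: ' + name)
    return _REWARD_FUNCTIONS[name](energy)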
def main(args): # init directories if not os.path.isdir(args.logdir): os.mkdir(args.logdir) if not os.path.isdir(args.logdir + '/' + args.env): os.mkdir(args.logdir + '/' + args.env) if not os.path.isdir(args.logdir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.logdir + '/' + args.env + '/' + args.optimizer) args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer if not os.path.isdir(args.savedir): os.mkdir(args.savedir) if not os.path.isdir(args.savedir + '/' + args.env): os.mkdir(args.savedir + '/' + args.env) if not os.path.isdir(args.savedir + '/' + args.env + '/' + args.optimizer): os.mkdir(args.savedir + '/' + args.env + '/' + args.optimizer) args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer args.tradir = args.tradir + '/' + args.env + '/' + args.optimizer # init classes env = gym.make(args.env) env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env, args.env) Old_Policy = Policy_net('old_policy', env, args.env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer) D = Discriminator(env, args.env, _optimizer=args.optimizer) expert_observations = np.genfromtxt(args.tradir + '/observations.csv') expert_actions = np.genfromtxt(args.tradir + '/actions.csv', dtype=np.int32) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 # do NOT use rewards to update policy success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] rewards = [] v_preds = [] run_policy_steps = 0 while True: run_policy_steps += 1 obs = np.stack([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) rewards.append(reward) v_preds.append(v_pred) next_obs, reward, done, info = env.step(act) if done: v_preds_next = v_preds[1:] + [0] # next state of terminate state has 0 state value obs = env.reset() reward = -1 break else: obs = next_obs writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]) , iteration) writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) , iteration) print('iteration:', iteration, ',rewards:', sum(rewards)) if iteration == (args.iteration - 1): saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.')
break # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) # train discriminator for i in range(2): D.train(expert_s=expert_observations, expert_a=expert_actions, agent_s=observations, agent_a=actions) # output of this discriminator is reward d_rewards = D.get_rewards(agent_s=observations, agent_a=actions) d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32) gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = np.array(gaes).astype(dtype=np.float32) # gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # train policy inp = [observations, actions, gaes, d_rewards, v_preds_next] PPO.assign_policy_parameters() for epoch in range(6): sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) # indices are in [low, high) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
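# Several variants above read expert demonstrations from observations.csv and actions.csv via
# np.genfromtxt, but the script that produced those files is not included here. The sketch below shows
# one plausible way to dump such files from an already-trained policy; `policy_fn` is a hypothetical
# callable returning an integer action and is not part of the original code.
import gym
import numpy as np

def dump_expert_trajectories(policy_fn, env_name='CartPole-v0', episodes=10,
                             obs_path='trajectory/observations.csv',
                             act_path='trajectory/actions.csv'):
    env = gym.make(env_name)
    observations, actions = [], []
    for _ in range(episodes):
        obs, done = env.reset(), False
        while not done:
            act = policy_fn(obs)  # hypothetical expert policy
            observations.append(obs)
            actions.append(act)
            obs, _, done, _ = env.step(act)
    np.savetxt(obs_path, np.array(observations))
    np.savetxt(act_path, np.array(actions, dtype=np.int32), fmt='%d')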
def main(args): # env = gym.make('CartPole-v0') # env.seed(0) env = CustomEnv() ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=gamma) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(log_path, sess.graph) sess.run(tf.global_variables_initializer()) obs, acs, target_video = env.reset() success_num = 0 for iteration in range(iterations): observations = [] actions = [] pred_actions = [] rewards = [] v_preds = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.array([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs acs = np.array([acs]).astype(dtype=np.float32) pred_act, v_pred = Policy.act(obs=obs, acs=acs, stochastic=True) # act = np.asscalar(act) v_pred = np.asscalar(v_pred) next_obs, reward, done, info = env.step(acs) observations.append(obs) actions.append(acs) pred_actions.append(pred_act) rewards.append(reward) v_preds.append(v_pred) if done: next_obs = np.stack([next_obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs _, v_pred = Policy.act(obs=next_obs, stochastic=True) v_preds_next = v_preds[1:] + [np.asscalar(v_pred)] obs, acs, target_video = env.reset() break else: obs = next_obs writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]) , iteration) writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) , iteration) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, weight_path + '/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder # observations = np.reshape(observations, newshape=[-1,] + list(ob_space.shape)) observations = np.array(observations).astype(dtype=np.float32) actions = np.array(actions).astype(dtype=np.float32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
def main(args): scene_scope = 'bathroom_02' task_scope = 26 #26 43 53 32 41 env = Environment({ 'scene_name': scene_scope, 'terminal_state_id': int(task_scope) }) S_Class = SIAMESE() #Creating a siamese class object Policy = Policy_net( 'policy', S_Class) #building the actor-critic graph / object Old_Policy = Policy_net('old_policy', S_Class) #same thing as the other PPO PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) #gradient updater object or the graph D = Discriminator(S_Class) #discriminator of the GAN kind of thing ''' batch_n=tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese') ''' #Loading expert data: states/targets etc. expert_observations = np.genfromtxt( 'trajectory/observations.csv') #load expert demonstrations expert_targets = np.genfromtxt('trajectory/targets.csv') expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32) expert_observations = np.reshape(expert_observations, newshape=[-1, 2048, 4]) expert_targets = np.reshape(expert_targets, newshape=[-1, 2048, 4]) saver = tf.train.Saver( ) #Assign another saver if you want to use BC weights if args.restore: #We need a separate saver only for assigning parameters from the BC-trained model saver2 = tf.train.Saver([ tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy'), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese') ]) with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run( tf.global_variables_initializer() ) #here the variables of both the old and new policy nets get initialized if args.restore: if args.model == '': saver2.restore( sess, args.modeldir + '/' + args.alg + '/' + 'shamane.ckpt') print("Model Restored") else: saver.restore( sess, args.modeldir + '/' + args.alg + '/' + 'model.ckpt-' + args.model) success_num = 0 #This is used to check whether the agent reached the terminal point #var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) for iteration in range( 100000): #args.iteration):#Here starts the adversarial training print( "Starting ........ 
The Iteration---------------------------------------------------- :", iteration) observations = [] actions = [] #rewards = [] targets = [] #for the gail v_preds = [] run_policy_steps = 0 while ( True ): #Here what is happenning is , this again samples trajectories from untrain agent run_policy_steps += 1 obs = np.stack([env.s_t]).astype( dtype=np.float32 ) # prepare to feed placeholder Policy.obs #Initial observation target = np.stack([env.s_target]).astype( dtype=np.float32 ) #This is to make sure that input is [batch_size,2048,4] act, v_pred, prob = Policy.act( state=obs, target=target, stochastic=True) # Agents action and values act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) #save the set of observations targets.append(target) actions.append(act) #save the set of actions v_preds.append(v_pred) #next_obs, reward, done, info = env.step(act) #get the next observation and reward acording to the observation next_obs, is_terminal, is_collided = env.step(act) if is_terminal: success_num = success_num + 1 print( "Congratz yoy just reach the terminal state which is:", env.terminal_state_id) if is_collided: print( "Bad Luck your agent just collided couldn't made it to the terminal state which is :", env.terminal_state_id) if (is_terminal or is_collided or (run_policy_steps == 100)): #run one episode till the termination print("Number Of Exploration by the AGENT:", run_policy_steps) v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value #this list use to update the parameters of the calue net print( "Environment is resetting after the collition/Terminal" ) obs = env.reset() #reward = -1 break #with tihs vreak all obsercation ,action and other lists get empty #print(sum(rewards)) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps) ]), iteration) #writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) #, iteration) if success_num >= 5000: saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.') break #else: #success_num = 0 # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1, 2048, 4]) #collect observations targets = np.reshape(targets, newshape=[-1, 2048, 4]) actions = np.array(actions).astype( dtype=np.int32) #collect the actions # train discriminator #Here comes the Discriminator !! 
Dis_input = [ expert_observations, expert_targets, expert_actions, observations, targets, actions ] observations.shape[0] expert_observations.shape[0] if observations.shape[0] < expert_observations.shape[0]: High = observations.shape[0] else: High = expert_observations.shape[0] for i in range(100): sample_indices = np.random.randint(low=0, high=High, size=32) sampled_inp_D = [ np.take(a=a, indices=sample_indices, axis=0) for a in Dis_input ] D.train(expert_s=sampled_inp_D[0], expert_t=sampled_inp_D[1], expert_a=sampled_inp_D[2], agent_s=sampled_inp_D[3], agent_t=sampled_inp_D[4], agent_a=sampled_inp_D[5]) ''' D.train(expert_s=expert_observations, expert_t=expert_targets, expert_a=expert_actions, agent_s=observations, agent_t=targets, agent_a=actions) ''' #To get rewards we can use a RNN , then we can get the each time unit output to collect the reward function d_rewards = D.get_rewards( agent_s=observations, agent_t=targets, agent_a=actions ) #how well our agent performed with respect to the expert d_rewards = np.reshape(d_rewards, newshape=[-1]).astype( dtype=np.float32) #rewards for each action pair gaes = PPO.get_gaes( rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next ) #this to calcuate the advantage function in PPO gaes = np.array(gaes).astype(dtype=np.float32) # gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype( dtype=np.float32) #This is the next value function #train policy inp = [ observations, targets, actions, gaes, d_rewards, v_preds_next ] PPO.assign_policy_parameters( ) #Assigning policy params means assigning the weights to the default policy nets for epoch in range( 100 ): #This is to train the Agent (Actor Critic ) from the obtaiend agent performances and already trained discriminator sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32) # indices are in [low, high) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # Here trainign the policy network PPO.train(state=sampled_inp[0], targets=sampled_inp[1], actions=sampled_inp[2], gaes=sampled_inp[3], rewards=sampled_inp[4], v_preds_next=sampled_inp[5]) summary = PPO.get_summary(obs=inp[0], target=inp[1], actions=inp[2], gaes=inp[3], rewards=inp[4], v_preds_next=inp[5]) writer.add_summary(summary, iteration) writer.close()
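# Every training loop in this file repeats the same np.random.randint + np.take pattern to draw a
# 32-sample minibatch from parallel arrays. The helper below expresses that pattern once; it is only an
# illustrative refactor, and indices are drawn from [0, high) exactly as in the loops above.
import numpy as np

def sample_minibatch(arrays, batch_size=32):
    # all arrays are assumed to share the same first dimension
    high = arrays[0].shape[0]
    indices = np.random.randint(low=0, high=high, size=batch_size)
    return [np.take(a=a, indices=indices, axis=0) for a in arrays]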
def main(args): # prepare log dir if not os.path.exists(args.logdir): os.makedirs(args.logdir) if not os.path.exists(args.savedir): os.makedirs(args.savedir) # gym環境作成 env = gym.make("CartPole-v0") env.seed(0) ob_space = env.observation_space # policy net Policy = Policy_net("policy", env) Old_Policy = Policy_net("old_policy", env) # ppo学習インスタンス PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) # tensorflow saver saver = tf.train.Saver() # session config config = tf.ConfigProto( gpu_options=tf.GPUOptions(visible_device_list=args.gpu_num, allow_growth=True) ) # start session with tf.Session(config=config) as sess: # summary writer writer = tf.summary.FileWriter(args.logdir, sess.graph) # Sessionの初期化 sess.run(tf.global_variables_initializer()) # 状態の初期化 obs = env.reset() # episodeの成功回数 success_num = 0 # episode loop for iteration in tqdm(range(args.iteration)): # episodeのtrajectory配列 # buffer observations = [] actions = [] v_preds = [] rewards = [] # episodeのstep回数 episode_length = 0 # run episode while True: episode_length += 1 # プレースホルダー用に変換 obs = np.stack([obs]).astype(dtype=np.float32) # 行動と状態価値を推定 act, v_pred = Policy.act(obs=obs, stochastic=True) # 要素数が1の配列をスカラーに変換 act = np.asscalar(act) v_pred = np.asscalar(v_pred) # policyによる行動で状態を更新 next_obs, reward, done, info = env.step(act) # episodeの各変数を追加 # (s_t, a_t, v_t, r_t) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) # episode終了判定 # episodeが終了していたら次のepisodeを開始 if done: # v_t+1の配列 v_preds_next = v_preds[1:] + [0] obs = env.reset() reward = -1 break else: obs = next_obs # summary追加 writer.add_summary( tf.Summary( value=[ tf.Summary.Value( tag="episode_length", simple_value=episode_length ) ] ), iteration, ) writer.add_summary( tf.Summary( value=[ tf.Summary.Value( tag="episode_reward", simple_value=sum(rewards) ) ] ), iteration, ) # episode成功判定 if sum(rewards) >= 195: success_num += 1 # 連続で100回成功していればepisode loopを終了 if success_num >= 100: saver.save(sess, args.savedir + "/model.ckpt") print("Clear!! Model saved.") break else: success_num = 0 # policy netによるtrajectryをプレースホルダー用に変換 observations = np.reshape( observations, newshape=[-1] + list(ob_space.shape) ) actions = np.array(actions).astype(dtype=np.int32) # rewardsをプレースホルダー用に変換 rewards = np.array(rewards).astype(dtype=np.float32) # gaesの取得 gaes = PPO.get_gaes( rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next ) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # エージェントのexperience inp = [observations, actions, gaes, rewards, v_preds_next] # Old_Policyにパラメータを代入 PPO.assign_policy_parameters() # PPOの学習 for epoch in range(6): # 学習データサンプル用のインデックスを取得 sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32 ) # PPO学習データをサンプル sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] PPO.train( obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4], ) # summaryの取得 summary = PPO.get_summary( obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4], ) writer.add_summary(summary, iteration) writer.close()
def main(args): writer = SummaryWriter(args.logdir) logger = ResultLogger(writer) env = Environment() # 自定义环境 ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, args=args, logger=logger) saver = tf.train.Saver() if args.continue_train: tf.reset_default_graph() tf.train.import_meta_graph(args.continue_meta) with tf.Session() as sess: if args.continue_train: saver.restore(sess, args.continue_modeldir) sess.run(tf.global_variables_initializer()) reward = 0 winnum = 0 drawnum = 0 for episode in range(args.episode): observations = [] actions = [] v_preds = [] rewards = [] run_policy_steps = 0 total_reward = 0 obs = env.reset() while True: # run policy RUN_POLICY_STEPS which is much less than episode length run_policy_steps += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, sparse_rew, done, info = env.step(act) if reward < -1000: reward = -10 reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0, run_policy_steps) # if episode==1: # print(reward) obs = next_obs if done: total_reward = sum(rewards) total_reward /= run_policy_steps total_reward += reward v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value reward = -1 if info == 3: winnum += 1 if info == 2: drawnum += 1 break if episode % 100 == 0: winnum = 0 drawnum = 0 logger.log_result(total_reward, winnum, drawnum, episode) print(episode, total_reward) if episode % 1000 == 0: saver.save(sess, args.savedir + '/model.ckpt') #### ## GAE #### gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # 把list 转成 适应于tf.placeholder 的numpy array observations = np.reshape(observations, newshape=(-1, ob_space)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) logger.log_gaes(gaes.mean(), episode) PPO.log_parameter(observations, actions, gaes, rewards, v_preds_next) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(2): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4])
def main(args): #env.seed(0) env = gym.make('MineRLNavigateDense-v0') ob_space = env.observation_space action_space = env.action_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() reward = 0 success_num = 0 render = False for iteration in range(args.iteration): observations = [] actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs['pov']]).astype(dtype=np.float32) act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs['pov']) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step( [int(act / 3) + 1, act - int(act / 3) * 3]) if (episode_length % 2500 == 0): print(sum(rewards)) if render: env.render() if done: v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value obs = env.reset() print('done') break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if sum(rewards) >= 1: success_num += 1 render = True if success_num >= 10: saver.save(sess, args.savedir + '/model.ckpt') print('Clear!! Model saved.') break gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1, 64, 64, 3]) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
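# The env.step call above rebuilds a two-component MineRL action from a flat index with int(act / 3) + 1
# and act - int(act / 3) * 3. The helper below is the same decode written with divmod; the names of the
# two components are an assumption about this wrapper, not taken from the original code.
def decode_flat_action(act):
    first_bin, second_bin = divmod(act, 3)  # act // 3 and act % 3
    return [first_bin + 1, second_bin]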
def main(args): env = gym.make('CartPole-v0') BCPolicy = Policy_net('bcpolicy', env) BC = BehavioralCloning(BCPolicy) Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) saver = tf.train.Saver(max_to_keep=args.max_to_keep) # instantiate a Saver object; during training, call saver.save periodically to write checkpoint files containing all trainable variables of the current model, e.g. saver.save(sess, FLAGS.train_dir, global_step=step) exp_obs = np.genfromtxt('trajectory/observations.csv')[0:exp_len] #exp_len=200 exp_acts = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)[0:exp_len] with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) # specify a file to save the graph, e.g. tf.summary.FileWriter(path, sess.graph); its add_summary() method can then be called to save training data to the file specified by the writer sess.run(tf.global_variables_initializer()) inp = [exp_obs, exp_acts] # inp[0] is the observations, inp[1] is the actions for iteration in range(args.iteration): # episode # train for epoch in range(args.epoch_num): # select sample indices in [low, high) sample_indices = np.random.randint(low=0, high=exp_obs.shape[0], size=args.minibatch_size) # returns random integers in the range [low, high), i.e. low inclusive, high exclusive sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data BC.train(obs=sampled_inp[0], actions=sampled_inp[1]) bc_summary = BC.get_summary(obs=inp[0], actions=inp[1]) if (iteration+1) % args.interval == 0: saver.save(sess, args.savedir + '/model.ckpt', global_step=iteration+1) writer.add_summary(bc_summary, iteration) print("Done with BC. Start RL") # Start RL obs = env.reset() ob_space = env.observation_space reward = 0 alter_reward = 0 success_num = 0 render = False ep_reward=[] for iteration in range(5*args.iteration): print("iter:{}".format(iteration)) observations = [] actions = [] v_preds = [] rewards = [] alter_rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) alter_rewards.append(alter_reward) rewards.append(reward) next_obs, reward, done, info = env.step(act) alter_reward = np.log(1/(kl_divergence(obs, BCPolicy, Policy)+0.00001)) #alter_reward = -kl_divergence(obs, BCPolicy, Policy) #alter_reward = kl_divergence(obs, BCPolicy, Policy) #print(alter_reward) if render: #env.render() pass if done: v_preds_next = v_preds[1:] + [0] # next state of terminate state has 0 state value obs = env.reset() reward = -1 alter_reward = -1 print("episode_len: ",episode_length) break else: obs = next_obs writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]) , iteration) writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) , iteration) if sum(rewards) >= 195: success_num += 1 render = True if success_num >= 100: saver.save(sess, args.savedir+'/model.ckpt') print('Clear!! Model saved.')
break else: success_num = 0 ep_reward.append(sum(rewards)) print("rewards: ",sum(rewards)) print("alter_rewards: ",sum(alter_rewards)) print("Sample done in one traj.") gaes = PPO.get_gaes(rewards=alter_rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) alter_rewards = np.array(alter_rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, alter_rewards, v_preds_next] print("Begin Training") # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) """ summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) """ #writer.add_summary(summary, iteration) writer.close() plt.plot(ep_reward)
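# The alternative reward above is built from kl_divergence(obs, BCPolicy, Policy), which is not defined
# in this file. Assuming both policy networks can expose their action probabilities for a given
# observation, a categorical KL between them could be computed as below; get_action_prob is a
# hypothetical accessor, not the repository's API.
import numpy as np

def kl_divergence_sketch(obs, policy_p, policy_q, eps=1e-10):
    p = np.clip(policy_p.get_action_prob(obs), eps, 1.0)  # hypothetical accessor
    q = np.clip(policy_q.get_action_prob(obs), eps, 1.0)
    return float(np.sum(p * np.log(p / q)))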
def main(args): env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space #This is the environment for the gym to observe Policy = Policy_net( 'policy', env) #take the environments #this is normal policy class Old_Policy = Policy_net('old_policy', env) #this is for the old policy PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) #this is for training saver = tf.train.Saver() with tf.Session() as sess: writer = tf.summary.FileWriter(args.logdir, sess.graph) sess.run(tf.global_variables_initializer() ) #Here all the variabls get initialized obs = env.reset( ) # [position of cart, velocity of cart, angle of pole, rotation rate of pole] Initial observation reward = 0 success_num = 0 for iteration in range(args.iteration): observations = [] #to store observations actions = [] v_preds = [] rewards = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length #Starting to run the episode_length += 1 #episode length is something dynamic obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act( obs=obs, stochastic=True ) #get the action and value prediction (actor and critic network output) act = np.asscalar(act) v_pred = np.asscalar(v_pred) observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) next_obs, reward, done, info = env.step( act) #get the observation from the environments #The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center. That is done if done: #This is a termination stage #this has all the next state eliements of the episode inputs v_preds_next = v_preds[1:] + [ 0 ] # next state of terminate state has 0 state value #after the terminal stage there shouldn;t be a value function obs = env.reset() reward = -1 break else: #here your break the episode obs = next_obs #if the system do not get terminated it will run for ever #After a one episode get terminated writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]) #From this we can learn how long the episode went , iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]) # , iteration) if sum(rewards) >= 195: success_num += 1 if success_num >= 100: saver.save(sess, args.savedir + '/model.ckpt') print( 'Clear!! Model saved.' 
) #this is like after this much sucessfull attempts we are confident about the model break else: success_num = 0 gaes = PPO.get_gaes( rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) #this is the advantage function # convert list to numpy array for feeding tf.placeholder observations = np.reshape( observations, newshape=[-1] + list(ob_space.shape)) #observations from the current policy actions = np.array(actions).astype( dtype=np.int32) #actions taken from current policy gaes = np.array(gaes).astype( dtype=np.float32) #generalized advantage enstimation gaes = (gaes - gaes.mean()) / gaes.std() #Normalize it rewards = np.array(rewards).astype( dtype=np.float32) #Extracted rewrds v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters( ) #before updating the new policy we assign current policy parameters to old policy inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): #starting the optimization # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data Randomly take one sample from the training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
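# PPO.train above hides the actual objective. The function below is only a sketch of the clipped
# surrogate loss at the heart of PPO, assuming `ratios` is pi_new(a|s) / pi_old(a|s) and `gaes` the
# advantage estimates, with a common clip value of 0.2; the real PPOTrain graph (value loss, entropy
# bonus, optimizer choice) is not reproduced here.
import tensorflow as tf

def clipped_surrogate_loss_sketch(ratios, gaes, clip_value=0.2):
    clipped_ratios = tf.clip_by_value(ratios, 1.0 - clip_value, 1.0 + clip_value)
    # take the elementwise minimum so the update is pessimistic, then maximize it
    surrogate = tf.minimum(ratios * gaes, clipped_ratios * gaes)
    return -tf.reduce_mean(surrogate)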
def main(args): # prepare log dir if not os.path.exists(args.logdir): os.makedirs(args.logdir) if not os.path.exists(args.savedir): os.makedirs(args.savedir) # create the gym environment env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space # policy net Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) # PPO training instance PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) # discriminator D = Discriminator(env) # load expert trajectory expert_observations = np.genfromtxt('trajectory/observations.csv') expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32) # tensorflow saver saver = tf.train.Saver() # session config config = tf.ConfigProto( gpu_options=tf.GPUOptions( visible_device_list=args.gpu_num, allow_growth=True )) # start session with tf.Session(config=config) as sess: # summary writer writer = tf.summary.FileWriter(args.logdir, sess.graph) # initialize the session sess.run(tf.global_variables_initializer()) # initialize the state obs = env.reset() success_num = 0 # episode loop for iteration in tqdm(range(args.iteration)): # buffer observations = [] actions = [] rewards = [] v_preds = [] run_policy_steps = 0 # run episode while True: run_policy_steps += 1 # convert obs for network input obs = np.stack([obs]).astype(dtype=np.float32) # estimate action and value act, v_pred = Policy.act(obs=obs, stochastic=True) # convert single-element arrays to scalars act = np.asscalar(act) v_pred = np.asscalar(v_pred) # step the environment with the action estimated by the policy net next_obs, reward, done, info = env.step(act) # append the per-step episode variables observations.append(obs) actions.append(act) v_preds.append(v_pred) rewards.append(reward) # check whether the episode is done if done: v_preds_next = v_preds[1:] + [0] obs = env.reset() reward = -1 break else: obs = next_obs # add summaries writer.add_summary( tf.Summary(value=[tf.Summary.Value( tag='episode_length', simple_value=run_policy_steps)]), iteration) writer.add_summary( tf.Summary(value=[tf.Summary.Value( tag='episode_reward', simple_value=sum(rewards))]), iteration) # check episode success if sum(rewards) >= 195: success_num += 1 # end the episode loop after 100 consecutive successes if success_num >= 100: saver.save(sess, args.savedir+'/model.ckpt') print('Clear!! Model saved.')
break else: success_num = 0 # convert the policy net trajectory for the placeholders observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) actions = np.array(actions).astype(dtype=np.int32) ########################### # this block is the only GAIL-specific change # use the discriminator to push rewards toward the expert's # train the discriminator twice for i in range(2): D.train(expert_s=expert_observations, expert_a=expert_actions, agent_s=observations, agent_a=actions) # get d_rewards from discriminator d_rewards = D.get_rewards(agent_s=observations, agent_a=actions) # transform d_rewards to numpy for placeholder d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32) ########################### # get generalized advantage estimator gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next) gaes = np.array(gaes).astype(dtype=np.float32) # gaes = (gaes - gaes.mean()) / gaes.std() v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) # ppo input data whose rewards are the discriminator rewards inp = [observations, actions, gaes, d_rewards, v_preds_next] # assign parameters to old policy PPO.assign_policy_parameters() # train PPO for epoch in range(6): # sample index sample_indices = np.random.randint( low=0, high=observations.shape[0], size=32) # sampling from input data sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # run ppo PPO.train( obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) # get summary summary = PPO.get_summary( obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) # add summary writer.add_summary(summary, iteration) writer.close()
# Train Agent Agent_step = args.G_step # Assign parameters to old policy Agent.assign_policy_parameters() total_loss = 0 clip_loss = 0 vf_loss = 0 entropy_loss = 0 l1_loss = 0 for i, _ in enumerate(observations): # Run ppo train operation losses = Agent.train( obs=observations[i], gaes=gaes[i], rewards=d_rewards[i], v_preds_next=v_preds_next[i], expert_act=expert_actions[i], lr=lr) total_loss += losses[1] clip_loss += losses[2] vf_loss += losses[3] entropy_loss += losses[4] l1_loss += losses[5] total_loss /= len(observations) clip_loss /= len(observations) vf_loss /= len(observations) entropy_loss /= len(observations) l1_loss /= len(observations) print('total_loss: {}, clip_loss: {}, vf_loss: {}, entropy_loss: {}, l1_loss: {}'.format(total_loss, clip_loss, vf_loss, entropy_loss, l1_loss))
def main(args): env = gym.make('CartPole-v0') env.seed(0) ob_space = env.observation_space Policy = Policy_net('policy', env) Old_Policy = Policy_net('old_policy', env) PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) saver = tf.train.Saver() tl = args.train_level if tl == 'expert': threshold = 195 savedir = 'trained_models/ppo/expert' logdir = 'log/train/ppo/expert/' elif tl == 'med': threshold = 100 savedir = 'trained_models/ppo/med' logdir = 'log/train/ppo/med/' else: print("[run_ppo.py] Error: Unrecognized train level: {}".format(tl)) exit(1) with tf.Session() as sess: writer = tf.summary.FileWriter(logdir, sess.graph) sess.run(tf.global_variables_initializer()) obs = env.reset() success_num = 0 for iteration in range(args.iteration): observations = [] actions = [] rewards = [] v_preds = [] episode_length = 0 while True: # run policy RUN_POLICY_STEPS which is much less than episode length episode_length += 1 obs = np.stack([obs]).astype( dtype=np.float32) # prepare to feed placeholder Policy.obs act, v_pred = Policy.act(obs=obs, stochastic=True) act = np.asscalar(act) v_pred = np.asscalar(v_pred) next_obs, reward, done, info = env.step(act) observations.append(obs) actions.append(act) rewards.append(reward) v_preds.append(v_pred) if done: next_obs = np.stack([next_obs]).astype( dtype=np.float32 ) # prepare to feed placeholder Policy.obs _, v_pred = Policy.act(obs=next_obs, stochastic=True) v_preds_next = v_preds[1:] + [np.asscalar(v_pred)] obs = env.reset() break else: obs = next_obs writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_length', simple_value=episode_length) ]), iteration) writer.add_summary( tf.Summary(value=[ tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards)) ]), iteration) if sum(rewards) >= threshold: success_num += 1 if success_num >= 100: saver.save(sess, savedir + '/model.ckpt') print('Clear!! Model saved.') break else: success_num = 0 print("Iteration: {}, Rewards: {}".format(iteration, sum(rewards)), end='\r') gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) # convert list to numpy array for feeding tf.placeholder observations = np.reshape(observations, newshape=(-1, ) + ob_space.shape) actions = np.array(actions).astype(dtype=np.int32) gaes = np.array(gaes).astype(dtype=np.float32) gaes = (gaes - gaes.mean()) / gaes.std() rewards = np.array(rewards).astype(dtype=np.float32) v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) PPO.assign_policy_parameters() inp = [observations, actions, gaes, rewards, v_preds_next] # train for epoch in range(6): # sample indices from [low, high) sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) sampled_inp = [ np.take(a=a, indices=sample_indices, axis=0) for a in inp ] # sample training data PPO.train(obs=sampled_inp[0], actions=sampled_inp[1], gaes=sampled_inp[2], rewards=sampled_inp[3], v_preds_next=sampled_inp[4]) summary = PPO.get_summary(obs=inp[0], actions=inp[1], gaes=inp[2], rewards=inp[3], v_preds_next=inp[4]) writer.add_summary(summary, iteration) writer.close()
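# The training loops above treat CartPole-v0 as solved after 100 consecutive episodes whose return
# reaches the threshold (195 for the 'expert' level, 100 for 'med'), resetting the counter on any
# failure. The helper below restates that consecutive-success criterion on a list of episode returns;
# it is an illustration of the stopping rule, not code from the original scripts.
def solved_by_consecutive_successes(episode_returns, threshold=195, needed=100):
    streak = 0
    for ret in episode_returns:
        streak = streak + 1 if ret >= threshold else 0
        if streak >= needed:
            return True
    return False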