# CartPole PPO training script (TensorFlow 1.x session-based API).
import gym
import ppo
import collections
import tensorflow as tf  # missing in the original snippet, needed for tf.Session() below
from tensorboardX import SummaryWriter

writer = SummaryWriter('from_base')
score = 0
episode = 0
p = 0
train_flag = 0
sess = tf.Session()
agent = ppo.PPO(sess, 4, 2)
env = gym.make('CartPole-v0')
state = env.reset()
score_board = collections.deque(maxlen=10)
expert_state_action = []

while True:
    values_list, states_list, actions_list, dones_list, logp_ts_list, rewards_list = \
        [], [], [], [], [], []
    for _ in range(128):
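
# A minimal, self-contained sketch of what a 128-step collection loop like the
# truncated one above typically gathers. The real ppo.PPO interface is not shown
# in this snippet, so a random policy and a zero value estimate stand in for the
# agent's calls; only the buffer layout (states, actions, rewards, dones, values,
# log-probs) mirrors the lists declared above.
import gym
import numpy as np

env = gym.make('CartPole-v0')
state = env.reset()
values_list, states_list, actions_list, dones_list, logp_ts_list, rewards_list = \
    [], [], [], [], [], []
for _ in range(128):
    action = env.action_space.sample()          # stand-in for agent.get_action(state)
    logp_t = np.log(1.0 / env.action_space.n)   # log-prob under the uniform stand-in policy
    value = 0.0                                  # stand-in for the critic's value estimate
    next_state, reward, done, _ = env.step(action)

    states_list.append(state)
    actions_list.append(action)
    rewards_list.append(reward)
    dones_list.append(done)
    values_list.append(value)
    logp_ts_list.append(logp_t)

    state = env.reset() if done else next_state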
# CartPole PPO entry point: "-t" enables training mode, "-n" starts from a
# freshly initialised model instead of restoring the "model/cartpole" checkpoint.
env = gym.make(ENV)
with tf.Session() as sess:
    training = "-t" in sys.argv
    actor = ppo.PPO(4, 2, gamma=0.90, lam=0.90, clip_param=0.1, horizon=5,
                    optim_epoch=4, lr=0.0001, lr_decay=0.001, storage=10,
                    value_coefficient=1, entropy_coefficient=0.001,
                    training=training, continous=False)
    saver = tf.train.Saver()
    if "-n" in sys.argv:
        sess.run(tf.global_variables_initializer())
    else:
        saver.restore(sess, "model/cartpole")
        print("Restored...")
    try:
lambd = 0.95
epsilon = 0.1
num_epochs = 3
num_actors = 4
T_horizon = 51
batch_size = 64
num_iters = 1000
print_interval = 20

if __name__ == "__main__":
    env = ChessEnv(
        r"/home/keerthan/Softwares/stockfish-10-linux/Linux/stockfish_10_x64")
    model = ppo.PPO(env.state_dimension, env.action_dimension,
                    learning_rate=learning_rate, gamma=gamma, lambd=lambd,
                    epsilon=epsilon, num_epochs=num_epochs, batch_size=batch_size)
    # Copy global hyperparameters to locals.
    iters = num_iters
    _print_interval = print_interval
    horizon = T_horizon
    score = 0
    render = False
    renormalize = True
    # Run the algorithm.
    for i in range(1, iters + 1):
        start_state = env.reset()
        avg_score = 0
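
# A small numpy sketch of generalized advantage estimation (GAE), which is the
# role the gamma/lambd hyperparameters above usually play. The function and the
# example numbers are illustrative only, not taken from the chess script itself.
import numpy as np

def gae_advantages(rewards, values, dones, last_value, gamma=0.99, lambd=0.95):
    """Compute GAE advantages and returns for one trajectory segment."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        gae = delta + gamma * lambd * nonterminal * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns

# Example with dummy numbers:
adv, ret = gae_advantages(
    rewards=[1.0, 1.0, 1.0], values=[0.5, 0.4, 0.3], dones=[0, 0, 1],
    last_value=0.0, gamma=0.99, lambd=0.95)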
import env
import time
import random
import ppo
import numpy as np
import logging
import matplotlib.pyplot as plt
import naive_controller

my_env = env.TrafficEnv()
# bt.saver.restore(bt.sess, "./model/my_light_model/my-model.ckpt-3500")
all_ep_r = []
myppo = ppo.PPO()
myppo.saver.restore(myppo.sess, "./model/my-model2.ckpt-2000")
myNaiveCon = naive_controller.NaiveCon()

for i_episode in range(1000000):
    # listener()
    s, rawOcc = my_env.reset()
    s = np.concatenate([s[0], np.reshape(s[1] + s[2], -1)])
    buffer_s, buffer_a, buffer_r = [], [], []
    k = 0
    ep_r = 0
    mydict = ["go", "stop", "left", "right", "nothing"]
    while True:
        a = myppo.choose_action(s)
        # a = myNaiveCon.gen_action(s, rawOcc)
if __name__ == "__main__":
    env = gym.make(ENV)
    with tf.Session() as sess:
        training = "-n" in sys.argv
        actor = ppo.PPO(3, 1, gamma=0.95, clip_param=0.2, horizon=32,
                        optim_epoch=4, lr_decay=0.001, storage=32,
                        training=training, continous=True)
        saver = tf.train.Saver()
        if "-n" in sys.argv:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, "model/pendelum")
            print("Restored...")
        try:
            if "-p" in sys.argv:
                print("Playing...")
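
# A numpy sketch of the clipped surrogate objective that clip_param controls in
# both PPO configurations above. This is the generic PPO-clip formula, shown for
# illustration; it is not the internals of the ppo module used in these scripts.
import numpy as np

def clipped_surrogate_loss(logp_new, logp_old, advantages, clip_param=0.2):
    """PPO-clip policy loss: -E[min(r * A, clip(r, 1-eps, 1+eps) * A)]."""
    ratio = np.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -np.mean(np.minimum(unclipped, clipped))

# Example with dummy numbers:
loss = clipped_surrogate_loss(
    logp_new=np.array([-0.6, -1.1]), logp_old=np.array([-0.7, -0.9]),
    advantages=np.array([1.2, -0.4]), clip_param=0.2)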
discountFactor = d.get('discountFactor')
explorationRate = d.get('explorationRate')
learnStart = d.get('learnStart')
memorySize = d.get('memorySize')
current_epoch = d.get('current_epoch')
stepCounter = d.get('stepCounter')
loadsim_seconds = d.get('loadsim_seconds')

clear_monitor_files(outdir)
copy_tree(monitor_path, outdir)
env = gym.wrappers.Monitor(env, outdir, resume=True)

# Note: this rebinds the imported `ppo` module name to the agent instance.
ppo = ppo.PPO(S_DIM=S_DIM, A_DIM=A_DIM, EP_MAX=epochs, EP_LEN=episode_steps,
              GAMMA=discountFactor, A_LR=A_learningRate, C_LR=C_learningRate,
              BATCH=minibatch_size, propeller_hovering_speed=0.0)
last100Rewards = [0] * 100
last100RewardsIndex = 0
last100Filled = False
all_ep_r = []
start_time = time.time()

# Start iterating from 'current_epoch'.
for epoch in range(current_epoch + 1, epochs + 1, 1):
    observation = env.reset()
    cumulated_reward = 0
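
# The last100Rewards / last100RewardsIndex / last100Filled bookkeeping above is
# a hand-rolled ring buffer for a moving average over the last 100 episodes.
# A minimal equivalent, shown only as an illustration and not part of the
# original script, is a bounded deque:
import collections
import numpy as np

last_100 = collections.deque(maxlen=100)

def record_episode(reward):
    last_100.append(reward)
    return float(np.mean(last_100))  # moving average over up to 100 episodes

# Example:
for r in (10.0, 12.0, 9.5):
    avg = record_episode(r)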
# GAIL setup for CartPole: a PPO agent plus a discriminator trained against
# recorded expert transitions.
import collections
import discriminator
from tensorboardX import SummaryWriter
# Missing in the original snippet but required below:
import gym
import numpy as np
import tensorflow as tf
import ppo

data = np.load('expert_data.npy')
expert_data = collections.deque()
for x in data:
    expert_data.append(x)

sess = tf.Session()
state_size = 4
action_size = 2
n_step = 128
agent = ppo.PPO(sess, state_size, action_size)
dis = discriminator.Discriminator(sess, state_size, action_size)
env = gym.make('CartPole-v0')
score = 0
episode = 0
p = 0
gail = True
writer = SummaryWriter()
state = env.reset()

while True:
    values_list, states_list, actions_list, dones_list, logp_ts_list, rewards_list = \
        [], [], [], [], [], []
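
# A sketch of the GAIL idea the discriminator above is used for: the environment
# reward is replaced by a reward derived from the discriminator's output.
# Conventions vary; one common choice, reading D(s, a) as the probability that
# the pair came from the expert, is r = -log(1 - D(s, a)). The
# dummy_discriminator below is a placeholder, not the Discriminator class used
# in this script.
import numpy as np

def dummy_discriminator(state, action):
    """Placeholder scoring function in (0, 1); higher means more expert-like."""
    return 1.0 / (1.0 + np.exp(-0.1 * (np.sum(state) + action)))

def gail_reward(state, action, eps=1e-8):
    d = dummy_discriminator(state, action)
    return -np.log(1.0 - d + eps)

# Example with a dummy CartPole-like transition:
r = gail_reward(np.array([0.02, -0.01, 0.03, 0.01]), action=1)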
        force=True)
    render = True
    start_state = env.reset()
    actor_history, actor_score, start_state, done = ppo.generate_trajectory(
        env, model, start_state, 500, render)
    env.close()
    env_to_wrap.close()


if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    s = env.reset()
    model = ppo.PPO(s.shape[0], 2, learning_rate=learning_rate, gamma=gamma,
                    lambd=lambd, epsilon=epsilon, num_epochs=num_epochs,
                    batch_size=batch_size)
    store_output(model, '0')
    train(env, model, 250)
    env.close()
    store_output(model, '250')
    env = gym.make("CartPole-v1")
    train(env, model, 250)
    env.close()
from nav_wrapper import NavigationEnv
import ppo
import models_ppo as models
import numpy as np
import os
import rl_eval

batch_size = 64
eval_eps = 50

rl_core = ppo.PPO(
    model=[models.PolicyNet, models.ValueNet],
    learning_rate=[0.0001, 0.0001],
    reward_decay=0.99,
    batch_size=1)

is_train = True
render = True
load_model = False
'''
is_train = False
render = True
load_model = True
'''

map_path = "Maps/map.png"
gif_path = "out/"
model_path = "save/"

if not os.path.exists(model_path):
    os.makedirs(model_path)
if load_model:
    print("Load model ...", model_path)