#Roll out a trained policy and save expert trajectories for imitation learning
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--render", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    parser.add_argument("--episode", type=int, default=1000)
    args = parser.parse_args()

    #Parameters
    #----------------------------
    env_id    = args.env
    save_dir  = "./save"
    device    = "cuda:0"
    n_episode = args.episode

    #Create environment
    #----------------------------
    env = gym.make(env_id)

    if args.conti:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n

    if args.unwrap:
        env = env.unwrapped

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)

    #Load model
    #----------------------------
    if os.path.exists(os.path.join(save_dir, "{}.pt".format(env_id))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(env_id)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        print("Done.")
    else:
        print("Error: No model saved")

    #Start playing
    #----------------------------
    policy_net.eval()
    s_traj = []
    a_traj = []

    for i_episode in range(n_episode):
        ob  = env.reset()
        ret = 0
        s_traj.append([])
        a_traj.append([])

        while True:
            if args.render:
                env.render()

            action = policy_net.action_step(
                torch.FloatTensor(np.expand_dims(ob, axis=0)).to(device),
                deterministic=True
            )
            action = action.cpu().detach().numpy()[0]
            s_traj[i_episode].append(ob)
            a_traj[i_episode].append(action)
            ob, reward, done, info = env.step(action)
            ret += reward

            if done:
                s_traj[i_episode] = np.array(s_traj[i_episode], dtype=np.float32)

                if args.conti:
                    a_traj[i_episode] = np.array(a_traj[i_episode], dtype=np.float32)
                else:
                    a_traj[i_episode] = np.array(a_traj[i_episode], dtype=np.int32)

                print("{:d}: return = {:.4f}, len = {:d}".format(i_episode, ret, len(s_traj[i_episode])))
                break

    #s_traj: (n_episode, timesteps, s_dim)
    #a_traj: (n_episode, timesteps, a_dim) or (n_episode, timesteps)
    print("Saving the trajectories ... ", end="")
    pkl.dump((s_traj, a_traj), open(os.path.join(save_dir, "{}_traj.pkl".format(env_id)), "wb"))
    print("Done.")
    env.close()
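#Assumed entry point: running this file directly rolls out the trained policy and
#writes ./save/<env>_traj.pkl, which the GAIL training script loads as expert data.
if __name__ == "__main__":
    main()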
#Train a GAIL agent: PPO policy/value networks plus a discriminator over (s, a) pairs
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env          = 8
    n_step         = 128
    mb_size        = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val       = 0.2
    lamb           = 0.95
    gamma          = 0.99
    ent_weight     = 0.0
    max_grad_norm  = 0.5
    beta           = 0.1
    lr             = 1e-4
    n_iter         = 30000
    disp_step      = 30
    save_step      = 300
    save_dir       = "./save"
    device         = "cuda:0"
    expert_path    = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            #Convert discrete expert actions to one-hot vectors so they can be
            #concatenated with the states into (s, a) pairs for the discriminator
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim), dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net  = ValueNet(s_dim).to(device)
    dis_net    = DiscriminatorNet(s_dim + a_dim).to(device)
    agent      = PPO(
        policy_net,
        value_net,
        dis_net,
        a_dim,
        beta,
        lr,
        max_grad_norm,
        ent_weight,
        clip_val,
        sample_n_epoch,
        sample_mb_size,
        mb_size,
        device=device,
        conti=args.conti
    )

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it   = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net,
            value_net,
            dis_net,
            mb_obs,
            mb_actions,
            mb_values,
            mb_advs,
            mb_returns,
            mb_old_a_logps,
            sa_real
        )

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps   = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("dis loss = {:.6f}".format(dis_loss))
            print("entropy = {:.6f}".format(ent))
            print("avg_kl = {:.6f}".format(avg_kl))
            print("beta = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print("dis_real = {:.3f}".format(dis_real))
            print("dis_fake = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "beta": agent.beta,
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict(),
                "DiscriminatorNet": dis_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
#Train a PPO agent on vectorized environments
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env          = 8
    n_step         = 128
    mb_size        = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val       = 0.2
    lamb           = 0.95
    gamma          = 0.99
    ent_weight     = 0.0
    max_grad_norm  = 0.5
    lr             = 1e-4
    n_iter         = 30000
    disp_step      = 30
    save_step      = 300
    save_dir       = "./save"
    device         = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net  = ValueNet(s_dim).to(device)
    agent      = PPO(
        policy_net,
        value_net,
        lr,
        max_grad_norm,
        ent_weight,
        clip_val,
        sample_n_epoch,
        sample_mb_size,
        mb_size,
        device=device
    )

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(
            policy_net,
            value_net,
            mb_obs,
            mb_actions,
            mb_values,
            mb_advs,
            mb_returns,
            mb_old_a_logps
        )

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps   = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
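#Assumed entry point: running this file directly starts (or resumes from the last
#checkpoint in ./save) PPO training on the chosen environment.
if __name__ == "__main__":
    main()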
#Render episodes with a trained policy loaded from ./save
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    env_id   = args.env
    save_dir = "./save"
    device   = "cuda:0"

    #Create environment
    #----------------------------
    env = gym.make(env_id)

    if args.conti:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n

    if args.unwrap:
        env = env.unwrapped

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)

    #Load model
    #----------------------------
    if os.path.exists(os.path.join(save_dir, "{}.pt".format(env_id))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(env_id)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        print("Done.")
    else:
        print("Error: No model saved")

    #Start playing
    #----------------------------
    policy_net.eval()

    for it in range(100):
        ob  = env.reset()
        ret = 0

        while True:
            env.render()
            action = policy_net.action_step(
                torch.from_numpy(np.expand_dims(ob.__array__(), axis=0)).float().to(device),
                deterministic=True
            )
            ob, reward, done, info = env.step(action.cpu().detach().numpy()[0])
            ret += reward

            if done:
                print("return = {:.4f}".format(ret))
                break

    env.close()
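#Assumed entry point: running this file directly renders 100 evaluation episodes
#with the saved policy and prints the return of each one.
if __name__ == "__main__":
    main()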
#Evaluate a trained policy for a few rendered episodes (continuous control by default)
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    save_dir = "./save"
    device   = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    print(policy_net)

    #Load model
    #----------------------------
    model_path = os.path.join(save_dir, "{}.pt".format(args.env))

    if os.path.exists(model_path):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(model_path)
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        print("Error: No model saved")
        sys.exit(1)

    #Start playing
    #----------------------------
    policy_net.eval()

    with torch.no_grad():
        for it in range(10):
            ob           = env.reset()
            total_reward = 0
            length       = 0

            while True:
                env.render()
                ob_tensor = torch.tensor(np.expand_dims(ob, axis=0), dtype=torch.float32, device=device)
                action    = policy_net.action_step(ob_tensor, deterministic=True).cpu().numpy()
                ob, reward, done, info = env.step(action[0])
                total_reward += reward
                length += 1

                if done:
                    print("Total reward = {:.6f}, length = {:d}".format(total_reward, length))
                    break

    env.close()
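#Assumed entry point: running this file directly renders 10 evaluation episodes
#with the saved policy and reports the total reward and episode length.
if __name__ == "__main__":
    main()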