# Imports used by the training scripts in this section. PolicyNetwork,
# ValueNetwork, History, collect, train_network, ChessEnv and `device` are
# defined elsewhere in the project.
import queue
import threading
import time
from pathlib import Path

import gym
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def main():
    # ENVIRONMENT
    env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 200
    batch_size = 32
    max_iterations = 200
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create the models folder if it does not exist yet
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    for ite in tqdm(range(max_iterations), ascii=True):

        # Checkpoint every 5 iterations
        if ite % 5 == 0:
            torch.save({
                'policy_model': policy_model.state_dict(),
                'policy_optimizer': policy_optimizer.state_dict(),
                'value_model': value_model.state_dict(),
                'value_optimizer': value_optimizer.state_dict(),
                'running_reward': running_reward}, model_path)

        # Collect trajectories with the current policy
        episode_ite, running_reward = collect(episode_ite, running_reward, env,
                                              max_episodes, max_timesteps,
                                              state_scale, reward_scale,
                                              writer, history, policy_model,
                                              value_model, gamma, gae_lambda,
                                              device)

        # Here we have collected N trajectories.
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True,
                                 drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if running_reward > env.spec.reward_threshold:
            print("\nSolved!")
            break
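# The `collect` helper called above is defined elsewhere in the project. The
# function below is a minimal, hypothetical sketch of the generalized
# advantage estimation (GAE) step it is assumed to perform with the `gamma`
# and `gae_lambda` hyperparameters; the name `compute_gae` is illustrative
# only and not part of the project API.
def compute_gae(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Return GAE advantages for one trajectory.

    `values` has one extra entry: the value estimate of the state that
    follows the last reward (0 if the episode terminated there).
    """
    advantages = []
    gae = 0.0
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[t])
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
        # Exponentially weighted sum of future TD residuals
        gae = delta + gamma * gae_lambda * not_done * gae
        advantages.insert(0, gae)
    return advantages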
def main():
    # ENVIRONMENT
    # env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    # env_name = "Acrobot-v1"
    env_name = "MountainCar-v0"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 100
    batch_size = 32
    max_iterations = 1000
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01
    env_threshold = env.spec.reward_threshold

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create the models folder if it does not exist yet
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # One environment per parallel collector thread
    EnvQueue = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = gym.make(env_name)
        observation = env.reset()
        EnvQueue.put((env, observation, 0))

    for ite in tqdm(range(max_iterations), ascii=True):

        # Checkpoint every 5 iterations
        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        q = queue.SimpleQueue()

        env_list = []
        while not EnvQueue.empty():
            env_list.append(EnvQueue.get())

        # Roll out all environments in parallel worker threads
        threads = []
        for env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, env, EnvQueue, max_timesteps,
                                     state_scale, reward_scale, policy_model,
                                     value_model, gamma, gae_lambda, device
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # Write all episodes from the queue to the history buffer
        avg_episode_reward = []
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        # Update the running reward (exponential moving average) with the
        # episodes that actually finished
        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Running Reward", running_reward, episode_ite)
                writer.add_scalar("Episode Reward", ep_reward, episode_ite)
                episode_ite += 1

        # Here we have collected N trajectories; prepare the dataset
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True,
                                 drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        if running_reward > env_threshold:
            print("\nSolved!")
            break
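# `train_network` is defined elsewhere in the project. The function below is
# a minimal sketch of the clipped surrogate objective it is presumed to
# optimize with the `clip` and `entropy_coefficient` hyperparameters above;
# `ppo_policy_loss` is an illustrative name, not part of the project API.
import torch


def ppo_policy_loss(new_log_probs, old_log_probs, advantages, entropies,
                    clip=0.2, entropy_coefficient=0.01):
    # Probability ratio pi_theta(a|s) / pi_theta_old(a|s)
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Clipped surrogate objective (maximized, hence the leading minus sign)
    surrogate = torch.min(
        ratio * advantages,
        torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantages)
    # Entropy bonus encourages exploration
    return -(surrogate.mean() + entropy_coefficient * entropies.mean())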
def main(env_name, lr, state_scale, reward_scale, clip, train_epoch,
         max_episodes, max_timesteps, batch_size, max_iterations, gamma,
         gae_lambda, entropy_coefficient, start_running_reward, update_rate):

    # ENVIRONMENT
    env = ChessEnv()

    # PARAMETERS
    learning_rate = lr
    n_epoch = train_epoch

    # NETWORK
    value_model = ValueNetwork().to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork().to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = start_running_reward

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create the models folder if it does not exist yet
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # Create the SavedEnvs queue: one chess environment per collector thread
    SavedEnv = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = ChessEnv()
        SavedEnv.put((env, env.reset(), 0))

    # START ITERATING
    for ite in tqdm(range(max_iterations), ascii=True):

        # Refresh the rival (frozen opponent) policy every update_rate iterations
        if ite % update_rate == 0:
            print("\nUpdating")
            rival_policy = PolicyNetwork().to(device)
            rival_policy.load_state_dict(policy_model.state_dict())

        # Checkpoint every 5 iterations
        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        print("\nSimulating")
        start_simulation = time.perf_counter()

        q = queue.SimpleQueue()

        env_list = []
        while not SavedEnv.empty():
            env_list.append(SavedEnv.get())

        # Roll out all saved environments in parallel worker threads
        threads = []
        for saved_env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, saved_env, SavedEnv,
                                     max_timesteps, state_scale, reward_scale,
                                     policy_model, value_model, gamma,
                                     gae_lambda, device, rival_policy
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # Write all episodes from the queue to the history buffer
        avg_episode_reward = []
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        end_simulation = time.perf_counter()
        print(f"Simulation time: {end_simulation - start_simulation:.2f}")

        # Update the running reward (exponential moving average) with the
        # games that actually finished
        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Average Episode Reward", ep_reward,
                                  episode_ite)
                episode_ite += 1

        # Here we have collected N trajectories; prepare the dataset
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True,
                                 drop_last=True)

        print("Training")
        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        end_training = time.perf_counter()
        print(f"Training time: {end_training - end_simulation:.2f}")

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if running_reward > 0:
            print("\nSolved!")
            break
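# The rival above is rebuilt from scratch and then loaded with the current
# weights. A minimal equivalent sketch (assuming PolicyNetwork is a standard
# nn.Module) that makes the "frozen opponent snapshot" explicit; this is an
# illustration of the design choice, not code from the project:
import copy

rival_policy = copy.deepcopy(policy_model).to(device)
rival_policy.eval()                      # disable dropout/batch-norm updates
for p in rival_policy.parameters():
    p.requires_grad_(False)              # the opponent is never trained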
model.load_state_dict(checkpoint['policy_model'])
optimizer.load_state_dict(checkpoint['policy_optimizer'])

for epoch in tqdm(range(1, num_epochs + 1)):
    # Train for one epoch and record the training loss
    tr_loss = train_epoch(train_loader, model, optimizer, criterion)
    tr_losses.append(tr_loss)

    # Evaluate on the test set
    te_loss, te_acc = test_epoch(test_loader, model)
    te_losses.append(te_loss)
    te_accs.append(te_acc)

    writer.add_scalar("Test loss", te_loss, epoch)
    writer.add_scalar("Test accuracy", te_acc, epoch)

    # Save the latest weights after every epoch
    torch.save(
        {
            'policy_model': model.state_dict(),
            'policy_optimizer': optimizer.state_dict()
        }, model_path)

# Optional plotting of the recorded curves (left disabled):
# plt.figure(figsize=(10, 8))
# plt.subplot(2, 1, 1)
# plt.xlabel('Epoch')
# plt.ylabel('NLLLoss')
# plt.plot(tr_losses, label='train')
# plt.plot(te_losses, label='test')
# plt.legend()
# plt.subplot(2, 1, 2)
# plt.xlabel('Epoch')
# plt.ylabel('Test Accuracy [%]')
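# `train_epoch` and `test_epoch` are defined elsewhere; the sketch below shows
# one plausible implementation matching the call sites above (a single scalar
# training loss; test loss plus accuracy in percent). The NLL criterion on
# log-probability outputs and the reuse of `device` are assumptions based on
# the plotting labels, not confirmed project code.
import torch
import torch.nn.functional as F


def train_epoch(loader, model, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(x), y)    # e.g. nn.NLLLoss on log-probabilities
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def test_epoch(loader, model):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            total_loss += F.nll_loss(output, y).item()
            correct += (output.argmax(dim=1) == y).sum().item()
            total += y.size(0)
    return total_loss / len(loader), 100.0 * correct / total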