def run_env(self, pool_number):
    # Build one environment instance for this worker and run it to completion.
    env = MyEnv(
        map_name=self.map_name,
        step_mul=self.step_mul,
        screen_size=self.screen_size,
        minimap_size=self.screen_size,
        game_length=self.game_length,
        max_games=self.max_games,
        envs_number=self.envs_number,
        visualize=self.visualize,
        pool_number=pool_number,
        population_size=self.population_size,
        generation=self.generation,
        save_dir=self.save_dir,
    )
    env.run()
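# run_env takes a pool_number, which suggests it is a per-worker entry point spawned
# once per process. A minimal usage sketch under that assumption; the `runner` object
# and launch_workers helper are hypothetical, standing in for whatever class carries
# the attributes read above (map_name, step_mul, ...):
from multiprocessing import Process


def launch_workers(runner, n_workers):
    # Start one environment worker per pool slot and wait for all of them to finish.
    workers = [Process(target=runner.run_env, args=(i,)) for i in range(n_workers)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()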
def test():
    ############## Hyperparameters ##############
    # creating environment
    env = MyEnv()
    env_name = env.env_name
    action_dim = 5
    n_latent_var = 64           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 100
    max_timesteps = 5000
    save_gif = False

    filename = "./preTrained/PPO_{}_train2.pth".format(env_name)

    memory = Memory()
    ppo = PPO(64 * 64 * 3, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    ppo.policy_old.load_state_dict(torch.load(filename))

    rewards = []
    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            obs, compass = converter(state)
            action = ppo.policy_old.act(obs=obs, compass=compass, memory=memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            # if render:
            #     env.render()
            if save_gif:
                img = obs.data.numpy()
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        rewards.append(ep_reward)
        logging.debug('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))

    np.save('./PPO_ep_rewards_test_{}'.format(env_name), np.array(rewards))
def main():
    ############## Hyperparameters ##############
    env_name = 'MineRLNavigateDense-v0'
    from environment import MyEnv
    # creating environment
    env = MyEnv()
    state_dim = 3 * 64 * 64
    action_dim = 5
    render = False
    solved_reward = 200         # stop training if avg_reward > solved_reward (in practice never reached)
    log_interval = 1            # print avg reward in the interval
    max_episodes = 50000        # max training episodes
    max_timesteps = 5000        # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 1024      # update policy every n timesteps
    lr = 0.00025
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 3                # update policy for K epochs
    eps_clip = 0.1              # clip parameter for PPO
    random_seed = None
    save_interval = 5
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    episode_rewards = []

    # training loop
    for i_episode in range(1, max_episodes + 1):
        episode_reward = 0
        obs, compass = converter(env.reset())
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(obs, compass, memory)
            state, reward, done, _ = env.step(action)
            obs, compass = converter(state)

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            episode_reward += reward

            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()

            running_reward += reward
            if render:
                env.render()
            if done:
                break
            logging.debug(f"instant reward {reward}, timestep {timestep}")

        episode_rewards.append(episode_reward)
        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            logging.info("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)
            logging.debug('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        if i_episode % save_interval == 0:
            torch.save(ppo.policy.state_dict(), './PPO_{}_{}.pth'.format(env_name, i_episode))
            np.save('./PPO_ep_rewards_{}_{}'.format(env_name, i_episode), np.array(episode_rewards))
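# Both the test and training loops above rely on a converter() helper that is not shown
# in this section. A minimal sketch of what it might look like, assuming the MineRL
# Navigate observation dict with a 64x64x3 'pov' image and a scalar compass angle
# (the exact key names are assumptions and differ between MineRL versions):
import numpy as np
import torch


def converter(state):
    # Normalize the POV image to [0, 1] and reorder to NCHW for the policy network.
    pov = np.asarray(state['pov'], dtype=np.float32) / 255.0
    obs = torch.from_numpy(pov).permute(2, 0, 1).unsqueeze(0)         # 1 x 3 x 64 x 64
    # Scale the compass angle (roughly in [-180, 180] degrees) to [-1, 1].
    compass = torch.tensor([[state['compassAngle'] / 180.0]], dtype=torch.float32)
    return obs, compass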
from environment import MyEnv
import math
import random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import os

env = MyEnv()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 256
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 50000
TARGET_UPDATE = 10
LR = 0.005
test_time = False
n_steps = 8

n_actions = env.action_space.n
img_height = 64
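# The constants above (EPS_START, EPS_END, EPS_DECAY) suggest an exponentially decayed
# epsilon-greedy policy in the style of the standard PyTorch DQN tutorial, and the DQN
# loop further below calls select_action(state). A minimal sketch under that assumption,
# reusing the imports and constants above; `policy_net` is assumed to be defined
# elsewhere and to map a state tensor to a (1, n_actions) tensor of Q-values:
steps_done = 0


def select_action(state):
    global steps_done
    # Anneal epsilon from EPS_START towards EPS_END over roughly EPS_DECAY steps.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    steps_done += 1
    if test_time or random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest Q-value.
            return policy_net(state).max(1)[1].view(1, 1)
    # Exploratory action: uniform over the discrete action space.
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)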
opt_step = 0

# pre-training
if not args.no_train:
    print('Pre-training')
    for i in range(1000):
        opt_step += 1
        optimize_dqfd(args.bsz, 1.0, opt_step)
        if i % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
    print('Pre-training done')
else:
    args.demo_prop = 0

env = MyEnv()
env.reset()

# training loop
ep_counter = count(1) if args.num_eps < 0 else range(args.num_eps)
for i_episode in ep_counter:
    state = env.reset()
    total_reward = 0
    transitions = []
    q_vals = policy_net(state)
    for step_n in count():
        # selecting an action and playing it
        if args.no_train:
            action = q_vals.max(1)[1].cpu()
        else:
from environment import MyEnv

if __name__ == "__main__":
    # Creation of the environment
    myenv = MyEnv(n_players=5)

    # Creation of the players
loss_history = []
if not args.no_train:
    loadExpertData(data, memory)
    for i in range(num_pretraining):
        loss_history.append(pretraining_step())
    torch.save(policy_net.state_dict(), 'pretrain-model')
    np.save('loss_history', np.array(loss_history))
else:
    policy_net.load_state_dict(torch.load("pretrain-model"))
    policy_net.apply(weights_init)

target_net.load_state_dict(policy_net.state_dict())

from environment import MyEnv

env = MyEnv()

num_episodes = 50
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = converter(env.reset())
    avg_rew = 0
    for t in count():
        # Select and perform an action
        action = select_action(state)
        obs, rew, done, _ = env.step(action.item())
        reward = torch.tensor([rew], device=device)
        next_state = converter(obs)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)
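# memory.push(state, action, next_state, reward) above follows the common PyTorch
# replay-buffer interface. The actual buffer here also receives expert demonstrations
# via loadExpertData, so this is only a sketch of the plain transition-buffer part;
# the class and field names are made up for illustration:
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        # Fixed-size FIFO buffer: the oldest transitions are dropped when full.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)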