Example #1
def run_env(self, pool_number):
     env = MyEnv(
         map_name=self.map_name,
         step_mul=self.step_mul,
         screen_size=self.screen_size,
         minimap_size=self.screen_size,
         game_length=self.game_length,
         max_games=self.max_games,
         envs_number=self.envs_number,
         visualize=self.visualize,
         pool_number=pool_number,
         population_size=self.population_size,
         generation=self.generation,
         save_dir=self.save_dir
     )
     env.run()
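The pool_number argument suggests run_env is meant to be called once per worker. A minimal sketch of how such a method could be fanned out over several processes (the launch_pools helper and n_pools value are illustrative assumptions, not part of the original project):

from multiprocessing import Pool

def launch_pools(runner, n_pools):
    # each worker process builds and runs its own MyEnv instance via run_env;
    # assumes the runner object (and its configuration fields) can be pickled
    with Pool(processes=n_pools) as pool:
        pool.map(runner.run_env, range(n_pools))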
Example #2
import logging

import numpy as np
import torch
from PIL import Image

from environment import MyEnv
# Memory, PPO and converter are assumed to come from the project's own PPO implementation

def test():
    ############## Hyperparameters ##############

    # creating environment
    env = MyEnv()
    env_name = env.env_name
    action_dim = 5
    n_latent_var = 64           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 100
    max_timesteps = 5000
    save_gif = False

    filename = "./preTrained/PPO_{}_train2.pth".format(env_name)
    
    memory = Memory()
    ppo = PPO(64*64*3, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)  # state_dim = 64*64*3 (a flattened 64x64 RGB observation)
    
    ppo.policy_old.load_state_dict(torch.load(filename))
    rewards = []
    for ep in range(1, n_episodes+1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            obs, compass = converter(state)
            action = ppo.policy_old.act(obs=obs, compass=compass, memory=memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            # if render:
            #     env.render()
            if save_gif:
                img = obs.data.numpy()
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        rewards.append(ep_reward)
        logging.debug('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    np.save('./PPO_ep_rewards_test_{}'.format(env_name), np.array(rewards))
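Both this example and the next rely on a converter helper that splits a raw MineRL-style observation into an image tensor and a compass reading. A minimal sketch of such a helper, assuming the observation carries a 64x64x3 'pov' image and a scalar 'compassAngle' (the keys, normalization and shapes are assumptions for illustration):

import numpy as np
import torch

def converter(state):
    # image: HWC array scaled to [0, 1], returned as a 1x3x64x64 float tensor (assumed layout)
    obs = torch.from_numpy(np.asarray(state['pov'], dtype=np.float32) / 255.0)
    obs = obs.permute(2, 0, 1).unsqueeze(0)
    # compass angle scaled to roughly [-1, 1] (assumed key and scaling)
    compass = torch.tensor([[state['compassAngle'] / 180.0]], dtype=torch.float32)
    return obs, compass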
Example #3
import logging

import numpy as np
import torch

# Memory, PPO and converter are assumed to come from the project's own PPO implementation
def main():
    ############## Hyperparameters ##############
    env_name = 'MineRLNavigateDense-v0'
    from environment import MyEnv
    # creating environment
    env = MyEnv()
    state_dim = 3 * 64 * 64
    action_dim = 5
    render = False
    solved_reward = 200  # stop training if avg_reward > solved_reward (set high enough that it never triggers)
    log_interval = 1  # print avg reward in the interval
    max_episodes = 50000  # max training episodes
    max_timesteps = 5000  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 1024  # update policy every n timesteps
    lr = 0.00025
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 3  # update policy for K epochs
    eps_clip = 0.1  # clip parameter for PPO
    random_seed = None
    save_interval = 5
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    episode_rewards = []

    # training loop
    for i_episode in range(1, max_episodes + 1):
        episode_reward = 0
        obs, compass = converter(env.reset())
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(obs, compass, memory)
            state, reward, done, _ = env.step(action)
            obs, compass = converter(state)
            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            episode_reward += reward
            # update if its time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
            running_reward += reward
            if render:
                env.render()
            if done:
                break
            logging.debug(f"instant reward {reward}, timestep {timestep}")
        episode_rewards.append(episode_reward)
        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            logging.info("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(),
                       './PPO_{}.pth'.format(env_name))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            logging.debug('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        if i_episode % save_interval == 0:
            torch.save(ppo.policy.state_dict(),
                       './PPO_{}_{}.pth'.format(env_name, i_episode))
            np.save('./PPO_ep_rewards_{}_{}'.format(env_name, i_episode),
                    np.array(episode_rewards))
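The Memory object passed to ppo.policy_old.act and ppo.update is the on-policy rollout buffer; its definition is not shown. A minimal sketch consistent with how it is used above (rewards, is_terminals and clear_memory appear in the examples; the remaining attribute names are assumptions):

class Memory:
    # simple rollout buffer, cleared after every PPO update
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]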
Example #4
from environment import MyEnv
import math
import random
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import os

env = MyEnv()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 256
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 50000
TARGET_UPDATE = 10
LR = 0.005
test_time = False

n_steps = 8
n_actions = env.action_space.n
img_height = 64
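EPS_START, EPS_END and EPS_DECAY are defined here, but the action-selection code that uses them is not shown. A typical epsilon-greedy routine built from these constants might look as follows (the exponential decay schedule, policy_net and the steps_done counter are assumptions):

steps_done = 0

def select_action(state):
    global steps_done
    # epsilon decays exponentially from EPS_START towards EPS_END over roughly EPS_DECAY steps
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # exploit: action with the highest predicted Q-value
            return policy_net(state).max(1)[1].view(1, 1)
    # explore: uniformly random action
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)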
Example #5
    opt_step = 0

    # pre-training
    if not args.no_train:
        print('Pre-training')
        for i in range(1000):
            opt_step += 1
            optimize_dqfd(args.bsz, 1.0, opt_step)
            if i % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())
        print('Pre-training done')
    else:
        args.demo_prop = 0

    env = MyEnv()
    env.reset()

    # training loop
    ep_counter = count(1) if args.num_eps < 0 else range(args.num_eps)
    for i_episode in ep_counter:
        state = env.reset()
        total_reward = 0
        transitions = []
        q_vals = policy_net(state)
        for step_n in count():

            # selecting an action and playing it
            if args.no_train:
                action = q_vals.max(1)[1].cpu()
            else:
Example #6
File: main.py Project: mathdoug/GameTheory
from environment import MyEnv

if __name__ == "__main__":
    # Creation of the environment
    myenv = MyEnv(n_players=5)

    # Creation of the players
Example #7
    loss_history = []
    if not args.no_train:
        loadExpertData(data, memory)
        for i in range(num_pretraining):
            loss_history.append(pretraining_step())
        torch.save(policy_net.state_dict(), 'pretrain-model')
        np.save('loss_history', np.array(loss_history))
    else:
        policy_net.load_state_dict(torch.load("pretrain-model"))
    policy_net.apply(weights_init)  # applies weights_init to every submodule (note: this runs after the pretrained weights above are saved or loaded)
    target_net.load_state_dict(policy_net.state_dict())

    from environment import MyEnv

    env = MyEnv()

    num_episodes = 50
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        state = converter(env.reset())
        avg_rew = 0
        for t in count():
            # Select and perform an action
            action = select_action(state)
            obs, rew, done, _ = env.step(action.item())
            reward = torch.tensor([rew], device=device)
            next_state = converter(obs)

            # Store the transition in memory
            memory.push(state, action, next_state, reward)
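The snippet is cut off right after the transition is stored. In the standard DQN pattern the loop would go on to advance the state, take an optimization step and periodically refresh the target network; a hedged sketch of that continuation (optimize_model and TARGET_UPDATE are assumed names, mirroring Example #4):

            # hypothetical continuation of the loop above
            avg_rew += rew
            state = next_state              # move to the next state
            optimize_model()                # one gradient step on a sampled minibatch (assumed helper)
            if done:
                break
        # periodically sync the target network with the online network (assumed constant)
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())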