Example #1
 def __init__(
     self,
     env,
     eval_env=None,
     image_size=(45, 45, 45),
     update_frequency=4,
     replay_buffer_size=1e6,
     init_memory_size=5e4,
     max_episodes=100,
     steps_per_episode=50,
     eps=1,
     min_eps=0.1,
     delta=0.001,
     batch_size=4,
     gamma=0.9,
     number_actions=6,
     frame_history=4,
     model_name="CommNet",
     logger=None,
     train_freq=1,
     team_reward=False,
     attention=False,
 ):
     self.env = env
     self.eval_env = eval_env
     self.agents = env.agents
     self.image_size = image_size
     self.update_frequency = update_frequency
     self.replay_buffer_size = replay_buffer_size
     self.init_memory_size = init_memory_size
     self.max_episodes = max_episodes
     self.steps_per_episode = steps_per_episode
     self.eps = eps
     self.min_eps = min_eps
     self.delta = delta
     self.batch_size = batch_size
     self.gamma = gamma
     self.number_actions = number_actions
     self.frame_history = frame_history
     self.epoch_length = self.env.files.num_files
     self.best_val_distance = float('inf')
     self.buffer = ReplayMemory(self.replay_buffer_size, self.image_size,
                                self.frame_history, self.agents)
     self.dqn = DQN(self.agents,
                    self.frame_history,
                    logger=logger,
                    type=model_name,
                    collective_rewards=team_reward,
                    attention=attention)
     self.dqn.q_network.train(True)
     self.evaluator = Evaluator(eval_env, self.dqn.q_network, logger,
                                self.agents, steps_per_episode)
     self.logger = logger
     self.train_freq = train_freq
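A minimal construction sketch for this Trainer, assuming hypothetical make_env/make_eval_env helpers and a logger object with log() and write_to_board() methods; only the constructor signature comes from the example, everything else is a placeholder:

# Hedged usage sketch: make_env, make_eval_env and Logger are assumed helpers,
# not part of the original example.
env = make_env()            # hypothetical: must expose .agents and .files.num_files
eval_env = make_eval_env()  # hypothetical: optional evaluation environment
logger = Logger()           # hypothetical: needs .log() and .write_to_board()

trainer = Trainer(
    env,
    eval_env=eval_env,
    image_size=(45, 45, 45),
    max_episodes=100,
    steps_per_episode=50,
    batch_size=4,
    model_name="CommNet",
    logger=logger,
    team_reward=False,
)
trainer.train()  # train() is defined in the full class shown in Example #6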
Example #2
                                        header=True)

# Parameters for training a DQN model
N_EPISODE = 10000  #The number of episodes for training
MAX_STEP = 1000  #The number of steps for each episode
BATCH_SIZE = 32  #The number of experiences for each replay
MEMORY_SIZE = 100000  #The size of the replay memory for storing experiences
SAVE_NETWORK = 100  # After this number of episodes, the DQN model is saved for testing later.
INITIAL_REPLAY_SIZE = 1000  #The number of experiences stored in the replay memory before replay starts
INPUTNUM = 198  #The number of input values for the DQN model
ACTIONNUM = 6  #The number of actions output from the DQN model
MAP_MAX_X = 21  #Width of the Map
MAP_MAX_Y = 9  #Height of the Map

# Initialize a DQN model and a memory batch for storing experiences
DQNAgent = DQN(INPUTNUM, ACTIONNUM)
memory = Memory(MEMORY_SIZE)

# Initialize environment
minerEnv = MinerEnv(
    HOST, PORT
)  #Creating a communication environment between the DQN model and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv.start()  # Connect to the game

train = False  #Indicates that experience replay has started and that epsilon starts to decrease.
#Training Process
#the main part of the deep Q-learning algorithm
for episode_i in range(0, N_EPISODE):
    try:
        # Choosing a map in the list
        mapID = np.random.randint(
Example #3
map0 = [
  [-2,-2,0,-2,0,-2,600,-2,-3,-3,-2,100,-3,-2,-2,0,-3,-3,0,0,0],
  [0,-3,-3,-2,0,0,-1,0,0,-3,-2,0,0,100,-1,0,0,-1,-1,-1,-2],
  [-2,850,1100,0,-1,100,-1,450,1050,-3,-2,0,-3,350,0,0,-1,-3,-3,-2,-1],
  [-1,-3,-1,-3,0,-2,0,0,-2,-1,0,-3,400,-2,0,-3,700,-2,-3,-2,0],
  [-2,-3,-1,-3,-1,0,-1,-3,-2,-1,300,-1,0,-1,200,-1,150,-2,-3,-3,-1],
  [0,-3,-1,-3,0,-2,-3,-3,0,0,0,0,-2,300,-2,-3,-3,-3,-3,0,-1],
  [0,-3,-1,-3,-1,-1,-2,-2,0,-1,0,-2,0,-2,0,0,-2,-3,-3,0,0]
]
Map = np.array(map0.copy())
index = np.where(Map==0)
listOfCoordinates = list(zip(index[0], index[1]))
'''
# Initialize a DQN model and a memory batch for 
# storing experiences
weight_path='/home/mayleo/Documents/Inreforcement learning/miner/TrainedModels/DQNmodel_MinerLoss_ep600.h5'
DQNAgent = DQN(input_image_dim, ACTIONNUM, gamma=0.95, epsilon=epsilon, learning_rate=0.01, load_weights=None)
memory = Memory(MEMORY_SIZE)

# Initialize environment
minerEnv = MinerEnv(HOST, PORT) #Creating a communication environment between the DQN model and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv.start()  # Connect to the game
path = '/home/mayleo/Documents/Inreforcement learning/miner/Maps/'
train = False #Indicates that experience replay has started and that epsilon starts to decrease.
#Training Process
#the main part of the deep Q-learning algorithm
for episode_i in range(0, N_EPISODE):
    print('*****')
    try:
        # Choosing a map in the list
        mapID = np.random.randint(0, 1) #Choosing a map ID from 5 maps in Maps folder randomly
        maplist = [0,25,50,75,100]
Example #4
    pd.DataFrame(columns=header).to_csv(f, encoding='utf-8', index=False, header=True)

# Parameters for training a DQN model
N_EPISODE = 100000 #The number of episodes for training
MAX_STEP = 1000   #The number of steps for each episode
BATCH_SIZE = 32   #The number of experiences for each replay 
MEMORY_SIZE = 100000 #The size of the replay memory for storing experiences
SAVE_NETWORK = 1000  # After this number of episodes, the DQN model is saved for testing later. 
INITIAL_REPLAY_SIZE = 10000 #The number of experiences stored in the replay memory before replay starts
INPUTNUM = (2*limit+1)**2 + 3  # 198; the number of input values for the DQN model
ACTIONNUM = 6  #The number of actions output from the DQN model
MAP_MAX_X = 21 #Width of the Map
MAP_MAX_Y = 9  #Height of the Map

# Initialize a DQN model and a memory batch for storing experiences
DQNAgent = DQN(INPUTNUM, ACTIONNUM)
memory = Memory(MEMORY_SIZE)
bots = [Bot1(2), Bot2(3), Bot3(4)]
#load model to continue training
if args.load_model != "":
    file_name = "TrainedModels/DQNmodel_20200730-1832_ep1000out-30.json"
    json_file = file_name if args.load_model == "default" else args.load_model
    DQNAgent.load_model(json_file)

# Initialize environment
minerEnv = MinerEnv(HOST, PORT) #Creating a communication environment between the DQN model and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv.start()  # Connect to the game

train = False #Indicates that experience replay has started and that epsilon starts to decrease.
#Training Process
#the main part of the deep Q-learning algorithm
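All three of these snippets stop right before the episode body. The sketch below shows the usual shape of such a body (reset the environment, act epsilon-greedily, store the transition, replay once enough experiences have been collected); only the constants and object names come from the snippets, and every method call on minerEnv, DQNAgent, and memory is an assumption about this custom code, not something shown above.

# Hypothetical episode body; all env/agent/memory method names are assumed.
for episode_i in range(N_EPISODE):
    minerEnv.reset()                       # assumed: start a new episode on the chosen map
    s = minerEnv.get_state()               # assumed: observation vector of length INPUTNUM
    total_reward = 0
    for _ in range(MAX_STEP):
        action = DQNAgent.act(s)           # assumed: epsilon-greedy action in [0, ACTIONNUM)
        minerEnv.step(action)              # assumed: advance the game by one step
        s_next = minerEnv.get_state()
        reward = minerEnv.get_reward()     # assumed
        done = minerEnv.check_terminate()  # assumed
        memory.push(s, action, reward, done, s_next)  # assumed transition layout
        s, total_reward = s_next, total_reward + reward
        if done:
            break
    if len(memory) > INITIAL_REPLAY_SIZE:  # assumes Memory implements __len__
        DQNAgent.replay(memory.sample(BATCH_SIZE))  # assumed replay/update call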
Example #5
    init_player = MedicalPlayer(
        files_list=args.files,
        file_type=args.file_type,
        landmark_ids=args.landmarks,
        screen_dims=IMAGE_SIZE,
        # TODO: why is this always play?
        task='play',
        agents=agents,
        logger=logger)
    NUM_ACTIONS = init_player.action_space.n

    if args.task != 'train':
        # TODO: refactor DQN to not have to create both a q_network and
        # target_network
        dqn = DQN(agents,
                  frame_history=FRAME_HISTORY,
                  logger=logger,
                  type=args.model_name)
        model = dqn.q_network
        model.load_state_dict(torch.load(args.load, map_location=model.device))
        environment = get_player(files_list=args.files,
                                 file_type=args.file_type,
                                 landmark_ids=args.landmarks,
                                 saveGif=args.saveGif,
                                 saveVideo=args.saveVideo,
                                 task=args.task,
                                 agents=agents,
                                 viz=args.viz,
                                 logger=logger)
        evaluator = Evaluator(environment, model, logger, agents,
                              args.steps_per_episode)
        evaluator.play_n_episodes()
Example #6
class Trainer(object):
    def __init__(
        self,
        env,
        eval_env=None,
        image_size=(45, 45, 45),
        update_frequency=4,
        replay_buffer_size=1e6,
        init_memory_size=5e4,
        max_episodes=100,
        steps_per_episode=50,
        eps=1,
        min_eps=0.1,
        delta=0.001,
        batch_size=4,
        gamma=0.9,
        number_actions=6,
        frame_history=4,
        model_name="CommNet",
        logger=None,
        train_freq=1,
        team_reward=False,
        attention=False,
    ):
        self.env = env
        self.eval_env = eval_env
        self.agents = env.agents
        self.image_size = image_size
        self.update_frequency = update_frequency
        self.replay_buffer_size = replay_buffer_size
        self.init_memory_size = init_memory_size
        self.max_episodes = max_episodes
        self.steps_per_episode = steps_per_episode
        self.eps = eps
        self.min_eps = min_eps
        self.delta = delta
        self.batch_size = batch_size
        self.gamma = gamma
        self.number_actions = number_actions
        self.frame_history = frame_history
        self.epoch_length = self.env.files.num_files
        self.best_val_distance = float('inf')
        self.buffer = ReplayMemory(self.replay_buffer_size, self.image_size,
                                   self.frame_history, self.agents)
        self.dqn = DQN(self.agents,
                       self.frame_history,
                       logger=logger,
                       type=model_name,
                       collective_rewards=team_reward,
                       attention=attention)
        self.dqn.q_network.train(True)
        self.evaluator = Evaluator(eval_env, self.dqn.q_network, logger,
                                   self.agents, steps_per_episode)
        self.logger = logger
        self.train_freq = train_freq

    def train(self):
        self.logger.log(self.dqn.q_network)
        self.init_memory()
        episode = 1
        acc_steps = 0
        epoch_distances = []
        while episode <= self.max_episodes:
            # Reset the environment for the start of the episode.
            obs = self.env.reset()
            terminal = [False for _ in range(self.agents)]
            losses = []
            score = [0] * self.agents
            for step_num in range(self.steps_per_episode):
                acc_steps += 1
                acts, q_values = self.get_next_actions(
                    self.buffer.recent_state())
                # Step the agent once, and get the transition tuple
                obs, reward, terminal, info = self.env.step(
                    np.copy(acts), q_values, terminal)
                score = [sum(x) for x in zip(score, reward)]
                self.buffer.append((obs, acts, reward, terminal))
                if acc_steps % self.train_freq == 0:
                    mini_batch = self.buffer.sample(self.batch_size)
                    loss = self.dqn.train_q_network(mini_batch, self.gamma)
                    losses.append(loss)
                if all(t for t in terminal):
                    break
            epoch_distances.append(
                [info['distError_' + str(i)] for i in range(self.agents)])
            self.append_episode_board(info, score, "train", episode)
            if (episode * self.epoch_length) % self.update_frequency == 0:
                self.dqn.copy_to_target_network()
            self.eps = max(self.min_eps, self.eps - self.delta)
            # Every epoch
            if episode % self.epoch_length == 0:
                self.append_epoch_board(epoch_distances, self.eps, losses,
                                        "train", episode)
                self.validation_epoch(episode)
                self.dqn.save_model(name="latest_dqn.pt", forced=True)
                self.dqn.scheduler.step()
                epoch_distances = []
            episode += 1

    def init_memory(self):
        self.logger.log("Initialising memory buffer...")
        pbar = tqdm(desc="Memory buffer", total=self.init_memory_size)
        while len(self.buffer) < self.init_memory_size:
            # Reset the environment for the start of the episode.
            obs = self.env.reset()
            terminal = [False for _ in range(self.agents)]
            steps = 0
            for _ in range(self.steps_per_episode):
                steps += 1
                acts, q_values = self.get_next_actions(obs)
                obs, reward, terminal, info = self.env.step(
                    acts, q_values, terminal)
                self.buffer.append((obs, acts, reward, terminal))
                if all(t for t in terminal):
                    break
            pbar.update(steps)
        pbar.close()
        self.logger.log("Memory buffer filled")

    def validation_epoch(self, episode):
        if self.eval_env is None:
            return
        self.dqn.q_network.train(False)
        epoch_distances = []
        for k in range(self.eval_env.files.num_files):
            self.logger.log(f"eval episode {k}")
            (score, start_dists, q_values,
             info) = self.evaluator.play_one_episode()
            epoch_distances.append(
                [info['distError_' + str(i)] for i in range(self.agents)])

        val_dists = self.append_epoch_board(epoch_distances,
                                            name="eval",
                                            episode=episode)
        if (val_dists < self.best_val_distance):
            self.logger.log("Improved new best mean validation distances")
            self.best_val_distance = val_dists
            self.dqn.save_model(name="best_dqn.pt", forced=True)
        self.dqn.q_network.train(True)

    def append_episode_board(self, info, score, name="train", episode=0):
        dists = {
            str(i): info['distError_' + str(i)]
            for i in range(self.agents)
        }
        self.logger.write_to_board(f"{name}/dist", dists, episode)
        scores = {str(i): score[i] for i in range(self.agents)}
        self.logger.write_to_board(f"{name}/score", scores, episode)

    def append_epoch_board(self,
                           epoch_dists,
                           eps=0,
                           losses=[],
                           name="train",
                           episode=0):
        epoch_dists = np.array(epoch_dists)
        if name == "train":
            self.logger.write_to_board(name, {"eps": eps}, episode)
            if len(losses) > 0:
                loss_dict = {"loss": sum(losses) / len(losses)}
                self.logger.write_to_board(name, loss_dict, episode)
        for i in range(self.agents):
            mean_dist = sum(epoch_dists[:, i]) / len(epoch_dists[:, i])
            mean_dist_dict = {str(i): mean_dist}
            self.logger.write_to_board(f"{name}/mean_dist", mean_dist_dict,
                                       episode)
            min_dist_dict = {str(i): min(epoch_dists[:, i])}
            self.logger.write_to_board(f"{name}/min_dist", min_dist_dict,
                                       episode)
            max_dist_dict = {str(i): max(epoch_dists[:, i])}
            self.logger.write_to_board(f"{name}/max_dist", max_dist_dict,
                                       episode)
        return np.array(list(mean_dist_dict.values())).mean()

    def get_next_actions(self, obs_stack):
        # epsilon-greedy policy
        if np.random.random() < self.eps:
            q_values = np.zeros((self.agents, self.number_actions))
            actions = np.random.randint(self.number_actions, size=self.agents)
        else:
            actions, q_values = self.get_greedy_actions(obs_stack,
                                                        doubleLearning=True)
        return actions, q_values

    def get_greedy_actions(self, obs_stack, doubleLearning=True):
        inputs = torch.tensor(obs_stack).unsqueeze(0)
        if doubleLearning:
            q_vals = self.dqn.q_network.forward(inputs).detach().squeeze(0)
        else:
            q_vals = self.dqn.target_network.forward(inputs).detach().squeeze(
                0)
        idx = torch.max(q_vals, -1)[1]
        greedy_steps = np.array(idx, dtype=np.int32).flatten()
        return greedy_steps, q_vals.data.numpy()
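With the defaults above (eps=1, min_eps=0.1, delta=0.001) and one decay step per episode in train(), epsilon decays linearly and reaches its floor after 900 episodes. A small sketch that just reproduces that arithmetic:

# Epsilon schedule implied by train(): one subtraction of delta per episode,
# clipped at min_eps.
eps, min_eps, delta = 1.0, 0.1, 0.001
schedule = []
for _ in range(1000):
    eps = max(min_eps, eps - delta)
    schedule.append(round(eps, 3))
print(schedule[0], schedule[449], schedule[899], schedule[-1])  # 0.999 0.55 0.1 0.1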
Example #7
INITIAL_REPLAY_SIZE = 100  # The number of experiences stored in the replay memory before replay starts
INPUTNUM = 198  # The number of input values for the DQN model
ACTIONNUM = 6  # The number of actions output from the DQN model
MAP_MAX_X = 21  # Width of the Map
MAP_MAX_Y = 9  # Height of the Map
load_checkpoint = False
# Initialize a DQN model and a memory batch for storing experiences

DQNAgent = DQN(
    INPUTNUM,
    ACTIONNUM,
    batch_size=BATCH_SIZE,
    mem_size=50000,
    eps_min=0.1,
    replace=1000,
    eps_dec=1e-5,
    chkpt_dir="models/",
    algo="dqnagent",
    env_name="minerai",
    gamma=0.99,
    epsilon=1,
    lr=0.00001,
)
if load_checkpoint:
    DQNAgent.load_models()

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()

fname = (DQNAgent.algo + "_" + DQNAgent.env_name + "_lr" + str(DQNAgent.lr) +
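The fname expression above is cut off by the snippet. A typical checkpointing cadence for this kind of agent is sketched below; save_models() is assumed to exist as the counterpart of the load_models() call shown above, and N_EPISODE / SAVE_NETWORK are borrowed from the earlier examples on this page, not from this snippet.

# Hypothetical checkpointing loop; method names and constants are assumptions.
N_EPISODE = 10000
SAVE_NETWORK = 100
for episode_i in range(N_EPISODE):
    ...  # run one episode and train (see the training-loop sketch after Example #4)
    if episode_i % SAVE_NETWORK == 0:
        DQNAgent.save_models()  # assumed: writes checkpoints under chkpt_dir="models/"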
Example #8
        error_message = f"""Wrong input files {len(args.files)} for {args.task}
                            task - should be 1 \'images.txt\' """
        assert len(args.files) == 1, (error_message)
    else:
        error_message = f"""Wrong input files {len(args.files)} for
                            {args.task} task - should be 2 [\'images.txt\',
                            \'landmarks.txt\'] """
        assert len(args.files) == 2, (error_message)

    if args.seed is not None:
        set_reproducible(args.seed)

    logger = Logger(args.log_dir, args.write, args.save_freq, comment=args.log_comment)

    if args.task != 'train':
        dqn = DQN(agents, frame_history=FRAME_HISTORY, logger=logger,
                  type=args.model_name, collective_rewards=args.team_reward, attention=args.attention)
        model = dqn.q_network
        model.load_state_dict(torch.load(args.load, map_location=model.device))
        environment = get_player(files_list=args.files,
                                 file_type=args.file_type,
                                 landmark_ids=args.landmarks,
                                 saveGif=args.saveGif,
                                 saveVideo=args.saveVideo,
                                 task=args.task,
                                 agents=agents,
                                 viz=args.viz,
                                 logger=logger)
        evaluator = Evaluator(environment, model, logger, agents,
                              args.steps_per_episode)
        evaluator.play_n_episodes(fixed_spawn=args.fixed_spawn)
    else:  # train model
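The snippet ends here, before the body of the train branch. On this page that branch would plausibly build a training environment and hand it to a Trainer like the one in Example #6; the sketch below is an assumption along those lines, reusing only names that appear in the examples above (get_player, Trainer, FRAME_HISTORY), not the original code.

# Hypothetical train branch, modeled on the eval branch above and on Example #6.
environment = get_player(files_list=args.files,
                         file_type=args.file_type,
                         landmark_ids=args.landmarks,
                         task='train',
                         agents=agents,
                         logger=logger)
eval_environment = get_player(files_list=args.files,
                              file_type=args.file_type,
                              landmark_ids=args.landmarks,
                              task='eval',  # assumed task value for validation
                              agents=agents,
                              logger=logger)
trainer = Trainer(environment,
                  eval_env=eval_environment,
                  frame_history=FRAME_HISTORY,
                  model_name=args.model_name,
                  logger=logger,
                  team_reward=args.team_reward,
                  attention=args.attention)
trainer.train()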