Example #1
import random

import numpy as np

# AgentMemory is the project's replay buffer class, defined elsewhere in the repo
class Agent():
    '''Base class implementing shared functionality for the Deep Q Learning agents;
    subclasses are expected to define self.online_network and self.target_network.'''
    def __init__(self,
                 env_name,
                 input_dims,
                 num_actions,
                 learning_rate=2e-4,
                 discount_factor=0.99,
                 eps=1.0,
                 eps_decrement_factor=1e-5,
                 eps_min=0.1,
                 replay_memory_size=10000,
                 mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)
        self.env_name = env_name

    def get_greedy_action(self, observation):
        raise NotImplementedError

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the differently shaped fields together in one array
        return np.array([state,
                         int(action),
                         float(reward),
                         new_state,
                         bool(done)], dtype=object)
    
    def get_random_action(self, observation):
        # randint's upper bound is inclusive, so sample from [0, num_actions - 1]
        return random.randint(0, self.num_actions - 1)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min
    
    def sample_memory(self):
        return self.memory_bank.recall_batch(self.mini_batch_size)

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
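
All three examples rely on an AgentMemory replay buffer that is defined elsewhere in the project. Below is a minimal sketch of a buffer exposing the two methods used above, remember and recall_batch; the deque-based storage and uniform sampling are assumptions for illustration, not the original implementation.

import random
from collections import deque

import numpy as np


class AgentMemory():
    '''Illustrative replay buffer with the interface the Agent classes expect.'''
    def __init__(self, max_size):
        # oldest transitions are evicted automatically once max_size is reached
        self.buffer = deque(maxlen=max_size)

    def remember(self, memory):
        # memory is the object array produced by Agent.make_memory
        self.buffer.append(memory)

    def recall_batch(self, batch_size):
        # uniform random sample without replacement, stacked into (batch_size, 5)
        batch = random.sample(self.buffer, batch_size)
        return np.stack(batch)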
Example #2
import random

import numpy as np
import torch as T

# AgentMemory and DeepQCNN are defined elsewhere in the project
class Agent():
    def __init__(self,
                 input_dims,
                 num_actions,
                 learning_rate=2e-4,
                 discount_factor=0.99,
                 eps=1.0,
                 eps_decrement_factor=1e-5,
                 eps_min=0.1,
                 replay_memory_size=10000,
                 mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        #self.Q = LinearDQN(learning_rate, num_actions, input_dims)
        self.online_network = DeepQCNN(
            input_dims, self.num_actions, name='OnlineNetwork')
        self.target_network = DeepQCNN(
            input_dims, self.num_actions, name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the differently shaped fields together in one array
        return np.array([state,
                         int(action),
                         float(reward),
                         new_state,
                         bool(done)], dtype=object)

    def get_greedy_action(self, observation):
        # convert obs to tensor, move to the online network's device, forward
        # pass through the online network, then argmax over the action values
        obs_t = T.tensor(observation).to(
            self.online_network.device, dtype=T.float)
        action_values = self.online_network.forward(obs_t)

        return action_values.argmax().item()

    def get_random_action(self, observation):
        # randint's upper bound is inclusive, so sample from [0, num_actions - 1]
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        # planned: recall a mini-batch from replay memory and fit the online network on it
        # replay_memory_training_data = self.memory_bank.recall_batch(self.mini_batch_size)
        # expects an array of shape (batch_size, 2): (training data, targets)
        # self.online_network.fit()
        pass

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        pass

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
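
train_online_network and update_target_network are left as stubs here. The following sketch shows the standard DQN update such a method would perform, assuming the networks are ordinary torch.nn.Module instances that map a batch of states to Q-values and that the replayed transitions have already been converted to tensors on the right device; the function name dqn_update, the optimizer argument, and the batch layout are assumptions, not the project's API.

import torch as T
import torch.nn.functional as F


def dqn_update(online_network, target_network, optimizer, batch, discount_factor):
    '''One gradient step on the online network using TD targets from the target network.'''
    states, actions, rewards, new_states, dones = batch

    # Q(s, a) for the actions that were actually taken
    q_pred = online_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # bootstrap target r + gamma * max_a' Q_target(s', a'), zeroed on terminal states
    with T.no_grad():
        q_next = target_network(new_states).max(dim=1).values
        q_target = rewards + discount_factor * q_next * (~dones)

    loss = F.mse_loss(q_pred, q_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

Under this scheme, copy_online_nn_to_target_nn (already implemented above) is what update_target_network would invoke every fixed number of learning steps to refresh the bootstrap targets.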
Example #3
import random

import numpy as np
import torch as T

# AgentMemory and DualDeepQCNN are defined elsewhere in the project
class Agent():
    def __init__(self,
                 input_dims,
                 num_actions,
                 learning_rate=2e-4,
                 discount_factor=0.99,
                 eps=1.0,
                 eps_decrement_factor=1e-5,
                 eps_min=0.1,
                 replay_memory_size=10000,
                 mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        #self.Q = LinearDQN(learning_rate, num_actions, input_dims)
        self.online_network = DualDeepQCNN(input_dims,
                                           self.num_actions,
                                           name='OnlineNetwork')
        self.target_network = DualDeepQCNN(input_dims,
                                           self.num_actions,
                                           name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the differently shaped fields together in one array
        return np.array(
            [state,
             int(action),
             float(reward), new_state,
             bool(done)], dtype=object)

    def get_greedy_action(self, observation):
        # convert obs to tensor, move to the online network's device, forward
        # pass, then argmax over the advantage stream
        obs_t = T.tensor(observation).to(self.online_network.device,
                                         dtype=T.float)

        # The state value V(s) and the mean advantage shift every action's score
        # by the same amount, so the argmax over the raw advantages already picks
        # the greedy action without assembling the full dueling Q-values.
        action_v, action_a = self.online_network.forward(obs_t)
        return action_a.argmax().item()

    def get_random_action(self, observation):
        # randint's upper bound is inclusive, so sample from [0, num_actions - 1]
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        pass

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        pass

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
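
The comment in get_greedy_action refers to how a dueling network combines its two output streams into Q-values. A small sketch of that aggregation, assuming DualDeepQCNN.forward returns a state-value tensor of shape (batch, 1) and an advantage tensor of shape (batch, num_actions); the helper name and shapes are assumptions for illustration.

import torch as T


def dueling_q_values(action_v, action_a):
    '''Combine value and advantage streams: Q = V + (A - mean(A)).'''
    return action_v + (action_a - action_a.mean(dim=1, keepdim=True))


# Both extra terms are constant across actions for a given state, so the argmax
# over Q equals the argmax over A, which is why the agent can pick its greedy
# action straight from the advantage stream.
if __name__ == '__main__':
    v = T.tensor([[3.0]])
    a = T.tensor([[1.0, 4.0, 2.0]])
    q = dueling_q_values(v, a)
    assert q.argmax(dim=1).item() == a.argmax(dim=1).item()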