import random

import numpy as np

# AgentMemory is the project's replay buffer class, defined elsewhere.


class Agent:
    """Base class implementing shared functionality for the different
    Deep Q Learning methods."""

    def __init__(self, env_name, input_dims, num_actions, learning_rate=2e-4,
                 discount_factor=0.99, eps=1.0, eps_decrement_factor=1e-5,
                 eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)
        self.env_name = env_name

    def get_greedy_action(self, observation):
        # subclasses pick the greedy action from their own network architecture
        raise NotImplementedError

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        # dtype=object keeps the heterogeneous fields (arrays, int, float,
        # bool) from being coerced into a single numeric dtype
        return np.array(
            [state, int(action), float(reward), new_state, bool(done)],
            dtype=object)

    def get_random_action(self, observation):
        # random.randint is inclusive of the final value
        return random.randint(0, self.num_actions - 1)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def sample_memory(self):
        return self.memory_bank.recall_batch(self.mini_batch_size)

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
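# The Agent above delegates experience replay to an AgentMemory class that
# lives elsewhere in the project. The sketch below is a minimal stand-in
# inferred only from the remember/recall_batch calls above; the real
# implementation may store and sample transitions differently.
import random
from collections import deque

import numpy as np


class AgentMemory:
    """Minimal replay-buffer sketch (assumed interface, not the real class)."""

    def __init__(self, max_size):
        # deque evicts the oldest transition automatically once full
        self.buffer = deque(maxlen=max_size)

    def __len__(self):
        return len(self.buffer)

    def remember(self, memory):
        # memory is the 5-field array built by Agent.make_memory
        self.buffer.append(memory)

    def recall_batch(self, batch_size):
        # uniform random sample without replacement, transposed into
        # per-field arrays: states, actions, rewards, new_states, dones
        batch = random.sample(self.buffer, batch_size)
        return [np.array(field) for field in zip(*batch)]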
import random

import numpy as np
import torch as T

# DeepQCNN and AgentMemory are defined elsewhere in the project.


class Agent:
    """Agent for vanilla Deep Q Learning with separate online and target
    convolutional networks."""

    def __init__(self, input_dims, num_actions, learning_rate=2e-4,
                 discount_factor=0.99, eps=1.0, eps_decrement_factor=1e-5,
                 eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.online_network = DeepQCNN(
            input_dims, self.num_actions, name='OnlineNetwork')
        self.target_network = DeepQCNN(
            input_dims, self.num_actions, name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        return np.array(
            [state, int(action), float(reward), new_state, bool(done)],
            dtype=object)

    def get_greedy_action(self, observation):
        # convert the observation to a tensor on the online network's device,
        # forward pass, then take the argmax over the predicted action values
        obs_t = T.tensor(observation).to(
            self.online_network.device, dtype=T.float)
        actions = self.online_network.forward(obs_t)
        return actions.argmax().item()

    def get_random_action(self, observation):
        # random.randint is inclusive of the final value
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        # TODO: sample a mini-batch of (state, action, reward, new_state,
        # done) transitions via self.memory_bank.recall_batch(
        # self.mini_batch_size) and fit the online network against
        # bootstrapped targets
        pass

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        pass

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
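# train_online_network is still a stub above. For vanilla DQN the update
# would typically look like the sketch below: sample a mini-batch, compute
# Q(s, a) with the online network, bootstrap the target from the frozen
# target network, and take a gradient step. This sketch assumes recall_batch
# returns per-field arrays, that the memory bank supports len(), and that
# DeepQCNN exposes `optimizer` and `loss` attributes; none of those
# assumptions is confirmed by the code above.
import numpy as np
import torch as T


def train_online_network(self):
    """Method sketch for the Agent class above (assumed interfaces)."""
    # skip the update until the replay buffer can fill one mini-batch
    if len(self.memory_bank) < self.mini_batch_size:
        return

    states, actions, rewards, new_states, dones = \
        self.memory_bank.recall_batch(self.mini_batch_size)
    device = self.online_network.device
    states_t = T.tensor(np.stack(states), dtype=T.float, device=device)
    actions_t = T.tensor(actions.astype(np.int64), device=device)
    rewards_t = T.tensor(rewards.astype(np.float32), device=device)
    new_states_t = T.tensor(np.stack(new_states), dtype=T.float, device=device)
    dones_t = T.tensor(dones.astype(bool), device=device)

    # Q(s, a) for the actions that were actually taken
    batch_idx = T.arange(self.mini_batch_size, device=device)
    q_pred = self.online_network.forward(states_t)[batch_idx, actions_t]

    # bootstrapped target from the frozen target network; terminal states
    # contribute only their immediate reward
    with T.no_grad():
        q_next = self.target_network.forward(new_states_t).max(dim=1)[0]
        q_next[dones_t] = 0.0
        q_target = rewards_t + self.discount_factor * q_next

    self.online_network.optimizer.zero_grad()
    loss = self.online_network.loss(q_target, q_pred)
    loss.backward()
    self.online_network.optimizer.step()
    self.decrement_epsilon()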
import random

import numpy as np
import torch as T

# DualDeepQCNN and AgentMemory are defined elsewhere in the project.


class Agent:
    """Agent for Dueling Deep Q Learning, where the network outputs separate
    state-value and advantage streams."""

    def __init__(self, input_dims, num_actions, learning_rate=2e-4,
                 discount_factor=0.99, eps=1.0, eps_decrement_factor=1e-5,
                 eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.online_network = DualDeepQCNN(
            input_dims, self.num_actions, name='OnlineNetwork')
        self.target_network = DualDeepQCNN(
            input_dims, self.num_actions, name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        new_eps = self.eps - self.eps_decrement_factor
        self.eps = new_eps if new_eps > self.eps_min else self.eps_min

    def store_memory(self, memory):
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        return np.array(
            [state, int(action), float(reward), new_state, bool(done)],
            dtype=object)

    def get_greedy_action(self, observation):
        # convert the observation to a tensor on the online network's device,
        # then forward pass through both streams
        obs_t = T.tensor(observation).to(
            self.online_network.device, dtype=T.float)
        # the state value V(s) and the subtracted mean advantage shift every
        # action's estimate by the same amount, so they cannot change the
        # ordering of actions; the argmax over the advantage stream suffices
        action_v, action_a = self.online_network.forward(obs_t)
        return action_a.argmax().item()

    def get_random_action(self, observation):
        # random.randint is inclusive of the final value
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        pass

    def save_models(self):
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        pass

    def copy_online_nn_to_target_nn(self):
        self.target_network.load_state_dict(self.online_network.state_dict())
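# For reference: the dueling architecture recombines its two streams as
# Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). The sketch below shows the
# kind of two-stream head DualDeepQCNN is assumed to end in; the layer
# sizes and names are illustrative, and the real class also carries the
# conv stack, checkpointing, and a device attribute.
import torch as T
import torch.nn as nn


class DuelingHeadSketch(nn.Module):
    """Illustrative dueling head; not the project's DualDeepQCNN."""

    def __init__(self, feature_dims, num_actions):
        super().__init__()
        self.fc = nn.Linear(feature_dims, 512)
        self.value_stream = nn.Linear(512, 1)                # V(s)
        self.advantage_stream = nn.Linear(512, num_actions)  # A(s, a)

    def forward(self, x):
        h = T.relu(self.fc(x))
        return self.value_stream(h), self.advantage_stream(h)


def combine_streams(value, advantage):
    # subtracting the mean advantage keeps V and A identifiable without
    # changing which action has the highest Q, which is why
    # get_greedy_action can take the argmax over the advantages alone
    return value + (advantage - advantage.mean(dim=-1, keepdim=True))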