class DQNAgent(Agent):
    """Deep Q-Network agent with an epsilon-greedy exploration policy.

    Wraps a ``DQN`` network, selecting random actions with probability
    ``epsilon`` during training and greedy (argmax-Q) actions otherwise.
    Epsilon is multiplicatively decayed after each *training* action.
    """

    def __init__(self, state_size, num_actions, batch_size=64, gamma=0.999,
                 epsilon=0.9, epsilon_decay=0.99995, **kwargs):
        """Create the agent and its underlying Q-network.

        Args:
            state_size: dimensionality of the observation passed to the net.
            num_actions: number of discrete actions available.
            batch_size: transitions sampled from the replay buffer per
                optimization step.
            gamma: discount factor for future rewards.
            epsilon: initial exploration probability.
            epsilon_decay: multiplicative decay applied to epsilon after
                every action taken in training mode.
            **kwargs: forwarded to both the ``Agent`` base and ``DQN``.
        """
        super().__init__(state_size, num_actions, **kwargs)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.net = DQN(state_size, num_actions, **kwargs)

    def get_action(self, state: np.ndarray):
        """Return an action for ``state``.

        In ``'train'`` mode the choice is epsilon-greedy and epsilon is
        decayed afterwards; in any other mode the greedy action is returned.

        FIX: previously epsilon decayed on *every* call, so evaluation
        runs silently reduced the exploration rate used by later training.
        Decay now happens only in training mode.
        """
        if self.mode == 'train':
            if np.random.random() < self.epsilon:
                action = np.random.randint(self.num_actions)
            else:
                action = np.argmax(self.get_q_values(state), axis=-1)
            self.epsilon *= self.epsilon_decay
        else:
            action = np.argmax(self.get_q_values(state), axis=-1)
        return action

    def get_q_values(self, state: np.ndarray) -> np.ndarray:
        """Predict Q-values for ``state`` as a detached numpy array."""
        return self.net.predict(state).detach().cpu().numpy()  # shape = (b, m, c)

    def optimize(self):
        """Run one training step on a minibatch from the replay buffer.

        No-op when the buffer cannot yet supply a full batch (``sample``
        returns ``None``).
        """
        batch: List[Transition] = self.buffer.sample(self.batch_size)
        if batch is None:
            return
        self.net.optimize(batch, self.gamma)

    def save_model(self, model_save_path: str):
        """Persist the network weights to ``model_save_path``."""
        self.net.save_model(model_save_path)
class DQNCropAgent(CropAgent):
    """DQN agent for the crop environment with SARFA saliency support.

    The discrete action space is the Cartesian product of the water,
    nitrogen and phosphorus value sets declared on ``CropAgent``; the
    network predicts one Q-value per combination and ``idx_to_action``
    maps the chosen index back to concrete amounts.
    """

    def __init__(self, state_size, _num_actions, batch_size=64, gamma=0.999,
                 epsilon=0.9, epsilon_decay=0.99995, **kwargs):
        """Create the agent and its underlying Q-network.

        Args:
            state_size: dimensionality of the observation passed to the net.
            _num_actions: ignored; the action count is derived from the
                WATER/NITROGEN/PHOSPHORUS value sets instead.
            batch_size: transitions sampled from the replay buffer per
                optimization step.
            gamma: discount factor for future rewards.
            epsilon: initial exploration probability.
            epsilon_decay: multiplicative decay applied to epsilon after
                every action taken in training mode.
            **kwargs: forwarded to both the ``CropAgent`` base and ``DQN``.
        """
        # One action per (water, nitrogen, phosphorus) combination; these
        # value sets are class attributes of CropAgent (defined elsewhere).
        num_actions = len(self.WATER_VALUES) * len(self.NITROGEN_VALUES) \
            * len(self.PHOSPHORUS_VALUES)
        super().__init__(state_size, num_actions, **kwargs)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.net = DQN(state_size, self.num_actions, **kwargs)

    def get_action(self, state: np.ndarray):
        """Return concrete action values for ``state``.

        In ``'train'`` mode the action index is epsilon-greedy and epsilon
        is decayed afterwards; otherwise the greedy index is used. The
        index is converted to actual amounts via ``idx_to_action``.

        FIX: previously epsilon decayed on *every* call, so evaluation
        runs silently reduced the exploration rate used by later training.
        Decay now happens only in training mode.
        """
        if self.mode == 'train':
            if np.random.random() < self.epsilon:
                action_idx = np.random.randint(self.num_actions)
            else:
                action_idx = self.get_q_values(state).argmax(axis=-1)
            self.epsilon *= self.epsilon_decay
        else:
            action_idx = self.get_q_values(state).argmax(axis=-1)
        # convert action index to actual action values
        return self.idx_to_action(action_idx)

    def get_q_values(self, state: np.ndarray) -> np.ndarray:
        """Predict Q-values for ``state`` as a detached numpy array."""
        return self.net.predict(state).detach().cpu().numpy()

    def get_saliency(self, state: np.ndarray, q_values: np.ndarray) -> np.ndarray:
        """Compute a per-feature SARFA saliency map for ``state``.

        Each state feature is perturbed (via ``CropAgent.perturb``,
        defined elsewhere) and the resulting change in the Q-value
        distribution is scored with ``computeSaliencyUsingSarfa``,
        averaged over ``SALIENCY_TRIALS`` randomized rounds.

        Args:
            state: a single (non-batched) state of size ``state_size``.
            q_values: Q-values predicted for the unperturbed state.

        Returns:
            Array with the same shape as ``state`` holding one saliency
            score per feature.
        """
        # NOTE: a batched state would break the per-feature perturbation loop.
        assert state.size == self.state_size, "saliency cannot be computed during training"
        self.update_state_value_range(state)
        saliency = np.zeros_like(state)
        action: int = q_values.argmax()
        # Q-values are scaled down by 100 before the SARFA softmax —
        # presumably to temper the distribution; confirm against the
        # SARFA implementation.
        q_values_dict = {i: q / 100 for i, q in enumerate(q_values.squeeze())}
        for _ in range(self.SALIENCY_TRIALS):
            for i in range(self.state_size):
                perturbed_state = self.perturb(state, i)
                perturbed_q_values = self.get_q_values(perturbed_state)
                perturbed_q_values_dict = {
                    j: q / 100
                    for j, q in enumerate(perturbed_q_values.squeeze())
                }
                saliency[i] += computeSaliencyUsingSarfa(
                    action, q_values_dict,
                    perturbed_q_values_dict)[0] / self.SALIENCY_TRIALS
        return saliency

    def optimize(self):
        """Run one training step on a minibatch from the replay buffer.

        No-op when the buffer cannot yet supply a full batch (``sample``
        returns ``None``).
        """
        batch: List[Transition] = self.buffer.sample(self.batch_size)
        if batch is None:
            return
        self.net.optimize(batch, self.gamma)

    def save_model(self, model_save_path: str):
        """Persist the network weights to ``model_save_path``."""
        self.net.save_model(model_save_path)