import numpy as np


def _update_Q(self, experience_tuple, space):
    """
    Update the Q-table from one experience tuple.
    :param experience_tuple: (s, a, s', r)
    :param space: observation space used to discretize states
    :return: absolute change in the updated Q-value
    """
    if not self.training_mode:
        return 0
    s, a, sp, r = experience_tuple
    s_idx = discretize_state(s, space)
    sp_idx = discretize_state(sp, space)
    prev_Q = self.Q[s_idx, a]
    # Q-learning update: move the estimate toward the TD target
    # r + gamma * max_a' Q(s', a')
    updated_Q = prev_Q + self.alpha * (
        r + self.gamma * np.max(self.Q[sp_idx]) - prev_Q)
    self.Q[s_idx, a] = updated_Q
    self.alpha *= self.alpha_decay_rate
    return abs(updated_Q - prev_Q)
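# `_update_Q` and the query methods below rely on a `discretize_state`
# helper that is not shown in this section. A minimal sketch of one possible
# implementation, assuming `space` is a Gym-style `Box` with finite
# `low`/`high` bounds and uniform binning (the `bins` parameter is a
# hypothetical addition, not part of the original code):

def discretize_state(s, space, bins=10):
    """Map a continuous observation to a single Q-table row index."""
    # Normalize each dimension into [0, 1] using the space bounds, bucket it
    # into `bins` intervals, then flatten the per-dimension bucket indices
    # into one integer (row-major), so the Q-table needs bins ** n_dims rows.
    ratios = (np.clip(s, space.low, space.high) - space.low) \
        / (space.high - space.low)
    idx = np.minimum((ratios * bins).astype(int), bins - 1)
    return int(np.ravel_multi_index(idx, [bins] * len(idx)))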
def _query(self, s, a, sp, r, space):
    """
    Update the Q-table and select the next action (epsilon-greedy).
    :param s: previous state
    :param a: action selected in the previous state
    :param sp: new state
    :param r: immediate reward
    :param space: observation space used to discretize states
    :return: (selected action, absolute change in the updated Q-value)
    """
    delta_Q = self._update_Q((s, a, sp, r), space)
    self.total_actions += 1

    # Dyna-Q: instead of learning T and R models, replay experience tuples
    # sampled from in-memory historical data
    if self.dyna > 0:
        self.memory.append((s, a, sp, r))
        # Hallucinate: apply `dyna` extra Q-updates from remembered experience
        for _ in range(self.dyna):
            self._update_Q(self.memory[np.random.choice(len(self.memory))],
                           space)

    # Epsilon-greedy action selection
    if np.random.random() < self.epsilon:
        self.random_actions += 1
        action = np.random.choice(self.na)
        # Decay exploration each time a random action is taken
        self.epsilon *= self.epsilon_decay_rate
    else:
        action = int(np.argmax(self.Q[discretize_state(sp, space)]))

    # Remember current state and action for the next call
    self.s = sp
    self.a = action
    return action, delta_Q
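# These methods reference several attributes (`Q`, `na`, `alpha`, `gamma`,
# `epsilon`, `dyna`, `memory`, and the action counters) that must be
# initialized elsewhere in the class. A plausible constructor sketch; the
# class name, parameter names, and every default value here are assumptions,
# not the original code:

from collections import deque


class QLearner:
    def __init__(self, num_states, num_actions, alpha=0.2, gamma=0.9,
                 epsilon=0.5, epsilon_decay_rate=0.999,
                 alpha_decay_rate=0.999, dyna=0, training_mode=True):
        self.na = num_actions                  # number of discrete actions
        self.alpha = alpha                     # learning rate
        self.gamma = gamma                     # discount factor
        self.epsilon = epsilon                 # exploration probability
        self.epsilon_decay_rate = epsilon_decay_rate
        self.alpha_decay_rate = alpha_decay_rate
        self.dyna = dyna                       # hallucinated updates per step
        self.training_mode = training_mode
        self.Q = np.zeros((num_states, num_actions))
        self.memory = deque(maxlen=100000)     # (s, a, s', r) history for Dyna-Q
        self.s, self.a = None, None            # last state and action
        self.total_actions = 0
        self.random_actions = 0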
def _query_initial(self, s, space):
    """
    Select an action without updating the Q-table (start of an episode).
    :param s: initial state
    :param space: observation space used to discretize states
    :return: selected action
    """
    self.total_actions += 1
    if np.random.random() < self.epsilon:
        self.random_actions += 1
        action = np.random.choice(self.na)
    else:
        action = int(np.argmax(self.Q[discretize_state(s, space)]))
    self.epsilon *= self.epsilon_decay_rate
    # Remember current state and action for the next call
    self.s = s
    self.a = action
    return action
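# Putting the pieces together: `_query_initial` runs once at the start of
# each episode, then `_query` runs on every subsequent step. A usage sketch
# against the classic Gym API (pre-0.26 `reset`/`step` signatures); the
# environment name, episode count, and `dyna` value are placeholders, and
# `num_states = bins ** n_dims` matches the binning sketch above:

import gym

env = gym.make("MountainCar-v0")
learner = QLearner(num_states=10 ** env.observation_space.shape[0],
                   num_actions=env.action_space.n, dyna=20)

for episode in range(500):
    s = env.reset()
    action = learner._query_initial(s, env.observation_space)
    done = False
    while not done:
        sp, r, done, _ = env.step(action)
        # Pass the previous state/action the learner remembered, plus the
        # observed transition
        action, _ = learner._query(learner.s, learner.a, sp, r,
                                   env.observation_space)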