def choose_action(self, state): action2q = {a: self.q[(state, a)] for a in Action.get_actions()} if random.random() < self.epsilon: return random.choice(action2q.keys()) max_q = max(action2q.values()) best_actions = [a for a in action2q if action2q[a] == max_q] return random.choice(best_actions)
def learn_q(self, last_state, action, reward, now_state): max_state_value = max( [self.q[(now_state, a)] for a in Action.get_actions()]) # 全部使用reward初始化 if (last_state, action) not in self.q: self.q[(last_state, action)] = reward self.q[( last_state, action)] += self.alpha * (reward + self.gamma * max_state_value - self.q[(last_state, action)])