                else:
                    Q.append(r + self.gamma * self.v[s])
            # Greedy update: pick the action with the largest one-step backup value
            self.pi[state] = self.actions[Q.index(max(Q))]
            # self.pi[state] = Q.index(max(Q))

    def policy_iterate(self):
        for i in range(100):
            # Policy evaluation: updates the state-value function v
            self.policy_evaluate()
            # Policy improvement: updates the policy pi
            self.policy_improve()


if __name__ == "__main__":
    yuanyang = YuanYangEnv()
    policy_value = Policy_Value(yuanyang)
    policy_value.policy_iterate()
    flag = 1
    s = 0
    # print(policy_value.pi)
    step_num = 0
    # Render the optimal path found by policy iteration
    while flag:
        a = policy_value.pi[s]
        print('%d->%s\t' % (s, a))
        yuanyang.bird_male_position = yuanyang.state_to_position(s)
        yuanyang.render()
        time.sleep(0.2)
        step_num += 1
        yuanyang.state = s
    def e_greedy_policy(self, s):
        # Epsilon-greedy: exploit the best known action with probability 1 - epsilon,
        # otherwise explore with a uniformly random action
        a_max = np.argmax(self.Q[s])
        if np.random.uniform() < 1 - self.epislon:
            return self.env.actions[a_max]
        else:
            return self.env.actions[int(random.random() * len(self.env.actions))]

    # After training, run the trial with the purely greedy policy
    def greedy_policy(self, s):
        a_max = np.argmax(self.Q[s])
        return self.env.actions[a_max]


if __name__ == '__main__':
    env = YuanYangEnv()
    agent = Qlearning(env)
    agent.td_learning()
    print(np.sum(agent.Q))
    flag = 1
    s = 0
    step_num = 0
    # Render the optimal path found by Q-learning
    while flag:
        a = agent.greedy_policy(s)
        print('%d->%s\t' % (s, a))
        env.bird_male_position = env.state_to_position(s)
        env.render()
        time.sleep(0.2)
        step_num += 1
        env.state = s
        self.pi = self.policy_improvment()

    def choose_action(self, s):
        # Sample an action from the stochastic (soft) policy pi(s, .)
        a_select = 0
        rd = random.random()
        prob = 0
        for i in range(len(self.env.actions)):
            prob += self.pi[s][i]
            if rd <= prob:
                a_select = self.env.actions[i]
                break
        return a_select


if __name__ == '__main__':
    env = YuanYangEnv()
    agent = DP_soft(env, 0)
    agent.policy_iteration()
    # Recover the action-value function Q(s, a) from the converged state values V
    Q = np.zeros((len(env.states), len(env.actions)), dtype=np.float32)
    for state in env.states:
        if env.is_terminal(state):
            continue
        for action in env.actions:
            # state = int(state)
            # action = int(action)
            env.state = state  # set the state to back up from (step() acts on env.state)
            next_state, r, done = env.step(action)
            if done:
                Q[state, action] = r
            else:
                Q[state, action] = r + agent.gamma * agent.V[next_state]
    print("Sum of the action-value function:", np.sum(Q))