# NOTE(review): the line below is a whitespace-collapsed fragment -- its
# original line breaks and indentation are missing.  As shown here, Python
# would execute only the first assignment and treat everything after the first
# '#' as a comment.  The fragment also appears to start inside a definition
# whose header is not visible (presumably the `update` callback scheduled in
# the entry point) -- confirm against the full file before re-indenting.
#
# Sequence of statements the text spells out, in order:
#   1. (ms, ma) = env_model.sample_s_a(); (mr, ms_) = env_model.get_r_s_(ms, ma);
#      then RL.learn(ms, ma, mr, str(ms_)) -- one learning step on a transition
#      obtained from env_model.  The inline note says `ms` is already a str.
#   2. Four debugging prints (env_model.database, RL.q_table and separators)
#      are left commented out.
#   3. Both tracked states advance: s = s_, s2 = s2_.
#   4. If `done`:  s = env.reset(), then break.
#      If `done2`: s2 = env.reset2(), then break.
#      (The loop these `break`s leave is not visible here.)
#   5. "end of game": print 'game over', print RL.q_table, call env.destroy().
#   6. Entry point (`__main__` guard): build Maze(), then EnvModel and
#      QLearningTable with actions=list(range(env.n_actions)), schedule
#      `update` with env.after(0, update) and start env.mainloop().
ms, ma = env_model.sample_s_a() # ms in here is a str mr, ms_ = env_model.get_r_s_(ms, ma) RL.learn(ms, ma, mr, str(ms_)) # print(env_model.database) # print('################') # print(RL.q_table) # print('################') s = s_ s2 = s2_ if done: s = env.reset() break if done2: s2 = env.reset2() break # end of game print('game over') print(RL.q_table) env.destroy() if __name__ == "__main__": env = Maze() env_model = EnvModel(actions=list(range(env.n_actions))) RL = QLearningTable(actions=list(range(env.n_actions))) env.after(0, update) env.mainloop()
# NOTE(review): the line below is a whitespace-collapsed fragment -- its
# original line breaks and indentation are missing, so as shown here it is not
# valid Python (several statements follow `while True:` on one physical line).
# It reads `s` and schedules `update`, neither of which is defined in the
# visible text; presumably this loop is the body of `update` -- confirm
# against the full file before re-indenting.
#
# Sequence of statements the text spells out, in order:
#   1. Loop forever: env.render(); a = RL.choose_action(str(s));
#      (s_, r, done) = env.step(a); RL.learn(str(s), a, r, str(s_)).
#   2. env_model.store_transition(str(s), a, r, s_) records the transition
#      (the current state is passed as a str, the next state as returned by
#      env.step).
#   3. for n in range(10): (ms, ma) = env_model.sample_s_a();
#      (mr, ms_) = env_model.get_r_s_(ms, ma); RL.learn(ms, ma, mr, str(ms_))
#      -- ten extra learning steps per real step, drawn from env_model.
#   4. s = s_; break out of the loop when `done` is truthy.
#   5. "end of game": print 'game over' and call env.destroy().
#   6. Entry point (`__main__` guard): build Maze(), then QLearningTable and
#      EnvModel with actions=list(range(env.n_actions)), schedule `update`
#      with env.after(0, update) and start env.mainloop().
while True: env.render() a = RL.choose_action(str(s)) s_, r, done = env.step(a) RL.learn(str(s), a, r, str(s_)) # use a model to output (r, s_) by inputting (s, a) (given a state and an action, output its reward and the next state) # the model in dyna Q version is just like a memory replay buffer env_model.store_transition(str(s), a, r, s_) for n in range(10): # learn 10 more times using the env_model ms, ma = env_model.sample_s_a() # ms in here is a str mr, ms_ = env_model.get_r_s_(ms, ma) RL.learn(ms, ma, mr, str(ms_)) s = s_ if done: break # end of game print('game over') env.destroy() if __name__ == "__main__": env = Maze() RL = QLearningTable(actions=list(range( env.n_actions))) # direct reinforcement learning: optimize the policy model through environment experience env_model = EnvModel(actions=list(range(env.n_actions))) # optimize both the policy and the model through environment experience env.after(0, update) env.mainloop()