Example #1
                ms, ma = env_model.sample_s_a()  # ms here is a str
                mr, ms_ = env_model.get_r_s_(ms, ma)
                RL.learn(ms, ma, mr, str(ms_))

                # print(env_model.database)
                # print('################')
                # print(RL.q_table)
                # print('################')
            s = s_
            s2 = s2_
            if done:
                s = env.reset()
                break

            if done2:
                s2 = env.reset2()
                break

    # end of game
    print('game over')
    print(RL.q_table)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    env_model = EnvModel(actions=list(range(env.n_actions)))
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(0, update)
    env.mainloop()
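
The Dyna-Q loop above relies on an EnvModel that works like a small deterministic memory: store_transition records the outcome of (s, a), sample_s_a returns a previously seen state-action pair, and get_r_s_ replays the stored reward and next state. The dict-based class below is only a sketch of that interface (the original implementation is not shown here); the database attribute name matches the commented-out print in the loop.

import random


class EnvModel:
    """Toy deterministic model for Dyna-Q: it simply memorises observed
    transitions, like a replay buffer keyed by (state, action)."""

    def __init__(self, actions):
        self.actions = actions   # list of possible actions
        self.database = {}       # {state_str: {action: (reward, next_state)}}

    def store_transition(self, s, a, r, s_):
        # remember the outcome of taking action a in state s
        self.database.setdefault(s, {})[a] = (r, s_)

    def sample_s_a(self):
        # draw a previously visited state and one of the actions tried there
        s = random.choice(list(self.database.keys()))
        a = random.choice(list(self.database[s].keys()))
        return s, a

    def get_r_s_(self, s, a):
        # replay the stored reward and next state for (s, a)
        r, s_ = self.database[s][a]
        return r, s_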

Example #2

def update():
    for episode in range(100):
        s = env.reset()
        while True:
            env.render()
            a = RL.choose_action(str(s))
            s_, r, done = env.step(a)
            RL.learn(str(s), a, r, str(s_))

            # use a model to output (r, s_) by inputting (s, a):
            # given a state and an action, it outputs the reward and the next state
            # the model in this Dyna-Q version is just like a memory replay buffer
            env_model.store_transition(str(s), a, r, s_)
            for n in range(10):  # learn 10 more times using the env_model
                ms, ma = env_model.sample_s_a()  # ms here is a str
                mr, ms_ = env_model.get_r_s_(ms, ma)
                RL.learn(ms, ma, mr, str(ms_))

            s = s_
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))  # direct RL: improve the policy from real environment experience
    env_model = EnvModel(actions=list(range(env.n_actions)))  # model learning: use environment experience to improve both the policy and the model

    env.after(0, update)
    env.mainloop()
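
In both examples, RL.learn(s, a, r, s_) performs the ordinary tabular Q-learning update, Q(s, a) <- Q(s, a) + lr * (r + gamma * max Q(s_, ·) - Q(s, a)), regardless of whether the transition came from the real environment or from env_model. The class below is a minimal sketch of such a QLearningTable; the default hyperparameters and the 'terminal' marker for end-of-episode states are assumptions for illustration, not taken from the code above.

import numpy as np
import pandas as pd


class QLearningTable:
    """Minimal tabular Q-learning agent (sketch; parameter values are assumptions)."""

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # add an all-zero row for states we have never seen before
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, state):
        self.check_state_exist(state)
        if np.random.uniform() < self.epsilon:
            # exploit: pick an action with the highest Q value (ties broken randomly)
            state_actions = self.q_table.loc[state, :]
            return np.random.choice(state_actions[state_actions == state_actions.max()].index)
        # explore: pick a random action
        return np.random.choice(self.actions)

    def learn(self, s, a, r, s_):
        self.check_state_exist(s)
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':  # 'terminal' marker assumed to be what env.step returns when done
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        # Q(s, a) <- Q(s, a) + lr * (q_target - Q(s, a))
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)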