Example No. 1
    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)
            if "running_reward" not in globals():
                running_reward = ep_rs_sum

            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = False  # rendering stays off in this variant
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train

            if i_episode == 30:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_
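All of these examples share the same scaffolding: a Gym environment with the classic 4-tuple step API, and a policy-gradient agent exposing choose_action, store_transition, and learn. A minimal sketch of that setup, assuming MorvanZhou-style tutorial code (the RL_brain module name and the constructor arguments are assumptions, not shown in the excerpts):

import gym
import matplotlib.pyplot as plt
from RL_brain import PolicyGradient  # assumed module/class from the tutorials

DISPLAY_REWARD_THRESHOLD = 400  # start rendering above this running reward
RENDER = False

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible episodes

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,  # discount factor for episode rewards
)

for i_episode in range(3000):
    observation = env.reset()
    # ... inner while-loop as in the examples ...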
Example No. 2
        episode_reward += reward

        if done:
            #print("RL.ep_obs: "); print(RL.ep_obs.shape)
            #print("np.vstack(RL.ep_obs).shape = "); print(np.vstack(RL.ep_obs).shape)
            #print("np.array(RL.ep_as).shape = "); print(np.array(RL.ep_as).shape)
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
                if running_reward > current_max:
                    RL.save_model()
                    current_max = running_reward

            print("episode:", i_episode, " episode_reward:", episode_reward,
                  "  running_reward:", int(running_reward), " t:", t)

            vt = RL.learn()

            if i_episode == 0:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break
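Example No. 2 additionally checkpoints the agent whenever the running reward sets a new maximum (RL.save_model() guarded by current_max). The excerpt does not show save_model; for a TF1-based agent it could plausibly be a thin wrapper around tf.train.Saver. A sketch, purely as an assumption:

import tensorflow as tf

def save_checkpoint(sess, path='./checkpoints/pg_best.ckpt'):
    # Persist all session variables; a guess at what RL.save_model() does.
    saver = tf.train.Saver()
    return saver.save(sess, path)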
Example No. 3

        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases

        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train

            if i_episode == 30:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_
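The running_reward update repeated in every example is an exponential moving average of episode returns: each new episode contributes only 1% of its return, which smooths the curve enough to compare it against DISPLAY_REWARD_THRESHOLD. The same logic as a standalone helper (the function name is ours):

def update_running_reward(running_reward, ep_rs_sum, decay=0.99):
    # First episode: initialize with the raw return.
    if running_reward is None:
        return ep_rs_sum
    # Afterwards: exponential moving average, weight (1 - decay) on new data.
    return decay * running_reward + (1 - decay) * ep_rs_sum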
Example No. 4
    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)  # store this episode's transition

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # start rendering the simulation
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # learn and return vt

            if i_episode == 0:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
Example No. 5

    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)  # reward = -1 in all cases
        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True

            print("episode: ", i_episode, ", reward: ", int(running_reward))

            vt = RL.learn()  # training here

            if i_episode == 30:
                plt.plot(vt)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
Example No. 6

        print('info:', info)

        RL.store_transition(observation, action, reward)  # store this episode's transition

        # use the next state_ as the state for the next loop iteration
        observation = observation_

        # episode over
        if done:
            # sum of this episode's rewards
            ep_rs_sum = sum(RL.ep_rs)
            # globals() returns all global variables in scope as a dict
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                # exponential moving average: 99% old running reward, 1% new episode return
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # start rendering the simulation
            print("episode:", i_episode, "  reward:", int(running_reward))

            # the episode's normalized returns
            vt = RL.learn()  # learn and return vt; vt's role is covered in the next lesson

            if i_episode == 0:
                plt.plot(vt)  # plot this episode's vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break
Example No. 7
for i_episode in range(10):

    observation = env.reset()

    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        position, velocity = observation_

        # the higher the better
        reward = abs(position - (-0.5))

        RL.store_transition(observation, action, reward, observation_)

        if total_steps > 1000:
            RL.learn()
            print('episode: ', i_episode, "step: ", RL.learn_steps, 'cost: ',
                  round(RL.cost, 4), ' epsilon: ', round(RL.epsilon, 2))

        if done:
            break

        observation = observation_
        total_steps += 1

RL.plot_cost()
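Example No. 7 differs from the others in two ways: store_transition takes four arguments (including observation_, as a replay-buffer/DQN-style agent would need), and MountainCar's constant -1 reward is replaced by a shaped reward that grows with distance from the valley bottom near position -0.5. That shaping as a small helper (the function name is ours; the constant comes from the snippet):

def shaped_reward(observation_):
    position, velocity = observation_
    # The car starts near the valley bottom at position -0.5;
    # rewarding distance from it encourages building momentum.
    return abs(position - (-0.5))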
Example No. 8

for i_episode in range(3000):
    observation = env.reset()   # get the first observation of episode i_episode

    while True:
        if RENDER: env.render() # refresh the rendering
        action = RL.choose_action(observation)  # choose an action
        observation_, reward, done, info = env.step(action) # get the next state
        RL.store_transition(observation, action, reward)    # store this episode's transition

        if done:    # episode over: start updating parameters
            ep_rs_sum = sum(RL.ep_rs)   # sum of this episode's rewards

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # start rendering
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn() # learn the parameters and return vt

            if i_episode == 0:  # plot
                plt.plot(vt)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
Example No. 9
        RL.store_transition(observation, action, reward)

        if done:
            print('Done', done)
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt, loss = RL.learn()
            Y.append(loss)

            if i_episode == 0:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_

fig = plt.figure()
X = np.array(X)
Y = np.array(Y)
Example No. 10
    while True:
        if RENDER: env.render()
        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()

            if i_episode == 0:
                plt.plot(vt)    # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_

Example No. 11

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)  # reward = -1 in all cases

        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt, loss = RL.learn()  # train

            if i_episode == 5:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_
Example No. 12

        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases

        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train; episodic update: runs once all steps of this episode have finished

            if i_episode == 30:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_
Example No. 13
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:  # this episode is over
            ep_rs_sum = sum(RL.ep_rs)  # total reward of all actions taken so far this episode
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train the model

            if i_episode == 30:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_
Example No. 14

        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            # compute the running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # episode over: start learning

            if i_episode == 30:
                plt.plot(vt)  # plot vt on episode 30
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_
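Every example plots the vt returned by RL.learn(). In the tutorial-style agent this is the episode's discounted return at each step, normalized to zero mean and unit variance before it is used to weight the policy gradient. A sketch of that computation, assuming ep_rs is the episode's reward list and gamma the discount factor:

import numpy as np

def discount_and_norm_rewards(ep_rs, gamma=0.99):
    discounted = np.zeros(len(ep_rs))
    running_add = 0.0
    # Accumulate discounted returns from the last step backwards.
    for t in reversed(range(len(ep_rs))):
        running_add = running_add * gamma + ep_rs[t]
        discounted[t] = running_add
    # Normalize so the plotted vt is comparable across episodes.
    discounted -= discounted.mean()
    discounted /= discounted.std()
    return discounted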