Example #1
import gym
import matplotlib.pyplot as plt

# PolicyGradient is not shown in the original; the module name here is an assumption.
from policy_gradient import PolicyGradient


def main():
    RENDER = False
    MAX_EXPLORE = 2000

    # env = gym.make('MountainCar-v0')
    env = gym.make('CartPole-v0')
    env.seed(1)
    env = env.unwrapped

    print(f"action_space: {env.action_space}")
    print(f"action_space.n: {env.action_space.n}")
    print(f"observation_space: {env.observation_space}")
    print(f"observation_space.shape: {env.observation_space.shape}")
    print(f"observation_space.high: {env.observation_space.high}")
    print(f"observation_space.low: {env.observation_space.low}")
    # action_space: Discrete(2)
    # action_space.n: 2
    # observation_space:
    #     Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
    # observation_space.shape: (4,)
    # observation_space.high: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    # observation_space.low: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]

    agent = PolicyGradient(n_features=env.observation_space.shape[0],
                           n_actions=env.action_space.n,
                           lr=0.001,
                           reward_decay=0.995)

    episodes = 3000
    total_reward = []
    for episode in range(episodes):
        s = env.reset()
        # s : np.ndarray, shape (4,)
        for i in range(MAX_EXPLORE):
            if RENDER:
                env.render()
            a = agent.choose_action(s)
            # a : scalar
            s_, r, done, _ = env.step(a)
            # s_ : np.ndarray, shape (4,)
            # r : float
            # done : bool
            agent.store_transition(s, a, r)

            if done or (i + 1) == MAX_EXPLORE:
                ep_rs_sum = sum(agent.ep_r)
                total_reward.append(ep_rs_sum)
                avg_reward = sum(total_reward) / len(total_reward)
                print(f"Episode: {episode+1}")
                print(f"\treward: {ep_rs_sum}, done: {done}")
                print(f"\tavg reward: {avg_reward}")
                vt = agent.learn()

                if avg_reward > 200:
                    RENDER = True

                if episode == 30:
                    plt.plot(vt)
                    plt.xlabel('episode steps')
                    plt.ylabel('normalized state-action value')
                    plt.show()
                break

            s = s_


if __name__ == '__main__':
    main()
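Neither example includes the PolicyGradient class itself. Below is a minimal REINFORCE sketch matching the interface used in Example #1 (choose_action, store_transition, learn, and the ep_r buffer). The linear softmax policy and every implementation detail are assumptions, not the original code, which most likely wrapped a small neural network; Example #2's agent is the same idea under slightly different names (store_reward, ep_rs).

import numpy as np


class PolicyGradient:
    """Minimal REINFORCE agent with a linear softmax policy (illustrative only)."""

    def __init__(self, n_features, n_actions, lr=0.01, reward_decay=0.99):
        self.n_actions = n_actions
        self.lr = lr
        self.gamma = reward_decay
        self.W = np.zeros((n_features, n_actions))  # policy logits = s @ W + b
        self.b = np.zeros(n_actions)
        self.ep_s, self.ep_a, self.ep_r = [], [], []  # per-episode buffers

    def _probs(self, s):
        logits = s @ self.W + self.b
        logits -= logits.max()               # shift for numerical stability
        exp = np.exp(logits)
        return exp / exp.sum()

    def choose_action(self, s):
        # sample an action from the current policy
        return int(np.random.choice(self.n_actions, p=self._probs(np.asarray(s))))

    def store_transition(self, s, a, r):
        self.ep_s.append(np.asarray(s, dtype=np.float64))
        self.ep_a.append(a)
        self.ep_r.append(r)

    def learn(self):
        # discounted returns: G_t = r_t + gamma * G_{t+1}
        vt = np.zeros(len(self.ep_r))
        running = 0.0
        for t in reversed(range(len(self.ep_r))):
            running = self.ep_r[t] + self.gamma * running
            vt[t] = running
        vt -= vt.mean()                      # normalize to reduce variance
        if vt.std() > 0:
            vt /= vt.std()
        # policy-gradient ascent: grad of log softmax is onehot(a) - probs
        for s, a, g in zip(self.ep_s, self.ep_a, vt):
            dlogits = -self._probs(s)
            dlogits[a] += 1.0
            self.W += self.lr * g * np.outer(s, dlogits)
            self.b += self.lr * g * dlogits
        self.ep_s, self.ep_a, self.ep_r = [], [], []
        return vt                            # Example #1 plots this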
Example #2
# The original snippet begins mid-file; the setup below (imports, threshold
# value, env seeding, and module name) is reconstructed and assumed.
import gym

from RL_brain import PolicyGradient  # assumed; this variant exposes choose_action/store_reward/learn and ep_rs

DISPLAY_REWARD_THRESHOLD = 400  # assumed value: render once the running reward exceeds this
RENDER = False

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
)


running_reward = 0
for i_episode in range(3000):
    observation = env.reset()

    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        RL.store_reward(reward)

        ep_rs_sum = sum(RL.ep_rs)
        if done or ep_rs_sum > 2 * DISPLAY_REWARD_THRESHOLD:  # end the episode once done or enough reward has accumulated
            running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True

            print("Episode:", i_episode, "  Reward:", int(running_reward))
            vt = RL.learn()

            break

        observation = observation_

        if running_reward > 2 * DISPLAY_REWARD_THRESHOLD:
            break  # note: ends the episode early without a learn() step
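In both examples, the vt returned by learn() is the vector of per-step discounted returns, normalized over the episode (this is what Example #1 plots). A quick sanity check of the recurrence G[t] = r[t] + gamma * G[t+1], with made-up rewards:

import numpy as np

rewards, gamma = [1.0, 1.0, 1.0], 0.99  # assumed toy rewards
G, running = np.zeros(len(rewards)), 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    G[t] = running
print(G)                         # [2.9701 1.99   1.    ]
print((G - G.mean()) / G.std())  # normalized form, i.e. what learn() returns as vt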