Example #1
import gym

# DeepQNetwork is assumed to be defined elsewhere in the project and to
# expose choose_action, store_transition and learn.


def run():
    plot_data = []  # episode lengths, collected for printing after training
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    N_ACTIONS = env.action_space.n
    N_STATES = env.observation_space.shape[0]

    RL = DeepQNetwork(N_ACTIONS, N_STATES)

    step = 0
    for i in range(600):  # run 600 episodes
        # init env
        observation = env.reset()
        step_in = 0
        while True:
            # refresh env
            env.render()

            action = RL.choose_action(observation)

            observation_, reward, done, info = env.step(action)

            # reward shaping: the closer the cart is to the centre and the more
            # upright the pole, the higher the reward
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            RL.store_transition(observation, action, r, observation_)

            # start learning once enough transitions are stored, then update every 5 steps
            if step > 200 and step % 5 == 0:
                RL.learn()

            if done:
                print('step_in:%s  reward:%s' % (step_in, reward))
                plot_data.append(step_in)
                break
            observation = observation_
            step += 1
            step_in += 1
    # end of game
    print('game over')
    # env.destroy()

    # plot_data = np.array(plot_data, dtype='float32')
    # plot_data = np.divide(plot_data, plot_data.max())
    print(plot_data)
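
Both examples rely on a DeepQNetwork object exposing choose_action, store_transition and learn, but the class itself is not shown on this page. Below is a minimal sketch of what such a class can look like, assuming a PyTorch backend, a small two-layer network and arbitrarily chosen hyperparameters; the actual implementation behind these examples may differ.

import numpy as np
import torch
import torch.nn as nn


class Net(nn.Module):
    def __init__(self, n_states, n_actions):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(n_states, 64), nn.ReLU(),
            nn.Linear(64, n_actions),
        )

    def forward(self, x):
        return self.fc(x)


class DeepQNetwork:
    def __init__(self, n_actions, n_states, lr=0.01, gamma=0.9,
                 epsilon=0.9, memory_size=2000, batch_size=32,
                 replace_every=100):
        self.n_actions, self.n_states = n_actions, n_states
        self.gamma, self.epsilon = gamma, epsilon
        self.memory_size, self.batch_size = memory_size, batch_size
        self.replace_every = replace_every
        self.eval_net = Net(n_states, n_actions)
        self.target_net = Net(n_states, n_actions)
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        # replay memory, one row per transition: [s, a, r, s_]
        self.memory = np.zeros((memory_size, n_states * 2 + 2), dtype=np.float32)
        self.memory_counter = 0
        self.learn_step = 0

    def choose_action(self, observation):
        # epsilon-greedy over the evaluation network's Q-values
        if np.random.uniform() < self.epsilon:
            x = torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                return int(self.eval_net(x).argmax(dim=1).item())
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        row = np.hstack((s, [a, r], s_)).astype(np.float32)
        self.memory[self.memory_counter % self.memory_size] = row
        self.memory_counter += 1

    def learn(self):
        # periodically copy the evaluation net into the target net
        if self.learn_step % self.replace_every == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step += 1

        # sample a random batch of stored transitions
        high = min(self.memory_counter, self.memory_size)
        idx = np.random.choice(high, self.batch_size)
        batch = self.memory[idx]
        s = torch.from_numpy(batch[:, :self.n_states])
        a = torch.from_numpy(batch[:, self.n_states]).long().unsqueeze(1)
        r = torch.from_numpy(batch[:, self.n_states + 1]).unsqueeze(1)
        s_ = torch.from_numpy(batch[:, -self.n_states:])

        # Q-learning target: r + gamma * max_a' Q_target(s', a')
        q_eval = self.eval_net(s).gather(1, a)
        q_next = self.target_net(s_).detach().max(dim=1, keepdim=True)[0]
        q_target = r + self.gamma * q_next

        loss = self.loss_fn(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()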
Example #2
import numpy as np

# env (a custom environment exposing getStates()) and RL (a DeepQNetwork
# instance) are assumed to have been created before this loop.
episodes = 2000
step = 0
for i in range(episodes):

    state = env.reset()
    while True:
        env.render()

        # one-hot encode the discrete state (states are numbered from 1)
        feature = [0] * len(env.getStates())
        feature[state - 1] = 1
        feature = np.hstack(feature)
        action = RL.choose_action(feature)

        state_, reward, done = env.step(action)

        # one-hot encode the next state in the same way
        feature_ = [0] * len(env.getStates())
        feature_[state_ - 1] = 1
        feature_ = np.hstack(feature_)

        RL.store_transition(feature, action, reward, feature_)

        if (step > 200) and (step % 5 == 0):
            RL.learn()

        state = state_

        if done:
            break
        step += 1
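
The manual one-hot construction used for feature and feature_ above can also be written with a small NumPy helper; a minimal sketch, assuming states are numbered from 1 as in the example:

import numpy as np


def one_hot(state, n_states):
    # returns a float vector with a 1 at position state - 1
    return np.eye(n_states, dtype=np.float32)[state - 1]

# e.g. feature = one_hot(state, len(env.getStates()))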