Example #1
def run():
    # assumes `import gym`, `import numpy as np` and the project's
    # DeepQNetwork class are available in the enclosing module
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    N_ACTIONS = env.action_space.n
    N_STATES = env.observation_space.shape[0]
    plot_data = []  # episode lengths, printed after training

    RL = DeepQNetwork(N_ACTIONS, N_STATES)

    step = 0
    for i in range(600):  # play 600 episodes
        # init env
        observation = env.reset()
        step_in = 0
        while True:
            # refresh env
            env.render()

            action = RL.choose_action(observation)

            observation_, reward, done, info = env.step(action)

            # reshape the reward: higher when the cart is near the center (r1)
            # and the pole is close to upright (r2)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            RL.store_transition(observation, action, r, observation_)

            # start learning after a 200-step warm-up, then every 5 steps
            if step > 200 and step % 5 == 0:
                RL.learn()

            if done:
                print('step_in:%s  reward:%s' % (step_in, reward))
                plot_data.append(step_in)
                break
            observation = observation_
            step += 1
            step_in += 1
    # end of game
    print('game over')
    # env.destroy()

    # plot_data = np.array(plot_data, dtype='float32')
    # plot_data = np.divide(plot_data, plot_data.max())
    print(plot_data)
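
Every snippet on this page drives the agent through the same three calls: choose_action, store_transition and learn. The sketch below shows that interface in a self-contained form; the class name, method names and constructor keywords are taken from the calls in the examples, while the internals (a plain linear Q-function and a target-weight copy instead of the real neural networks) are assumptions made only so the sketch runs on its own.

# Illustrative sketch of the assumed DeepQNetwork interface (not the
# project's actual implementation).
import numpy as np

class DeepQNetwork:
    def __init__(self, n_actions, n_features, learning_rate=0.01,
                 reward_decay=0.9, e_greedy=0.9, replace_target_iter=200,
                 memory_size=2000, batch_size=32, **kwargs):
        self.n_actions, self.n_features = n_actions, n_features
        self.lr, self.gamma, self.epsilon = learning_rate, reward_decay, e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size, self.batch_size = memory_size, batch_size
        # replay memory: one row per transition (s, a, r, s_)
        self.memory = np.zeros((memory_size, n_features * 2 + 2))
        self.memory_counter = 0
        self.learn_step_counter = 0
        # evaluation and target "network" weights (here: a linear Q-function)
        self.w_eval = np.zeros((n_features, n_actions))
        self.w_target = self.w_eval.copy()

    def choose_action(self, observation):
        # epsilon-greedy: act greedily with probability e_greedy
        s = np.asarray(observation, dtype=np.float64)
        if np.random.uniform() < self.epsilon:
            return int(np.argmax(s @ self.w_eval))
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        self.memory[self.memory_counter % self.memory_size, :] = transition
        self.memory_counter += 1

    def learn(self):
        # periodically refresh the target weights from the eval weights
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.w_target = self.w_eval.copy()
        self.learn_step_counter += 1
        # sample a minibatch from the replay memory
        upper = min(self.memory_counter, self.memory_size)
        idx = np.random.choice(upper, size=min(self.batch_size, upper))
        batch = self.memory[idx]
        s = batch[:, :self.n_features]
        a = batch[:, self.n_features].astype(int)
        r = batch[:, self.n_features + 1]
        s_ = batch[:, -self.n_features:]
        # Q-learning target: r + gamma * max_a' Q_target(s', a')
        q_target = r + self.gamma * (s_ @ self.w_target).max(axis=1)
        q_eval = (s @ self.w_eval)[np.arange(len(idx)), a]
        td_error = q_target - q_eval
        # gradient step on the squared TD error for the taken actions
        for i in range(len(idx)):
            self.w_eval[:, a[i]] += self.lr * td_error[i] * s[i]
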
Example #2
                  e_greedy=0.1,
                  replace_target_iter=200,
                  memory_size=2000)

episodes = 2000
step = 0
for i in range(episodes):

    state = env.reset()
    while True:
        env.render()

        # one-hot encode the discrete state index before feeding it to the agent
        feature = [0] * len(env.getStates())
        feature[state - 1] = 1
        feature = np.hstack(feature)
        action = RL.choose_action(feature)

        state_, reward, done = env.step(action)

        feature_ = [0] * len(env.getStates())
        feature_[state_ - 1] = 1
        feature_ = np.hstack(feature_)

        RL.store_transition(feature, action, reward, feature_)

        if (step > 200) and (step % 5 == 0):
            RL.learn()

        state = state_

        if done:
Example #3
                  learning_rate=0.01, e_greedy=0.9,
                  replace_target_iter=100, memory_size=1000,
                  )

total_steps = 0
reward_c = []
show = []
running_reward = 0
for i_episode in range(1000):
    t = 0
    observation = env.reset()
    ep_r = 0
    while True:
        # env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        # the smaller theta and closer to center the better
        # x, x_dot, theta, theta_dot = observation_
        # r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
        # r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
        # reward = r1 + r2
        if done:
            reward = -1

        RL.store_transition(observation, action, reward, observation_)

        ep_r += reward
        if total_steps > 1000:
Example #4
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=200,
                  memory_size=1048576,
                  batch_size=50 * 700,
                  training=True,
                  import_file='saved/trained_dqn')

step = 0
score_history = []
for episode in range(600):
    blue_state, red_state = env.reset()
    score = 0
    # Main game loop
    while True:
        blue_action = RL.choose_action(blue_state)
        red_action = 0  #RL.choose_action(red_state)

        blue_state_, red_state, blue_reward, done = env.step(
            translate_int_action(blue_action),
            translate_int_action(red_action))

        RL.store_transition(blue_state, blue_action, blue_reward, blue_state_)

        if (step > 200) and (step % 50 == 0):
            RL.learn()

        blue_state = blue_state_

        env.render()