def run(k):
    """Roll out the current greedy policy from start column k and return the
    list of (row, col) cells visited along the way."""
    row, col, vv, vh = (3, k, 0, 0)  # start at row 3, column k, zero velocity
    seen = []
    step = 0
    while True:
        step += 1
        if step > 1000:  # safety cap in case the policy never terminates
            break
        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:  # a zero reward is treated as the end of the episode
            break
    return seen
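
These snippets all rely on module-level state that is defined outside the excerpts: the value table Q, the greedy policy, the race_track grid, the track() / generate_* helpers, and the hyperparameters alpha, gamma and lamb. A minimal sketch of that surrounding setup, with purely illustrative sizes and values, could look like this:

import time

import numpy as np
import matplotlib.pyplot as plt

# Illustrative sizes and hyperparameters -- the originals are defined elsewhere.
height, width = 32, 17   # race-track grid
n_vv, n_vh = 5, 5        # number of discretised vertical / horizontal velocities
actions = 9              # 3x3 acceleration choices
alpha, gamma, lamb = 0.1, 1.0, 0.9

def track():
    # Placeholder layout: 0 = off-track, 1 = drivable cell.
    # The original builds the actual race-track shape here.
    grid = np.zeros((height, width))
    grid[:, 6:12] = 1
    return grid

race_track = track()
Q = np.zeros((height, width, n_vv, n_vh, actions))
policy = np.argmax(Q, axis=4)  # greedy action for every state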
Example #2
def run(k):
    row, col, vv, vh = (3, k, 0, 0)
    seen = []
    step = 0
    while True:
        step += 1
        if step > 1000:
            break
        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(
            row, col, vv, vh, action, race_track)
        if r == 0:
            break
    return seen
Example #3

def run(k):
    # (reconstructed opening, mirroring the run() variants above)
    row, col, vv, vh = (3, k, 0, 0)
    seen = []
    step = 0
    while True:
        step += 1
        if step > 1000:
            break
        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:
            seen.append((row, col))  # this variant also records the final cell
            break
    return seen

start = time.time()

# Q-learning: behave epsilon-greedily, bootstrap from the greedy (max) action.
eps = 0.1
for k in xrange(100000):
    row, col, vv, vh = generate_start_state()
    while True:
        action = generate_action(policy[row, col, vv, vh], eps)
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        # Tabular Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        Q[row, col, vv, vh, action] += alpha*(reward+gamma*np.amax(Q[new_row, new_col, new_vv, new_vh, :])-Q[row, col, vv, vh, action])
        # Keep the policy greedy with respect to the updated Q values.
        policy[row, col, vv, vh] = np.argmax(Q[row, col, vv, vh, :])
        row, col, vv, vh = new_row, new_col, new_vv, new_vh
        if game_over(row, col, race_track):
            break

print time.time()-start

# Visualise the learned policy: roll it out from each start column and mark
# the visited cells on a fresh copy of the track.
for k in xrange(6, 12):
    race_track = track()
    been = run(k)
    for state in been:
        race_track[state] = 5
    plt.figure()
    plt.imshow(np.flipud(race_track), interpolation='none')
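
The helpers generate_action and generate_start_state are not shown in the excerpts. Judging only by how they are called, an epsilon-greedy action selector and a start-line sampler along these lines would be consistent (the exact behaviour is an assumption):

def generate_action(greedy_action, eps):
    # Epsilon-greedy: explore uniformly with probability eps, otherwise exploit.
    if np.random.rand() < eps:
        return np.random.randint(actions)
    return greedy_action

def generate_start_state():
    # Assumed behaviour: start on row 3 with zero velocity, in one of the
    # start columns 6..11 that the plotting loop above iterates over.
    return 3, np.random.randint(6, 12), 0, 0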
Example #4


start = time.time()

# SARSA(lambda)-style control: on-policy TD learning with eligibility traces.
eps = 0.1
for k in xrange(1000):
    E = np.zeros((height, width, n_vv, n_vh, actions))  # eligibility traces, reset each episode
    row, col, vv, vh = generate_start_state()
    action = generate_action(policy[row, col, vv, vh], eps)
    while True:
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(
            row, col, vv, vh, action, race_track)
        new_action = generate_action(policy[new_row, new_col, new_vv, new_vh],
                                     eps)
        # One-step SARSA TD error.
        delta = reward + gamma * Q[new_row, new_col, new_vv, new_vh,
                                   new_action] - Q[row, col, vv, vh, action]
        # Bump the trace of the visited state-action pair (dutch-style update).
        E[row, col, vv, vh,
          action] = (1 - alpha) * E[row, col, vv, vh, action] + 1
        # Move every Q entry in proportion to its trace, then decay all traces.
        Q = Q + alpha * delta * E
        E = E * alpha * lamb
        policy = np.argmax(Q, axis=4)  # re-derive the greedy policy from Q
        row, col, vv, vh, action = new_row, new_col, new_vv, new_vh, new_action
        if game_over(row, col, race_track):
            break

print time.time() - start
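
In this second training loop, delta is the one-step SARSA error, E is a table of eligibility traces that spreads each error over recently visited state-action pairs, and the whole Q table is nudged by alpha * delta * E at every step (textbook SARSA(lambda) decays the traces by gamma * lamb rather than the alpha * lamb used here). The resulting greedy policy can be visualised the same way as in the Q-learning example, e.g.:

for k in range(6, 12):
    race_track = track()
    for state in run(k):       # run() is the greedy roll-out defined earlier
        race_track[state] = 5  # mark the visited cells
    plt.figure()
    plt.imshow(np.flipud(race_track), interpolation='none')
plt.show()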