Example #1
import os

def main():
    # Ensure the image output directory exists
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid-world environment
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    # number of steps n
    n = 3

    # step size
    alpha = 0.2

    # number of episodes to run
    episodes = 1000

    values = dict()
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            values[(i, j)] = 0.0

    for ep in range(episodes):
        temporal_difference(env, values, n, alpha)

    draw_grid_world_image(values, 'images/grid_world_fixed_params.png',
                          GRID_HEIGHT, GRID_WIDTH)
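
The helper temporal_difference(env, values, n, alpha) used above is defined outside this listing. Below is a minimal sketch of one episode of n-step TD prediction under a uniform-random policy. It assumes a Gym-style API where env.reset() returns a state tuple and env.step(action) returns (next_state, reward, done, info); GAMMA and the four-action encoding are assumptions, not part of the original code.

import random

GAMMA = 1.0  # assumed: undiscounted episodic task (all rewards are -1)

def temporal_difference(env, values, n, alpha):
    # One episode of n-step TD prediction under a uniform-random policy.
    state = env.reset()
    states, rewards = [state], [0.0]  # rewards[0] is a dummy so that rewards[i] = R_i
    T = float('inf')  # episode length, unknown until a terminal state is reached
    t = 0
    while True:
        if t < T:
            action = random.randrange(4)  # assumed encoding: up/down/left/right
            next_state, reward, done, _ = env.step(action)
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
        tau = t - n + 1  # the time whose state estimate is updated now
        if tau >= 0:
            # n-step return: discounted rewards plus a bootstrapped tail
            G = sum(GAMMA ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += GAMMA ** n * values[states[tau + n]]
            values[states[tau]] += alpha * (G - values[states[tau]])
        if tau == T - 1:
            break
        t += 1
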
Example #2
import os

def main():
    # Ensure the image output directory exists
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid-world environment
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()

    draw_grid_world_image(values, 'images/first_visit_mc_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)
    print()

    values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()

    draw_grid_world_image(values, 'images/every_visit_mc_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)
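
first_visit_mc_prediction (and its every-visit twin) are likewise defined elsewhere. Below is a sketch of the first-visit variant under the same assumed Gym-style API and a uniform-random policy; dropping the first-occurrence check turns it into every-visit MC.

import random
from collections import defaultdict

def first_visit_mc_prediction(env, gamma, num_episodes):
    # First-visit Monte Carlo prediction under a uniform-random policy.
    values = defaultdict(float)
    returns = defaultdict(list)  # state -> every return observed for it
    for _ in range(num_episodes):
        # Generate one episode as a list of (state, reward-on-leaving) pairs.
        episode = []
        state, done = env.reset(), False
        while not done:
            action = random.randrange(4)  # assumed encoding: up/down/left/right
            next_state, reward, done, _ = env.step(action)
            episode.append((state, reward))
            state = next_state
        # Walk the episode backwards, accumulating the return G.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, r = episode[t]
            G = gamma * G + r
            # First-visit: only credit the earliest occurrence of s.
            if all(s != s0 for s0, _ in episode[:t]):
                returns[s].append(G)
                values[s] = sum(returns[s]) / len(returns[s])
    return values, returns
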
Example #3
import numpy as np

def main():
    # Create the 5x5 map
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=None,
                    terminal_states=[],
                    transition_reward=0,
                    outward_reward=-1.0,
                    warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0),
                                      (B_POSITION, B_PRIME_POSITION, 5.0)])

    optimal_state_values = calculate_grid_world_optimal_state_values(env)

    draw_grid_world_image(np.round(optimal_state_values, decimals=2),
                          'images/grid_world_optimal_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)

    with np.printoptions(precision=2, suppress=True):
        print(optimal_state_values)
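
calculate_grid_world_optimal_state_values is also defined outside this listing. Below is a sketch of a value-iteration implementation; env.transition(state, action) -> (next_state, reward) is an assumed accessor for the deterministic dynamics, and GAMMA = 0.9 is the discount conventionally used for this 5x5 example, neither confirmed by the source.

import numpy as np

GAMMA = 0.9  # assumed discount for this 5x5 example

def calculate_grid_world_optimal_state_values(env, theta=1e-4):
    # Value iteration: sweep Bellman optimality backups until convergence.
    values = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    while True:
        new_values = np.zeros_like(values)
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                candidates = []
                for action in range(4):  # assumed encoding: up/down/left/right
                    (ni, nj), reward = env.transition((i, j), action)  # assumed accessor
                    candidates.append(reward + GAMMA * values[ni, nj])
                new_values[i, j] = max(candidates)  # greedy over actions
        if np.abs(new_values - values).sum() < theta:
            return new_values
        values = new_values
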
Example #4
import numpy as np

def grid_world_policy_evaluation():
    # Create the grid-world environment
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)

    env.reset()

    # Evaluate until the state values converge; the iteration count is returned as well
    state_values, iteration = compute_state_value(env)

    print('Policy evaluation --> state values converged after {} iterations'.format(iteration))
    print(state_values)

    draw_grid_world_image(np.round(state_values, decimals=2),
                          'images/state_values.png', GRID_HEIGHT, GRID_WIDTH)
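
compute_state_value is not shown in this listing either. Below is a sketch of iterative policy evaluation for the equiprobable random policy with gamma = 1, returning both the converged table and the iteration count; env.transition is the same assumed accessor as in the previous sketch.

import numpy as np

def compute_state_value(env, theta=1e-4):
    # Iterative policy evaluation for the equiprobable random policy (gamma = 1).
    state_values = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    iteration = 0
    while True:
        new_state_values = np.zeros_like(state_values)
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    continue  # terminal states keep value 0
                value = 0.0
                for action in range(4):  # assumed encoding: up/down/left/right
                    (ni, nj), reward = env.transition((i, j), action)  # assumed accessor
                    value += 0.25 * (reward + state_values[ni, nj])
                new_state_values[i, j] = value
        iteration += 1
        if np.abs(new_state_values - state_values).sum() < theta:
            return new_state_values, iteration
        state_values = new_state_values
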
Example #5
def compute_state_values(env):
    policy = generate_initial_random_policy(env)

    # Run TD prediction with increasing episode budgets and save each result
    for num_episodes in [300, 3000, 10000]:
        state_values = dict()
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                state_values[(i, j)] = 0.0

        for _ in range(num_episodes):
            temporal_difference(env, policy, state_values)

        draw_grid_world_image(
            state_values,
            'images/grid_world_td_prediction_{0}.png'.format(num_episodes),
            GRID_HEIGHT, GRID_WIDTH)
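
The temporal_difference helper used here (TD(0) prediction with an explicit policy) is not shown. Below is a sketch under the same assumed Gym-style API; the policy format, step size, and discount are assumptions, not part of the original listing.

import random

ALPHA = 0.1  # assumed step size; the listing does not show one
GAMMA = 1.0  # assumed: undiscounted episodic task

def temporal_difference(env, policy, state_values):
    # One episode of TD(0) prediction for the given policy.
    state, done = env.reset(), False
    while not done:
        # Assumed policy format: policy[state] is a list of (action, prob) pairs.
        actions, probs = zip(*policy[state])
        action = random.choices(actions, weights=probs)[0]
        next_state, reward, done, _ = env.step(action)
        # TD(0) update: V(S) <- V(S) + alpha * (R + gamma * V(S') - V(S))
        td_target = reward + GAMMA * state_values[next_state]
        state_values[state] += ALPHA * (td_target - state_values[state])
        state = next_state
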
        # Check whether the value function has converged
        if np.sum(np.abs(new_value_function - value_function)) < 1e-4:
            break

        value_function = new_value_function

    return new_value_function
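
Only the convergence check of the surrounding function survives in the fragment above; the MAIN block below calls it as grid_world_state_values(env). A sketch of what the complete function might look like, again assuming the hypothetical env.transition accessor and the equiprobable random policy:

import numpy as np

GAMMA = 0.9  # assumed: same discount as in the optimal-value sketch above

def grid_world_state_values(env, theta=1e-4):
    # State values of the uniform-random policy by iterative evaluation.
    value_function = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    while True:
        new_value_function = np.zeros((GRID_HEIGHT, GRID_WIDTH))
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                for action in range(4):  # assumed encoding: up/down/left/right
                    (ni, nj), reward = env.transition((i, j), action)  # assumed accessor
                    new_value_function[i, j] += 0.25 * (
                            reward + GAMMA * value_function[ni, nj])
        # Check whether the value function has converged (as in the fragment above)
        if np.sum(np.abs(new_value_function - value_function)) < theta:
            break
        value_function = new_value_function
    return new_value_function
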


# MAIN
if __name__ == '__main__':
    if not os.path.exists('images/'):
        os.makedirs('images/')

    value_function = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    draw_grid_world_image(np.round(value_function, decimals=0), 'images/empty_grid_world.png', GRID_HEIGHT, GRID_WIDTH)

    # Create the 5x5 map
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=[],
        transition_reward=0,
        outward_reward=-1.0,
        warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0), (B_POSITION, B_PRIME_POSITION, 5.0)]
    )

    env.reset()
    state_values = grid_world_state_values(env)
    print(state_values)