Example #1
import numpy as np

import lunar_lander_evaluator


def evaluate(Q, start_evaluate=False):
    # Run greedy episodes under the current Q-table; with start_evaluate set,
    # the evaluator performs its official 100-episode evaluation instead.
    e = lunar_lander_evaluator.environment()
    for _ in range(100 if start_evaluate else 20):
        state, done = e.reset(start_evaluate=start_evaluate), False
        while not done:
            action = np.argmax(Q[state])
            state, _, done, _ = e.step(action)

    return np.mean(e._episode_returns[-20:]) if not start_evaluate else 0
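A minimal usage sketch, assuming Q is a tabular NumPy array over the evaluator's discretized states; the shape and the score threshold below are hypothetical placeholders.

# Minimal usage sketch; the shape and threshold are hypothetical placeholders.
Q = np.zeros((5000, 4))
if evaluate(Q) >= 200:                # mean return over 20 greedy episodes
    evaluate(Q, start_evaluate=True)  # start the official evaluation run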
Example #2
def evaluate(Q, start_evaluate=False, output_file='output'):
    e = lunar_lander_evaluator.environment()
    for _ in range(100):
        state, done = e.reset(start_evaluate=start_evaluate), False
        while not done:
            action = np.argmax(Q[state])
            state, _, done, _ = e.step(action)

    if not start_evaluate:
        # Checkpoint the Q-table, embedding the mean return in the filename;
        # the mean is a float and must be formatted into the string.
        np.save('{}-{}'.format(output_file, np.mean(e._episode_returns[-100:])), Q)

    return np.mean(e._episode_returns[-100:]) if not start_evaluate else 0
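Since np.save appends the .npy suffix and the filename embeds the measured mean return, reloading a checkpoint looks like this sketch (the concrete filename is hypothetical):

# Reload a Q-table checkpoint written by evaluate above;
# the exact filename is hypothetical.
Q = np.load('output-121.5.npy')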
Example #3
from functools import partial
import random


def train_Q(Q, index, args, a_eps, b_eps, c_eps, a_alp, b_alp, c_alp, a_gui,
            b_gui, c_gui, update_policy, get_action):
    # The a/b/c coefficients define quadratic schedules of the form
    # a * episode**2 + b * episode + c for epsilon, alpha and the guided ratio.
    e_2 = 0
    epsilon = args.epsilon
    alpha = args.alpha
    guided = args.guided
    env = lunar_lander_evaluator.environment()
    episodes = args.episodes + args.pretrain_episodes
    episode = 0

    # Pretraining: expert-guided episodes only.
    for ep in range(args.pretrain_episodes):
        if ep % 10 == 0:
            print(ep)

        if epsilon > args.epsilon_final:
            epsilon = max(args.epsilon_final,
                          a_eps * e_2 + b_eps * env.episode + c_eps)

        if alpha > args.alpha_final:
            alpha = max(args.alpha_final,
                        a_alp * e_2 + b_alp * env.episode + c_alp)

        if guided > args.guided_final:
            guided = max(args.guided_final,
                         a_gui * e_2 + b_gui * env.episode + c_gui)

        perform_expert_episode(env, Q, args.lookback,
                               partial(update_policy, alpha=alpha))

        # Keep the quadratic term in sync with the episode counter,
        # as the main loop below does.
        e_2 = env.episode**2
        episode += 1
    # Main training: mix guided (expert) and epsilon-greedy episodes.
    while episode < episodes:
        if epsilon > args.epsilon_final:
            epsilon = max(args.epsilon_final,
                          a_eps * e_2 + b_eps * env.episode + c_eps)

        if alpha > args.alpha_final:
            alpha = max(args.alpha_final,
                        a_alp * e_2 + b_alp * env.episode + c_alp)

        if guided > args.guided_final:
            guided = max(args.guided_final,
                         a_gui * e_2 + b_gui * env.episode + c_gui)

        if random.random() < guided:
            # Perform a guided (expert) episode.
            perform_expert_episode(env, Q, args.lookback,
                                   partial(update_policy, alpha=alpha))
        else:
            # Perform a classic epsilon-greedy training episode.
            perform_classic_episode(env, Q, args.lookback,
                                    partial(update_policy, alpha=alpha),
                                    partial(get_action, epsilon=epsilon))

        e_2 = env.episode**2
        episode += 1
        if args.evaluate_each and episode % args.evaluate_each == 0:
            val = evaluate(Q, False, args.output + '_{}'.format(index))
            print('Q_{} performs with score {}'.format(index, val))

    return Q
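train_Q receives update_policy and get_action as callbacks that are not shown in these excerpts. Below is a minimal sketch of what they might look like, assuming epsilon-greedy exploration and a one-step Q-learning update; the signatures are inferred from the functools.partial call sites above, and gamma would normally come from args.

import random

import numpy as np


def get_action(Q, state, epsilon):
    # Epsilon-greedy selection over the tabular Q row for this state.
    if random.random() < epsilon:
        return random.randrange(Q.shape[1])
    return np.argmax(Q[state])


def update_policy(Q, state, action, reward, next_state, done, alpha, gamma=0.98):
    # One-step Q-learning update; 0.98 mirrors the --gamma default below.
    target = reward + (0.0 if done else gamma * np.max(Q[next_state]))
    Q[state, action] += alpha * (target - Q[state, action])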
Example #4
    parser.add_argument("--epsilon_final",
                        default=0.0001,
                        type=float,
                        help="Final exploration factor.")
    parser.add_argument("--gamma",
                        default=0.98,
                        type=float,
                        help="Discounting factor.")
    parser.add_argument("--lookback",
                        default=3,
                        type=float,
                        help="Number of lookback states.")
    args = parser.parse_args()

    # Create the environment
    env = lunar_lander_evaluator.environment()

    # Prepare quadratic schedule coefficients: each quantity follows
    # a * episode**2 + b * episode + c, equal to its initial value c at
    # episode 0 and to its final value d at the last episode (with b = 0).
    episodes = args.episodes + args.pretrain_episodes

    d_eps = args.epsilon_final
    c_eps = args.epsilon
    a_eps = (d_eps - c_eps) / (episodes**2)
    b_eps = 0

    d_alp = args.alpha_final
    c_alp = args.alpha
    a_alp = (d_alp - c_alp) / (episodes**2)
    b_alp = 0

    d_gui = args.guided_final