def evaluate(Q, start_evaluate=False):
    """Roll out the greedy policy derived from table Q and report performance.

    Runs 100 episodes in official-evaluation mode, otherwise 20 local ones,
    and returns the mean of the last 20 episode returns (0 when evaluating
    officially, where the environment records the score itself).

    NOTE(review): a later definition of `evaluate` in this file shadows this
    one at import time.
    """
    env = lunar_lander_evaluator.environment()
    n_episodes = 100 if start_evaluate else 20
    for _ in range(n_episodes):
        state = env.reset(start_evaluate=start_evaluate)
        done = False
        while not done:
            # Greedy action: best Q-value for the current discrete state.
            greedy_action = np.argmax(Q[state])
            state, _, done, _ = env.step(greedy_action)
    if start_evaluate:
        return 0
    return np.mean(env._episode_returns[-20:])
def evaluate(Q, start_evaluate=False, output_file='output'):
    """Roll out the greedy policy from table Q for 100 episodes.

    In local mode (start_evaluate=False) the mean of the last 100 episode
    returns is computed, Q is checkpointed to a file whose name embeds that
    score, and the score is returned. In official-evaluation mode the
    episodes are only executed and 0 is returned.

    Parameters:
        Q: state-indexed array of action values; actions chosen greedily.
        start_evaluate: forwarded to env.reset(); selects official evaluation.
        output_file: filename prefix for the checkpoint (np.save adds '.npy').

    Returns:
        Mean return of the last 100 episodes, or 0 when start_evaluate.
    """
    e = lunar_lander_evaluator.environment()
    for _ in range(100):
        state, done = e.reset(start_evaluate=start_evaluate), False
        while not done:
            action = np.argmax(Q[state])
            state, _, done, _ = e.step(action)
    if start_evaluate:
        return 0
    score = np.mean(e._episode_returns[-100:])
    # BUG FIX: the original concatenated a str with np.float64
    # (output_file + '-' + np.mean(...)), which raises TypeError.
    # Format the score into the filename instead; also computes the
    # mean once instead of twice.
    np.save('{}-{}'.format(output_file, score), Q)
    return score
def train_Q(Q, index, args, a_eps, b_eps, c_eps, a_alp, b_alp, c_alp,
            a_gui, b_gui, c_gui, update_policy, get_action):
    """Train Q-table number `index` with annealed epsilon/alpha/guided schedules.

    First runs args.pretrain_episodes expert-guided episodes, then mixes
    expert and epsilon-greedy episodes until args.episodes more have run,
    periodically reporting a local evaluation score. Returns the trained Q.
    """
    e_2 = 0
    epsilon, alpha, guided = args.epsilon, args.alpha, args.guided
    env = lunar_lander_evaluator.environment()
    total_episodes = args.episodes + args.pretrain_episodes
    episode = 0

    def anneal(value, floor, a, b, c):
        # Quadratic schedule a*e_2 + b*env.episode + c, clipped at `floor`.
        # NOTE: e_2 is only refreshed inside the main loop below, so during
        # pretraining the quadratic term keeps its initial value of 0.
        if value > floor:
            return max(floor, a * e_2 + b * env.episode + c)
        return value

    # Pretraining: expert demonstrations only.
    for ep in range(args.pretrain_episodes):
        if ep % 10 == 0:
            print(ep)
        epsilon = anneal(epsilon, args.epsilon_final, a_eps, b_eps, c_eps)
        alpha = anneal(alpha, args.alpha_final, a_alp, b_alp, c_alp)
        guided = anneal(guided, args.guided_final, a_gui, b_gui, c_gui)
        perform_expert_episode(env, Q, args.lookback,
                               partial(update_policy, alpha=alpha))
        episode += 1

    # Main loop: mix guided (expert) and classic (epsilon-greedy) episodes.
    while episode < total_episodes:
        epsilon = anneal(epsilon, args.epsilon_final, a_eps, b_eps, c_eps)
        alpha = anneal(alpha, args.alpha_final, a_alp, b_alp, c_alp)
        guided = anneal(guided, args.guided_final, a_gui, b_gui, c_gui)
        if random.random() < guided:
            # Guided episode driven by the expert policy.
            perform_expert_episode(env, Q, args.lookback,
                                   partial(update_policy, alpha=alpha))
        else:
            # Regular training episode with epsilon-greedy exploration.
            perform_classic_episode(env, Q, args.lookback,
                                    partial(update_policy, alpha=alpha),
                                    partial(get_action, epsilon=epsilon))
        e_2 = env.episode**2
        episode += 1
        if args.evaluate_each and episode % args.evaluate_each == 0:
            score = evaluate(Q, False, args.output + '_{}'.format(index))
            print('Q_{} performs with score {}'.format(index, score))
    return Q
# --- Hyperparameter CLI flags (parser is created earlier in the file) ---
parser.add_argument("--epsilon_final", default=0.0001, type=float, help="Final exploration factor.")
parser.add_argument("--gamma", default=0.98, type=float, help="Discounting factor.")
# NOTE(review): lookback names a count of states yet is parsed as float —
# presumably should be type=int; confirm against perform_*_episode usage.
parser.add_argument("--lookback", default=3, type=float, help="Number of lookback states.")
args = parser.parse_args()
# Create the environment
env = lunar_lander_evaluator.environment()
# Prepare parameters
episodes = args.episodes + args.pretrain_episodes
# Quadratic decay coefficients for value(e) = a*e^2 + b*e + c, with b = 0:
# value(0) = start value (c) and value(episodes) = final value (d).
d_eps = args.epsilon_final
c_eps = args.epsilon
a_eps = (d_eps - c_eps) / (episodes**2)
b_eps = 0
d_alp = args.alpha_final
c_alp = args.alpha
a_alp = (d_alp - c_alp) / (episodes**2)
b_alp = 0
# Guided-episode schedule — a_gui/b_gui are computed past this chunk.
d_gui = args.guided_final