import numpy as np

# MountainCarWithResetEnv, Solver and run_episode are assumed to be defined
# elsewhere in this repository.


def run_Q_learning(seed, epsilon_current=0.1, max_episodes=10000,
                   epsilon_decrease=1., start_at_bottom=False):
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)
    gamma = 0.999
    learning_rate = 0.05
    epsilon_min = 0.05
    solver = Solver(
        # learning parameters
        gamma=gamma,
        learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    bottom_state = np.asarray([-0.5, 0])
    bottom_state_val = []
    success_rates = []
    episodes_gain = []
    episodes_bellman_err = []
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(
            env, solver, is_train=True, epsilon=epsilon_current,
            start_at_bottom=start_at_bottom)
        episodes_gain.append(episode_gain)
        # reduce epsilon if required
        epsilon_current *= epsilon_decrease
        epsilon_current = max(epsilon_current, epsilon_min)
        episodes_bellman_err.append(mean_delta)
        # track the value of the bottom state under its greedy action
        bottom_state_features = solver.get_features(bottom_state)
        bottom_state_max_action = solver.get_max_action(bottom_state)
        bottom_state_val.append(
            solver.get_q_val(bottom_state_features, bottom_state_max_action))
        # termination condition: evaluate the greedy policy every 10 episodes
        if episode_index % 10 == 9:
            test_gains = [
                run_episode(env, solver, is_train=False, epsilon=0.)[0]
                for _ in range(10)
            ]
            mean_test_gain = np.mean(test_gains)
            success_rates.append(np.mean(np.asarray(test_gains) > -200))
            print(f'tested 10 episodes: mean gain is {mean_test_gain}')
            if mean_test_gain >= -75.:
                print(f'solved in {episode_index} episodes')
                break
    return episodes_gain, success_rates, bottom_state_val, episodes_bellman_err
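A minimal usage sketch for the function above (the seed value, the matplotlib plotting, and the subplot layout are illustrative assumptions, not part of the original experiment setup):

import matplotlib.pyplot as plt

# Hypothetical driver: train with one seed and plot the statistics
# returned by run_Q_learning above.
episodes_gain, success_rates, bottom_state_val, episodes_bellman_err = \
    run_Q_learning(seed=123, epsilon_current=0.1)

fig, axes = plt.subplots(2, 2, figsize=(10, 6))
axes[0, 0].plot(episodes_gain)
axes[0, 0].set_title('total reward per training episode')
axes[0, 1].plot(success_rates)
axes[0, 1].set_title('success rate (evaluated every 10 episodes)')
axes[1, 0].plot(bottom_state_val)
axes[1, 0].set_title('approximate value of the bottom state [-0.5, 0]')
axes[1, 1].plot(episodes_bellman_err)
axes[1, 1].set_title('mean Bellman error per episode')
plt.tight_layout()
plt.show()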
seeds = [123]
epsilons = [1]
gamma = 0.999
learning_rate = 0.05
epsilon_decrease = 0.99
epsilon_min = 0.01
max_episodes = 10000

seed_rewards, seed_performance, seed_bottom_val, seed_bellman_err_avg = [], [], [], []
for seed in seeds:
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)
    solver = Solver(
        # learning parameters
        gamma=gamma,
        learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[5, 7],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    for epsilon_current in epsilons:
        rewards, performance, bottom_val, bellman_err_avg, bellman_err = [], [], [], [], []
        for episode_index in range(0, max_episodes):
            episode_gain, mean_delta = run_episode(
                env, solver, is_train=True, epsilon=epsilon_current)
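A brief note on the epsilon schedule implied by the constants above: assuming the same decay rule as in run_Q_learning (multiply by epsilon_decrease each episode, clip at epsilon_min), epsilon follows the geometric schedule epsilon_n = epsilon_0 * 0.99^n. A small illustrative check, derived from these constants rather than taken from the original run:

import math

# Starting from epsilon_0 = 1 and multiplying by 0.99 every episode,
# the floor epsilon_min = 0.01 is reached after about
# log(0.01) / log(0.99) ~ 458 episodes.
episodes_to_floor = math.log(0.01) / math.log(0.99)
print(round(episodes_to_floor))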
from collections import defaultdict


def run_q_learning_training(seed, epsilon=0.1, max_episodes=1000):
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)
    gamma = 0.999
    learning_rate = 0.01
    solver = Solver(
        # learning parameters
        gamma=gamma,
        learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    train_statistics = defaultdict(list)
    bellman_error = list()
    bellman_error_index = 100
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(env, solver, is_train=True,
                                               epsilon=epsilon)
        bellman_error.append(mean_delta)
        print(f'After {episode_index}, reward = {episode_gain}, '
              f'epsilon {epsilon}, average error {mean_delta}')
        # track the greedy value of the initial (reset) state
        env.reset()
        init_state = env.state
        phi_st_0 = solver.get_state_action_features(init_state, 0)
        phi_st_1 = solver.get_state_action_features(init_state, 1)
        phi_st_2 = solver.get_state_action_features(init_state, 2)
        Q_st_0 = phi_st_0.transpose() @ solver.theta
        Q_st_1 = phi_st_1.transpose() @ solver.theta
        Q_st_2 = phi_st_2.transpose() @ solver.theta
        train_statistics["init_state"].append(max(Q_st_0, Q_st_1, Q_st_2))
        train_statistics["reward"].append(episode_gain)
        # average the Bellman error over windows of 100 episodes
        if episode_index % 100 == 99:
            train_statistics["bellman_error"].append(np.mean(bellman_error))
            train_statistics["bellman_error_index"].append(bellman_error_index)
            bellman_error_index += 100
            bellman_error = list()
        # termination condition: evaluate the greedy policy every 10 episodes
        if episode_index % 10 == 9:
            test_gains = [
                run_episode(env, solver, is_train=False, epsilon=0.)[0]
                for _ in range(10)
            ]
            mean_test_gain = np.mean(test_gains)
            train_statistics["performance"].append(mean_test_gain)
            print(f'tested 10 episodes: mean gain is {mean_test_gain}')
            if mean_test_gain >= -75.:
                print(f'solved in {episode_index} episodes')
                break
    return train_statistics
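The three Q_st_* lines in the loop above evaluate the linear approximation Q(s, a) = phi(s, a)^T theta for each of the three actions and keep the maximum. A compact, hypothetical helper doing the same thing (assuming the same Solver API as above) could look like:

def greedy_value(solver, state, num_actions=3):
    # Linear Q-function approximation: Q(s, a) = phi(s, a)^T theta,
    # maximized over the available actions.
    q_values = [
        solver.get_state_action_features(state, action).transpose() @ solver.theta
        for action in range(num_actions)
    ]
    return max(q_values)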