def run_Q_learning(seed, epsilon_current=0.1, max_episodes=10000, epsilon_decrease=1., start_at_bottom=False):
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)
    gamma = 0.999
    learning_rate = 0.05
    epsilon_min = 0.05
    solver = Solver(
        # learning parameters
        gamma=gamma, learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    bottom_state = np.asarray([-0.5, 0])
    bottom_state_val = []
    success_rates = []
    episodes_gain = []
    episodes_bellman_err = []
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(
            env, solver, is_train=True, epsilon=epsilon_current,
            start_at_bottom=start_at_bottom)
        episodes_gain.append(episode_gain)
        # reduce epsilon if required
        epsilon_current *= epsilon_decrease
        epsilon_current = max(epsilon_current, epsilon_min)
        episodes_bellman_err.append(mean_delta)
        bottom_state_features = solver.get_features(bottom_state)
        bottom_state_max_action = solver.get_max_action(bottom_state)
        bottom_state_val.append(
            solver.get_q_val(bottom_state_features, bottom_state_max_action))
        # termination condition:
        if episode_index % 10 == 9:
            test_gains = [
                run_episode(env, solver, is_train=False, epsilon=0.)[0]
                for _ in range(10)
            ]
            mean_test_gain = np.mean(test_gains)
            success_rates.append(np.mean(np.asarray(test_gains) > -200))
            print(f'tested 10 episodes: mean gain is {mean_test_gain}')
            if mean_test_gain >= -75.:
                print(f'solved in {episode_index} episodes')
                break
    return episodes_gain, success_rates, bottom_state_val, episodes_bellman_err
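# Example driver (a minimal sketch; assumes run_episode, Solver and the
# matplotlib import used elsewhere in this repo are available): train with a
# decaying epsilon schedule over a few seeds and plot the per-episode gains.
# The seed values are arbitrary.
def compare_q_learning_seeds(seeds=(123, 234, 345)):
    plt.figure()
    for seed in seeds:
        gains, _, _, _ = run_Q_learning(seed, epsilon_current=1.0,
                                        epsilon_decrease=0.99)
        plt.plot(gains, label=f'seed {seed}')
    plt.title('Training gain per episode (decaying epsilon)')
    plt.xlabel('Episode')
    plt.ylabel('Total reward')
    plt.legend()
    plt.show()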
def mean_std_of_states():
    env = MountainCarWithResetEnv()
    samples_to_collect = 100000
    states, actions, rewards, next_states, done_flags = DataCollector(env).collect_data(samples_to_collect)
    all_states = np.concatenate((states, next_states))
    states_mean = np.mean(all_states, axis=0)
    states_std = np.std(all_states, axis=0)
    print("states_mean: {}, states_std: {}".format(states_mean, states_std))
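# A minimal sketch of how the printed statistics would be applied, mirroring
# the z-score standardization that DataTransformer performs internally (its
# exact attribute names are not assumed here):
def standardize_states(states, states_mean, states_std):
    # z-score each state dimension (position, velocity)
    return (states - states_mean) / states_std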
def __discretize_state_space(n: int = 100):
    states = MountainCarWithResetEnv().observation_space
    positions = np.linspace(states.low[0], states.high[0], n)
    velocities = np.linspace(states.low[1], states.high[1], n)
    position_v, velocity_v = np.meshgrid(positions, velocities, sparse=False, indexing='ij')
    return position_v, velocity_v, n
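# Example use of the discretized grid (a sketch; assumes an agent exposing a
# vectorized greedy-action method, such as QLearningAgent.next_a below):
def plot_policy_on_grid(agent, n=100):
    position_v, velocity_v, _ = __discretize_state_space(n)
    grid_states = np.column_stack((position_v.ravel(), velocity_v.ravel()))
    # evaluate the greedy action on every grid point and draw it as a heatmap
    grid_actions = agent.next_a(grid_states).reshape(position_v.shape)
    plt.pcolormesh(position_v, velocity_v, grid_actions, shading='auto')
    plt.colorbar(label='Greedy action')
    plt.xlabel('Position')
    plt.ylabel('Velocity')
    plt.show()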
def training_the_model(samples_to_collect=100000, seed=100):
    number_of_kernels_per_dim = [10, 8]
    gamma = 0.999
    w_updates = 20
    evaluation_number_of_games = 50
    evaluation_max_steps_per_game = 300
    np.random.seed(seed)
    env = MountainCarWithResetEnv()
    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(env).collect_data(samples_to_collect)
    # get data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print(f'Data Success Rate {data_success_rate}')
    # standardize data
    data_transformer = DataTransformer()
    data_transformer.set_using_states(np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)
    # process with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    # encode all states:
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(next_states)
    # set a new linear policy
    linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3, True)
    # but set the weights as random
    linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape))
    # start an object that evaluates the success rate over time
    evaluator = GamePlayer(env, data_transformer, feature_extractor, linear_policy)
    success_rate_vs_iteration = list()
    for lspi_iteration in range(w_updates):
        print(f'Starting LSPI iteration {lspi_iteration}')
        new_w = compute_lspi_iteration(
            encoded_states, encoded_next_states, actions, rewards, done_flags,
            linear_policy, gamma)
        norm_diff = linear_policy.set_w(new_w)
        success_rate = evaluator.play_games(evaluation_number_of_games, evaluation_max_steps_per_game)
        success_rate_vs_iteration.append(success_rate)
        if norm_diff < 0.00001:
            break
    print('LSPI Done')
    return success_rate_vs_iteration
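# Example driver (a sketch; the seed values are arbitrary): average the LSPI
# success-rate curve over several seeds and plot it. Runs are truncated to
# the shortest curve since training can stop early on convergence.
def plot_lspi_training(seeds=(100, 200, 300)):
    curves = [training_the_model(seed=s) for s in seeds]
    shortest = min(len(c) for c in curves)
    mean_curve = np.mean([c[:shortest] for c in curves], axis=0)
    plt.plot(range(1, shortest + 1), mean_curve)
    plt.title('LSPI success rate vs. iteration (mean over seeds)')
    plt.xlabel('Iteration')
    plt.ylabel('Success rate')
    plt.grid(True)
    plt.show()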
def lspi_data_sample(N=3000):
    env = MountainCarWithResetEnv()
    goal_pos = 0.5
    min_pos = -1.2
    max_pos = 0.6
    min_speed = -0.07
    max_speed = 0.07
    data = []  # kept empty; callers unpack it alongside the arrays below
    rewards = np.zeros([N, 1])
    states = np.zeros([N, 2])
    actions = np.zeros(N)
    next_states = np.zeros([N, 2])
    for i in range(N):
        # sample a state-action pair uniformly at random
        pos = (max_pos - min_pos) * np.random.sample() + min_pos
        speed = (max_speed - min_speed) * np.random.sample() + min_speed
        action = np.random.choice(3)
        states[i, :] = np.array([pos, speed])
        actions[i] = action
        if pos >= goal_pos:
            # goal states are absorbing: reward 1, next state unchanged
            rewards[i, 0] = 1
            next_states[i, :] = np.array([pos, speed])
        else:
            env.reset_specific(pos, speed)
            s_next, reward, _, _ = env.step(action)
            rewards[i, 0] = reward
            next_states[i, :] = s_next
    return data, states, actions, rewards, next_states
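# Quick sanity check (a sketch): draw a small batch and inspect the array
# shapes and the fraction of rewarded (goal) transitions.
def check_lspi_samples(N=1000):
    _, s, a, r, s_next = lspi_data_sample(N)
    print('states:', s.shape, 'actions:', a.shape, 'next_states:', s_next.shape)
    print('fraction of goal transitions:', float(np.mean(r == 1)))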
def test_lspi(N_list=[3000]):
    results = []
    fig, ax = plt.subplots()
    for N in N_list:
        N = int(N)
        env = MountainCarWithResetEnv()
        high = -0.4
        low = -0.6
        init_states = [((high - low) * np.random.sample() + low, 0)
                       for i in range(10)]
        max_iter = 1000
        total_success = [[] for _ in range(5)]
        for i in range(5):
            np.random.seed(seed=i)
            data, states, actions, rewards, next_states = lspi_data_sample(N)
            theta_n = list(train_lspi(data, states, actions, rewards, next_states))
            success_theta = []
            for theta in theta_n:
                success_rate = 0
                for init_s in init_states:
                    env.reset_specific(*init_s)
                    is_done = False
                    # first step
                    a = next_a(np.array(init_s).reshape([1, 2]), theta)
                    for j in range(max_iter):
                        next_s, r, is_done, _ = env.step(int(a))
                        a = next_a(next_s.reshape([1, 2]), theta)
                        if is_done:
                            success_rate += 1.0
                            break
                success_theta.append(success_rate / len(init_states))
            total_success[i] = success_theta
        # pad shorter runs with their final value so the curves align
        max_len = max(len(run) for run in total_success)
        for run in total_success:
            run.extend([run[-1]] * (max_len - len(run)))
        res = np.array(total_success)
        res_mean = np.mean(res, axis=0)
        results.append(list(res_mean))
    # pad the per-N curves to a common length as well
    max_len = max(len(res) for res in results)
    for res in results:
        res.extend([res[-1]] * (max_len - len(res)))
    for N, res_mean in zip(N_list, results):
        it = list(range(1, len(res_mean) + 1))
        ax.plot(it, res_mean, label='N = ' + str(int(N)))
    ax.grid(True)
    if len(N_list) > 1:
        plt.title('Average success rate per iteration for different amounts of samples')
    else:
        plt.title('Average success rate per iteration')
    plt.xlabel('Iteration')
    plt.ylabel('Success rate')
    plt.legend(loc='lower left')
    plt.ylim(-0.2, 1.2)
    plt.show()
    return res_mean
    plt.xlabel('Position', fontsize=fsize)
    plt.ylabel('Velocity', fontsize=fsize)
    plt.show()

# %%
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        N = int(sys.argv[1])
        print(test_lspi([N]))
    else:
        print(test_lspi())
if __name__ == '__main__':
    samples_to_collect = 100000
    # samples_to_collect = 150000
    # samples_to_collect = 10000
    number_of_kernels_per_dim = [10, 8]
    gamma = 0.99
    w_updates = 100
    evaluation_number_of_games = 10
    evaluation_max_steps_per_game = 1000
    np.random.seed(123)
    # np.random.seed(234)
    env = MountainCarWithResetEnv()
    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(env).collect_data(samples_to_collect)
    # get data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print(f'success rate {data_success_rate}')
    # standardize data
    data_transformer = DataTransformer()
    data_transformer.set_using_states(np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)
    # process with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    # encode all states:
if __name__ == "__main__": seeds = [123] epsilons = [1] gamma = 0.999 learning_rate = 0.05 epsilon_decrease = 0.99 epsilon_min = 0.01 max_episodes = 10000 seed_rewards, seed_performance, seed_bottom_val, seed_bellman_err_avg = [], [], [], [] for seed in seeds: env = MountainCarWithResetEnv() np.random.seed(seed) env.seed(seed) solver = Solver( # learning parameters gamma=gamma, learning_rate=learning_rate, # feature extraction parameters number_of_kernels_per_dim=[5, 7], # env dependencies (DO NOT CHANGE): number_of_actions=env.action_space.n, ) for epsilon_current in epsilons: rewards, performance, bottom_val, bellman_err_avg, bellman_err = [], [], [], [], []
            # action = int(np.sign(state[1]) + 1)
            # w = np.array([-0.1, 0, 0, 0.1, 0.1, 0])
            action = self.greedy_policy(state, [False])[0]
            state, reward, done, _ = env.step(action)
            rewards += reward
            if done or itr > 10e2:
                break
            itr += 1
        rew.append(rewards)
        return np.mean(rew), np.std(rew)


if __name__ == '__main__':
    env = MountainCarWithResetEnv()
    # # run no force
    # env.reset()
    # env.render()
    # is_done = False
    # while not is_done:
    #     _, r, is_done, _ = env.step(1)
    #     env.render()
    #     print(r)
    # # run random forces
    # env.reset()
    # env.render()
    # is_done = False
    # while not is_done:
    #     _, r, is_done, _ = env.step(env.action_space.sample())  # take a random action
    #     env.render()
class QLearningAgent:
    def __init__(self):
        self.game = MountainCarWithResetEnv()
        self.reset_theta()
        # Constants used for data standardization
        self.pos_mu = (self.game.min_position + self.game.max_position) / 2
        self.pos_sigma = (self.game.max_position - self.game.min_position) / np.sqrt(12)
        self.speed_mu = 0
        self.speed_sigma = 2 * self.game.max_speed / np.sqrt(12)
        # Cache of samples used for visualizing the policy
        self.vis_samples = None

    def reset_theta(self):
        self.theta = np.random.normal(size=(1, 78))

    def reset(self, state=None):
        if state is None:
            return self.game.reset()
        return self.game.reset_specific(*state)

    def next_a(self, state):
        # Greedy action for each row of `state` under the current theta
        N = np.shape(state)[0]
        Q_est = np.zeros([N, 3])
        Q_est[:, 0] = self.theta.dot(self.extract_features(state, np.zeros(N)).T)
        Q_est[:, 1] = self.theta.dot(self.extract_features(state, np.ones(N)).T)
        Q_est[:, 2] = self.theta.dot(self.extract_features(state, 2 * np.ones(N)).T)
        action = np.argmax(Q_est, axis=1)
        return 2 - action

    def q_max(self, state):
        # max_a Q(s, a) for each row of `state`
        N = np.shape(state)[0]
        Q_est = np.zeros([N, 3])
        Q_est[:, 0] = self.theta.dot(self.extract_features(state, np.zeros(N)).T)
        Q_est[:, 1] = self.theta.dot(self.extract_features(state, np.ones(N)).T)
        Q_est[:, 2] = self.theta.dot(self.extract_features(state, 2 * np.ones(N)).T)
        return np.max(Q_est, axis=1)

    def q(self, state, action):
        Q_est = self.theta.dot(self.extract_features(state, action * np.ones(1)).T)
        return Q_est

    def extract_features(self, s, actions):
        # Block features: the RBF encoding of s is written into the block of
        # columns that corresponds to the chosen action; other blocks stay 0.
        N_a = 3
        e_s = self.rbf(s)
        N_f = np.shape(e_s)[1]
        feats = np.zeros([np.shape(e_s)[0], N_a * N_f])
        for i, a in enumerate(actions):
            np.put(feats[i, :], range(int(a) * N_f, int(a + 1) * N_f), e_s[i, :])
        return feats

    def rbf(self, s):
        # Implementation of RBF features
        # pos, speed statistics should be global
        n_s = np.zeros(s.shape)
        n_s[:, 0] = (s[:, 0] - self.pos_mu) / self.pos_sigma
        n_s[:, 1] = (s[:, 1] - self.speed_mu) / self.speed_sigma
        centers = []
        for i in -1.2, -0.6, 0, 0.6, 1.2:
            for j in -0.07, -0.03, 0, 0.03, 0.07:
                centers.append((i, j))
        n_centers = np.array([((c[0] - self.pos_mu) / self.pos_sigma,
                               (c[1] - self.speed_mu) / self.speed_sigma)
                              for c in centers])
        scales = [1 for c in centers]
        # the last column stays 1 and acts as a bias feature
        feats = np.ones([n_s.shape[0], np.size(scales) + 1])
        for i, n_c in enumerate(n_centers):
            feats[:, i] = np.exp(-scales[i] * np.linalg.norm(n_s - n_c, axis=1))
        return feats

    def visualize(self):
        if self.vis_samples is None:
            ret = lspi_data_sample(10000)
            self.vis_samples = ret[1]
        N = self.vis_samples.shape[0]
        opt_a = self.next_a(self.vis_samples)
        plt.clf()
        ac = [0, 1, 2]
        for a, color, label in zip(ac, ['tab:blue', 'tab:orange', 'tab:green'],
                                   ['LEFT', 'STAY', 'RIGHT']):
            xy = self.vis_samples[a == opt_a, :]
            plt.scatter(xy[:, 0], xy[:, 1], c=color, label=label, edgecolors='none')
        plt.legend()
        plt.grid(True)
        plt.title('Sample size - {}'.format(N))
        plt.xlabel('Position')
        plt.ylabel('Velocity')
        plt.pause(0.1)

    def gather_data(self, epsilon, iterations_per_game=1000, games=5):
        # Collect transitions with an epsilon-greedy policy
        states = np.zeros((iterations_per_game * games, 2))
        actions = np.zeros((iterations_per_game * games, 1))
        next_states = np.zeros((iterations_per_game * games, 2))
        rewards = np.zeros((iterations_per_game * games, 1))
        data = (states, actions, next_states, rewards)
        success_count = 0
        data_index = 0
        for g in range(games):
            state = self.reset()
            state = state.reshape((1, 2))
            for i in range(iterations_per_game):
                if np.random.uniform() > epsilon:
                    action = self.next_a(state)[0]
                else:
                    action = np.random.choice(3)
                next_state, reward, is_done, _ = self.game.step(action)
                success_count += np.sum(reward)
                states[data_index, :] = state
                actions[data_index, :] = action
                next_states[data_index, :] = next_state
                rewards[data_index, :] = reward
                data_index += 1
                state = np.array(next_state).reshape((1, 2))
                if is_done:
                    break
        success_rate = success_count / games
        return data, success_rate, data_index

    def train_step(self, alpha, data, batch_size=100, gamma=0.999):
        # Sample a minibatch that always includes every rewarded transition
        data_length = data[0].shape[0]
        reward_indices = (data[3] == 1).reshape(data_length)
        reward_count = reward_indices.sum()
        batch_indices = np.random.randint(0, data_length, batch_size - reward_count)
        batch_marker = np.zeros(data_length, dtype=bool)
        batch_marker[batch_indices] = True
        batch_marker[reward_indices] = True
        batch_size = batch_marker.sum()
        states = data[0][batch_marker]
        actions = data[1][batch_marker]
        next_states = data[2][batch_marker]
        rewards = data[3][batch_marker]
        update_step = 0
        for i in range(batch_size):
            # TD error times the feature vector of the taken action
            coeff = (rewards[i]
                     + gamma * self.q_max(next_states[i].reshape((1, 2)))
                     - self.q(states[i].reshape((1, 2)), actions[i]))
            step = self.extract_features(states[i].reshape((1, 2)), actions[i]) * coeff
            update_step += step
        # normalize the update so its largest entry has magnitude 1
        max_element = np.max(np.abs(update_step))
        return self.theta + alpha * update_step / (max_element or 1)

    def reset_random(self):
        init_state = (np.random.uniform(-1.2, 0.6), np.random.uniform(-0.07, 0.07))
        return self.reset(init_state)

    def train(self, init_epsilon=1, init_alpha=1, max_iterations=30, visualise=True, test_states=[]):
        alpha = init_alpha
        epsilon = init_epsilon
        success_rates = np.zeros((len(test_states), max_iterations))
        for i in range(max_iterations):
            data, win_pct, max_ind = self.gather_data(epsilon)
            data = (data[0][:max_ind, :], data[1][:max_ind, :],
                    data[2][:max_ind, :], data[3][:max_ind, :])
            old_theta = self.theta
            for j in range(10):
                self.theta = self.train_step(alpha, data)
            theta_diff = self.theta - old_theta
            diff_max = np.max(np.abs(theta_diff))
            theta_max = np.max(np.abs(self.theta))
            success_rates[:, i] = self.test_train_iteration(test_states)
            avg_rate = np.average(success_rates[:, i])
            print("Iter", i, "train_iters", max_ind, "train_win_pct", win_pct,
                  "test_win_pct", avg_rate, "alpha", alpha, "ep", epsilon,
                  "theta_new - theta (max) =", diff_max, "theta_max", theta_max)
            epsilon = 0.9 * epsilon
            alpha = 0.8 * alpha
            if visualise:
                self.visualize()
        return success_rates

    def play(self, init_state=None, render=True, max_iterations=1000):
        state = self.reset(init_state).reshape((1, 2))
        done = False
        for i in range(max_iterations):
            action = int(self.next_a(state))
            next_state, reward, is_done, _ = self.game.step(action)
            if render:
                self.game.render()
            state = np.array(next_state).reshape((1, 2))
            if is_done:
                done = True
                break
        self.game.close()
        return done

    def test_train_iteration(self, test_init_states):
        results = np.zeros(len(test_init_states))
        for state_idx, init_state in enumerate(test_init_states):
            result = self.play(init_state, render=False)
            results[state_idx] = int(result)
        return results

    def get_test_states(self, count=10):
        return [(np.random.uniform(low=-0.6, high=-0.4), 0) for i in range(count)]

    def test_model(self, training_cycles=5, test_states=None, **training_args):
        success_rates = None
        if test_states is None:
            test_states = self.get_test_states()
        for t in range(training_cycles):
            self.reset_theta()
            print("*** TRAINING EXPERIMENT {} ***".format(t))
            rates = self.train(test_states=test_states, **training_args)
            print("*** RESULT ***")
            print(rates)
            if success_rates is None:
                success_rates = rates
            else:
                success_rates += rates
        success_rates /= training_cycles
        return test_states, success_rates

    def plot_success_rates(self, success_rates):
        plt.figure()
        avg = np.average(success_rates, axis=0)
        plt.plot(avg)
        plt.title('Average success rate per iteration')
        plt.xlabel('Iteration')
        plt.ylabel('Success rate')
        plt.show()
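# Example usage of the agent (a sketch): run a few training experiments,
# plot the averaged test success rate, then replay one greedy episode with
# rendering from the first test start state.
def run_q_learning_agent_experiment():
    agent = QLearningAgent()
    test_states, success_rates = agent.test_model(training_cycles=5, visualise=False)
    agent.plot_success_rates(success_rates)
    agent.play(test_states[0])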
def run_q_learning_training(seed, epsilon=0.1, max_episodes=1000):
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)
    gamma = 0.999
    learning_rate = 0.01
    solver = Solver(
        # learning parameters
        gamma=gamma, learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    train_statistics = defaultdict(list)
    bellman_error = list()
    bellman_error_index = 100
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(env, solver, is_train=True, epsilon=epsilon)
        bellman_error.append(mean_delta)
        print(f'After {episode_index}, reward = {episode_gain}, epsilon {epsilon}, average error {mean_delta}')
        # track the greedy value of the initial state
        env.reset()
        init_state = env.state
        phi_st_0 = solver.get_state_action_features(init_state, 0)
        phi_st_1 = solver.get_state_action_features(init_state, 1)
        phi_st_2 = solver.get_state_action_features(init_state, 2)
        Q_st_0 = phi_st_0.transpose() @ solver.theta
        Q_st_1 = phi_st_1.transpose() @ solver.theta
        Q_st_2 = phi_st_2.transpose() @ solver.theta
        train_statistics["init_state"].append(max(Q_st_0, Q_st_1, Q_st_2))
        train_statistics["reward"].append(episode_gain)
        if episode_index % 100 == 99:
            train_statistics["bellman_error"].append(np.mean(bellman_error))
            train_statistics["bellman_error_index"].append(bellman_error_index)
            bellman_error_index += 100
            bellman_error = list()
        if episode_index % 10 == 9:
            test_gains = [
                run_episode(env, solver, is_train=False, epsilon=0.)[0]
                for _ in range(10)
            ]
            mean_test_gain = np.mean(test_gains)
            train_statistics["performance"].append(mean_test_gain)
            print(f'tested 10 episodes: mean gain is {mean_test_gain}')
            if mean_test_gain >= -75.:
                print(f'solved in {episode_index} episodes')
                break
    return train_statistics
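# Example driver (a sketch): collect the training statistics and plot the
# test performance curve, which is recorded once every 10 episodes.
def plot_training_statistics(seed=123):
    stats = run_q_learning_training(seed)
    episodes = [10 * (i + 1) for i in range(len(stats["performance"]))]
    plt.plot(episodes, stats["performance"])
    plt.title('Mean test gain vs. training episode')
    plt.xlabel('Episode')
    plt.ylabel('Mean gain over 10 test episodes')
    plt.grid(True)
    plt.show()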
def run_lspi(seed, w_updates=20, samples_to_collect=100000,
             evaluation_number_of_games=1, evaluation_max_steps_per_game=200,
             thresh=0.00001, only_final=False):
    """
    This is the main LSPI function.
    :param seed: random seed for the run
    :param w_updates: how many w updates to do
    :param samples_to_collect: how many samples to collect
    :param evaluation_number_of_games: how many game evaluations to do
    :param evaluation_max_steps_per_game: how many steps to allow the evaluation game to run
    :param thresh: the threshold for the stopping condition
    :param only_final: run evaluation only at the end of the run
    :return: None
    """
    res_dir = './Results/'
    np.random.seed(seed)
    number_of_kernels_per_dim = [12, 10]
    gamma = 0.999
    env = MountainCarWithResetEnv()
    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(env).collect_data(samples_to_collect)
    # get data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print('success rate: {}'.format(data_success_rate))
    # standardize data
    data_transformer = DataTransformer()
    data_transformer.set_using_states(np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)
    # process with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    # encode all states:
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(next_states)
    # set a new linear policy
    linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3, True)
    # but set the weights as random
    linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape))
    # start an object that evaluates the success rate over time
    evaluator = GamePlayer(env, data_transformer, feature_extractor, linear_policy)
    performances = []
    if not only_final:
        performances.append(
            evaluator.play_games(evaluation_number_of_games, evaluation_max_steps_per_game))
    # optionally resume from weights saved in a previous run
    read = False
    if read:
        with open(res_dir + 'weight.pickle', 'rb') as handle:
            new_w = pickle.load(handle)
        linear_policy.set_w(np.expand_dims(new_w, 1))
    for lspi_iteration in range(w_updates):
        print('starting lspi iteration {}'.format(lspi_iteration))
        new_w = compute_lspi_iteration(
            encoded_states, encoded_next_states, actions, rewards, done_flags,
            linear_policy, gamma)
        with open(res_dir + 'weight.pickle', 'wb') as handle:
            pickle.dump(new_w, handle, protocol=pickle.HIGHEST_PROTOCOL)
        norm_diff = linear_policy.set_w(new_w)
        if not only_final:
            performances.append(
                evaluator.play_games(evaluation_number_of_games, evaluation_max_steps_per_game))
        if norm_diff < thresh:
            break
    print('done lspi')
    if not only_final:
        with open(res_dir + 'perf' + str(seed) + '.pickle', 'wb') as handle:
            pickle.dump(performances, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if only_final:
        score = evaluator.play_games(evaluation_number_of_games, evaluation_max_steps_per_game)
        with open(res_dir + 'final_perf' + str(samples_to_collect) + '.pickle', 'wb') as handle:
            pickle.dump(score, handle, protocol=pickle.HIGHEST_PROTOCOL)
    evaluator.play_game(evaluation_max_steps_per_game, render=True)
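# Example entry point (a sketch; the seed values are arbitrary): run LSPI for
# several seeds, writing one performance pickle per seed into ./Results/
# (the directory must exist beforehand).
if __name__ == '__main__':
    for seed in (123, 234, 345):
        run_lspi(seed, evaluation_number_of_games=10)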