def test_environment_features():
    np.random.seed(0)
    config = Config()
    config.init_noise_var = 0.0
    config.num_obs_features = 4
    env = BoyanChain(config)

    def run_env(e: BoyanChain, s=20):
        for i in range(s):
            print("Step number: {0}".format(i + 1))
            current_state = e.current_state
            next_state, _, observed_features, terminal = e.step()
            print("\tMoved: {0} --> {1}".format(current_state, next_state))
            print("\tObserved Features: {0}".format(observed_features))
            if terminal:
                e.reset()

    run_env(env, 20)

    print("\nAdding 4 features without noise...")
    env.reset()
    env.add_feature(4)
    run_env(env, 20)

    print("\nAdding 4 features with noise...")
    env.reset()
    env.add_feature(4, noise=1)
    run_env(env, 20)
def learning_value_function(sample_size=100000, checkpoint=1000):
    np.random.seed(0)
    config = Config()
    config.init_noise_var = 0.1
    config.num_obs_features = 4
    env = BoyanChain(config)
    theta = np.zeros(config.num_obs_features, dtype=np.float64)
    theta_star = env.optimal_weights
    alpha = 0.005

    def train(th, th_star, e: BoyanChain, ss, ckpt):
        e.reset()
        current_features = e.get_observable_features()
        mean_square_value_diff = 0.0
        for i in range(ss):
            current_value = np.dot(current_features, th)
            optimal_value = np.dot(e.current_features, th_star)
            _, reward, next_features, terminal = e.step()
            next_value = np.dot(next_features, th)
            # semi-gradient TD(0) update with linear function approximation
            temporal_diff = reward + (1 - int(terminal)) * next_value - current_value
            th += alpha * temporal_diff * current_features
            # running estimate of the mean squared value error over the checkpoint window
            mean_square_value_diff += np.square(current_value - optimal_value) / ckpt
            if (i + 1) % ckpt == 0:
                print("Training Step: {0}".format(i + 1))
                print("\tEstimated MSVE: {0:.4f}".format(mean_square_value_diff))
                print("\tTrue MSVE: {0:.4f}".format(e.compute_msve(th)))
                mean_square_value_diff *= 0
            current_features = next_features
            if terminal:
                e.reset()
                current_features = e.get_observable_features()

    print("First phase of training...")
    train(theta, theta_star, env, sample_size, checkpoint)

    # add 4 noiseless features, then warm-start the larger weight vector with the learned weights
    env.add_feature(4, 0.0)
    print("\n\nSecond phase of training...")
    new_theta = np.zeros(8, dtype=np.float64)
    new_theta[:4] += theta
    train(new_theta, theta_star, env, sample_size, checkpoint)
def boyan_chain_test(steps=50000):
    from src.env.BoyanChain import BoyanChain
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.util import Config
    import matplotlib.pyplot as plt

    config = Config()
    checkpoint = 100

    """ Environment Setup """
    config.init_noise_var = 0.1
    config.num_obs_features = 4
    config.max_num_features = 9

    """ AutoTIDBD Setup """
    config.parameter_size = 4
    config.theta = 0.001
    config.tau = 10000
    config.init_stepsize = 0.001

    # to keep track of learning progress
    run_avg_msve = np.zeros(steps // checkpoint, dtype=np.float64)
    current_checkpoint = 0
    avg_msve = 0

    env = BoyanChain(config)
    approximator = LinearFunctionApproximator(config)
    optimizer = AutoTIDBD(config)

    """ Start of Learning """
    curr_obs_feats = env.get_observable_features()
    for s in range(steps):
        state_value = approximator.get_prediction(curr_obs_feats)
        optimal_value = env.compute_true_value()
        # step in the environment
        _, r, next_obs_feats, term = env.step()
        next_state_value = approximator.get_prediction(next_obs_feats)
        # compute td error
        td_error = r + (1 - term) * next_state_value - state_value
        # update weights
        _, _, new_weights = optimizer.update_weight_vector(
            td_error,
            features=curr_obs_feats,
            weights=approximator.get_weight_vector(),
            discounted_next_features=next_obs_feats)
        approximator.update_weight_vector(new_weights)
        # update features
        curr_obs_feats = next_obs_feats
        # keep track of progress
        avg_msve += np.square(state_value - optimal_value) / checkpoint
        # check if terminal state
        if term:
            env.reset()
            curr_obs_feats = env.get_observable_features()
        # store learning progress so far
        if (s + 1) % checkpoint == 0:
            run_avg_msve[current_checkpoint] += avg_msve
            avg_msve *= 0
            current_checkpoint += 1
        if (s + 1) == (steps // 2):
            env.add_feature(k=4, noise=0.0, fake_feature=False)
            approximator.increase_num_features(4)
            optimizer.increase_size(4)
            curr_obs_feats = env.get_observable_features()

    print("The average MSVE is: {0:0.4f}".format(np.average(run_avg_msve)))
    xaxis = np.arange(run_avg_msve.size) + 1
    plt.plot(xaxis, run_avg_msve)
    plt.show()
    plt.close()
def sarsa_zero_test(steps=10000, add_new_centers=False, number_of_irrelevant_features=0):
    import matplotlib.pyplot as plt
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.step_size_methods.sgd import SGD

    # epsilon greedy policy
    def choose_action(av_array: np.ndarray, epsilon):
        p = np.random.rand()
        if p > epsilon:
            # break ties among maximal action values at random
            argmax_av = np.random.choice(np.flatnonzero(av_array == av_array.max()))
            return argmax_av
        else:
            return np.random.randint(av_array.size)

    # for computing action values
    def get_action_values(n, features, approximator_list):
        action_values = np.zeros(n, dtype=np.float64)
        for k in range(n):
            action_values[k] += approximator_list[k].get_prediction(features)
        return action_values

    completed_episodes_per_run = []
    for _ in range(1):
        print("==== Results for Sarsa(0) with Epsilon Greedy Policy ====")
        config = Config()

        # setting up feature function
        config.state_dims = 2
        config.state_lims = np.array(((-1, 1), (-1, 1)), dtype=np.float64)
        # config.initial_centers = np.array(((0.0, 0.0), (-1.8, 0), (1.8, 0), (0.0, -1.8), (0.0, 1.8)), dtype=np.float64)
        config.initial_centers = np.array(
            ((0.0, 0.0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25), (-0.25, 0.25)),
            dtype=np.float64)
        config.sigma = 0.5
        config.init_noise_mean = 0.0
        config.init_noise_var = 0.01
        feature_function = RadialBasisFunction(config)

        # setting up environment
        config.norm_state = True
        env = MountainCar(config)

        # function approximator and optimizer parameters
        num_actions = 3
        random_action_prob = 0.1
        gamma = 0.99
        config.num_obs_features = feature_function.num_features
        config.max_num_features = 200   # as long as this is more than 12
        config.num_actions = num_actions
        config.alpha = 0.005
        config.rescale = False
        config.parameter_size = feature_function.num_features

        # one function approximator and one optimizer instance for each action
        function_approximator = []
        optimizer = []
        for i in range(num_actions):
            function_approximator.append(LinearFunctionApproximator(config))
            optimizer.append(SGD(config))

        # setting up summaries
        all_episodes_return = []
        episode_return = 0

        # setting up initial state, action, features, and action values
        curr_s = env.get_current_state()
        curr_features = feature_function.get_observable_features(curr_s)
        curr_avs = get_action_values(num_actions, curr_features, function_approximator)
        curr_a = choose_action(curr_avs, random_action_prob)
        midpoint_episode = 0

        for i in range(steps):
            # get current action values
            curr_avs = get_action_values(num_actions, curr_features, function_approximator)
            # execute current action
            next_s, r, terminal = env.step(curr_a)
            next_features = feature_function.get_observable_features(next_s)
            # get next action values and action
            next_action_values = get_action_values(num_actions, next_features, function_approximator)
            next_action = choose_action(next_action_values, random_action_prob)
            # compute TD error for Sarsa(0)
            td_error = r + gamma * (1 - terminal) * next_action_values[next_action] - curr_avs[curr_a]
            # update weight vector
            _, ss, new_weights = optimizer[curr_a].update_weight_vector(
                td_error, curr_features,
                function_approximator[curr_a].get_weight_vector())
            function_approximator[curr_a].update_weight_vector(new_weights)
            # set current features and action
            curr_features = next_features
            curr_a = next_action
            # keep track of sum of rewards
            episode_return += r

            # if terminal state
            if terminal:
                env.reset()
                all_episodes_return.append(episode_return)
                episode_return *= 0
                curr_s = env.get_current_state()
                curr_features = feature_function.get_observable_features(curr_s)
                curr_avs = get_action_values(num_actions, curr_features, function_approximator)
                curr_a = choose_action(curr_avs, random_action_prob)

            # if midpoint of training
            if (i + 1) == (steps // 2):
                if add_new_centers:
                    new_centers = np.array(
                        ((0, 0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25), (-0.25, 0.25)),
                        dtype=np.float64)
                    feature_function.add_centers(new_centers, noise_var=0, noise_mean=0)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(new_centers.shape[0])
                        optimizer[k].increase_size(new_centers.shape[0], init_stepsize=0.25)
                if number_of_irrelevant_features > 0:
                    new_feature_mean = 0.0
                    new_feature_var = 0.05
                    fake_features = True
                    feature_function.add_feature(number_of_irrelevant_features,
                                                 noise_mean=new_feature_mean,
                                                 noise_var=new_feature_var,
                                                 fake_feature=fake_features)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(number_of_irrelevant_features)
                        optimizer[k].increase_size(number_of_irrelevant_features)
                # recompute the features of the environment's current state with the expanded feature function
                curr_s = env.get_current_state()
                curr_features = feature_function.get_observable_features(curr_s)
                midpoint_episode = len(all_episodes_return)

        completed_episodes_per_run.append(len(all_episodes_return))
        print("Number of episodes completed: {0}".format(len(all_episodes_return)))

    print("Average episodes completed: {0:0.4f}".format(np.average(completed_episodes_per_run)))
    print("Return per episode:\n", all_episodes_return)
    plt.plot(np.arange(len(all_episodes_return)) + 1, all_episodes_return)
    plt.vlines(x=midpoint_episode, ymin=-800, ymax=0)
    plt.ylim((-800, 0))
    plt.show()
    plt.close()
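

# A minimal sketch of how these tests might be invoked as a script. Assumptions:
# numpy (np), Config, BoyanChain, AutoTIDBD, RadialBasisFunction, and MountainCar
# are imported at module level above this section, and the argument values below
# are illustrative defaults rather than settings prescribed by the tests themselves.
if __name__ == "__main__":
    test_environment_features()
    learning_value_function(sample_size=100000, checkpoint=1000)
    boyan_chain_test(steps=50000)
    sarsa_zero_test(steps=10000, add_new_centers=True, number_of_irrelevant_features=0)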