def __init__(self, n_neurons, tau, eta, gamma, lambda_eligibility):
    self.mountain_car = mountaincar.MountainCar()

    # Parameters
    self.n_neurons = n_neurons
    self.tau = tau
    self.lambda_eligibility = lambda_eligibility
    self.gamma = gamma
    self.eta = eta

    # Defines the neural lattice: evenly spaced centres over position and
    # velocity, with widths equal to the centre spacing
    self.neurons_pos = np.linspace(-150, 30, n_neurons)
    self.sigma_pos = self.neurons_pos[1] - self.neurons_pos[0]
    self.neurons_vel = np.linspace(-15, 15, n_neurons)
    self.sigma_vel = self.neurons_vel[1] - self.neurons_vel[0]
    self.pos_grid, self.vel_grid = np.meshgrid(self.neurons_pos, self.neurons_vel)

    # initialize the Q-values etc.
    self._init_run()
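# A minimal sketch (an addition, not code from the original class) of the
# Gaussian response of such a lattice to a state (x, v); the helper name
# `rbf_activity` and its standalone placement are assumptions.
import numpy as np

def rbf_activity(x, v, pos_grid, vel_grid, sigma_pos, sigma_vel):
    """Activity of every lattice neuron for the state (x, v)."""
    return np.exp(-((x - pos_grid) / sigma_pos)**2
                  - ((v - vel_grid) / sigma_vel)**2)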
def __init__(self, mountain_car=None):
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car
    self.nn = NeuralNet(20, 20)
def __init__(self, mountain_car=None, parameter1=3.0):
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car
    self.parameter1 = parameter1
def __init__(self, mountain_car=None, size=20, eta=0.05, gamma=0.99, tau=1,
             eligibity_trace_decay=0.95, tau_decay=True, non_zero_weights=False):
    # GridWorld / neural net size
    self.N = size
    self.t = 0

    # reward administered at the target location and when bumping into walls
    self.reward_at_target = 1.

    # learning rate
    self.eta = eta

    # discount factor - quantifies how far into the future a reward is
    # still considered important for the current action
    self.gamma = gamma

    # decay factor for the eligibility trace; a value of 0 corresponds
    # to no eligibility trace at all
    self.eligibity_trace_decay = eligibity_trace_decay

    # exploration parameter
    self.tau = tau
    self.tau_decay = tau_decay

    # grid centers
    x_centers = np.linspace(-150, 30, self.N)
    dx_centers = np.linspace(-15, 15, self.N)

    # variance for the input function of the centers
    self.var_x = ((150 + 30) / self.N)**2
    self.var_dx = ((15 + 15) / self.N)**2

    # create grid given the centers
    self.x_grid, self.dx_grid = np.meshgrid(x_centers, dx_centers)

    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car

    # initialize the weights and eligibility traces
    if non_zero_weights:
        self.w = np.ones((3, self.N**2))
    else:
        self.w = np.zeros((3, self.N**2))
    self._reset_e_values()
def __init__(self, mountain_car=None, eta=0.01, tau=lambda x: 0.01,
             lambd=0.98, weight=0.5, seed=11):
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car

    ### SARSA parameters
    self.tau = tau      # temperature: exploration vs. exploitation parameter
    self.eta = eta      # learning rate for the weight update, << 1
    self.lambd = lambd  # eligibility decay rate, 0 < lambda < 1
    self.gamma = 0.95   # reward discount factor
    self.weights_hist = [[], []]

    ### setting up the neural network
    # computing interval parameters
    self.nNeuronsX = 20  # minimum 2
    self.nNeuronsPsi = 20
    self.inputDim = self.nNeuronsX * self.nNeuronsPsi
    self.outputDim = 3

    # split the position and speed intervals, excluding the extreme values
    self.xCenters = np.linspace(-150, 30, self.nNeuronsX + 1, False)[1:]
    self.psiCenters = np.linspace(-15, 15, self.nNeuronsPsi + 1, False)[1:]
    self.sigmaX = self.xCenters[1] - self.xCenters[0]
    self.sigmaPsi = self.psiCenters[1] - self.psiCenters[0]

    # generating the input neurons
    iN = []
    for psic, psi in enumerate(self.psiCenters):
        iN.append([])
        for xc, x in enumerate(self.xCenters):
            iN[psic].append(S(x, psi))
    self.iN = np.transpose(np.array(iN))

    # listing the actions
    self.ActDict = [-1, 0, 1]

    np.random.seed(seed)
    self.weights = np.ones((3, self.iN.shape[0], self.iN.shape[1])) * weight
    self.eligibilities = np.zeros((3, self.iN.shape[0], self.iN.shape[1]))
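# Sketch (an addition, not original code) of the softmax / Boltzmann rule that
# the temperature `tau` above feeds into; the helper name and its standalone
# placement are assumptions.
import numpy as np

def softmax_action(q_values, tau, rng=np.random):
    """Sample an action index with probability proportional to exp(Q / tau)."""
    q = np.asarray(q_values, dtype=float)
    z = np.exp((q - q.max()) / tau)  # subtract the max for numerical stability
    return rng.choice(len(q), p=z / z.sum())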
def main():
    parser = argparse.ArgumentParser(
        description="Policy Evaluation Random Dyna")
    parser.add_argument("--cfg", type=str, default='config.json',
                        help="config file name")
    args = parser.parse_args()
    config = update_config(args.cfg)

    tile = MountainCarTileCoder(iht_size=10000, num_tilings=10, num_tiles=8)
    theta = np.random.uniform(-0.001, 0, size=(tile.n))
    F = np.zeros((tile.n, tile.n))  # linear transition model
    b = np.zeros((tile.n))          # linear reward model

    alpha = config["alpha"]
    gamma = config["gamma"]
    epsilon = config["epsilon"]
    N_0 = config["N_0"]
    numEpisodes = config["numEpisodes"]
    stepsPerEpisode = config["stepsPerEpisode"]
    n = config["n_planning_steps"]

    loss = []
    env = mountaincar.MountainCar()
    for episodeNum in tqdm(range(1, numEpisodes + 1)):
        G = 0
        env.init()
        state = env.start()
        for step in range(stepsPerEpisode):
            phi = tile.get_tiles(position=state[0], velocity=state[1])
            phi = tile.get_one_hot(phi)
            action = policy(state, epsilon)
            reward, state2, done = env.step(action)
            phi_prime = tile.get_tiles(position=state2[0], velocity=state2[1])
            phi_prime = tile.get_one_hot(phi_prime)
            G += reward
            delta = reward + (gamma * (theta.T @ phi_prime)) - (theta.T @ phi)

            # model learning
            F = F + alpha * np.outer((phi_prime - np.dot(F, phi)), phi)
            b = b + alpha * ((reward - b.T @ phi) * phi)

            # direct RL update, then planning with the learned model
            theta += alpha * delta * phi
            theta = planning(n, theta, F, b, tile, gamma, alpha)
            state = state2

        # decaying step size
        alpha = alpha * ((N_0 + 1) / (N_0 + (episodeNum)**1.1))
        loss.append(delta**2)

    stname = "losses" + "_n0_" + "{}".format(config["N_0"]) \
        + "_alpha" + "{}".format(config["alpha"])
    np.save(stname + 'loss', loss)
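# The repository's `planning` routine is not shown here. Under the standard
# linear-Dyna scheme (replaying hypothetical features through the learned
# linear models F and b), a plausible sketch might look like the following;
# the choice of random one-hot features to replay is an assumption.
import numpy as np

def planning(n, theta, F, b, tile, gamma, alpha):
    for _ in range(n):
        phi = np.zeros(tile.n)
        phi[np.random.randint(tile.n)] = 1.0  # hypothetical feature to replay
        phi_next = F @ phi                    # model-predicted next features
        r_hat = b @ phi                       # model-predicted reward
        delta = r_hat + gamma * (theta @ phi_next) - (theta @ phi)
        theta = theta + alpha * delta * phi
    return theta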
def __init__(self, mc=None, net=None, temp=None, learn_rate=1e-2,
             reward_factor=0.95, el_tr_rate=None, temp_fun=None):
    self.mc = mountaincar.MountainCar() if mc is None else mc
    self.net = Network() if net is None else net
    self.temp0 = 0.1 if temp is None else temp
    self.learn_rate = learn_rate
    self.reward_factor = reward_factor
    self.el_tr_rate = 0.95 if el_tr_rate is None else el_tr_rate
    self.temp_fun = temp_fun
def __init__(self, seed=1):
    self.rnd = np.random.RandomState(seed)
    self.mountain_car = mountaincar.MountainCar(self.rnd)

    # Initialize constants
    self.x_min = -150.0
    self.x_max = 30.0
    self.x_n = 20  # number of subdivisions along the position axis
    self.v_min = -15.0
    self.v_max = 15.0
    self.v_n = 5   # number of subdivisions along the speed axis

    self.x_centers = np.linspace(self.x_min, self.x_max, self.x_n)
    self.v_centers = np.linspace(self.v_min, self.v_max, self.v_n)
    self.x_sigma = self.x_centers[1] - self.x_centers[0]
    self.v_sigma = self.v_centers[1] - self.v_centers[0]
def __init__(self, mountain_car=None, temperature=0.1, trace_decay_rate=0.95,
             grid_size=20, num_actions=3, learning_rate=1e-2,
             weight_init='constant', weight=0.5):
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car

    # output weights: one row per action, one column per input neuron
    if weight_init == 'uniform':
        self.network_weights = np.random.rand(num_actions, grid_size * grid_size)
    elif weight_init == 'constant':
        self.network_weights = weight * np.ones(
            (num_actions, grid_size * grid_size))

    self.activations = np.zeros((grid_size, grid_size))
    self.activations1 = np.zeros((grid_size, grid_size))

    # input-neuron centres over position and velocity, excluding the extremes
    xcenters = np.linspace(-150, 30, grid_size + 1, False)[1:]
    xdcenters = np.linspace(-15, 15, grid_size + 1, False)[1:]
    self.centers = np.array(np.meshgrid(xcenters, np.flip(xdcenters, axis=0)))
    self.sigma = np.zeros(2)
    self.sigma[0] = xcenters[1] - xcenters[0]
    self.sigma[1] = xdcenters[1] - xdcenters[0]

    self.temperature = temperature
    self.discount_factor = 0.95
    self.learning_rate = learning_rate
    self.trace_decay_rate = trace_decay_rate
    self.eligibility_traces = np.zeros_like(self.network_weights)

    # previous/current action, reward and Q-values
    self.a = 0.0
    self.a1 = 0.0
    self.r = 0.0
    self.q = 0.0
    self.q1 = 0.0
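# Sketch (added for illustration) of the SARSA(lambda) step that the traces,
# discount factor and learning rate above support; the function name, the
# flattened feature vector `phi` and the argument layout are assumptions.
import numpy as np

def sarsa_lambda_update(w, e, phi, action, r, q, q1, eta, gamma, lam):
    """One TD step: decay traces, add the gradient of Q(s, a), move weights."""
    delta = r + gamma * q1 - q  # TD error between successive Q estimates
    e *= gamma * lam            # decay every trace
    e[action] += phi            # accumulate dQ(s, a)/dw for the taken action
    w += eta * delta * e        # update all weights along the traces
    return w, e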
def __init__(self, mountain_car=None):
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car
def watkins_q(alpha=0.15, epsilon=0, gamma=1, lamb=0.9, episodes=100,
              trace="replace"):
    '''Performs linear Watkins's Q(lambda) learning with tile coding'''
    # Perform initializations
    episode_rewards = []
    env = mountaincar.MountainCar()
    tiles = make_tiles()
    action_size = len(tiles)
    tile_overlap = len(tiles[0][0])
    tile_size = len(tiles[0][0][0]) * len(tiles[0][1][0])
    tile_number = action_size * tile_size * tile_overlap
    theta = np.zeros(tile_number)

    # Run a number of episodes for learning
    for _ in range(episodes):
        env.reset()
        total_reward = 0
        eligibility = np.zeros(tile_number)
        env_state = (env.position, env.velocity)
        chosen_action = np.random.choice(env.actions)
        features = find_features(tiles, env_state, chosen_action)

        # Repeat for each step in the episode
        while not env.game_over:
            # Update the appropriate traces
            for i in range(len(features)):
                # Convert 3D state-action tile features to a 1D index
                # into the theta table
                pos_index, vel_index = features[i]
                index = np.ravel_multi_index(
                    [pos_index, vel_index],
                    (len(tiles[0][0][0]), len(tiles[0][1][0])))
                index += (tile_size * i) + ((chosen_action + 1) * (tile_size * tile_overlap))
                if trace == "accumulate":
                    eligibility[index] += 1
                elif trace == "replace":
                    eligibility[index] = 1
                else:
                    print("Unknown trace type")
                    break

            # Take the action and observe the next state and reward
            reward = env.make_step(action=chosen_action)
            env_state = (env.position, env.velocity)
            delta = reward - evaluate_theta(theta, features, chosen_action)

            # Update delta from the max over next-state Q-values
            Q_actions = []
            for action in env.actions:
                features = find_features(tiles, env_state, action)
                Q_actions.append(evaluate_theta(theta, features, action))
            delta += (gamma * max(Q_actions))
            theta += (alpha * delta * eligibility)

            # Perform epsilon-greedy action selection
            chance = np.random.uniform(0, 1)
            if chance >= epsilon:
                Q_actions = []
                for action in env.actions:
                    features = find_features(tiles, env_state, action)
                    Q_actions.append(evaluate_theta(theta, features, action))
                # Choose the maximizing action and decay the traces
                chosen_action = env.actions[np.argmax(Q_actions)]
                features = find_features(tiles, env_state, chosen_action)
                eligibility *= (gamma * lamb)
            else:
                # Exploratory action: Watkins's Q(lambda) cuts the traces;
                # the features must be recomputed for the random action
                chosen_action = np.random.choice(env.actions)
                features = find_features(tiles, env_state, chosen_action)
                eligibility *= 0
            total_reward += reward

        # Add the total reward of the episode to the array (to allow plotting)
        episode_rewards.append(total_reward)
    return episode_rewards
def __init__(self, agent):
    self.mountain_car = mountaincar.MountainCar()
    self.agent = agent
def __init__(self, mountain_car=None, eta=0.05, gamma=0.95, lam=0.8,
             initial_epsilon=0.1, initial_temperature=1.0, neurons=10,
             time=100, dt=0.01, actions=3, n_steps=10000, n_episodes=10,
             run_type="Default", explore_temp=False, explore_lam=False,
             explore_both=False, explore_weights=False, weights=.05,
             greedy=False, verbose=False):
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car

    # Learning rate
    self.eta_ = eta
    # Reward factor
    self.gamma_ = gamma
    # Eligibility decay
    self.lambda_ = lam
    self.min_lambda_ = 0

    # Choice of random action or not
    self.greedy_flag = greedy
    self.initial_epsilon_ = initial_epsilon

    # Exploration vs. exploitation parameter
    self.initial_temperature_ = initial_temperature
    self.min_temperature_ = 0.001

    # Neuron centers
    self.neuron = neurons
    self.neuron_count = self.neuron**2
    _x_space_, self.x_centers_distance = np.linspace(-150, 30, neurons, retstep=True)
    _x_d_space_, self.phi_centers_distance = np.linspace(-15, 15, neurons, retstep=True)
    self.centers = np.array(list(itertools.product(_x_space_, _x_d_space_)))
    self.x_sigma = self.x_centers_distance**2
    self.x_d_sigma = self.phi_centers_distance**2

    # Activity / state parameters
    self.number_of_actions = actions
    self.activity = {"Right": 0, "Left": 1, "Neutral": 2}
    self.action_index_ = {"1": 0, "-1": 1, "0": 2}
    self.last_action = None
    self.action = 0
    self.old_state = None
    self.state = [self.mountain_car.x, self.mountain_car.x_d]
    self.old_index = None
    self.index = self._get_index(self.state)

    # Trace memory
    self.e = np.zeros((self.neuron_count, actions))

    if explore_weights:
        self.weights = np.ones((self.neuron_count, actions)) * weights
    else:
        self.weights = np.random.rand(self.neuron_count, actions)

    # Time step for simulation
    self.time = time
    self.dt = dt
    self.n_steps = n_steps
    self.n_episodes = n_episodes

    # Exploration
    self.explore_temp = explore_temp
    self.explore_lam = explore_lam
    self.explore_both = explore_both

    # Save data
    save_data_name = datetime.now().strftime('%m-%d-%H.%M.%S')
    self.filename = "{0}-{1}s.hdf5".format(run_type, save_data_name)

    # Verbose toggle
    self.verbose = verbose
def train(self,
          n_steps,
          n_episodes,
          reward_factor,
          eligibility_decay,
          init_learning_rate,
          duration_learingrate,
          target_learning_rate,
          init_tau,
          duration_tau,
          target_tau,
          min_learning_rate=0.005,
          min_tau=0.01,  # must not be lower than 0.01
          step_penalty=-0.1,
          mountain_car=None,
          save_to_file=True,
          show_intermediate=False,
          show_trace=False,
          show_interactive=True,
          show_weights=False):
    """
    duration_*: positive integer. Determines at which episode the *
        parameter reaches its target value. Note that the parameter
        continues to shrink after reaching target_* until it hits min_*.
    min_*: specifies a lower bound on the * parameter.
    save_to_file: if True, stores the NN to a file after training;
        can also be a string (directory where to store the NN).
    show_intermediate: if True, shows a plot every 100 episodes.
    show_trace: if True, shows the trace of the car for each episode.
    """
    # parameter checks
    assert init_tau > 0.0
    assert init_learning_rate != 0.0
    assert n_steps is None or n_steps > 0
    assert n_episodes > 0
    assert duration_tau > 0
    assert duration_learingrate > 0

    tau = float(init_tau)
    learning_rate = float(init_learning_rate)
    # geometric annealing: after duration_* episodes the parameter has
    # decayed from its init_* value to its target_* value
    tau_update_factor = (target_tau / init_tau)**(1.0 / duration_tau)
    learning_rate_update_factor = (
        target_learning_rate / init_learning_rate)**(1.0 / duration_learingrate)

    if mountain_car is None:
        mountain_car = mc.MountainCar()
    if n_steps is None:
        n_steps = float('inf')

    # init history
    self.history.append({
        'episodes': n_episodes,
        'steps': n_steps,
        'init_learning_rate': init_learning_rate,
        'duration_learingrate': duration_learingrate,
        'target_learning_rate': target_learning_rate,
        'min_learning_rate': min_learning_rate,
        'init_tau': init_tau,
        'duration_tau': duration_tau,
        'target_tau': target_tau,
        'min_tau': min_tau,
        'eligibility_decay': eligibility_decay,
        'step_penalty': step_penalty,
        'reward_factor': reward_factor,
        'sucess_indexes': [],
    })

    for ep in range(n_episodes):
        # run episode
        t = time()
        print("episode", ep, "/", n_episodes, "tau:", tau,
              "lrate:", learning_rate)
        idx, trace = self._episode(mountain_car,
                                   learning_rate=learning_rate,
                                   reward_factor=reward_factor,
                                   eligibility_decay=eligibility_decay,
                                   n_steps=n_steps,
                                   step_penalty=step_penalty,
                                   tau=tau)
        self.history[-1]['sucess_indexes'].append(idx)
        print(" calc_t={:.4f}s".format(time() - t))

        # update tau and learning rate
        tau = max(min_tau, tau * tau_update_factor)
        learning_rate = max(min_learning_rate,
                            learning_rate * learning_rate_update_factor)

        t = time()
        # show some stuff
        if show_interactive:
            self.show_output(figure_name='activations_interactive',
                             tau=tau, interactive=True)
            self.show_vector_field(figure_name='vector field interactive',
                                   tau=tau, interactive=True)
        if show_trace is True or (show_trace == 'not_succeeded'
                                  and idx > n_steps - 2):
            self.show_trace(figure_name='trace_interactive',
                            trace=trace, interactive=True)
        if show_intermediate and ep % 100 == 99:
            self.show_output(figure_name='activations_' + str(ep),
                             tau=tau, interactive=False)
            self.show_vector_field(figure_name='vector field' + str(ep),
                                   tau=tau, interactive=False)
        if show_weights is True:
            self.show_weights(figure_name="weights", interactive=True)
        if show_weights == 'intermediate' and ep % 1000 == 999:
            self.show_weights(figure_name="weights_" + str(ep),
                              interactive=False)
        print(" plot_t={:.4f}s".format(time() - t))
    # end for episodes

    # save the NN
    if save_to_file is True:
        self._store_to_file()
    elif isinstance(save_to_file, str):
        self._store_to_file(path=save_to_file)

    # concatenate all previous success indexes
    ret_si = []
    for h in self.history:
        ret_si += h['sucess_indexes']
    return ret_si
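# A quick standalone check (an addition, not original code) of the geometric
# annealing used above: after duration_tau episodes, tau has decayed exactly
# from init_tau to target_tau. The concrete numbers are illustrative.
init_tau, target_tau, duration_tau = 1.0, 0.05, 100
factor = (target_tau / init_tau)**(1.0 / duration_tau)
tau = init_tau
for _ in range(duration_tau):
    tau *= factor
assert abs(tau - target_tau) < 1e-9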
def __init__(self, mountain_car=None, x_linspace=(-150, 30, 20),
             v_linspace=(-15, 15, 20), w=None, tau=1, gamma=0.95,
             eta=0.001, lambda_=0.95):
    '''Initialize the object'''
    # saving the environment object
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car

    # ranges for the x and v neuron grids
    self.x_values = np.linspace(*x_linspace)
    self.v_values = np.linspace(*v_linspace)

    # steps in x and v
    self.delta_x = self.x_values[1] - self.x_values[0]
    self.delta_v = self.v_values[1] - self.v_values[0]

    # sigmas in x and v
    self.sigma_x = np.array([self.delta_x] * len(self.x_values))
    self.sigma_v = np.array([self.delta_v] * len(self.v_values))

    # number of actions and neurons
    self.n_actions = 3
    self.n_neurons = len(self.x_values) * len(self.v_values)

    # weight matrix
    if w is None:
        self.w = np.zeros((self.n_actions, self.n_neurons))
    else:
        self.w = np.copy(w)
    assert self.w.shape == (self.n_actions, self.n_neurons), \
        "Please provide w with valid shape"

    # history of w and of escape latency
    self.w_history = [self.w]
    self.escape_latency = []

    # sampling softmax temperature; can be a function of the
    # learning iteration
    self.tau = tau
    if callable(self.tau):
        self.tau_func = self.tau
        self.tau = self.tau_func(0)

    # reward discount factor
    self.gamma = gamma
    # learning rate
    self.eta = eta
    # eligibility trace parameter
    self.lambda_ = lambda_
    # number of iterations learned
    self.learning_counter = 0
def __init__(self, mountain_car=None, side_size=10, tau=0.05,
             x_range=(-150, 30), v_range=(-15, 15), weights=None,
             eta=0.01, gamma=0.95, lambdaa=0.9):
    """
    Makes a new agent with the given parameters.

    Model:
        mountain_car : instance of MountainCar
        side_size    : input neurons are arranged in a grid of this size -- scalar
        tau          : strategy exploration temperature -- scalar
        x_range      : range of positions to cover with input neurons -- 2-tuple
        v_range      : range of velocities to cover with input neurons -- 2-tuple
        weights      : from input neurons to output neurons
                       -- array(3 x side_size x side_size)

    Learning:
        eta     : learning rate -- scalar << 1
        gamma   : future state discounting factor -- scalar (0.95 recommended)
        lambdaa : eligibility decay rate -- scalar in (0, 1)
    """
    if mountain_car is None:
        self.mountain_car = mountaincar.MountainCar()
    else:
        self.mountain_car = mountain_car

    if weights is None:
        self.weights = np.ones((3, side_size, side_size))
    else:
        self.weights = weights

    # traces must start at zero (np.empty_like would leave garbage values)
    self.eligibility_trace = np.zeros_like(self.weights)

    # neuron preference centres and widths:
    self.centres_x, self.sigma_x = np.linspace(x_range[0], x_range[1], side_size,
                                               endpoint=True, retstep=True)
    self.centres_v, self.sigma_v = np.linspace(v_range[0], v_range[1], side_size,
                                               endpoint=True, retstep=True)

    # we transpose one of the dimensions so that the two grids broadcast
    # nicely against each other
    self.centres_x = np.atleast_2d(self.centres_x)
    self.centres_v = np.atleast_2d(self.centres_v).T

    # we always use sigma**2 in our calculations, so save that one instead
    self.sigma_x = self.sigma_x**2
    self.sigma_v = self.sigma_v**2

    # save the rest of the params
    self.tau = tau
    self.eta = eta
    self.gamma = gamma
    self.lambdaa = lambdaa
    self.side_size = side_size

    # number of steps used per episode
    self.escape_times = []
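# Sketch (added for illustration, placement assumed) of how Q-values follow
# from the attributes above: Gaussian input activities, broadcast over the
# transposed centre grids, weighted by the output weights. The method name
# `q_values` is an assumption.
def q_values(self, x, v):
    """Q(s, a) for all three actions at state (x, v)."""
    activity = np.exp(-(x - self.centres_x)**2 / self.sigma_x
                      - (v - self.centres_v)**2 / self.sigma_v)
    return np.sum(self.weights * activity, axis=(1, 2))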