def __init__(self, num_actions, state_bounds,
             tile_coding={'maxSize': 1024, 'num_tilings': 8, 'num_grids': 10},
             learning_rate=0.01, discount_factor=0.9, train_result_file=None):
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor
    self.nA = num_actions

    # for using tile coding to construct the feature vector
    self.tile_coding = tile_coding
    self.maxSize = tile_coding['maxSize']
    self.iht = IHT(self.maxSize)  # initialize the hash table
    self.num_tilings = tile_coding['num_tilings']
    self.tile_scale = tile_coding['num_grids'] / (state_bounds[1] - state_bounds[0])
    self.feature_vec_zero = np.zeros(self.maxSize)

    if not train_result_file:
        self.w = np.zeros(self.maxSize)
    else:
        hf = h5py.File(train_result_file, 'r')
        trained_model = hf.get('trained_model')
        self.w = trained_model.get('w').value
        hf.close()
def __init__(self, alpha, stateLow, stateHigh, numActions):
    assert len(stateLow) == len(stateHigh)
    self.alpha = alpha
    self.stateLow = stateLow
    self.stateHigh = stateHigh
    self.numActions = numActions
    self.numTilings = findProperNumberOfTilings(len(stateLow))  # 16
    # tileWidth = ultimate resolution * numTilings
    self.tileWidth = np.array([3, 3, 1, 1]) * self.numTilings
    # Used to re-scale the range of each dimension in state to use Sutton's
    # tile coding software interface.
    self.scalingFactor = 1 / self.tileWidth
    # One advantage of Sutton's tile coder is its use of hashing. In our problem
    # the state range is so large that the resulting weight vector would be
    # prohibitively large. However, we can still use his tile coder by specifying
    # an upper bound on the returned indices, regardless of the state space and
    # the number of tilings, because the tile coder is guaranteed to return
    # different active tiles for different input states (or state-action pairs)
    # as long as unused indices remain. This way we implicitly achieve the
    # desired property of "dynamic tile coding": the total number of tiles stays
    # unchanged, but more resolution is given to regions of the state space that
    # are visited more often.
    maxSize = np.prod(np.ceil((self.stateHigh - self.stateLow) / self.tileWidth),
                      dtype=int) * self.numTilings * self.numActions
    self.iht = IHT(maxSize)
    self.w = np.zeros(maxSize)
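A minimal sketch of how the scalingFactor above would typically be combined with Sutton's tiles3 interface to obtain the active tile indices for a state-action pair; the function name and call pattern are assumptions, not part of the original class.

import numpy as np
from tiles3 import tiles

def active_tiles(iht, num_tilings, scaling_factor, state, action):
    # Re-scale each state dimension so one tile spans one unit of the scaled
    # space, then hash the (state, action) pair into the index hash table.
    scaled = list(np.asarray(state, dtype=float) * scaling_factor)
    return tiles(iht, num_tilings, scaled, [int(action)])

# e.g. indices = active_tiles(self.iht, self.numTilings, self.scalingFactor, state, a)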
def agent_init():
    global iht, agent_type, w, possible_actions, fourier_order, seed
    np.random.seed(seed)
    """
    Hint: Initialize the variables that need to be reset before each run begins
    Returns: nothing
    """
    # initialize the policy array in a smart way
    possible_actions = np.concatenate((np.arange(-k, 0), np.arange(1, k + 1)))
    if agent_type == "tabular":
        w = np.zeros((size, 1))
    elif agent_type == "tile":
        w = np.zeros((IHT_SIZE, 1))  # (num_tilings[agent_type] / tile_widths[agent_type], 1)
        iht = IHT(IHT_SIZE)  # num_tilings[agent_type] / tile_widths[agent_type]  # 13  # size * k
    elif agent_type == "aggregation":
        # integer division so the array shape is an int
        w = np.zeros((num_tilings[agent_type] // tile_widths[agent_type], 1))
    elif agent_type == "fourier":
        w = np.zeros((2 * fourier_order + 1, 1))
    else:
        print("Error")
        exit(213)
def agent_init():
    global actions, iht, weights
    # choose the number of actions
    actions = [0, 1, 2]
    iht = IHT(MAX_SIZE)
    # 3 x MAX_SIZE matrix: one row of weights per action
    weights = np.random.uniform(-0.001, 0, (len(actions), MAX_SIZE))
def __init__(self, mcar, num_tiling=8, max_size=4096):
    self.num_tiling = num_tiling
    self.iht = IHT(max_size)
    self.weights = np.zeros(max_size, dtype=float)
    self.x_scale = self.num_tiling / (mcar.x_max - mcar.x_min)
    self.v_scale = self.num_tiling / (mcar.v_max - mcar.v_min)
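With the scales above, the active tiles for a mountain-car state-action pair would usually be obtained along these lines; this is a sketch, and the helper name is hypothetical rather than taken from the original class.

from tiles3 import tiles

def mcar_active_tiles(iht, num_tiling, x_scale, v_scale, x, v, action):
    # Scale position and velocity so each tiling spans the state range,
    # then hash them together with the discrete action.
    return tiles(iht, num_tiling, [x * x_scale, v * v_scale], [action])

# e.g. idx = mcar_active_tiles(self.iht, self.num_tiling, self.x_scale, self.v_scale, x, v, a)
#      q_sa = self.weights[idx].sum()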
def agent_init():
    global weights, iht, e_trace
    iht = IHT(IHT_SIZE)
    weights = np.array([random.uniform(-0.001, 0) for weight in range(IHT_SIZE)])
    weights = weights[np.newaxis, :]
    e_trace = np.zeros(IHT_SIZE)
    e_trace = e_trace[np.newaxis, :]
def agent_init():
    global iht, w, seed
    # np.random.seed(seed)
    """
    Hint: Initialize the variables that need to be reset before each run begins
    Returns: nothing
    """
    # initialize the policy array in a smart way
    # w = np.zeros((IHT_SIZE, 1))  # (num_tilings[agent_type] / tile_widths[agent_type], 1)
    w = -0.001 * np.random.uniform(size=(IHT_SIZE, 1))
    iht = IHT(IHT_SIZE)  # num_tilings[agent_type] / tile_widths[agent_type]  # 13  # size * k
def __init__(self, environment=ServerEnv(), num_of_tiles=8):
    self.env = environment
    self.state = self.env.get_state()
    self.state_low_bound = self.env.observation_space.low
    self.state_high_bound = self.env.observation_space.high
    self.n_action = self.env.action_space.n
    self.action_space = gym.spaces.Discrete(self.n_action)
    self.num_of_tiles = num_of_tiles
    self.d = 2048
    self.w = np.zeros(self.d)
    self.hash_table = IHT(self.d)
    self.s0_scale = 1.0 * self.d / self.env.n
    self.s1_scale = 1.0 * self.d / (len(self.env.priority) - 1)
def __init__(self, n_tiles, n_tilings, limits):
    """
    Args:
        n_tiles (list or 1D array): Number of tiles in each dimension
        n_tilings (int): Number of tilings
        limits (list): List of (min, max) tuples for each dimension
    """
    super().__init__()
    self.n_tiles = np.array(n_tiles)
    self.n_tilings = n_tilings
    self.state_size = n_tilings * np.prod(self.n_tiles)
    self.iht = IHT(self.state_size)
    self.limits = np.array(limits)
    self.scaling = self.n_tiles / (self.limits[:, 1] - self.limits[:, 0])
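Given this constructor, an encoding step would plausibly shift each dimension by its lower limit, apply self.scaling, and hand the result to tiles3; the standalone function below is an assumed sketch of that step, not code from the original class.

import numpy as np
from tiles3 import tiles

def encode(iht, n_tilings, limits, scaling, state):
    # Map each dimension from [min, max] to [0, n_tiles] before hashing.
    scaled = (np.asarray(state, dtype=float) - limits[:, 0]) * scaling
    return tiles(iht, n_tilings, list(scaled))

# e.g. active = encode(self.iht, self.n_tilings, self.limits, self.scaling, state)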
def reset(self):
    # set up tiles to extract a discrete feature representation of the
    # continuous mdp state and the discrete action
    self.iht = IHT(self.max_size)
    self.state_scale_factor = [
        self.num_tiles / abs(self.mdp.x_max - self.mdp.x_min),
        self.num_tiles / abs(self.mdp.xdot_max - self.mdp.xdot_min)
    ]
    # set up a tiles fn which returns a list of the active tiles given the
    # (state, action) pair
    self.tiles = lambda state, action: tiles(
        self.iht, self.num_tilings,
        [state[0] * self.state_scale_factor[0],
         state[1] * self.state_scale_factor[1]],
        [action])
    self.w = np.zeros(self.max_size)
    self.num_updates = 0
    self.w_history = []
def __init__(self, iht_size, num_tilings, num_tiles):
    '''
    Initializes the MountainCar Tile Coder

    iht_size -- int, the size of the index hash table, typically a power of 2
    num_tilings -- int, the number of tilings
    num_tiles -- int, the number of tiles. Both width and height of the tile
                 coder are the same.

    Class Variables:
    self.iht -- IHT, the index hash table that the tile coder will use
    self.num_tilings -- int, the number of tilings the tile coder will use
    self.num_tiles -- int, the number of tiles the tile coder will use
    '''
    self.iht = IHT(iht_size)
    self.num_tilings = num_tilings
    self.num_tiles = num_tiles
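A tile coder initialized like this is normally paired with a method that scales Mountain Car's position and velocity into [0, num_tiles] before calling tiles3; the sketch below assumes the standard Mountain Car bounds (position in [-1.2, 0.5], velocity in [-0.07, 0.07]) and a hypothetical get_tiles name.

from tiles3 import tiles

def get_tiles(iht, num_tilings, num_tiles, position, velocity):
    # Standard Mountain Car state bounds (assumed here; not stated above).
    POSITION_MIN, POSITION_MAX = -1.2, 0.5
    VELOCITY_MIN, VELOCITY_MAX = -0.07, 0.07
    position_scale = num_tiles / (POSITION_MAX - POSITION_MIN)
    velocity_scale = num_tiles / (VELOCITY_MAX - VELOCITY_MIN)
    return tiles(iht, num_tilings,
                 [(position - POSITION_MIN) * position_scale,
                  (velocity - VELOCITY_MIN) * velocity_scale])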
def __init__(self, num_actions, state_bounds,
             tile_coding={'maxSize': 1024, 'num_tilings': 8, 'num_grids': 10},
             learning_rate_w=0.01, learning_rate_theta=0.01,
             lambda_w=0.8, lambda_theta=0.8,
             discount_factor=0.9, train_result_file=None):
    self.learning_rate_w = learning_rate_w
    self.learning_rate_theta = learning_rate_theta
    self.discount_factor = discount_factor
    self.lambda_w = lambda_w
    self.lambda_theta = lambda_theta
    self.nA = num_actions

    # for using tile coding to construct the feature vector
    self.tile_coding = tile_coding
    self.maxSize = tile_coding['maxSize']
    self.iht = IHT(self.maxSize)  # initialize the hash table
    self.num_tilings = tile_coding['num_tilings']
    self.tile_scale = tile_coding['num_grids'] / (state_bounds[1] - state_bounds[0])
    self.feature_vec_zero = np.zeros(self.maxSize)

    if not train_result_file:
        self.w = np.zeros(self.maxSize)        # parameter vector for the value estimator
        self.theta = np.zeros(self.maxSize)    # parameter vector for the policy estimator
        self.e_w = np.zeros(self.maxSize)      # eligibility trace vector for w
        self.e_theta = np.zeros(self.maxSize)  # eligibility trace vector for theta
    else:
        hf = h5py.File(train_result_file, 'r')
        trained_model = hf.get('trained_model')
        self.w = trained_model.get('w').value
        self.theta = trained_model.get('theta').value
        self.e_w = trained_model.get('e_w').value
        self.e_theta = trained_model.get('e_theta').value
        hf.close()
def reset(self):
    # set up tiles to extract a discrete feature representation of the
    # continuous mdp state and the discrete action
    self.iht = IHT(self.max_size)
    self.state_scale_factor = [
        self.num_tiles / abs(self.mdp.x_max - self.mdp.x_min),
        self.num_tiles / abs(self.mdp.xdot_max - self.mdp.xdot_min)
    ]
    # set up a tiles fn which returns a list of the active tiles given the
    # (state, action) pair
    self.tiles = lambda state, action: tiles(
        self.iht, self.num_tilings,
        [state[0] * self.state_scale_factor[0],
         state[1] * self.state_scale_factor[1]],
        [action])
    # set up the weight vector for linear function approximation
    self.w = np.zeros(self.max_size)
    self.total_rewards = 0
def agent_init():
    global weights, iht
    if AGENT == "STATE_AGG":
        weights = np.array([0.0 for weight in range(1, NUM_STATES, AGGREGATE_SIZE)])
    elif AGENT == "TABULAR":
        weights = np.array([0.0 for weight in range(NUM_STATES)])
    elif AGENT == "POLYNOMIAL":
        weights = np.array([0.0 for weight in range(POLY_DEGREE + 1)])
    elif AGENT == "RADIAL":
        weights = np.array([0.0])
    elif AGENT == "TILE_CODING":
        iht = IHT(IHT_SIZE)
        weights = np.array([0.0 for weight in range(IHT_SIZE)])
    else:
        exit("Invalid agent selection!")
    weights = weights[np.newaxis, :]
def __init__(self, mcar, replacing=True, clear_trace=False, true_update=False,
             num_tiling=8, max_size=4096):
    self.num_tiling = num_tiling
    self.replacing = replacing
    self.clear_trace = clear_trace
    self.true_update = true_update
    self.iht = IHT(max_size)
    self.weights = np.zeros(max_size, dtype=float)
    self.x_scale = self.num_tiling / (mcar.x_max - mcar.x_min)
    self.v_scale = self.num_tiling / (mcar.v_max - mcar.v_min)
    self.e_trace = np.zeros_like(self.weights)
def __init__(self, environment=gym.make('MountainCar-v0'), num_of_tiles=8):
    self.env = environment
    self.state = self.env.reset()
    self.state_low_bound = self.env.observation_space.low
    self.state_high_bound = self.env.observation_space.high
    self.n_action = self.env.action_space.n
    self.action_space = gym.spaces.Discrete(self.n_action)
    self.num_of_tiles = num_of_tiles
    self.d = 4096
    self.w = np.zeros(self.d)
    self.hash_table = IHT(self.d)
    self.s0_scale = 1.0 * self.d / (self.state_high_bound[0] - self.state_low_bound[0])
    self.s1_scale = 1.0 * self.d / (self.state_high_bound[1] - self.state_low_bound[1])
def agent_init(self):
    """
    Arguments: Nothing
    Returns: Nothing
    """
    # init the number of tilings to 50
    self.num_tiling = 50
    self.total_state = 1000
    self.gamma = 1
    # tile width is 0.2 (in the scaled state space): 5 tiles cover the 1000
    # states; add 1 tile for offsetting, so max_size = 6 * num_tilings = 300
    self.tile_width = 0.2
    self.max_size = int(((1 / self.tile_width) + 1) * self.num_tiling)
    # init index hash table
    self.iht = IHT(self.max_size)
    # define alpha
    self.alpha = 0.01 / self.num_tiling
    # init weights
    self.weight = np.zeros(self.max_size)
    self.action = None
    # init a variable to keep the last state
    self.last_state = None
    # state features: each row holds the binary feature vector obtained by
    # tile coding for a single state
    self.x_s = np.zeros((self.total_state, self.max_size))
    # init the estimated state-value function
    self.v_hat = None
    # init a record of the states that have already been tile coded
    self.track = {}
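For a 1000-state random walk, a tile-coding call consistent with the constants above would look roughly like the sketch below; the scaling choice state / (total_state * tile_width), i.e. state / 200, is an assumption inferred from the comments, not taken from the original agent.

import numpy as np
from tiles3 import tiles

def state_features(iht, num_tiling, max_size, state,
                   total_state=1000, tile_width=0.2):
    # Scale the state so one tile covers tile_width * total_state = 200 states,
    # then turn the active tile indices into a binary feature vector.
    active = tiles(iht, num_tiling, [state / (total_state * tile_width)])
    x = np.zeros(max_size)
    x[active] = 1.0
    return x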
def agent_init(self):
    """
    Arguments: Nothing
    Returns: Nothing
    Hint: Initialize the variables that need to be reset before each run begins
    """
    if self.mode == "tabular":
        self.feature_vector = np.identity(1001)
        self.w = np.zeros(1001)
        self.alpha = 0.5
    elif self.mode == "tile":
        iht = IHT(1024)
        self.num_tiles = 50
        self.feature_vector = np.zeros((1001, 1024))
        self.w = np.zeros(1024)
        self.alpha = 0.01 / 50
        for s in range(1, 1001):
            tile_result = tiles(iht, self.num_tiles, [s / 200])
            self.feature_vector[s][tile_result] = 1
def __init__(self):
    '''
    This is the initialization of the agent class.
    '''
    self.maxSize = 65536
    # self.z = [0] * self.maxSize
    self.z = np.zeros(self.maxSize)
    self.iht = IHT(self.maxSize)
    # self.weights = [0] * self.maxSize
    # self.weights = np.random.uniform(-1.0, 1.0, self.maxSize)
    self.weights = np.ones(self.maxSize)
    self.numTilings = 8
    self.stepsize = 0.1 / self.numTilings
    self._gamma = 0.9
    self._lambda = 0.2
    self._epsilon = 0.05
    self._last_action = 0      # action refactor
    self._current_action = 0   # this gets called and updated in agent._policy()
    self._previous_action = 0  # this gets called and updated in agent.learn()
    # self.last_state = [0, 0]
def __init__(self):
    self.weights = np.random.uniform(-0.001, 0.001, TILES)
    self.iht = IHT(TILES)
    self.q = dict()
    self.features = dict()
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import sys, time

import gym
from gym.envs.registration import register
from tiles3 import IHT, tiles

register(
    id='MountainCar-v1',
    entry_point='gym.envs.classic_control:MountainCarEnv',
    max_episode_steps=5000,
    reward_threshold=-110.0,
)

env = gym.make('MountainCar-v1')
nA = 3
iht = IHT(4096)


def _greedy(Q, s):
    qmax = np.max(Q(s))
    actions = []
    for i, q in enumerate(Q(s)):
        if q == qmax:
            actions.append(i)
    return actions


def greedy(Q, s):
    return np.random.choice(_greedy(Q, s))
import gym
import numpy as np
import matplotlib.pyplot as plt
import math

from tiles3 import IHT, tiles

# Episodic Semi-gradient Sarsa Implementation (on-policy control)
NUM_TILES = 16
MAX_SIZE = 2**16
iht = IHT(MAX_SIZE)


def get_epsilon(episode, num_episodes):
    if episode < 1000:
        return 1.0 - episode / 1000.0
    else:
        return 0.05


def q(hashtable, state, action, weights):
    active_tiles = tiles(hashtable, NUM_TILES, state, [action])
    return sum(weights[active_tiles])


def epsilon_greedy_action(hashtable, weights, state, e):
    if np.random.uniform(0, 1) < e:
        action = env.action_space.sample()
    else:
        state_action_values = []
        for a in [0, 1, 2, 3]:
            state_action_values.append(q(hashtable, state, a, weights))
        action = int(np.argmax(state_action_values))
    return action
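The pieces above are typically tied together with the episodic semi-gradient Sarsa update; the function below is a hedged sketch of one step of that update, assuming a global env, a NumPy weights vector, and step-size/discount parameters alpha and gamma that the original script defines elsewhere.

def sarsa_step(state, action, weights, alpha, gamma, epsilon):
    # One transition of episodic semi-gradient Sarsa (sketch, assumed wiring).
    next_state, reward, done, _ = env.step(action)
    active = tiles(iht, NUM_TILES, state, [action])
    if done:
        weights[active] += alpha * (reward - sum(weights[active]))
        return None, None, True
    next_action = epsilon_greedy_action(iht, weights, next_state, epsilon)
    target = reward + gamma * q(iht, next_state, next_action, weights)
    weights[active] += alpha * (target - sum(weights[active]))
    return next_state, next_action, False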
num_tilings = 16
alpha = 1 / num_tilings  # step size recommended by Rich Sutton
tiling_number = 10  # rows of tiles per tiling, so the total number of tiles
                    # is tiling_number**obs_dim * num_tilings

project_models = []
tilings = []  # list of tilings, one for each dimension
weights = []  # list of weight matrices, one for each dimension
scalers = []
for k in range(observations_dim):
    # build all the requisite weights
    project_models.append(
        load('./PCA Models/pca_pipeline_{}.joblib'.format(k + 1)))
    # a separate tile coding for each dimension
    tilings.append(IHT(num_tilings * np.power(tiling_number, k + 1)))
    weights.append(
        np.zeros([num_tilings * np.power(tiling_number, k + 1), num_actions]))

stepcount = np.zeros([num_episodes, 1])
gamma = 0.99   # discount factor
epsilon = 0.1  # exploration parameter, set as desired


def compute_tile_indices(state, dimension):
    # use the tiling that corresponds to the requested dimension
    c = tiles(tilings[dimension], num_tilings, tiling_number * state.flatten())
    return c
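With one tiling and one weight matrix per projected dimension, the action values would presumably be read off by summing the weights of the active tiles for a given action; the helper below is an assumed sketch of that lookup, not part of the original training script.

import numpy as np

def q_value(state, action, dimension):
    # Sum the weights of the active tiles in this dimension's tiling for `action`.
    active = compute_tile_indices(state, dimension)
    return np.sum(weights[dimension][active, action])

# e.g. greedy_a = int(np.argmax([q_value(s, a, dim) for a in range(num_actions)]))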
def __init__(self, iht_size=IHT_DEFAULT_SIZE, num_tilings=NUM_TILINGS):
    self.iht = IHT(iht_size)
    self.num_tilings = num_tilings
""" Author: Adam White, Matthew Schlegel, Mohammad M. Ajallooeian, Sina Ghiassian Purpose: Skeleton code for On-Policy Sarsa Control Agent for use on A4 of Reinforcement learning course University of Alberta Fall 2017 """ from tiles3 import tiles, IHT from utils import rand_in_range, rand_un import numpy as np import pickle import decimal import random maxSize = 1000 * 50 #big size for the hash vector (states*tilings) iht = IHT(maxSize) weights = None numTilings = 50 #number of tilings stepSize = 0.01 / numTilings #alpha feature_vectors = None last_state = None #last state tile_width = 0.2 #tile width v = None #estimates # function to return the tilings corresponding to an state (x) def mytiles(x): global numTilings, tile_width return tiles(iht, numTilings, [x * tile_width])
def agent_init():
    global w, iht
    w = np.zeros(total_states)
    iht = IHT(total_states)
def __init__(self, t_num=4096):
    self.aSpace = [0, 1, 2]
    self.w = np.zeros(t_num)
    self.env = gym.make('MountainCar-v0')
    self.env._max_episode_steps = 500
    self.iht = IHT(t_num)
def agent_init():
    global actions, iht, weights
    # choose the number of actions
    actions = ['left', 'right']
    iht = IHT(MAX_SIZE)
    weights = np.zeros(MAX_SIZE)
from rl_glue import *  # Required for RL-Glue
RLGlue("mountaincar", "sarsa_lambda_agent")

import numpy as np
from tiles3 import tiles, IHT

memorySize = 4096
num_tilings = 8
alpha = 0.1 / num_tilings
lamb = 0.9
epsilon = 0.0
w = []
z = []
discount = 1
iht = IHT(memorySize)


def my_tiles(state, action):
    (x, xdot) = state
    return tiles(iht, num_tilings,
                 [8.0 * x / (0.5 + 1.2), 8.0 * xdot / (0.07 + 0.07)],
                 [action])


def q_hat(indices, w):
    value = 0
    for t in indices:
        value += w[t]
    return value
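For a Sarsa(lambda) agent built on these helpers, the per-step update accumulates traces on the active tiles, moves all weights along the traces, and then decays them; the function below is a hedged sketch of that step, assuming w and z are NumPy arrays of length memorySize rather than the empty lists above.

import numpy as np

def sarsa_lambda_step(w, z, state, action, reward, next_state, next_action, done):
    # TD error for the current transition
    active = my_tiles(state, action)
    delta = reward - q_hat(active, w)
    if not done:
        delta += discount * q_hat(my_tiles(next_state, next_action), w)
    # accumulating traces on the active tiles
    z[active] += 1.0
    # update every weight, then decay the traces
    w += alpha * delta * z
    z *= discount * lamb
    return w, z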
class Agent:
    # Set up parameters for the agent.
    # State-action pair values: use the Q function defined below instead of a dict
    # Q = None
    # R_bar = 0

    # Action space, to be configured by the environment
    A = None
    # Agent configuration variables
    epsilon = 0.05
    # Index hash table size / number of features
    hash_table_size = 2048
    # number of tilings / number of feature fields
    num_offset_tilings = 8
    # length of one side of a tiling
    # tiling_side_length = np.sqrt(hash_table_size)
    tiling_side_length = 8
    # weight vector
    w = None
    iht = IHT(hash_table_size)
    # eligibility trace
    z = None
    # step size
    alpha = 0.005 / num_offset_tilings
    gamma = 0.9
    lam = 0.9
    tilecache = {}
    # min and max values for the environment
    mins = []
    maxs = []
    last_action = None
    last_state = None
    times_selected = {}

    # Returns the list of active feature indices for the given (state, action):
    # [feature# for field 1, feature# for field 2, ..., feature# for the last field].
    # These are the features with value 1 (activated).
    def active_features(self, state, action):
        sap = (state, action)
        if sap in self.tilecache:
            return self.tilecache[sap]
        else:
            scaleFactor = [
                self.tiling_side_length / (self.maxs[i] - self.mins[i])
                for i in range(len(self.maxs))
            ]
            # Use distinct integer actions in the tiling, resulting in a
            # different tiling for each action
            t = tiles(self.iht, self.num_offset_tilings,
                      [state[i] * scaleFactor[i] for i in range(len(state))],
                      [action])
            self.tilecache[sap] = t
            return t

    # Declare agent variables
    def __init__(self, actions, max_state, min_state):
        self.R_bar = 0
        self.A = actions
        self.mins = np.array(min_state)
        self.maxs = np.array(max_state)

    # Initialize agent variables; run once, in experiments
    def agent_init(self):
        # Assume there are no states (new states can be valued lazily)
        self.pi = {}
        self.returns = {}
        self.v = {}
        self.last_action = None
        self.last_state = None
        self.w = np.full(self.hash_table_size, 0.1)

    # Start the agent
    # Runs at the beginning of an episode; the first method called when the
    # experiment starts, called after the environment starts
    # Args:
    #   state (state observation): the agent's current state
    # Returns:
    #   The first action the agent takes
    def agent_start(self, state):
        self.tilecache = {}
        self.times_selected = {a: 0 for a in self.A}
        self.z = np.zeros(self.hash_table_size)
        # start episode
        self.last_state = state
        self.last_action = self.epGreedy(state)
        return self.last_action

    # A step taken by the agent
    # Args:
    #   reward (float): the reward received for the last action taken
    #   state (state observation): the agent's current state
    # Returns:
    #   The action the agent is taking
    def agent_step(self, reward, state):
        Sprime = state        # S[t+1]
        R = reward            # R[t+1]
        S = self.last_state   # S[t]
        A = self.last_action  # A[t]

        # accumulating traces on the active features
        # (for replacing traces, assign 1 instead of incrementing)
        self.z[self.active_features(S, A)] += 1

        # A[t+1]
        Aprime = self.epGreedy(Sprime)
        self.times_selected[A] += 1

        # TD error
        error = R + self.gamma * self.Q(Sprime, Aprime, self.w) - self.Q(S, A, self.w)

        # alpha = 0.005 / self.times_selected[A]
        alpha = self.alpha
        self.w += alpha * error * self.z
        self.z *= self.gamma * self.lam

        self.last_state = Sprime
        self.last_action = Aprime
        return self.last_action

    # Run when the agent terminates
    # Args:
    #   reward (float): the reward the agent received for entering the terminal state
    def agent_end(self, reward):
        R = reward
        S = self.last_state
        A = self.last_action
        w = self.w
        self.times_selected[A] += 1
        # alpha = 0.005 / self.times_selected[A]
        alpha = self.alpha

        features = self.active_features(S, A)
        error = R - np.sum(w[features])
        # accumulating traces (for replacing traces, use self.z[features] = 1)
        self.z[features] += 1
        self.w += alpha * error * self.z

    # Receive a message from RL-Glue
    # Args:
    #   message (str): the message passed
    # Returns:
    #   response (str): the agent's response to the message (optional)
    def agent_message(self, message):
        # Output what the agent thinks is the optimal policy
        if message == "action-values":
            return None

    def pi_select(self, pi, s, A):
        rand = np.random.random()
        probs = self.pi.get(s, {a: 1 / len(A) for a in A})
        val = 0
        for a in A:
            val += probs[a]
            if rand < val:
                return a

    def epGreedy(self, S):
        rand = np.random.random()
        if rand < self.epsilon:
            return np.random.choice(self.A)
        else:
            maxQ = max([self.Q(S, a, self.w) for a in self.A])
            max_actions = [a for a in self.A if self.Q(S, a, self.w) >= maxQ]
            return np.random.choice(max_actions)

    def Q(self, state, action, w):
        features = self.active_features(state, action)
        return np.sum(w[features])

    # Returns the action with the highest action-value (ties broken consistently)
    # Args:
    #   Q: action values
    #   s: state to evaluate
    #   A: actions to pick from
    def argmaxa(self, Q, s, A):
        # Find the highest action value
        maxQ = max([Q.get((s, a), 0) for a in A])
        # Find the actions with the highest value
        best_actions = [a for a in A if Q.get((s, a), 0) == maxQ]
        # Consistently pick from the best actions
        return sorted(best_actions)[0]

    # Returns the action with the highest action-value (ties broken at random)
    # Args:
    #   Q: action values
    #   s: state to evaluate
    #   A: actions to pick from
    def argmaxa_rand(self, Q, s, A):
        # Find the highest action value
        maxQ = max([Q.get((s, a), 0) for a in A])
        # Find the actions with the highest value
        best_actions = [a for a in A if Q.get((s, a), 0) == maxQ]
        # Randomly pick from the actions with the top action value
        return np.random.choice(best_actions)