# Assumed imports (a sketch): the module names for Rich Sutton's tile coder
# and for the local TraceHolder/Verifier/Learner helpers are guesses inferred
# from the calls below, not confirmed by the original source.
import random
import operator

from tiles2 import tiles, loadtiles, CollisionTable
from traceholder import TraceHolder
from verifier import Verifier
from learner import Learner


class TDLambdaLearner(Learner):
    """
    Note: the TileCoder is Rich's Python version, which is still in alpha. See
    more at: http://webdocs.cs.ualberta.ca/~sutton/tiles2.html#Python%20Versions

    Collision table notes:
    cTableSize is the size the collision table will be instantiated to. The
    size must be a power of two. In calls to get tiles, the collision table is
    passed instead of memory_size, as it already carries the size.
    """

    def __init__(self, numTilings=1, num_bins=2, rlAlpha=0.5, rlLambda=0.9,
                 rlGamma=0.9, cTableSize=0):
        """To run an example, leave the arguments at their defaults."""
        self.numTilings = numTilings
        self.num_bins = num_bins
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        # candidate sizes: 16384, 8192, 1048576, 8388608, 16777216, 33554432
        self.mem_size = 1048576
        self.prediction = None
        self.current_prediction = 0
        self.delta = 0
        self.lastS = None
        self.previous_tiles = [0 for item in range(self.numTilings)]
        # self.previous_state = [None for item in range(self.numTilings * self.num_bins ** 10)]
        self.previous_prediction = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.num_steps = 0  # step counter used by learn()
        self.traceH = TraceHolder(self.mem_size, 0.01, 1000)  # TraceHolder(mem, minT, maxN)
        self.F = [0 for item in range(self.numTilings)]  # indices of the returned tiles go in here
        self.theta = [0 for item in range(self.mem_size)]  # weight vector
        # self.weights = [0 for item in range(self.numTilings * self.num_bins ** 10)]  # added by Ann
        # self.e_trace = [0 for item in range(self.numTilings * self.num_bins ** 10)]  # added by Ann
        self.cTable = CollisionTable(cTableSize, 'super safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)

    # Commented-out experimental variant (added by Ann): TD update with
    # explicit weight and trace lists instead of the TraceHolder helper.
    # def Ann_update(self, current_state, numstates, reward=None):
    #     if current_state != None:
    #         self.Ann_learn(current_state, reward, numstates)
    #         return self.current_prediction
    #     else:
    #         return None
    #
    # def Ann_learn(self, current_state, reward, numstates):
    #     # returns indices of the active features
    #     active_tiles = simple_tiles(self.numTilings, self.numTilings * self.num_bins,
    #                                 current_state, numstates)
    #     if self.previous_prediction != None:
    #         for index in active_tiles:
    #             self.current_prediction = self.weights[index]  # not sure if this is right
    #         self.delta = reward + self.rlGamma * self.current_prediction - self.previous_prediction
    #         if self.previous_state != None:
    #             self.previous_state = [0 for item in range(self.numTilings * self.num_bins ** 10)]
    #             for index in self.previous_tiles:
    #                 self.previous_state[index] = 1
    #             self.e_trace = [x + y for x, y in zip(self.previous_state,
    #                             [i * self.rlLambda * self.rlGamma for i in self.e_trace])]
    #             # alpha needs to be divided by N
    #             self.weights = [x + y for x, y in zip(self.weights,
    #                             [i * self.rlAlpha * self.delta for i in self.e_trace])]
    #     self.previous_tiles = active_tiles
    #     self.previous_prediction = self.current_prediction
    #     self.verifier.updateReward(reward)
    #     self.verifier.updatePrediction(self.current_prediction)
    #     self.normalized_prediction = self.current_prediction * (1 - self.rlGamma)

    def update(self, features, target=None):
        if features != None:
            self.learn(features, target)
            return self.prediction
        else:
            return None

    def learn(self, state, reward):
        self.loadFeatures(state, self.F)
        currentq = self.computeQ()  # computeQ returns w*x' (the current prediction)
        if self.lastS != None:
            # TD error: delta = r + gamma*w*x' - w*x
            delta = reward + self.rlGamma * currentq - self.lastQ
            for i in self.traceH.getTraceIndices():
                # delta * alpha/N * e
                self.theta[i] += delta * (self.rlAlpha / self.numTilings) * self.traceH.getTrace(i)
            self.traceH.decayTraces(self.rlGamma)
            self.traceH.replaceTraces(self.F)
        self.lastQ = currentq
        self.lastS = state
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)
        self.normalized_prediction = self.prediction * (1 - self.rlGamma)

    def computeQ(self):
        q = 0
        for i in self.F:
            q += self.theta[i]
        return q

    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
        tiles            ; a provided array for the tile indices to go into
        starting-element ; first element of "tiles" to be changed (typically 0)
        num-tilings      ; the number of tilings desired
        memory-size      ; the number of possible tile indices
        floats           ; a list of real values making up the input vector
        ints             ; list of optional inputs to get different hashings
        """
        # tiles() returns the active indices directly, so featureVector is unused here
        self.F = tiles(self.numTilings, self.mem_size, stateVars)

    def loss(self, x, r, prev_state=None):
        """
        Returns the TD error assuming reward r is given for the transition
        from prev_state to x. If prev_state is None, uses the leftmost element
        of exp_queue. Note: relies on exp_queue, horizon, gamma, and value(),
        which are presumably provided by the Learner base class.
        """
        if prev_state is None:
            if len(self.exp_queue) < self.horizon:
                return None
            else:
                prev_state = self.exp_queue[0][0]
        vp = r + self.gamma * self.value(x)
        v = self.value(prev_state)
        delta = vp - v
        return delta

    def predict(self, x):
        self.loadFeatures(x, self.F)
        return self.computeQ()
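
# Example usage -- a minimal sketch, not from the original source. It feeds a
# toy one-variable signal to the learner and reads back its prediction. Note
# that this file defines TDLambdaLearner twice; the later definition is the
# one in effect when this function runs, so only arguments accepted by both
# variants are passed here.
def _demo_td_lambda(steps=100):
    import math
    learner = TDLambdaLearner(numTilings=4)
    prediction = None
    for t in range(steps):
        signal = [math.sin(0.1 * t)]  # one real-valued input variable
        reward = signal[0]            # predict the discounted future signal
        prediction = learner.update(signal, reward)
    return prediction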
class Q_learning(Learner):

    def __init__(self, actions, numTilings=1, parameters=2, rlAlpha=0.5,
                 rlLambda=0.9, rlGamma=0.9, rlEpsilon=0.1, cTableSize=0,
                 action_selection='softmax'):
        """To run an example, leave the optional arguments at their defaults."""
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        self.rlEpsilon = rlEpsilon
        self.action_selection = action_selection
        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.lastAction = None
        self.currentAction = None
        self.prediction = None
        self.num_steps = 0  # step counter used by learn()
        self.actions = actions  # the number of discrete actions we can select from
        self.traceH = TraceHolder((self.numTilings ** self.parameters) + 1, self.rlLambda, 1000)
        # per-action tile indices; the indices of the returned tiles go in here
        self.F = [[0 for item in range(self.numTilings)] for i in range(actions)]
        self.q_vals = [0 for i in range(actions)]
        # one weight vector shared by all actions; computeQ() indexes it with
        # the per-action tile indices held in self.F
        self.theta = [0 for item in range((self.numTilings ** (self.parameters + 1)) + 1)]
        self.cTable = CollisionTable(cTableSize, 'safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)

    def chooseAction(self, features):
        for action in range(self.actions):
            self.loadFeatures(featureVector=self.F[action], stateVars=features)
            self.q_vals[action] = self.computeQ(action)
        return self.eGreedy()

    def eGreedy(self):
        # note: action_selection='softmax' is accepted by the constructor, but
        # only epsilon-greedy selection is implemented; see the sketch below
        if random.random() < self.rlEpsilon:
            return random.randrange(self.actions)  # random action
        else:
            max_index, max_value = max(enumerate(self.q_vals), key=operator.itemgetter(1))
            return max_index  # best action

    def update(self, features, target=None):
        # learning step (skipped on the very first call, before any action exists)
        if features != None and self.currentAction != None:
            self.learn(features, target, self.currentAction)
        self.lastAction = self.currentAction
        self.currentAction = self.chooseAction(features)  # action selection step
        return self.currentAction

    def learn(self, features, reward, action):
        self.loadFeatures(features, self.F[action])  # tile indices for the chosen action
        currentq = self.computeQ(action)
        if self.lastS != None and self.lastAction != None:  # if we're past the first step
            delta = reward - self.lastQ
            delta += self.rlGamma * currentq
            amt = delta * (self.rlAlpha / self.numTilings)
            for i in self.traceH.getTraceIndices():
                self.theta[i] += amt * self.traceH.getTrace(i)
            max_action, max_value = max(enumerate(self.q_vals), key=operator.itemgetter(1))
            if action == max_action:
                self.traceH.decayTraces(self.rlGamma * self.rlLambda)
            else:
                self.traceH.decayTraces(0)  # Watkins-style: cut traces on exploratory actions
            self.traceH.replaceTraces(self.F[action])
        self.lastQ = currentq
        self.lastS = features
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)

    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
        tiles            ; a provided array for the tile indices to go into
        starting-element ; first element of "tiles" to be changed (typically 0)
        num-tilings      ; the number of tilings desired
        memory-size      ; the number of possible tile indices
        floats           ; a list of real values making up the input vector
        ints             ; list of optional inputs to get different hashings
        """
        loadtiles(featureVector, 0, self.numTilings, self.numTilings ** self.parameters, stateVars)
        # print("featureVector " + str(len(self.theta)))  # debug

    def computeQ(self, a):
        """Compute the value of action a for the current F and theta."""
        q = 0
        for i in self.F[a]:
            q += self.theta[i]
        return q
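
# The Q_learning constructor accepts action_selection='softmax', but only the
# epsilon-greedy path is implemented above. Below is a minimal sketch of a
# softmax (Boltzmann) selector that could back that option; the function name
# and temperature parameter are illustrative assumptions, not part of the
# original code.
def softmax_action(q_vals, temperature=1.0):
    """Sample an action index with probability proportional to exp(Q/T)."""
    import math
    m = max(q_vals)  # subtract the max before exponentiating, for stability
    exps = [math.exp((q - m) / temperature) for q in q_vals]
    total = sum(exps)
    r = random.random() * total
    acc = 0.0
    for i, e in enumerate(exps):
        acc += e
        if acc >= r:
            return i
    return len(q_vals) - 1  # guard against floating-point shortfall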
# NOTE: a second, earlier variant of TDLambdaLearner, sized by a `parameters`
# exponent and using loadtiles() rather than tiles(). Because it shares the
# class name defined above, this later definition is the one in effect at
# import time.
class TDLambdaLearner(Learner):
    """
    Note: the TileCoder is Rich's Python version, which is still in alpha. See
    more at: http://webdocs.cs.ualberta.ca/~sutton/tiles2.html#Python%20Versions

    Collision table notes:
    cTableSize is the size the collision table will be instantiated to. The
    size must be a power of two. In calls to get tiles, the collision table is
    passed instead of memory_size, as it already carries the size.
    """

    def __init__(self, numTilings=1, parameters=2, rlAlpha=0.5, rlLambda=0.9,
                 rlGamma=0.9, cTableSize=0):
        """To run an example, leave the arguments at their defaults."""
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        self.prediction = None
        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.num_steps = 0  # step counter used by learn()
        self.traceH = TraceHolder((self.numTilings ** self.parameters) + 1, self.rlLambda, 1000)
        self.F = [0 for item in range(self.numTilings)]  # indices of the returned tiles go in here
        self.theta = [0 for item in range((self.numTilings ** (self.parameters + 1)) + 1)]  # weight vector
        self.cTable = CollisionTable(cTableSize, 'safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)

    def update(self, features, target=None):
        if features != None:
            self.learn(features, target)
            return self.prediction
        else:
            return None

    def learn(self, state, reward):
        self.loadFeatures(state, self.F)
        currentq = self.computeQ()
        if self.lastS != None:
            delta = reward - self.lastQ
            delta += self.rlGamma * currentq
            amt = delta * (self.rlAlpha / self.numTilings)
            for i in self.traceH.getTraceIndices():
                self.theta[i] += amt * self.traceH.getTrace(i)
            self.traceH.decayTraces(self.rlGamma)
            self.traceH.replaceTraces(self.F)
        self.lastQ = currentq
        self.lastS = state
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)

    def computeQ(self):
        q = 0
        for i in self.F:
            q += self.theta[i]
        return q

    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
        tiles            ; a provided array for the tile indices to go into
        starting-element ; first element of "tiles" to be changed (typically 0)
        num-tilings      ; the number of tilings desired
        memory-size      ; the number of possible tile indices
        floats           ; a list of real values making up the input vector
        ints             ; list of optional inputs to get different hashings
        """
        loadtiles(featureVector, 0, self.numTilings, self.numTilings ** self.parameters, stateVars)
        # print("featureVector " + str(len(self.theta)))  # debug

    def loss(self, x, r, prev_state=None):
        """
        Returns the TD error assuming reward r is given for the transition
        from prev_state to x. If prev_state is None, uses the leftmost element
        of exp_queue. Note: relies on exp_queue, horizon, gamma, and value(),
        which are presumably provided by the Learner base class.
        """
        if prev_state is None:
            if len(self.exp_queue) < self.horizon:
                return None
            else:
                prev_state = self.exp_queue[0][0]
        vp = r + self.gamma * self.value(x)
        v = self.value(prev_state)
        delta = vp - v
        return delta

    def predict(self, x):
        self.loadFeatures(x, self.F)
        return self.computeQ()
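
# For reference, a minimal standalone sketch of the TD(lambda) update that
# learn() performs, written with an explicit eligibility-trace list in place
# of the TraceHolder helper (which appears to fold lambda into its decay).
# All names below are illustrative, not from the original code.
def td_lambda_step(theta, e, active, reward, last_q, alpha, gamma, lam, num_tilings):
    """One TD(lambda) step with replacing traces over binary tile features.

    theta  -- weight vector, updated in place
    e      -- eligibility traces, same length as theta, updated in place
    active -- indices of the tiles active in the current state
    last_q -- the prediction made for the previous state
    Returns the current prediction q (sum of theta over the active tiles).
    """
    q = sum(theta[i] for i in active)
    delta = reward + gamma * q - last_q   # TD error: r + gamma*w.x' - w.x
    step = (alpha / num_tilings) * delta
    for i in range(len(theta)):
        theta[i] += step * e[i]           # apply the update through the traces
        e[i] *= gamma * lam               # then decay every trace
    for i in active:
        e[i] = 1.0                        # replacing traces on the active tiles
    return q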