Example #1
    def __init__(self,
                 numTilings=1,
                 parameters=2,
                 rlAlpha=0.5,
                 rlLambda=0.9,
                 rlGamma=0.9,
                 cTableSize=0):
        """ If you want to run an example of the code, simply just leave the parameters blank and it'll automatically set based on the parameters. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma

        self.prediction = None
        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.traceH = TraceHolder((self.numTilings**(self.parameters) + 1),
                                  self.rlLambda, 1000)
        self.F = [0 for item in range(self.numTilings)
                  ]  # the indices of the returned tiles will go in here
        self.theta = [
            0 for item in range((self.numTilings**(self.parameters + 1)) + 1)
        ]  # weight vector.
        self.cTable = CollisionTable(cTableSize, 'safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)
Example #2
    def __init__(self, numTilings = 1, num_bins = 2, rlAlpha = 0.5, rlLambda = 0.9, rlGamma = 0.9, cTableSize=0):
        """ To run an example, leave the arguments at their defaults; reasonable values are set automatically. """
        self.numTilings = numTilings
        self.num_bins = num_bins
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
    
        self.mem_size = 1048576 # 16,384 or 8,192 or 1,048,576 or 8,388,608 or 16,777,216 or 33,554,432
        self.prediction = None
        self.current_prediction = 0
        self.delta = 0
        self.lastS = None
        self.previous_tiles = [0 for item in range(self.numTilings)]
#         self.previous_state = [None for item in range(self.numTilings*(self.num_bins)**10)]
        self.previous_prediction = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.traceH = TraceHolder(self.mem_size, 0.01, 1000) # TraceHolder(mem, minT, maxN)
        self.F = [0 for item in range(self.numTilings)] # the indices of the returned tiles will go in here
        self.theta = [0 for item in range(self.mem_size)] # weight vector.
#         self.weights = [0 for item in range(self.numTilings*(self.num_bins)**10)]
#         self.e_trace = [0 for item in range(self.numTilings*(self.num_bins)**10)] # added by Ann
        self.cTable = CollisionTable(cTableSize, 'super safe') # look into this...
        self.verifier = Verifier(self.rlGamma)
Example #3
    def __init__(self, actions, numTilings = 1, parameters = 2, rlAlpha = 0.5, rlLambda = 0.9,
                 rlGamma = 0.9, rlEpsilon = 0.1, cTableSize=0, action_selection = 'softmax'):
        """ To run an example, leave the arguments at their defaults; reasonable values are set automatically. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        self.rlEpsilon = rlEpsilon
        self.action_selection = action_selection

        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.lastAction = None
        self.currentAction = None

        self.actions = actions # the number of actions we can select from
        self.traceH = TraceHolder((self.numTilings**(self.parameters)+1), self.rlLambda, 1000)
        self.F = [[0 for item in range(self.numTilings)] for i in range(actions)] # the indices of the returned tiles will go in here
        self.q_vals = [0 for i in range(actions)]
        self.q = {} # maps each action to its weight vector
        for action in range(actions):
            self.q[action] = [0 for item in range((self.numTilings**(self.parameters+1))+1)] # weight vector for this action
        self.cTable = CollisionTable(cTableSize, 'safe') # look into this...
        self.verifier = Verifier(self.rlGamma)
Example #4
    def __init__(self,
                 actions,
                 numTilings=1,
                 parameters=2,
                 rlAlpha=0.5,
                 rlLambda=0.9,
                 rlGamma=0.9,
                 rlEpsilon=0.1,
                 cTableSize=0,
                 action_selection='softmax'):
        """ To run an example, leave the arguments at their defaults; reasonable values are set automatically. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        self.rlEpsilon = rlEpsilon
        self.action_selection = action_selection

        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.lastAction = None
        self.currentAction = None

        self.actions = actions  # the number of actions we can select from
        self.traceH = TraceHolder((self.numTilings**(self.parameters) + 1),
                                  self.rlLambda, 1000)
        self.F = [[0 for item in range(self.numTilings)]
                  for i in range(actions)
                  ]  # the indices of the returned tiles will go in here
        self.q_vals = [0 for i in range(actions)]
        self.q = {}  # maps each action to its weight vector
        for action in range(actions):
            self.q[action] = [
                0
                for item in range((self.numTilings**(self.parameters + 1)) + 1)
            ]  # weight vector for this action
        self.cTable = CollisionTable(cTableSize, 'safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)
Example #5
    def __init__(self, numTilings = 1, parameters = 2, rlAlpha = 0.5, rlLambda = 0.9, rlGamma = 0.9, cTableSize=0):
        """ To run an example, leave the arguments at their defaults; reasonable values are set automatically. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma

        self.prediction = None
        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.traceH = TraceHolder((self.numTilings**(self.parameters)+1), self.rlLambda, 1000)
        self.F = [0 for item in range(self.numTilings)] # the indices of the returned tiles will go in here
        self.theta = [0 for item in range((self.numTilings**(self.parameters+1))+1)] # weight vector.
        self.cTable = CollisionTable(cTableSize, 'safe') # look into this...
        self.verifier = Verifier(self.rlGamma)
Example #6
# Assumed context: `tiles`, `loadtiles`, and `CollisionTable` come from Rich
# Sutton's tile-coding software (tiles2, linked below); `TraceHolder`,
# `Verifier`, and the `Learner` base class are project-specific and assumed
# to be in scope.
class TDLambdaLearner(Learner):
    """
    Note: the TileCoder is Rich's Python version, which is still in Alpha.
    See more at: http://webdocs.cs.ualberta.ca/~sutton/tiles2.html#Python%20Versions

        Collision Table notes:
            cTableSize is the size to which the collision table will be instantiated. The size must be a power of two.
            In calls that get tiles, the collision table is passed instead of memory_size, since it already carries that value.
    """
    def __init__(self, numTilings = 1, num_bins = 2, rlAlpha = 0.5, rlLambda = 0.9, rlGamma = 0.9, cTableSize=0):
        """ If you want to run an example of the code, simply just leave the parameters blank and it'll automatically set based on the parameters. """
        self.numTilings = numTilings
        self.num_bins = num_bins
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
    
        self.mem_size = 1048576 # 16,384 or 8,192 or 1,048,576 or 8,388,608 or 16,777,216 or 33,554,432
        self.prediction = None
        self.current_prediction = 0
        self.delta = 0
        self.num_steps = 0 # step counter, incremented in learn()
        self.lastS = None
        self.previous_tiles = [0 for item in range(self.numTilings)]
#         self.previous_state = [None for item in range(self.numTilings*(self.num_bins)**10)]
        self.previous_prediction = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.traceH = TraceHolder(self.mem_size, 0.01, 1000) # TraceHolder(mem, minT, maxN)
        self.F = [0 for item in range(self.numTilings)] # the indices of the returned tiles will go in here
        self.theta = [0 for item in range(self.mem_size)] # weight vector.
#         self.weights = [0 for item in range(self.numTilings*(self.num_bins)**10)]
#         self.e_trace = [0 for item in range(self.numTilings*(self.num_bins)**10)] # added by Ann
        self.cTable = CollisionTable(cTableSize, 'super safe') # look into this...
        self.verifier = Verifier(self.rlGamma)
    
#     def Ann_update(self, current_state, numstates, reward=None):
#         if current_state != None:
#             self.Ann_learn(current_state, reward, numstates)
#             return self.current_prediction
#         else: 
#             return None
#      
#     def Ann_learn(self, current_state, reward, numstates):   
#         active_tiles = simple_tiles(self.numTilings, self.numTilings*self.num_bins, current_state, numstates) # returns index of active features
#         print "active tiles = " + str(active_tiles)
#         print "previous tiles = " + str(self.previous_tiles)
#         if self.previous_prediction != None:
# #             self.current_prediction = 0
#             for index in active_tiles:
#                 print 'index = ' + str(index)
#                 self.current_prediction = self.weights[index] # not sure if this is right
# #                 print 'weights[index] = ' + str(self.weights[index])     
#             self.delta = reward + self.rlGamma * self.current_prediction - self.previous_prediction
#             print 'self.delta = ' + str(self.delta)
#             if self.previous_state != None:
#                 self.previous_state = [0 for item in range(self.numTilings*(self.num_bins)**10)]
#                 for index in self.previous_tiles:
#                     self.previous_state[index] = 1 
# #                     print 'previous state = ' + str(self.previous_state)
#                     self.e_trace = [x + y for x, y in zip(self.previous_state, [i * self.rlLambda * self.rlGamma for i in self.e_trace])]
# #                     print 'e_trace = ' + str(self.e_trace)
#             self.weights = [x + y for x, y in zip(self.weights, [i * self.rlAlpha * self.delta for i in self.e_trace])] # alpha needs to be divided by N
# #             print 'weights = ' + str(self.weights)
#         self.previous_tiles = active_tiles
#         self.previous_prediction = self.current_prediction
#         print 'current prediction = ' + str(self.current_prediction)
#         self.verifier.updateReward(reward)
#         self.verifier.updatePrediction(self.current_prediction)
#         self.normalized_prediction = self.current_prediction * (1-self.rlGamma)
#         print 'normalized prediction = ' + str(self.normalized_prediction)
     
    def update(self, features, target=None):
        if features is not None:
            self.learn(features, target)
            return self.prediction
        else:
            return None
      
    def learn(self, state, reward):
        self.loadFeatures(state, self.F)
        currentq = self.computeQ() # computeQ returns w*x' (current prediction)
        if self.lastS is not None:
            delta = reward + self.rlGamma*currentq - self.lastQ # delta = r + gamma*w*x' - w*x
            for i in self.traceH.getTraceIndices():
                self.theta[i] += delta * (self.rlAlpha / self.numTilings) * self.traceH.getTrace(i) # delta * alpha/N * e
            self.traceH.decayTraces(self.rlGamma)
            self.traceH.replaceTraces(self.F)
        self.lastQ = currentq
        self.lastS = state
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)
        self.normalized_prediction = self.prediction * (1 - self.rlGamma)
        

    def computeQ(self):
        q = 0 
        for i in self.F:
            q += self.theta[i]
        return q
    
    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
               tiles                   ; a provided array for the tile indices to go into
               starting-element        ; first element of "tiles" to be changed (typically 0)
               num-tilings             ; the number of tilings desired
               memory-size             ; the number of possible tile indices
               floats                  ; a list of real values making up the input vector
               ints                    ; list of optional inputs to get different hashings
        """
        # tiles() returns the list of active tile indices directly; the
        # featureVector argument is kept for interface compatibility only.
        self.F = tiles(self.numTilings, self.mem_size, stateVars)
    
    def loss(self, x, r, prev_state=None):
        """
        Returns the TD error, assuming reward r is given for the
        transition from prev_state to x.
        If prev_state is None, the leftmost element of exp_queue is used.
        (exp_queue, horizon, gamma, and value are assumed to be provided
        by the Learner base class.)
        """
        if prev_state is None:
            if len(self.exp_queue) < self.horizon:
                return None
            else:
                prev_state = self.exp_queue[0][0]

        vp = r + self.gamma * self.value(x)
        v = self.value(prev_state)
        delta = vp - v
        return delta 
    
    
    def predict(self, x):
        self.loadFeatures(x, self.F)
        return self.computeQ()
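A minimal usage sketch for the TDLambdaLearner above (the feature values and
reward stream are made up for illustration; it assumes Rich Sutton's
tile-coding utilities and the project's TraceHolder/CollisionTable/Verifier
classes are importable, as noted above):

    # Hypothetical driver loop: predict-and-learn on a scalar signal.
    learner = TDLambdaLearner(numTilings=8, num_bins=4, rlAlpha=0.1,
                              rlLambda=0.9, rlGamma=0.9, cTableSize=4096)
    signal = [0.1, 0.4, 0.35, 0.8, 0.6]  # made-up observations
    for t in range(1, len(signal)):
        features = [signal[t - 1]]  # state variables fed to the tile coder
        reward = signal[t]          # treat the next sample as the reward
        prediction = learner.update(features, reward)
        print("step " + str(t) + ", prediction: " + str(prediction))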
Example #7
import operator
import random

# loadtiles and CollisionTable come from Rich Sutton's tile-coding software
# (tiles2); TraceHolder, Verifier, and the Learner base class are assumed to
# be defined elsewhere in the project.
class Q_learning(Learner):

    def __init__(self, actions, numTilings = 1, parameters = 2, rlAlpha = 0.5, rlLambda = 0.9,
                 rlGamma = 0.9, rlEpsilon = 0.1, cTableSize=0, action_selection = 'softmax'):
        """ If you want to run an example of the code, simply just leave the parameters blank and it'll automatically set based on the parameters. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        self.rlEpsilon = rlEpsilon
        self.action_selection = action_selection

        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.lastAction = None
        self.currentAction = None

        self.actions = actions # the number of actions we can select from
        self.traceH = TraceHolder((self.numTilings**(self.parameters)+1), self.rlLambda, 1000)
        self.F = [[0 for item in range(self.numTilings)] for i in range(actions)] # the indices of the returned tiles will go in here
        self.q_vals = [0 for i in range(actions)]
        self.q = {} # maps each action to its weight vector
        for action in range(actions):
            self.q[action] = [0 for item in range((self.numTilings**(self.parameters+1))+1)] # weight vector for this action
        self.num_steps = 0 # step counter, incremented in learn()
        self.prediction = None # last computed action value, reported to the verifier
        self.cTable = CollisionTable(cTableSize, 'safe') # look into this...
        self.verifier = Verifier(self.rlGamma)


    def chooseAction(self, features):
        for action in range(self.actions):
            self.loadFeatures(featureVector=self.F[action], stateVars=features)
            self.q_vals[action] = self.computeQ(action)
        return self.eGreedy()


    def eGreedy(self):
        if random.random() < self.rlEpsilon:
            return random.randrange(self.actions) # random action
        else:
            max_index, max_value = max(enumerate(self.q_vals), key=operator.itemgetter(1))
            return max_index # best action


    def update(self, features, target=None):
        # learning step (skipped until an action has been taken)
        if features is not None and self.currentAction is not None:
            self.learn(features, target, self.currentAction)

        self.lastAction = self.currentAction
        self.currentAction = self.chooseAction(features)

        # action selection step
        return self.currentAction


    def learn(self, features, reward, action):
        self.loadFeatures(features, self.F[action])
        currentq = self.computeQ(action)

        if self.lastS is not None and self.lastAction is not None: # if we're past the first step
            delta = reward - self.lastQ
            delta += self.rlGamma * currentq
            amt = delta * (self.rlAlpha / self.numTilings)

            for i in self.traceH.getTraceIndices():
                self.q[action][i] += amt * self.traceH.getTrace(i)

            max_action, max_value = max(enumerate(self.q_vals), key=operator.itemgetter(1))
            if action == max_action:
                self.traceH.decayTraces(self.rlGamma*self.rlLambda)
            else:
                self.traceH.decayTraces(0) # Watkins-style Q(lambda): cut traces after a non-greedy action
            self.traceH.replaceTraces(self.F[action])

        self.lastQ = currentq
        self.lastS = features
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)


    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
               tiles                   ; a provided array for the tile indices to go into
               starting-element        ; first element of "tiles" to be changed (typically 0)
               num-tilings             ; the number of tilings desired
               memory-size             ; the number of possible tile indices
               floats                  ; a list of real values making up the input vector
               ints                    ; list of optional inputs to get different hashings
        """
        loadtiles(featureVector, 0, self.numTilings, self.numTilings**(self.parameters), stateVars)

    def computeQ(self, a):
        "compute value of action a for the current tiles and weights"
        q = 0
        for i in self.F[a]:
            q += self.q[a][i]
        return q
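A minimal interaction sketch for the Q_learning agent above (env_reset and
env_step are hypothetical environment functions, shown only to illustrate the
calling convention; `actions` is the number of discrete actions):

    # Hypothetical agent-environment loop using the epsilon-greedy policy.
    agent = Q_learning(actions=3, numTilings=8, parameters=2,
                       rlAlpha=0.1, rlEpsilon=0.1, cTableSize=4096)
    features = env_reset()  # hypothetical: initial state variables
    reward = 0
    for step in range(1000):
        action = agent.update(features, reward)  # learn, then pick an action
        features, reward = env_step(action)      # hypothetical environment call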
Example #8
class TDLambdaLearner(Learner):
    """
    Note: the TileCoder is Rich's Python version, which is still in Alpha.
    See more at: http://webdocs.cs.ualberta.ca/~sutton/tiles2.html#Python%20Versions
    
        Collision Table notes:
            cTableSize is the size that the collision table will be instantiated to. The size must be  a power of two.
            In calls for get tiles, the collision table is used in stead of memory_size, as it already has it.
    
    """
    def __init__(self, numTilings = 1, parameters = 2, rlAlpha = 0.5, rlLambda = 0.9, rlGamma = 0.9, cTableSize=0):
        """ If you want to run an example of the code, simply just leave the parameters blank and it'll automatically set based on the parameters. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
    
        self.prediction = None
        self.num_steps = 0 # step counter, incremented in learn()
        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.traceH = TraceHolder((self.numTilings**(self.parameters)+1), self.rlLambda, 1000)
        self.F = [0 for item in range(self.numTilings)] # the indices of the returned tiles will go in here
        self.theta = [0 for item in range((self.numTilings**(self.parameters+1))+1)] # weight vector.
        self.cTable = CollisionTable(cTableSize, 'safe') # look into this...
        self.verifier = Verifier(self.rlGamma)


    def update(self, features, target=None):
        if features is not None:
            self.learn(features, target)
            return self.prediction
        else:
            return None


    def learn(self, state, reward):
        self.loadFeatures(state, self.F)
        currentq = self.computeQ()
        if self.lastS is not None:
            delta = reward - self.lastQ
            delta += self.rlGamma * currentq
            amt = delta * (self.rlAlpha / self.numTilings)
            for i in self.traceH.getTraceIndices():
                self.theta[i] += amt * self.traceH.getTrace(i)
            self.traceH.decayTraces(self.rlGamma)
            self.traceH.replaceTraces(self.F)
        self.lastQ = currentq
        self.lastS = state
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)
        

    def computeQ(self):
        q = 0
        for i in self.F:
            q += self.theta[i]
        return q


    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
               tiles                   ; a provided array for the tile indices to go into
               starting-element        ; first element of "tiles" to be changed (typically 0)
               num-tilings             ; the number of tilings desired
               memory-size             ; the number of possible tile indices
               floats                  ; a list of real values making up the input vector
               ints                    ; list of optional inputs to get different hashings
        """
        loadtiles(featureVector, 0, self.numTilings, self.numTilings**(self.parameters), stateVars)
        print("loaded featureVector; theta length = " + str(len(self.theta)))  # debug


    def loss(self, x, r, prev_state=None):
        """
        Returns the TD error, assuming reward r is given for the
        transition from prev_state to x.
        If prev_state is None, the leftmost element of exp_queue is used.
        (exp_queue, horizon, gamma, and value are assumed to be provided
        by the Learner base class.)
        """
        if prev_state is None:
            if len(self.exp_queue) < self.horizon:
                return None
            else:
                prev_state = self.exp_queue[0][0]

        vp = r + self.gamma * self.value(x)
        v = self.value(prev_state)
        delta = vp - v
        return delta


    def predict(self, x):
        self.loadFeatures(x, self.F)
        return self.computeQ()
Example #9
class Q_learning(Learner):
    def __init__(self,
                 actions,
                 numTilings=1,
                 parameters=2,
                 rlAlpha=0.5,
                 rlLambda=0.9,
                 rlGamma=0.9,
                 rlEpsilon=0.1,
                 cTableSize=0,
                 action_selection='softmax'):
        """ To run an example, leave the arguments at their defaults; reasonable values are set automatically. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma
        self.rlEpsilon = rlEpsilon
        self.action_selection = action_selection

        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.lastAction = None
        self.currentAction = None

        self.actions = actions  # the number of actions we can select from
        self.traceH = TraceHolder((self.numTilings**(self.parameters) + 1),
                                  self.rlLambda, 1000)
        self.F = [[0 for item in range(self.numTilings)]
                  for i in range(actions)
                  ]  # the indices of the returned tiles will go in here
        self.q_vals = [0 for i in range(actions)]
        self.q = {}  # maps each action to its weight vector
        for action in range(actions):
            self.q[action] = [
                0
                for item in range((self.numTilings**(self.parameters + 1)) + 1)
            ]  # weight vector for this action
        self.num_steps = 0  # step counter, incremented in learn()
        self.prediction = None  # last computed action value, reported to the verifier
        self.cTable = CollisionTable(cTableSize, 'safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)

    def chooseAction(self, features):
        for action in range(self.actions):
            self.loadFeatures(featureVector=self.F[action], stateVars=features)
            self.q_vals[action] = self.computeQ(action)
        return self.eGreedy()

    def eGreedy(self):
        if random.random() < self.rlEpsilon:
            return random.randrange(self.actions)  # random action
        else:
            max_index, max_value = max(enumerate(self.q_vals),
                                       key=operator.itemgetter(1))
            return max_index  # best action

    def update(self, features, target=None):
        # learning step (skipped until an action has been taken)
        if features is not None and self.currentAction is not None:
            self.learn(features, target, self.currentAction)

        self.lastAction = self.currentAction
        self.currentAction = self.chooseAction(features)

        # action selection step
        return self.currentAction

    def learn(self, features, reward, action):
        self.loadFeatures(features, self.F[action])
        currentq = self.computeQ(action)

        if self.lastS is not None and self.lastAction is not None:  # if we're past the first step
            delta = reward - self.lastQ
            delta += self.rlGamma * currentq
            amt = delta * (self.rlAlpha / self.numTilings)

            for i in self.traceH.getTraceIndices():
                self.q[action][i] += amt * self.traceH.getTrace(i)

            max_action, max_value = max(enumerate(self.q_vals),
                                        key=operator.itemgetter(1))
            if action == max_action:
                self.traceH.decayTraces(self.rlGamma * self.rlLambda)
            else:
                self.traceH.decayTraces(0)  # Watkins-style Q(lambda): cut traces after a non-greedy action
            self.traceH.replaceTraces(self.F[action])

        self.lastQ = currentq
        self.lastS = features
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)

    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
               tiles                   ; a provided array for the tile indices to go into
               starting-element        ; first element of "tiles" to be changed (typically 0)
               num-tilings             ; the number of tilings desired
               memory-size             ; the number of possible tile indices
               floats                  ; a list of real values making up the input vector
               ints                    ; list of optional inputs to get different hashings
        """
        loadtiles(featureVector, 0, self.numTilings,
                  self.numTilings**(self.parameters), stateVars)

    def computeQ(self, a):
        "compute value of action a for the current tiles and weights"
        q = 0
        for i in self.F[a]:
            q += self.q[a][i]
        return q
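In equation form, the learn() method of the Q-learning examples implements the
following update (a sketch in the code's own symbols, with N = numTilings,
per-action weights theta_a, and e the eligibility trace; the trace cut on
non-greedy actions is the Watkins-style Q(lambda) rule):

    \delta_t = r_t + \gamma \, Q(s_t, a_t) - Q(s_{t-1}, a_{t-1})
    \theta_{a_t} \leftarrow \theta_{a_t} + \frac{\alpha}{N} \, \delta_t \, e
    e \leftarrow \gamma \lambda \, e \text{ if } a_t \text{ is greedy, else } e \leftarrow 0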
Example #10
class TDLambdaLearner(Learner):
    """
    Note: the TileCoder is Rich's Python version, which is still in Alpha.
    See more at: http://webdocs.cs.ualberta.ca/~sutton/tiles2.html#Python%20Versions
    
        Collision Table notes:
            cTableSize is the size that the collision table will be instantiated to. The size must be  a power of two.
            In calls for get tiles, the collision table is used in stead of memory_size, as it already has it.
    
    """
    def __init__(self,
                 numTilings=1,
                 parameters=2,
                 rlAlpha=0.5,
                 rlLambda=0.9,
                 rlGamma=0.9,
                 cTableSize=0):
        """ If you want to run an example of the code, simply just leave the parameters blank and it'll automatically set based on the parameters. """
        self.numTilings = numTilings
        self.tileWidths = list()
        self.parameters = parameters
        self.rlAlpha = rlAlpha
        self.rlLambda = rlLambda
        self.rlGamma = rlGamma

        self.prediction = None
        self.num_steps = 0  # step counter, incremented in learn()
        self.lastS = None
        self.lastQ = None
        self.lastPrediction = None
        self.lastReward = None
        self.traceH = TraceHolder((self.numTilings**(self.parameters) + 1),
                                  self.rlLambda, 1000)
        self.F = [0 for item in range(self.numTilings)
                  ]  # the indices of the returned tiles will go in here
        self.theta = [
            0 for item in range((self.numTilings**(self.parameters + 1)) + 1)
        ]  # weight vector.
        self.cTable = CollisionTable(cTableSize, 'safe')  # look into this...
        self.verifier = Verifier(self.rlGamma)

    def update(self, features, target=None):
        if features is not None:
            self.learn(features, target)
            return self.prediction
        else:
            return None

    def learn(self, state, reward):
        self.loadFeatures(state, self.F)
        currentq = self.computeQ()
        if self.lastS is not None:
            delta = reward - self.lastQ
            delta += self.rlGamma * currentq
            amt = delta * (self.rlAlpha / self.numTilings)
            for i in self.traceH.getTraceIndices():
                self.theta[i] += amt * self.traceH.getTrace(i)
            self.traceH.decayTraces(self.rlGamma)
            self.traceH.replaceTraces(self.F)
        self.lastQ = currentq
        self.lastS = state
        self.prediction = currentq
        self.num_steps += 1
        self.verifier.updateReward(reward)
        self.verifier.updatePrediction(self.prediction)

    def computeQ(self):
        q = 0
        for i in self.F:
            q += self.theta[i]
        return q

    def loadFeatures(self, stateVars, featureVector):
        """
        As provided in Rich's explanation:
               tiles                   ; a provided array for the tile indices to go into
               starting-element        ; first element of "tiles" to be changed (typically 0)
               num-tilings             ; the number of tilings desired
               memory-size             ; the number of possible tile indices
               floats                  ; a list of real values making up the input vector
               ints                    ; list of optional inputs to get different hashings
        """
        loadtiles(featureVector, 0, self.numTilings,
                  self.numTilings**(self.parameters), stateVars)
        print("loaded featureVector; theta length = " + str(len(self.theta)))  # debug

    def loss(self, x, r, prev_state=None):
        """
        Returns the TD error, assuming reward r is given for the
        transition from prev_state to x.
        If prev_state is None, the leftmost element of exp_queue is used.
        (exp_queue, horizon, gamma, and value are assumed to be provided
        by the Learner base class.)
        """
        if prev_state is None:
            if len(self.exp_queue) < self.horizon:
                return None
            else:
                prev_state = self.exp_queue[0][0]

        vp = r + self.gamma * self.value(x)
        v = self.value(prev_state)
        delta = vp - v
        return delta

    def predict(self, x):
        self.loadFeatures(x, self.F)
        return self.computeQ()
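For reference, the learn() method of the TDLambdaLearner examples implements
the standard TD(lambda) update with replacing traces (a sketch in the code's
own symbols, with N = numTilings and assuming the TraceHolder applies the
lambda it was constructed with when decayTraces(gamma) is called):

    \delta_t = r_t + \gamma \, \theta^\top x_t - \theta^\top x_{t-1}
    \theta \leftarrow \theta + \frac{\alpha}{N} \, \delta_t \, e
    e \leftarrow \gamma \lambda \, e, \text{ then } e_i \leftarrow 1 \text{ on the active tiles } i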