import numpy as np


class SARSALambdaLearningAlgorithm(ValueLearningAlgorithm):
    """
    :description: Class implementing the SARSA(lambda) algorithm. This class
        is equivalent to the SARSALearningAlgorithm class when the trace
        decay rate (the `decay` parameter, i.e. lambda) is set to 0; however,
        we keep it separate here because it imposes the overhead of tracking
        eligibility traces and because it makes the difference between the
        two algorithms explicit.
    """

    def __init__(self, actions, discount, featureExtractor, explorationProb,
                 stepSize, threshold, decay, maxGradient,
                 num_consecutive_random_actions):
        """
        :note: please see parent class for params not described here
        """
        super(SARSALambdaLearningAlgorithm, self).__init__(
            actions, discount, featureExtractor, explorationProb, stepSize,
            maxGradient, num_consecutive_random_actions)
        self.eligibility_traces = EligibilityTraces(threshold, decay)

    def incorporateFeedback(self, state, action, reward, newState):
        """
        :description: performs a SARSA update

        :type state: dictionary
        :param state: the state of the game

        :type action: int
        :param action: the action for which to retrieve the Q-value

        :type reward: float
        :param reward: reward associated with being in newState

        :type newState: dictionary
        :param newState: the new state of the game

        :rtype: int or None
        :return: if an action is returned, it is the next action taken
        """
        stepSize = self.stepSize
        prediction = self.getQ(state, action)
        self.eligibility_traces.update_all()

        target = reward
        newAction = None
        for f, v in self.featureExtractor(state, action):
            # v is typically 1 for binary features
            self.eligibility_traces[f] += v

        if newState is not None:
            # SARSA differs from Q-learning in that it does not take the max
            # over actions, but instead selects the next action using its
            # policy, and in that it returns the selected action so that the
            # main training loop may use it in the next iteration.
            newAction = self.getAction(newState)
            target += self.discount * self.getQ(newState, newAction)

        update = stepSize * (prediction - target)
        update = np.clip(update, -self.maxGradient, self.maxGradient)
        for f, e in self.eligibility_traces.iteritems():
            self.weights[f] -= update * e
            assert self.weights[f] < MAX_FEATURE_WEIGHT_VALUE

        # return newAction to denote that this is an on-policy algorithm
        return newAction
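# Both the class above and the revised version below rely on an
# EligibilityTraces helper that is not defined in this section. The sketch
# here is a minimal stand-in inferred from the call sites: the
# (threshold, decay) constructor, update_all(), dict-style access, and
# iteritems() are taken from how the classes use it; the decay-and-drop
# behavior inside update_all() is an assumption.
from collections import defaultdict


class EligibilityTraces(object):

    def __init__(self, threshold, decay):
        self.threshold = threshold
        self.decay = decay
        self.traces = defaultdict(float)

    def update_all(self):
        # decay every trace, dropping any that have become negligible so
        # that the weight-update loop only touches active features
        for f in self.traces.keys():
            self.traces[f] *= self.decay
            if self.traces[f] < self.threshold:
                del self.traces[f]

    def __getitem__(self, key):
        return self.traces[key]

    def __setitem__(self, key, value):
        self.traces[key] = value

    def iteritems(self):
        return self.traces.iteritems()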
class SARSALambdaLearningAlgorithm(ValueLearningAlgorithm):
    """
    :description: Class implementing the SARSA(lambda) algorithm. This class
        is equivalent to the SARSALearningAlgorithm class when the trace
        decay rate (the `decay` parameter, i.e. lambda) is set to 0; however,
        we keep it separate here because it imposes the overhead of tracking
        eligibility traces and because it makes the difference between the
        two algorithms explicit.
    """

    def __init__(self, actions, featureExtractor, discount, explorationProb,
                 stepSize, decay, threshold):
        super(SARSALambdaLearningAlgorithm, self).__init__(
            actions, featureExtractor, discount, explorationProb, stepSize)
        self.threshold = threshold
        self.decay = decay
        self.eligibility_traces = EligibilityTraces(threshold, decay)
        self.name = "SARSALambda"
        self.maxFeatVectorNorm = 1
        self.firstReward = 0
        self.sawFirst = False

    def startEpisode(self, state):
        self.resetTraces()
        self.featureExtractor.extractFeatures(state)

    def resetTraces(self):
        self.eligibility_traces = EligibilityTraces(self.threshold, self.decay)

    def incorporateFeedback(self, state, action, reward, newState,
                            prediction=None, target=None):
        """
        :description: performs a SARSA update

        :type state: dictionary
        :param state: the state of the game

        :type action: int
        :param action: the action for which to retrieve the Q-value

        :type reward: float
        :param reward: reward associated with being in newState

        :type newState: dictionary
        :param newState: the new state of the game

        :rtype: int or None
        :return: if an action is returned, it is the next action taken
        """
        self.eligibility_traces.update_all()
        for f in self.featureExtractor.features:
            self.eligibility_traces[(f, action)] = 1

        if prediction is None:
            prediction = self.getQ(action)

        # scale rewards by the magnitude of the first nonzero reward seen
        if reward != 0 and not self.sawFirst:
            self.sawFirst = True
            self.firstReward = abs(float(reward))
        scaledReward = reward
        if self.sawFirst:
            scaledReward = reward / self.firstReward

        newAction = None
        if target is None:
            target = scaledReward
            if newState is not None:
                # extract features of the new state
                self.featureExtractor.extractFeatures(newState)
                # SARSA differs from Q-learning in that it does not take the
                # max over actions, but instead selects the next action using
                # its policy, and in that it returns the selected action so
                # that the main training loop may use it in the next
                # iteration.
                newAction = self.getAction()
                target += self.discount * self.getQ(newAction)

        # normalize the step size by the largest feature-vector norm seen
        if len(self.featureExtractor.features) > self.maxFeatVectorNorm:
            self.maxFeatVectorNorm = len(self.featureExtractor.features)
        update = self.stepSize / self.maxFeatVectorNorm * (prediction - target)
        for f, e in self.eligibility_traces.iteritems():
            self.weights[f] -= update * e
        return newAction
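# A hypothetical training loop for the revised class above, to make the
# on-policy control flow concrete: incorporateFeedback returns the next
# action so that the agent acts with the same action it bootstrapped on.
# `env` (with reset/step), `num_actions`, `num_episodes`, `featureExtractor`,
# and the hyperparameter values are all placeholders, not part of the
# original code.
agent = SARSALambdaLearningAlgorithm(
    actions=range(num_actions), featureExtractor=featureExtractor,
    discount=0.99, explorationProb=0.05, stepSize=0.01,
    decay=0.9, threshold=0.01)

for episode in range(num_episodes):
    state = env.reset()
    agent.startEpisode(state)
    action = agent.getAction()
    while state is not None:
        newState, reward = env.step(action)  # newState is None on terminal
        action = agent.incorporateFeedback(state, action, reward, newState)
        state = newState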