class SARSALambdaLearningAlgorithm(ValueLearningAlgorithm):
    """
    :description: Class implementing the SARSA Lambda algorithm. This
        class is equivalent to the SARSALearningAlgorithm class when
        the trace decay (lambda) is set to 0; however, we keep it
        separate because tracking eligibility traces adds overhead
        and because keeping the two apart makes the difference
        between them clear.
    """
    def __init__(self, actions, discount, featureExtractor,
                explorationProb, stepSize, threshold, decay, maxGradient,
                num_consecutive_random_actions):
        """
        :note: please see parent class for params not described here
        """
        super(SARSALambdaLearningAlgorithm, self).__init__(actions, discount, featureExtractor,
                    explorationProb, stepSize, maxGradient, num_consecutive_random_actions)
        self.eligibility_traces = EligibilityTraces(threshold, decay)

    def incorporateFeedback(self, state, action, reward, newState):
        """
        :description: performs a SARSA update

        :type state: dictionary
        :param state: the state of the game

        :type action: int
        :param action: the action for which to retrieve the Q-value

        :type reward: float
        :param reward: reward associated with being in newState

        :type newState: dictionary
        :param newState: the new state of the game

        :rtype: int or None
        :returns: if an action is returned, it is the next action to take
        """
        stepSize = self.stepSize
        prediction = self.getQ(state, action)
        self.eligibility_traces.update_all()
        target = reward
        newAction = None
        for f, v in self.featureExtractor(state, action):
            # accumulate the trace for each active feature (v is typically 1 for binary features)
            self.eligibility_traces[f] += v
        if newState is not None:
            # SARSA differs from Q-learning in that it does not take the max
            # over actions, but instead selects the next action using its policy,
            # and in that it returns the selected action so that the main
            # training loop may use it in the next iteration
            newAction = self.getAction(newState)
            target += self.discount * self.getQ(newState, newAction)

        update = stepSize * (prediction - target)
        update = np.clip(update, -self.maxGradient, self.maxGradient)

        for f, e in self.eligibility_traces.iteritems():
            self.weights[f] -= update * e
            assert self.weights[f] < MAX_FEATURE_WEIGHT_VALUE
        # return newAction to denote that this is an on-policy algorithm
        return newAction
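
The EligibilityTraces container used above is not shown in these snippets. The sketch below is only an assumption inferred from the call sites: a dict-like map from feature keys to trace values, where reading a missing key gives zero, update_all decays every trace by decay, and traces that fall below threshold are dropped. The actual class in the source repository may differ.

class EligibilityTraces(dict):
    """Minimal dict-backed eligibility-trace store (assumed interface)."""

    def __init__(self, threshold, decay):
        super(EligibilityTraces, self).__init__()
        self.threshold = threshold
        self.decay = decay

    def __missing__(self, key):
        # unseen features behave as if their trace were zero,
        # so traces[f] += v works for brand-new features
        return 0.0

    def update_all(self):
        # decay every trace and discard those that have become negligible
        for key, value in list(self.items()):
            decayed = value * self.decay
            if decayed < self.threshold:
                del self[key]
            else:
                self[key] = decayed

With this interface, incorporateFeedback above first decays the existing traces, then bumps the traces of the currently active features, and finally applies the same clipped TD update to every feature weight in proportion to its trace.
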
Example #3
class SARSALambdaLearningAlgorithm(ValueLearningAlgorithm):
    """
    :description: Class implementing the SARSA Lambda algorithm. This
        class is equivalent to the SARSALearningAlgorithm class when
        the trace decay (lambda) is set to 0; however, we keep it
        separate because tracking eligibility traces adds overhead
        and because keeping the two apart makes the difference
        between them clear.
    """
    def __init__(self, actions, featureExtractor, discount, explorationProb,
                 stepSize, decay, threshold):
        super(SARSALambdaLearningAlgorithm,
              self).__init__(actions, featureExtractor, discount,
                             explorationProb, stepSize)
        self.threshold = threshold
        self.decay = decay
        self.eligibility_traces = EligibilityTraces(threshold, decay)
        self.name = "SARSALambda"
        self.maxFeatVectorNorm = 1
        self.firstReward = 0
        self.sawFirst = False

    def startEpisode(self, state):
        self.resetTraces()
        self.featureExtractor.extractFeatures(state)

    def resetTraces(self):
        self.eligibility_traces = EligibilityTraces(self.threshold, self.decay)

    def incorporateFeedback(self,
                            state,
                            action,
                            reward,
                            newState,
                            prediction=None,
                            target=None):
        """
        :description: performs a SARSA update

        :type state: dictionary
        :param state: the state of the game

        :type action: int
        :param action: the action for which to retrieve the Q-value

        :type reward: float
        :param reward: reward associated with being in newState

        :type newState: dictionary
        :param newState: the new state of the game

        :type prediction: float or None
        :param prediction: precomputed Q-value for (state, action); computed here if None

        :type target: float or None
        :param target: precomputed update target; computed from the reward and newState if None

        :rtype: int or None
        :returns: if an action is returned, it is the next action to take
        """
        self.eligibility_traces.update_all()
        # replacing traces: set the trace for each active (feature, action) pair to 1
        for f in self.featureExtractor.features:
            self.eligibility_traces[(f, action)] = 1

        if prediction is None:
            prediction = self.getQ(action)

        # normalize rewards by the magnitude of the first nonzero reward seen,
        # so the update scale does not depend on the game's reward scale
        if reward != 0 and not self.sawFirst:
            self.sawFirst = True
            self.firstReward = abs(float(reward))

        scaledReward = reward
        if self.sawFirst:
            scaledReward = reward / self.firstReward

        newAction = None

        if target is None:
            target = scaledReward
            if newState is not None:
                # extract features of the new state
                self.featureExtractor.extractFeatures(newState)
                # SARSA differs from Q-learning in that it does not take the max
                # over actions, but instead selects the next action using its policy,
                # and in that it returns the selected action so that the main
                # training loop may use it in the next iteration
                newAction = self.getAction()
                target += self.discount * self.getQ(newAction)

        # scale the step size by the largest number of active features seen so
        # far, which bounds the magnitude of the update for binary features
        if len(self.featureExtractor.features) > self.maxFeatVectorNorm:
            self.maxFeatVectorNorm = len(self.featureExtractor.features)

        update = self.stepSize / self.maxFeatVectorNorm * (prediction - target)
        for f, e in self.eligibility_traces.iteritems():
            self.weights[f] -= update * e

        return newAction
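
Both variants return the newly selected action to signal that SARSA(lambda) is on-policy: the training loop is expected to act with the very action the update just evaluated. The loop below is a rough sketch of that usage with the Example #3 variant as the agent; the env object with reset() and step(action) methods, and every other helper name here, is hypothetical and not taken from the source.

def train_episode(agent, env):
    # reset the traces and extract features for the initial state
    state = env.reset()
    agent.startEpisode(state)
    action = agent.getAction()

    total_reward = 0.0
    while True:
        newState, reward, done = env.step(action)
        total_reward += reward

        # on-policy update: the agent hands back the action it selected for
        # newState, so the loop acts with the same policy it is learning about
        newAction = agent.incorporateFeedback(state, action, reward,
                                              None if done else newState)
        if done:
            break
        state, action = newState, newAction

    return total_reward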