Example #1

from operator import itemgetter
from random import shuffle
# Agent, AgentException, FAEstimator, VectorBlockEstimator and Linear are
# assumed to be provided by the surrounding RL framework; their imports are
# not shown in the original listing.

class FQIAgent(Agent):
    
    alpha = 1.0
    gamma = 0.9
    iterations = 1
    presentations = 1
    
    def __init__(self, faClass=Linear, resetFA=True, ordered=False, vectorblock=False):
        """ initialize the agent with the estimatorClass. """
        Agent.__init__(self)
        self.faClass = faClass
        self.resetFA = resetFA
        self.ordered = ordered
        self.vectorblock = vectorblock

    
    def _setup(self, conditions):
        """ if agent is discrete in states and actions create Q-Table. """
        Agent._setup(self, conditions)
        if self.conditions['discreteStates'] or not self.conditions['discreteActions']:
            raise AgentException('FQIAgent expects continuous states and discrete actions. Use adapter or a different environment.')
        
        if self.vectorblock:
            self.estimator = VectorBlockEstimator(self.conditions['stateDim'], self.conditions['actionNum'], faClass=self.faClass, ordered=self.ordered)
        else:
            self.estimator = FAEstimator(self.conditions['stateDim'], self.conditions['actionNum'], faClass=self.faClass, ordered=self.ordered)
    

    def _calculate(self):
        self.action = self.estimator.getBestAction(self.state)
    
    
    def newEpisode(self):
        """ reset the memory. """
        Agent.newEpisode(self)

        if self.ordered:
            self.estimator.resetMemory()
    

    def giveReward(self, reward):
        """ additionally remember the chosen action to not draw it again. """
        if self.ordered:
            self.estimator.rememberAction(self.action)
        
        Agent.giveReward(self, reward)    

    
    def buildMemoryFromEpisode(self, episode):
        """ builds the memory for already executed actions from the current
            episode. this is necessary if the episode was appended without
            actually being executed via the experiment's interact() method. 
        """
        if self.ordered:
            for a in episode.actions:
                self.estimator.rememberAction(a)
                
    
    def learn(self):
        """ go through whole episode and make Q-value updates. """  

        for i in range(self.iterations):
            dataset = []
            
            for episode in self.history:
                if self.ordered:
                    self.estimator.resetMemory()

                for state, action, reward, nextstate in episode:                    
                    qvalue = self.estimator.getValue(state, action)
                    if self.ordered:
                        self.estimator.rememberAction(action)
                    if nextstate is not None:
                        bestnext = self.estimator.getValue(nextstate, self.estimator.getBestAction(nextstate))
                    else:
                        bestnext = 0.
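                    # blended one-step target: with alpha = 1.0 this reduces to
                    # the plain fitted-Q target  reward + gamma * max_a' Q(s', a')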
                    target = (1-self.alpha) * qvalue + self.alpha * (reward + self.gamma * bestnext)

                    dataset.append([state, action, target])

            if len(dataset) == 0:
                continue
                
            # ground targets to 0 to avoid drifting values
            mintarget = min(map(itemgetter(2), dataset))
            
            # reset estimator (not resetting might lead to faster convergence!)
            if self.resetFA:
                self.estimator.reset()
            for _ in range(self.presentations):
                shuffle(dataset)
                for state, action, target in dataset:
                    self.estimator.updateValue(state, action, target-mintarget)
            self.estimator.train()
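
The core of learn() above is the fitted Q-iteration sweep: every transition in the history is turned into a regression target (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')), the targets are grounded at zero, and the (optionally reset) estimator is retrained on the shuffled dataset. The standalone sketch below reproduces that sweep with a toy tabular stand-in for FAEstimator; every name in it (q_table, transitions, get_value, best_value) is illustrative and not part of the framework.

from random import shuffle

alpha, gamma = 1.0, 0.9
n_actions = 2

# (state, action, reward, nextstate) transitions; nextstate None = terminal
transitions = [
    (0, 1, 0.0, 1),
    (1, 0, 1.0, None),
]

q_table = {}  # (state, action) -> value, defaulting to 0.0

def get_value(s, a):
    return q_table.get((s, a), 0.0)

def best_value(s):
    # value of the greedy action, i.e. max_a' Q(s, a')
    return max(get_value(s, a) for a in range(n_actions))

# build the regression dataset exactly as FQIAgent.learn does
dataset = []
for state, action, reward, nextstate in transitions:
    bestnext = best_value(nextstate) if nextstate is not None else 0.0
    target = (1 - alpha) * get_value(state, action) + alpha * (reward + gamma * bestnext)
    dataset.append((state, action, target))

# ground targets to zero, then "train" the toy estimator on the shuffled data
mintarget = min(t for _, _, t in dataset)
shuffle(dataset)
for state, action, target in dataset:
    q_table[(state, action)] = target - mintarget

print(q_table)
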
Example #2

from numpy import array, equal, ones, r_, random
# Agent, AgentException, FAEstimator, History and Linear are assumed to be
# provided by the surrounding RL framework; their imports are not shown in
# the original listing.

class BASAgent(Agent):
    
    alpha = 1.0
    gamma = 0.9
    
    def __init__(self, faClass=Linear):
        """ initialize the agent with the estimatorClass. """
        Agent.__init__(self)
        
        self.amin = -1.
        self.amax = 1.
        self.nres = 3
        
        # store (decision,action) tuples for one action in the list
        self.decisions = []
        
        self.faClass = faClass
    
    def _setup(self, conditions):
        """ if agent is discrete in states and actions create Q-Table. """
        Agent._setup(self, conditions)
        if self.conditions['discreteStates'] or self.conditions['discreteActions']:
            raise AgentException('BASAgent expects continuous states and actions. Use adapter or a different environment.')
            
        self.estimator = FAEstimator(self.conditions['stateDim'] + self.conditions['actionDim'], 2**self.conditions['actionDim'], self.faClass)
        
        # change history to store bas-extended experiences
        self.history = History(self.conditions['stateDim'] + self.conditions['actionDim'], 1)
    
    
    def giveReward(self, reward):
        """ override function to store the internal actions in the history. """
        if self.progressCnt == 2:
            self.reward = reward
            self.progressCnt = 0
            if self.loggingEnabled:
                # go through internal decisions and transform them to states, actions, rewards
                olda = array([(self.amax + self.amin) / 2.]*self.conditions['actionDim'])
                for i, (d,a) in enumerate(self.decisions):
                    state = r_[self.state, olda]
                    action = d
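                    # only the last internal decision carries the external
                    # reward; earlier decisions of the same action get zero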
                    
                    if i < self.nres-1:
                        reward = 0.
                    else:
                        reward = self.reward
                    
                    self.history.append(state, action, reward)
                    olda = a                   
                    

        else:
            raise AgentException('reward was given before action was returned.')
    
    def _internalDecisions(self, state):
        """ takes a state and queries the estimator several times as a binary search.
            generates (binary) decision and action at each timestep. """
        
        self.decisions = []
        
        a = array([(self.amax + self.amin) / 2.]*self.conditions['actionDim'])
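        # delta is chosen so that the nres halved steps below sum to
        # (amax - amin) / 2, i.e. the binary search can reach the whole
        # action range starting from the midpoint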
        delta = (self.amax - self.amin) * float(2**(self.nres-1)) / (2**self.nres -1)
        for i in range(self.nres):
            delta = delta/2.
            decision = self.estimator.getBestAction(r_[self.state, a])
            
            # internal epsilon-greedy exploration
            if random.random() < 0.1:
                decision = array([random.randint(2**self.conditions['actionDim'])])

            # turn into binary list
            blist = -1.*ones(self.conditions['actionDim'])
            for j, bit in enumerate(reversed(bin(int(decision))[2:])):
                if bit == '1':
                    blist[-j-1] = 1.
            
            # update action
            a = a + delta*blist
            self.decisions.append((decision, a))
            
        return a
                
    def _calculate(self):
        """ Return the action with the maximal value for the given state. """
        self.action = self._internalDecisions(self.state)


    def learn(self):
        """ go through whole episode and make Q-value updates. """  
        for i in range(1):  # single sweep; raise the bound for more iterations
            
            self.estimator.reset()

            for episode in self.history:
                for state, action, reward, nextstate in episode:
                    # # don't consider last state
                    # if equal(state, nextstate).all():
                    #     break

                    qvalue = self.estimator.getValue(state, action)
                    bestnext = self.estimator.getValue(nextstate, self.estimator.getBestAction(nextstate))
                    target = (1-self.alpha) * qvalue + self.alpha * (reward + self.gamma * bestnext)

                    self.estimator.updateValue(state, action, target)

            self.estimator.train()
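
_internalDecisions above implements a binary action search: the action starts at the midpoint of [amin, amax], and each of the nres decisions moves every action dimension up or down by a step that is halved at each iteration, so the steps sum to (amax - amin) / 2 and the refined action can reach the whole range. The standalone sketch below reproduces just that refinement arithmetic with hard-coded decision bits; in the agent the bits come from the estimator (or from epsilon-greedy exploration), and all names here are illustrative only.

from numpy import array, ones

amin, amax, nres, action_dim = -1.0, 1.0, 3, 2

# start at the midpoint of the action range
a = array([(amax + amin) / 2.0] * action_dim)
delta = (amax - amin) * float(2 ** (nres - 1)) / (2 ** nres - 1)

# one integer decision per refinement step, one bit per action dimension
decisions = [0b11, 0b01, 0b10]

for decision in decisions:
    delta = delta / 2.0
    blist = -1.0 * ones(action_dim)          # -1 = move down, +1 = move up
    for j, bit in enumerate(reversed(bin(decision)[2:])):
        if bit == '1':
            blist[-j - 1] = 1.0
    a = a + delta * blist

print(a)  # refined continuous action, guaranteed to lie inside [amin, amax]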