from operator import itemgetter
from random import shuffle

from numpy import array, ones, r_, random

# Agent, AgentException, Linear, FAEstimator, VectorBlockEstimator and History
# are framework classes; they are assumed to be imported from the surrounding
# package.


class FQIAgent(Agent):

    alpha = 1.0
    gamma = 0.9
    iterations = 1
    presentations = 1

    def __init__(self, faClass=Linear, resetFA=True, ordered=False, vectorblock=False):
        """ initialize the agent with the given function approximator class. """
        Agent.__init__(self)
        self.faClass = faClass
        self.resetFA = resetFA
        self.ordered = ordered
        self.vectorblock = vectorblock

    def _setup(self, conditions):
        """ make sure the environment has continuous states and discrete
            actions, then create the estimator. """
        Agent._setup(self, conditions)
        if not (self.conditions['discreteStates'] == False and self.conditions['discreteActions']):
            raise AgentException('FQIAgent expects continuous states and discrete actions. Use adapter or a different environment.')

        if self.vectorblock:
            self.estimator = VectorBlockEstimator(self.conditions['stateDim'], self.conditions['actionNum'],
                                                  faClass=self.faClass, ordered=self.ordered)
        else:
            self.estimator = FAEstimator(self.conditions['stateDim'], self.conditions['actionNum'],
                                         faClass=self.faClass, ordered=self.ordered)

    def _calculate(self):
        """ choose the greedy action for the current state. """
        self.action = self.estimator.getBestAction(self.state)

    def newEpisode(self):
        """ reset the memory. """
        Agent.newEpisode(self)
        if self.ordered:
            self.estimator.resetMemory()

    def giveReward(self, reward):
        """ additionally remember the chosen action so it is not drawn again. """
        if self.ordered:
            self.estimator.rememberAction(self.action)
        Agent.giveReward(self, reward)

    def buildMemoryFromEpisode(self, episode):
        """ builds the memory of already executed actions from the current
            episode. this is necessary if the episode was appended without
            actually being executed via the experiment's interact() method. """
        if self.ordered:
            for a in episode.actions:
                self.estimator.rememberAction(a)

    def learn(self):
        """ go through the whole history and make fitted Q-iteration updates. """
        for i in range(self.iterations):
            dataset = []
            for episode in self.history:
                if self.ordered:
                    self.estimator.resetMemory()

                for state, action, reward, nextstate in episode:
                    qvalue = self.estimator.getValue(state, action)
                    if self.ordered:
                        self.estimator.rememberAction(action)

                    # one-step lookahead; terminal transitions have no next state
                    if nextstate is not None:
                        bestnext = self.estimator.getValue(nextstate, self.estimator.getBestAction(nextstate))
                    else:
                        bestnext = 0.

                    target = (1 - self.alpha) * qvalue + self.alpha * (reward + self.gamma * bestnext)
                    dataset.append([state, action, target])

            if len(dataset) == 0:
                continue

            # ground targets to 0 to avoid drifting values
            mintarget = min(map(itemgetter(2), dataset))

            # reset estimator (not resetting might lead to faster convergence!)
            if self.resetFA:
                self.estimator.reset()

            for i in range(self.presentations):
                shuffle(dataset)
                for state, action, target in dataset:
                    self.estimator.updateValue(state, action, target - mintarget)
            self.estimator.train()
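# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the core of
# FQIAgent.learn() above is a fitted Q-iteration regression step. The helper
# below reproduces the same target computation on plain Python data,
# independent of the Agent/Estimator framework classes. Its name and the
# callables `qvalue` and `best_action` are hypothetical and exist only for
# illustration.
# ---------------------------------------------------------------------------
def _fqi_targets_sketch(transitions, qvalue, best_action, alpha=1.0, gamma=0.9):
    """ Compute fitted Q-iteration regression targets for a list of
        (state, action, reward, nextstate) transitions, where qvalue(s, a)
        returns the current Q-estimate and best_action(s) the greedy action.
        Terminal transitions are marked by nextstate == None. """
    dataset = []
    for state, action, reward, nextstate in transitions:
        if nextstate is not None:
            bestnext = qvalue(nextstate, best_action(nextstate))
        else:
            bestnext = 0.
        # blend old estimate and one-step lookahead, exactly as in learn()
        target = (1 - alpha) * qvalue(state, action) + alpha * (reward + gamma * bestnext)
        dataset.append((state, action, target))
    return dataset

# With alpha = 1.0 (the class default) the target reduces to the standard
# FQI target r + gamma * max_a' Q(s', a'); smaller alpha interpolates with
# the previous Q-estimate.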
class BASAgent(Agent):

    alpha = 1.0
    gamma = 0.9

    def __init__(self, faClass=Linear):
        """ initialize the agent with the given function approximator class. """
        Agent.__init__(self)

        self.amin = -1.
        self.amax = 1.
        self.nres = 3

        # store (decision, action) tuples for one action in the list
        self.decisions = []

        self.faClass = faClass

    def _setup(self, conditions):
        """ make sure the environment has continuous states and actions, then
            create the estimator and the BAS-extended history. """
        Agent._setup(self, conditions)
        if not (self.conditions['discreteStates'] == False and self.conditions['discreteActions'] == False):
            raise AgentException('BASAgent expects continuous states and actions. Use adapter or a different environment.')

        self.estimator = FAEstimator(self.conditions['stateDim'] + self.conditions['actionDim'],
                                     2**self.conditions['actionDim'], self.faClass)

        # change history to store bas-extended experiences
        self.history = History(conditions['stateDim'] + self.conditions['actionDim'], 1)

    def giveReward(self, reward):
        """ override function to store the internal actions in the history. """
        if self.progressCnt == 2:
            self.reward = reward
            self.progressCnt = 0
            if self.loggingEnabled:
                # go through internal decisions and transform them to states, actions, rewards
                olda = array([(self.amax + self.amin) / 2.] * self.conditions['actionDim'])
                for i, (d, a) in enumerate(self.decisions):
                    state = r_[self.state, olda]
                    action = d
                    # only the last internal decision receives the external reward
                    if i < self.nres - 1:
                        reward = 0.
                    else:
                        reward = self.reward
                    self.history.append(state, action, reward)
                    olda = a
        else:
            raise AgentException('reward was given before action was returned.')

    def _internalDecisions(self, state):
        """ takes a state and queries the estimator several times as a binary
            search. generates a (binary) decision and an action at each step. """
        self.decisions = []

        a = array([(self.amax + self.amin) / 2.] * self.conditions['actionDim'])
        delta = (self.amax - self.amin) * float(2**(self.nres - 1)) / (2**self.nres - 1)

        for i in range(self.nres):
            delta = delta / 2.
            decision = self.estimator.getBestAction(r_[self.state, a])

            # internal epsilon-greedy exploration
            if random.random() < 0.1:
                decision = array([random.randint(2**self.conditions['actionDim'])])

            # turn the decision into a binary list of -1/+1 entries
            blist = -1. * ones(self.conditions['actionDim'])
            for i, bit in enumerate(reversed(bin(int(decision))[2:])):
                if bit == '1':
                    blist[-i-1] = 1.

            # update action
            a = a + delta * blist
            self.decisions.append((decision, a))

        return a

    def _calculate(self):
        """ return the action with the maximal value for the given state. """
        self.action = self._internalDecisions(self.state)

    def learn(self):
        """ go through whole episode and make Q-value updates. """
        for i in range(1):
            self.estimator.reset()

            for episode in self.history:
                for state, action, reward, nextstate in episode:
                    # don't consider last state
                    # if equal(state, nextstate).all():
                    #     break

                    qvalue = self.estimator.getValue(state, action)
                    bestnext = self.estimator.getValue(nextstate, self.estimator.getBestAction(nextstate))
                    target = (1 - self.alpha) * qvalue + self.alpha * (reward + self.gamma * bestnext)

                    self.estimator.updateValue(state, action, target)
            self.estimator.train()
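# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): _internalDecisions()
# above performs a binary action search -- each of the nres rounds halves the
# step size delta and moves every action dimension up or down according to one
# bit of the chosen discrete decision. The helper below reproduces that
# decoding for a fixed sequence of decisions using plain Python lists; its
# name and signature are hypothetical and serve only to illustrate the scheme.
# ---------------------------------------------------------------------------
def _bas_decode_sketch(decisions, action_dim, amin=-1., amax=1.):
    """ Map a sequence of discrete decisions (integers in [0, 2**action_dim))
        to a continuous action vector, mirroring the refinement loop in
        _internalDecisions(). """
    nres = len(decisions)
    a = [(amax + amin) / 2.] * action_dim
    delta = (amax - amin) * float(2**(nres - 1)) / (2**nres - 1)
    for decision in decisions:
        delta = delta / 2.
        # decode the decision into a +1/-1 direction for each action dimension
        direction = [-1.] * action_dim
        for i, bit in enumerate(reversed(bin(int(decision))[2:])):
            if bit == '1':
                direction[-i-1] = 1.
        a = [x + delta * d for x, d in zip(a, direction)]
    return a

# e.g. with action_dim=1 and nres=3, the decision sequence [1, 1, 1] refines
# the action to amax, [0, 0, 0] to amin, and mixed sequences land on the
# intermediate grid points of the binary search.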