示例#1
0
    def act(self, Q: Agent, task: Task, state):
        """Sample an action for ``state`` and nudge its preferences toward Q's greedy action.

        The action is drawn from ``self.preferences[state]`` *before* that
        distribution is updated. The update is done in place, so the stored
        preferences for ``state`` change as a side effect of every call.

        Args:
            Q: Object providing ``max_action(state)``, the greedy action index.
            task: Object providing ``valid_actions()``; only queried while
                ``self.valid_actions`` is still 0.
            state: Key used to look up the preference vector.

        Returns:
            The result of ``np.random.choice(..., 1, ...)``, i.e. a length-1
            array holding the sampled action index (not a bare scalar).
        """
        # Lazily cache the action count the first time it is needed.
        if self.valid_actions == 0:
            self.valid_actions = task.valid_actions()

        # Preference vector for this state; mutated in place further down.
        probs = self.preferences[state]

        # Draw from the current (pre-update) distribution.
        chosen = np.random.choice(self.valid_actions, 1, p=probs)

        # Action that Q currently considers best in this state.
        best = Q.max_action(state)

        # Scale every entry by (1 - beta), undo that scaling for the greedy
        # entry, then move the greedy entry a fraction beta of the way to 1.
        # Net effect: non-greedy p -> (1 - beta) * p, greedy p -> p + beta * (1 - p),
        # which keeps the vector summing to the same total.
        decay = 1.0 - self.beta
        probs *= decay
        probs[best] /= decay
        probs[best] += self.beta * (1.0 - probs[best])

        return chosen
示例#2
0
 def epsilon_greedy(self, Q: Agent, task: Task, state, epsilon):
     """Pick a random action when a uniform draw is <= ``epsilon``, else Q's greedy action.

     Args:
         Q: Object providing ``max_action(state)``.
         task: Object providing ``valid_actions()``, the number of actions;
             only queried on the random branch.
         state: State passed through to ``Q.max_action``.
         epsilon: Threshold compared against a draw from ``np.random.rand()``.

     Returns:
         ``random.randrange(task.valid_actions())`` on the random branch,
         otherwise ``Q.max_action(state)``.
     """
     # NOTE: the coin flip uses NumPy's global RNG while the random action
     # uses the stdlib ``random`` module, so both must be seeded for
     # reproducible runs.
     explore = np.random.rand() <= epsilon
     if not explore:
         return Q.max_action(state)
     return random.randrange(task.valid_actions())