def act(self, Q: Agent, task: Task, state):
    """Sample an action from the state's preference distribution, then
    shift that distribution toward Q's greedy action.

    Args:
        Q: Object providing ``max_action(state)``, the greedy action index.
        task: Object providing ``valid_actions()``; it is queried only while
            ``self.valid_actions`` is still 0.
        state: Key/index into ``self.preferences``.

    Returns:
        The result of ``np.random.choice(self.valid_actions, 1, p=pref)``,
        i.e. a length-1 array holding the sampled action (``size=1`` is
        passed explicitly).

    Side effects:
        May set ``self.valid_actions``; mutates ``self.preferences[state]``
        in place.
    """
    # Set the number of actions of the current task, if not set.
    if self.valid_actions == 0:
        self.valid_actions = task.valid_actions()

    # Distribution over actions for the current state.
    # NOTE(review): assumes this is a mutable NumPy array (or a view), so the
    # in-place update below persists in self.preferences -- confirm.
    pref = self.preferences[state]

    # Sample *before* the update, so the action is drawn from the
    # distribution as it stood on entry.
    action = np.random.choice(self.valid_actions, 1, p=pref)

    # Greedy action according to Q.
    greedy = Q.max_action(state)

    # Update the preference distribution: every non-greedy entry decays by
    # (1 - beta) while the greedy entry moves toward 1 by a fraction beta.
    # The greedy probability is captured before scaling rather than
    # recovered afterwards by dividing by (1 - beta); that division is 0/0
    # (NaN) when beta == 1 and introduces extra rounding error otherwise.
    greedy_pref = pref[greedy]
    pref *= (1.0 - self.beta)
    pref[greedy] = greedy_pref + self.beta * (1.0 - greedy_pref)

    return action
def epsilon_greedy(self, Q: Agent, task: Task, state, epsilon):
    """Pick an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action index in
    ``range(task.valid_actions())`` is returned; otherwise the greedy
    action ``Q.max_action(state)`` is returned.

    Args:
        Q: Object providing ``max_action(state)``.
        task: Object providing ``valid_actions()``, the number of actions.
        state: State passed through to ``Q.max_action``.
        epsilon: Exploration probability.

    Returns:
        The chosen action: an ``int`` from ``random.randrange`` when
        exploring, otherwise whatever ``Q.max_action`` returns.
    """
    # np.random.rand() is drawn from the half-open interval [0, 1). A strict
    # comparison makes the exploration probability exactly epsilon:
    # epsilon == 0 never explores (a draw of exactly 0.0 would pass `<=`),
    # and epsilon == 1 always explores.
    if np.random.rand() < epsilon:
        return random.randrange(task.valid_actions())
    return Q.max_action(state)