Example #1
 def softmax_selection(self, state_index, Q):
     # Sample an action from a softmax (Boltzmann) distribution over Q-values;
     # inv_temp is the inverse temperature (higher means greedier).
     possible_actions = self.env.get_possible_actions(state_index)
     if len(possible_actions) == 1:
         # Only one action available, so its index is trivially 0.
         return 0
     probabilities = utils.softmax(Q, self.inv_temp)
     action_idx = np.random.choice(len(possible_actions), p=probabilities)
     return action_idx
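
All five examples depend on a utils.softmax(values, inv_temp) helper that is not shown here. As a point of reference, a minimal, numerically stable sketch of what such a helper presumably computes (a Boltzmann distribution with inverse temperature; the helper name and signature are assumptions from the call sites above):

 import numpy as np

 def softmax(values, inv_temp):
     # p_i = exp(inv_temp * v_i) / sum_j exp(inv_temp * v_j).
     # Subtracting the max before exponentiating prevents overflow
     # without changing the resulting probabilities.
     scaled = inv_temp * np.asarray(values, dtype=float)
     scaled -= scaled.max()
     exp_values = np.exp(scaled)
     return exp_values / exp_values.sum()

As inv_temp approaches 0 this tends toward uniform random selection; as it grows, it approaches greedy argmax selection.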
Example #2
 def select_action(self, state_idx, softmax=True):
     # TODO: get a categorical distribution over next states; maybe switch to a
     # state-action SR M(sa, sa'), possibly only for the two-step task.
     # The one-step lookahead below is local, so it amounts to hill climbing
     # (gradient ascent) on the value function over the state graph.
     # Successor-representation values: V = M_hat @ R_hat.
     V = self.M_hat @ self.R_hat
     next_state = [self.env.get_next_state(state_idx, a)
                   for a in range(self.env.nr_actions)]
     Q = [V[s] for s in next_state]
     probabilities = utils.softmax(Q, self.beta)
     return np.random.choice(self.env.nr_actions, p=probabilities)
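
To make the V = M_hat @ R_hat step concrete: a toy check with a hypothetical 3-state chain (0 -> 1 -> 2, reward only in state 2, discount 0.9), where each row of M_hat holds the discounted expected future occupancy of every state:

 import numpy as np

 # Hypothetical SR matrix for the chain 0 -> 1 -> 2 with gamma = 0.9:
 # M_hat[s, s'] = expected discounted number of visits to s' starting from s.
 M_hat = np.array([[1.0, 0.9, 0.81],
                   [0.0, 1.0, 0.9],
                   [0.0, 0.0, 1.0]])
 R_hat = np.array([0.0, 0.0, 1.0])  # reward only in the terminal state

 V = M_hat @ R_hat
 print(V)  # [0.81 0.9  1.  ] -- states closer to the reward have higher value

Each V[s] is the reward-weighted occupancy sum, so the one-step lookahead in select_action simply prefers actions whose successor states lie closer to reward.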
Example #3
 def select_action(self, state_idx, softmax=True):
     # TODO: get a categorical distribution over next states; maybe switch to a
     # state-action SR M(sa, sa'), possibly only for the two-step task.
     # The one-step lookahead below is local, so it amounts to hill climbing
     # (gradient ascent) on the value function over the state graph.
     next_state = [
         self.env.get_next_state(state_idx, a)
         for a in range(self.env.nr_actions)
     ]
     Q = [self.compute_V(s) for s in next_state]
     probabilities = utils.softmax(Q, self.beta)
     try:
         a = np.random.choice(self.env.nr_actions, p=probabilities)
     except ValueError:
         # np.random.choice raises ValueError when probabilities contain NaN
         # or do not sum to 1; re-raise with context rather than falling
         # through to an unbound `a` and a confusing NameError.
         raise ValueError(f'invalid action probabilities: {probabilities}')
     return a
Example #4
 def softmax_selection(self, state_index, Q):
     # Sample an action from a softmax distribution over Q-values; unlike
     # Example #1, all of the environment's actions are assumed available.
     probabilities = utils.softmax(Q, self.inv_temp)
     action_idx = np.random.choice(self.env.nr_actions, p=probabilities)
     return action_idx
Example #5
 def softmax_selection(self, state_idx):
     # Softmax over the stored Q-table row for this state; beta is the
     # inverse temperature.
     probabilities = utils.softmax(self.Q[state_idx], self.beta)
     action_idx = np.random.choice(self.env.nr_actions, p=probabilities)
     return action_idx
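
For completeness, the Q-table variant in Example #5 can be exercised standalone. A hypothetical sketch using scipy.special.softmax, which matches the presumed utils.softmax(Q, beta) when the caller applies the inverse temperature itself (the Q-values and beta here are made up for illustration):

 import numpy as np
 from scipy.special import softmax

 Q = np.array([[1.0, 2.0, 0.5],   # hypothetical Q-values for state 0
               [0.0, 0.0, 3.0]])  # hypothetical Q-values for state 1
 beta = 2.0                       # inverse temperature

 state_idx = 0
 probabilities = softmax(beta * Q[state_idx])
 action_idx = np.random.choice(len(Q[state_idx]), p=probabilities)
 print(probabilities.round(3), action_idx)  # e.g. [0.114 0.844 0.042] 1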