Example #1
    def get_normalizer(self, state, Q):
        # Normalizing constant for the opponent model: a reference-weighted
        # sum of exp(beta_estimation * Q_opponent(state, action)).
        possible_actions = util.possible_moves(state, 'opponent')
        total = 0
        for action in possible_actions:
            total += ref.ref_opponent(
                state, action, len(possible_actions)) * math.exp(
                    self.beta_estimation * Q.get_Q_opponent(state, action))
        return total
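These method snippets assume module-level `import math` and `import random`, plus the project's `util` and `ref` helpers. As a standalone sketch (plain lists and a toy uniform reference policy, not the original classes), the value computed here is the normalizer that turns the reference-weighted exponentials into a probability distribution over the opponent's actions:

import math

def softmax_distribution(q_values, references, beta):
    # references: reference probability per action (stand-in for ref.ref_opponent);
    # beta: inverse temperature (beta_estimation above).
    weights = [r * math.exp(beta * q) for q, r in zip(q_values, references)]
    normalizer = sum(weights)              # the quantity get_normalizer returns
    return [w / normalizer for w in weights]

# Three opponent actions with a uniform reference policy.
probs = softmax_distribution([1.0, 0.5, -0.2], [1/3, 1/3, 1/3], beta=2.0)
print(probs, sum(probs))                   # probabilities sum to 1 (up to rounding)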
Example #2
    def choose_move(self, state, Q):
        # Epsilon-greedy action selection: exploit the best Q-valued move with
        # probability (1 - epsilon), otherwise explore uniformly at random.
        possible_actions = util.possible_moves(state)
        use_greedy = self.epsilon_greedy()

        if use_greedy:
            return self.get_best_action_based_on_Q(Q, state, possible_actions)
        return random.choice(possible_actions)
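A self-contained sketch of the same epsilon-greedy rule with plain data structures; the dictionary, the epsilon value, and the function name are illustrative, not part of the original code:

import random

def epsilon_greedy_choice(q_by_action, epsilon=0.1):
    # q_by_action: dict mapping each legal action to its current Q estimate.
    if random.random() < epsilon:
        return random.choice(list(q_by_action))      # explore: uniform random
    return max(q_by_action, key=q_by_action.get)     # exploit: highest Q

print(epsilon_greedy_choice({'left': 0.2, 'right': 0.7, 'stay': 0.1}))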
Example #3
    def get_Q_opponent(self, state, action):
        # Soft-max (log-sum-exp) backup over the player's possible replies,
        # weighted by the player's reference policy and inverse temperature beta.
        possible_actions_player = util.possible_moves(state, player='player')
        total = 0
        for action_pl in possible_actions_player:
            total += (self.player.get_reference(state, action_pl,
                                                len(possible_actions_player)) *
                      math.exp(self.player.beta * self.values.get(state, {}).get(
                          (action_pl, action), 0)))
        return math.log(total) / self.player.beta
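The return value is a soft-max aggregate, (1/beta) * log sum_a ref(a) * exp(beta * Q(a)). A toy sketch, independent of the original classes, showing how it interpolates between the reference-weighted mean (small beta) and the maximum (large beta):

import math

def soft_value(q_values, references, beta):
    # (1 / beta) * log(sum_a ref(a) * exp(beta * Q(a)))
    total = sum(r * math.exp(beta * q) for q, r in zip(q_values, references))
    return math.log(total) / beta

qs, refs = [1.0, 0.0, -1.0], [1/3, 1/3, 1/3]
print(soft_value(qs, refs, beta=0.1))    # ~0.03, close to the mean of qs
print(soft_value(qs, refs, beta=50.0))   # ~0.98, close to max(qs)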
Example #4
    def estimate_v_new_state(self, new_state):
        # State-value estimate: the same soft-max backup as get_Q_opponent,
        # taken over the player's own actions in the successor state.
        possible_actions_player = util.possible_moves(new_state)
        total = 0
        for action in possible_actions_player:
            Q_player = self.get_Q_player(new_state, action,
                                         self.player.use_estimation,
                                         self.player.get_beta_estimation())
            total += (self.player.get_reference(new_state, action,
                                                len(possible_actions_player)) *
                      math.exp(self.player.beta * Q_player))

        return math.log(total) / self.player.beta
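Because exp(beta * Q) can overflow for large beta or large Q-values, the same quantity is often computed with the standard max-shift log-sum-exp trick. The original code exponentiates directly; the shifted form below is a sketch under the same toy assumptions as the earlier snippets, not the project's implementation:

import math

def stable_soft_value(q_values, references, beta):
    # Same value as the direct formula, but exp() never sees a large argument.
    m = max(beta * q for q in q_values)
    total = sum(r * math.exp(beta * q - m)
                for q, r in zip(q_values, references))
    return (m + math.log(total)) / beta

print(stable_soft_value([800.0, 799.0], [0.5, 0.5], beta=1.0))  # ~799.62, no overflow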
Example #5
    def get_Q_player(self, state, action, use_estimation, beta_estimation):
        # Player Q-value via a soft-max over the opponent's replies; the inverse
        # temperature is either the estimated beta or the opponent's true beta.
        possible_actions_opponent = util.possible_moves(state,
                                                        player='opponent')
        beta = beta_estimation if use_estimation else self.opponent.beta
        total = 0
        for action_op in possible_actions_opponent:
            total += (self.opponent.get_reference(
                state, action_op, len(possible_actions_opponent)) *
                      math.exp(beta * self.values.get(state, {}).get(
                          (action, action_op), 0)))
        return math.log(total) / beta
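The only difference from get_Q_opponent is which inverse temperature is used (the estimated beta or the opponent's true beta). One way to read the returned soft Q-value is through the standard Gibbs variational identity: it equals the expected Q under the induced softmax distribution minus a KL penalty against the reference policy, scaled by 1/beta. A numerical check of that identity with toy numbers and a hypothetical helper, not the project's API:

import math

def check_soft_q_identity(joint_q, references, beta):
    # Checks: (1/beta) * log sum_i ref_i * exp(beta * Q_i)
    #         == E_p[Q] - KL(p || ref) / beta,
    # where p_i is proportional to ref_i * exp(beta * Q_i).
    weights = [r * math.exp(beta * q) for q, r in zip(joint_q, references)]
    z = sum(weights)
    p = [w / z for w in weights]
    soft_q = math.log(z) / beta
    expected_q = sum(pi * q for pi, q in zip(p, joint_q))
    kl = sum(pi * math.log(pi / r) for pi, r in zip(p, references))
    return soft_q, expected_q - kl / beta

print(check_soft_q_identity([0.4, -0.3, 0.9], [0.5, 0.3, 0.2], beta=2.0))  # equal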
Example #6
    def get_gradient(self, state, action, Q):
        # Gradient term for updating beta_estimation: the softmax-weighted
        # mean of the opponent's Q-values minus Q(state, action).
        b = self.beta_estimation
        possible_actions = util.possible_moves(state, 'opponent')
        Qs = [Q.get_Q_opponent(state, possible_action)
              for possible_action in possible_actions]
        Q_action = Q.get_Q_opponent(state, action)
        numerator = sum(q * math.exp(q * b) for q in Qs)
        denominator = sum(math.exp(q * b) for q in Qs)
        return numerator / denominator - Q_action
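The returned quantity is the softmax-weighted mean of the opponent Q-values minus Q(state, action). If the opponent model behind this update is a plain softmax p(a) proportional to exp(beta * Q(a)) (an assumption here; the normalizer in Example #1 also carries reference weights), this is exactly the negative gradient of log p(action) with respect to beta. A finite-difference sketch of that relationship:

import math

def softmax_log_prob(qs, idx, beta):
    # log p(a_idx) for p(a) proportional to exp(beta * Q(a)).
    z = sum(math.exp(beta * q) for q in qs)
    return beta * qs[idx] - math.log(z)

def gradient_like(qs, idx, beta):
    # Mirrors get_gradient: softmax-weighted mean of Q minus Q(a_idx).
    num = sum(q * math.exp(beta * q) for q in qs)
    den = sum(math.exp(beta * q) for q in qs)
    return num / den - qs[idx]

qs, idx, beta, eps = [0.4, -0.3, 0.9], 1, 1.5, 1e-6
fd = (softmax_log_prob(qs, idx, beta + eps) -
      softmax_log_prob(qs, idx, beta - eps)) / (2 * eps)
print(gradient_like(qs, idx, beta), -fd)   # the two values agree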