def update_policy(self, s, a, game):
    """Update this agent's policy at state ``s`` against the opponent's best policy.

    The expected value of each of our actions is ``Q[s]`` weighted
    element-wise by ``opponent_best_pi[s]`` and summed over axis 1 (the
    opponent-action axis). The new policy is either a softmax over those
    values or a greedy distribution, depending on ``self.a_policy``.

    Parameters
    ----------
    s : hashable
        State whose policy entry is updated.
    a : object
        Action taken (unused here; kept for interface compatibility).
    game : object
        Game context (unused here; kept for interface compatibility).
    """
    # Expected value of each of our actions under the opponent's best policy.
    expected_q = np.sum(np.multiply(self.Q[s], self.opponent_best_pi[s]), 1)
    if self.a_policy == 'softmax':
        self.pi[s] = softmax(expected_q)
    else:
        # Greedy: uniform over all maximizing actions. The original
        # indicator vector was not normalized, so ties produced a
        # "distribution" summing to more than 1 — divide by the tie count.
        greedy = (expected_q == np.max(expected_q)).astype(np.double)
        self.pi[s] = greedy / np.sum(greedy)
    # Snapshot full (deep-copied) policies so history is immune to later mutation.
    self.pi_history.append(deepcopy(self.pi))
    self.opponent_best_pi_history.append(deepcopy(self.opponent_best_pi))
    print('opponent pi of {}: {}'.format(self.id_, self.opponent_best_pi))
def update_policy(self, s, a, game):
    """Recompute the softmax policy for state ``s`` from the opponent model.

    The expected value of each of our actions is the dot product of
    ``Q[s]`` with the opponent's current policy at ``s``; the new policy
    is the softmax of those values. ``a`` and ``game`` are unused but
    kept for interface compatibility.
    """
    expected_q = np.dot(self.Q[s], self.opponent_pi[s])
    self.pi[s] = softmax(expected_q)
    # Snapshot deep copies so the history is immune to in-place updates.
    self.pi_history.append(deepcopy(self.pi))
    self.opponent_pi_history.append(deepcopy(self.opponent_pi))
    if self.verbose:
        print('opponent pi of {}: {}'.format(self.id_, self.opponent_pi[s]))