def do(self, ac):
    """
    Apply one action for the side to move and return the board.

    :param int|None ac: move pos=0 ~ 63 (0=top left, 7 top right,
        63 bottom right), None is resign
    :return: ``self.chessboard`` once the action has been processed
    :raises AssertionError: if ``ac`` is neither None nor within 0..63
    """
    assert ac is None or 0 <= ac <= 63, f"Illegal ac={ac} {self.epoch}"

    # Resignation: the other side is declared the winner.
    if ac is None:
        logger.warning(f"SITUATION: resigned {self.epoch}")
        self._other_win()
        return self.chessboard

    # "own" is the bitboard of the side to move, "opp" the other side's.
    if self.next_to_play == Stone.black:
        own, opp = self.chessboard.black, self.chessboard.white
    else:
        own, opp = self.chessboard.white, self.chessboard.black

    # Stones that playing `ac` would turn over; a move that flips nothing
    # is treated as illegal and forfeits the game.
    flipped = calc_flip(ac, own, opp)
    if bit_count(flipped) == 0:
        logger.warning(
            f"SITUATION: Illegal ac={ac}, No Flipped, Set {switch_sides(self.next_to_play)} win {self.epoch}"
        )
        self._other_win()
        return self.chessboard

    # Apply the move and write both bitboards back under the right colour.
    own, opp = self._do_flip(own, opp, ac, flipped)
    if self.next_to_play == Stone.black:
        self.chessboard.black, self.chessboard.white = own, opp
    else:
        self.chessboard.black, self.chessboard.white = opp, own

    # Hand the turn over only if the opponent has a legal reply.
    opponent_can_move = bit_count(find_correct_moves(opp, own)) > 0
    if opponent_can_move:
        self.next_to_play = switch_sides(self.next_to_play)
        return self.chessboard

    # Opponent must pass: the mover plays again if able, otherwise neither
    # side can move and the game ends.
    mover_can_move = bit_count(find_correct_moves(own, opp)) > 0
    if not mover_can_move:
        self._game_over()
    return self.chessboard
def available(self, px, py):
    """
    Tell whether the side to move may legally play on the given square.

    :param px: column of the square, expected in 0..7
    :param py: row of the square, expected in 0..7
    :return: True if the square is on the board, empty, and among the
        legal moves reported by ``find_correct_moves`` for the side to
        move; False otherwise
    :rtype: bool
    """
    # Validate each coordinate on its own: checking only the flattened index
    # would let an off-board column (e.g. px=8, py=0) alias a square on the
    # neighbouring row.
    if not (0 <= px < 8 and 0 <= py < 8):
        return False

    action = int(py * 8 + px)
    if not 0 <= action < 64:
        return False

    board = self.env.chessboard
    square = 1 << action

    # An occupied square can never be played.
    if square & board.black or square & board.white:
        return False

    if self.env.next_to_play == Stone.black:
        own, enemy = board.black, board.white
    else:
        own, enemy = board.white, board.black
    return bool(square & find_correct_moves(own, enemy))
def ____decide_action(self, env, is_root_node):
    """
    Pick the move with the highest selection score for the current position.

    The score of a move is
    ``(+/- win_rate * c) + p * sqrt(sum N) / (1 + N) + 1000 + weight_table``,
    masked to the legal moves, where the win-rate term is added for black
    and subtracted for white.

    :param env: environment; ``next_to_play`` and ``epoch`` are read
    :param bool is_root_node: when true and ``noise_eps > 0``, Dirichlet
        noise is mixed into the prior
    :return: index of the highest-scoring entry (``np.argmax`` returns 0
        when every entry is equal, e.g. when there is no legal move)
    :rtype: int
    """
    node = create_node(env)
    black_to_move = env.next_to_play == Stone.black
    if black_to_move:
        legal_moves = find_correct_moves(node.black, node.white)
    else:
        legal_moves = find_correct_moves(node.white, node.black)
    # Build the 64-entry legality mask once; it filters both the prior and
    # the final scores.
    legal_mask = bit_to_array(legal_moves, 64)

    # sqrt of the total visit count over all moves, floored at 1.
    visits = self.num_tree[node]
    vn = max(np.sqrt(np.sum(visits)), 1)

    # Prior restricted to legal moves. It is re-normalised only when some
    # legal move carries prior mass; an all-zero prior is left untouched.
    vp = self.policy_tree[node] * legal_mask
    if np.sum(vp) > 0:
        # The temperature is capped at 1 and shrinks as env.epoch grows past
        # policy_decay_turn.
        decay = np.power(env.epoch / self.config.play.policy_decay_turn,
                         self.config.play.policy_decay_power)
        temperature = min(np.exp(1 - decay), 1)
        vp = normalize(vp, temperature)

    # Root-only exploration noise: (1 - eps) * p + eps * noise.
    # NOTE(review): decay settings are read from self.config.play while the
    # noise settings come from self.play_config - presumably the same
    # object; confirm.
    if is_root_node and self.play_config.noise_eps > 0:
        eps = self.play_config.noise_eps
        noise = dirichlet_noise_of_mask(legal_moves,
                                        self.play_config.dirichlet_alpha)
        vp = (1 - eps) * vp + eps * noise

    vpn = vp * vn / (1 + visits)

    # The win rate counts for black and against white, hence the sign.
    sign = 1 if black_to_move else -1
    # The +1000 offset is zeroed by the mask on illegal squares, so a legal
    # square outranks them as long as its other terms sum above -1000.
    scores = (sign * self.win_rate(node) * self.c
              + vpn + 1000 + self.weight_table) * legal_mask
    return int(np.argmax(scores))
def _set_first_move(self, node):
    """
    Seed the search trees for ``node`` with a fixed first move.

    The chosen move is the first maximal entry of the legal-move array
    computed with black as the side to move (``np.argmax`` is
    deterministic, not random). That move gets a visit count of 1 and a
    win value of 0, and the policy is set to the legal-move array divided
    by its sum, i.e. every legal square gets 1/n.

    :param node: position key exposing ``black`` and ``white`` bitboards
    """
    legal_bits = find_correct_moves(node.black, node.white)
    legal_array = bit_to_array(legal_bits, 64)
    action = np.argmax(legal_array)

    update_num_tree_with_one_or_moresides(
        self.num_tree, node, action, ["set"], [1])
    update_win_tree_with_one_or_moresides(
        self.win_tree, node, action, ["set"], [0])

    # Uniform prior over the legal squares.
    uniform_policy = legal_array / np.sum(legal_array)
    update_policy_tree_with_one_or_moresides(
        self.policy_tree, node, ["set"], [uniform_policy])
def _find_winning_move_and_score(self, env: OthelloEnv, exactly=True): # end if env.done: b, w = env.chessboard.black_white return None, b - w # restored key = black, white, next_to_play = env.chessboard.black, env.chessboard.white, env.next_to_play if key in self.cache: # store leaf node return self.cache[key] # timeout if time() - self.start_time > self.timeout: logger.debug("timeout!") raise Timeout() # recursive legal_moves = find_correct_moves( *(white, black) if not next_to_play == Stone.black else (black, white)) action_list = [idx for idx in range(64) if legal_moves & (1 << idx)] # 遍历所有解 score_list = np.zeros(len(action_list), dtype=int) record_turn = env.epoch for i, action in enumerate(action_list): env.chessboard.black = black env.chessboard.white = white env.next_to_play = next_to_play env.epoch = record_turn env.done = False env.Result = None env.do(action) _, score = self._find_winning_move_and_score(env, exactly=exactly) score_list[i] = score if not exactly: if next_to_play == Stone.black and score > 0: # 找到一个就得 break elif next_to_play == Stone.white and score < 0: break best_action, best_score = ( action_list[int(np.argmax(score_list))], np.max(score_list)) if next_to_play == Stone.black else ( action_list[int(np.argmin(score_list))], np.min(score_list)) self.cache[key] = (best_action, best_score) return best_action, best_score
def _update_avalable(self, black, white, action, policy):
    """
    Store the legal-move sets before and after ``action`` in ``self.avalable``.

    The first set is computed on the given position with black as the side
    to move; the second on the position returned by ``__get_next_key`` with
    the roles swapped (white to move).

    :param black: black bitboard of the current position
    :param white: white bitboard of the current position
    :param action: move passed to ``__get_next_key`` to obtain the
        follow-up position
    :param policy: not used by this method
    """
    node = TreeNode(black, white, Stone.black.value)
    next_key = self.__get_next_key(black, white, action)

    moves_now = find_correct_moves(node.black, node.white)
    moves_next = find_correct_moves(next_key.white, next_key.black)
    self.avalable = LastAva(moves_now, moves_next)