def MCTS_search(self, state, history=None, is_root_node=False, real_hist=None) -> float:
    """Run one Monte Carlo Tree Search playout starting from `state`.

    Descends the shared search tree until it hits a terminal position, an
    unexpanded leaf, a repetition within this playout, or a node that is
    still waiting for a network evaluation.  Back-propagation is delegated
    to ``self.update_tree`` through ``self.executor``.

    :param state: canonical board-state string to search from.
    :param history: alternating ``[state, action, state, ...]`` path taken
        so far; a fresh list is created when omitted.
    :param is_root_node: True at the search root, enabling root-specific
        exploration inside ``select_action_q_and_u``.
    :param real_hist: actual game history, passed to expansion only for the
        root node.

    NOTE(review): annotated ``-> float`` but every path ends in ``break``
    with no ``return``, so the function always returns None implicitly.
    """
    # Bug fix: the original signature used the mutable default `history=[]`,
    # which is created once and shared by every call that omits the
    # argument, accumulating states across independent searches.
    if history is None:
        history = []
    while True:
        game_over, v, _ = senv.done(state)
        if game_over:
            # Terminal node: back up the exact game result.
            self.executor.submit(self.update_tree, None, v, history)
            break
        with self.node_lock[state]:
            if state not in self.tree:
                # Expand and Evaluate: first visit of this state.
                self.tree[state].sum_n = 1
                self.tree[state].legal_moves = senv.get_legal_moves(state)
                # Mark as pending so concurrent playouts queue behind the
                # network evaluation instead of re-expanding.
                self.tree[state].waiting = True
                if is_root_node and real_hist:
                    self.expand_and_evaluate(state, history, real_hist)
                else:
                    self.expand_and_evaluate(state, history)
                break
            if state in history[:-1]:
                # Repetition inside this playout -> scored as 0.
                self.executor.submit(self.update_tree, None, 0, history)
                break
            # Select
            node = self.tree[state]
            if node.waiting:
                # Evaluation already requested by another playout; park this
                # path to be resumed when the prediction arrives.
                node.visit.append(history)
                break
            sel_action = self.select_action_q_and_u(state, is_root_node)
            # Apply virtual loss so parallel playouts diverge.
            virtual_loss = self.config.play.virtual_loss
            self.tree[state].sum_n += 1
            action_state = self.tree[state].a[sel_action]
            action_state.n += virtual_loss
            action_state.w -= virtual_loss
            action_state.q = action_state.w / action_state.n
        # Descend one ply (outside the lock) and continue the playout.
        history.append(sel_action)
        state = senv.step(state, sel_action)
        history.append(state)
def start_game(self, idx, search_tree):
    """Play one game between the AlphaZero player and a UCCI engine.

    ``idx`` decides color assignment (even -> alpha plays red) and whether
    the shared MCTS tree is reset.  Returns ``(v, turns, state, store)``
    where ``v`` is the final value from red's perspective and ``store``
    says whether play data was saved.
    """
    pipes = self.cur_pipes.pop()
    # Reset the shared search tree when sharing is disabled or on the
    # configured per-game cadence.
    if not self.config.play.share_mtcs_info_in_self_play or \
        idx % self.config.play.reset_mtcs_info_per_game == 0:
        search_tree = defaultdict(VisitState)
    # Randomly enable resignation so a fraction of games always plays out
    # to the end (guards against bad resign thresholds).
    if random() > self.config.play.enable_resign_rate:
        enable_resign = True
    else:
        enable_resign = False
    self.player = CChessPlayer(self.config, search_tree=search_tree, pipes=pipes, enable_resign=enable_resign, debugging=False)
    state = senv.INIT_STATE
    history = [state]  # alternating [state, action, state, action, ...]
    value = 0
    turns = 0  # even == red; odd == black
    game_over = False
    is_alpha_red = True if idx % 2 == 0 else False
    final_move = None
    check = False
    while not game_over:
        if (is_alpha_red and turns % 2 == 0) or (not is_alpha_red and turns % 2 == 1):
            # Alpha player's turn.
            no_act = None
            if not check and state in history[:-1]:
                # Repetition (and not a check): forbid every move previously
                # played from this exact position to break the cycle.
                no_act = []
                for i in range(len(history) - 1):
                    if history[i] == state:
                        no_act.append(history[i + 1])
            action, _ = self.player.action(state, turns, no_act)
            if action is None:
                logger.debug(
                    f"{turns % 2} (0 = red; 1 = black) has resigned!")
                value = -1
                break
        else:
            # UCCI engine's turn: convert to FEN, ask the engine, and flip
            # the move back to the canonical orientation when black moves.
            fen = senv.state_to_fen(state, turns)
            action = self.get_ucci_move(fen)
            if action is None:
                logger.debug(
                    f"{turns % 2} (0 = red; 1 = black) has resigned!")
                value = -1
                break
            if turns % 2 == 1:
                action = flip_move(action)
        history.append(action)
        state = senv.step(state, action)
        turns += 1
        history.append(state)
        if turns / 2 >= self.config.play.max_game_length:
            game_over = True
            value = 0
        else:
            game_over, value, final_move, check = senv.done(
                state, need_check=True)
    if final_move:
        # Append the mating/final move so training data includes it.
        history.append(final_move)
        state = senv.step(state, final_move)
        history.append(state)
        turns += 1
        value = -value
    self.player.close()
    del search_tree
    del self.player
    gc.collect()
    if turns % 2 == 1:  # black turn
        value = -value
    v = value
    # Randomly drop very short games (<= 10 plies) to reduce bias.
    if turns <= 10:
        if random() > 0.7:
            store = True
        else:
            store = False
    else:
        store = True
    if store:
        # Data layout: [initial state, [move, value], [move, value], ...]
        # with the value sign alternating to stay side-to-move relative.
        data = [history[0]]
        for i in range(turns):
            k = i * 2
            data.append([history[k + 1], value])
            value = -value
        self.save_play_data(idx, data)
    self.cur_pipes.append(pipes)
    self.remove_play_data()
    return v, turns, state, store
def self_play_buffer(config, pipes_bt, pipes_ng, idx, res_data) -> (tuple, list):
    """Play one evaluation game between the base and unchecked models.

    ``idx`` decides colors (even -> base model plays red).  Returns
    ``((turns, v, idx), data)`` where ``v`` is the final value from red's
    perspective and ``data`` is ``[base_digest_or_unchecked..., initial
    state, [move, value], ...]`` with the leading digests ordered
    red-first.
    """
    sleep(random())
    # Randomize playouts per game (800..1200) for variety.
    playouts = randint(8, 12) * 100
    config.play.simulation_num_per_move = playouts
    logger.info(f"Set playouts = {config.play.simulation_num_per_move}")
    pipe1 = pipes_bt.pop()  # borrow
    pipe2 = pipes_ng.pop()
    player1 = CChessPlayer(config, search_tree=defaultdict(VisitState), pipes=pipe1, enable_resign=False, debugging=False)
    player2 = CChessPlayer(config, search_tree=defaultdict(VisitState), pipes=pipe2, enable_resign=False, debugging=False)
    # even: bst = red, ng = black; odd: bst = black, ng = red
    if idx % 2 == 0:
        red = player1
        black = player2
        print(f"基准模型执红,待评测模型执黑")
    else:
        red = player2
        black = player1
        print(f"待评测模型执红,基准模型执黑")
    state = senv.INIT_STATE
    history = [state]  # alternating [state, action, state, action, ...]
    value = 0
    turns = 0  # even == red; odd == black
    game_over = False
    final_move = None
    no_eat_count = 0  # consecutive plies without a capture
    check = False
    while not game_over:
        no_act = None
        if not check and state in history[:-1]:
            # Repetition (not a check): apply the chase/idle rules.
            no_act = []
            free_move = defaultdict(int)
            for i in range(len(history) - 1):
                if history[i] == state:
                    # If the follow-up move would check or chase: forbid it.
                    if senv.will_check_or_catch(state, history[i + 1]):
                        no_act.append(history[i + 1])
                    # Otherwise count it as an idle move.
                    else:
                        free_move[state] += 1
                        if free_move[state] >= 2:
                            # Idle repetition three times -> draw.
                            game_over = True
                            value = 0
                            logger.info("闲着循环三次,作和棋处理")
                            break
            if game_over:
                break
        start_time = time()
        if turns % 2 == 0:
            action, _ = red.action(state, turns, no_act=no_act)
        else:
            action, _ = black.action(state, turns, no_act=no_act)
        end_time = time()
        if action is None:
            print(f"{turns % 2} (0 = 红; 1 = 黑) 投降了!")
            value = -1
            break
        print(
            f"博弈中: 回合{turns / 2 + 1} {'红方走棋' if turns % 2 == 0 else '黑方走棋'}, 着法: {action}, 用时: {(end_time - start_time):.1f}s"
        )
        history.append(action)
        try:
            state, no_eat = senv.new_step(state, action)
        except Exception as e:
            # Bug fix: the original logged `policy = {policy}`, but `policy`
            # is never bound in this function (the action unpacks into
            # `action, _`), so the handler itself raised NameError.
            logger.error(f"{e}, no_act = {no_act}, action = {action}")
            value = 0
            break
        turns += 1
        if no_eat:
            no_eat_count += 1
        else:
            no_eat_count = 0
        history.append(state)
        # 120 capture-free plies or max length -> draw; otherwise ask the
        # environment whether the game ended.
        if no_eat_count >= 120 or turns / 2 >= config.play.max_game_length:
            game_over = True
            value = 0
        else:
            game_over, value, final_move, check = senv.done(state, need_check=True)
            if not game_over:
                if not senv.has_attack_chessman(state):
                    logger.info(f"双方无进攻子力,作和。state = {state}")
                    game_over = True
                    value = 0
    if final_move:
        history.append(final_move)
        state = senv.step(state, final_move)
        turns += 1
        value = -value
        history.append(state)
    # Leading digests, ordered red-first.
    if idx % 2 == 0:
        data = [res_data['base']['digest'], res_data['unchecked']['digest']]
    else:
        data = [res_data['unchecked']['digest'], res_data['base']['digest']]
    player1.close()
    player2.close()
    del player1, player2
    gc.collect()
    if turns % 2 == 1:  # black turn
        value = -value
    v = value
    data.append(history[0])
    # Append [move, value] pairs with the value alternating sign so it is
    # always relative to the side to move.
    for i in range(turns):
        k = i * 2
        data.append([history[k + 1], value])
        value = -value
    pipes_bt.append(pipe1)
    pipes_ng.append(pipe2)
    return (turns, v, idx), data
def start_game(self, idx, search_tree):
    """Play one self-play game with the current model.

    ``idx`` controls MCTS-tree reset cadence.  Returns ``(v, turns, state,
    store)`` where ``v`` is the final value from red's perspective and
    ``store`` says whether play data was saved.
    """
    pipes = self.cur_pipes.pop()
    # Reset the shared search tree when sharing is disabled or on the
    # configured per-game cadence.
    if not self.config.play.share_mtcs_info_in_self_play or \
        idx % self.config.play.reset_mtcs_info_per_game == 0:
        search_tree = defaultdict(VisitState)
    # Randomly enable resignation so a fraction of games plays to the end.
    if random() > self.config.play.enable_resign_rate:
        enable_resign = True
    else:
        enable_resign = False
    self.player = CChessPlayer(self.config, search_tree=search_tree, pipes=pipes, enable_resign=enable_resign, debugging=False)
    state = senv.INIT_STATE
    history = [state]  # alternating [state, action, state, action, ...]
    value = 0
    turns = 0  # even == red; odd == black
    game_over = False
    final_move = None
    no_eat_count = 0  # consecutive plies without a capture
    check = False
    while not game_over:
        no_act = None
        if not check and state in history[:-1]:
            # Repetition (not a check): apply the chase/idle rules.
            no_act = []
            free_move = defaultdict(int)
            for i in range(len(history) - 1):
                if history[i] == state:
                    # If the follow-up move would check or chase: forbid it.
                    if senv.will_check_or_catch(state, history[i+1]):
                        no_act.append(history[i + 1])
                    # Otherwise count it as an idle move.
                    else:
                        free_move[state] += 1
                        if free_move[state] >= 2:
                            # Idle repetition three times -> draw.
                            game_over = True
                            value = 0
                            logger.info("闲着循环三次,作和棋处理")
                            break
            if game_over:
                break
        start_time = time()
        action, policy = self.player.action(state, turns, no_act)
        end_time = time()
        if action is None:
            logger.debug(f"{turns % 2} (0 = red; 1 = black) has resigned!")
            value = -1
            break
        if self.config.opts.log_move:
            logger.info(f"Process{self.pid} Playing: {turns % 2}, action: {action}, time: {(end_time - start_time):.1f}s")
        history.append(action)
        try:
            state, no_eat = senv.new_step(state, action)
        except Exception as e:
            logger.error(f"{e}, no_act = {no_act}, policy = {policy}")
            game_over = True
            value = 0
            break
        turns += 1
        if no_eat:
            no_eat_count += 1
        else:
            no_eat_count = 0
        history.append(state)
        # 120 capture-free plies or max length -> draw; otherwise ask the
        # environment whether the game ended.
        if no_eat_count >= 120 or turns / 2 >= self.config.play.max_game_length:
            game_over = True
            value = 0
        else:
            game_over, value, final_move, check = senv.done(state, need_check=True)
            if not game_over:
                if not senv.has_attack_chessman(state):
                    logger.info(f"双方无进攻子力,作和。state = {state}")
                    game_over = True
                    value = 0
    if final_move:
        # Append the mating/final move so training data includes it.
        history.append(final_move)
        state = senv.step(state, final_move)
        turns += 1
        value = -value
        history.append(state)
    self.player.close()
    del search_tree
    del self.player
    gc.collect()
    if turns % 2 == 1:  # black turn
        value = -value
    v = value
    # Randomly drop very short games (< 10 plies) to reduce bias.
    if turns < 10:
        if random() > 0.9:
            store = True
        else:
            store = False
    else:
        store = True
    if store:
        # Data layout: [initial state, [move, value], [move, value], ...]
        # with the value sign alternating to stay side-to-move relative.
        data = [history[0]]
        for i in range(turns):
            k = i * 2
            data.append([history[k + 1], value])
            value = -value
        self.save_play_data(idx, data)
    self.cur_pipes.append(pipes)
    self.remove_play_data()
    return v, turns, state, store
def start_game(self, idx):
    """Play one evaluation game between the base and next-gen models.

    ``idx`` decides colors (even -> base model plays red).  Returns
    ``(value, turns, data)`` where ``value`` is the final result from
    red's perspective and ``data`` is ``[digest, digest, initial state,
    [move, value], ...]`` with the digests ordered red-first.
    """
    sleep(random())
    # Randomize playouts per game (800..1200) for variety.
    playouts = randint(8, 12) * 100
    self.config.play.simulation_num_per_move = playouts
    logger.info(
        f"Set playouts = {self.config.play.simulation_num_per_move}")
    pipe1 = self.pipes_bt.pop()
    pipe2 = self.pipes_ng.pop()
    search_tree1 = defaultdict(VisitState)
    search_tree2 = defaultdict(VisitState)
    self.player1 = CChessPlayer(self.config, search_tree=search_tree1, pipes=pipe1, debugging=False, enable_resign=False, use_history=self.hist_base)
    self.player2 = CChessPlayer(self.config, search_tree=search_tree2, pipes=pipe2, debugging=False, enable_resign=False, use_history=self.hist_ng)
    # even: bst = red, ng = black; odd: bst = black, ng = red
    if idx % 2 == 0:
        red = self.player1
        black = self.player2
        logger.info(f"进程id = {self.pid} 基准模型执红,待评测模型执黑")
    else:
        red = self.player2
        black = self.player1
        logger.info(f"进程id = {self.pid} 待评测模型执红,基准模型执黑")
    state = senv.INIT_STATE
    history = [state]  # alternating [state, action, state, action, ...]
    value = 0  # best model's value
    turns = 0  # even == red; odd == black
    game_over = False
    # Bug fix: `final_move` was never initialized, so ending the game via
    # resignation or the idle-repetition draw `break` left `if final_move:`
    # below referencing an unbound local (NameError).
    final_move = None
    no_eat_count = 0  # consecutive plies without a capture
    check = False
    while not game_over:
        start_time = time()
        no_act = None
        if not check and state in history[:-1]:
            # Repetition (not a check): apply the chase/idle rules.
            no_act = []
            free_move = defaultdict(int)
            for i in range(len(history) - 1):
                if history[i] == state:
                    # If the follow-up move would check or chase: forbid it.
                    if senv.will_check_or_catch(state, history[i + 1]):
                        no_act.append(history[i + 1])
                    # Otherwise count it as an idle move.
                    else:
                        free_move[state] += 1
                        if free_move[state] >= 2:
                            # Idle repetition three times -> draw.
                            game_over = True
                            value = 0
                            logger.info("闲着循环三次,作和棋处理")
                            break
            if game_over:
                break
        if turns % 2 == 0:
            action, _ = red.action(state, turns, no_act=no_act)
        else:
            action, _ = black.action(state, turns, no_act=no_act)
        end_time = time()
        if self.config.opts.log_move:
            logger.debug(
                f"进程id = {self.pid}, action = {action}, turns = {turns}, time = {(end_time-start_time):.1f}"
            )
        if action is None:
            logger.debug(f"{turns % 2} (0 = red; 1 = black) has resigned!")
            value = -1
            break
        history.append(action)
        state, no_eat = senv.new_step(state, action)
        turns += 1
        if no_eat:
            no_eat_count += 1
        else:
            no_eat_count = 0
        history.append(state)
        # 120 capture-free plies or max length -> draw; otherwise ask the
        # environment whether the game ended.
        if no_eat_count >= 120 or turns / 2 >= self.config.play.max_game_length:
            game_over = True
            value = 0
    else:
            game_over, value, final_move, check = senv.done(
                state, need_check=True)
            if not game_over:
                if not senv.has_attack_chessman(state):
                    logger.info(f"双方无进攻子力,作和。state = {state}")
                    game_over = True
                    value = 0
    if final_move:
        history.append(final_move)
        state = senv.step(state, final_move)
        turns += 1
        value = -value
        history.append(state)
    # Leading digests, ordered red-first.
    if idx % 2 == 0:
        data = [
            self.data['base']['digest'], self.data['unchecked']['digest']
        ]
    else:
        data = [
            self.data['unchecked']['digest'], self.data['base']['digest']
        ]
    self.player1.close()
    self.player2.close()
    if turns % 2 == 1:  # black turn
        value = -value
    v = value
    data.append(history[0])
    # Append [move, value] pairs with the value alternating sign so it is
    # always relative to the side to move.
    for i in range(turns):
        k = i * 2
        data.append([history[k + 1], v])
        v = -v
    self.pipes_bt.append(pipe1)
    self.pipes_ng.append(pipe2)
    return value, turns, data
def self_play_buffer(config, cur) -> (tuple, list):
    """Play one self-play game and return its training data.

    Borrows a pipe from ``cur`` for the duration of the game.  Returns
    ``((turns, v), data)`` where ``v`` is the final value from red's
    perspective and ``data`` is ``[initial state, [move, value], ...]``.
    """
    pipe = cur.pop()  # borrow
    # Randomly enable resignation so a fraction of games plays to the end.
    if random() > config.play.enable_resign_rate:
        enable_resign = True
    else:
        enable_resign = False
    player = CChessPlayer(config, search_tree=defaultdict(VisitState), pipes=pipe, enable_resign=enable_resign, debugging=False)
    state = senv.INIT_STATE
    history = [state]  # alternating [state, action, state, action, ...]
    value = 0
    turns = 0  # even == red; odd == black
    game_over = False
    final_move = None
    no_eat_count = 0  # consecutive plies without a capture
    check = False
    while not game_over:
        no_act = None
        if not check and state in history[:-1]:
            # Repetition (not a check): apply the chase/idle rules.
            no_act = []
            free_move = defaultdict(int)
            for i in range(len(history) - 1):
                if history[i] == state:
                    # If the follow-up move would check or chase: forbid it.
                    if senv.will_check_or_catch(state, history[i + 1]):
                        no_act.append(history[i + 1])
                    # Otherwise count it as an idle move.
                    else:
                        free_move[state] += 1
                        if free_move[state] >= 2:
                            # Idle repetition three times -> draw.
                            game_over = True
                            value = 0
                            logger.info("闲着循环三次,作和棋处理")
                            break
            if game_over:
                break
        start_time = time()
        action, policy = player.action(state, turns, no_act)
        end_time = time()
        if action is None:
            print(f"{turns % 2} (0 = 红; 1 = 黑) 投降了!")
            value = -1
            break
        print(
            f"博弈中: 回合{turns / 2 + 1} {'红方走棋' if turns % 2 == 0 else '黑方走棋'}, 着法: {action}, 用时: {(end_time - start_time):.1f}s"
        )
        history.append(action)
        try:
            state, no_eat = senv.new_step(state, action)
        except Exception as e:
            logger.error(f"{e}, no_act = {no_act}, policy = {policy}")
            game_over = True
            value = 0
            break
        turns += 1
        if no_eat:
            no_eat_count += 1
        else:
            no_eat_count = 0
        history.append(state)
        # 120 capture-free plies or max length -> draw; otherwise ask the
        # environment whether the game ended.
        if no_eat_count >= 120 or turns / 2 >= config.play.max_game_length:
            game_over = True
            value = 0
        else:
            game_over, value, final_move, check = senv.done(state, need_check=True)
            if not game_over:
                if not senv.has_attack_chessman(state):
                    logger.info(f"双方无进攻子力,作和。state = {state}")
                    game_over = True
                    value = 0
    if final_move:
        # Append the mating/final move so training data includes it.
        history.append(final_move)
        state = senv.step(state, final_move)
        turns += 1
        value = -value
        history.append(state)
    player.close()
    del player
    gc.collect()
    if turns % 2 == 1:  # black turn
        value = -value
    v = value
    # Data layout: [initial state, [move, value], [move, value], ...]
    # with the value sign alternating to stay side-to-move relative.
    data = [history[0]]
    for i in range(turns):
        k = i * 2
        data.append([history[k + 1], value])
        value = -value
    cur.append(pipe)
    return (turns, v), data
def ai_move(self):
    """Run the AI side of an interactive game: wait for the AI's turn,
    query the MCTS player, log its evaluation, and animate the chosen
    move on the board sprites.  Returns when the AI resigns or the game
    ends.
    """
    ai_move_first = not self.human_move_first
    self.history = [self.env.get_state()]
    while not self.env.done:
        if ai_move_first == self.env.red_to_move:
            # Bug fix: `no_act` was initialized once before the loop, so a
            # forbidden-move list computed for an earlier position leaked
            # into later, unrelated positions.  Reset it every AI turn
            # (matching the per-iteration reset used elsewhere).
            no_act = None
            self.ai.search_results = {}
            state = self.env.get_state()
            logger.info(f"state = {state}")
            _, _, _, check = senv.done(state, need_check=True)
            if not check and state in self.history[:-1]:
                # Repetition (not a check): apply the chase/idle rules.
                no_act = []
                free_move = defaultdict(int)
                for i in range(len(self.history) - 1):
                    if self.history[i] == state:
                        # If the follow-up move would check or chase: forbid it.
                        if senv.will_check_or_catch(
                                state, self.history[i + 1]):
                            no_act.append(self.history[i + 1])
                        # Otherwise count it as an idle move.
                        else:
                            free_move[state] += 1
                            if free_move[state] >= 2:
                                # Idle repetition three times -> draw.
                                self.env.winner = Winner.draw
                                self.env.board.winner = Winner.draw
                                break
            if no_act:
                logger.debug(f"no_act = {no_act}")
            action, policy = self.ai.action(state, self.env.num_halfmoves, no_act)
            if action is None:
                logger.info("AI has resigned!")
                return
            self.history.append(action)
            # Moves are stored in red orientation; flip for black's board.
            if not self.env.red_to_move:
                action = flip_move(action)
            key = self.env.get_state()
            p, v = self.ai.debug[key]
            logger.info(f"check = {check}, NN value = {v:.3f}")
            self.nn_value = v
            logger.info("MCTS results:")
            self.mcts_moves = {}
            for move, action_state in self.ai.search_results.items():
                move_cn = self.env.board.make_single_record(
                    int(move[0]), int(move[1]), int(move[2]), int(move[3]))
                logger.info(
                    f"move: {move_cn}-{move}, visit count: {action_state[0]}, Q_value: {action_state[1]:.3f}, Prior: {action_state[2]:.3f}"
                )
                self.mcts_moves[move_cn] = action_state
            # Animate: move the piece sprite and remove any captured one.
            x0, y0, x1, y1 = int(action[0]), int(action[1]), int(
                action[2]), int(action[3])
            chessman_sprite = self.select_sprite_from_group(
                self.chessmans, x0, y0)
            sprite_dest = self.select_sprite_from_group(
                self.chessmans, x1, y1)
            if sprite_dest:
                self.chessmans.remove(sprite_dest)
                sprite_dest.kill()
            chessman_sprite.move(x1, y1, self.chessman_w, self.chessman_h)
            self.history.append(self.env.get_state())
def start_game(self, idx):
    """Play one evaluation game between the best and next-gen models.

    ``idx`` decides colors (even -> best model plays red).  Returns
    ``(value, turns)`` where ``value`` is the result from the best
    model's perspective.
    """
    pipe1 = self.pipes_bt.pop()
    pipe2 = self.pipes_ng.pop()
    search_tree1 = defaultdict(VisitState)
    search_tree2 = defaultdict(VisitState)
    self.player1 = CChessPlayer(self.config, search_tree=search_tree1, pipes=pipe1, debugging=False, enable_resign=True)
    self.player2 = CChessPlayer(self.config, search_tree=search_tree2, pipes=pipe2, debugging=False, enable_resign=True)
    # even: bst = red, ng = black; odd: bst = black, ng = red
    if idx % 2 == 0:
        red = self.player1
        black = self.player2
        logger.debug(f"best model is red, ng is black")
    else:
        red = self.player2
        black = self.player1
        logger.debug(f"best model is black, ng is red")
    state = senv.INIT_STATE
    value = 0  # best model's value
    turns = 0  # even == red; odd == black
    game_over = False
    while not game_over:
        start_time = time()
        if turns % 2 == 0:
            action, _ = red.action(state, turns)
        else:
            action, _ = black.action(state, turns)
        end_time = time()
        if action is None:
            # Bug fix: the original interpolated the undefined name `turn`
            # here, raising NameError on resignation instead of logging.
            logger.debug(f"{turns % 2} (0 = red; 1 = black) has resigned!")
            value = -1
            break
        state = senv.step(state, action)
        turns += 1
        if turns / 2 >= self.config.play.max_game_length:
            game_over = True
            value = 0
        else:
            game_over, value, final_move = senv.done(state)
    self.player1.close()
    self.player2.close()
    if turns % 2 == 1:  # black turn
        value = -value
    # Flip once more so the value is from the best model's perspective
    # when it played black.
    if idx % 2 == 1:
        value = -value
    self.pipes_bt.append(pipe1)
    self.pipes_ng.append(pipe2)
    return value, turns