def playout(self, state: Board):
    """Run one playout from the root: select down the tree, expand where the
    heuristic allows, then evaluate the reached position with a rollout and
    backpropagate the result."""
    node = self.root
    start_depth = self.root.depth
    while True:
        if node.is_leaf():
            # Expand shallow nodes almost immediately; deeper nodes must first
            # accumulate enough visits (expand_threshold) to be worth the cost.
            if (node.depth < start_depth + 5 and node.visit > (node.depth - start_depth) * 2) \
                    or node.visit >= self.expand_threshold:
                if self.use_network:
                    policy = expand_policy_network(state)
                else:
                    policy = expand_policy_random(state)
                node.expand(policy)
            else:
                break
        action, node = node.select()
        state.play(action)
        is_end, _ = state.check_winner()
        if is_end:
            break
    bp_value = self.rollout_simulation(state)
    node.backpropagate(bp_value)
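# --- Hedged sketch: the TreeNode interface assumed by playout() above. ---
# playout() relies on node.is_leaf(), node.select(), node.expand(policy) and
# node.backpropagate(value), none of which are defined in this file. The class
# below only illustrates how such a node could work (PUCT-style selection);
# the class name, c_puct, and the exact Q update rule are assumptions, not the
# repository's actual implementation.
import math

class _TreeNodeSketch:
    def __init__(self, parent=None, probability=1.0, depth=0):
        self.parent = parent
        self.children = {}              # action -> _TreeNodeSketch
        self.probability = probability  # prior from the expand policy
        self.depth = depth
        self.visit = 0
        self.Q = 0.0

    def is_leaf(self):
        return not self.children

    def expand(self, policy):
        # policy is a list of (action, prior) pairs, as returned by
        # expand_policy_network / expand_policy_random
        for action, prob in policy:
            if action not in self.children:
                self.children[action] = _TreeNodeSketch(self, prob, self.depth + 1)

    def select(self, c_puct=5.0):
        # PUCT: exploit the running value Q, explore by prior-weighted visits
        def score(item):
            _, child = item
            u = c_puct * child.probability * math.sqrt(self.visit) / (1 + child.visit)
            return child.Q + u
        return max(self.children.items(), key=score)

    def backpropagate(self, value):
        # incremental mean update, flipping the value at each ply
        self.visit += 1
        self.Q += (value - self.Q) / self.visit
        if self.parent is not None:
            self.parent.backpropagate(1 - value)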
def get_action(self, state: Board) -> int:
    """Block until the GUI move queue yields a valid move, then return it."""
    while True:
        try:
            x, y = self.queue_move.get()
        except TypeError:
            print('\n GUI window not found\n exit program\n')
            time.sleep(1)
            sys.exit()
        move_int = move_xy2int(x, y)
        if state.check_valid(move_int):
            return move_int
def get_move(self, state: Board):
    # Opening move: always take the centre, no search needed.
    if state.is_empty():
        return move_xy2int(WIDTH // 2, HEIGHT // 2)
    time.sleep(0.1)
    start_time = time.time()
    # Early positions have few plausible replies, so spend half the budget.
    if len(state.moved) in (1, 2):
        it = self.compute_budget // 2
    else:
        it = self.compute_budget
    for _ in tqdm(range(it)):
        board_to_search = state.copy()
        self.playout(board_to_search)
    end_time = time.time()
    children = self.root.children.items()
    # Log against `it`, the number of playouts actually run (the original
    # message reported compute_budget even when the budget was halved).
    Log.silent_log('%d playouts in %.3f seconds' % (it, end_time - start_time))
    Log.silent_log('average : %.3f ms\n' % ((end_time - start_time) / it * 1000))
    Log.silent_log('most visited node:')
    Log.silent_log('| ' + 'action'.ljust(8, ' ') + '| ' + 'visit'.ljust(8, ' ')
                   + '| ' + 'probability'.ljust(13, ' ') + '| ' + '   Q    |')
    for action, c in sorted(children, key=lambda child: child[1].visit, reverse=True)[:5]:
        if c.visit == 0 and c.probability < 0.01:
            continue
        Log.silent_log('| ' + ' ' + str(move_int2cord(action)).ljust(7, ' ') + '| ' +
                       (' %d' % c.visit).ljust(8, ' ') + '| ' +
                       (' %.3f%%' % (c.probability * 100)).ljust(13, ' ') + '| ' +
                       ('%.4f' % c.Q).rjust(7, ' ') + ' |')
    most_visited_move = max(children, key=lambda child: child[1].visit)[0]
    return most_visited_move
def simulate_random(board: Board, limit=200):
    """Play out up to `limit` random (locality-masked) moves and return the
    winning side, or 0.5 if the game is drawn or still undecided."""
    is_end, winner = False, None
    for i in range(limit):
        is_end, winner = board.check_winner()
        if is_end:
            break
        # Forced moves (complete a win / block an immediate threat) are
        # played without consulting the rollout policy.
        must = board.check_must()
        if must is not None:
            if not board.play(must):
                raise ValueError('Must Error')
            continue
        action_prob = rollout_policy_random(board)
        next_action = int(np.argmax(action_prob))
        while not board.play(next_action):
            # Mask out the rejected move so argmax cannot pick it again; the
            # original retry loop re-computed the same argmax forever.
            action_prob[next_action] = -1
            next_action = int(np.argmax(action_prob))
    if is_end and winner is not None:
        return winner
    return 0.5
def expand_policy_random(board: Board) -> List[Tuple[int, float]]:
    """Prior proportional to the local-availability mask: only empty cells
    near existing stones get probability mass."""
    board_array = board.get_board_array()
    conv_available = _convolve_board_available_narrow(board_array)
    cnt = conv_available.sum()
    probability = conv_available / cnt
    return [(move, probability[move]) for move in board.available if conv_available[move]]
def simulate_network(board: Board, limit=100, q_confidence=0.5):
    """Network-guided rollout: sample the first few moves for diversity, then
    play greedily; if no winner emerges, fall back to the value network."""
    random_bound = 4
    is_end, winner = False, None
    for i in range(limit):
        is_end, winner = board.check_winner()
        if is_end:
            break
        must = board.check_must()
        if must is not None:
            if not board.play(must):
                raise ValueError('Must Error')
            continue
        action_prob = rollout_policy_network(board)
        if i < random_bound:
            # Sample from the policy for the first few plies.
            next_action = np.random.choice(15 * 15, 1, p=action_prob)[0]
            while not board.play(next_action):
                next_action = np.random.choice(15 * 15, 1, p=action_prob)[0]
        else:
            # Afterwards, play the highest-probability legal move.
            next_action = max(board.available, key=lambda move: action_prob[move])
            board.play(next_action)
    if is_end and winner is not None:
        return winner
    # Rollout hit the limit without a winner: ask the value network instead.
    value_network_Q = ValueRunner(board.get_board_array())
    if board.current_player == BLACK_:
        # array shape (white, black): convert the estimate to black's side
        value_network_Q = 1 - value_network_Q
    # Shrink the network's estimate toward the neutral 0.5 by q_confidence.
    return q_confidence * (value_network_Q - 0.5) + 0.5
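# Worked example of the q_confidence blend above: with q_confidence = 0.5 and
# a value-network estimate of 0.9 (strongly favouring black), the returned
# value is 0.5 * (0.9 - 0.5) + 0.5 = 0.7. The network's opinion is shrunk
# halfway toward the neutral 0.5, limiting how much a truncated rollout can
# sway backpropagation compared with an actual win (1.0) or loss (0.0).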
def expand_policy_network(board: Board) -> List[Tuple[int, float]]:
    board_array = board.get_board_array()
    probability = TreePolicyRunner(board_array)
    # Second move after a centre opening: restrict to the canonical replies.
    if len(board.moved) == 1:
        if board.moved[0] == move_xy2int(7, 7):
            return list(map(lambda move: (move, probability[move]), __second_move_available))
    # Forced moves take absolute priority: the current player's own forcing
    # points first, then any forcing point of either side (i.e. blocks).
    if len(board.must[board.current_player]):
        return list(map(lambda move: (move, probability[move]),
                        board.must[board.current_player]))
    if len(board.must[0] | board.must[1]):
        return list(map(lambda move: (move, probability[move]),
                        board.must[0] | board.must[1]))
    # Otherwise expand every empty cell near existing stones.
    conv_available = _convolve_board_available_wide(board_array)
    return [(move, probability[move]) for move in board.available if conv_available[move]]
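# --- Hedged sketch: __second_move_available, referenced above, is not ---
# defined in this file. It plausibly lists the cells the second move is
# restricted to after a centre opening at (7, 7); the 3x3 ring below is an
# assumption, not the repository's actual constant.
__second_move_available_sketch = [
    move_xy2int(x, y)
    for x in range(6, 9) for y in range(6, 9)
    if (x, y) != (7, 7)
]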
def get_action(self, state: Board) -> int:
    while True:
        raw = input('action to move : ').lower().strip().replace(' ', '')
        try:
            x = ord(raw[0]) - ord('a')
            y = int(raw[1:]) - 1
        except (ValueError, IndexError):
            # IndexError guards against empty input; ValueError against
            # non-numeric row coordinates.
            print('invalid input format:', raw)
            continue
        if x < 0 or x >= WIDTH or y < 0 or y >= HEIGHT:
            print('invalid input range:', (x, y))
            continue
        move = move_xy2int(x, y)
        if state.check_valid(move):
            return move
        print('invalid action:', (x, y))
def rollout_policy_network(board: Board):
    return RolloutPolicyRunner(board.get_board_array())
def rollout_policy_random(board: Board):
    # Random scores over empty cells near existing stones; everything else 0.
    board_array = board.get_board_array()
    conv_available = _convolve_board_available_narrow(board_array)
    return np.random.rand(15 * 15) * conv_available
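# --- Hedged sketch: the _convolve_board_available_* helpers used above are ---
# not defined in this file. A plausible implementation masks the empty cells
# lying within a small window of any existing stone, so rollouts and tree
# expansion stay near the action. The input shape (2, 15, 15) and the kernel
# sizes (3x3 for "narrow", larger for "wide") are assumptions.
from scipy.signal import convolve2d

def _convolve_board_available_narrow_sketch(board_array: np.ndarray) -> np.ndarray:
    stones = (board_array[0] + board_array[1]) > 0          # occupied cells
    near = convolve2d(stones.astype(float), np.ones((3, 3)), mode='same') > 0
    empty = ~stones
    # flat 0/1 mask over all 225 cells, indexable by a move integer
    return (near & empty).astype(float).reshape(15 * 15)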
class Server:
    def __init__(self,
                 player_black: Union[AgentCLI, AgentGUI, DeepMCTSAgent, PureMCTSAgent],
                 player_white: Union[AgentCLI, AgentGUI, DeepMCTSAgent, PureMCTSAgent],
                 queue_draw=None):
        self.board = Board()
        self.black = player_black
        self.white = player_white
        self.player = {BLACK_: self.black, WHITE_: self.white}
        self.queue_draw = queue_draw
        self.playing = False
        self.gui = bool(self.black.gui or self.white.gui)
        self._time = {
            BLACK_: [],
            WHITE_: [],
        }

    @property
    def current_player(self):
        return self.player[self.board.current_player]

    def run(self, log_path=None, time_log_path=None):
        self.playing = True
        winner = None
        self.board.show()
        sys.stdout.flush()
        while self.playing:
            t1 = time.time()
            action = self.current_player.get_action(self.board)
            t2 = time.time()
            self._time[self.board.current_player].append(t2 - t1)
            if log_path is not None:
                with open(log_path, 'a') as f:
                    f.write(str(move_int2xy(action)) + '\n')
            self.board.play(action)
            os.system('cls')  # clear the console (Windows-only)
            self.board.show()
            Log.flush()
            sys.stdout.flush()
            if self.gui:
                self.queue_draw.put((*move_int2xy(action), self.board.last_player))
            is_end, winner = self.board.check_winner()
            if is_end:
                self.playing = False
        if winner is None:
            print(' Game Draw')
        elif winner == BLACK_:
            print(' Black win')
        elif winner == WHITE_:
            print(' White win')
        if log_path is not None:
            with open(log_path, 'a') as f:
                if winner is None:
                    f.write('Game Draw\n')
                elif winner == BLACK_:
                    f.write('Black win\n')
                elif winner == WHITE_:
                    f.write('White win\n')
        if time_log_path is not None:
            def describe(agent):
                if type(agent) is DeepMCTSAgent:
                    return 'Deep MCTS Agent with compute budget %d' % agent.compute_budget
                if type(agent) is PureMCTSAgent:
                    return 'Pure MCTS Agent with compute budget %d' % agent.compute_budget
                return 'Human Player'
            with open(time_log_path, 'a') as f:
                f.write('Black player: ' + describe(self.black) + '\n')
                # Skip black's first move: the hard-coded centre opening is
                # returned without any search and would skew the average.
                for black_time in self._time[BLACK_][1:]:
                    f.write('%.4f\n' % black_time)
                sum_black = sum(self._time[BLACK_][1:])
                len_black = len(self._time[BLACK_]) - 1
                f.write('average: %.4fs per action\n' % (sum_black / len_black))
                if self.black.use_mcts:
                    f.write(' %.4fms per playout\n'
                            % (sum_black * 1000 / len_black / self.black.compute_budget))
                f.write('\n')
                f.write('White player: ' + describe(self.white) + '\n')
                for white_time in self._time[WHITE_]:
                    f.write('%.4f\n' % white_time)
                sum_white = sum(self._time[WHITE_])
                len_white = len(self._time[WHITE_])
                f.write('average: %.4fs per action\n' % (sum_white / len_white))
                if self.white.use_mcts:
                    f.write(' %.4fms per playout\n'
                            % (sum_white * 1000 / len_white / self.white.compute_budget))
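# --- Hedged usage sketch (illustrative only; the constructor arguments of ---
# the agents are assumed, not taken from this file):
#
#     black = PureMCTSAgent(compute_budget=2000)
#     white = AgentCLI()
#     server = Server(player_black=black, player_white=white)
#     server.run(log_path='game.log', time_log_path='time.log')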