def search_recursive(self, state: State, agent: RLAgent, root: bool = False, logic: Logic = Logic()):
    """
    Perform one iteration of MCTS. The method calls itself recursively until a leaf node is found.

    The action chosen at each node is the one with the maximum upper confidence bound, as in the
    AlphaZero paper. Once a leaf node is found, the neural network is called to return an initial
    policy P and a value v for the state. This value is propagated up the search path. If the leaf
    node is a terminal state, the outcome is propagated up the search path instead. The values of
    Ns, Nsa, and Qsa are updated along the way.

    Notes
    -----
    Since the board value is computed after game termination during the next recursive step, which
    includes a player-view shift, the returned value is always from the perspective of the opponent.

    Returns
    -------
    float,
        the (opponent's) board value for the player.
    """
    if state.active_team == Team.red:
        # the network is trained only from the perspective of team blue
        state.flip_teams()
    # get string representation of state
    s = str(state)

    if s not in self.Es:
        self.Es[s] = logic.get_status(state)
    if self.Es[s] != Status.ongoing:
        # terminal node
        return -self.Es[s].value

    if s not in self.Ps:
        # leaf node
        return -self._fill_leaf_node(state, s, agent)

    valids = self.Vs[s]
    policy = self.Ps[s]
    if root:
        policy = self._make_policy_noisy(policy, valids)

    a = self._select_action(s, policy, valids)
    move = self.action_map.action_to_move(a, state, Team.blue)
    self.logic.execute_move(state, move)

    value = self.search_recursive(state, agent, root=False, logic=logic)

    self._update_qsa(s, a, value)
    self.Ns[s] += 1
    return -value
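# A minimal sketch of the PUCT selection rule referenced above (the actual
# _select_action implementation is not shown here, so treat this as an
# illustration of the usual upper-confidence-bound formula
#   U(s, a) = Q(s, a) + cpuct * P(s, a) * sqrt(N(s)) / (1 + N(s, a)),
# with hypothetical helper and argument names):
import numpy as np


def puct_select(Qsa, Nsa, Ns, policy, valids, s, cpuct):
    best_score, best_action = -float("inf"), -1
    for a in np.flatnonzero(valids):
        q = Qsa.get((s, a), 0.0)
        n_sa = Nsa.get((s, a), 0)
        u = q + cpuct * policy[a] * np.sqrt(Ns.get(s, 0.0) + 1e-8) / (1 + n_sa)
        if u > best_score:
            best_score, best_action = u, int(a)
    return best_action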
def __init__(
    self,
    student: AZAgent,
    action_map: ActionMap,
    logic: Logic = Logic(),
    num_iterations: int = 100,
    num_selfplay_episodes: int = 100,
    acceptance_rate: float = 0.55,
    mcts_simulations: int = 100,
    temperature: int = 100,
    model_folder: str = "./checkpoints/models",
    train_data_folder: str = "./checkpoints/data",
    seed: Optional[Union[int, np.random.Generator]] = None,
    **kwargs,
):
    super().__init__(
        student,
        action_map,
        logic,
        model_folder,
        train_data_folder,
        **kwargs,
    )
    self.n_iters = num_iterations
    self.n_episodes = num_selfplay_episodes
    self.n_mcts_sim = mcts_simulations
    self.acceptance_rate = acceptance_rate
    self.model_folder = model_folder
    self.train_data_folder = train_data_folder
    self.temp_thresh = temperature
    self.skip_first_self_play = False
    self.rng = np.random.default_rng(seed)
def __init__(
    self,
    student: RLAgent,
    action_map: ActionMap,
    logic: Logic = Logic(),
    model_folder: str = "./checkpoints/models",
    train_data_folder: str = "./checkpoints/data",
    **kwargs,
):
    self.model_folder = model_folder
    self.train_data_folder = train_data_folder
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not os.path.exists(train_data_folder):
        os.makedirs(train_data_folder)

    assert isinstance(
        student, RLAgent
    ), f"Student agent to coach has to be of type '{RLAgent}'. Given type '{type(student).__name__}'."
    self.student: RLAgent = student
    # a copy of the student to play against during self-play
    self.student_mirror: RLAgent = deepcopy(student)
    self.action_map = action_map
    self.game = Game(self.student, self.student_mirror, logic=logic, **kwargs)
def select_random_action(
    self,
    state: State,
    logic: Logic = Logic(),
) -> Action:
    # draw uniformly among the currently legal actions by normalising the binary action mask
    action_mask = self.action_map.actions_mask(state.board, self.team, logic)
    return self.rng.choice(self.action_map.actions, p=action_mask / action_mask.sum())
def decide_move(self, state, logic: Logic = Logic()):
    """
    Depending on the number of enemy pieces left, we are in the early, mid, or endgame
    and plan the move via the minimax algorithm.

    :return: tuple of tuple positions representing the move
    """
    if self.ext_depth is None:
        self.set_max_depth()  # set max_depth each turn
    else:
        self.max_depth = self.ext_depth
    # make sure a flag win is rewarded highly enough that its discounted value still
    # guarantees a preference towards an immediate flag capture
    self.winGameReward = max(self.winGameReward, self.max_depth * self.kill_reward)
    return self.minimax(max_depth=self.max_depth)
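# Tiny numeric illustration of the bound set above (hypothetical values; the real
# kill_reward and max_depth are attributes of the agent). Raising winGameReward to
# at least max_depth * kill_reward makes the flag-capture reward at least as large
# as the maximum sum of kill rewards obtainable within the search horizon
# (one kill per ply).
_kill_reward, _max_depth = 1, 4
_win_game_reward = max(3, _max_depth * _kill_reward)  # 3 is an assumed prior value
assert _win_game_reward >= _max_depth * _kill_reward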
def test_logic():
    # smoke test: enumerating the possible moves for both teams must not raise
    state = minimal_state()
    logic = Logic()
    moves_blue = list(logic.possible_moves_iter(state.board, Team.blue))
    moves_red = list(logic.possible_moves_iter(state.board, Team.red))

    state = minimal_state2()
    moves_blue = list(logic.possible_moves_iter(state.board, Team.blue))
    moves_red = list(logic.possible_moves_iter(state.board, Team.red))
def decide_move(self, state: State, logic: Logic = Logic()) -> Move:
    """
    Decide the move to make for the given state of the game.

    Parameters
    ----------
    state: State,
        the state on which the decision is to be made.
    logic: Logic,
        the logic to use in the engine. Can be changed to vary the game mode if desired.

    Returns
    -------
    Move,
        the chosen move to make on the state.
    """
    raise NotImplementedError
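# A minimal sketch of a concrete implementation of this interface. The base class
# name 'Agent' and the 'self.team' attribute are assumptions taken from the other
# agents in this module; only decide_move(state, logic) itself is defined by the
# abstract method above.
class FirstLegalMoveAgent(Agent):
    def decide_move(self, state: State, logic: Logic = Logic()) -> Move:
        # simply play the first legal move the logic enumerates for our team
        return next(logic.possible_moves_iter(state.board, self.team))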
def __init__(
    self,
    network: torch.nn.Module,
    action_map: ActionMap,
    cpuct: float = 4.0,
    n_mcts_sims: int = 100,
    logic: Logic = Logic(),
):
    self.network = network
    self.action_map = action_map
    self.logic = logic
    self.cpuct = cpuct
    self.n_mcts_sims = max(1, n_mcts_sims)

    self.Qsa: Dict[Tuple[str, int], float] = {}  # stores Q values for (s, a)
    self.Nsa: Dict[Tuple[str, int], int] = {}  # stores #times edge (s, a) was visited
    self.Ns: Dict[str, float] = {}  # stores #times board s was visited
    self.Ps: Dict[str, np.ndarray] = {}  # stores the policy returned by the neural net for s
    self.Es: Dict[str, Status] = {}  # stores game end status for state s
    self.Vs: Dict[str, np.ndarray] = {}  # stores valid moves for state s
def decide_move(self, state: State, logic: Logic = Logic()):
    # choose uniformly at random among all legal moves; return None if there are none
    all_moves = list(logic.possible_moves_iter(state.board, self.team))
    if not all_moves:
        return None
    return self.rng.choice(all_moves)
def search(self, state: State, agent: RLAgent, perspective: Team, logic: Logic = Logic()):
    """
    Perform one iteration of MCTS. The method iterates until a leaf node is found.

    The action chosen at each node is the one with the maximum upper confidence bound, as in the
    AlphaZero paper. Once a leaf node is found, the neural network is called to return an initial
    policy P and a value v for the state. This value is propagated up the search path. If the leaf
    node is a terminal state, the outcome is propagated up the search path instead. The values of
    Ns, Nsa, and Qsa are updated along the way.

    Returns
    -------
    float,
        the board value for the agent.
    """
    # (state, action) -> value sign
    sa_to_sign = dict()
    # this merely initializes the variable. If this value is ever propagated through
    # the tree, there is a bug in the logic.
    value = float("inf")
    # the first iteration is always the root
    root = True

    while True:
        if state.active_team == Team.red:
            # the network is trained only from the perspective of team blue
            state.flip_teams()
        # get string representation of state
        s = str(state)

        if (state.active_team == perspective) == state.flipped_teams:
            # adjust for the correct perspective:
            # the value must always be seen from the perspective of 'perspective' (the agent).
            # The condition is logically equivalent to:
            #   (selected team == active player AND teams flipped)
            #   OR (selected team != active player AND teams not flipped)
            # -> opponent perspective,
            # and in these cases we need to multiply by -1 (assuming symmetric rewards).
            value_sign = -1
        else:
            value_sign = 1

        if s not in self.Es:
            self.Es[s] = logic.get_status(state)
        if self.Es[s] != Status.ongoing:
            # terminal node
            value = self.Es[s].value
            break
        elif s not in self.Ps:
            # leaf node
            value = self._fill_leaf_node(state, s, agent)
            break
        else:
            # neither a leaf nor a terminal node has been reached yet, so keep searching
            # by playing according to the current policy
            valids = self.Vs[s]
            policy = self.Ps[s]
            if root:
                policy = self._make_policy_noisy(policy, valids)
                # the root is only the first iteration. This information is used only
                # to add noise to the policy, so we can deactivate it now.
                root = False
            a = self._select_action(s, policy, valids)
            sa_to_sign[(s, a)] = value_sign
            move = self.action_map.action_to_move(a, state, Team.blue)
            self.logic.execute_move(state, move)

    for (s, a), sign in sa_to_sign.items():
        # for every visited (state, action) pair: update its Q-value and visit counter
        self._update_qsa(s, a, value * sign)
        # increment the visit counter of this state
        self.Ns[s] += 1

    # adjust for the agent's perspective and return the value
    return value * value_sign
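# A minimal usage sketch (not part of the original code): run a fixed number of
# simulations from the current state and derive an improved policy from the visit
# counts Nsa, as is typical for AlphaZero-style MCTS. The helper name
# 'mcts_policy' and the calling convention are assumptions; the sketch also
# assumes the passed state is already normalised to team blue's perspective,
# mirroring what search() does internally.
from copy import deepcopy

import numpy as np


def mcts_policy(mcts, state, agent, perspective, temperature: float = 1.0) -> np.ndarray:
    for _ in range(mcts.n_mcts_sims):
        # search mutates the state, so hand it a fresh copy each time
        mcts.search(deepcopy(state), agent, perspective)
    s = str(state)
    counts = np.array(
        [mcts.Nsa.get((s, a), 0) for a in range(len(mcts.action_map.actions))],
        dtype=float,
    )
    if temperature == 0:
        # greedy: put all probability mass on the most visited action
        probs = np.zeros_like(counts)
        probs[counts.argmax()] = 1.0
        return probs
    counts = counts ** (1.0 / temperature)
    return counts / counts.sum()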