def _playout(self, state):
    """Run one MCTS playout: walk down at most ``self._playout_depth`` steps
    using ``node._select``, expanding leaf nodes with height-based priors,
    then back-propagate a leaf value blended from the evaluator (TD(0)) and
    a random rollout.

    Arguments:
    state -- game state, mutated in place as the playout descends
    """
    node = self._root
    for d in range(self._playout_depth):
        if node.is_leaf():
            # Terminal leaf: nothing to expand or select; score the
            # position exactly as it stands.
            if state.terminate():
                break
            valid_action = [
                i for i in range(16) if state.valid_action(*decode_action(i))
            ]
            # Prior of each action is its cell-height priority normalized
            # over all currently valid actions.
            height_total = 0
            for i in valid_action:
                height_total += self._get_priority(
                    state.get_height(*decode_action(i)))
            act_prob = [(
                act,
                self._get_priority(state.get_height(*decode_action(act)))
                / height_total) for act in valid_action]
            node._expand(act_prob)
        # Descend one level via the tree policy and mirror the move on the
        # simulated state. (NOTE(review): select must run for non-leaf
        # nodes too, otherwise the descent would stall — placement inferred
        # from the flattened original.)
        action = node._select(self._c)
        node = node._children[action]
        state.take_action(*decode_action(action))
    # Leaf value mixes v = TD(0) network estimate and z = rollout outcome,
    # weighted by lambda (eligibility-trace style interpolation). Each term
    # is skipped (set to 0) when its weight is zero.
    v = self._evaluate([state.get_state()], [self._player]) if self._lamda < 1 else 0
    z = self._rollout(state) if self._lamda > 0 else 0
    leaf_value = (1 - self._lamda) * v + self._lamda * z
    node._back_prop(leaf_value, self._c)
def generate_action(self, state):
    """Choose a move: win immediately when possible, otherwise block the
    opponent's winning move, otherwise pick a uniformly random valid cell.

    Returns:
    (row, col) of the chosen action
    """
    # Check our own winning moves first, then the opponent's threats.
    for side in (self._player, self._opponent):
        winning = get_winning_move(state, side)
        if len(winning) > 0:
            return winning[0][1], winning[0][2]
    # No forced move: fall back to a random legal action on a state copy.
    env = State(state)
    candidates = [a for a in range(16) if env.valid_action(*decode_action(a))]
    row, col = decode_action(random.choice(candidates))
    return row, col
def _search(self, c_state, player, depth, max_level=True, alpha=-np.inf, beta=np.inf):
    """Recursively search every possible action and evaluate the leaf nodes
    with alpha-beta pruning.

    Arguments:
    c_state -- a copy of current state
    player -- current player
    depth -- depth left for furthur searching
    max_level -- whether the level is maximizer
    alpha -- best value along the path from root to maximizer
    beta -- worst value along the path from root to minimizer

    Returns:
    return_val -- value of this node
    return_action -- where return_val comes from
    """
    # Terminal positions are scored exactly from `player`'s point of view.
    if c_state.terminate():
        if c_state.win(player):
            return 1, None
        if c_state.win(-player):
            return -1, None
        return 0, None
    # Depth budget exhausted: fall back on the learned evaluator.
    if depth == 0:
        return self._evaluate([c_state.get_state()], [player]), None

    best_val = -np.inf if max_level else np.inf
    best_action = -1
    candidates = [
        a for a in range(16) if c_state.valid_action(*decode_action(a))
    ]
    # Shuffle so equal-valued moves are broken randomly.
    random.shuffle(candidates)
    for act in candidates:
        child = State(c_state)
        child.take_action(*decode_action(act))
        val, _ = self._search(child, -player, depth - 1, not max_level,
                              alpha, beta)
        if max_level:
            best_val = max(best_val, val)
        else:
            best_val = min(best_val, val)
        # Record the action whenever it attains the current best value.
        if val == best_val:
            best_action = act
        alpha, beta, prune = self._alpha_beta_pruning(
            best_val, max_level, alpha, beta)
        if prune:
            break
    return best_val, best_action
def get_action(self, state):
    """Get the action InforGo chooses for the current state.

    Arguments:
    state -- current state

    Returns:
    (row, col) denoting chosen action
    """
    # Side assignment depends on who moves first.
    player = 1 if self._play_first else -1
    row, col = decode_action(self._tree.get_action(state, player))
    return row, col
def _rollout(self, state):
    """Play the game out with ``_rollout_policy`` and score the result from
    ``self._player``'s perspective: +1 win, -1 loss, 0 draw or timeout."""
    mover = state.player
    plies = 0
    while not state.terminate():
        # Abort overly long simulations and call them a draw.
        if plies > self._rollout_limit:
            return 0
        chosen = self._rollout_policy(state, mover)
        state.take_action(*decode_action(chosen))
        mover = -mover
        plies += 1
    if state.win(self._player):
        return 1
    return -1 if state.win(-self._player) else 0
def _rollout_policy(self, state, player):
    """Randomized rollout policy.

    Arguments:
    state -- current state
    player -- current player

    Returns:
    action chosen by rollout policy
    """
    # Take a winning move if one exists; otherwise block the opponent's.
    for side in (player, -player):
        threat = get_winning_move(state, side)
        if len(threat) > 0:
            return encode_action((threat[0][1], threat[0][2]))
    # No forced move: play a uniformly random valid action.
    legal = [a for a in range(16) if state.valid_action(*decode_action(a))]
    return random.choice(legal)