from __future__ import annotations

import math
from typing import List, Optional

# Types referenced in this listing (Node, Player, MuZeroConfig, ActionHistory,
# BaseNetwork, Action, NetworkOutput, KnownBounds) come from the surrounding
# MuZero codebase; illustrative sketches of the missing helpers follow each
# function below.


def backpropagate(search_path: List[Node], value: float, to_play: Player,
                  discount: float, min_max_stats: MinMaxStats):
    """
    At the end of a simulation, we propagate the evaluation all the way up
    the tree to the root.
    """
    for node in search_path[::-1]:
        # Values are stored from each node's own perspective, so the sign is
        # flipped for the opponent in two-player games.
        node.value_sum += value if node.to_play == to_play else -value
        node.visit_count += 1
        min_max_stats.update(node.value())

        # The value seen one step up the tree is this node's reward plus the
        # discounted value of everything below it.
        value = node.reward + discount * value
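
# Node itself is not defined in this excerpt. A minimal sketch of the fields
# and helpers the functions here rely on (children, prior, visit_count,
# value_sum, reward, hidden_state, expanded(), value()); this illustrates the
# assumed interface, not necessarily the exact class used:

class Node:

    def __init__(self, prior: float):
        self.visit_count = 0
        self.to_play = -1
        self.prior = prior
        self.value_sum = 0.0
        self.children = {}  # maps Action -> Node
        self.hidden_state = None
        self.reward = 0.0

    def expanded(self) -> bool:
        # A node counts as expanded once expand_node has populated its children.
        return len(self.children) > 0

    def value(self) -> float:
        # Mean value over all simulations that passed through this node.
        if self.visit_count == 0:
            return 0.0
        return self.value_sum / self.visit_count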

def run_mcts(config: MuZeroConfig, root: Node, action_history: ActionHistory,
             network: BaseNetwork):
    """
    Core Monte Carlo Tree Search algorithm.

    To decide on an action, we run N simulations, always starting at the root
    of the search tree and traversing the tree according to the UCB formula
    until we reach a leaf node.
    """
    min_max_stats = MinMaxStats(config.known_bounds)

    for _ in range(config.num_simulations):
        history = action_history.clone()
        node = root
        search_path = [node]

        while node.expanded():
            action, node = select_child(config, node, min_max_stats)
            history.add_action(action)
            search_path.append(node)

        # Inside the search tree we use the dynamics function to obtain the
        # next hidden state given an action and the previous hidden state.
        parent = search_path[-2]
        network_output = network.recurrent_inference(parent.hidden_state,
                                                     history.last_action())
        expand_node(node, history.to_play(), history.action_space(),
                    network_output)

        backpropagate(search_path, network_output.value, history.to_play(),
                      config.discount, min_max_stats)
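
# select_child and expand_node are called above but not defined in this
# excerpt. Plausible sketches, assuming UCB-greedy selection and an expansion
# step that stores the network output on the leaf and seeds children with
# softmax-normalized policy priors (the exact implementations may differ):

def select_child(config: MuZeroConfig, node: Node,
                 min_max_stats: MinMaxStats):
    # Pick the (action, child) pair with the highest UCB score; ties are
    # resolved by iteration order of the children dict.
    action, child = max(
        node.children.items(),
        key=lambda item: ucb_score(config, node, item[1], min_max_stats))
    return action, child


def expand_node(node: Node, to_play: Player, actions: List[Action],
                network_output: NetworkOutput):
    # Store the network's predictions on the leaf, then create one child per
    # legal action, weighted by the softmax of the policy logits.
    node.to_play = to_play
    node.hidden_state = network_output.hidden_state
    node.reward = network_output.reward
    policy = {a: math.exp(network_output.policy_logits[a]) for a in actions}
    policy_sum = sum(policy.values())
    for action, p in policy.items():
        node.children[action] = Node(p / policy_sum)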

def ucb_score(config: MuZeroConfig, parent: Node, child: Node,
              min_max_stats: MinMaxStats) -> float:
    """
    The score for a node is based on its value, plus an exploration bonus
    based on the prior.
    """
    pb_c = math.log((parent.visit_count + config.pb_c_base + 1) /
                    config.pb_c_base) + config.pb_c_init
    pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)

    prior_score = pb_c * child.prior
    value_score = min_max_stats.normalize(child.value())
    return prior_score + value_score
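
# MinMaxStats is used throughout but not defined in this excerpt. A minimal
# sketch, assuming it tracks the running min/max of values seen in the tree
# so ucb_score can normalize value estimates into [0, 1]; KnownBounds is
# assumed to be a simple container with .min and .max attributes:

MAXIMUM_FLOAT_VALUE = float('inf')


class MinMaxStats:
    """Holds the min-max value bounds observed in the search tree."""

    def __init__(self, known_bounds: Optional[KnownBounds] = None):
        self.maximum = known_bounds.max if known_bounds else -MAXIMUM_FLOAT_VALUE
        self.minimum = known_bounds.min if known_bounds else MAXIMUM_FLOAT_VALUE

    def update(self, value: float):
        self.maximum = max(self.maximum, value)
        self.minimum = min(self.minimum, value)

    def normalize(self, value: float) -> float:
        if self.maximum > self.minimum:
            # Only normalize once both bounds have actually been observed
            # (or were supplied as known bounds).
            return (value - self.minimum) / (self.maximum - self.minimum)
        return value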