import numpy as np


def search(state, args, root=None):
    """Vanilla MCTS: run args.n_simulations simulations from `state` and
    return the index of the best root action together with the root node."""
    if root is None:
        root = Node(state)
    for _ in range(args.n_simulations):
        node = root
        path = []
        # Start a simulation from the root; stop once a new child is added
        # or a terminal state is reached
        terminal = False
        while not terminal:
            # Choose which branch to explore/exploit with the UCB1 rule;
            # unvisited actions get an infinite bonus so they are tried first
            with np.errstate(divide="ignore", invalid="ignore"):
                ucb = node.Q + MCTS.c * np.sqrt(np.log(node.N) / node.N_a)
            ucb[node.N_a == 0] = np.inf
            action = int(np.argmax(ucb))
            # simulate with the chosen action to get the next state
            next_state, reward, terminal = SIMULATOR.simulate(node.state, action)
            path.append((node, action, reward))
            if node.children.get(action) is None:
                # the child for this action does not exist yet:
                # add it and end this simulation
                node.children[action] = Node(next_state, terminal)
                break
            else:
                # otherwise keep traversing down the existing branch
                node = node.children[action]
        # backup values through the path to the root
        for node, action, reward in reversed(path):
            node.N += 1
            node.N_a[action] += 1
            node.Q[action] = node.Q[action] + (
                reward + np.max(node.children[action].Q) - node.Q[action]
            ) / node.N_a[action]
    return int(np.argmax(root.Q)), root
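# The search above relies on a few pieces that are not shown in the listing:
# a Node container with Q/N/N_a statistics, an exploration constant MCTS.c,
# and a SIMULATOR whose simulate(state, action) returns
# (next_state, reward, terminal). The sketch below is an assumption inferred
# from how those names are used, not the original implementation; N_ACTIONS
# is an illustrative placeholder for the action-space size.
import numpy as np

N_ACTIONS = 4  # hypothetical action-space size


class Node:
    """Minimal tree node holding the visit statistics used by UCB1."""

    def __init__(self, state, terminal=False):
        self.state = state
        self.terminal = terminal
        self.N = 0                          # visit count of this node
        self.N_a = np.zeros(N_ACTIONS)      # per-action visit counts
        self.Q = np.zeros(N_ACTIONS)        # per-action value estimates
        self.children = {}                  # action -> child Node


class MCTS:
    c = 1.414  # UCB1 exploration constant (sqrt(2) is a common default)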
from torch.distributions import Categorical


def search(self, state, args):
    """Learned (memory-based) tree search: run args.n_simulations simulations
    and sample the final action from the root readout."""
    root = self.new_node(state)
    predictions = [self.f_readout(root.tensors.memory)]
    logits = []
    actions = []
    for _ in range(args.n_simulations):
        node = root
        path = []
        logits_m = []
        actions_m = []
        # Start a simulation from the root; stop once a new child is added
        # or a terminal state is reached
        terminal = False
        while not terminal:
            # Choose which branch to explore/exploit based on the node memory
            p_actions = self.f_policy(node)
            action = Categorical(logits=p_actions).sample().item()
            # store the logits and action for the policy-gradient update
            if args.training:
                logits_m.append(p_actions)
                actions_m.append(action)
            # simulate with the chosen action to get the next state
            next_state, reward, terminal = SIMULATOR.simulate(
                node.variables.state, action)
            path.append(Path(node, action, reward))
            if node.variables.children.get(action) is None:
                # the child for this action does not exist yet:
                # create it and end this simulation
                node.variables.children[action] = self.new_node(next_state)
                break
            else:
                # otherwise traverse to the existing child
                node = node.variables.children[action]
        # backup memories through the path to the root
        for node, action, reward in reversed(path):
            node.tensors.memory = self.f_backup(
                *prepare_input_for_f_backup(node, action, reward))
            node.tensors.children[action] = node.variables.children[
                action].tensors.memory
        # store the root prediction after the m-th simulation
        predictions.append(self.f_readout(root.tensors.memory))
        # store the logits and actions taken during the m-th simulation
        logits.append(logits_m)
        actions.append(actions_m)
    return Categorical(
        logits=predictions[-1]).sample().item(), (predictions, logits, actions)
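# The learned search above additionally assumes a Path record and a node
# container split into plain bookkeeping (`variables`) and learnable
# embeddings (`tensors`), as suggested by the node.variables.* and
# node.tensors.* accesses. The sketch below is an inferred, illustrative
# structure, not the original code; MEMORY_SIZE, N_ACTIONS and the body of
# new_node are placeholder assumptions.
from collections import namedtuple
from types import SimpleNamespace

import torch

# One step of a simulation: the node acted from, the action taken, the reward.
Path = namedtuple("Path", ["node", "action", "reward"])

MEMORY_SIZE = 128  # hypothetical size of a node's memory embedding
N_ACTIONS = 4      # hypothetical action-space size


def new_node(self, state):
    """Create an empty node: bookkeeping in `variables`, embeddings in `tensors`."""
    node = SimpleNamespace()
    node.variables = SimpleNamespace(state=state, children={})
    node.tensors = SimpleNamespace(
        memory=torch.zeros(MEMORY_SIZE),    # this node's memory embedding
        children={},                        # action -> child memory embedding
    )
    return node

# A call would then look roughly like:
#   action, (predictions, logits, actions) = agent.search(state, args)
# where `predictions` can feed a prediction loss on the root readout and
# `logits`/`actions` a policy-gradient update over the in-tree decisions.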