Example #1
    def search(state, args, root=None):
        if root is None:
            root = Node(state)

        for i in range(args.n_simulations):
            node = root
            path = []
            # Run one simulation: traverse the tree until a new child is added or a terminal state is reached
            terminal = False
            while not terminal:
                # Choose which branch to explore/exploit using the UCB1 score:
                # Q(s, a) + c * sqrt(log(N(s)) / N(s, a))
                Q = node.Q + MCTS.c * np.sqrt(np.log(node.N) / node.N_a)
                action = int(np.argmax(Q))

                # simulate the chosen action to get the next state
                next_state, reward, terminal = SIMULATOR.simulate(node.state, action)

                path.append((node, action, reward))

                # if the child for this action does not exist yet, expand it and stop
                if node.children.get(action) is None:
                    node.children[action] = Node(next_state, terminal)
                    break
                # otherwise keep traversing down the existing branch
                else:
                    node = node.children[action]

            # backup values through the path to the root
            for node, action, reward in reversed(path):
                node.N += 1
                node.N_a[action] += 1
                # incremental update of Q towards (reward + best child value)
                node.Q[action] += (reward + np.max(node.children[action].Q)
                                   - node.Q[action]) / node.N_a[action]

        return int(np.argmax(root.Q)), root
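
The snippet above leaves out its scaffolding: a `Node` container, a global `SIMULATOR`, and the exploration constant `MCTS.c`. Below is a minimal sketch of what that scaffolding could look like; the action-space size, the constant's value, the initial counts, and the toy simulator are assumptions made purely for illustration:

    import numpy as np

    N_ACTIONS = 4  # assumed action-space size, for illustration only


    class MCTS:
        c = 1.4  # UCB exploration constant (value assumed)


    class Node:
        def __init__(self, state, terminal=False):
            self.state = state
            self.terminal = terminal
            self.N = 1                     # visit count; starts at 1 so log(N) is defined
            self.N_a = np.ones(N_ACTIONS)  # per-action counts; ones avoid division by zero
            self.Q = np.zeros(N_ACTIONS)   # per-action value estimates
            self.children = {}             # maps action -> child Node


    class RandomSimulator:
        """Toy stand-in for SIMULATOR: random rewards and terminations."""

        def simulate(self, state, action):
            next_state = (state, action)
            reward = float(np.random.rand())
            terminal = bool(np.random.rand() < 0.1)
            return next_state, reward, terminal


    SIMULATOR = RandomSimulator()

With that in place, a call could look like:

    from types import SimpleNamespace

    args = SimpleNamespace(n_simulations=100)
    best_action, tree = search(state=0, args=args)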
Example #2
    def search(self, state, args):

        root = self.new_node(state)

        predictions = [self.f_readout(root.tensors.memory)]
        logits = []
        actions = []

        for i in range(args.n_simulations):

            node = root

            path = []
            logits_m = []
            actions_m = []

            # Run one simulation: traverse the tree until a new child is added or a terminal state is reached
            terminal = False
            while not terminal:
                # Choose which branch to explore/exploit based on node memory
                p_actions = self.f_policy(node)
                action = Categorical(logits=p_actions).sample().item()

                # store logits and action for the policy-gradient loss
                if args.training:
                    logits_m.append(p_actions)
                    actions_m.append(action)

                # simulate with action to get next state
                next_state, reward, terminal = SIMULATOR.simulate(
                    node.variables.state, action)
                path.append(Path(node, action, reward))

                # if the child for this action does not exist yet, create it and stop
                if node.variables.children.get(action) is None:
                    node.variables.children[action] = self.new_node(next_state)
                    break
                # otherwise traverse to the existing child
                else:
                    node = node.variables.children[action]

            # backup values through the path to root
            for node, action, reward in reversed(path):
                node.tensors.memory = self.f_backup(
                    *prepare_input_for_f_backup(node, action, reward))

                # refresh the parent's cached copy of this child's updated memory
                node.tensors.children[action] = (
                    node.variables.children[action].tensors.memory)

            # store the root prediction after the m-th simulation
            predictions.append(self.f_readout(root.tensors.memory))

            # store the logits and actions taken during the m-th simulation
            logits.append(logits_m)
            actions.append(actions_m)

        return Categorical(logits=predictions[-1]).sample().item(), (
            predictions, logits, actions)
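
Example #2 additionally relies on learned components (`self.f_policy`, `self.f_backup`, `self.f_readout`) and helpers (`self.new_node`, `prepare_input_for_f_backup`) that are not shown. The sketch below only illustrates the assumed per-node layout (`.variables` for plain bookkeeping, `.tensors` for embeddings) and the `Path` record; the names, the memory size, and the torch-based types are assumptions for illustration, not the original implementation:

    from collections import namedtuple
    from types import SimpleNamespace

    import torch

    # Assumed record for one (node, action, reward) step along the search path
    Path = namedtuple('Path', ['node', 'action', 'reward'])


    def make_node(state, memory_size=64):
        """Hypothetical stand-in for self.new_node(): plain bookkeeping lives
        under .variables, learnable embeddings under .tensors."""
        return SimpleNamespace(
            variables=SimpleNamespace(state=state, children={}),
            tensors=SimpleNamespace(memory=torch.zeros(memory_size),
                                    children={}))

A call then returns both the chosen action and the per-simulation data usable for training, for example (where `agent` stands for an instance of the class, not shown here, that this method belongs to):

    args = SimpleNamespace(n_simulations=16, training=True)
    action, (predictions, logits, actions) = agent.search(state, args)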