Example #1
    def act(self, player_index: int, information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:

        available_actions = list(available_actions)

        vectorized_states = np.array([information_state.vectorize()] *
                                     len(available_actions))
        actions_vectorized = np.array([
            to_categorical(action, self.action_size)
            for action in available_actions
        ])

        logits = self.brain.predict_policies(vectorized_states,
                                             actions_vectorized)

        # Normalize the predicted policy scores into a probability
        # distribution over the available actions.
        logits_sum = np.sum(logits)
        probabilities = np.reshape(logits / logits_sum,
                                   (len(available_actions), ))
        chosen_action = np.random.choice(available_actions, p=probabilities)

        transition = dict()
        transition['s'] = information_state.vectorize()
        transition['a'] = to_categorical(chosen_action, self.action_size)
        transition['r'] = 0.0
        transition['t'] = False

        self.current_trajectory_buffer.append(transition)

        return chosen_action
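
The example above turns the raw outputs of predict_policies into a distribution by dividing by their sum, which only yields valid probabilities if the network already emits non-negative scores. A minimal sketch of a more defensive normalization (the helper name is illustrative and not part of the original agent):

import numpy as np

def normalize_action_scores(scores, use_softmax=True):
    # Turn per-action scores into a probability distribution. With
    # use_softmax=True this is safe for arbitrary real-valued logits;
    # otherwise it assumes the scores are already non-negative.
    scores = np.asarray(scores, dtype=np.float64).ravel()
    if use_softmax:
        scores = np.exp(scores - np.max(scores))  # subtract max for stability
    total = np.sum(scores)
    if total <= 0.0:
        return np.full(len(scores), 1.0 / len(scores))  # fall back to uniform
    return scores / total
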
Example #2
    def act(self, player_index: int, information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:

        if self.current_transition:
            self.current_transition['terminal'] = False
            self.current_trajectory.append(self.current_transition)
            self.current_transition = None

        for i in range(self.iteration_count):
            self.current_iteration_selected_nodes = {}
            gs = information_state.create_game_state_from_information_state()

            # SELECT
            gs, info_state, current_player, terminal = self.select(gs)

            if not terminal:
                # EXPAND
                node = self.current_trees[current_player][info_state]

                available_actions = gs.get_available_actions_id_for_player(
                    current_player)
                node['a'] = [{
                    'n': 0,
                    'r': 0,
                    'action_id': action_id
                } for action_id in available_actions]
                child_action = random.choice(node['a'])
                action_to_execute = child_action['action_id']

                self.add_visited_node(node, child_action, current_player)

                gs, reward, terminal = gs.step(current_player,
                                               action_to_execute)

            # EVALUATE
            scores = self.brain.predict_state(info_state.vectorize())

            # BACKPROPAGATE SCORE
            for player_id in self.current_iteration_selected_nodes.keys():
                visited_nodes = self.current_iteration_selected_nodes[
                    player_id]
                for node, child_action in reversed(visited_nodes):
                    node['nprime'] += 1
                    child_action['n'] += 1
                    child_action['r'] += scores[player_id]

        # Pick the most-visited child action at this player's root node.
        child_action = max(
            self.current_iteration_selected_nodes[player_index][0][0]['a'],
            key=lambda child: child['n'])

        self.current_transition = {
            's': information_state.vectorize(),
            'r': 0,
            'player_index': player_index,
            'terminal': False
        }

        return child_action['action_id']
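
This example (and the tree-search examples that follow) relies on a select() helper and an add_visited_node() bookkeeping method that are not shown here. Below is a minimal sketch of a UCB1-style child selection, assuming the node layout used above (children under 'a' with visit count 'n' and accumulated reward 'r', plus a parent visit counter 'nprime'); the helper names are assumptions, not the original implementation:

import math

def ucb1_score(node, child, c=math.sqrt(2)):
    # Standard UCB1: average reward plus an exploration bonus that shrinks
    # as this child edge is visited more often.
    if child['n'] == 0:
        return float('inf')  # always try unvisited children first
    exploitation = child['r'] / child['n']
    exploration = c * math.sqrt(math.log(node['nprime']) / child['n'])
    return exploitation + exploration

def select_child(node):
    # Pick the child action maximizing the UCB1 score.
    return max(node['a'], key=lambda child: ucb1_score(node, child))
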
Example #3
    def act(self, player_index: int, information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:

        available_actions_list = list(available_actions)

        inputs_states = np.array([information_state.vectorize()] *
                                 len(available_actions_list))
        actions_vectorized = np.array([
            keras.utils.to_categorical(action_id, self.action_size)
            for action_id in available_actions_list
        ])

        if self.s is not None:
            self.s_next_duplicated = inputs_states
            self.s_next_available_actions = actions_vectorized
            self.t = False
            self.learn()

        # Epsilon-greedy: exploit the learned Q-values with probability
        # 1 - epsilon, otherwise explore with a random legal action.
        if random.random() > self.epsilon:
            q_values = self.Q.predict([inputs_states,
                                       actions_vectorized]).flatten()
            best_id = q_values.argmax()
        else:
            best_id = random.randint(0, len(available_actions_list) - 1)

        self.s = inputs_states[best_id]
        self.a = actions_vectorized[best_id]

        return available_actions_list[best_id]
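
The Q-network in this example is called with a pair of inputs, [inputs_states, actions_vectorized], so it presumably has one input branch for the vectorized state and one for the one-hot action. A rough sketch of such a two-input Keras model follows; the layer sizes and optimizer are assumptions, not the original architecture:

from keras.models import Model
from keras.layers import Input, Dense, Concatenate

def build_q_model(state_size, action_size, hidden_units=64):
    # Q(s, a): concatenate the vectorized state and the one-hot action,
    # then regress a single scalar value.
    state_input = Input(shape=(state_size,))
    action_input = Input(shape=(action_size,))
    x = Concatenate()([state_input, action_input])
    x = Dense(hidden_units, activation='relu')(x)
    x = Dense(hidden_units, activation='relu')(x)
    q_value = Dense(1, activation='linear')(x)
    model = Model(inputs=[state_input, action_input], outputs=q_value)
    model.compile(optimizer='adam', loss='mse')
    return model
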
Example #4
    def act(self, player_index: int,
            information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:

        available_actions = list(available_actions)
        num_actions = len(available_actions)

        vectorized_state = information_state.vectorize()

        full_actions_probability, value = self.brain.predict_policy_and_value(vectorized_state)

        available_actions_probabilities = full_actions_probability[available_actions]

        sum_available_action_probabilities = np.sum(available_actions_probabilities)

        # Guard against the (unlikely) case where every legal action has
        # near-zero probability under the current policy.
        if sum_available_action_probabilities > 1e-7:
            probabilities = available_actions_probabilities / sum_available_action_probabilities
            chosen_index = np.random.choice(list(range(num_actions)), p=probabilities)
            chosen_action = available_actions[chosen_index]
        else:
            print("No action eligible, this should be extremely rare")
            chosen_index = np.random.choice(list(range(num_actions)))
            chosen_action = available_actions[chosen_index]

        transition = dict()
        transition['s'] = vectorized_state
        transition['a'] = chosen_action
        transition['r'] = 0.0
        transition['t'] = False
        transition['p_old'] = full_actions_probability.tolist()

        self.current_trajectory_buffer.append(transition)

        return chosen_action
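
The transition stored here keeps p_old, the full action distribution at collection time, which is the kind of quantity a PPO-style clipped surrogate objective needs when the agent later learns from the trajectory. A small numpy sketch of that objective for a single transition (illustrative only; the actual loss lives in the agent's learning code, which is not shown here):

import numpy as np

def clipped_surrogate(p_new, p_old, chosen_action, advantage, clip_epsilon=0.2):
    # Probability ratio between the current policy and the policy that
    # collected the data, for the action that was actually taken.
    ratio = p_new[chosen_action] / max(p_old[chosen_action], 1e-8)
    clipped = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    # PPO maximizes the minimum of the unclipped and clipped terms.
    return min(ratio * advantage, clipped * advantage)
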
Example #5
    def act(self, player_index: int, information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:
        # Flat Monte Carlo: evaluate each legal action with random rollouts
        # and pick the action with the best average score.
        actions = tuple(available_actions)
        action_count = len(actions)
        action_scores = np.zeros(action_count)
        for i in range(action_count):
            gs = information_state.create_game_state_from_information_state()
            (result_gs, score, terminal) = gs.step(player_index, actions[i])

            # Two-player zero-sum assumption: player 0 maximizes the score,
            # player 1 minimizes it.
            player_score = (1 if player_index == 0 else -1) * score
            if not terminal:
                history = self.runner.run(self.num_rollouts_per_available_action, gs)
                player_score += history[player_index] - history[(player_index + 1) % 2]
            player_score = player_score / (1.0 if terminal else self.num_rollouts_per_available_action)
            action_scores[i] = player_score
        return actions[np.argmax(action_scores)]
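
self.runner.run(...) is not shown in these examples; it presumably plays the game out with some default policy and returns accumulated scores per player. Below is a minimal sketch of a single uniformly random rollout, reusing the step and get_available_actions_id_for_player methods seen above; the function name and the assumption that two players alternate turns are illustrative, not the original runner:

import random

def random_rollout(game_state, num_players=2):
    # Play random legal moves until the game ends and return the reward
    # accumulated by each player. Assumes players alternate turns.
    scores = [0.0] * num_players
    gs = game_state
    terminal = False
    current_player = 0
    while not terminal:
        actions = list(gs.get_available_actions_id_for_player(current_player))
        gs, reward, terminal = gs.step(current_player, random.choice(actions))
        scores[current_player] += reward
        current_player = (current_player + 1) % num_players
    return scores
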
Example #6
    def act(self, player_index: int, information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:

        for i in range(self.iteration_count):
            self.current_iteration_selected_nodes = {}
            gs = information_state.create_game_state_from_information_state()

            # SELECT
            gs, info_state, current_player, terminal = self.select(gs)

            if not terminal:
                # EXPAND
                node = self.current_trees[current_player][info_state]

                available_actions = gs.get_available_actions_id_for_player(
                    current_player)
                node['a'] = [{
                    'n': 0,
                    'r': 0,
                    'action_id': action_id
                } for action_id in available_actions]
                child_action = random.choice(node['a'])
                action_to_execute = child_action['action_id']

                self.add_visited_node(node, child_action, current_player)

                gs, reward, terminal = gs.step(current_player,
                                               action_to_execute)

            # EVALUATE
            scores = self.runner.run(initial_game_state=gs, max_rounds=1)

            # BACKPROPAGATE SCORE
            for player_id in self.current_iteration_selected_nodes.keys():
                visited_nodes = self.current_iteration_selected_nodes[
                    player_id]
                for node, child_action in reversed(visited_nodes):
                    node['nprime'] += 1
                    child_action['n'] += 1
                    child_action['r'] += scores[player_id]

        child_action = max(
            self.current_iteration_selected_nodes[player_index][0][0]['a'],
            key=lambda child: child['n'])

        return child_action['action_id']
Example #7
    def act(self, player_index: int, information_state: InformationState,
            available_actions: 'Iterable[int]') -> int:
        available_actions = list(available_actions)

        # After a full training + evaluation cycle, reset the supervised
        # learning buffers and start a new cycle.
        if self.evaluation_episodes + self.training_episodes < self.current_episode:
            self.X = []
            self.Y = []
            self.current_episode = 0

        # Evaluation phase: act greedily from the learned policy model instead
        # of running the tree search below.
        if self.current_episode > self.training_episodes:
            probs = self.model.predict(
                np.array([information_state.vectorize()]))[0]

            available_probs = probs[np.array(available_actions)]

            probs_sum = np.sum(available_probs)

            if probs_sum > 0.001:
                chosen_action_index = np.argmax(available_probs)
                action = available_actions[chosen_action_index]
            else:
                action = random.choice(available_actions)

            return action

        for i in range(self.iteration_count):
            self.current_iteration_selected_nodes = {}
            gs = information_state.create_game_state_from_information_state()

            # SELECT
            gs, info_state, current_player, terminal = self.select(gs)

            if not terminal:
                # EXPAND
                node = self.current_trees[current_player][info_state]

                available_actions = gs.get_available_actions_id_for_player(
                    current_player)
                node['a'] = [{
                    'n': 0,
                    'r': 0,
                    'action_id': action_id
                } for action_id in available_actions]
                child_action = random.choice(node['a'])
                action_to_execute = child_action['action_id']

                self.add_visited_node(node, child_action, current_player)

                gs, reward, terminal = gs.step(current_player,
                                               action_to_execute)

            # EVALUATE
            scores = self.runner.run(initial_game_state=gs, max_rounds=1)

            # BACKPROPAGATE SCORE
            for player_id in self.current_iteration_selected_nodes.keys():
                visited_nodes = self.current_iteration_selected_nodes[
                    player_id]
                for node, child_action in reversed(visited_nodes):
                    node['nprime'] += 1
                    child_action['n'] += 1
                    child_action['r'] += scores[player_id]

        child_action = max(
            self.current_iteration_selected_nodes[player_index][0][0]['a'],
            key=lambda child: child['n'])

        # Record the state and the MCTS-chosen action as a supervised
        # training example for the policy model.
        self.X.append(information_state.vectorize().tolist())
        self.Y.append(
            to_categorical(child_action['action_id'], self.action_size))

        return child_action['action_id']
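
During the tree-search phase this last example accumulates training pairs in self.X (vectorized states) and self.Y (one-hot encodings of the action the search picked), which are presumably used to fit self.model between episodes. A rough sketch of such a supervised policy model and its training step; the layer sizes, optimizer, and function names are assumptions, not the original code:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

def build_apprentice_model(state_size, action_size, hidden_units=64):
    # Small policy network: vectorized state in, softmax over all actions out,
    # trained against the one-hot actions chosen by the tree search.
    model = Sequential([
        Dense(hidden_units, activation='relu', input_shape=(state_size,)),
        Dense(hidden_units, activation='relu'),
        Dense(action_size, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

def train_on_buffer(model, X, Y, epochs=10):
    # Fit the policy model on the collected (state, chosen action) pairs.
    model.fit(np.array(X), np.array(Y), epochs=epochs, verbose=0)
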