Example #1
    def act(self, gs: GameState) -> int:
        gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()

        mask_vec = np.zeros((self.action_space_size, ))
        mask_vec[available_actions] = 1.0

        v = self.critic.predict(state_vec)
        p = self.actor.predict(state_vec, mask_vec)
        p = np.array(p)
        p /= p.sum()  # renormalize the masked policy so probabilities sum to 1
        indexes = np.arange(self.action_space_size)
        chosen_action = np.random.choice(indexes, p=p)

        # Alternative (equivalent) sampling: keep only the probabilities of the
        # available actions, renormalize them, and sample from that restricted set.

        self.v.append(v)

        self.s.append(state_vec)
        self.m.append(mask_vec)
        self.a.append(to_categorical(chosen_action, self.action_space_size))
        if not self.is_last_episode_terminal:
            self.r.append(self.r_temp)
        self.r_temp = 0.0
        self.is_last_episode_terminal = False

        return chosen_action
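
The mask/renormalize combination above restricts the actor's output to the legal actions before sampling. A minimal, self-contained numpy sketch of that idea (an illustration of the technique, not code taken from the agent itself):

import numpy as np

# Restrict a policy's probabilities to the currently available actions and
# renormalize before sampling, as mask_vec and p /= p.sum() do in the agent.
def sample_masked_action(raw_probs: np.ndarray, available_actions: np.ndarray) -> int:
    mask = np.zeros_like(raw_probs)
    mask[available_actions] = 1.0
    masked = raw_probs * mask      # zero out unavailable actions
    masked /= masked.sum()         # renormalize into a valid distribution
    return int(np.random.choice(np.arange(len(raw_probs)), p=masked))

# Example: 5 actions, only 0, 2 and 4 are legal in the current state.
action = sample_masked_action(np.array([0.1, 0.3, 0.2, 0.3, 0.1]), np.array([0, 2, 4]))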
Example #2
    def act(self, gs: GameState) -> int:
        gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())
        if gs_unique_id not in self.Q:
            self.Q[gs_unique_id] = dict()
            for a in available_actions:
                self.Q[gs_unique_id][a] = (np.random.random() * 2.0 -
                                           1.0) / 10.0

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = max(self.Q[gs_unique_id],
                                key=self.Q[gs_unique_id].get)

        if self.s is not None:
            # Tabular Q-learning update on the transition stored from the previous
            # call: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            self.Q[self.s][self.a] += self.alpha * (
                self.r + self.gamma * max(self.Q[gs_unique_id].values()) -
                self.Q[self.s][self.a])

        self.s = gs_unique_id
        self.a = chosen_action
        self.r = 0.0

        return self.a
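
The agent above caches the previous state, action, and accumulated reward in self.s, self.a and self.r, and applies the tabular Q-learning update on the next call to act. A tiny standalone illustration of the same update rule on a toy two-state table (the values are made up for the example):

alpha, gamma = 0.1, 0.9
Q = {"s0": {0: 0.0, 1: 0.0}, "s1": {0: 0.5, 1: -0.2}}

# Q[s][a] <- Q[s][a] + alpha * (r + gamma * max_a' Q[s'][a'] - Q[s][a])
s, a, r, s_next = "s0", 1, 1.0, "s1"
Q[s][a] += alpha * (r + gamma * max(Q[s_next].values()) - Q[s][a])
print(Q[s][a])  # 0.145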
Example #3
def run_to_the_end(agents: List[Agent], gs: GameState, render: bool = False):
    while not gs.is_game_over():
        if render:
            gs.render()
        run_step(agents, gs)

    if render:
        gs.render()
Example #4
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        while True:
            for event in pygame.event.get():
                if event.type == pygame.MOUSEBUTTONDOWN:
                    mouse_x, mouse_y = pygame.mouse.get_pos()
                    valid_action = gs.get_valid_action_from_mouse_pos(
                        mouse_x, mouse_y)
                    if valid_action in available_actions:
                        return valid_action
                    print("Invalid action")
Example #5
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        print(f"Choose action index from : {available_actions}")

        while True:
            try:
                action_candidate = int(input())
                if action_candidate in available_actions:
                    break
            except Exception as _:
                pass
            print(f"Action not valid, please try again !")
        return action_candidate
Example #6
def run_for_n_games_and_return_max(agents: List[Agent],
                                   gs: GameState,
                                   games_count: int,
                                   render: bool = False) -> np.ndarray:
    old_and_new_scores = np.ones((2, len(gs.get_scores()))) * -np.inf

    for _ in range(games_count):
        gs_copy = gs.clone()
        run_to_the_end(agents, gs_copy, render=render)
        new_scores = gs_copy.get_scores()
        old_and_new_scores[1, :] = new_scores
        old_and_new_scores[0, :] = np.max(old_and_new_scores, axis=0)

    return old_and_new_scores[0, :]
Example #7
def run_step(agents: List[Agent], gs: GameState):
    assert not gs.is_game_over()
    active_player_index = gs.get_active_player()

    old_scores = gs.get_scores().copy()
    action = agents[active_player_index].act(gs)
    gs.step(active_player_index, action)
    new_scores = gs.get_scores()
    rewards = new_scores - old_scores
    for i, agent in enumerate(agents):
        agent.observe(rewards[i], gs.is_game_over(), i)
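
run_step (and every act method in these examples) relies on a common Agent interface exposing act and observe. A minimal sketch of that interface, inferred from the calls above rather than copied from the project:

from abc import ABC, abstractmethod

class Agent(ABC):
    @abstractmethod
    def act(self, gs: "GameState") -> int:
        """Return the index of the action to play in the current state."""

    @abstractmethod
    def observe(self, reward: float, terminal: bool, player_index: int) -> None:
        """Receive the reward produced by the last step and whether it was terminal."""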
Example #8
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state(
            mode="2D" if self.using_convolution else None)
        predicted_Q_values = self.Q.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            # TD target for the previous transition, bootstrapped from the best
            # predicted Q-value among the actions available in the current state.
            target = self.r + self.gamma * max(
                predicted_Q_values[available_actions])
            self.Q.train(self.s, self.a, target)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action
Example #9
    def act(self, gs: GameState) -> int:
        gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = to_categorical(gs_unique_id, gs.get_max_state_count())
        predicted_Q_values = self.Q.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            target = self.r + self.gamma * max(
                predicted_Q_values[available_actions])
            self.Q.train(self.s, self.a, target)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action
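
Both variants above only require self.Q to expose predict(state_vec) and train(state_vec, action_one_hot, target). A hypothetical, minimal stand-in with that interface, using a linear approximator (the actual agent presumably wraps a neural network; this is only a runnable sketch):

import numpy as np

class LinearQFunction:
    def __init__(self, state_size: int, action_space_size: int, lr: float = 0.01):
        self.w = np.zeros((state_size, action_space_size))
        self.lr = lr

    def predict(self, state_vec: np.ndarray) -> np.ndarray:
        # One Q-value per action for the given state vector.
        return state_vec @ self.w

    def train(self, state_vec: np.ndarray, action_one_hot: np.ndarray, target: float) -> None:
        # Move Q(s, a) towards the scalar TD target for the chosen action only.
        q_sa = float(state_vec @ self.w @ action_one_hot)
        error = target - q_sa
        self.w += self.lr * error * np.outer(state_vec, action_one_hot)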
Example #10
def run_for_n_games_and_return_stats(
    agents: List[Agent],
    gs: GameState,
    games_count: int,
    shuffle_players: bool = False,
    render: bool = False,
    writer=None,
    show_progress: bool = False,
    progress_desc: str = None,
) -> Tuple[np.ndarray, np.ndarray, float]:
    total_scores = np.zeros_like(gs.get_scores())
    total_times = 0
    agents_order = np.arange(len(agents))

    agents_copy = agents
    if shuffle_players:
        agents_copy = agents.copy()
    iterable = (tqdm.tqdm(range(games_count), progress_desc)
                if show_progress else range(games_count))
    for game in iterable:
        game = game + 1
        gs_copy = gs.clone()
        if shuffle_players:
            agents_copy = agents.copy()
            shuffle(agents_order)
            for i in agents_order:
                agents_copy[i] = agents[agents_order[i]]
        start = time.time()
        run_to_the_end(agents_copy, gs_copy, render=render)
        total_times += time.time() - start
        total_scores += gs_copy.get_scores()[agents_order]
        if writer:
            mean_scores = total_scores / game
            mean_time_per_game = total_times / game
            write_experiment_row(writer, game, mean_scores, mean_time_per_game)

    return total_scores, total_scores / games_count, total_times / games_count
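
A hedged usage sketch; TicTacToeGameState is a placeholder name for whatever concrete GameState implementation the project provides:

# agents = [RandomAgent(), RandomAgent()]
# gs = TicTacToeGameState()
# total, mean_scores, mean_time_per_game = run_for_n_games_and_return_stats(
#     agents, gs, games_count=1000, shuffle_players=True, show_progress=True)
# print(mean_scores, mean_time_per_game)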
Example #11
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        if self.agents is None:
            self.agents = [RandomAgent()] * gs.player_count()
        accumulated_scores = np.zeros((len(available_actions),))

        for i, a in enumerate(available_actions):
            gs_clone = gs.clone()
            gs_clone.step(gs.get_active_player(), a)
            if self.determinist_environment:
                max_scores = run_for_n_games_and_return_max(
                    self.agents, gs_clone, self.epochs_per_action
                )
                accumulated_scores[i] = max_scores[gs.get_active_player()]
            else:
                (total_scores, _, _) = run_for_n_games_and_return_stats(
                    self.agents, gs_clone, self.epochs_per_action
                )
                accumulated_scores[i] = total_scores[gs.get_active_player()]

        # print((accumulated_scores, available_actions[np.argmax(accumulated_scores)]))
        return available_actions[np.argmax(accumulated_scores)]
Example #12
    def act(self, gs: GameState) -> int:
        root_hash = gs.get_unique_id()
        memory = self.memory if self.keep_memory else dict()

        if root_hash not in memory:
            MOMCTSAgent.create_node_in_memory(
                memory,
                root_hash,
                gs.get_available_actions(gs.get_active_player()),
                gs.get_active_player(),
            )

        for i in range(self.max_iteration):
            gs_copy = gs.clone()
            s = gs_copy.get_unique_id()
            history = []

            # SELECTION
            while not gs_copy.is_game_over() and all(
                (edge["n"] > 0 for edge in memory[s])):
                chosen_edge = max(
                    ((edge, MOMCTSAgent.ucb_1(edge)) for edge in memory[s]),
                    key=lambda kv: kv[1],
                )[0]
                history.append((s, chosen_edge))

                gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    MOMCTSAgent.create_node_in_memory(
                        memory,
                        s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player(),
                    )

            # EXPANSION
            if not gs_copy.is_game_over():
                chosen_edge = choice(
                    list(
                        filter(lambda e: e["n"] == 0,
                               (edge for edge in memory[s]))))

                history.append((s, chosen_edge))
                gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    MOMCTSAgent.create_node_in_memory(
                        memory,
                        s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player(),
                    )

            # SIMULATION
            while not gs_copy.is_game_over():
                gs_copy.step(
                    gs_copy.get_active_player(),
                    choice(
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player())),
                )

            scores = gs_copy.get_scores()
            # SCORE BACKPROPAGATION
            for (s, edge) in history:
                edge["n"] += 1
                edge["r"] += scores[edge["p"]]
                for neighbour_edge in memory[s]:
                    neighbour_edge["np"] += 1

        return max((edge for edge in memory[root_hash]),
                   key=lambda e: e["n"])["a"]
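
The MCTS loop above relies on two static helpers, create_node_in_memory and ucb_1, whose implementations are not shown here. Plausible minimal sketches, inferred from the edge keys "a", "p", "n", "np" and "r" (the real versions may differ, for example in the exploration constant):

import math

def create_node_in_memory(memory, node_hash, available_actions, player):
    # One edge per available action; "np" counts how often any sibling edge
    # of the node has been visited, "n" and "r" track this edge's own stats.
    memory[node_hash] = [
        {"a": a, "p": player, "n": 0, "np": 0, "r": 0.0}
        for a in available_actions
    ]

def ucb_1(edge, c=math.sqrt(2)):
    # Mean reward of the edge plus an exploration bonus that grows with the
    # siblings' visit count and shrinks with the edge's own visit count.
    return edge["r"] / edge["n"] + c * math.sqrt(math.log(edge["np"]) / edge["n"])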
Example #13
    def act(self, gs: GameState) -> int:
        root_hash = gs.get_unique_id()
        memory = self.memory if self.keep_memory else dict()

        if root_hash not in memory:
            q_values = self.brain.predict(gs.get_vectorized_state())
            HalfAlphaZeroAgent.create_node_in_memory(
                memory,
                root_hash,
                gs.get_available_actions(gs.get_active_player()),
                gs.get_active_player(),
                q_values,
            )

        for i in range(self.max_iteration):
            gs_copy = gs.clone()
            s = gs_copy.get_unique_id()
            history = []

            # SELECTION
            while not gs_copy.is_game_over() and all(
                (edge["n"] > 0 for edge in memory[s])):
                chosen_edge = max(
                    ((edge, HalfAlphaZeroAgent.ucb_1(edge))
                     for edge in memory[s]),
                    key=lambda kv: kv[1],
                )[0]
                history.append((s, chosen_edge))

                gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    q_values = self.brain.predict(
                        gs_copy.get_vectorized_state())
                    HalfAlphaZeroAgent.create_node_in_memory(
                        memory,
                        s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player(),
                        q_values,
                    )

            # EXPANSION
            if not gs_copy.is_game_over():
                chosen_edge = choice(
                    list(
                        filter(lambda e: e["n"] == 0,
                               (edge for edge in memory[s]))))

                history.append((s, chosen_edge))
                gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    q_values = self.brain.predict(
                        gs_copy.get_vectorized_state())
                    HalfAlphaZeroAgent.create_node_in_memory(
                        memory,
                        s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player(),
                        q_values,
                    )

            scores = np.zeros(gs_copy.player_count())
            scores_set = np.zeros(gs_copy.player_count())
            # SCORE BACKPROPAGATION
            for (s, edge) in history:
                if scores_set[edge["p"]] == 0:
                    scores_set[edge["p"]] = 1.0
                    scores[edge["p"]] = edge["q"]

                edge["n"] += 1
                edge["r"] += scores[edge["p"]]
                for neighbour_edge in memory[s]:
                    neighbour_edge["np"] += 1

        chosen_action = max((edge for edge in memory[root_hash]),
                            key=lambda e: e["n"])["a"]

        if len(self.states_buffer) > 0:
            self.rewards_buffer.append(self.intermediate_reward)

        self.states_buffer.append(gs.get_vectorized_state())
        self.actions_buffer.append(
            to_categorical(chosen_action, gs.get_action_space_size()))
        self.intermediate_reward = 0.0

        return chosen_action
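
Unlike the plain MCTS agent, this network-guided variant skips the random rollout and backs up the predicted Q-value stored on each edge. A hypothetical sketch of its create_node_in_memory counterpart, inferred from the "q" key used in the backup loop above (an assumption, not the project's actual helper):

def create_node_in_memory(memory, node_hash, available_actions, player, q_values):
    # Same edge structure as the MCTS agent, plus the network's predicted
    # Q-value for each action, used in place of a rollout result.
    memory[node_hash] = [
        {"a": a, "p": player, "n": 0, "np": 0, "r": 0.0, "q": float(q_values[a])}
        for a in available_actions
    ]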
Example #14
    def act(self, gs: GameState) -> int:

        # Once the apprentice network has been trained enough times, let it choose
        # the action directly instead of running the expert MCTS search.
        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            return gs.get_available_actions(gs.get_active_player())[
                np.argmax(
                    self.brain.predict(np.array([gs.get_vectorized_state()]))[0][
                        gs.get_available_actions(gs.get_active_player())
                    ]
                )
            ]

        root_hash = gs.get_unique_id()
        memory = self.memory if self.keep_memory else dict()

        if root_hash not in memory:
            ExpertApprenticeAgent.create_node_in_memory(
                memory,
                root_hash,
                gs.get_available_actions(gs.get_active_player()),
                gs.get_active_player(),
            )

        for i in range(self.max_iteration):
            gs_copy = gs.clone()
            s = gs_copy.get_unique_id()
            history = []

            # SELECTION
            while not gs_copy.is_game_over() and all(
                (edge["n"] > 0 for edge in memory[s])
            ):
                chosen_edge = max(
                    ((edge, ExpertApprenticeAgent.ucb_1(edge)) for edge in memory[s]),
                    key=lambda kv: kv[1],
                )[0]
                history.append((s, chosen_edge))

                gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    ExpertApprenticeAgent.create_node_in_memory(
                        memory,
                        s,
                        gs_copy.get_available_actions(gs_copy.get_active_player()),
                        gs_copy.get_active_player(),
                    )

            # EXPANSION
            if not gs_copy.is_game_over():
                chosen_edge = choice(
                    list(filter(lambda e: e["n"] == 0, (edge for edge in memory[s])))
                )

                history.append((s, chosen_edge))
                gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    ExpertApprenticeAgent.create_node_in_memory(
                        memory,
                        s,
                        gs_copy.get_available_actions(gs_copy.get_active_player()),
                        gs_copy.get_active_player(),
                    )

            # SIMULATION
            while not gs_copy.is_game_over():
                gs_copy.step(
                    gs_copy.get_active_player(),
                    choice(gs_copy.get_available_actions(gs_copy.get_active_player())),
                )

            scores = gs_copy.get_scores()
            # SCORE BACKPROPAGATION
            for (s, edge) in history:
                edge["n"] += 1
                edge["r"] += scores[edge["p"]]
                for neighbour_edge in memory[s]:
                    neighbour_edge["np"] += 1

        target = np.zeros(gs.get_action_space_size())

        for edge in memory[root_hash]:
            target[edge["a"]] = edge["n"]

        target /= np.sum(target)

        self.states_buffer.append(gs.get_vectorized_state())
        self.actions_buffer.append(target)

        if len(self.states_buffer) > 200:
            self.apprentice_training_count += 1
            self.brain.fit(
                np.array(self.states_buffer), np.array(self.actions_buffer), verbose=0
            )
            self.states_buffer.clear()
            self.actions_buffer.clear()

        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            print("Apprentice is playing next round")

        return max((edge for edge in memory[root_hash]), key=lambda e: e["n"])["a"]
Example #15
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        return int(np.random.choice(available_actions))