Example No. 1
    def npc_action(self):
        print("NPC action")

        valid_actions = othello.get_valid_actions(self.state, self.npc)

        if self.npc_type == "random":
            action = random.choice(valid_actions)

        elif self.npc_type == "eps-greedy":
            if random.random() > self.epsilon:
                #: greedy: one-step lookahead maximizing the NPC's stone count
                #: (count_stone returns (first, second) counts; taking the
                #: second entry assumes the NPC plays second, i.e. white)
                best_action, best_score = valid_actions[0], -1
                for candidate in valid_actions:
                    next_state, _ = othello.step(self.state, candidate,
                                                 self.npc)
                    _, score = othello.count_stone(next_state)
                    if score > best_score:
                        best_score = score
                        best_action = candidate
                action = best_action
            else:
                action = random.choice(valid_actions)

        elif self.npc_type == "alphazero":
            mcts_policy = self.mcts.search(root_state=self.state,
                                           current_player=self.npc,
                                           num_simulations=50)

            #: visit-count distribution over the 6x6 board
            #: (the last entry, dropped here, is the pass action)
            print(np.array(mcts_policy[:-1]).reshape(6, 6))

            action = np.argmax(mcts_policy)

        else:
            raise NotImplementedError()

        self.state, done = othello.step(self.state, action, self.npc)

        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
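For reference, the othello helpers these examples rely on form a small functional interface: othello.step(state, action, player) returns (next_state, done), othello.get_valid_actions(state, player) returns the legal action indices, and othello.count_stone / othello.get_result report (first-player, second-player) totals. A minimal random-vs-random game using only those calls; the pass behaviour of get_valid_actions is inferred from Example No. 4 below, not confirmed:

import random

import othello

state = othello.get_initial_state()
current_player = 1  # 1 moves first (black), -1 is second (white)
done = False

while not done:
    #: when a player has no legal move, the list presumably
    #: collapses to [othello.ACTION_NOOP] (a forced pass)
    valid_actions = othello.get_valid_actions(state, current_player)
    action = random.choice(valid_actions)
    state, done = othello.step(state, action, current_player)
    current_player = -current_player

stone_first, stone_second = othello.count_stone(state)
reward_first, reward_second = othello.get_result(state)  # win: 1, lose: -1, draw: 0
print(f"black {stone_first} vs white {stone_second}")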
Example No. 2
    def _expand(self, state, current_player):

        s = self.state_to_str(state, current_player)

        with tf.device("/cpu:0"):
            nn_policy, nn_value = self.network.predict(
                othello.encode_state(state, current_player))

        nn_policy = nn_policy.numpy().tolist()[0]
        nn_value = nn_value.numpy()[0][0]

        self.P[s] = nn_policy
        self.N[s] = [0] * othello.ACTION_SPACE
        self.W[s] = [0] * othello.ACTION_SPACE

        valid_actions = othello.get_valid_actions(state, current_player)

        #: cache valid actions and next state to save computation
        self.next_states[s] = [
            othello.step(state, action, current_player)[0]
            if (action in valid_actions) else None
            for action in range(othello.ACTION_SPACE)
        ]

        return nn_value
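_expand caches the network prior P, the visit counts N, the total action values W, and the legal next states per encoded state string. The search method that consumes these caches is not shown here; in AlphaZero it selects actions by the PUCT rule, sketched below as a free function over the same dictionaries. The c_puct value and the omission of sign handling for the alternating player are simplifications, not taken from this code:

import math

import othello

def puct_action(P, N, W, next_states, s, c_puct=1.0):
    #: PUCT: argmax_a Q(s, a) + c_puct * P(s, a) * sqrt(sum_b N(s, b)) / (1 + N(s, a))
    total_n = sum(N[s])
    best_action, best_score = None, -float("inf")
    for a in range(othello.ACTION_SPACE):
        if next_states[s][a] is None:  # action is illegal in state s
            continue
        q = W[s][a] / N[s][a] if N[s][a] > 0 else 0.0
        u = c_puct * P[s][a] * math.sqrt(total_n) / (1 + N[s][a])
        if q + u > best_score:
            best_action, best_score = a, q + u
    return best_action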
Example No. 3
def selfplay(weights, num_mcts_simulations, dirichlet_alpha):

    record = []

    state = othello.get_initial_state()

    network = AlphaZeroResNet(action_space=othello.ACTION_SPACE)

    #: dummy forward pass to build the network before loading weights
    network.predict(othello.encode_state(state, 1))

    network.set_weights(weights)

    mcts = MCTS(network=network, alpha=dirichlet_alpha)

    current_player = 1

    done = False

    i = 0

    while not done:

        mcts_policy = mcts.search(root_state=state,
                                  current_player=current_player,
                                  num_simulations=num_mcts_simulations)

        if i <= 10:
            #: AlphaZero keeps the temperature at τ = 1 for the first 30 moves,
            #: sampling moves in proportion to their MCTS visit counts; this
            #: implementation does so only for the first 11 moves
            action = np.random.choice(range(othello.ACTION_SPACE),
                                      p=mcts_policy)
        else:
            #: afterwards play greedily, breaking ties at random
            action = random.choice(
                np.where(np.array(mcts_policy) == max(mcts_policy))[0])

        record.append(Sample(state, mcts_policy, current_player, None))

        next_state, done = othello.step(state, action, current_player)

        state = next_state

        current_player = -current_player

        i += 1

    #: win: 1, lose: -1, draw: 0
    reward_first, reward_second = othello.get_result(state)

    for sample in reversed(record):
        sample.reward = reward_first if sample.player == 1 else reward_second

    return record
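Sample is not defined in these excerpts. From its usage in selfplay (four positional fields, with reward mutated after the game ends) it is presumably something like the reconstruction below; the names player and reward are confirmed by attribute access, the other field names are guesses:

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Sample:
    #: hypothetical reconstruction matching selfplay's usage
    state: List[List[int]]    # board position when the move was chosen
    mcts_policy: List[float]  # visit-count distribution, the policy target
    player: int               # 1 (first / black) or -1 (second / white)
    reward: Optional[int]     # set after the game: 1 win, -1 loss, 0 draw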
Example No. 4
    def player_action(self, event):

        if not self.is_player_turn or self.is_gameend:
            return

        self.is_player_turn = False

        print("Player action")

        #: the canvas is divided into 100 x 100 px cells
        row = event.y // 100
        col = event.x // 100

        action = othello.xy_to_idx(row, col)

        valid_actions = othello.get_valid_actions(self.state, self.human)

        #: if the human has no legal move, force the pass action
        if valid_actions == [othello.ACTION_NOOP]:
            action = othello.ACTION_NOOP

        if action in valid_actions:

            self.state, done = othello.step(self.state, action, self.human)
            self.refresh()
            self.update_label()
            if done:
                self.update_label(game_end=True)
                return

            #: brief pause so the player's move is visible before the NPC replies
            time.sleep(0.3)

            self.npc_action()
            if self.is_gameend:
                return

        else:
            print("Invalid action")

        self.is_player_turn = True

        return
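The event.x // 100 arithmetic above implies a click handler on a widget with 100-pixel cells, most likely a Tkinter Canvas of 600 x 600 for the 6 x 6 board. A minimal wiring sketch; the class name and attributes here are assumptions, and player_action is the method shown above:

import tkinter as tk

class OthelloGUI:
    #: hypothetical wiring; the real class also holds state, human, npc, etc.
    def __init__(self):
        self.root = tk.Tk()
        #: 6 x 6 board at 100 px per cell, matching event.x // 100 above
        self.canvas = tk.Canvas(self.root, width=600, height=600)
        self.canvas.pack()
        self.canvas.bind("<Button-1>", self.player_action)  # left click plays a move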
Example No. 5
def testplay(current_weights,
             num_mcts_simulations,
             dirichlet_alpha=None,
             n_testplay=24):

    t = time.time()

    win_count = 0

    network = AlphaZeroResNet(action_space=othello.ACTION_SPACE)

    dummy_state = othello.get_initial_state()

    #: dummy forward pass to build the network before loading weights
    network.predict(othello.encode_state(dummy_state, 1))

    network.set_weights(current_weights)

    for n in range(n_testplay):

        #: randomly assign AlphaZero to play first (black, 1) or second (white, -1)
        alphazero = random.choice([1, -1])

        mcts = MCTS(network=network, alpha=dirichlet_alpha)

        state = othello.get_initial_state()

        current_player = 1

        done = False

        while not done:

            if current_player == alphazero:
                mcts_policy = mcts.search(root_state=state,
                                          current_player=current_player,
                                          num_simulations=num_mcts_simulations)
                action = np.argmax(mcts_policy)
            else:
                #: the baseline opponent plays greedily with ε = 0.3 exploration
                action = othello.greedy_action(state,
                                               current_player,
                                               epsilon=0.3)

            next_state, done = othello.step(state, action, current_player)

            state = next_state

            current_player = -1 * current_player

        reward_first, reward_second = othello.get_result(state)

        reward = reward_first if alphazero == 1 else reward_second
        result = "win" if reward == 1 else "lose" if reward == -1 else "draw"

        if reward > 0:
            win_count += 1

        stone_first, stone_second = othello.count_stone(state)

        if alphazero == 1:
            stone_az, stone_tester = stone_first, stone_second
            color = "black"
        else:
            stone_az, stone_tester = stone_second, stone_first
            color = "white"

        message = f"AlphaZero ({color}) {result}: {stone_az} vs {stone_tester}"

        othello.save_img(state, "img", f"test_{n}.png", message)

    elapsed = time.time() - t

    return win_count, win_count / n_testplay, elapsed
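selfplay and testplay are the worker halves of the usual AlphaZero loop: generate games with the current weights, train on them, then evaluate. A minimal single-process driver sketch under the same module's names; get_weights is the standard Keras counterpart of the set_weights call used above, the training step is omitted, and dirichlet_alpha=0.35 is an assumed value:

network = AlphaZeroResNet(action_space=othello.ACTION_SPACE)
network.predict(othello.encode_state(othello.get_initial_state(), 1))  # build variables

for iteration in range(10):
    weights = network.get_weights()

    #: self-play games; each record holds (state, mcts_policy, player, reward)
    records = selfplay(weights, num_mcts_simulations=50, dirichlet_alpha=0.35)

    #: ... train `network` on the accumulated records here (omitted) ...

    wins, win_ratio, elapsed = testplay(network.get_weights(),
                                        num_mcts_simulations=50)
    print(f"iter {iteration}: {wins} wins ({win_ratio:.0%}) in {elapsed:.1f}s")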