Example #1
    def test_update(self):
        t = TTT(3)
        prev_state = [[1, 1, 0], [-1, -1, 0], [0, 0, 0]]
        next_state = [[1, 1, 1], [-1, -1, 0], [0, 0, 0]]
        prev_state = np.array(prev_state).reshape(-1)
        next_state = np.array(next_state).reshape(-1)
        result = t.get_result(next_state)
        self.assertEqual(result, {'terminated': True, 'score': 5})

        q = TabularQ(3)
        q.set_params(alpha=1, gamma=1)

        encoded_prev_state = t.get_encoded_state(prev_state)
        prev_state_index = q.get_index(encoded_prev_state)
        encoded_next_state = t.get_encoded_state(next_state)
        next_state_index = q.get_index(encoded_next_state)
        self.assertEqual(next_state_index, None)

        q.update(encoded_prev_state, 2, encoded_next_state, 5)
        updated_row = q._Q[prev_state_index, :]

        check_row = np.array_equal(updated_row, [0, 0, 5, 0, 0, 0, 0, 0, 0])
        self.assertTrue(check_row)

        # test correct inference :
        q._is_first_mover = True
        possible_moves = t.get_available_positions(prev_state)
        inferred = q.infer(encoded_prev_state, possible_moves, 1)
        self.assertEqual(inferred, 2)

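For reference, the behavior this test pins down matches the standard tabular Q-learning update. Below is a minimal standalone sketch (the names q_update, reward, and next_max are illustrative, not the project's API), assuming alpha == gamma == 1 and an unseen terminal next state whose lookup contributes 0:

    import numpy as np

    def q_update(Q, s, a, reward, next_max, alpha=1.0, gamma=1.0):
        # Tabular Q-learning: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
        Q[s, a] += alpha * (reward + gamma * next_max - Q[s, a])
        return Q

    Q = np.zeros((1, 9))
    q_update(Q, 0, 2, reward=5, next_max=0)  # next state unseen, so next_max == 0
    assert np.array_equal(Q[0], [0, 0, 5, 0, 0, 0, 0, 0, 0])

With alpha == gamma == 1 the entry collapses to the raw reward, which is why the expected row in the test is all zeros except a 5 at index 2.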
Example #2
    def _train_against(self, opponent_agent: Callable[[np.ndarray], int], numOfGames: int) -> None:

        agent_q_turn = self._is_first_mover
        for _ in tqdm(range(numOfGames)):
            game = TTT(self._size)
            turn = True

            # One complete game: prev_state and the action taken come from the
            # agent's turn; next_state comes from the opponent's turn, which is
            # also where the Q-update is applied.
            encoded_prev_state = None
            move_taken = None
            encoded_next_state = None
            while True:

                if turn == agent_q_turn:
                    # Q-agent's turn:
                    if game.is_terminated():
                        break
                    else:
                        possible_moves = game.get_available_positions()
                        encoded_prev_state = game.get_encoded_state()
                        move_taken = self._epsilon_greedy_train(encoded_prev_state, possible_moves)
                        game.put(move_taken)
                else:
                    # opponent's turn :
                    if not game.is_terminated():
                        state = game.get_state()
                        # the opponent's move plays the role of the sampling procedure:
                        move = opponent_agent(state)
                        game.put(move)
                    encoded_next_state = game.get_encoded_state()
                    score = game.get_score()
                    if encoded_prev_state is not None:
                        # skip the update right after the opponent's opening move
                        # (when the Q agent is the second mover and has not acted yet)
                        self.update(encoded_prev_state, move_taken, encoded_next_state, score)
                
                turn = not turn
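Any callable matching Callable[[np.ndarray], int] can serve as the opponent here. A minimal sketch of a uniform-random opponent (the name random_opponent is hypothetical, and it assumes 0 marks an empty cell in the flat state array):

    import numpy as np

    def random_opponent(state: np.ndarray) -> int:
        # Choose uniformly among the empty cells (assumed to be the zeros).
        empty = np.flatnonzero(state == 0)
        return int(np.random.choice(empty))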
Example #3
    def _train_both(self, numOfGames):
        for _ in tqdm(range(numOfGames)):
            game = TTT(self._size)
            self._is_first_mover = True

            # one complete game :
            while True:
                encoded_prev_state = game.get_encoded_state()

                possible_moves = game.get_available_positions()
                selected_move = self._epsilon_greedy_train(encoded_prev_state, possible_moves)
                game.put(selected_move)

                encoded_next_state = game.get_encoded_state()
                result = game.get_result()
                self.update(encoded_prev_state, selected_move, encoded_next_state, result['score'])
                if result['terminated']:
                    break
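Both training loops defer move selection to _epsilon_greedy_train. A standalone sketch of the usual epsilon-greedy rule it presumably implements (hypothetical names; the real method would read epsilon and the Q-row from the agent's own state):

    import numpy as np

    def epsilon_greedy(q_row: np.ndarray, possible_moves, epsilon: float) -> int:
        # Explore a random legal move with probability epsilon,
        # otherwise exploit the legal move with the highest Q-value.
        if np.random.random() < epsilon:
            return int(np.random.choice(possible_moves))
        return max(possible_moves, key=lambda m: q_row[m])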
Example #4
    def test_deterministic_vs_minimax(self):
        # With gamma == alpha == 1, for any end state s and optimal move a,
        # Q(s, a) == R(s, a) whenever Q(s, a) != 0.
        # Here, R(s, a) is the score of the terminated state.
        parameters = {
            "ep_train": 0.5,
            "ep_infer": 0,
            "gamma": 1,
            "alpha": 1,
            "agent_for": 'both',
        }
        q = TabularQ(3)
        q.set_params(**parameters)
        q.train(numOfGames=500)

        s = Settings()
        minimax = minimax_load(s.path('minimax'))
        t = TTT(3)

        Q = q._Q
        # rows with at least one nonzero entry, i.e. states that were sampled
        to_check_state_indices = np.flatnonzero(np.any(Q != 0, axis=1))
        to_check_state_indices = map(int, to_check_state_indices)

        for state_index in to_check_state_indices:

            self.assertFalse(
                np.array_equal(Q[state_index],
                               np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])))
            state = q.get_state(state_index)
            encoded_state = t.get_encoded_state(state)
            mover = t.get_mover(state=state)
            possible_moves = t.get_available_positions(state)

            # the first mover maximizes the score, the second mover minimizes it
            if mover == 1:
                best_move_q = int(np.argmax(Q[state_index]))
            else:
                best_move_q = int(np.argmin(Q[state_index]))

            if Q[state_index, best_move_q] != 0:
                move_inferred = q.infer(encoded_state, possible_moves, mover)
                q_value_1 = Q[state_index, best_move_q]
                q_value_2 = Q[state_index, move_inferred]
                self.assertEqual(q_value_1, q_value_2)

            next_state = state.copy()
            next_state[best_move_q] = mover

            result = t.get_result(next_state)
            if result['terminated']:
                best_score, _ = minimax(state)
                q_value = Q[state_index, best_move_q]
                if best_score != q_value:
                    # (s, a) was not yet sampled, or the game is a draw (score 0)
                    self.assertEqual(q_value, 0)
                else:
                    # (s, a) was sampled
                    self.assertEqual(best_score, q_value)
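The invariant this test relies on falls straight out of the update rule. A small numeric check (a sketch, assuming a terminal next state contributes no future value):

    # With alpha == gamma == 1 and a terminal next state (max_a' Q(s', a') == 0):
    #   Q(s, a) <- Q(s, a) + (R(s, a) + 0 - Q(s, a)) == R(s, a)
    q_sa, reward = 0.0, 5.0
    q_sa = q_sa + 1.0 * (reward + 1.0 * 0.0 - q_sa)
    assert q_sa == reward  # the stored Q-value equals the terminal score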
Example #5
    def test_state_encode(self):
        t3 = TTT(3)
        encoded01 = t3.get_encoded_state()
        state02 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int)
        encoded02 = t3.get_encoded_state(state02)
        self.assertEqual(encoded01, encoded02)
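The test only asserts that the default (empty) board and an explicit all-zero board encode identically; the encoding itself is opaque here. One common scheme get_encoded_state could plausibly resemble is a base-3 fold of the nine ternary cells (a sketch only; encode_state is hypothetical, not the project's function):

    import numpy as np

    def encode_state(state: np.ndarray) -> int:
        # Shift each cell from {-1, 0, 1} to a base-3 digit {0, 1, 2},
        # then fold the nine digits into a single integer key.
        key = 0
        for cell in state.reshape(-1):
            key = key * 3 + (int(cell) + 1)
        return key

    assert encode_state(np.zeros(9, dtype=int)) == encode_state(np.zeros((3, 3), dtype=int))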