def minimax(state, mover: int, t: TTT) -> [Score, Move]:
    """Exhaustive minimax search over the TTT game tree.

    Parameters
    ----------
    state : board array; only copies are mutated, the input is untouched.
    mover : 1 for the maximizing player, -1 for the minimizing player.
    t     : TTT helper providing available moves, termination and scoring.

    Returns [best_score, best_move] for `mover` at `state`.
    """
    # BUG FIX: `mover is 1` tested identity and only worked via CPython
    # small-int caching (emits SyntaxWarning on 3.8+); use equality.
    next_mover = -1 if mover == 1 else 1
    possible_moves = t.get_available_positions(state)
    corresponding_scores = []
    best_score = 0
    best_move = None
    for index in possible_moves:
        next_state = state.copy()
        next_state[index] = mover
        if t.is_terminated(next_state):
            # Terminal position: take its score directly.
            score = t.get_score(next_state)
        else:
            # Recurse for the opponent's best reply.
            [score, _] = minimax(next_state, next_mover, t)
        corresponding_scores.append(score)
    if mover == 1:
        best_score = max(corresponding_scores)
        best_move = possible_moves[corresponding_scores.index(best_score)]
    elif mover == -1:
        best_score = min(corresponding_scores)
        best_move = possible_moves[corresponding_scores.index(best_score)]
    return [best_score, best_move]
def test_update(self):
    """TabularQ.update writes the full reward into the prev-state row
    (alpha = gamma = 1) and infer() then picks that action."""
    t = TTT(3)
    prev_state = np.array([[1, 1, 0], [-1, -1, 0], [0, 0, 0]]).reshape(-1)
    next_state = np.array([[1, 1, 1], [-1, -1, 0], [0, 0, 0]]).reshape(-1)
    # Completing the top row terminates the game with score 5.
    self.assertEqual(t.get_result(next_state),
                     {'terminated': True, 'score': 5})
    q = TabularQ(3)
    q.set_params(alpha=1, gamma=1)
    encoded_prev_state = t.get_encoded_state(prev_state)
    prev_state_index = q.get_index(encoded_prev_state)
    encoded_next_state = t.get_encoded_state(next_state)
    # Terminal states are not indexed in the table.
    self.assertEqual(q.get_index(encoded_next_state), None)
    q.update(encoded_prev_state, 2, encoded_next_state, 5)
    # With alpha = gamma = 1 the whole reward lands in column 2.
    self.assertTrue(np.array_equal(q._Q[prev_state_index, :],
                                   [0, 0, 5, 0, 0, 0, 0, 0, 0]))
    # test correct inference :
    q._is_first_mover = True
    possible_moves = t.get_available_positions(prev_state)
    self.assertEqual(q.infer(encoded_prev_state, possible_moves, 1), 2)
def minimax_save(state, mover: int, t: TTT, table) -> (Score, Move):
    """Memoized minimax: caches (best_score, best_move) in `table`,
    keyed by the encoded state.

    Returns (best_score, best_move) for `mover` at `state`.
    """
    encoded_state = encode_state(state)
    # BUG FIX: the original tested `encode_state in table` — the *function
    # object*, not the key — so the memo table never produced a hit.
    if encoded_state in table:
        return table[encoded_state]
    # BUG FIX: compare ints with ==, not identity (`is`).
    next_mover = -1 if mover == 1 else 1
    possible_moves = t.get_available_positions(state)
    corresponding_scores = []
    best_score = 0
    best_move = None
    for index in possible_moves:
        next_state = state.copy()
        next_state[index] = mover
        if t.is_terminated(next_state):
            score = t.get_score(next_state)
        else:
            [score, _] = minimax_save(next_state, next_mover, t, table)
        corresponding_scores.append(score)
    if mover == 1:
        best_score = max(corresponding_scores)
        best_move = possible_moves[corresponding_scores.index(best_score)]
    elif mover == -1:
        best_score = min(corresponding_scores)
        best_move = possible_moves[corresponding_scores.index(best_score)]
    table[encoded_state] = (best_score, best_move)
    return (best_score, best_move)
def _train_against(self, opponent_agent: Callable[[np.ndarray], int], numOfGames: int) -> None:
    """Train the Q-agent by playing `numOfGames` games against a fixed
    opponent policy.

    The agent picks epsilon-greedy moves on its own turns; the resulting
    state and score observed on the opponent's turn drive the Q-update
    for the agent's previous (state, action) pair.
    """
    q_moves_first = self._is_first_mover
    for _ in tqdm(range(numOfGames)):
        game = TTT(self._size)
        first_players_turn = True
        # prev state / action come from the agent's turn; the update is
        # applied on the opponent's turn once the next state is known.
        encoded_prev_state = None
        move_taken = None
        while True:
            if first_players_turn is q_moves_first:
                # Q-agent's turn.
                if game.is_terminated():
                    break
                possible_moves = game.get_available_positions()
                encoded_prev_state = game.get_encoded_state()
                move_taken = self._epsilon_greedy_train(encoded_prev_state,
                                                        possible_moves)
                game.put(move_taken)
            else:
                # Opponent's turn (treated as the sampling step).
                if not game.is_terminated():
                    game.put(opponent_agent(game.get_state()))
                encoded_next_state = game.get_encoded_state()
                score = game.get_score()
                # Skip the very first opponent move when Q is second mover.
                if encoded_prev_state is not None:
                    self.update(encoded_prev_state, move_taken,
                                encoded_next_state, score)
            first_players_turn = not first_players_turn
    return None
def _train_both(self, numOfGames):
    """Self-play training: the same Q-table chooses moves for both sides,
    with an update after every single move."""
    for _ in tqdm(range(numOfGames)):
        game = TTT(self._size)
        self._is_first_mover = True
        # Play one complete game.
        while True:
            encoded_prev_state = game.get_encoded_state()
            chosen_move = self._epsilon_greedy_train(
                encoded_prev_state, game.get_available_positions())
            game.put(chosen_move)
            encoded_next_state = game.get_encoded_state()
            result = game.get_result()
            self.update(encoded_prev_state, chosen_move,
                        encoded_next_state, result['score'])
            if result['terminated']:
                break
def test_deterministic_vs_minimax(self):
    """Compare trained Q-values against exact minimax at terminal moves.

    With gamma = alpha = 1, for an end-state s and optimal move a,
    Q(s, a) = R(s, a) provided Q(s, a) != 0 (i.e. (s, a) was sampled).
    Here R(s, a) is the score of the terminated state.
    """
    parameters = {
        "ep_train": 0.5,
        "ep_infer": 0,
        "gamma": 1,
        "alpha": 1,
        "agent_for": 'both',
    }
    q = TabularQ(3)
    q.set_params(**parameters)
    q.train(numOfGames=500)
    s = Settings()
    minimax = minimax_load(s.path('minimax'))
    t = TTT(3)
    Q = q._Q
    # Row indices with at least one non-zero entry (may repeat; harmless).
    to_check_state_indices = np.where(Q != [0, 0, 0, 0, 0, 0, 0, 0, 0])[0]
    to_check_state_indices = map(int, to_check_state_indices)
    for state_index in to_check_state_indices:
        self.assertFalse(
            np.array_equal(Q[state_index], np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])))
        state = q.get_state(state_index)
        encoded_state = t.get_encoded_state(state)
        mover = t.get_mover(state=state)
        possible_moves = t.get_available_positions(state)
        if mover == 1:
            best_move_q = np.argmax(Q[state_index])
            # BUG FIX: `is not 0` tested identity on a numpy-derived int
            # (SyntaxWarning, fragile); use a value comparison.
            if int(Q[state_index, best_move_q]) != 0:
                move_inferred = q.infer(encoded_state, possible_moves, mover)
                q_value_1 = Q[state_index, best_move_q]
                q_value_2 = Q[state_index, move_inferred]
                self.assertEqual(q_value_1, q_value_2)
        elif mover == -1:
            best_move_q = np.argmin(Q[state_index])
            # BUG FIX: same identity-comparison fix as the maximizer branch.
            if int(Q[state_index, best_move_q]) != 0:
                move_inferred = q.infer(encoded_state, possible_moves, mover)
                q_value_1 = Q[state_index, best_move_q]
                q_value_2 = Q[state_index, move_inferred]
                self.assertEqual(q_value_1, q_value_2)
        next_state = state.copy()
        next_state[best_move_q] = mover
        result = t.get_result(next_state)
        if result['terminated']:
            best_score, _ = minimax(state)
            q_value = Q[state_index, best_move_q]
            if best_score != q_value:
                # (s, a) not yet sampled, or a draw case.
                self.assertEqual(q_value, 0)
            else:
                # Sampled (s, a): Q must match the exact minimax score.
                self.assertEqual(best_score, q_value)
class ABPruning:
    """Alpha-beta pruned minimax player for TTT.

    In 'optimal' mode the search is exact. In 'modified' mode a random
    fraction of the available moves is discarded on every node — a
    handicap controlled by `penalty_prob`.
    """

    def __init__(self, size=3):
        self._t = TTT(size)
        self._mode = 'optimal'       # 'optimal' or 'modified'
        self._penalty_prob = 0       # fraction of moves dropped in 'modified'

    def set_penalty(self, penalty_prob=0):
        """Set the move-dropping probability; any value > 0 switches the
        player into 'modified' mode."""
        # Idiom fix: isinstance instead of type() == comparisons.
        assert isinstance(penalty_prob, (int, float))
        assert 0 <= penalty_prob <= 1
        if penalty_prob > 0:
            self._mode = 'modified'
            self._penalty_prob = penalty_prob
        return

    def get(self, state: np.ndarray, mover: int) -> (Score, Move):
        """Return (best_score, best_move) for `mover` in `state` under the
        currently configured mode."""
        if self._mode == 'optimal':
            return self._optimal(state, mover)
        elif self._mode == 'modified':
            return self._modified(state, mover)

    def _optimal(self, state, mover: int, alpha=-1000, beta=1000) -> (Score, Move):
        """Exact alpha-beta search; mover 1 maximizes, -1 minimizes."""
        t = self._t
        # BUG FIX: `mover is 1` compared identity; use equality.
        next_mover = -1 if mover == 1 else 1
        possible_moves = t.get_available_positions(state)
        best_move = None
        best_score = None
        if mover == 1:
            # Maximizer.
            best_score = -1000
            for i in possible_moves:
                next_state = state.copy()
                next_state[i] = mover
                if t.is_terminated(next_state):
                    score = t.get_score(next_state)
                else:
                    [score, _] = self._optimal(next_state, next_mover, alpha, beta)
                if score > best_score:
                    best_score = score
                    best_move = i
                # NOTE(review): assigning alpha = best_score can lower alpha
                # below the inherited bound, weakening (not invalidating)
                # pruning — consider max(alpha, best_score).
                alpha = best_score
                if alpha >= beta:
                    break  # prune: the minimizer above won't allow this line
        elif mover == -1:
            # Minimizer.
            best_score = 1000
            for i in possible_moves:
                next_state = state.copy()
                next_state[i] = mover
                if t.is_terminated(next_state):
                    score = t.get_score(next_state)
                else:
                    [score, _] = self._optimal(next_state, next_mover, alpha, beta)
                if score < best_score:
                    best_score = score
                    best_move = i
                beta = best_score
                if alpha >= beta:
                    break
        return (best_score, best_move)

    def _modified(self, state, mover: int, alpha=-1000, beta=1000) -> (Score, Move):
        """Alpha-beta search over a randomly reduced move set (handicap)."""
        t = self._t
        # BUG FIX: `mover is 1` compared identity; use equality.
        next_mover = -1 if mover == 1 else 1
        possible_moves = self._get_reduced_moves(state)
        best_move = None
        best_score = None
        if mover == 1:
            # Maximizer.
            best_score = -1000
            for i in possible_moves:
                next_state = state.copy()
                next_state[i] = mover
                if t.is_terminated(next_state):
                    score = t.get_score(next_state)
                else:
                    [score, _] = self._modified(next_state, next_mover, alpha, beta)
                if score > best_score:
                    best_score = score
                    best_move = i
                alpha = best_score
                if alpha >= beta:
                    break
        elif mover == -1:
            # Minimizer.
            best_score = 1000
            for i in possible_moves:
                next_state = state.copy()
                next_state[i] = mover
                if t.is_terminated(next_state):
                    score = t.get_score(next_state)
                else:
                    [score, _] = self._modified(next_state, next_mover, alpha, beta)
                if score < best_score:
                    best_score = score
                    best_move = i
                beta = best_score
                if alpha >= beta:
                    break
        return (best_score, best_move)

    def _get_reduced_moves(self, state) -> list:
        """Randomly keep a (1 - penalty_prob) fraction of the available
        moves, never fewer than one (while moves remain).

        Annotation fixed: the original declared -> tuple but always
        returned a list.
        """
        all_moves = self._t.get_available_positions(state)
        if len(all_moves) == 0:
            return []
        keep = int(len(all_moves) * (1 - self._penalty_prob))
        if keep == 0:
            keep = 1
        return random.sample(all_moves, keep)
def test_availables(self):
    """get_available_positions returns the flat indices of empty (0) cells."""
    board = np.array([[1, -1, 0], [0, 1, -1], [1, -1, 0]]).reshape(-1)
    game = TTT(3)
    self.assertListEqual(game.get_available_positions(board), [2, 3, 8])