import numpy as np
import pytest
from unittest.mock import Mock

# TicTacToeEnv, PLAYER1 and PLAYER2 come from the project's environment module.


def test_step_with_action_taken_by_opponent():
    mock_opponent = Mock()
    env = TicTacToeEnv(mock_opponent)
    env.reset()
    attrs = {
        "select_action.side_effect": [1, 3],
        "get_player_number.return_value": 2
    }
    mock_opponent.configure_mock(**attrs)
    env.step(0)
    # cell 1 was just taken by the opponent, so the move must be rejected
    with pytest.raises(ValueError):
        env.step(1)
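# The test above relies on the environment rejecting a move onto an occupied
# cell. A guard consistent with that behavior could look like the following
# sketch; this is an assumption about TicTacToeEnv.step, not its actual code.
def _validate_action_sketch(board, action):
    row, col = divmod(action, 3)
    if board[row, col] != 0:
        raise ValueError("cell %d is already taken" % action)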
def _play_out(agent_actions, opponent_actions, expected_reward):
    mock_opponent = Mock()
    env = TicTacToeEnv(mock_opponent)
    _, is_done = env.reset()
    attrs = {
        "select_action.side_effect": opponent_actions,
        "get_player_number.return_value": 2
    }
    mock_opponent.configure_mock(**attrs)
    expected_state = np.zeros((3, 3), dtype=int)
    i = 0
    # call env.step until the game ends
    while not is_done:
        state, reward, is_done = env.step(agent_actions[i])
        expected_state[agent_actions[i] // 3, agent_actions[i] % 3] = PLAYER1
        if i < len(opponent_actions):
            expected_state[opponent_actions[i] // 3, opponent_actions[i] % 3] = PLAYER2
        if not is_done:
            assert np.array_equal(state, expected_state)
            assert reward == 0
            mock_opponent.select_action.assert_called_once()
            mock_opponent.reset_mock()
        i += 1
    assert np.array_equal(state, expected_state)
    assert i == len(agent_actions)
    assert reward == expected_reward
    # if the agent's final move ended the game, the opponent never got
    # another turn after the last mock reset
    if i > len(opponent_actions):
        mock_opponent.select_action.assert_not_called()
    else:
        mock_opponent.select_action.assert_called_once()
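# A concrete test built on the helper above might look like this sketch. The
# action sequence and the expected_reward of 1 for a win are illustrative
# assumptions, not values taken from the original suite.
def test_agent_wins_top_row_sketch():
    # agent claims 0, 1, 2 (the top row); the mocked opponent answers with
    # 3 and 4, and never gets a third turn because move 2 ends the game
    _play_out(agent_actions=[0, 1, 2], opponent_actions=[3, 4],
              expected_reward=1)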
def main():
    use_sarsa = '--sarsa' in sys.argv
    num_episodes = 3000
    env = TicTacToeEnv(TicTacToeDecentAlgorithmPlayer(2, 1))
    agent = TicTacToeSarsaAgent() if use_sarsa else TicTacToeQLearningAgent()
    epsilon = Epsilon(1.0, 0.95)
    rewards = []
    moves = []

    def before_episode_callback(env, agent, episode_number):
        agent.set_epsilon(epsilon.value)

    def after_episode_callback(env, agent, episode_number, reward):
        epsilon.decay()
        rewards.append(reward)
        moves.append(env.get_moves())

    if use_sarsa:
        experiment = SarsaExperiment(env, agent, before_episode_callback, after_episode_callback)
        print("using SARSA")
    else:
        experiment = QLearningExperiment(env, agent, before_episode_callback, after_episode_callback)
        print("using Q-Learning")
    experiment.experiment(num_episodes)

    num_games_to_analyze = 100
    analyzer = TicTacToeMoveAnalyzer(moves, rewards, num_games_to_analyze)
    num_wins = analyzer.num_wins()
    num_corner_openings = analyzer.num_corner_openings()
    rl_method = "SARSA" if use_sarsa else "Q-Learning"
    text = "{0} wins and {1} corner openings in the last {2} games.".format(
        num_wins, num_corner_openings, num_games_to_analyze)
    TicTacToePlotter.plot_episode_reward(rewards, text, rl_method)
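# main() only touches epsilon.value and epsilon.decay(). A minimal schedule
# compatible with that interface, assuming plain multiplicative decay (the
# project's Epsilon class may also clamp to a minimum value), is:
class EpsilonSketch:
    def __init__(self, start, decay_rate):
        self.value = start
        self.decay_rate = decay_rate

    def decay(self):
        # shrink exploration a little after every episode
        self.value *= self.decay_rate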
            action = np.r_[user_type, move_target]
            self._reset_step()
            return action
        else:
            pi = self.model.get_pi(state)
            choice = self.np_random.choice(
                9, 1, p=pi.flatten(), replace=False)
            move_target = self.action_space[choice[0]]
            action = np.r_[PLAYER, move_target]
            self._reset_step()
            return action


if __name__ == "__main__":
    # create the environment and set its seed
    env = TicTacToeEnv()
    env.seed(2018)

    # create the agent and set its seed
    my_agent = ZeroAgent()
    my_agent.seed(2018)

    # win/draw/loss statistics
    result = {1: 0, 0: 0, -1: 0}

    # play games
    for e in range(episode_count):
        state = env.reset()
        print('-' * 15, '\nepisode: %d' % (e + 1))

        # decide who takes the first turn, the agent or the opponent
        my_agent.first_turn = my_agent.np_random.choice(2, replace=False)

        done = False
        while not done:
            # choose an action (self-play mode)
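# The else-branch above samples one cell index in proportion to the model's
# move probabilities. A self-contained illustration of the same idea with
# plain NumPy; the pi values below are made up:
import numpy as np

rng = np.random.default_rng(2018)
pi = np.array([[0.05, 0.10, 0.05],
               [0.10, 0.40, 0.10],
               [0.05, 0.10, 0.05]])   # hypothetical 3x3 policy, sums to 1
move = rng.choice(9, p=pi.flatten())  # flattened cell index, center favored
row, col = divmod(move, 3)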
from tictactoe_env import TicTacToeEnv as MEnv  # main env
import copy
import random
import tkinter as tk

env = MEnv(19, 650, 650)
# deep copy so later mutations of env.action_space do not touch the backup
arraybackup = copy.deepcopy(env.action_space)

for epoch in range(0, 2):
    env.render()
    root = env.reset()
    arrayCount = 0
    for ev in range(0, 361):
        action = random.choice(random.choice(env.action_space))
        if action is None:
            if arrayCount < 361:
                # resample until we find a cell that is still available
                while action is None:
                    action = random.choice(random.choice(env.action_space))
            else:
                # the board is exhausted: restore the action space and move on
                env.action_space = copy.deepcopy(arraybackup)
                continue
        a, b = action
        arrayCount += 1
        env.action_space[a][b] = None  # mark the cell as used
        observation, reward, done, info = env.step(action)
        if done:
            continue
        root.update()
from tictactoe_env import TicTacToeEnv as MEnv  # main env
import bot
import pickle

env = MEnv(19, 650, 650)
iteration = 0
reward = 0
epochs = 10
layers = None
done = False
Gametype = "EvE"  # other modes: "PvP", "EvP"

# load previously trained weights, if any
try:
    with open('weights.pickle', 'rb') as weights:
        layers = pickle.load(weights)
    print(len(layers))
except (OSError, pickle.UnpicklingError):
    pass

env.render()
root = env.reset()

if Gametype == "EvE":
    for epoch in range(0, epochs):
        env.render()
        root = env.reset()
        #print("after reset")
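# The try/except above implies a matching save step somewhere after training.
# A sketch of that counterpart, assuming layers is whatever picklable
# structure bot produces (the function name here is hypothetical):
def save_weights(layers, path='weights.pickle'):
    # persist the trained layers so the next run can resume from them
    with open(path, 'wb') as weights:
        pickle.dump(layers, weights)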
def test_reset():
    mock_opponent = Mock()
    env = TicTacToeEnv(mock_opponent)
    state, is_done = env.reset()
    assert np.array_equal(state, np.zeros((3, 3), dtype=int))
    assert not is_done
def test_did_win():
    mock_opponent = Mock()
    env = TicTacToeEnv(mock_opponent)
    for player_a, player_b in [(PLAYER1, PLAYER2), (PLAYER2, PLAYER1)]:
        # test rows
        for i in range(3):
            env.reset()
            env.board[i, :] = player_a
            assert env._did_win(player_a)
            assert not env._did_win(player_b)

        # test columns
        for i in range(3):
            env.reset()
            env.board[:, i] = player_a
            assert env._did_win(player_a)
            assert not env._did_win(player_b)

        # test diagonals
        env.reset()
        env.board[0, 0] = player_a
        env.board[1, 1] = player_a
        env.board[2, 2] = player_a
        assert env._did_win(player_a)
        assert not env._did_win(player_b)

        env.reset()
        env.board[0, 2] = player_a
        env.board[1, 1] = player_a
        env.board[2, 0] = player_a
        assert env._did_win(player_a)
        assert not env._did_win(player_b)

    # test no winners: fill the board move by move into a drawn position
    env.reset()
    assert not env._did_win(PLAYER1)
    assert not env._did_win(PLAYER2)
    for row, col, player in [
        (0, 0, PLAYER1), (0, 1, PLAYER1), (0, 2, PLAYER2),
        (1, 0, PLAYER2), (1, 1, PLAYER2), (1, 2, PLAYER1),
        (2, 0, PLAYER1), (2, 1, PLAYER2), (2, 2, PLAYER2),
    ]:
        env.board[row, col] = player
        assert not env._did_win(PLAYER1)
        assert not env._did_win(PLAYER2)
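# The assertions above pin down what _did_win must report. One NumPy
# implementation consistent with them (a sketch, not necessarily the
# project's code):
def _did_win_sketch(board, player):
    if np.any(np.all(board == player, axis=0)):   # any full column
        return True
    if np.any(np.all(board == player, axis=1)):   # any full row
        return True
    if np.all(np.diag(board) == player):          # main diagonal
        return True
    return bool(np.all(np.diag(np.fliplr(board)) == player))  # anti-diagonal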
def test_get_moves():
    mock_opponent = Mock()
    env = TicTacToeEnv(mock_opponent)
    env.reset()
    assert not env.get_moves()
    attrs = {
        "select_action.side_effect": [3, 4, 2, 7],
        "get_player_number.return_value": 2
    }
    mock_opponent.configure_mock(**attrs)
    env.step(0)
    # moves are recorded in play order: agent, opponent, agent, ...
    assert env.get_moves() == [0, 3]
    env.step(1)
    env.step(5)
    env.step(6)
    env.step(8)
    assert env.get_moves() == [0, 3, 1, 4, 5, 2, 6, 7, 8]
    env.reset()
    assert not env.get_moves()