def play(action_modes):
    """Run a single game."""
    # Initialize the "State" class that holds the tic-tac-toe position.
    state = game.State()

    # Loop until the game is over (checked with State.is_done()).
    while not state.is_done():
        # Get the action for the player to move.
        action_mode = action_modes[0] if state.is_first_player() else action_modes[1]
        action = ai.action(state, action_mode)

        # Advance to the next state with the action applied.
        state = state.next(action)

    # Return the first player's points.
    return first_player_point(state)
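# A minimal sketch of the first_player_point() helper that play() returns; it is not
# shown in this snippet. It assumes State also exposes is_lose() and is_draw()-style
# terminal checks -- the is_lose() method name is an assumption, not confirmed here.
def first_player_point(ended_state):
    # 1: first player won, 0: first player lost, 0.5: draw.
    # In a terminal state, is_lose() refers to the player whose turn it would be next.
    if ended_state.is_lose():
        return 0 if ended_state.is_first_player() else 1
    return 0.5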
def run(user, opponent, opponentFirst):
    s = tictactoe.State()
    if opponentFirst:
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
    printBoard(s)
    while True:
        a = getUserAction(user, s)
        s = tictactoe.takeAction(user, s, a)
        printBoard(s)
        if s.terminal():
            break
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
        printBoard(s)
        if s.terminal():
            break
    printWinner(s, user)
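# Usage sketch (not part of the original snippet): play interactively as circle against a
# value-function opponent that moves first. The PlayerCircle/PlayerCross constants and
# ActionValueFunc are assumed to exist as they are used elsewhere in this code.
opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCross)
run(tictactoe.PlayerCircle, opponent, opponentFirst=True)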
def rewardPerEpisode(q, gamma):
    if q.player == tictactoe.PlayerCircle:
        opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCross)
    else:
        opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCircle)

    rpe = 0.0  # reward per episode
    t = 0      # time step
    s = tictactoe.State()

    # Randomly determine whether the player or her opponent moves first.
    if random.random() < 0.5:
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
        t += 1

    while True:
        # The player makes a move and defers observing the reward until her opponent has
        # made his move. Only when her move is the last move of the game does she observe
        # the reward immediately before exiting.
        a = tictactoe.chooseAction(q, s, 0)
        s1 = tictactoe.takeAction(q.player, s, a)
        t += 1
        if s1.terminal():
            reward = tictactoe.observeReward(q.player, s1)
            rpe += math.pow(gamma, t) * reward
            break

        # The opponent makes a move, and the player observes the resulting state to
        # calculate her reward.
        opponentAction = tictactoe.chooseAction(opponent, s1, 0)
        s2 = tictactoe.takeAction(opponent.player, s1, opponentAction)
        t += 1
        reward = tictactoe.observeReward(q.player, s2)
        rpe += math.pow(gamma, t) * reward

        s = s2
        if s.terminal():
            break

    return rpe
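# Usage sketch (an assumption, not part of the original code): estimate the quality of a
# trained action-value function by averaging the discounted reward per episode over many
# greedy games against a fresh opponent. The helper name is hypothetical.
def averageRewardPerEpisode(q, gamma=0.9, episodes=1000):
    total = 0.0
    for _ in range(episodes):
        total += rewardPerEpisode(q, gamma)
    return total / episodes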
def runEpisode(algo, q, epsilon, alpha, gamma):
    s = tictactoe.State()
    a = tictactoe.chooseAction(q[0], s, epsilon)
    s1 = tictactoe.takeAction(q[0].player, s, a)
    while True:
        # After the first player has made her move, let the second make his move, too.
        # The resulting state s2 is effectively the outcome of the action taken by the
        # first player earlier. From the first player's point of view, with
        #
        #   * the current state: "s"
        #   * the taken action:  "a"
        #   * the new state:     "s2"
        #
        # we can update her action-value function according to the algorithm.
        opponentAction = tictactoe.chooseAction(q[1], s1, epsilon)
        s2 = tictactoe.takeAction(q[1].player, s1, opponentAction)
        if algo == SARSA:
            SARSA(q[0], s, a, s2, epsilon, alpha, gamma)
        else:
            QLearning(q[0], s, a, s2, alpha, gamma)

        # Roll forward states and switch sides.
        s = s1
        s1 = s2
        a = opponentAction
        q[0], q[1] = q[1], q[0]

        # When the game ends, due to a time-step lag, the player that made the last move
        # has not observed the reward yet. Let her observe the terminal state and update
        # her action-value function before leaving.
        if s1.terminal():
            if algo == SARSA:
                SARSA(q[0], s, a, s1, epsilon, alpha, gamma)
            else:
                QLearning(q[0], s, a, s1, alpha, gamma)
            break
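# A hedged sketch of the SARSA and QLearning update rules that runEpisode() dispatches to.
# The q.get(state, action) / q.set(state, action, value) accessors on the action-value
# function are assumptions made for illustration; chooseAction and observeReward are used
# exactly as in the surrounding code.
def SARSA(q, s, a, s1, epsilon, alpha, gamma):
    reward = tictactoe.observeReward(q.player, s1)
    if s1.terminal():
        target = reward
    else:
        # On-policy: bootstrap from the epsilon-greedy action that would actually be taken.
        a1 = tictactoe.chooseAction(q, s1, epsilon)
        target = reward + gamma * q.get(s1, a1)
    q.set(s, a, q.get(s, a) + alpha * (target - q.get(s, a)))

def QLearning(q, s, a, s1, alpha, gamma):
    reward = tictactoe.observeReward(q.player, s1)
    if s1.terminal():
        target = reward
    else:
        # Off-policy: bootstrap from the greedy (epsilon = 0) action, i.e. the max-value action.
        a1 = tictactoe.chooseAction(q, s1, 0)
        target = reward + gamma * q.get(s1, a1)
    q.set(s, a, q.get(s, a) + alpha * (target - q.get(s, a)))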
# else:
#     raise game.Error("Please select a valid player.")

P1 = utic.OurPlayer(1)
P2 = AIPlayer(2)
# if not args.p2 or args.p2 == 'RP':
#     P2 = utic.RandomPlayer(2)
# elif args.p2 == 'AP':
#     P2 = AIPlayer(2)
# elif args.p2 == 'OP':
#     P2 = utic.OurPlayer(2)
# else:
#     raise game.Error("Please select a valid player.")

State = utic.State([P1, P2], 2)

# Change the third argument to True to print the gamestate after every move.
# Change the fourth argument to True to wait for keyboard input to move to the next state.
# Press enter to advance the game by two moves.
Game = utic.TicTacToeGame(State, [P1, P2], False, False)
Game.run()
Game.genScore()
print Game.score

moves = ""
for i in xrange(len(Game.State.moves) - 1):
    moves += str(Game.State.moves[i]) + ","
moves += str(Game.State.moves[len(Game.State.moves) - 1])

# I'm assuming a table Id(int), IfWon(bool), IfTie(bool), Moves(string), Result(string), Score(int)
print Game.State.winner
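# A hedged sketch of persisting the result into the table described in the comment above,
# using sqlite3. The database file, table name, and how IfWon/IfTie/Result map onto
# Game.State.winner are all assumptions made for illustration.
import sqlite3

conn = sqlite3.connect("games.db")
conn.execute(
    "CREATE TABLE IF NOT EXISTS games "
    "(Id INTEGER PRIMARY KEY, IfWon BOOLEAN, IfTie BOOLEAN, Moves TEXT, Result TEXT, Score INTEGER)"
)
conn.execute(
    "INSERT INTO games (IfWon, IfTie, Moves, Result, Score) VALUES (?, ?, ?, ?, ?)",
    (Game.State.winner == P1, Game.State.winner is None, moves, str(Game.State.winner), Game.score),
)
conn.commit()
conn.close()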
def test_first_column_player_one_wins(self):
    s = tictactoe.State([[1, 1, None], [1, 1, None], [1, None, None]], 1)
    self.assertEqual(tictactoe.did_win(s), True)
def test_empty_board_noone_wins(self):
    s = tictactoe.State([[None, None, None], [None, None, None], [None, None, None]], 1)
    self.assertEqual(tictactoe.did_win(s), False)
def test_noone_wins(self):
    s = tictactoe.State([[2, 1, None], [1, 2, None], [2, None, None]], 1)
    self.assertEqual(tictactoe.did_win(s), False)
def test_top_diagonal_player_zero_wins(self):
    s = tictactoe.State([[2, 1, None], [1, 2, None], [2, None, 2]], 1)
    self.assertEqual(tictactoe.did_win(s), True)
def test_stalemate(self):
    s = tictactoe.State([[1, 1, 2], [2, 1, 1], [1, 2, 2]], 1)
    self.assertEqual(tictactoe.did_win(s), False)
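# A hedged sketch of the did_win() helper these tests exercise, assuming State keeps the
# 3x3 grid on a .board attribute (the attribute name is an assumption). It reports whether
# *any* player has completed a line, which matches the expectations above: the diagonal of
# 2s counts as a win even though the State was constructed with player 1.
def did_win(state):
    b = state.board
    lines = (
        [b[r] for r in range(3)]                            # rows
        + [[b[r][c] for r in range(3)] for c in range(3)]   # columns
        + [[b[i][i] for i in range(3)],                     # main diagonal
           [b[i][2 - i] for i in range(3)]]                 # anti-diagonal
    )
    return any(line[0] is not None and line.count(line[0]) == 3 for line in lines)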
            elif temp[1]:
                state.data[i] = 2
        return state

    def state_size(self):
        return 2 * (self.dim * self.dim)


if __name__ == "__main__":
    DIM = 4
    MARK = "O"

    actions = []
    for pos in xrange(DIM**2):
        actions.append(tictactoe.Action(pos, MARK))

    opp = tictactoe.RandomPlayer("X")
    state_parser = TicTacToeStateParser(DIM)
    world = TicTacToeWorld(DIM, MARK, opp, actions, state_parser)

    rl = DQN(world, state0=tictactoe.State(DIM))
    rl.buffer_size = 10000
    rl.batch_size = 100
    rl.clone_network_steps = 50
    try:
        rl.train(4000)
    finally:
        print "SAVING FILES...",
        rl.save_data("graph/dqn_ttt_" + str(DIM))
        print "DONE"
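# A hedged sketch of the encoding direction implied by TicTacToeStateParser.state_size()
# above: each cell expands to two binary features, one per player, so the feature vector
# has length 2 * dim * dim. The helper name and the 0/1/2 cell encoding for empty/"O"/"X"
# are assumptions inferred from the decoding fragment at the top of the class.
def encode_state(state):
    features = []
    for v in state.data:
        features.append(1.0 if v == 1 else 0.0)
        features.append(1.0 if v == 2 else 0.0)
    return features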
    def load(self, n):
        i = self.dim**2 - 1
        state = tictactoe.State(self.dim)
        while n != 0:
            d = 3**i
            state.data[i] = n // d
            n = n % d
            i -= 1
        return state


if __name__ == "__main__":
    DIM = 4
    MARK = "O"

    actions = []
    for pos in xrange(DIM**2):
        actions.append(tictactoe.Action(pos, MARK))

    opp = tictactoe.RandomPlayer("X")
    state_parser = TicTacToeStateParser(DIM)
    world = TicTacToeWorld(DIM, MARK, opp, actions, state_parser)

    qlearning = QLearning(world, state0=tictactoe.State(DIM))
    try:
        qlearning.train(4000)
    finally:
        print "SAVING FILES...",
        qlearning.save_data("graph/ql_ttt_" + str(DIM))
        print "DONE"
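# A hedged sketch of the inverse of load() above: packing a board into a single base-3
# integer (cell values 0/1/2, most significant digit at the highest index), so that
# load(save(state, dim)) round-trips. The function name and signature are assumptions.
def save(state, dim):
    n = 0
    for i in xrange(dim ** 2):
        n += state.data[i] * 3 ** i
    return n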