def run(user, opponent, opponentFirst):
    # Play an interactive game: the human controls the "user" side while the
    # action-value function "opponent" picks its moves greedily (epsilon = 0).
    s = tictactoe.State()
    if opponentFirst:
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
        printBoard(s)
    while True:
        a = getUserAction(user, s)
        s = tictactoe.takeAction(user, s, a)
        printBoard(s)
        if s.terminal():
            break
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
        printBoard(s)
        if s.terminal():
            break
    printWinner(s, user)
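# Usage sketch (illustrative only): after training an action-value function with
# runEpisode below, hand it to run() to play against it interactively. The name
# "trainedOpponent" is hypothetical, not something defined in this module.
#
#   run(tictactoe.PlayerCross, trainedOpponent, opponentFirst=True)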
def rewardPerEpisode(q, gamma):
    # Play one greedy (epsilon = 0) evaluation game against a fresh opponent and
    # return the discounted return (sum over t of gamma^t * reward_t) the player
    # accumulates over the episode.
    if q.player == tictactoe.PlayerCircle:
        opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCross)
    else:
        opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCircle)
    rpe = 0.0  # reward per episode
    t = 0      # time step
    s = tictactoe.State()
    # Randomly determine whether the player or her opponent should move first.
    if random.random() < 0.5:
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
        t += 1
    while True:
        # The player makes a move and defers observing the reward until her opponent
        # has made his move. Only in the special case where her move is the last move
        # of the game does she observe the reward immediately before exiting.
        a = tictactoe.chooseAction(q, s, 0)
        s1 = tictactoe.takeAction(q.player, s, a)
        t += 1
        if s1.terminal():
            reward = tictactoe.observeReward(q.player, s1)
            rpe += math.pow(gamma, t) * reward
            break
        # The opponent makes a move, and the player observes the resulting state to
        # calculate her reward.
        opponentAction = tictactoe.chooseAction(opponent, s1, 0)
        s2 = tictactoe.takeAction(opponent.player, s1, opponentAction)
        t += 1
        reward = tictactoe.observeReward(q.player, s2)
        rpe += math.pow(gamma, t) * reward
        s = s2
        if s.terminal():
            break
    return rpe
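# Because rewardPerEpisode plays a single game, with a coin flip deciding who moves
# first, a common way to track learning progress is to average it over several
# evaluation games. This helper is a sketch; "averageRewardPerEpisode" is an
# illustrative name, not one defined in this module.
def averageRewardPerEpisode(q, gamma, episodes=100):
    # Play "episodes" independent greedy games and average their discounted returns.
    return sum(rewardPerEpisode(q, gamma) for _ in range(episodes)) / episodes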
def runEpisode(algo, q, epsilon, alpha, gamma):
    s = tictactoe.State()
    a = tictactoe.chooseAction(q[0], s, epsilon)
    s1 = tictactoe.takeAction(q[0].player, s, a)
    while True:
        # After the first player has made her move, let the second make his move, too.
        # The resulting state s2 is effectively the outcome of the action taken by the
        # first player earlier. From the first player's point of view, with
        #
        #   * the current state: "s"
        #   * the taken action:  "a"
        #   * the new state:     "s2"
        #
        # we can update her action-value function according to the algorithm.
        opponentAction = tictactoe.chooseAction(q[1], s1, epsilon)
        s2 = tictactoe.takeAction(q[1].player, s1, opponentAction)
        if algo == SARSA:
            SARSA(q[0], s, a, s2, epsilon, alpha, gamma)
        else:
            QLearning(q[0], s, a, s2, alpha, gamma)
        # Roll forward states and switch sides.
        s = s1
        s1 = s2
        a = opponentAction
        q[0], q[1] = q[1], q[0]
        # When the game ends, due to a time step lag, the player that made the last move
        # has not observed the reward yet. Let her observe the terminal state and update
        # her action-value function before leaving.
        if s1.terminal():
            if algo == SARSA:
                SARSA(q[0], s, a, s1, epsilon, alpha, gamma)
            else:
                QLearning(q[0], s, a, s1, alpha, gamma)
            break
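# A minimal training-loop sketch tying the pieces together. It only reuses names
# that appear above (tictactoe.ActionValueFunc, runEpisode, rewardPerEpisode, SARSA);
# the helper name "train" and the hyperparameter defaults are illustrative
# assumptions, not this module's actual entry point.
def train(algo, episodes=10000, epsilon=0.1, alpha=0.1, gamma=0.9):
    # Two action-value functions learn by playing against each other.
    q = [tictactoe.ActionValueFunc(tictactoe.PlayerCross),
         tictactoe.ActionValueFunc(tictactoe.PlayerCircle)]
    for _ in range(episodes):
        # runEpisode mutates q in place, swapping q[0] and q[1] as the sides alternate.
        runEpisode(algo, q, epsilon, alpha, gamma)
    return q

# Example: q = train(SARSA); print(rewardPerEpisode(q[0], 0.9))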