import pickle
import random


def main(rounds):
    # Training: run Q-Learning episodes, then save the learned Q-values and policy.
    global ticTacToe
    global wins
    global losses
    initial_q_value = 0.0
    alpha = 0.5    # Step size
    gamma = 1.0    # Discount factor
    epsilon = 0.2  # Exploration rate
    actions = [(i, j) for i in range(3) for j in range(3)]
    actions_per_state = {a: initial_q_value for a in actions}
    Q_values = {'terminal': actions_per_state.copy()}
    Q_values['terminal'][None] = 0.0
    first_turn_random_count = 0

    # Running Q-Learning Q-value updates for many episodes
    for i in range(rounds):
        if i % 10000 == 0:
            # wins and losses are module-level counters updated inside Q_Learning;
            # reset them so each printout covers only the last 10,000 episodes.
            print("Rounds done = {}".format(i), end=" | ")
            print("Wins = {}".format(wins), end=" | ")
            print("Losses = {}".format(losses))
            wins = 0
            losses = 0
        ticTacToe = TicTacToe()
        first_turn = random.choice(['random', 'computer'])
        if first_turn == 'random':
            first_turn_random_count += 1
            # Opposition player opens the game with an epsilon-greedy move on the
            # shared Q-values; fall back to a random empty cell if the chosen
            # cell is already occupied.
            selected_grid = epsilon_greedy_for_opposition(
                Q_values, ticTacToe.get_current_state(), actions_per_state, epsilon)
            if selected_grid not in ticTacToe.get_empty_cells():
                selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            ticTacToe.toggle_turn()
        Q_values = Q_Learning(Q_values, alpha, gamma, epsilon, actions_per_state)

    # pprint(Q_values)
    policy = find_optimal_policy(Q_values)
    print("First turn by random players = {}%".format(first_turn_random_count * 100 / rounds))

    filename = "Q_values_{}_episodes_025_epsilon.p".format(rounds)
    with open(filename, "wb") as f:
        pickle.dump(Q_values, f)
    filename = "policy_{}_episodes_025_epsilon.p".format(rounds)
    with open(filename, "wb") as f:
        pickle.dump(policy, f)
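Two helpers called above, epsilon_greedy_for_opposition and find_optimal_policy, are not shown in this listing. The bodies below are a minimal sketch reconstructed from how the calls are used, assuming Q_values maps each state key to a dict of (row, col) -> value entries; they are not the author's confirmed implementations.

    import random

    def epsilon_greedy_for_opposition(Q_values, state, actions_per_state, epsilon):
        # Look up this state's action values, falling back to the shared
        # defaults for unvisited states (an assumption of this sketch).
        action_values = Q_values.get(state, actions_per_state)
        if random.random() < epsilon:
            # Explore: pick any of the nine cells uniformly at random.
            return random.choice(list(action_values))
        # Exploit: pick the cell with the highest Q-value. The caller re-checks
        # that the cell is actually empty and falls back to a random one if not.
        return max(action_values, key=action_values.get)

    def find_optimal_policy(Q_values):
        # Map every non-terminal state seen during training to its greedy action.
        return {state: max(action_values, key=action_values.get)
                for state, action_values in Q_values.items()
                if state != 'terminal'}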
def main(rounds):
    # Evaluation: play the learned policy against a purely random player.
    # `policy` is assumed to be loaded at module level from the pickled file
    # saved by the training run (see the usage sketch below).
    global ticTacToe
    wins, losses = 0, 0
    first_turn_random_count = 0

    # Playing many evaluation games against the random player
    for i in range(rounds):
        if i % 10000 == 0:
            print("Rounds done = {}".format(i), end=" | ")
            print("Wins = {}".format(wins), end=" | ")
            print("Losses = {}".format(losses))
            wins = 0
            losses = 0
        ticTacToe = TicTacToe()
        current_state = None
        first_turn = random.choice(['random', 'computer'])
        if first_turn == 'random':
            first_turn_random_count += 1
            # Random player playing one turn
            selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            ticTacToe.toggle_turn()
        while current_state != 'terminal':
            current_state = ticTacToe.get_current_state()
            # Follow the learned policy; fall back to a random empty cell if the
            # state was never seen during training or the suggested cell is taken.
            try:
                selected_grid = policy[current_state]
                if selected_grid not in ticTacToe.get_empty_cells():
                    selected_grid = random.choice(ticTacToe.get_empty_cells())
            except KeyError:
                selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            solved, result = ticTacToe.is_solved()
            if solved:
                if result != 0:
                    wins += 1
                break
            ticTacToe.toggle_turn()

            # Random player playing one turn
            selected_grid = random.choice(ticTacToe.get_empty_cells())
            ticTacToe.set_one_grid(selected_grid[0], selected_grid[1])
            solved, result = ticTacToe.is_solved()
            if solved:
                if result != 0:
                    losses += 1
                break
            ticTacToe.toggle_turn()

    print("First turn by random players = {}%".format(first_turn_random_count * 100 / rounds))
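For completeness, a hypothetical usage sketch showing how the pickled policy from the training run might be loaded before calling the evaluation function. The filename below assumes a 500,000-episode training run and is illustrative only; it must match whatever the training script actually produced.

    import pickle

    # Hypothetical: load the greedy policy saved by the training script.
    with open("policy_500000_episodes_025_epsilon.p", "rb") as f:
        policy = pickle.load(f)

    main(100000)  # play 100,000 evaluation games against the random player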