def train(self, total_games=500):
    """Train a fresh Q-learning agent by playing *total_games* games.

    Relies on module-level ``game``, ``pygame`` and ``QLearningAgent``
    (defined elsewhere in this file/module).

    :param total_games: number of complete games to play before saving.
    :returns: the trained ``QLearningAgent`` (its policy is also persisted
        via ``save_policy()``).
    """
    # Hyper-parameters are fixed here; epsilon starts fully exploratory.
    learner = QLearningAgent(
        epsilon=1,
        fixed_epsilon=None,
        alpha=0.5,
        gamma=0.9,
        total_games=total_games,
    )
    for _ in range(total_games):
        last_action = None  # kept for parity with game.play's return value
        while True:
            if game.game_results() is not None:
                # Game finished: show the final frame, then reset the board
                # for the next iteration of the outer loop.
                pygame.display.flip()
                game.reset()
                game.initialize()
                break
            last_action = game.play(learner)
            pygame.display.flip()
    learner.save_policy()
    return learner
from tqdm import tqdm


def q(text=''):
    """Debug helper: print *text* between ``>`` and ``<`` markers, then exit."""
    print(f'>{text}<')
    # NOTE(review): relies on `sys` being imported elsewhere in the file — confirm
    sys.exit()


from environment import TicTacToe
from agent import QLearningAgent, Hoooman
import config as cfg
from config import display_board

# initializing the TicTacToe environment and a QLearningAgent
# (the master Tic-Toc-Toc player, your opponent!)
env = TicTacToe()
player1 = QLearningAgent(name=cfg.playerX_QLearningAgent_name)
player1.loadQtable()  # load the learnt Q-Table
player1.epsilon = 0.0  # greedy actions only, 0 exploration

# initializing the agent class that lets you, the human user, take the actions in the game
player2 = Hoooman()

# replay decides whether to rematch or not, at the end of a game
replay = True
while replay:
    done = False  # the episode goes on as long as done is False
    # deciding which player makes a move first
    # NOTE(review): `random` is presumably imported elsewhere in the file — confirm
    playerID = random.choice([True, False])  # True means player1
    # ... loop body continues beyond this chunk
if __name__ == '__main__': episode = 30 # 训练多少回合 epsilon = 0.8 # 使用历史经验的概率, 若值为0.9,则有 90% 的情况下,会根据历史经验选择 action, 10% 的情况下,随机选择 action learning_rate = 0.01 # 根据公式可知,该值越大,则旧训练数据被保留的就越少 discount_factor = 0.9 # from maze_game.game import startGame env = startGame() key = input('Do you want to see the training process? [y] ') key = key == 'y' or key == '' agent = QLearningAgent( epsilon=epsilon, learning_rate=learning_rate, discount_factor=discount_factor, actions=Game.DIRECTION.ACTIONS ) successful_step_counter_arr = [] failed_step_counter_arr = [] if key: env.display() for eps in range(1, episode + 1): cur_state = env.reset() step_counter = 0 while True: step_counter += 1
import numpy as np
import pandas as pd
from env import TicTacToeEnv
from agent import QLearningAgent

# Train a Q-learning agent on the TicTacToe environment by self-play.
env = TicTacToeEnv()
agent = QLearningAgent(env)
for game_nr in range(1000000):
    if game_nr % 10000 == 0:
        print(game_nr)  # coarse progress indicator
    done = False
    # .copy() so later environment mutations don't alias the stored state
    s = env.reset().copy()
    # print('Init', s)
    while not done:
        a = agent.take_action(s)
        r, s_, done, _ = env.step(a)
        agent.learn(s, a, r, s_, done)
        # print(s, a, r, s_, done)
        s = s_.copy()

# Export the learnt value table (V) and visit counts (N) for inspection.
# NOTE(review): reaches into the agent's private `_V`/`_N` dicts — assumes
# they map state keys to scalars; confirm against the agent implementation.
V = pd.DataFrame.from_dict(agent._V, orient='index', dtype=np.float32, columns=['V'])
N = pd.DataFrame.from_dict(agent._N, orient='index', dtype=np.uint32, columns=['N'])
# Left-merge so every valued state is kept even without a visit count.
df = V.merge(N, how='left', left_index=True, right_index=True)
# Expand the (tuple-like) state index into one column per board cell.
states = pd.DataFrame(df.index.values.tolist(), index=df.index)