def play(max_episode=10):
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'), BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1
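# The driver loop above relies on two small helpers. Their real
# implementations live elsewhere in the repo; the following is only a
# minimal sketch, assuming each agent carries a `mark` attribute and marks
# are the single characters 'O' and 'X'.

def agent_by_mark(agents, mark):
    # Return the agent whose mark matches the player to move.
    for agent in agents:
        if agent.mark == mark:
            return agent


def next_mark(mark):
    # Alternate the moving (or starting) player between 'O' and 'X'.
    return 'X' if mark == 'O' else 'O'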
def learn_on_policy(episodes, epsilon=0.1, discount_factor=0.9):
    env = TicTacToeEnv()
    agents = [MCOPA('O', epsilon), MCOPA('X', epsilon)]
    start_mark = 'O'
    env.set_start_mark(start_mark)

    for i in range(episodes):
        episode = i + 1
        state = env.reset()
        _, mark = state
        steps = []
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            actions = env.available_actions()
            action = agent.act(state, actions)
            next_state, reward, done, _ = env.step(action)
            steps.append((state, reward))
            if done:
                break
            _, mark = state = next_state

        steps.reverse()
        G = 0
        # Within a single tic-tac-toe episode every state is unique,
        # so no first-visit check is needed.
        for step in steps:
            _, mark = step[0]
            G = step[1] + discount_factor * G
            agents[0].update(step[0], G)

        # rotate start
        start_mark = next_mark(start_mark)
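# `MCOPA.update` is not shown here. A minimal sketch of an every-visit
# Monte Carlo update, assuming the agent keeps a visit counter and a
# running-average value table (class and attribute names are illustrative
# only, not the repo's actual implementation):

class MCOPASketch:
    def __init__(self):
        self.values = {}   # state -> estimated return V(s)
        self.visits = {}   # state -> number of updates N(s)

    def update(self, state, G):
        # Incremental mean: V(s) <- V(s) + (G - V(s)) / N(s)
        n = self.visits.get(state, 0) + 1
        v = self.values.get(state, 0.0)
        self.visits[state] = n
        self.values[state] = v + (G - v) / n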
def arena(self, agent1, agent2, mcts_args, games_to_play=10):
    mcts1 = MCTS(agent1, mcts_args)
    mcts2 = MCTS(agent2, mcts_args)
    results = []
    for i in range(games_to_play):  # tqdm()
        # Alternate which network moves first each game.
        if i % 2 == 0:
            player1, player2 = mcts1, mcts2
        else:
            player1, player2 = mcts2, mcts1
        env = TicTacToeEnv()
        done = False
        while not done:
            first_player_move = env.fpt
            if first_player_move:
                probs = player1.getProbs(env, temp=0)
            else:
                probs = player2.getProbs(env, temp=0)
            action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
            _, reward, done, _ = env.step((action // env.size, action % env.size))
            if reward == -1:
                print('Repeated move!')
            if done:
                # Report every result from the first player's perspective.
                results.append(reward if first_player_move else -1 * reward)
    return results
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [MinimaxAgent('O'), HumanAgent('X')]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if mark == 'O':
                # MinimaxAgent.act returns a pair; only the action is used.
                n, action = agent.act(state, ava_actions)
            else:
                action = agent.act(state, ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)

            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, _ = state
                mark = next_mark(mark)
        episode += 1
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): Benchmark result encoded as JSON.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False
    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)
    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')
            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))

    result = json.dumps(minfo)
    if show_result:
        print(result)
    return result
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Make the TD agent and an adversarial agent play against each other,
    switching the starting mark after each finished game. The TD agent
    takes no exploring actions while in play mode.

    Args:
        load_file (str): Learned model file name to load.
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether to show grid numbers as a visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    td_agent = TDAgent('X', 0, 0)  # prevent exploring
    start_mark = 'O'
    agents = [vs_agent, td_agent]

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agents and repeat self-play for the given episode count.
    Update state values from the rewards returned by the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file (str): File name to save the result.
    """
    reset_state_values()
    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha),
              TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                set_state_value(state, reward)

            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
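# `TDAgent.backup` performs a tabular TD(0) state-value update,
#   V(s) <- V(s) + alpha * (r + V(s') - V(s)),
# against a value table. The helper below is only a sketch; the local
# `state_values` dict and the exact handling of the reward are assumptions,
# not the repo's code (which also pins terminal values via set_state_value
# above).

state_values = {}  # state -> V(s); stand-in for the repo's value table

def backup_sketch(state, nstate, reward, alpha):
    # Move V(state) toward the bootstrapped target r + V(nstate).
    val = state_values.get(state, 0.0)
    nval = state_values.get(nstate, 0.0)
    state_values[state] = val + alpha * (reward + nval - val)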
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [HumanAgent(HUMAN_MARK)]
    episode = 0
    j = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        i = 0
        if j == 0:
            # Build the full game tree once, rooted at the empty board.
            Papa = Node(state, None, [1, 2, 3, 4, 5, 6, 7, 8, 9], 0)
            Papa.fill()
            j += 1
        action = Papa.maxAddress
        current = Papa.children[Papa.maxAddress]
        print("X's Turn")
        while not done:
            pre_action = action
            pre_current = current
            ava_actions = env.available_actions()
            if i % 2 == 0 and i != 0:
                print("X's Turn")
                print("Previous Action: ", pre_action)
                action, current = pre_current.reach_child(pre_action)
                print("Playing: ", action)
            elif i % 2 == 1:
                print("O's Turn")
                action = HumanAgent.act(state, ava_actions)
            i += 1
            if action is None:
                sys.exit()

            # Board positions are numbered 1-9; the env expects 0-8.
            state, reward, done, info = env.step(action - 1)

            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, _ = state
                mark = next_mark(mark)
        episode += 1
def play_game(self, agent):
    train_samples = []
    episode_step = 0
    env = TicTacToeEnv()
    while True:
        episode_step += 1
        # Sample moves stochastically early in the game, greedily afterwards.
        temp = int(episode_step < self.temp_thres)
        probs = self.mcts.getProbs(env, temp=temp)
        action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))

        # Augment the sample with symmetry-equivalent boards.
        for board_s, probs_s in getBoardSims(env.getPBoard(), probs):
            train_samples.append([board_s, probs_s, env.fpt])

        _, reward, done, _ = env.step((action // env.size, action % env.size))
        if done:
            # Assign the final reward to each sample from the perspective
            # of the player to move when it was recorded.
            final_player = env.fpt
            return [(x[0], x[1], reward if final_player != x[2] else -reward)
                    for x in train_samples]
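# `getBoardSims` is assumed to return symmetry-equivalent (board, probs)
# pairs so a single self-play move yields several training samples. A
# minimal sketch using the eight symmetries of a square board (the repo's
# actual helper may differ):

import numpy as np

def get_board_sims_sketch(board, probs):
    size = board.shape[0]
    pi = probs.reshape(size, size)
    sims = []
    for k in range(4):                  # four rotations
        b = np.rot90(board, k)
        p = np.rot90(pi, k)
        sims.append((b, p.reshape(-1)))
        sims.append((np.fliplr(b), np.fliplr(p).reshape(-1)))  # plus mirror
    return sims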
def learn_off_policy(episodes, discount_factor=0.9):
    env = TicTacToeEnv()
    agents = [MCOffPA('O'), MCOffPA('X')]
    start_mark = 'O'
    env.set_start_mark(start_mark)

    for i in range(episodes):
        state = env.reset()
        _, mark = state
        steps = []
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            actions = env.available_actions()
            # Behaviour policy: uniformly random over available actions.
            action = random.choice(actions)
            next_state, reward, done, _ = env.step(action)
            steps.append((state, reward, action, actions))
            if done:
                break
            _, mark = state = next_state

        steps.reverse()
        G = 0
        W = 1
        # Within a single tic-tac-toe episode every state is unique,
        # so no first-visit check is needed.
        for step in steps:
            _, mark = step[0]
            agent = agent_by_mark(agents, mark)
            G = step[1] + discount_factor * G
            agent.update(step[0], G, W)
            # Stop once the target policy would not have taken the
            # behaviour action (its importance weight becomes zero).
            if agent.act(step[0], step[3]) != step[2]:
                break
            # behaviour policy probability = 1 / len(available_actions)
            W = W * len(step[3])

        # rotate start
        start_mark = next_mark(start_mark)
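# `MCOffPA.update(state, G, W)` receives the return together with the
# importance-sampling weight. A minimal sketch of the weighted
# importance-sampling update (class and attribute names are illustrative
# only, not the repo's implementation):

class MCOffPASketch:
    def __init__(self):
        self.values = {}       # state -> V(s)
        self.cum_weights = {}  # state -> C(s), sum of weights seen so far

    def update(self, state, G, W):
        # C(s) <- C(s) + W;  V(s) <- V(s) + (W / C(s)) * (G - V(s))
        c = self.cum_weights.get(state, 0.0) + W
        v = self.values.get(state, 0.0)
        self.cum_weights[state] = c
        self.values[state] = v + (W / c) * (G - v)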
import numpy as np
from time import sleep

from env import TicTacToeEnv

if __name__ == '__main__':
    env = TicTacToeEnv(render=True)
    sleep(2)
    env.step((16, 17))
    sleep(2)
    env.step((16, 16))
    sleep(2)
    env.step((17, 16))
    sleep(2)
    env.step((17, 17))
    sleep(2)
    env.step((14, 16))
    sleep(2)
    env.step((14, 14))
    sleep(2)
    env.step((14, 15))
    sleep(2)
    env.step((13, 13))
    env.window.mainloop()
import numpy as np
import pandas as pd

from env import TicTacToeEnv
from agent import QLearningAgent

env = TicTacToeEnv()
agent = QLearningAgent(env)

for game_nr in range(1000000):
    if game_nr % 10000 == 0:
        print(game_nr)
    done = False
    s = env.reset().copy()
    while not done:
        a = agent.take_action(s)
        r, s_, done, _ = env.step(a)
        agent.learn(s, a, r, s_, done)
        s = s_.copy()

V = pd.DataFrame.from_dict(agent._V, orient='index', dtype=np.float32,
                           columns=['V'])
N = pd.DataFrame.from_dict(agent._N, orient='index', dtype=np.uint32,
                           columns=['N'])
df = V.merge(N, how='left', left_index=True, right_index=True)
states = pd.DataFrame(df.index.values.tolist(), index=df.index)
res = states.merge(V, how='left', left_index=True, right_index=True).merge(
    N, how='left', left_index=True, right_index=True)
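# `QLearningAgent.learn` is not shown above. Judging by the `_V` (value) and
# `_N` (visit count) tables dumped into DataFrames, a plausible sketch is a
# tabular TD(0) update keyed on the board state. This is purely
# illustrative; the hyperparameters and key encoding are assumptions, not
# the repo's actual code.

def learn_sketch(self, s, a, r, s_, done, alpha=0.1, gamma=1.0):
    key, next_key = tuple(s), tuple(s_)
    self._N[key] = self._N.get(key, 0) + 1
    # Bootstrapped target: immediate reward plus discounted next-state value.
    target = r if done else r + gamma * self._V.get(next_key, 0.0)
    v = self._V.get(key, 0.0)
    self._V[key] = v + alpha * (target - v)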