def simulate(self, node: Node, my_env: TicTacToeEnv) -> float:
    """
    MCTS: Simulation stage.
    - Randomly play out the remainder of the moves and report the reward:
      won reward=1, tie reward=0.5, lost reward=0.
    """
    state = node.state
    while not my_env.done:
        action = random.choice(my_env.available_actions())
        state, _, _, _ = my_env.step(action)
    return self.compute_reward(state)
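# A minimal, self-contained sketch of the backpropagation stage that usually
# follows simulate() in MCTS. The node fields used here (parent, visits,
# value) are assumptions for illustration and may differ from the Node class
# used by the functions above; two-player implementations also often flip the
# reward between plies, which is omitted here.

class _SketchNode:
    def __init__(self, parent=None):
        self.parent = parent  # None for the root
        self.visits = 0       # number of rollouts that passed through this node
        self.value = 0.0      # running sum of rollout rewards


def backpropagate(node: "_SketchNode", reward: float) -> None:
    """Propagate a rollout reward from the simulated leaf back up to the root."""
    while node is not None:
        node.visits += 1
        node.value += reward
        node = node.parent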
def play_against(agent_mc, agent_2, max_episode=10, bench=True):
    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)
    agents = [agent_mc, agent_2]
    episode = 0
    results = []
    for i in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, _ = env.step(action)
            # env.render()
            if done:
                results.append(reward)
                break
            else:
                _, mark = state

        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    if not bench:
        print("O_WINS = {}, X_WINS = {}, DRAW = {}".format(o_win, x_win, draw))
    return float(o_win - x_win) / max_episode
def select(self, node: Node, my_env: TicTacToeEnv):
    """
    MCTS: Selection stage.
    - If the node has any unvisited children, select one.
    - If all children have been visited, choose the best child by UCB score
      and advance the environment.
    - If the node has no children, return it unchanged.
    """
    while node.children:
        # if there are any unexplored children, pick one
        unvisited_children = [c for c in node.children if c.unvisited]
        if unvisited_children:
            node = unvisited_children[0]
        # otherwise, choose the best child according to UCB
        else:
            node = max(node.children, key=self.ucb_score)
        # step the copied environment forward
        my_env.step(node.action)
    return node, my_env
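# select() above relies on self.ucb_score, which is not shown here. Below is a
# self-contained sketch of the usual UCB1 formula; the attribute names and the
# exploration constant are assumptions for illustration, not the agent's
# actual implementation.

import math


def ucb1(node_value: float, node_visits: int, parent_visits: int,
         c: float = math.sqrt(2)) -> float:
    """UCB1 = average reward + c * sqrt(ln(parent visits) / node visits)."""
    if node_visits == 0:
        return float('inf')  # always try unvisited children first
    exploit = node_value / node_visits
    explore = c * math.sqrt(math.log(parent_visits) / node_visits)
    return exploit + explore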
def run():
    env = TicTacToeEnv()
    state = torch.tensor([one_hot_board(env.reset())],
                         dtype=torch.float).to(model.device)
    for step in range(n_step):
        # linearly anneal epsilon from eps_start to eps_end over eps_steps
        t = np.clip(step / eps_steps, 0, 1)
        eps = (1 - t) * eps_start + t * eps_end
        print('\r' + f'step----{step},epsilon----{eps}', flush=True, end='')

        action, _ = model.select_action(state, eps)
        next_state, reward, done, _ = env.step(action.item())
        next_state = one_hot_board(next_state)

        # let the scripted opponent move before the transition is stored
        if not done:
            next_state, _, done, _ = env.step(
                model.select_dummy_action(next_state))
            next_state = one_hot_board(next_state)
            next_state = torch.tensor(
                [next_state], dtype=torch.float).to(model.device)

        if done:
            next_state = None

        model.Memory.memorize(state, action, next_state,
                              torch.tensor([reward], device=model.device))
        state = next_state
        model.learn()

        if done:
            state = torch.tensor(
                [one_hot_board(env.reset())], dtype=torch.float).to(model.device)

        if step % target_update == 0:
            model.target_update()
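# run() above anneals epsilon linearly from eps_start to eps_end over
# eps_steps steps. The same schedule as a small self-contained helper; the
# default parameter values here are illustrative, not taken from the original.

import numpy as np


def linear_epsilon(step: int, eps_start: float = 1.0, eps_end: float = 0.05,
                   eps_steps: int = 10_000) -> float:
    """Return the exploration rate for a given training step."""
    t = np.clip(step / eps_steps, 0, 1)
    return (1 - t) * eps_start + t * eps_end

# e.g. linear_epsilon(0) == 1.0 and linear_epsilon(10_000) == 0.05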
def expand(self, node: Node, my_env: TicTacToeEnv):
    """
    MCTS: Expansion stage.
    - If additional moves are possible from the given node, child nodes
      are created, one is selected, and the environment is advanced.
    - If not, the same node and environment are returned.
    """
    # If this is a terminal state, don't try to expand
    if my_env.done:
        return node, my_env

    # Add a child node for each possible action
    for action in my_env.available_actions():
        nstate = after_action_state(node.state, action)
        Node(nstate, action, parent=node)

    # If the node has children after expansion, select one at random
    if node.children:
        node = random.choice(node.children)
        my_env.step(node.action)

    return node, my_env
def act(self, state, my_env: TicTacToeEnv):
    available_actions = my_env.available_actions()

    # --- Step 1: play a winning move, if possible ---
    for action in available_actions:
        nstate = after_action_state(state, action)
        gstatus = check_game_status(nstate[0])
        if gstatus > 0:
            if tomark(gstatus) == self.mark:
                return action

    # --- Step 2: block the opponent from winning ---
    # imagine the opponent were the one to move
    rev_state = (state[0], next_mark(state[1]))
    for action in available_actions:
        nstate = after_action_state(rev_state, action)
        gstatus = check_game_status(nstate[0])
        if gstatus > 0:
            # if they could win with this move, play that square ourselves
            if tomark(gstatus) == self.opponent_mark:
                return action

    # --- Step 3: otherwise, pick a random available move ---
    return random.choice(available_actions)
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agents and repeat self-play for the given episode count.
    Update state values with the reward coming from the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file: File name to save the result.
    """
    reset_state_values()
    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha),
              TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agents for a new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                set_state_value(state, reward)

            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
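# _learn() above delegates the actual value update to agent.backup(). A
# self-contained sketch of the TD(0) state-value backup such an agent
# typically performs; the dict-based value table and the terminal handling
# are assumptions for illustration, not the TDAgent implementation itself.

def td_backup(values: dict, state, nstate, reward: float, alpha: float,
              done: bool = False) -> None:
    """V(s) <- V(s) + alpha * (target - V(s)), with target = r if terminal else V(s')."""
    target = reward if done else values.get(nstate, 0.0)
    values[state] = values.get(state, 0.0) + alpha * (target - values.get(state, 0.0))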
def play(max_episode=10):
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'),
              BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    ROLLOUTS = 1000
    REWARD_FACTOR = 1  # because the rollout agent is player 2
    agents = [
        HumanAgent('X'),
        RolloutAgent('O', ROLLOUTS, REWARD_FACTOR)
    ]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        _ = os.system('clear')
        print('')
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            # the rollout agent needs the environment itself to simulate playouts
            if agent.agenttype == 'rollout':
                action = agent.act(ava_actions, env)
            else:
                action = agent.act(ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)
            _ = os.system('clear')
            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        time.sleep(10)
        episode += 1
def play_game(qagent):
    env = TicTacToeEnv()
    opponent = RandomAgent('X')
    start_mark = 'O'
    env.set_start_mark(start_mark)
    state = env.reset()
    s, mark = state
    done = False
    agents = [qagent, opponent]
    while not done:
        env.render()
        agent = agent_by_mark(agents, mark)
        ava_actions = env.available_actions()
        env.show_turn(False, mark)
        action = agent.act(state, ava_actions)
        nstate, reward, done, info = env.step(action)
        print(f'state: {s}, action: {action}')
        if done:
            env.render()
            env.show_result(True, mark, reward)
        s, mark = state = nstate
from utils import one_hot_board, Net
from algorithm import DQN
import torch
# TicTacToeEnv was used below but never imported; the module path here is an
# assumption based on the gym-tictactoe package layout.
from gym_tictactoe.env import TicTacToeEnv


def run(player):
    done = False
    obs = env.reset()
    env.render()
    while not done:
        if obs[1] == player:
            # human player's turn: read the board position from stdin
            action = int(input())
            obs_, _, done, info = env.step(action)
        else:
            # model's turn: encode the board and let the network pick a move
            obs = one_hot_board(obs)
            action = model.act(torch.tensor([obs], dtype=torch.float)).item()
            obs_, _, done, exp = env.step(action)
        obs = obs_
        env.render()


if __name__ == "__main__":
    env = TicTacToeEnv()
    model = DQN()
    model.load_net('net.pkl')
    print('choose O or X player')
    player = str(input(''))
    assert player == 'O' or player == 'X'
    run(player)
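# one_hot_board() is imported from utils and not shown. Below is a plausible,
# self-contained sketch of such an encoding (three 9-cell planes for
# empty / O / X); the exact layout and input format of utils.one_hot_board may
# well differ, so treat this only as an illustration of the idea.

def one_hot_board_sketch(board):
    """board: iterable of 9 cells with 0=empty, 1=O, 2=X -> 3x9 one-hot lists."""
    planes = [[0] * 9 for _ in range(3)]
    for i, cell in enumerate(board):
        planes[cell][i] = 1
    return planes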
def play(num_games, verbose=True):
    """
    Test out two agents playing against each other.

    Displays progress and result.

    Parameters:
    -----------
    num_games: int
        How many games to simulate.
    verbose: bool
        If true, display play information during each game.
        If false, just display a progress bar as simulations progress.
    """
    # Print header
    print("-" * 30)
    print(f"Playing {num_games} games")
    print(" * Player X: {}".format(players["X"].name))
    print(" * Player O: {}".format(players["O"].name))
    print("-" * 30)

    # select random starting player
    start_mark = random.choice(["X", "O"])

    # keep track of who won
    winners = []

    # if verbose is false, display progress bar
    if not verbose:
        myrange = trange
    else:
        myrange = range

    for _ in myrange(num_games):
        # set up board
        env = TicTacToeEnv()
        env.set_start_mark(start_mark)
        state = env.reset()

        # init the agents
        agents = [players["X"]("X"), players["O"]("O")]

        # play until game is done
        while not env.done:
            _, mark = state
            if verbose:
                env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            action = agent.act(state, copy(env))
            state, reward, _, _ = env.step(action)
            if verbose:
                env.render()

        # append winner to list (-1 = X won, 1 = O won, 0 = tie)
        winners.append(reward)

        # print out result
        if verbose:
            env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)

    # tally and display final stats
    c = Counter(winners)
    total = c[-1] + c[1] + c[0]
    print("\nX won {} ({:.2%})".format(c[-1], c[-1] / total))
    print("O won {} ({:.2%})".format(c[1], c[1] / total))
    print("Tied {} ({:.2%})".format(c[0], c[0] / total))
def _learnhuman1(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    agents = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0
    agent_temp = 6  # default opening reply for the TD agent (position 6)
    human_temp = 0
    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        turns = 0
        human_diff = False

        # show start board for human agent
        if mark == 'X':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            print("====================================== Switch Turn ======================================")
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                turns += 1
                # remember whether the human opened on a new square
                if turns == 1 and human_temp != action:
                    human_temp = action
                    human_diff = True
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)
                if turns == 1 and not human_diff:
                    # human repeated the same opening: replay the stored reply
                    action = agent_temp
                    print("------------> Human started in the same position")
                    print("------------> [Fix] Agent action is %s" % (action + 1))
                elif turns == 1 and human_diff:
                    # human opened differently: store the new reply
                    agent_temp = action
                    print("------------> Human started in a different position")
                    print("------------> [New] Agent action is %s" % (action + 1))
                else:
                    print("------------> Agent action is %s" % (action + 1))

            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)
            env.render(mode='human')
            if done:
                print("Return reward : " + str(reward))
                env.show_result(True, mark, reward)
                time.sleep(1)
                # set terminal state value
                set_state_value(state, reward)
                break
            else:
                _, mark = state = nstate

        # start mark is not rotated here: the human always opens
        max_episode += 1
        save_model(save_file, max_episode, epsilon, alpha)
def _learnhuman(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    agents = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0
    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'X':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            print("==================================")
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                print("action is %s" % (action))
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)
                print("action is %s" % (action))

            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)
            env.render(mode='human')
            if done:
                print("Return reward : " + str(reward))
                env.show_result(True, mark, reward)
                time.sleep(1)
                # set terminal state value
                set_state_value(state, reward)
                break
            else:
                _, mark = state = nstate

        # start mark is not rotated here: the human always opens
        max_episode += 1
        save_model(save_file, max_episode, epsilon, alpha)
def play(max_episode=10):
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'),
              BaseAgent('X')]

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        while not env.done:
            _, mark = state
            env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

        env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)
def _play(load_file, vs_agent, show_number):
    """Play with the learned model.

    Make a TD agent and an adversarial agent to play against it.
    Play and switch the starting mark when each game finishes.
    The TD agent takes no exploring actions while in play mode.

    Args:
        load_file (str): Learned model file name.
        vs_agent (object): Enemy agent of the TD agent.
        show_number (bool): Whether to show grid numbers as a visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    td_agent = TDAgent('X', 0, 0)  # prevent exploring
    start_mark = 'O'
    agents = [vs_agent, td_agent]

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
def _bench(max_episode, model_file, show_result=True):
    """Benchmark the given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output the result to stdout.

    Returns:
        (str): Benchmark result as a JSON string.
    """
    minfo = load_model(model_file)

    agents = [BaseAgent('O'),
              TDAgent('X', 0, 0)]
    show = False

    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)
    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')

            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(
        dict(base_win=o_win, td_win=x_win, draw=draw, model_file=mfile))
    result = json.dumps(minfo)

    if show_result:
        print(result)

    return result
def _learnhuman1(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    connection = pymysql.connect(host="localhost",
                                 user="******",
                                 passwd="",
                                 database="tictactoe")
    cursor = connection.cursor()

    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    agents = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0
    agent_temp = 6  # default opening reply for the TD agent (position 6)
    human_temp = 0
    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        turns = 0
        human_diff = False

        # variables saved to the database
        db_nstate = []
        db_nvalue = []
        db_choose = 0
        db_pick = 0
        db_current_state = ''
        db_action = 0
        db_note = ''

        # show start board for human agent
        if mark == 'X':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            print("====================================== Switch Turn ======================================")
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                turns += 1
                # remember whether the human opened on a new square
                if turns == 1 and human_temp != action:
                    human_temp = action
                    human_diff = True
                if action is None:
                    sys.exit()
                nstate, reward, done, info = env.step(action)
            else:
                action = agent.act(state, ava_actions)
                db_nstate, db_nvalue, db_choose, db_pick = agent.get_db()
                if turns == 1 and not human_diff:
                    # human repeated the same opening: replay the stored reply
                    action = agent_temp
                    db_note = 'repeat'
                    print("------------> Human started in the same position")
                    print("------------> [Fix] Agent action is %s" % (action + 1))
                elif turns == 1 and human_diff:
                    # human opened differently: store the new reply
                    agent_temp = action
                    db_note = 'different'
                    print("------------> Human started in a different position")
                    print("------------> [New] Agent action is %s" % (action + 1))
                else:
                    print("------------> Agent action is %s" % (action + 1))

                nstate, reward, done, info = env.step(action)
                db_table_tictactoe_state = (
                    "INSERT INTO Tictactoe_state(EPISODE, NSTATE, NVALUE, CHOOSE, PICK, STATE_NOW, ACTION, NOTE) "
                    "VALUES(%d, '%s', '%s', %d, %d, '%s', %d, '%s');" % (
                        max_episode, db_nstate, db_nvalue, db_choose, db_pick,
                        remove_X(nstate), action, db_note))
                cursor.execute(db_table_tictactoe_state)
                connection.commit()

            agent.backup(state, nstate, reward)
            env.render(mode='human')
            if done:
                db_win_state = "draw" if reward == 0 else mark
                print("Return reward : " + str(reward))
                db_table_check_win = (
                    "INSERT INTO Check_win(REWARD, WIN_STATE) VALUES(%d, '%s');" % (
                        reward, db_win_state))
                cursor.execute(db_table_check_win)
                connection.commit()
                env.show_result(True, mark, reward)
                time.sleep(1)
                # set terminal state value
                set_state_value(state, reward)
                break
            else:
                _, mark = state = nstate
                connection.commit()

        # start mark is not rotated here: the human always opens
        max_episode += 1
        save_model(save_file, max_episode, epsilon, alpha)

    connection.close()
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [MinimaxAgent('O'),
              HumanAgent('X')]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)

            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                mark = next_mark(mark)

        episode += 1
def train_agents(opponent, max_episode, epsilon, epsilon_decay, alpha,
                 alpha_decay, gamma, render=False):
    reset_state_values()
    env = TicTacToeEnv()
    if opponent == 'random':
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma),
            RandomAgent('X')
        ]
    else:
        # Two Q agents
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma),
            QAgent(env.observation_space.n, env.action_space.n, 'X', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma)
        ]

    start_mark = 'O'
    agent_rewards = {'O': [], 'X': []}
    episode = 0
    for i in tqdm(range(max_episode)):
        episode += 1
        env.show_episode(False, episode)

        # reset agents for a new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        s, mark = state
        done = False
        while not done:
            if render:
                env.render()
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.update(s, nstate[0], action, reward, done)

            if done:
                if render:
                    env.render()
                env.show_result(render, mark, reward)
                # set terminal state value
                set_state_value(state, reward)
                agent_rewards['O'].append(reward)
                agent_rewards['X'].append(-reward)

            s, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    return agent_rewards, agent_by_mark(agents, 'O')
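# train_agents() above calls agent.update(s, nstate[0], action, reward, done).
# A self-contained sketch of the tabular Q-learning rule such a QAgent
# typically applies; the defaultdict table, argument order, and default
# hyperparameters here are assumptions for illustration, not the QAgent
# implementation itself.

from collections import defaultdict


def q_update(Q: dict, s, a, r: float, s_next, done: bool,
             alpha: float = 0.1, gamma: float = 0.99,
             n_actions: int = 9) -> None:
    """Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    best_next = 0.0 if done else max(Q[(s_next, a2)] for a2 in range(n_actions))
    Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])

# usage sketch:
#   Q = defaultdict(float)
#   q_update(Q, s=0, a=4, r=0.0, s_next=1, done=False)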