def play(max_episode=10):
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'), BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1
def play_base(env):
    load_model(MC_MODEL_FILE)
    agents = [BaseAgent('O'), OnPolicyMCAgent('X', 0, 1)]
    start_mark = 'X'
    test_cases = 10

    while test_cases:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        test_cases -= 1
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [MinimaxAgent('O'), HumanAgent('X')]

    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            if action is None:
                sys.exit()
            state, reward, done, info = env.step(action)
            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                mark = next_mark(mark)
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): JSON-encoded benchmark result.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False
    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)
    results = []
    for _ in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')
            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))
    result = json.dumps(minfo)
    if show_result:
        print(result)
    return result
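
# A minimal usage sketch for _bench() above; the episode count and the
# model file name 'td_agent.dat' are illustrative assumptions, not values
# taken from this code.
if __name__ == '__main__':
    result = _bench(3000, 'td_agent.dat', show_result=True)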
def learn(self, env, num_episodes, agent_2, rndm):
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    mean_returns = []
    start_mark = 'X'

    for episode in range(1, num_episodes + 1):
        if episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(episode, num_episodes), end="")
            sys.stdout.flush()

        episodes = self.generate_episode(env, self.policy, start_mark)
        sa_in_episode = set([(tuple(x[0]), x[1]) for x in episodes])

        for state, action in sa_in_episode:
            sa_pair = (state, action)
            # first-visit MC: only the first occurrence of the
            # (state, action) pair contributes a return
            first_occurrence_idx = next(i for i, x in enumerate(episodes)
                                        if x[0] == state and x[1] == action)
            G = sum([x[2] * (self.discount_factor ** i)
                     for i, x in enumerate(episodes[first_occurrence_idx:])])
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            self.Q[state][action] = (returns_sum[sa_pair] /
                                     returns_count[sa_pair])

        # record Q-value snapshots for the sampled (state, action) pairs
        for s, a in rndm:
            self.backup[(s, a)].append(deepcopy(self.Q[s][a]))

        start_mark = next_mark(start_mark)
        mu = play_against(self, agent_2, 10)
        self.unique_states.append(len(self.Q.keys()))
        mean_returns.append(mu)

    save_model('Mc_OnPolicy_agent', num_episodes, self.epsilon,
               self.discount_factor, 'Mc_OnPolicy', self.Q)
    return mean_returns
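
# A small worked check of the first-visit return computed in learn()
# above: G at the first occurrence of a (state, action) pair is the
# discounted sum of all rewards from that point on. The episode data and
# discount factor below are made up for illustration.
def _first_visit_return_demo():
    episode = [('s0', 3, 0.0), ('s1', 5, 0.0), ('s0', 7, 1.0)]
    gamma = 0.9
    first_idx = 0  # first occurrence of ('s0', 3)
    G = sum(r * (gamma ** i)
            for i, (_, _, r) in enumerate(episode[first_idx:]))
    return G  # 0.0 + 0.9 * 0.0 + 0.81 * 1.0 == 0.81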
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Make a TD agent and an adversarial agent to play with. Play and switch
    the starting mark when the game is finished. The TD agent takes no
    exploring actions while in play mode.

    Args:
        load_file (str): Learned model file name to load.
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether to show grid numbers for visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    td_agent = TDAgent('X', 0, 0)  # prevent exploring
    start_mark = 'O'
    agents = [vs_agent, td_agent]

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)
            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agents, and repeat self play for the given episode count.
    Update state values with the rewards coming from the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file: File name to save result.
    """
    reset_state_values()
    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha),
              TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                set_state_value(state, reward)

            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
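
# A minimal, self-contained sketch of the TD(0) backup that
# TDAgent.backup() above is assumed to perform: move V(s) toward V(s')
# by step size alpha, with terminal values pinned to the game reward via
# set_state_value(). The dict-based table and names are illustrative,
# not the repository's actual implementation.
_values = {}

def _td0_backup(state, nstate, alpha):
    v = _values.get(state, 0.0)
    _values[state] = v + alpha * (_values.get(nstate, 0.0) - v)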
def play_against(agent_mc, agent_2, max_episode=10, bench=True):
    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)
    agents = [agent_mc, agent_2]
    results = []

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, _ = env.step(action)
            if done:
                results.append(reward)
                break
            else:
                _, mark = state

        start_mark = next_mark(start_mark)

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    if not bench:
        print("O_WINS = {}, X_WINS = {}, DRAW = {}".format(o_win, x_win, draw))
    return float(o_win - x_win) / max_episode
def act(self, state, ava_actions):
    board, mark = state
    nboard = list(board[:])
    if check_game_status(nboard) < 0:  # game still in progress
        min_val = 100
        max_val = -100
        action_min = ava_actions[0]
        action_max = ava_actions[0]
        if mark == 'O':
            # 'O' is the minimizing player
            for action in ava_actions:
                nboard[action] = 1
                mark = next_mark(mark)
                value, _ = self.act((tuple(nboard), mark),
                                    [p for p in ava_actions if p != action])
                if value < min_val:
                    min_val = value
                    action_min = action
                nboard[action] = 0  # backtrack
                mark = next_mark(mark)
            return min_val, action_min
        else:
            # 'X' is the maximizing player
            for action in ava_actions:
                nboard[action] = 2
                mark = next_mark(mark)
                value, _ = self.act((tuple(nboard), mark),
                                    [p for p in ava_actions if p != action])
                if value > max_val:
                    max_val = value
                    action_max = action
                nboard[action] = 0  # backtrack
                mark = next_mark(mark)
            return max_val, action_max
    else:
        # terminal board: return its status as the value, with a
        # placeholder action (12 lies outside the 0-8 board range)
        return check_game_status(nboard), 12
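
# Since act() above returns a (value, action) pair rather than a bare
# action, a caller has to unpack it; a minimal sketch, assuming `env` is
# a TicTacToeEnv and `agent` a MinimaxAgent as elsewhere in this code.
def _minimax_move(env, agent):
    state = env.reset()
    value, action = agent.act(state, env.available_actions())
    return env.step(action)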
def generate_episode(self, env, policy, start_mark):
    episodes = []
    env.set_start_mark(start_mark)
    state = env.reset()
    done = False
    while not done:
        available_actions = env.available_actions()
        action = policy(state, available_actions, False)
        nstate, reward, done, _ = env.step(action)
        episodes.append((state, action, reward))
        state = nstate
    return episodes
def find_loc_prob(state, aval_actions, action, win_count, loss_count, step):
    aval_actions.remove(action)
    state = after_action_state(state, action)
    game_status = check_game_status(state[0])
    if game_status == 0 or game_status == tocode(next_mark(state[1])):
        win_count = win_count + step
        if game_status == 0:
            # a draw is counted as a victory for both players
            loss_count = loss_count + step
        return win_count, loss_count
    elif game_status == tocode(state[1]):
        loss_count = loss_count + step
        return win_count, loss_count
    else:
        for action in aval_actions:
            temp = aval_actions.copy()
            # the win/loss counters swap in the recursive call because
            # the mark in `state` has flipped to the other player
            loss_count, win_count = find_loc_prob(state, temp, action,
                                                  loss_count, win_count,
                                                  step / 5)
        return win_count, loss_count
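
# A tiny numeric illustration of the depth weighting in find_loc_prob()
# above: each recursive ply passes step/5, so outcomes found deeper in
# the game tree contribute geometrically less. Values are made up.
def _step_decay_demo(step=1.0, depth=4):
    return [step / (5 ** d) for d in range(depth)]
    # [1.0, 0.2, 0.04, 0.008]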
def learn(env):
    max_episode = MAX_EPISODE
    epsilon = EPSILON
    agents = [OnPolicyMCAgent('X', epsilon),
              OnPolicyMCAgent('O', epsilon)]
    agents[0].orig_actions = env.available_actions()
    agents[1].orig_actions = env.available_actions()
    start_mark = 'O'

    # iterate through episodes
    for episode in tqdm(range(max_episode)):
        agents[0].trans_list = []
        agents[1].trans_list = []
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            agent.trans_list.append((state, action, reward))
            state = next_state
            _, mark = state

        # Monte Carlo: update both agents once per finished episode
        agents[0].update()
        agents[1].update()
        if done:
            env.show_result(False, mark, reward)

        start_mark = next_mark(start_mark)

    save_model(MC_MODEL_FILE)
def act(self, state, my_env: TicTacToeEnv):
    available_actions = my_env.available_actions()

    # --- Step 1: play winning move, if possible ---
    for action in available_actions:
        nstate = after_action_state(state, action)
        gstatus = check_game_status(nstate[0])
        if gstatus > 0 and tomark(gstatus) == self.mark:
            return action

    # --- Step 2: block opponent from winning ---
    # imagine the opponent was playing
    rev_state = (state[0], next_mark(state[1]))
    for action in available_actions:
        nstate = after_action_state(rev_state, action)
        gstatus = check_game_status(nstate[0])
        # if they could make a winning move there, play it ourselves
        if gstatus > 0 and tomark(gstatus) == self.opponent_mark:
            return action

    return random.choice(available_actions)
def find_loc_prob(state, aval_actions, action, win_count, loss_count, step):
    aval_actions.remove(action)
    state = after_action_state(state, action)
    game_status = check_game_status(state[0])
    print("Action = {}".format(action))
    if game_status == 0 or game_status == tocode(next_mark(state[1])):
        win_count = win_count + step
        return win_count, loss_count
    elif game_status == tocode(state[1]):
        loss_count = loss_count + step
        return win_count, loss_count
    else:
        for action in aval_actions:
            print("Calling recursively for step {}".format(step))
            print("Win count and loss count till this step = {} and {} for mark {}"
                  .format(win_count, loss_count, state[1]))
            # win/loss counters swap on recursion as the mark flips;
            # pass a copy so the recursive call's removals do not
            # corrupt this iteration
            loss_count, win_count = find_loc_prob(state, aval_actions.copy(),
                                                  action, loss_count,
                                                  win_count, step - 1)
        return win_count, loss_count
def play(max_episode=10):
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'), BaseAgent('X')]

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        while not env.done:
            _, mark = state
            env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

        env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)
def train_agents(opponent, max_episode, epsilon, epsilon_decay,
                 alpha, alpha_decay, gamma, render=False):
    reset_state_values()
    env = TicTacToeEnv()
    if opponent == 'random':
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O',
                   epsilon, epsilon_decay, alpha, alpha_decay, gamma),
            RandomAgent('X')
        ]
    else:
        # two Q agents
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O',
                   epsilon, epsilon_decay, alpha, alpha_decay, gamma),
            QAgent(env.observation_space.n, env.action_space.n, 'X',
                   epsilon, epsilon_decay, alpha, alpha_decay, gamma)
        ]

    start_mark = 'O'
    agent_rewards = {'O': [], 'X': []}
    episode = 0
    for i in tqdm(range(max_episode)):
        episode += 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        s, mark = state
        done = False
        while not done:
            if render:
                env.render()
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.update(s, nstate[0], action, reward, done)

            if done:
                if render:
                    env.render()
                env.show_result(render, mark, reward)
                # set terminal state value
                set_state_value(state, reward)
                agent_rewards['O'].append(reward)
                agent_rewards['X'].append(-reward)

            s, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    return agent_rewards, agent_by_mark(agents, 'O')
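
# A minimal, self-contained sketch of the tabular Q-learning update that
# QAgent.update() above is assumed to perform; the defaultdict table and
# names here are illustrative, not the repository's actual code.
from collections import defaultdict
import numpy as np

_Q = defaultdict(lambda: np.zeros(9))

def _q_update(s, ns, a, r, done, alpha, gamma):
    target = r if done else r + gamma * np.max(_Q[ns])
    _Q[s][a] += alpha * (target - _Q[s][a])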
def learn(self, env, num_episodes, agent_2, rndm):
    mean_returns = []
    C = defaultdict(lambda: np.zeros(env.action_space.n))
    start_mark = 'X'

    for episode in range(1, num_episodes + 1):
        if episode % 1000 == 0:
            print("\rEpisode {}/{}".format(episode, num_episodes), end="")
            sys.stdout.flush()

        episodes = self.generate_episode(env, self.behaviour_policy,
                                         start_mark)
        G = 0.0
        W = 1.0
        # walk the episode backwards, accumulating the weighted return
        for t in reversed(range(len(episodes))):
            state, action, reward = episodes[t]
            G = self.discount_factor * G + reward
            C[state][action] += W
            self.Q[state][action] += ((W / C[state][action]) *
                                      (G - self.Q[state][action]))

            # collect the legal (empty) cells for this state
            occupied = np.nonzero(state[0])[0]
            legal = np.array([i for i in range(9) if i not in occupied])

            # weighted importance sampling: stop once the behaviour
            # action deviates from the greedy target policy
            if action != self.target_policy(state, legal):
                break
            # the ratio len(legal) corresponds to a deterministic target
            # policy over a uniform-random behaviour policy
            W = W * len(legal)

        start_mark = next_mark(start_mark)
        mu = play_against(self, agent_2, 10)
        self.unique_states.append(len(self.Q.keys()))
        mean_returns.append(mu)
        # record Q-value snapshots for the sampled (state, action) pairs
        for s, a in rndm:
            self.backup[(s, a)].append(deepcopy(self.Q[s][a]))

    save_model('Mc_OffPolicy_agent.dat', num_episodes, None,
               self.discount_factor, 'Mc_OffPolicy', self.Q)
    return mean_returns
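
# A small numeric check of the weighted-importance-sampling update in
# learn() above: with weights W accumulated into C, Q converges to the
# weighted average of the returns. The two (G, W) pairs are made up.
def _wis_demo():
    q, c = 0.0, 0.0
    for G, W in [(1.0, 1.0), (-1.0, 4.0)]:
        c += W
        q += (W / c) * (G - q)
    return q  # (1*1 + 4*-1) / 5 == -0.6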
def opponent_mark(self):
    return next_mark(self.mark)
def play(num_games, verbose=True):
    """
    Test out two agents playing against each other.

    Displays progress and result.

    Parameters:
    -----------
    num_games: int
        How many games to simulate.
    verbose: bool
        If true, display play information during each game.
        If false, just display a progress bar as simulations progress.
    """
    # print header
    print("-" * 30)
    print(f"Playing {num_games} games")
    print(" * Player X: {}".format(players["X"].name))
    print(" * Player O: {}".format(players["O"].name))
    print("-" * 30)

    # select random starting player
    start_mark = random.choice(["X", "O"])

    # keep track of who won
    winners = []

    # if verbose is false, display progress bar
    if not verbose:
        myrange = trange
    else:
        myrange = range

    for _ in myrange(num_games):
        # set up board
        env = TicTacToeEnv()
        env.set_start_mark(start_mark)
        state = env.reset()

        # init the agents
        agents = [players["X"]("X"), players["O"]("O")]

        # play until game is done
        while not env.done:
            _, mark = state
            if verbose:
                env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            action = agent.act(state, copy(env))
            state, reward, _, _ = env.step(action)
            if verbose:
                env.render()

        # append winner to list (-1 = X won, 1 = O won, 0 = tie)
        winners.append(reward)

        # print out result
        if verbose:
            env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)

    # tally and display final stats
    c = Counter(winners)
    total = c[-1] + c[1] + c[0]
    print("\nX won {} ({:.2%})".format(c[-1], c[-1] / total))
    print("O won {} ({:.2%})".format(c[1], c[1] / total))
    print("Tied {} ({:.2%})".format(c[0], c[0] / total))