def test_game_plays():
    a1 = RandomAgent()
    a2 = RandomAgent()
    g = Game(a1, a2)
    winner = g.play(3)
    assert a1 is winner or a2 is winner
def move(self, board, settings):
    # Parallel arrays connecting each candidate move (square) to win/loss/draw results
    openSquares = board.getOpenSquares()
    estimatedWins = []
    estimatedLosses = []
    estimatedDraws = []

    # Create the random agents that will play out games to estimate board state values
    myAgent = RandomAgent(self.type, self.m, self.n, self.k)
    oppType = Square.O_HAS
    if oppType == self.type:
        oppType = Square.X_HAS
    oppAgent = RandomAgent(oppType, self.m, self.n, self.k)

    # Copy the settings and make sure the copy is not set to verbose (too much output)
    newSettings = deepcopy(settings)
    newSettings.verbose = False

    # For each possible move, play a batch of games to estimate its value
    for x, y in openSquares:
        wins = 0.0
        losses = 0.0
        draws = 0.0
        for i in range(settings.numGamesToEstimateValue):
            # Copy the "current" board and make the move we are considering
            newBoard = deepcopy(board)
            newBoard.addPiece(x, y, self.type)
            # Play a random game from here. playGame returns (winner, moveCount,
            # illegalMoveCount); only the winner matters for the estimate.
            winner, _, _ = mnkgame.MNKGame().playGame(newBoard, oppAgent, myAgent,
                                                      newSettings)
            # Do bookkeeping based on who won
            if winner == myAgent:
                wins += 1.0
            elif winner == oppAgent:
                losses += 1.0
            else:
                draws += 1.0
        # Stick results on the parallel arrays to unpack later
        estimatedWins.append(wins)
        estimatedLosses.append(losses)
        estimatedDraws.append(draws)

    # Compute a score for each move based on its wins/losses/draws
    moveScores = [W * 2 - L + D
                  for W, L, D in zip(estimatedWins, estimatedLosses, estimatedDraws)]
    '''
    if settings.verbose:
        print("*** Simulation complete, results: (square, wins, losses, draws, SCORE)")
        for square, wins, losses, draws, score in zip(openSquares, estimatedWins,
                                                      estimatedLosses, estimatedDraws,
                                                      moveScores):
            print(square, wins, losses, draws, score)
    '''
    # Select the move with the best score
    return openSquares[np.argmax(moveScores)]
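# A quick, self-contained check of the scoring rule used above
# (score = 2*W - L + D): wins count double, losses subtract one, draws add one.
# The counts below are made-up rollout results for three candidate squares;
# np.argmax breaks the tie between the first two by returning the first index.
import numpy as np

rollout_counts = [(6.0, 2.0, 2.0),   # square A -> 2*6 - 2 + 2 = 12
                  (4.0, 1.0, 5.0),   # square B -> 2*4 - 1 + 5 = 12
                  (7.0, 3.0, 0.0)]   # square C -> 2*7 - 3 + 0 = 11
scores = [W * 2 - L + D for W, L, D in rollout_counts]
assert np.argmax(scores) == 0  # square A is chosen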
def create_agent(name, model):
    """ Create a specific type of Snake AI agent.

    Args:
        name (str): key identifying the agent type.
        model: (optional) a pre-trained model required by certain agents.

    Returns:
        An instance of Snake agent.
    """
    from agent import DeepQNetworkAgent, PlayerAgent, RandomAgent

    if name == 'human':
        return PlayerAgent()
    elif name == 'dqn':
        if model is None:
            raise ValueError('A model file is required for a DQN agent.')
        return DeepQNetworkAgent(model=model, memory_size=-1, num_last_frames=4)
    elif name == 'random':
        return RandomAgent()

    raise KeyError('Unknown agent type: "{}"'.format(name))
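# Minimal usage sketch for create_agent (illustrative, not from the source):
# 'human' and 'random' need no model, while 'dqn' requires a pre-trained one.
human_agent = create_agent('human', None)
random_agent = create_agent('random', None)
# A DQN agent would need a loaded model object, e.g. (assuming a Keras model file):
# from keras.models import load_model
# dqn_agent = create_agent('dqn', load_model('dqn-snake.model'))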
def get_agent(name, properties):
    # `world` is assumed to be defined at module scope in the original source.
    if name == "random":
        return RandomAgent(world)
    if name == "alphabeta":
        return AlphaBetaAgent(world, properties)
    if name == "human":
        return HumanAgent(world)
    if name == "dls":
        return DLS(world)
    raise ValueError('Unknown agent type: "{}"'.format(name))
def build_agent(name):
    if name == "Random":
        return RandomAgent()
    elif name == "MCTS":
        return MCTSAgent()
    elif name == "AlphaZero":
        config = AlphaZeroConfig()
        network = AlphaZeroNetwork(config)
        agent = AlphaZeroAgent(config, network)
        agent.load_model("./pretrained/alphazero_model.pth")
        return agent
    else:
        raise RuntimeError('Unknown agent name: "{}"'.format(name))
def main(env_name, render=False):
    env = gym.make(env_name)

    # Initialize your agent here
    agent = RandomAgent(env)

    for episode_i in range(100000):
        state = env.reset()
        done = False
        while not done:
            if render and episode_i % 10 == 0:
                env.render()

            # Your agent's action goes here
            action = agent.act(state)

            state, reward, done, info = env.step(action)
def __init__(self):
    # Board parameters
    self.m = 7
    self.n = 6
    self.k = 5

    # Agent parameters. Specify different agent types here via constructor
    self.Xagent = RandomAgent(Square.X_HAS, self.m, self.n, self.k)
    self.Oagent = cnnAgent(Square.O_HAS, self.m, self.n, self.k)
    self.numGamesToEstimateValue = 5

    # Outermost loop parameters
    self.numGamesToTest = 10
    self.numGamesToTrain = 10
    self.verbose = False
def execute(self):
    for k in np.arange(self.max_runs):
        environment = KArmedTestbed(self.number_of_arms)
        agent = RandomAgent(self.number_of_arms)
        for i in np.arange(self.max_steps):
            current_state = environment.get_current_state()
            next_action = agent.get_action(current_state)
            reward = environment.do_action(next_action)
            # Incremental mean of the step-i reward across runs. Divide by the
            # number of runs seen so far; the original divided by the step
            # index, which mixes up the two loop counters.
            self.avg_reward[i] += (reward - self.avg_reward[i]) / (k + 1.0)
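# Sanity check (illustrative, not from the source) that the incremental-mean
# update used above, avg += (x - avg) / (count + 1), reproduces the ordinary
# arithmetic mean once all samples are folded in.
import numpy as np

samples = [1.0, 4.0, 4.0, 7.0]
avg = 0.0
for count, x in enumerate(samples):
    avg += (x - avg) / (count + 1.0)
assert np.isclose(avg, np.mean(samples))  # 4.0 either way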
def train(env, model, args):
    model.optim = torch.optim.Adam(islice(model.parameters(), 20), lr=0.0005)
    model.pos_optim = torch.optim.Adam(model.eval_mlp.parameters(), lr=0.0005)
    replay_buffer = MemoryBuffer(int(args.batch))
    agent = RandomAgent(env.action_spec())
    max_steps = args.num_steps
    env.reset()
    step = 0
    sub_trajectory = SubTrajectory(100)
    pbar = tqdm(total=max_steps)
    while step < max_steps:
        action = agent.step()
        # for _ in range(np.random.randint(1, 5)):
        rgb, pos, orientation = (env.observations()['RGB'],
                                 env.observations()['DEBUG.POS.TRANS'],
                                 env.observations()['DEBUG.POS.ROT'])
        reward = env.step(action)
        if not env.is_running():
            env.reset()
        else:
            new_rgb, new_pos, new_orientation = (env.observations()['RGB'],
                                                 env.observations()['DEBUG.POS.TRANS'],
                                                 env.observations()['DEBUG.POS.ROT'])
            if sub_trajectory.len == 100:
                tmp = copy.deepcopy(sub_trajectory)
                # Send initial belief to replay buffer
                o_0 = torch.from_numpy(tmp.new_rgb[0]).to(dtype=torch.float32).unsqueeze(0).to(device)
                a_0 = torch.from_numpy(tmp.action[0]).to(dtype=torch.float32).unsqueeze(0).to(device)
                z_0 = model.conv(o_0)
                bgru_input = torch.cat((z_0, a_0), dim=1)
                _, tmp.belief = model.belief_gru.gru1(torch.unsqueeze(bgru_input, 1))
                replay_buffer.add(tmp)
                sub_trajectory.clear()
            sub_trajectory.add(rgb, pos, orientation, action,
                               new_rgb, new_pos, new_orientation)

        # Train using replay_buffer
        if step >= args.batch * 100:
            train_batch = replay_buffer.sample(64)
            if None in train_batch:
                raise Exception("Training batch contains None object")
            model.update(train_batch)

        step += 1
        pbar.update(1)
    pbar.close()
def __init__(self, agent_type):
    # TODO:: Add agent selector
    if agent_type == 'random':
        self.agent = RandomAgent()
    elif agent_type == 'fixed-action':
        self.agent = FixedActionAgent()
    elif agent_type == 'a2c':
        self.agent = A2CAgent(num_steps=50, max_frames=1000)
    elif agent_type == 'a3c':
        self.agent = A3CAgent(num_envs=1, num_steps=50, max_frames=1000)
    else:
        status = {
            'status': 'ERROR',
            'error_msg': 'Not supported agent-type'
        }
        raise Exception(status)
def markovDecision(layout, circle):
    env = SnakesAndLadder(layout, circle)
    agent = RandomAgent(env.action_space)

    n_episodes = 50
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state
def simulate_random_agent(games):
    data = {
        "actions": [],
        "rewards": np.zeros(NUM_STEPS)
    }
    for g in tqdm(range(NUM_GAMES), desc="Random Agent"):
        agent = RandomAgent(NUM_ARMS, NUM_STEPS)
        game = games[g]
        actions, rewards = agent.play(game)
        data["actions"].extend(actions)
        data["rewards"] += rewards
    # Convert the summed rewards to the average reward per step.
    data["rewards"] /= NUM_GAMES
    return data
def __init__(self, token, agent_type):
    self.token = token
    self.remote_base = "http://grader.crowdai.org:1729"
    self.client = Client(self.remote_base)

    # TODO:: Add agent selector
    if agent_type == 'random':
        self.agent = RandomAgent()
    elif agent_type == 'fixed-action':
        self.agent = FixedActionAgent()
    elif agent_type == 'a3c':
        self.agent = A3CAgent(num_envs=2, num_steps=50, max_frames=1000)
    else:
        status = {
            'status': 'ERROR',
            'error_msg': 'Not supported agent-type'
        }
        raise Exception(status)
def test_random():
    agent = RandomAgent()
    env = Atari()
    for i_episode in range(10):
        episode_reward = 0
        state = env.reset()
        for _ in range(10000):
            env.env.render()
            action = agent.evaluate_action(state)
            next_state, reward, done, _ = env.step(action.item())
            state = next_state
            episode_reward += reward
            if done:
                print("Episode: %6d, interaction_steps: %6d, reward: %2d, epsilon: %f"
                      % (i_episode, agent.interaction_steps, episode_reward, 1.0))
                break
def plot_cost(layout, circle, legend=True, title=None):
    agents = [
        OptimalAgent(layout, circle),
        ConstantAgent(SECURITY_DICE),
        ConstantAgent(NORMAL_DICE),
        ConstantAgent(RISKY_DICE),
        RandomAgent([SECURITY_DICE, NORMAL_DICE, RISKY_DICE])
    ]
    labels = [
        "Optimal",
        "Security dice",
        "Normal dice",
        "Risky dice",
        "Random"
    ]
    costs = [
        estimate_cost(layout, circle, pi, n_episodes=1000) for pi in agents
    ]

    states = np.arange(14) + 1
    plt.figure(figsize=(6, 4), dpi=120)
    ax = plt.gca()

    C_th, _ = markovDecision(layout, circle)
    plt.plot(states, C_th, ls='--', color=colors[0], lw=1.8)

    for i, C in enumerate(costs):
        plt.plot(states, C, marker="o", color=colors[i], lw=1.8, label=labels[i])

    plt.xlabel("States")
    plt.ylabel("Total expected cost")
    if legend:
        plt.legend()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    plt.title(title)
    plt.show()
def main(view=False, result=False):
    player1 = Human()
    player2 = RandomAgent()
    env = TicTocToe()
    state = env.setGame()
    done = False
    turn = FIRST
    try:
        while not done:
            if view:
                viewBoard(state)
            if turn == FIRST:
                action = player1.action(state)
            else:
                action = player2.action(state)
            state, info = env.step(divmod(action, 3))
            done = (info == ENDGAME)
            turn *= -1
            time.sleep(0.1)
        if env.winner:
            name = "player1" if env.winner == FIRST else "player2"
            print(name + " win!")
        else:
            print("It's a draw.")
        if result:
            print("--- result ----------------")
            state.show()
    except KeyboardInterrupt:
        print("Exiting.")
    except:
        import traceback
        traceback.print_exc()
        # Declare the other player the winner.
        name = "player1" if turn != FIRST else "player2"
        print(name + " win!")
def __init__(self, mode: str):
    super().__init__()
    self.state = np.full((rows, cols), 0)
    self.blocks = np.full((rows, cols), None)
    self.cr = -1
    self.cc = -1
    self.mode = mode
    self.player = 1
    if randint(0, 1) == 0:
        self.player = -1
    self.takeInput = True
    if mode == Game.HUMAN_MODE and self.player == 1:
        self.takeInput = False
    self.ai_agent = None
    self.random_agent = None
    #print(self.mode)
    if not (self.mode == Game.HUMAN_HUMAN_MODE):
        self.ai_agent = Agent(rows, cols)
    if self.mode == Game.AI_MODE:
        print("init")
        self.random_agent = RandomAgent(rows, cols)
        print("inited")
    self.initUI()
def __init__(self, n_players, agents=None, card_set='random', verbose=False):
    '''Initialize a new game, with n players.

    Args:
        n_players (int): Number of players in this game. Must be between
            2 and 4.
        agents (dict): Contains the agents who will play in this game. The
            dict key is used as the player_id. If n_players is greater than
            the number of agents provided, RandomAgents will be used for
            the remaining players.
        card_set (str): Indicates which pre-specified card set to use.
            Options are 'random' and 'base'. Default: 'random'.
        verbose (bool): Indicates whether to print game state as actions
            take place. Default: False.
    '''
    if agents is None:
        agents = {}
    assert n_players >= 2 and n_players <= 4, "n_players must be between 2 and 4"
    assert len(agents) <= n_players, "must not have more agents than n_players"

    self.n_players = n_players
    self.card_set = card_set
    self.verbose = verbose

    players = []
    for player_id, agent in six.iteritems(agents):
        players.append(Player(player_id=player_id, agent=agent))
    for n in range(len(agents), self.n_players):
        players.append(Player(player_id='Player ' + str(n), agent=RandomAgent()))
    self.players = players

    self.supply_piles = SupplyPiles(n_players=self.n_players,
                                    card_set=self.card_set)
    if self.verbose:
        self.supply_piles.display_supply_pile_count()
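# Hypothetical usage sketch, assuming the enclosing class is named Game:
# two named agents are supplied, and the remaining two seats are filled
# automatically with RandomAgents named 'Player 2' and 'Player 3'.
game = Game(n_players=4,
            agents={'Alice': RandomAgent(), 'Bob': RandomAgent()},
            card_set='base',
            verbose=True)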
import numpy as np
import gym

from agent import RandomAgent
from benchmarker import GameBenchmarker

if __name__ == '__main__':
    env = gym.make("Pong-v0")
    num_actions = env.action_space.n
    agent = RandomAgent(num_actions)
    gb = GameBenchmarker(env, 10, render=True)
    mean_rew = gb.benchmark_agent(agent)
    print(mean_rew)
def main(args):
    parser = ArgumentParser(description='')
    parser.add_argument('--height', type=int, required=True,
                        help='Height of the map')
    parser.add_argument('--width', type=int, required=True,
                        help='Width of the map')
    parser.add_argument('--num-powerups', type=int, required=True,
                        help='Number of powerups to put in the game map')
    parser.add_argument('--num-monsters', type=int, required=True,
                        help='Number of monsters to put in the game map')
    parser.add_argument('--initial-strength', default=100, type=int,
                        help='Initial strength of each agent')
    parser.add_argument('--save-dir', type=str,
                        help='Save directory for saving the map')
    parser.add_argument('--map-file', type=str,
                        help='Path to the map JSON file')
    parser.add_argument('--play-against-human', action='store_true',
                        help='Whether to have a Human player as one of the '
                             'agents in the game')
    parser.add_argument('--play-against-seekers', action='store_true',
                        help='Whether to have a Seeker player as one of the '
                             'agents in the game')
    parser.add_argument('--show-map', action='store_true',
                        help='Whether to display the map in the terminal')
    parser.add_argument('--map-type', choices=MAP_TYPES, default='ascii',
                        help='Select map type. Choices are {' +
                             ', '.join(MAP_TYPES) + '}')
    parser.add_argument('--verbose', action='store_true',
                        help='Whether to be verbose when playing game')

    args = parser.parse_args(args)

    # TODO: Change how agents are populated
    agent = RandomAgent(args.height, args.width, args.initial_strength)
    agents = [agent]

    if args.play_against_human:
        human = HumanAgent(args.height, args.width, args.initial_strength)
        agents.append(human)

    # Play against seekers
    if args.play_against_seekers:
        seeker = SeekerAgent(args.height, args.width, args.initial_strength)
        agents.append(seeker)

    # Build the game driver once, after the agent list is final (the original
    # constructed it a second time when seekers were enabled).
    game_driver = GameDriver(
        height=args.height, width=args.width,
        num_powerups=args.num_powerups,
        num_monsters=args.num_monsters,
        agents=agents,
        initial_strength=args.initial_strength,
        show_map=args.show_map, map_type=args.map_type,
        save_dir=args.save_dir, map_file=args.map_file)

    print('Starting game')
    game_driver.play(verbose=args.verbose)
import time

import gym

from agent import RandomAgent

env = gym.make("CartPole-v1")
agent = RandomAgent(env.action_space)

episode_count = 10
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, info = env.step(action)
        if done:
            print("Game Finished!")
            break
        env.render()
        time.sleep(1 / 30)

env.close()
def play_random(params_path):
    Parameters.load(params_path)
    environment = Environment()
    agent = RandomAgent(environment)
    all_scores = agent.play()
    print('mean: ', np.mean(all_scores), '\tstd: ', np.std(all_scores))
parser.add_argument('--gamma', default=0.99, type=float,
                    help='discount factor')
args = parser.parse_args()

# set seed
random.seed(args.seed)
np.random.seed(args.seed)

env = MazeEnv(args, args.game_name, args.graph_param, args.game_len, args.gamma)

# agent
if args.agent == 'random':
    agent = RandomAgent(args, env)

NUM_GRAPH = 100
NUM_ITER = 32
ep_rews = []
for graph_id in range(NUM_GRAPH):
    for _ in range(NUM_ITER):
        ep_rew = 0
        state, info = env.reset(graph_index=graph_id)
        done = False
        while not done:
            action = agent.act(state)
            state, rew, done, info = env.step(action)
            ep_rew += rew
        ep_rews.append(ep_rew)
batch_size = 32
num_steps = 2

# ##### Initialization for learning Agent
# env = environment.NYCTaxiEnv(data_months=data_months)
# brain = LearningAgent(gamma, epsilon, alpha, maxMemorySize, epsEnd,
#                       action_space, replace)
# start = time.time()
# initialize(env, brain)
# end = time.time()
# print('%d seconds passed for initialization' % (end - start))
# pickle.dump(brain, open('brain_init.pickle', 'wb'))

# ##### Training Learning Agent
# scores, steps, epsHistory = train_agent(env, brain, numGames, batch_size, num_steps)
# brain.saveModelCheckpoint('./../output/')
# brain.saveModelState('./../output/')
# filename = str(numGames) + 'Games' + 'Gamma' + str(brain.GAMMA) + \
#            'Alpha' + str(brain.ALPHA) + 'Memory' + str(brain.memSize)
# df = pd.DataFrame({'scores': scores, 'steps': steps, 'epsHistory': epsHistory})
# df.to_csv(output_dir + 'LearningAgent' + filename + '.csv', index=False)

##### Random agent
env = environment.NYCTaxiEnv(data_months=data_months)
randombrain = RandomAgent(maxMemorySize=1, action_space=action_space)
numGames = 100
scores, steps = test_agent(env, randombrain, numGames)
fileName = str(numGames) + 'Games'
df = pd.DataFrame({'scores': scores, 'steps': steps})
df.to_csv(output_dir + 'RandomAgent' + fileName + '.csv', index=False)
def game_mngr():
    """ Game manager, used for navigation among different choices offered
    to user. """
    # Options
    command = options('PLAY', 'RULES', 'Tap 1 to play or 2 to read the rules')

    # Rules page
    if int(command) == 2:
        print_rules()
        # Go back
        print('Tap 1 to come back to the main menu\n')
        comeback = tap_valid_digits([1])
        if int(comeback):
            game_mngr()

    # Game page
    if int(command) == 1:
        # Options
        players = options('PLAYER', 'PLAYERS', 'How many players ?',
                          comeback=True)

        # Go back
        if int(players) == 0:
            game_mngr()

        # 2 players
        if int(players) == 2:
            # Ask players' name
            player1, player2 = input_names(n_players=2)
            # Init scores
            scores = [0, 0]
            # Games
            tapnswap = TapnSwap()
            over = False
            while not over:
                game_over, winner = game_1vs1(tapnswap, player1, player2)
                scores[winner] += 1
                if game_over:
                    # Display scores
                    restart = display_endgame(scores, player1, player2)
                    # Go back
                    if not restart:
                        over = True
                        game_mngr()

        # 1 player
        if int(players) == 1:
            # Options
            level = options('EASY', 'DIFFICULT', 'Which level ?', comeback=True)
            # Go back
            if int(level) == 0:
                game_mngr()
            # Define agent
            elif int(level) == 1:
                agent = RandomAgent()  # easy
            else:
                # Load agent
                agent = RLAgent()
                agent.load_model('greedy0_2_vsRandomvsSelf')  # difficult
            # Ask player's name
            player = input_names(n_players=1)
            # Init scores
            scores = [0, 0]
            # Games
            tapnswap = TapnSwap()
            over = False
            while not over:
                game_over, winner = game_1vsAgent(tapnswap, player, agent,
                                                  greedy=False)
                scores[winner] += 1
                if game_over:
                    # Display scores
                    restart = display_endgame(scores, player, 'Computer')
                    # Go back
                    if not restart:
                        over = True
                        game_mngr()
from othello import Othello
from agent import Agent, RandomAgent, GreedyAgent, DullAgent, Human
from MCTS import MCTSAgent

if __name__ == '__main__':
    agent1 = MCTSAgent(1000)
    agents = [RandomAgent(), GreedyAgent(), DullAgent(), MCTSAgent(200)]
    for agent2 in agents:
        print('========')
        print(agent1, 'vs', agent2)
        black_win = 0
        white_win = 0
        tie = 0
        for _ in range(100):
            game = Othello(agent1, agent2, print_mode=False)
            winner = game.play()
            if winner == 1:
                black_win += 1
            elif winner == -1:
                white_win += 1
            elif winner == 0:
                tie += 1
        print(agent1, black_win)
        print(agent2, white_win)
        print('Tie', tie)
        print()
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6, epsilon_schedule, OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000), OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6, LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR, max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0    # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and \
                        frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #    LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
                print "Episode reward: {}".format(episode_reward)

            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
def trainAndTestCNNAgent(settings):
    ourHero = cnnAgent(Square.X_HAS, settings.m, settings.n, settings.k)
    trainingPartner = SearchAgent(Square.O_HAS, settings.m, settings.n, settings.k)
    gauntlet1 = RandomAgent(Square.O_HAS, settings.m, settings.n, settings.k)
    gauntlet2 = SearchAgent(
        Square.O_HAS, settings.m, settings.n, settings.k
    )  # FIXME but add more search (settings specifying that parameter globally is bad)
    theGauntlet = [gauntlet1, gauntlet2]

    trainingSessions = 100

    print("****Testing ", settings.numGamesToTest,
          " games of agents looking for sequences of length k=", settings.k,
          " using ", settings.numGamesToEstimateValue, " games to estimate value")
    print("Agents: X: ", ourHero)
    print(" and O: ", trainingPartner)
    print("\tTestSession\tHeroWins\tHeroLoses\tHeroDraws\tAvgIllegalMoves\tAvgGameLength\tMaxGameLength\tOpponent\t(REPEAT)")

    for session in range(trainingSessions):
        for i in range(settings.numGamesToTrain):
            # new game, create a fresh board
            if settings.verbose:
                print("Creating a M x N board, where m =", settings.m,
                      " and n=", settings.n, "\n")
            board = Board(settings.m, settings.n)

            # play the game, taking turns being first to act
            if i % 2 == 0:
                MNKGame().playGame(board, ourHero, trainingPartner, settings)
            else:
                MNKGame().playGame(board, trainingPartner, ourHero, settings)

        # test vs the gauntlet
        for opponent in theGauntlet:
            heroWins = 0
            heroLoses = 0
            heroDraws = 0
            totalIllegalMoves = 0
            maxGameLength = 0
            totalGameLength = 0
            for j in range(settings.numGamesToTest):
                # new game, create a fresh board
                if settings.verbose:
                    print("Creating a M x N board, where m =", settings.m,
                          " and n=", settings.n, "\n")
                board = Board(settings.m, settings.n)

                # play the game, taking turns being first to act
                if j % 2 == 0:
                    winner, moveCount, illegalMoveCount = MNKGame().playGame(
                        board, ourHero, opponent, settings)
                else:
                    winner, moveCount, illegalMoveCount = MNKGame().playGame(
                        board, opponent, ourHero, settings)

                # do the bookkeeping now that a result is obtained
                totalIllegalMoves += illegalMoveCount
                totalGameLength += moveCount
                if moveCount > maxGameLength:
                    maxGameLength = moveCount
                if winner == ourHero:
                    heroWins += 1
                    if settings.verbose:
                        print("X emerges victorious over the vile O!!!!! in game ", j)
                elif winner is not None:
                    heroLoses += 1
                    if settings.verbose:
                        print("O has defeated the disgusting X!!!!! in game ", j)
                else:
                    heroDraws += 1
                    if settings.verbose:
                        print("fought to a draw... maybe next time. In game ", j)

            # All games vs this opponent complete, generate some final output
            print("\t", session, "\t", heroWins, "\t", heroLoses, "\t", heroDraws,
                  "\t", float(totalIllegalMoves) / settings.numGamesToTest,
                  "\t", float(totalGameLength) / settings.numGamesToTest,
                  "\t", maxGameLength, "\t", opponent, end='')
            print("")
def game_2Agents(agent1, agent2, start_idx=-1, train=True, time_limit=None,
                 n_games_test=0, play_checkpoint_usr=False, verbose=False):
    """ Manages a game between 2 agents (agent1, agent2), potentially
    time-limited, with the possibility to train them, to confront one of them
    through a game with the user before the game between the 2 agents, and to
    test one of them through several games against a Random Agent after the
    game between the 2 agents.

    Parameters
    ----------
    agent1, agent2: instances of Agent
        Agents involved in the game.
    start_idx: -1, 0 or 1
        Index of the agent that starts the game (0: agent1, 1: agent2,
        -1: random).
    train: boolean
        Set to True to train both agents.
    time_limit: int
        Maximum number of rounds between the 2 agents (possibility of loops
        with optimal actions). Avoid setting it to 0 in case of identical
        agents.
    n_games_test: int
        Number of games between agent1 and a Random Agent following the game
        between agent1 and agent2.
    play_checkpoint_usr: boolean
        Set to True for a game between the user and agent1 preceding the
        game between agent1 and agent2.
    verbose: boolean
        Set to True for a written explanation of each round.

    Return
    ------
    game_over: boolean.
    winner: index of winner agent (0: agent1, 1: agent2, -1: tie).
    test_results: list of int
        Only filled if n_games_test > 0 (otherwise an empty list by default).
        If n_games_test > 0:
          * test_results[0]: number of finished games.
          * test_results[1]: number of games = n_games_test.
          * test_results[2]: score of agent1.
          * test_results[3]: score of Random Agent.
    """
    tapnswap = TapnSwap()
    tapnswap.reset()

    # Time of pause between several actions (if verbose)
    delay = 2

    # Preliminary game with the user
    if play_checkpoint_usr:
        game_1vsAgent(tapnswap, 'test player', agent1, greedy=False)
        tapnswap.reset()

    # Select starting player
    if start_idx == -1:
        np.random.seed()
        player_idx = np.random.randint(0, 2)
    else:
        assert start_idx == 0 or start_idx == 1, \
            'The starting agent index must be 0, 1 or -1.'
        player_idx = start_idx

    agents = [agent1, agent2]
    names = ['Agent1', 'Agent2']
    count_rounds = 0
    prev_state = []
    prev_action = []

    # Start game
    game_over = False
    while not game_over:

        if verbose:
            # Print current configuration
            show_score(tapnswap, names, 1 - player_idx, invert=False)
            time.sleep(delay)

        # Get current state
        hands = tapnswap.show_hands().copy()
        state = [hands[player_idx], hands[1 - player_idx]]

        # Choose action
        actions = tapnswap.list_actions(player_idx)
        action = agents[player_idx].choose_action(state, actions, greedy=train)

        # Take action and get reward
        reward = tapnswap.take_action(player_idx, action)

        if verbose:
            # Print chosen action
            seq = str(names[player_idx])
            if action[0] == 0:
                seq = seq + str(' tapped with ' +
                                str(hands[player_idx, action[1]]) + ' on ' +
                                str(hands[1 - player_idx, action[2]]))
            else:
                new_hands = tapnswap.show_hands().copy()
                seq = seq + str(' swapped ' + str(hands[player_idx][0]) + '-' +
                                str(hands[player_idx][1]) + ' for ' +
                                str(new_hands[player_idx][0]) + '-' +
                                str(new_hands[player_idx][1]))
            print(seq)
            time.sleep(delay)
            print()
            # Print new configuration
            show_score(tapnswap, names, player_idx)
            time.sleep(delay)
            # Print corresponding reward
            print('Reward of ', names[player_idx], ' : ', reward)
            time.sleep(delay)
            print('----------------------------')

        game_over, winner = tapnswap.game_over()

        # Training
        if train:
            # Get new state
            next_hands = tapnswap.show_hands().copy()
            next_state = [next_hands[player_idx], next_hands[1 - player_idx]]

            # Train playing agent for a winning move
            if game_over:
                agents[player_idx].update_Q(state, action, reward, next_state)

            # Train waiting agent (response of the environment)
            if count_rounds:
                # New state from the other agent's point of view
                inv_next_state = [
                    next_hands[1 - player_idx], next_hands[player_idx]
                ]
                # Each waiting agent receives the transition with the
                # response of the environment for the new state
                agents[1 - player_idx].update_Q(prev_state, prev_action,
                                                -reward, inv_next_state)

            # Keep in memory previous state and action
            prev_state = state
            prev_action = action

        # Avoid loops
        if time_limit is not None:
            if count_rounds > time_limit and not game_over:
                game_over = True
                winner = -1

        # Next round
        player_idx = 1 - player_idx
        count_rounds += 1

    # Test of agent1
    test_results = []
    if bool(n_games_test):
        random_agent = RandomAgent()
        test_results = compare_agents(agent1, random_agent,
                                      n_games=n_games_test, time_limit=None,
                                      verbose=False)

    return game_over, winner, test_results
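# Hedged usage sketch for game_2Agents: an RLAgent (constructed as in train()
# below) plays a RandomAgent with training disabled, a random starter, a round
# cap of 200 (an arbitrary illustrative value), and a 100-game test against a
# fresh Random Agent afterwards.
game_over, winner, test_results = game_2Agents(
    RLAgent(0.2, 0.9), RandomAgent(),
    start_idx=-1, train=False, time_limit=200, n_games_test=100)
print('winner:', winner, '| agent1 test score:', test_results[2])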
def train(n_epochs, epsilon, gamma, load_model, filename, random_opponent,
          n_games_test, freq_test, n_skip_games=int(0), verbose=False):
    """ Train 2 agents by making them play and learn together. Save the
    learned Q-function into a CSV file. It is possible to confront one of the
    agents (against either the user or a Random Agent) during training, as
    often as one wants. It is also possible to train an already trained model.

    Parameters
    ----------
    n_epochs: int
        Number of games used for training.
    epsilon: float (in [0,1])
        Fraction of greedy decisions during training of the 2 RL Agents.
    gamma: float (in [0,1])
        Factor of significance of first actions over last ones for the 2 RL
        Agents.
    load_model: string
        CSV filename in which is stored the learned Q-function of an agent.
        If load_model = 'model', the function loads the model
        './Models/model.csv'. If load_model is not None, the previous
        parameters epsilon and gamma are used for a second training.
    filename: string
        Name of the CSV file that will store the learned Q-function of one
        of the agents. The path to the CSV file is then ./Models/filename.csv.
        The counter of state-action pairs is also stored at
        ./Models/data/count_filename.csv for future training.
    random_opponent: boolean
        If set to True, the function trains 1 RL Agent by making it play
        against a Random Agent. Otherwise, the RL Agent is trained by playing
        against another version of itself.
    n_games_test: int
        Number of games one of the RL Agents plays against a Random Agent for
        testing. If set to 0, the RL Agents will not be tested by a Random
        Agent.
    freq_test: int
        Number of epochs after which one of the RL Agents plays n_games_test
        games against a Random Agent. If set to 1000, every 1000 epochs of
        training, one of the RL Agents is tested against a Random Agent. If
        set to 0, the test occurs at the last epoch of training only. If set
        to -1, none of the agents is tested during training.
    n_skip_games: int
        Number of epochs after which the user can choose to play against one
        of the learning agents. If set to 1000, every 1000 games, the user
        can choose to play against one agent. If set to 0, the user can
        choose to play against one agent at the last epoch only. If set to
        -1, no choice is offered and the user cannot test any agent.
    verbose: boolean
        If set to True, each game action during training has a written
        explanation.

    Return
    ------
    learning_results: list
        Only significant with n_games_test > 0 (otherwise, an empty list by
        default). List of the test results against a Random Agent gathered
        every freq_test epochs. Each test result is a list:
        [current epoch, score of RL Agent, number of finished games,
        n_games_test].
    """
    # Learning agent
    agent1 = RLAgent(epsilon, gamma)
    if load_model is not None:
        agent1.load_model(load_model)

    # Choose opponent
    if random_opponent:
        agent2 = RandomAgent()
        time_limit = None
        print('Training vs Random')
    else:
        agent2 = RLAgent(epsilon, gamma)
        if load_model is not None:
            agent2.load_model(load_model)
        time_limit = None
        print('Training vs Self')

    start_idx = 0
    scores = [0, 0]

    # If the user only confronts the agent at the last epoch
    # or if there is no confrontation
    if n_skip_games in [-1, 0]:
        n_skip_games = n_epochs - n_skip_games

    # Boolean for a game between the user and agent1 preceding a game
    # between agent1 and agent2
    play_checkpoint_usr = False

    # If there is a test of agent1 at the last epoch only or no test
    if freq_test in [-1, 0]:
        freq_test = n_epochs - freq_test

    # Number of games between agent1 and a Random Agent for testing
    n_games_test_mem = n_games_test

    learning_results = []

    # Start training
    print('Training epoch:')
    for epoch in range(1, n_epochs + 1):

        if epoch % (n_epochs // 10) == 0:
            print(epoch, '/', n_epochs)

        # Update boolean for playing with user
        play_checkpoint_usr = bool(epoch % n_skip_games == 0)
        if play_checkpoint_usr:
            # Print training status
            print('Number of games: ', epoch)
            print('Scores: ', scores)
            # Ask user to play
            play = int(input('Play ? (1 Yes | 0 No)\n'))
            play_checkpoint_usr = bool(play)

        # Update boolean for test
        n_games_test = int(epoch % freq_test == 0) * n_games_test_mem

        # Start game
        game_over, winner, test_results = game_2Agents(
            agent1, agent2,
            start_idx=start_idx,
            train=True,
            time_limit=time_limit,
            n_games_test=n_games_test,
            play_checkpoint_usr=play_checkpoint_usr,
            verbose=verbose)

        assert game_over, str('Game not over but new game' +
                              ' beginning during training')
        if winner in [0, 1]:
            scores[winner] += 1

        # Save test games of agent1 against a Random Agent
        if bool(n_games_test):
            assert len(test_results) != 0, \
                'Agent1 has been tested but there is no result of that.'
            learning_results.append(
                [epoch, test_results[2], test_results[0], test_results[1]])

        # Next round
        start_idx = 1 - start_idx

    # Save Q-function of agent1
    np.savetxt(str('Models/' + filename + '.csv'), agent1.Q, delimiter=',')
    # Save stats for learning rate of agent1
    np.savetxt(str('Models/data/count_' + filename + '.csv'),
               agent1.count_state_action, delimiter=',')

    return learning_results