def init_agents(sess, info_state_size, num_actions, dqn_kwargs, a2c_kwargs):
    policy_module = PolicyGradient(sess, 0, info_state_size ** 0.5, num_actions, **a2c_kwargs)
    rollout_module = PolicyGradient(sess, 0, info_state_size ** 0.5, num_actions, **a2c_kwargs)
    # rollout_module = DQN(sess, 0, info_state_size, num_actions, **dqn_kwargs)
    sess.run(tf.global_variables_initializer())
    policy_module.restore("../used_model/a2c_CNN/602704")
    # rollout_module.restore("../used_model/38000")

    # Copy the policy network's weights into the rollout network
    restore_agent_op = tf.group([
        tf.assign(rollout_v, policy_v)
        for (rollout_v, policy_v) in zip(rollout_module.variable_list, policy_module.variable_list)
    ])
    sess.run(restore_agent_op)

    # TODO: load parameters
    agents = [
        MCTSAgent(policy_module, rollout_module, playout_depth=FLAGS.pd, n_playout=FLAGS.np),
        MCTSAgent(None, None),
    ]
    logging.info("MCTS INIT OK!!")
    return agents

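# Usage sketch (not in the original source): init_agents() above expects an explicit
# TF1 session. A caller might look roughly like the following; the state size, action
# count, and empty kwargs dicts are illustrative assumptions, not values from the code.
import tensorflow as tf

with tf.Session() as sess:
    agents = init_agents(sess, info_state_size=8 * 8, num_actions=4,
                         dqn_kwargs={}, a2c_kwargs={})
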
def main():
    # agents = [RandomAgent(), MCTSAgent(timeLimit=1000)]
    # currentPlayer is either 1 or -1, so index 0 is ignored; the player at index 1 moves first.
    agents = [RandomAgent(), MCTSAgent(timeLimit=1000), MCTSAgent(timeLimit=1000)]
    game = Game()
    done = False
    while not done:
        next_action = agents[game.currentPlayer].choose_action(game)
        next_state, value, done, info = game.step(next_action)
        game.gameState.get_visual_state()
    # Whoever's turn it is when the game loop terminates has lost.
    print('Reward: {}'.format(game.currentPlayer * -1))

def run(match_num, iteration_limit, mcts_process_num, result_list=None, process_id=None, render=False):
    """
    Run matches between one MCTS agent and three simple agents.

    :param match_num: The number of matches to play
    :param iteration_limit: The maximum number of MCTS iterations per move
    :param mcts_process_num: The number of processes used by MCTS
    :param result_list: A list used to record results
    :param process_id: The process ID given when running under multiprocessing
    :param render: Whether to render the game
    :return: The list of (dead_agents, survivors) tuples, one per episode
    """
    if mcts_process_num == 1:
        mcts_process_num = None
    agent_list = [
        MCTSAgent([agents.SimpleAgent for _ in range(3)], iteration_limit, process_count=mcts_process_num),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
    ]
    env = pommerman.make('PommeFFACompetition-v0', agent_list)
    for i_episode in range(match_num):
        state = env.reset()
        done = False
        initial_agents = state[0]['alive']
        survivors = initial_agents
        dead_agents = []
        while not done:
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            survivors = state[0]['alive']
            # Record the order in which agents die
            for agent in initial_agents:
                if agent not in survivors and agent not in dead_agents:
                    dead_agents.append(agent)
        if process_id is not None:
            print('[Process %d, Episode %d] Dead order:' % (process_id, i_episode),
                  str(dead_agents), 'Survivors:', survivors)
        else:
            print('[Episode %d] Dead order:' % i_episode, str(dead_agents), 'Survivors:', survivors)
        if result_list is None:
            result_list = []
        result_list.append((dead_agents, survivors))
    env.close()
    return result_list

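# Driver sketch (not part of the original snippet): the result_list and process_id
# parameters of run() suggest it is meant to be launched from several worker processes.
# Below is one way that could look, using a Manager list so every worker can append its
# (dead_agents, survivors) tuples; the worker count and per-worker match split are
# illustrative assumptions.
from multiprocessing import Manager, Process

def run_parallel(total_matches=8, workers=4, iteration_limit=200):
    manager = Manager()
    results = manager.list()  # shared across worker processes
    per_worker = total_matches // workers
    procs = [
        # mcts_process_num=1 keeps MCTS single-threaded inside each worker
        Process(target=run, args=(per_worker, iteration_limit, 1, results, pid))
        for pid in range(workers)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    return list(results)
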
def train(self):
    red_agent = MCTSAgent(num_simulations=50)
    yellow_agent = MCTSAgent(num_simulations=50)
    trace = []
    while True:
        cur_state = yellow_agent.state.bitPack()
        turn = yellow_agent.state.turn
        if turn == Circle.RED:
            move = red_agent.play_move()
            if move is None:
                return
            yellow_agent.play_opponent_move(move)
        else:
            move = yellow_agent.play_move()
            if move is None:
                return
            red_agent.play_opponent_move(move)
        trace.append((cur_state, move))
        winner = yellow_agent.state.check_winner()
        if winner is not None:
            self.lambda_learn(trace, winner, red_agent, yellow_agent)
            yellow_agent.state.printBoard()
            break

def get_batched_greedy_actions(self, batched_env_states, env):
    self.eval()
    agent_id = env.training_agent
    # Preserve the existing state of env
    saved_state = env.get_json_info()
    batched_greedy_actions = []
    for env_states in batched_env_states:
        greedy_actions = np.empty(len(env_states), dtype=int)
        for i, env_state in enumerate(env_states):
            next_states = np.empty((self.num_actions, self.in_channels, self.board_size, self.board_size))
            MCTSAgent.set_state(env, env_state)
            obs = env.get_observations()
            actions = env.act(obs)
            actions.insert(agent_id, None)
            terminal = np.full(self.num_actions, False)
            terminal_rewards = np.empty(self.num_actions)
            # Try each action of the training agent from the same starting state
            for a in range(self.num_actions):
                actions[env.training_agent] = a
                obs, rewards, done, _ = env.step(actions)
                if done:
                    terminal[a] = True
                    terminal_rewards[a] = rewards[agent_id]
                state, _ = MCTSAgent.state_space_converter(obs[agent_id])
                next_states[a] = state
                MCTSAgent.set_state(env, env_state)
            if terminal.all():
                vals = terminal_rewards
            else:
                _, vals = self(next_states)
                vals = vals.detach().numpy()[:, 0]
            # Replace vals with the reward where the action was terminal
            vals[terminal] = terminal_rewards[terminal]
            greedy_actions[i] = np.argmax(vals)
        batched_greedy_actions.append(greedy_actions)
    # Restore the state from before evaluation
    MCTSAgent.set_state(env, saved_state)
    return batched_greedy_actions

def arena():
    global better
    prev_graph = tf.Graph()
    prev_sess = tf.Session(graph=prev_graph)
    if not better:
        prev = "treesup"
    else:
        prev = "dualdagger"
    with prev_sess.as_default():
        with prev_graph.as_default():
            prev_agent = MCTSAgent(prev_sess, prev)
    stat = np.zeros(shape=(2, 2), dtype=np.int)
    for i in range(NUM_TEST_GAMES):
        agent.refresh()
        prev_agent.refresh()
        s = State()
        prev_is_black = (i % 2 == 0)
        while not s.end and len(s.history) < 225:
            if prev_is_black == (s.player > 0):
                with prev_sess.as_default():
                    s.move(*get_random_mcts_action(prev_agent, s))
            else:
                with current_sess.as_default():
                    s.move(*get_random_mcts_action(agent, s))
            agent.update(s)
            prev_agent.update(s)
        if len(s.history) == 225:
            print("UNBELIEVABLE EVEN!")
        sys.stdout.write("x")
        sys.stdout.flush()
        stat[int(prev_is_black), int(s.player > 0)] += 1
    win_rate = (stat[0, 1] + stat[1, 0]) / stat.sum()
    print("\nwin_rate is %.02f" % win_rate)
    if win_rate > .5:
        better = True
        agent.save(iteration)
        print("new model %d saved" % iteration)
    else:
        agent.restore(prev)
        print("old model restored")
    prev_sess.close()

parser.add_argument("--multiplier2", "-m2", type=float) parser.add_argument("--chkpnt1", "-c1", type=int) parser.add_argument("--chkpnt2", "-c2", type=int) parser.add_argument("--num_games", "-n", default=100, type=int) parser.add_argument('--save', '-s', action='store_true') args = parser.parse_args() a_graph = tf.Graph() b_graph = tf.Graph() a_sess = tf.Session(graph=a_graph) b_sess = tf.Session(graph=b_graph) with a_sess.as_default(): with a_graph.as_default(): a_agent = MCTSAgent(a_sess, args.model_name_1, chkpnt=args.chkpnt1, epsilon=args.epsilon1, multiplier=args.multiplier1) with b_sess.as_default(): with b_graph.as_default(): b_agent = MCTSAgent(b_sess, args.model_name_2, chkpnt=args.chkpnt2, epsilon=args.epsilon2, multiplier=args.multiplier2) stdout.write( "A(e=%.01f, m=%.01f) vs B(e=%.01f, m=%.01f) = " % (args.epsilon1, args.multiplier1, args.epsilon2, args.multiplier2)) stdout.flush() stat = np.zeros(shape=(2, 2), dtype=np.int)
def main():
    '''Simple function to bootstrap a game.

    Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    episodes = 2
    # Accumulate wins across all episodes
    wins = np.zeros(4)
    win_str = 'winners'

    for i_episode in range(episodes):
        # Create a set of agents (exactly four)
        agent_list = [
            agents.SimpleAgent(),
            agents.SimpleAgent(),
            agents.SimpleAgent(),
            agents.SimpleAgent(),
            # agents.DockerAgent("pommerman/simple-agent", port=12345),
        ]
        learner_index = randint(0, 3)
        print(learner_index)
        agent_list[learner_index] = MCTSAgent()  # ExtractedStateAgent('extract', 0, 0, 0)
        # agent_list[learner_index] = RandomForestAgent()
        # agent_list[learner_index] = SnorkelAgent()
        agent_list[learner_index].set_agent_id(learner_index)
        # agent_list[learner_index].epsilon = 0

        # Make the "Free-For-All" environment using the agent list
        env = pommerman.make('PommeFFACompetition-v0', agent_list)
        # env.set_training_agent(learner_index)

        # Run the episode just like OpenAI Gym
        obs = env.reset()
        state = env.get_json_info()
        agent_list[learner_index].set_state(state)
        done = False
        steps = 0
        while not done and steps < 500:
            steps += 1
            # env.render()
            # actions = env.act(state)
            actions = env.act(obs)
            # action = agent_list[learner_index].search(state)
            # actions.insert(learner_index, action)
            obs, step_reward, done, info = env.step(actions)
            state = env.get_json_info()
            agent_list[learner_index].set_state(state)
            print(actions)
            print(info)
        if win_str in info.keys():
            for w in info[win_str]:
                wins[w] += 1
        print('Episode {} finished'.format(i_episode))
        env.close()

    print('rates: ', wins / episodes)
    print('Learner win rate: ', wins[learner_index] / episodes)

def run_training(
    opponent,
    mcts_opp,
    game_state_file,
    graph_file,
    model_save_file,
    mcts_iters,
    temp,
    tempsteps,
    lr,
    discount,
    memsize,
    num_episodes,
    num_epochs,
    batch_size,
    train_every,
    save_every,
    graph_every,
    averaging_window,
    opt_eps=1e-8,
    ucb_c=1.5,
    boardsize=8,
    inputs=20,
    render=False,
    verbose=False,
):
    env = PommermanEnvironment(
        render=render,
        num_agents=2,
        game_state_file=game_state_file,
    )
    run_settings = RunSettings(
        num_episodes=num_episodes,
        num_epochs=num_epochs,
        batch_size=batch_size,
        train_every=train_every,
        save_every=save_every,
        graph_every=graph_every,
        averaging_window=averaging_window,
        graph_file=graph_file,
        verbose=verbose,
    )
    agent_settings = AgentSettings(
        optimizer=torch.optim.Adam,
        learning_rate=lr,
        opt_eps=opt_eps,
        epsilon_max=0,
        epsilon_min=0,
        epsilon_duration=0,
        verbose=verbose,
    )
    memory = MCTSMemory(buffer_len=memsize, discount=discount)

    # The MCTS rollout opponent defaults to the same type as the training opponent
    if mcts_opp is None:
        mcts_opp = opponent
    if mcts_opp == 'rand':
        opp = pommerman.agents.RandomAgent()
    elif mcts_opp == 'noop':
        opp = PommermanNoopAgent()
    elif mcts_opp == 'simp':
        opp = pommerman.agents.SimpleAgent()
    else:
        raise Exception('Invalid MCTS opponent type', mcts_opp)

    mcts_model = ActorCriticNet(board_size=boardsize, in_channels=inputs)
    agent1 = MCTSAgent(
        mcts_iters=mcts_iters,
        discount=discount,
        c=ucb_c,
        temp=temp,
        tempsteps=tempsteps,
        agent_id=0,
        opponent=opp,
        model_save_file=model_save_file,
        model=mcts_model,
        settings=agent_settings,
        memory=memory,
    )
    agent1.load()

    if opponent == 'rand':
        agent2 = RandomAgent()
    elif opponent == 'noop':
        agent2 = NoopAgent()
    elif opponent == 'simp':
        agent2 = SimpleAgent()
    else:
        raise Exception('Invalid opponent type', opponent)

    experiment = Experiment([agent1, agent2], env, run_settings)
    experiment.train()

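# Example invocation sketch (not from the original source): every value below is an
# illustrative assumption chosen only to show which arguments run_training() expects;
# the real training script presumably fills these in from the command line.
if __name__ == '__main__':
    run_training(
        opponent='simp',
        mcts_opp=None,            # falls back to the same type as `opponent`
        game_state_file=None,
        graph_file='training_curve.png',
        model_save_file='mcts_model.pt',
        mcts_iters=100,
        temp=1.0,
        tempsteps=20,
        lr=1e-4,
        discount=0.99,
        memsize=50000,
        num_episodes=1000,
        num_epochs=1,
        batch_size=64,
        train_every=10,
        save_every=100,
        graph_every=100,
        averaging_window=50,
    )
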
parser.add_argument("--chkpnt2", "-c2", type=int) parser.add_argument("--num_games", "-n", default=100, type=int) parser.add_argument('--save', '-s', action='store_true') args = parser.parse_args() a_graph = tf.Graph() b_graph = tf.Graph() a_sess = tf.Session(graph=a_graph) b_sess = tf.Session(graph=b_graph) with a_sess.as_default(): with a_graph.as_default(): a_agent = DualAgent(a_sess, args.model_name_1, chkpnt=args.chkpnt1) with b_sess.as_default(): with b_graph.as_default(): b_agent = MCTSAgent(b_sess, args.model_name_2, chkpnt=args.chkpnt2, epsilon=args.epsilon) print("ARENA: DUAL %s-%d VERSES MCTS %s-%d" % (a_agent.model_name, a_agent.chkpnt, b_agent.model_name, b_agent.chkpnt)) stat = np.zeros(shape=(2, 2), dtype=np.int) for i in range(args.num_games): t = time() s = State() a_is_black = (i % 2 == 0) while not s.end and len(s.history) < 225: if a_is_black == (s.player > 0): with a_sess.as_default(): s.move(*a_agent.get_action(s, deterministic=False)) b_agent.update(s) else:
                (i, j), dims=(15, 15))].config(padx=1, pady=1, bg="red")
            else:
                self.recommend()
        return respond

    def create_widgets(self):
        for i in range(15):
            for j in range(15):
                f = tk.Frame(self, height=50, width=50)
                f.pack_propagate(0)
                f.grid(row=i, column=j, padx=0, pady=0)
                self.frames.append(f)
                b = tk.Label(f, image=self.image[0], bg="yellow")
                b.pack(fill=tk.BOTH, expand=1)
                b.bind("<Button-1>", self.click(i, j))
                self.button.append(b)


root = tk.Tk()
root.wm_title("Alpha Gomoku")
root.attributes("-topmost", True)
with tf.Session() as sess:
    agent = MCTSAgent(sess, "treesup")
    # agent = DualAgent(sess, "dualdagger")
    app = Application(agent, root, ensemble=False)
    app.mainloop()

""" battle arena between agents """ import argparse import numpy as np import tensorflow as tf from time import time from state import State from minimax import MinimaxAgent from mcts_agent import MCTSAgent NUM_GAMES = 2 with tf.Session() as sess: mcts = MCTSAgent(sess, "dualsup", chkpnt=3000) agent = MinimaxAgent() print("ARENA: %s-%d VERSES %s-%d" % (mcts.model_name, mcts.chkpnt, "minimax", 0)) stat = np.zeros(shape=(2, 2), dtype=np.int) for i in range(NUM_GAMES): t = time() s = State() a_is_black = (i % 2 == 0) while not s.end and len(s.history) < 225: if a_is_black == (s.player > 0): s.move(*mcts.get_action(s, deterministic=True)) mcts.update(s) else: s.move(*agent.get_action(s)) mcts.update(s) mcts.refresh()
def simulate(user_agent, adversarial_agent, user_plays_first, label, num_simulations=200):
    """Play num_simulations games and report how many the user agent wins."""
    # currentPlayer is either 1 or -1, so index 0 is ignored.
    if user_plays_first:
        agents = [RandomAgent(), user_agent, adversarial_agent]
        user_player = 1
    else:
        agents = [RandomAgent(), adversarial_agent, user_agent]
        user_player = -1
    game = Game()
    victories = 0
    for i in range(num_simulations):
        game.reset()
        if i % 10 == 0:
            print('{} simulations run'.format(i))
        done = False
        while not done:
            next_action = agents[game.currentPlayer].choose_action(game)
            next_state, value, done, info = game.step(next_action)
        # Whoever's turn it is when the game ends has lost.
        if game.currentPlayer == -user_player:
            victories += 1
    print('{}/{} games won by user agent ({})'.format(victories, num_simulations, label))


def main():
    simulate(RandomAgent(), RandomAgent(), True, 'Random v Random')
    simulate(MCTSAgent(timeLimit=1000), RandomAgent(), True, 'MCTS v Random')
    simulate(MCTSAgent(timeLimit=1000), RandomAgent(), False, 'Random v MCTS')
    simulate(MCTSAgent(timeLimit=1000), MCTSAgent(timeLimit=1000), True, 'MCTS v MCTS')

        better = True
        agent.save(iteration)
        print("new model %d saved" % iteration)
    else:
        agent.restore(prev)
        print("old model restored")
    prev_sess.close()


current_graph = tf.Graph()
current_sess = tf.Session(graph=current_graph)
with current_sess.as_default():
    with current_graph.as_default():
        agent = MCTSAgent(current_sess, "treesup", "dualdagger", epsilon=0.2, multiplier=3)
print("Initialization complete")

better = False
for iteration in range(NUM_DAGGER_ITER):
    with current_sess.as_default():
        t = time()
        print("\nDAgger iteration %d" % iteration)
        data = TenaryOnlineData()
        for j in range(NUM_GAMES_PER_DAGGER):
            game = generate_game(save=(j == 0), iter=iteration)
            sys.stdout.write("+")
            sys.stdout.flush()
            data.store(*analyze_game(game))

        a1 = copy.deepcopy(agent1)
        a2 = copy.deepcopy(agent2)
        if i < n_trials / 2:
            winner, count_moves, trial_times = play_wo_human(a1, a2, 1)
        else:
            winner, count_moves, trial_times = play_wo_human(a1, a2, 2)
        if winner:
            results[winner].append(count_moves)
        times[a1.name].append(trial_times[a1.name])
        times[a2.name].append(trial_times[a2.name])

    total_moves = sum(results[a1.name]) + sum(results[a2.name])
    print results
    print "TOTAL GAMES WON BY ", a1.name, ": ", len(results[a1.name])
    if len(results[a1.name]) != 0:
        print "AVERAGE NO. MOVES: ", sum(results[a1.name]) / len(results[a1.name])
        print "AVERAGE TIME PER MOVE: ", sum(times[a1.name]) / total_moves
    print "TOTAL GAMES WON BY ", a2.name, ": ", len(results[a2.name])
    if len(results[a2.name]) != 0:
        print "AVERAGE NO. MOVES: ", sum(results[a2.name]) / len(results[a2.name])
        print "AVERAGE TIME PER MOVE: ", sum(times[a2.name]) / total_moves
    print "################################################################"


if __name__ == "__main__":
    # test_agents(NaiveAgent(), MCTSAgent())
    # test_agents(NaiveAgent(), QLearningAgent())
    # test_agents(MCTSAgent(), MinimaxAgent(depth=3))
    # test_agents(QLearningAgent("q_values"), MinimaxAgent(depth=3))
    test_agents(MCTSAgent(), QLearningAgent("q_values"))