Example #1
def init_agents(sess,
                info_state_size,
                num_actions, 
                dqn_kwargs,
                a2c_kwargs):

    policy_module = PolicyGradient(sess, 0, info_state_size**0.5, num_actions, **a2c_kwargs)
    rollout_module = PolicyGradient(sess, 0, info_state_size**0.5, num_actions, **a2c_kwargs)
    # rollout_module = DQN(sess, 0, info_state_size, num_actions, **dqn_kwargs) 
    sess.run(tf.global_variables_initializer())
    
    policy_module.restore("../used_model/a2c_CNN/602704")
    # rollout_module.restore("../used_model/38000")
    restore_agent_op = tf.group([
        tf.assign(rollout_v, policy_v)
        for (rollout_v, policy_v) in zip(rollout_module.variable_list, policy_module.variable_list)
    ])
    sess.run(restore_agent_op)

    # TODO: load parameters
    agents = [MCTSAgent(policy_module, rollout_module, playout_depth=FLAGS.pd, n_playout=FLAGS.np),
              MCTSAgent(None, None)]

    logging.info("MCTS INIT OK!!")

    return agents
Example #2
def main():
    # agents = [RandomAgent(), MCTSAgent(timeLimit=1000)]
    agents = [
        RandomAgent(),
        MCTSAgent(timeLimit=1000),
        MCTSAgent(timeLimit=1000)
    ]  # current player is either 1 or -1, so index 0 is ignored; the player at index 1 moves first
    game = Game()

    done = False
    while not done:
        next_action = agents[game.currentPlayer].choose_action(game)
        next_state, value, done, info = game.step(next_action)
        game.gameState.get_visual_state()
    # whoever's turn it is when the game loop terminates has lost
    print('Reward: {}'.format(game.currentPlayer * -1))
Example #3
def run(match_num, iteration_limit, mcts_process_num, result_list=None, process_id=None, render=False):
    """
    Run the match for MCTS and three simple agents.
    :param iteration_limit: The maximal iteration of MCTS
    :param match_num: The number of matches
    :param mcts_process_num: The number of processes used in MCTS
    :param result_list: A list to record results
    :param process_id: The process ID given when you do multiprocessing
    :param render: Determine whether to render game
    :return: None
    """
    if mcts_process_num == 1:
        mcts_process_num = None
    agent_list = [
        MCTSAgent([agents.SimpleAgent for _ in range(3)], iteration_limit, process_count=mcts_process_num),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
    ]

    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    for i_episode in range(match_num):
        state = env.reset()
        done = False
        initial_agents = state[0]['alive']
        survivors = initial_agents
        dead_agents = []
        while not done:
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)

            survivors = state[0]['alive']
            for agent in initial_agents:
                if agent not in survivors and agent not in dead_agents:
                    dead_agents.append(agent)

        if process_id is not None:
            print('[Process %d, Episode %d] Dead order: ' % (process_id, i_episode),
                  str(dead_agents), 'Survivors:', survivors)
        else:
            print('[Episode %d] Dead order: ' % i_episode, str(dead_agents), 'Survivors:', survivors)

        if result_list is None:
            result_list = []
        result_list.append((dead_agents, survivors))

    env.close()

    return result_list
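
A minimal sketch of how run() could be driven from several worker processes, based on the result_list / process_id parameters described in its docstring; the match count, iteration limit, and number of workers below are arbitrary assumptions:

import multiprocessing as mp

if __name__ == '__main__':
    manager = mp.Manager()
    results = manager.list()  # shared list every worker can append to
    workers = [
        mp.Process(target=run,
                   args=(25, 1000, 1),  # match_num, iteration_limit, mcts_process_num
                   kwargs={'result_list': results, 'process_id': pid})
        for pid in range(4)
    ]
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    print('Total matches recorded:', len(results))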
Example #4
    def train(self):
        red_agent = MCTSAgent(num_simulations=50)
        yellow_agent = MCTSAgent(num_simulations=50)
        trace = []
        while True:
            cur_state = yellow_agent.state.bitPack()
            turn = yellow_agent.state.turn
            if turn == Circle.RED:
                move = red_agent.play_move()
                if move is None:
                    return
                yellow_agent.play_opponent_move(move)
            else:
                move = yellow_agent.play_move()
                if move is None:
                    return
                red_agent.play_opponent_move(move)
            trace.append((cur_state, move))

            winner = yellow_agent.state.check_winner()
            if winner is not None:
                self.lambda_learn(trace, winner, red_agent, yellow_agent)
                yellow_agent.state.printBoard()
                break
Example #5
    def get_batched_greedy_actions(self, batched_env_states, env):
        self.eval()
        agent_id = env.training_agent
        # Preserve existing state of env
        saved_state = env.get_json_info()

        batched_greedy_actions = []
        for env_states in batched_env_states:
            greedy_actions = np.empty(len(env_states), dtype=int)
            for i, env_state in enumerate(env_states):
                next_states = np.empty((self.num_actions, self.in_channels,
                                        self.board_size, self.board_size))

                MCTSAgent.set_state(env, env_state)
                obs = env.get_observations()
                actions = env.act(obs)
                actions.insert(agent_id, None)

                terminal = np.full(self.num_actions, False)
                terminal_rewards = np.empty(self.num_actions)
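                # Try every action for the training agent while the other agents
                # replay the actions computed above, then score each successor
                # state with the value head (or the terminal reward when done).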
                for a in range(self.num_actions):
                    actions[env.training_agent] = a
                    obs, rewards, done, _ = env.step(actions)
                    if done:
                        terminal[a] = True
                        terminal_rewards[a] = rewards[agent_id]
                    state, _ = MCTSAgent.state_space_converter(obs[agent_id])
                    next_states[a] = state

                    MCTSAgent.set_state(env, env_state)

                if terminal.all():
                    vals = terminal_rewards
                else:
                    _, vals = self(next_states)
                    vals = vals.detach().numpy()[:, 0]
                    # Replace vals with reward if terminal
                    vals[terminal] = terminal_rewards[terminal]
                greedy_actions[i] = np.argmax(vals)
            batched_greedy_actions.append(greedy_actions)

        # Restore existing state from before training
        MCTSAgent.set_state(env, saved_state)
        return batched_greedy_actions
Example #6
def arena():
    global better
    prev_graph = tf.Graph()
    prev_sess = tf.Session(graph=prev_graph)
    if not better:
        prev = "treesup"
    else:
        prev = "dualdagger"
    with prev_sess.as_default():
        with prev_graph.as_default():
            prev_agent = MCTSAgent(prev_sess, prev)

    stat = np.zeros(shape=(2, 2), dtype=np.int)
    for i in range(NUM_TEST_GAMES):
        agent.refresh()
        prev_agent.refresh()
        s = State()
        prev_is_black = (i % 2 == 0)
        while not s.end and len(s.history) < 225:
            if prev_is_black == (s.player > 0):
                with prev_sess.as_default():
                    s.move(*get_random_mcts_action(prev_agent, s))
            else:
                with current_sess.as_default():
                    s.move(*get_random_mcts_action(agent, s))
            agent.update(s)
            prev_agent.update(s)
        if len(s.history) == 225:
            print("UNBELIEVABLE EVEN!")
        sys.stdout.write("x")
        sys.stdout.flush()
        stat[int(prev_is_black), int(s.player > 0)] += 1
    win_rate = (stat[0, 1] + stat[1, 0]) / stat.sum()
    print("\nwin_rate is %.02f" % win_rate)
    if win_rate > .5:
        better = True
        agent.save(iteration)
        print("new model %d saved" % iteration)
    else:
        agent.restore(prev)
        print("old model restored")
    prev_sess.close()
Example #7
parser.add_argument("--multiplier2", "-m2", type=float)
parser.add_argument("--chkpnt1", "-c1", type=int)
parser.add_argument("--chkpnt2", "-c2", type=int)
parser.add_argument("--num_games", "-n", default=100, type=int)
parser.add_argument('--save', '-s', action='store_true')
args = parser.parse_args()

a_graph = tf.Graph()
b_graph = tf.Graph()
a_sess = tf.Session(graph=a_graph)
b_sess = tf.Session(graph=b_graph)
with a_sess.as_default():
    with a_graph.as_default():
        a_agent = MCTSAgent(a_sess,
                            args.model_name_1,
                            chkpnt=args.chkpnt1,
                            epsilon=args.epsilon1,
                            multiplier=args.multiplier1)
with b_sess.as_default():
    with b_graph.as_default():
        b_agent = MCTSAgent(b_sess,
                            args.model_name_2,
                            chkpnt=args.chkpnt2,
                            epsilon=args.epsilon2,
                            multiplier=args.multiplier2)
stdout.write(
    "A(e=%.01f, m=%.01f) vs B(e=%.01f, m=%.01f) = " %
    (args.epsilon1, args.multiplier1, args.epsilon2, args.multiplier2))
stdout.flush()

stat = np.zeros(shape=(2, 2), dtype=np.int)
Example #8
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    episodes = 2
    wins = np.zeros(4)  # per-agent win counts, accumulated across episodes
    win_str = 'winners'
    # Create a set of agents (exactly four)
    for i_episode in range(episodes):
        agent_list = [
            agents.SimpleAgent(),
            agents.SimpleAgent(),
            agents.SimpleAgent(),
            agents.SimpleAgent()
            # agents.DockerAgent("pommerman/simple-agent", port=12345),
        ]
        
        
        learner_index = randint(0, 3)
        print(learner_index)
        agent_list[learner_index] = MCTSAgent()  # ExtractedStateAgent('extract', 0, 0, 0)
        # agent_list[learner_index] = RandomForestAgent()
        # agent_list[learner_index] = SnorkelAgent()
        agent_list[learner_index].set_agent_id(learner_index)
        # agent_list[learner_index].epsilon = 0
        # Make the "Free-For-All" environment using the agent list
        env = pommerman.make('PommeFFACompetition-v0', agent_list)
#        env.set_training_agent(learner_index)

        # Run the episodes just like OpenAI Gym
        obs = env.reset()
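        # Hand the MCTS agent the full json game state (not just its own
        # observation), presumably so its tree search can start from the true state.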
        state = env.get_json_info()
        agent_list[learner_index].set_state(state)
        done = False
        steps = 0
        while not done and steps < 500:
            steps += 1
            # env.render()
            #actions = env.act(state)
            actions = env.act(obs)
#            action = agent_list[learner_index].search(state)
#            actions.insert(learner_index, action)
            obs, step_reward, done, info = env.step(actions)
            state = env.get_json_info()
            agent_list[learner_index].set_state(state)
            print(actions)

        print(info)
        if win_str in info.keys():
            for w in info[win_str]:
                wins[w] += 1
        print('Episode {} finished'.format(i_episode))
        env.close()
    print('rates: ', wins / episodes)
    print('Learner win rate: ', wins[learner_index] / episodes)
Example #9
def run_training(
    opponent,
    mcts_opp,
    game_state_file,
    graph_file,
    model_save_file,
    mcts_iters,
    temp,
    tempsteps,
    lr,
    discount,
    memsize,
    num_episodes,
    num_epochs,
    batch_size,
    train_every,
    save_every,
    graph_every,
    averaging_window,
    opt_eps=1e-8,
    ucb_c=1.5,
    boardsize=8,
    inputs=20,
    render=False,
    verbose=False,
):
    env = PommermanEnvironment(
        render=render,
        num_agents=2,
        game_state_file=game_state_file,
    )

    run_settings = RunSettings(
        num_episodes=num_episodes,
        num_epochs=num_epochs,
        batch_size=batch_size,
        train_every=train_every,
        save_every=save_every,
        graph_every=graph_every,
        averaging_window=averaging_window,
        graph_file=graph_file,
        verbose=verbose,
    )

    agent_settings = AgentSettings(
        optimizer=torch.optim.Adam,
        learning_rate=lr,
        opt_eps=opt_eps,
        epsilon_max=0,
        epsilon_min=0,
        epsilon_duration=0,
        verbose=verbose,
    )

    memory = MCTSMemory(buffer_len=memsize, discount=discount)

    if mcts_opp is None:
        mcts_opp = opponent
    if mcts_opp == 'rand':
        opp = pommerman.agents.RandomAgent()
    elif mcts_opp == 'noop':
        opp = PommermanNoopAgent()
    elif mcts_opp == 'simp':
        opp = pommerman.agents.SimpleAgent()
    else:
        raise Exception('Invalid MCTS opponent type', mcts_opp)

    mcts_model = ActorCriticNet(board_size=boardsize, in_channels=inputs)
    agent1 = MCTSAgent(
        mcts_iters=mcts_iters,
        discount=discount,
        c=ucb_c,
        temp=temp,
        tempsteps=tempsteps,
        agent_id=0,
        opponent=opp,
        model_save_file=model_save_file,
        model=mcts_model,
        settings=agent_settings,
        memory=memory,
    )
    agent1.load()

    if opponent == 'rand':
        agent2 = RandomAgent()
    elif opponent == 'noop':
        agent2 = NoopAgent()
    elif opponent == 'simp':
        agent2 = SimpleAgent()
    else:
        raise Exception('Invalid opponent type', opponent)

    experiment = Experiment([agent1, agent2], env, run_settings)
    experiment.train()
Example #10
parser.add_argument("--chkpnt2", "-c2", type=int)
parser.add_argument("--num_games", "-n", default=100, type=int)
parser.add_argument('--save', '-s', action='store_true')
args = parser.parse_args()

a_graph = tf.Graph()
b_graph = tf.Graph()
a_sess = tf.Session(graph=a_graph)
b_sess = tf.Session(graph=b_graph)
with a_sess.as_default():
    with a_graph.as_default():
        a_agent = DualAgent(a_sess, args.model_name_1, chkpnt=args.chkpnt1)
with b_sess.as_default():
    with b_graph.as_default():
        b_agent = MCTSAgent(b_sess,
                            args.model_name_2,
                            chkpnt=args.chkpnt2,
                            epsilon=args.epsilon)
print("ARENA: DUAL %s-%d VERSES MCTS %s-%d" %
      (a_agent.model_name, a_agent.chkpnt, b_agent.model_name, b_agent.chkpnt))

stat = np.zeros(shape=(2, 2), dtype=np.int)
for i in range(args.num_games):
    t = time()
    s = State()
    a_is_black = (i % 2 == 0)
    while not s.end and len(s.history) < 225:
        if a_is_black == (s.player > 0):
            with a_sess.as_default():
                s.move(*a_agent.get_action(s, deterministic=False))
                b_agent.update(s)
        else:
Example #11
                            (i, j), dims=(15, 15))].config(padx=1,
                                                           pady=1,
                                                           bg="red")
                else:
                    self.recommend()

        return respond

    def create_widgets(self):
        for i in range(15):
            for j in range(15):
                f = tk.Frame(self, height=50, width=50)
                f.pack_propagate(0)
                f.grid(row=i, column=j, padx=0, pady=0)
                self.frames.append(f)
                b = tk.Label(f, image=self.image[0], bg="yellow")
                b.pack(fill=tk.BOTH, expand=1)
                b.bind("<Button-1>", self.click(i, j))
                self.button.append(b)


root = tk.Tk()
root.wm_title("Alpha Gomoku")
root.attributes("-topmost", True)

with tf.Session() as sess:
    agent = MCTSAgent(sess, "treesup")
    # agent = DualAgent(sess, "dualdagger")
    app = Application(agent, root, ensemble=False)
    app.mainloop()
Example #12
""" battle arena between agents """

import argparse
import numpy as np
import tensorflow as tf
from time import time
from state import State
from minimax import MinimaxAgent
from mcts_agent import MCTSAgent

NUM_GAMES = 2

with tf.Session() as sess:
    mcts = MCTSAgent(sess, "dualsup", chkpnt=3000)
    agent = MinimaxAgent()
    print("ARENA: %s-%d VERSES %s-%d" %
          (mcts.model_name, mcts.chkpnt, "minimax", 0))

    stat = np.zeros(shape=(2, 2), dtype=np.int)
    for i in range(NUM_GAMES):
        t = time()
        s = State()
        a_is_black = (i % 2 == 0)
        while not s.end and len(s.history) < 225:
            if a_is_black == (s.player > 0):
                s.move(*mcts.get_action(s, deterministic=True))
                mcts.update(s)
            else:
                s.move(*agent.get_action(s))
                mcts.update(s)
        mcts.refresh()
Example #13
def main():
    user_agent = RandomAgent()
    adversarial_agent = RandomAgent()
    agents = [RandomAgent(), user_agent, adversarial_agent] #curr player is either 1 or -1, so index 0 is ignored
    game = Game()

    num_simulations = 200
    victories = 0

    for i in range(num_simulations):
        game.reset()
        if i%10 == 0:
            print('{} simulations run'.format(i))
        done = False
        while not done:
            next_action = agents[game.currentPlayer].choose_action(game)
            next_state, value, done, info = game.step(next_action)
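        # The player on move when the loop terminates has lost (see the reward
        # comment in Example #2), so currentPlayer == -1 means the user agent at
        # index 1 won this game.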
        if game.currentPlayer == -1:
            victories += 1

    print('{}/{} games won by user agent (Random v Random)'.format(victories, num_simulations))

    user_agent = MCTSAgent(timeLimit=1000)
    # user_agent = RandomAgent()
    adversarial_agent = RandomAgent()
    # agents = [user_agent, adversarial_agent]
    agents = [RandomAgent(), user_agent, adversarial_agent] #curr player is either 1 or -1, so index 0 is ignored
    game = Game()

    num_simulations = 200
    victories = 0

    for i in range(num_simulations):
        game.reset()
        if i%10 == 0:
            print('{} simulations run'.format(i))
        done = False
        while not done:
            next_action = agents[game.currentPlayer].choose_action(game)
            next_state, value, done, info = game.step(next_action)
        if game.currentPlayer == -1:
            victories += 1

    print('{}/{} games won by user agent (MCTS v Random)'.format(victories, num_simulations))

    user_agent = MCTSAgent(timeLimit=1000)
    # user_agent = RandomAgent()
    adversarial_agent = RandomAgent()
    # agents = [user_agent, adversarial_agent]
    agents = [RandomAgent(), adversarial_agent, user_agent] #curr player is either 1 or -1, so index 0 is ignored
    game = Game()

    num_simulations = 200
    victories = 0

    for i in range(num_simulations):
        game.reset()
        if i%10 == 0:
            print('{} simulations run'.format(i))
        done = False
        while not done:
            next_action = agents[game.currentPlayer].choose_action(game)
            next_state, value, done, info = game.step(next_action)
        if game.currentPlayer == 1:
            victories += 1

    print('{}/{} games won by user agent (Random v MCTS)'.format(victories, num_simulations))

    user_agent = MCTSAgent(timeLimit=1000)
    # user_agent = RandomAgent()
    adversarial_agent = MCTSAgent(timeLimit=1000)
    # agents = [user_agent, adversarial_agent]
    agents = [RandomAgent(), user_agent, adversarial_agent] #curr player is either 1 or -1, so index 0 is ignored
    game = Game()

    num_simulations = 200
    victories = 0

    for i in range(num_simulations):
        game.reset()
        if i%10 == 0:
            print('{} simulations run'.format(i))
        done = False
        while not done:
            next_action = agents[game.currentPlayer].choose_action(game)
            next_state, value, done, info = game.step(next_action)
        if game.currentPlayer == -1:
            victories += 1

    print('{}/{} games won by user agent (MCTS v MCTS)'.format(victories, num_simulations))
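
The four loops above differ only in the agent line-up and in which sign of game.currentPlayer counts as a win for the user agent; a hypothetical helper such as run_series below (a sketch assuming the same Game / agent API) would run the same experiments without the duplication:

def run_series(label, agents, winning_sign, num_simulations=200):
    # agents: index 0 is ignored, index 1 plays first, index 2 (player -1) plays second;
    # winning_sign is the value of game.currentPlayer at termination that means
    # the user agent won (the player on move at the end has lost).
    game = Game()
    victories = 0
    for i in range(num_simulations):
        game.reset()
        if i % 10 == 0:
            print('{} simulations run'.format(i))
        done = False
        while not done:
            next_action = agents[game.currentPlayer].choose_action(game)
            next_state, value, done, info = game.step(next_action)
        if game.currentPlayer == winning_sign:
            victories += 1
    print('{}/{} games won by user agent ({})'.format(victories, num_simulations, label))


run_series('Random v Random', [RandomAgent(), RandomAgent(), RandomAgent()], -1)
run_series('MCTS v Random', [RandomAgent(), MCTSAgent(timeLimit=1000), RandomAgent()], -1)
run_series('Random v MCTS', [RandomAgent(), RandomAgent(), MCTSAgent(timeLimit=1000)], 1)
run_series('MCTS v MCTS', [RandomAgent(), MCTSAgent(timeLimit=1000), MCTSAgent(timeLimit=1000)], -1)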
Example #14
        better = True
        agent.save(iteration)
        print("new model %d saved" % iteration)
    else:
        agent.restore(prev)
        print("old model restored")
    prev_sess.close()


current_graph = tf.Graph()
current_sess = tf.Session(graph=current_graph)
with current_sess.as_default():
    with current_graph.as_default():
        agent = MCTSAgent(current_sess,
                          "treesup",
                          "dualdagger",
                          epsilon=0.2,
                          multiplier=3)
        print("Initialization complete")

better = False
for iteration in range(NUM_DAGGER_ITER):
    with current_sess.as_default():
        t = time()
        print("\nDAgger iteration %d" % iteration)
        data = TenaryOnlineData()
        for j in range(NUM_GAMES_PER_DAGGER):
            game = generate_game(save=(j == 0), iter=iteration)
            sys.stdout.write("+")
            sys.stdout.flush()
            data.store(*analyze_game(game))
Example #15
		a1 = copy.deepcopy(agent1)
		a2 = copy.deepcopy(agent2)
		if i < n_trials/2:
			winner, count_moves, trial_times = play_wo_human(a1, a2, 1)
		else:
			winner, count_moves, trial_times = play_wo_human(a1, a2, 2)
		if winner:
			results[winner].append(count_moves)
			times[a1.name].append(trial_times[a1.name])
			times[a2.name].append(trial_times[a2.name])

	total_moves = sum(results[a1.name]) + sum(results[a2.name])
	print results
	print "TOTAL GAMES WON BY ", a1.name, ": ", len(results[a1.name])
	if len(results[a1.name]) != 0:
		print "AVERAGE NO. MOVES: ", sum(results[a1.name]) / len(results[a1.name])
	print "AVERAGE TIME PER MOVE: ", sum(times[a1.name]) / total_moves
	print "TOTAL GAMES WON BY ", a2.name, ": ", len(results[a2.name])
	if len(results[a2.name]) != 0:
		print "AVERAGE NO. MOVES: ", sum(results[a2.name])/ len(results[a2.name])
	print "AVERAGE TIME PER MOVE: ", sum(times[a2.name]) / total_moves
	print "################################################################"


if __name__ == "__main__":
	# test_agents(NaiveAgent(), MCTSAgent())
	# test_agents(NaiveAgent(), QLearningAgent())
	# test_agents(MCTSAgent(), MinimaxAgent(depth=3))
	# test_agents(QLearningAgent("q_values"), MinimaxAgent(depth=3))
	test_agents(MCTSAgent(), QLearningAgent("q_values"))