def main(unused_argv):
    begin = time.time()
    env = Go()
    agents = [agent.Random_Rollout_MCTS_Agent(n_playout=100), agent.RandomAgent(1)]
    ret = []

    for ep in range(NUM_EPISODES):
        time_step = env.reset()
        print('start ep: %d' % ep)
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == 0:
                agent_output = agents[player_id].step(time_step, env)
            else:
                agent_output = agents[player_id].step(time_step)
            action_list = agent_output.action
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        agents[0].step(time_step, env)
        agents[1].step(time_step)
        ret.append(time_step.rewards[0])
        print('end')
    print(np.mean(ret))
    # print(ret)

    print('Time elapsed:', time.time()-begin)
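All of these examples drive the game through the same loop: read time_step.observations["current_player"], call that agent's step(), and feed the returned .action back into env.step(). A minimal sketch of the agent interface this implies is shown below; anything beyond the fields actually used above (notably the number of actions) is an assumption, not the real agent.RandomAgent.

import collections
import numpy as np

StepOutput = collections.namedtuple("StepOutput", ["action"])

class RandomAgentSketch:
    """Uniform-random agent matching how agents[1].step(time_step) is used above."""

    def __init__(self, player_id, num_actions=82):  # 82 = 9x9 points + pass (assumed)
        self.player_id = player_id
        self.num_actions = num_actions

    def step(self, time_step):
        if time_step.last():  # terminal step: nothing to act on
            return None
        return StepOutput(action=np.random.randint(self.num_actions))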
Example #2
def init_agents(sess, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs):
    agents = [
        DQN(sess, 0, info_state_size, num_actions, hidden_layers_sizes,
            **kwargs),
        agent.RandomAgent(1)
    ]
    sess.run(tf.global_variables_initializer())

    return agents
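A hedged usage sketch for init_agents, reusing the Go environment and flag handling seen in the other examples; the keyword arguments forwarded to DQN are taken from those examples' kwargs dicts, not from the DQN definition itself.

env = Go()
with tf.Session() as sess:
    agents = init_agents(sess,
                         env.state_size,
                         env.action_size,
                         [int(l) for l in FLAGS.hidden_layers_sizes],
                         replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                         learning_rate=1e-3,
                         batch_size=128)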
Example #3
def init_agents(sess, info_state_size, num_actions, cnn_parameters,
                hidden_layers_sizes, **kwargs):

    # int(...) turns the flattened state size into the board side length expected
    # by the CNN input.
    if use_dqn():
        algorithm = DQN(sess, 0, int(info_state_size**0.5), num_actions,
                        cnn_parameters, hidden_layers_sizes, **kwargs)
    else:
        algorithm = PolicyGradient(sess, 0, int(info_state_size**0.5),
                                   num_actions, cnn_parameters,
                                   hidden_layers_sizes, **kwargs)

    agents = [algorithm, agent.RandomAgent(1)]
    sess.run(tf.global_variables_initializer())

    return agents
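use_dqn() is not defined in this snippet; a plausible stand-in is a flag switch, sketched below (the algorithm flag name is hypothetical).

def use_dqn():
    # Hypothetical flag; the real selector may be a constant or a different flag.
    return FLAGS.algorithm == "dqn"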
Example #4
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size
    agentR = agent.RandomAgent(0)
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "epsilon_decay_duration": int(0.6 * FLAGS.num_train_episodes),
        "epsilon_start": 0.8,
        "epsilon_end": 0.001,
        "learning_rate": 1e-3,
        "learn_every": FLAGS.learn_every,
        "batch_size": 128,
        "max_global_gradient_norm": 10,
    }

    ret = [0]
    max_len = 2000
    with tf.Session() as sess:
        dqn = DQN(sess, 0, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs)
        dqn.restore("saved_model/10000")
        for ep in range(10):
            print("start mcts train ep" + str(ep))
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:  # use MCTS for player 0
                    root = Node(None, env, time_step, None, 0, 0)
                    mcts = MCTS(root,
                                dqn,
                                random_value_net,
                                random_rollout_net,
                                env,
                                time_limit=5)
                    action_list = mcts.start()
                else:
                    action_list = agentR.step(time_step).action  # get the random agent's action
                #print(action_list,player_id)
                time_step = env.step(action_list)
            print(time_step.rewards)
    print('Time elapsed:', time.time() - begin)
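The FLAGS used throughout these examples are defined outside the snippets. A plausible reconstruction with absl.flags is sketched below; the flag names come from the usages above, while the default values are guesses.

from absl import flags

flags.DEFINE_integer("num_train_episodes", 10000, "Number of training episodes.")
flags.DEFINE_integer("num_eval", 100, "Number of evaluation episodes.")
flags.DEFINE_integer("eval_every", 1000, "Log progress every n episodes.")
flags.DEFINE_integer("save_every", 1000, "Save a checkpoint every n episodes.")
flags.DEFINE_integer("learn_every", 64, "Run a learning step every n agent steps.")
flags.DEFINE_integer("replay_buffer_capacity", 100000, "Replay buffer size.")
flags.DEFINE_list("hidden_layers_sizes", "128,128", "Sizes of the MLP hidden layers.")
flags.DEFINE_list("output_channels", "2,4,8,16,32", "CNN output channels per layer.")
FLAGS = flags.FLAGS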
Example #5
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "pi_learning_rate": 1e-2,
        "critic_learning_rate": 1e-1,
        "batch_size": 128,
        "entropy_cost": 0.5,
        "max_global_gradient_norm": 20,
    }
    import agent.agent as agent
    ret = [0]
    max_len = 2000

    with tf.Session() as sess:
        # agents = [DQN(sess, _idx, info_state_size,
        #                   num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)]
        agents = [PolicyGradient(sess, 0, info_state_size,
                          num_actions, hidden_layers_sizes, **kwargs), agent.RandomAgent(1)]
        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = agents[0].loss
                logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(ep+1, losses, np.mean(ret)))
                with open('log_pg_{}'.format(os.environ.get('BOARD_SIZE')), 'a+') as log_file:
                    log_file.writelines("{}, {}\n".format(ep+1, np.mean(ret)))
            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            for agent in agents:
                agent.step(time_step)
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]

        ret = []
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step, is_evaluation=True)
                else:
                    agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            # for agent in agents:
            agents[0].step(time_step, is_evaluation=True)
            agents[1].step(time_step)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))

    print('Time elapsed:', time.time()-begin)
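The hand-rolled rolling window over episode rewards (ret together with max_len) can also be written with collections.deque; a small self-contained sketch, not part of the original code:

import numpy as np
from collections import deque

recent_rewards = deque(maxlen=2000)   # oldest rewards are dropped automatically
for reward in (1.0, -1.0, 1.0):       # stand-ins for time_step.rewards[0]
    recent_rewards.append(reward)
print(np.mean(recent_rewards))        # plays the role of np.mean(ret) above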
Example #6
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "epsilon_decay_duration": int(0.6 * FLAGS.num_train_episodes),
        "epsilon_start": 0.8,
        "epsilon_end": 0.001,
        "learning_rate": 1e-3,
        "learn_every": FLAGS.learn_every,
        "batch_size": 128,
        "max_global_gradient_norm": 10,
    }
    import agent.agent as agent
    ret = [0]
    max_len = 2000

    with tf.Session() as sess:
        # agents = [DQN(sess, _idx, info_state_size,
        #                   num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)]  # for self play
        agents = [
            agent.RandomAgent(1),
            DQN(sess, 1, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs)
        ]
        sess.run(tf.global_variables_initializer())
        # train the agent
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.save_every == 0:
                if not os.path.exists("saved_model/random_vs_dqn"):
                    os.mkdir('saved_model/random_vs_dqn')
                agents[1].save(checkpoint_root='saved_model/random_vs_dqn',
                               checkpoint_name='random_vs_dqn_{}'.format(ep +
                                                                         1))
                print('saved %d' % (ep + 1))
            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                # print(action_list)
                time_step = env.step(action_list)
            for agent in agents:
                agent.step(time_step)
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]

        # evaluated the trained agent
        agents[1].restore("saved_model/random_vs_dqn/random_vs_dqn_10000")
        ret = []
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step)
                else:
                    agent_output = agents[player_id].step(
                        time_step,
                        is_evaluation=True,
                        add_transition_record=False)
                action_list = agent_output.action
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            # for agent in agents:
            agents[0].step(time_step)
            agents[1].step(time_step,
                           is_evaluation=True,
                           add_transition_record=False)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))
        # print(ret)

    print('Time elapsed:', time.time() - begin)
Example #7
def main(unused_argv):
    begin = time.time()
    env = Go()
    ret = [0]
    max_len = 2000  # rolling-window size, matching the other examples

    policy_function = [
        'saved_model/dqn_vs_random/10000',
        'saved_model/random_vs_dqn/random_vs_dqn_10000'
    ]
    value_function = 'saved_model/dqn_vs_random/10000'

    agents = [
        agent.Net_MCTS_Agent(value_function, policy_function, n_playout=50),
        agent.RandomAgent(1)
    ]

    for ep in range(NUM_TRAIN):
        if (ep + 1) % NUM_SAVE_EVERY == 0:
            if not os.path.exists("saved_model/net_mcts_vs_random"):
                os.mkdir('saved_model/net_mcts_vs_random')
            agents[0].mcts._policy_fn[0].save(
                checkpoint_root='saved_model/net_mcts_vs_random',
                checkpoint_name='_policy_fn_0_{}'.format(ep + 1))
            agents[0].mcts._policy_fn[1].save(
                checkpoint_root='saved_model/net_mcts_vs_random',
                checkpoint_name='_policy_fn_1_{}'.format(ep + 1))
            agents[0].mcts._value_fn.save(
                checkpoint_root='saved_model/net_mcts_vs_random',
                checkpoint_name='_value_fn_{}'.format(ep + 1))

        time_step = env.reset()  # a new env
        print('start ep: %d' % ep)
        while not time_step.last():  # play until the game is over
            player_id = time_step.observations["current_player"]
            if player_id == 0:
                agent_output = agents[player_id].step(time_step, env)
            else:
                agent_output = agents[player_id].step(time_step)
            action_list = agent_output.action
            time_step = env.step(action_list)
        print('end')

        agents[0].step(time_step, env)
        agents[1].step(time_step)
        if len(ret) < max_len:
            ret.append(time_step.rewards[0])
        else:
            ret[ep % max_len] = time_step.rewards[0]

    # evaluated the trained mcts agent
    ret = []
    for ep in range(NUM_EVAL):
        print('eval ep: %d' % ep)
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == 0:
                agent_output = agents[player_id].step(time_step, env)
            else:
                agent_output = agents[player_id].step(time_step)
            action_list = agent_output.action
            time_step = env.step(action_list)
        # Episode is over, step all agents with final info state.
        agents[0].step(time_step, env)
        agents[1].step(time_step)
        ret.append(time_step.rewards[0])
    print(np.mean(ret))
    print(ret)

    print('Time elapsed:', time.time() - begin)
Example #8
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size

    num_cnn_layer = len(FLAGS.output_channels)
    kernel_shapes = [3 for _ in range(num_cnn_layer)]
    strides = [1 for _ in range(num_cnn_layer)]
    paddings = ["SAME" for _ in range(num_cnn_layer - 1)]
    paddings.append("VALID")

    cnn_parameters = [FLAGS.output_channels, kernel_shapes, strides, paddings]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "pi_learning_rate": 3e-4,
        "critic_learning_rate": 1e-3,
        "batch_size": 128,
        "entropy_cost": 0.5,
        "max_global_gradient_norm": 20,
    }
    import agent.agent as agent
    ret = [0]
    max_len = 2000

    with tf.Session() as sess:
        # agents = [DQN(sess, _idx, info_state_size,
        #                   num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)]
        agents = [
            PolicyGradient(sess, 0, int(info_state_size**0.5), num_actions,
                           cnn_parameters, hidden_layers_sizes, **kwargs),
            agent.RandomAgent(1)
        ]
        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = agents[0].loss
                logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(
                    ep + 1, losses, np.mean(ret)))
                with open('log_pg_{}'.format(os.environ.get('BOARD_SIZE')),
                          'a+') as log_file:
                    log_file.writelines("{}, {}\n".format(
                        ep + 1, np.mean(ret)))
            if (ep + 1) % FLAGS.save_every == 0:
                agents[0].save(checkpoint_root='saved_model',
                               checkpoint_name='{}'.format(ep + 1))
            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                time_step = env.step(action_list)
            for agent in agents:
                agent.step(time_step)
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]

        ret = []
        agents[0].restore("saved_model/10000")
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step,
                                                          is_evaluation=True)
                    print(agents[0].policy_fn(time_step, player_id))
                else:
                    agent_output = agents[player_id].step(time_step)

                action_list = agent_output.action
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            # for agent in agents:
            agents[0].step(time_step, is_evaluation=True)
            agents[1].step(time_step)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))

    print('Time elapsed:', time.time() - begin)
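In this last example, cnn_parameters bundles output_channels, kernel_shapes, strides and paddings into a single list. A minimal sketch of how a constructor could unpack it into a TF1 convolution stack follows; this is an assumption about the internals, not the actual PolicyGradient/DQN code.

def build_conv_stack(board_input, cnn_parameters):
    """board_input: [batch, side, side, planes] float32 tensor (assumed layout)."""
    output_channels, kernel_shapes, strides, paddings = cnn_parameters
    net = board_input
    for channels, kernel, stride, padding in zip(output_channels, kernel_shapes,
                                                 strides, paddings):
        # output_channels may arrive as strings from a list flag, hence int().
        net = tf.layers.conv2d(net, filters=int(channels), kernel_size=kernel,
                               strides=stride, padding=padding,
                               activation=tf.nn.relu)
    return tf.layers.flatten(net)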