Example #1
def _init_bot(bot_type, game, player_id):
  """Initializes a bot by type."""
  rng = np.random.RandomState(FLAGS.seed)
  if bot_type == "mcts":
    evaluator = mcts.RandomRolloutEvaluator(FLAGS.rollout_count, rng)
    return mcts.MCTSBot(
        game,
        FLAGS.uct_c,
        FLAGS.max_simulations,
        evaluator,
        random_state=rng,
        solve=FLAGS.solve,
        verbose=FLAGS.verbose)
  if bot_type == "az":
    model = az_model.Model.from_checkpoint(FLAGS.az_path)
    evaluator = az_evaluator.AlphaZeroEvaluator(game, model)
    return mcts.MCTSBot(
        game,
        FLAGS.uct_c,
        FLAGS.max_simulations,
        evaluator,
        random_state=rng,
        child_selection_fn=mcts.SearchNode.puct_value,
        solve=FLAGS.solve,
        verbose=FLAGS.verbose)
  if bot_type == "random":
    return uniform_random.UniformRandomBot(player_id, rng)
  if bot_type == "human":
    return human.HumanBot()
  if bot_type == "gtp":
    bot = gtp.GTPBot(game, FLAGS.gtp_path)
    for cmd in FLAGS.gtp_cmd:
      bot.gtp_cmd(cmd)
    return bot
  raise ValueError("Invalid bot type: %s" % bot_type)
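The helper above only constructs a bot. As a rough, self-contained sketch (not part of the original example), here is how such a bot can drive a full game; the explicit constants stand in for the FLAGS values, and a single MCTS bot plays both sides purely to keep it short.

import numpy as np
import pyspiel
from open_spiel.python.algorithms import mcts

game = pyspiel.load_game("tic_tac_toe")
rng = np.random.RandomState(0)
evaluator = mcts.RandomRolloutEvaluator(n_rollouts=1, random_state=rng)
bot = mcts.MCTSBot(game, uct_c=2, max_simulations=100, evaluator=evaluator,
                   random_state=rng, solve=True, verbose=False)

state = game.new_initial_state()
while not state.is_terminal():
  state.apply_action(bot.step(state))  # one bot plays both sides in this sketch
print("Returns:", state.returns())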
Example #2
  def test_works_with_mcts(self):
    game = pyspiel.load_game("tic_tac_toe")
    model = build_model(game)
    evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    bot = mcts.MCTSBot(
        game, 1., 20, evaluator, solve=False, dirichlet_noise=(0.25, 1.))
    root = bot.mcts_search(game.new_initial_state())
    self.assertEqual(root.explore_count, 20)
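    # A hedged continuation (not in the original test): mcts_search returns a
    # SearchNode, so the move the search would actually play can be read from
    # its most-visited child via best_child().
    best_action = root.best_child().action
    self.assertIn(best_action, game.new_initial_state().legal_actions())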
Example #3
def main(_):
  game = pyspiel.load_game("tic_tac_toe")

  # 1. Define a model
  model = model_lib.Model(
      FLAGS.nn_model, game.observation_tensor_shape(),
      game.num_distinct_actions(), nn_width=FLAGS.nn_width,
      nn_depth=FLAGS.nn_depth, weight_decay=1e-4, learning_rate=0.01, path=None)
  print("Model type: {}({}, {}), size: {} variables".format(
      FLAGS.nn_model, FLAGS.nn_width, FLAGS.nn_depth,
      model.num_trainable_variables))

  # 2. Create an MCTS bot using the model
  evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
  bot = mcts.MCTSBot(game,
                     1.,
                     20,
                     evaluator,
                     solve=False,
                     dirichlet_noise=(0.25, 1.))

  # 3. Build an AlphaZero instance
  a0 = alpha_zero.AlphaZero(game,
                            bot,
                            model,
                            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                            action_selection_transition=4)

  # 4. Create a bot using min-max search. It can never lose tic-tac-toe, so
  # a success condition for our AlphaZero bot is to draw all games with it.
  minimax_bot = MinimaxBot(game)

  # 5. Run training loop
  for num_round in range(FLAGS.num_rounds):
    logging.info("------------- Starting round %s out of %s -------------",
                 num_round, FLAGS.num_rounds)

    if num_round % FLAGS.evaluation_frequency == 0:
      num_evaluations = 50
      logging.info("Playing %s games against the minimax player.",
                   num_evaluations)
      (_, losses, draws) = bot_evaluation(game, [minimax_bot, a0.bot],
                                          num_evaluations=num_evaluations)
      logging.info("Result against Minimax player: %s losses and %s draws.",
                   losses, draws)

    logging.info("Running %s games of self play", FLAGS.num_self_play_games)
    a0.self_play(num_self_play_games=FLAGS.num_self_play_games)

    logging.info("Training the net for %s epochs.", FLAGS.num_training_epochs)
    a0.update(FLAGS.num_training_epochs,
              batch_size=FLAGS.batch_size,
              verbose=True)
    logging.info("Cache: %s", evaluator.cache_info())
    evaluator.clear_cache()
Example #4
def actor(*, config, game, logger, queue):
  """An actor process runner that generates games and returns trajectories."""
  logger.print("Initializing model")
  model = _init_model_from_config(config)
  logger.print("Initializing bots")
  az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
  bots = [
      _init_bot(config, game, az_evaluator, False),
      _init_bot(config, game, az_evaluator, False),
  ]
  for game_num in itertools.count():
    if not update_checkpoint(logger, queue, model, az_evaluator):
      return
    queue.put(_play_game(logger, game_num, game, bots, config.temperature,
                         config.temperature_drop))
Example #5
def evaluator(*, game, config, logger, checkpoint, queue):
    """A process that plays the latest checkpoint vs standard MCTS."""
    results = Buffer(config.evaluation_window)
    logger.print("Initializing model")

    # Initialize a fresh model if no checkpoint is given; otherwise load it from the checkpoint.
    if checkpoint is None:
        model = _init_model_from_config(config)
    else:
        model = _init_model_from_checkpoint(checkpoint, config.path)

    logger.print("Initializing bots")
    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    random_evaluator = mcts.RandomRolloutEvaluator()

    for game_num in itertools.count():
        if not update_checkpoint(logger, queue, model, az_evaluator):
            return

        az_player = game_num % 2
        difficulty = (game_num // 2) % config.eval_levels
        max_simulations = int(config.max_simulations * (10**(difficulty / 2)))
        bots = [
            _init_bot(config, game, az_evaluator, True),
            mcts.MCTSBot(game,
                         config.uct_c,
                         max_simulations,
                         random_evaluator,
                         solve=True,
                         verbose=False)
        ]
        if az_player == 1:
            bots = list(reversed(bots))

        trajectory = _play_game(logger,
                                game_num,
                                game,
                                bots,
                                temperature=1,
                                temperature_drop=0)
        results.append(trajectory.returns[az_player])
        queue.put((difficulty, trajectory.returns[az_player]))

        logger.print("AZ: {}, MCTS: {}, AZ avg/{}: {:.3f}".format(
            trajectory.returns[az_player], trajectory.returns[1 - az_player],
            len(results), np.mean(results.data)))
Example #6
def evaluator(*, game, config, logger, num, queue):
    """A process that plays the latest checkpoint vs standard MCTS."""
    max_simulations = config.max_simulations * (3**num)
    logger.print("Running MCTS with", max_simulations, "simulations")
    results = Buffer(config.evaluation_window)
    logger.print("Initializing model")
    model = _init_model_from_config(config)
    logger.print("Initializing bots")
    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    random_evaluator = mcts.RandomRolloutEvaluator()
    az_player = 0
    bots = [
        _init_bot(config, game, az_evaluator, True),
        mcts.MCTSBot(game,
                     config.uct_c,
                     max_simulations,
                     random_evaluator,
                     solve=True,
                     verbose=False)
    ]
    for game_num in itertools.count():
        if not update_checkpoint(logger, queue, model, az_evaluator):
            return

        trajectory = _play_game(logger,
                                game_num,
                                game,
                                bots,
                                temperature=1,
                                temperature_drop=0)
        results.append(trajectory.returns[az_player])

        logger.print("AZ: {}, MCTS: {}, AZ avg/{}: {:.3f}".format(
            trajectory.returns[az_player], trajectory.returns[1 - az_player],
            len(results), np.mean(results.data)))

        # Swap players for the next game
        bots = list(reversed(bots))
        az_player = 1 - az_player
Example #7
  def test_evaluator_caching(self):
    game = pyspiel.load_game("tic_tac_toe")
    model = build_model(game)
    evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)

    state = game.new_initial_state()
    obs = state.observation_tensor()
    act_mask = state.legal_actions_mask()
    action = state.legal_actions()[0]
    policy = np.zeros(len(act_mask), dtype=float)
    policy[action] = 1
    train_inputs = [model_lib.TrainInput(obs, act_mask, policy, value=1)]

    value = evaluator.evaluate(state)
    self.assertEqual(value[0], -value[1])
    value = value[0]

    value2 = evaluator.evaluate(state)[0]
    self.assertEqual(value, value2)

    prior = evaluator.prior(state)
    prior2 = evaluator.prior(state)
    np.testing.assert_array_equal(prior, prior2)

    info = evaluator.cache_info()
    self.assertEqual(info.misses, 1)
    self.assertEqual(info.hits, 3)

    for _ in range(20):
      model.update(train_inputs)

    # Still equal due to not clearing the cache
    value3 = evaluator.evaluate(state)[0]
    self.assertEqual(value, value3)

    info = evaluator.cache_info()
    self.assertEqual(info.misses, 1)
    self.assertEqual(info.hits, 4)

    evaluator.clear_cache()

    info = evaluator.cache_info()
    self.assertEqual(info.misses, 0)
    self.assertEqual(info.hits, 0)

    # Now they differ from before
    value4 = evaluator.evaluate(state)[0]
    value5 = evaluator.evaluate(state)[0]
    self.assertNotEqual(value, value4)
    self.assertEqual(value4, value5)

    info = evaluator.cache_info()
    self.assertEqual(info.misses, 1)
    self.assertEqual(info.hits, 1)

    value6 = evaluator.evaluate(game.new_initial_state())[0]
    self.assertEqual(value4, value6)

    info = evaluator.cache_info()
    self.assertEqual(info.misses, 1)
    self.assertEqual(info.hits, 2)
Example #8
def main(_):
    game = pyspiel.load_game("tic_tac_toe")
    num_actions = game.num_distinct_actions()
    observation_shape = game.observation_tensor_shape()

    # 1. Define a keras net
    if FLAGS.net_type == "resnet":
        net = model_lib.keras_resnet(observation_shape,
                                     num_actions,
                                     num_residual_blocks=1,
                                     num_filters=256,
                                     data_format="channels_first")
    elif FLAGS.net_type == "mlp":
        net = model_lib.keras_mlp(observation_shape,
                                  num_actions,
                                  num_layers=2,
                                  num_hidden=64)
    else:
        raise ValueError(
            ("Invalid value for 'net_type'. Must be either 'mlp' or "
             "'resnet', but was %s") % FLAGS.net_type)

    model = model_lib.Model(net,
                            l2_regularization=1e-4,
                            learning_rate=0.01,
                            device=FLAGS.device)

    # 2. Create an MCTS bot using the keras net defined above
    evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)

    bot = mcts.MCTSBot(game,
                       1.,
                       20,
                       evaluator,
                       solve=False,
                       dirichlet_noise=(0.25, 1.))

    # 3. Build an AlphaZero instance
    a0 = alpha_zero.AlphaZero(
        game,
        bot,
        model,
        replay_buffer_capacity=FLAGS.replay_buffer_capacity,
        action_selection_transition=4)

    # 4. Create a bot using min-max search. It can never lose tic-tac-toe, so
    # a success condition for our AlphaZero bot is to draw all games with it.
    minimax_bot = MinimaxBot(game)

    # 5. Run training loop
    for num_round in range(FLAGS.num_rounds):
        logging.info("------------- Starting round %s out of %s -------------",
                     num_round, FLAGS.num_rounds)

        if num_round % FLAGS.evaluation_frequency == 0:
            num_evaluations = 50
            logging.info("Playing %s games against the minimax player.",
                         num_evaluations)
            (_, losses, draws) = bot_evaluation(game, [minimax_bot, a0.bot],
                                                num_evaluations=num_evaluations)
            logging.info(
                "Result against Minimax player: %s losses and %s draws.",
                losses, draws)

        logging.info("Running %s games of self play",
                     FLAGS.num_self_play_games)
        a0.self_play(num_self_play_games=FLAGS.num_self_play_games)

        logging.info("Training the net for %s epochs.",
                     FLAGS.num_training_epochs)
        a0.update(FLAGS.num_training_epochs,
                  batch_size=FLAGS.batch_size,
                  verbose=True)
        logging.info("Cache: %s", evaluator.cache_info())
        evaluator.clear_cache()