Example #1
    def test_save_load(self):
        policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

        model_file = 'TESTPOLICY.json'
        weights_file = 'TESTWEIGHTS.h5'
        model_file2 = 'TESTPOLICY2.json'
        weights_file2 = 'TESTWEIGHTS2.h5'

        # test saving model/weights separately
        policy.save_model(model_file)
        policy.model.save_weights(weights_file, overwrite=True)
        # test saving them together
        policy.save_model(model_file2, weights_file2)

        copypolicy = CNNPolicy.load_model(model_file)
        copypolicy.model.load_weights(weights_file)

        copypolicy2 = CNNPolicy.load_model(model_file2)

        for w1, w2 in zip(copypolicy.model.get_weights(), copypolicy2.model.get_weights()):
            self.assertTrue(np.all(w1 == w2))

        os.remove(model_file)
        os.remove(weights_file)
        os.remove(model_file2)
        os.remove(weights_file2)
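
The round trip exercised above is the normal way to persist a trained policy: save_model writes the architecture JSON (optionally together with weights), and load_model plus load_weights rebuilds it. A minimal sketch outside the test harness; the import path and file names are assumptions (the snippets on this page omit their imports), while the CNNPolicy calls mirror the test:

from AlphaGo.models.policy import CNNPolicy  # assumed import path

policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

# Save architecture and weights separately...
policy.save_model("policy.json")
policy.model.save_weights("policy.weights.h5", overwrite=True)
# ...or both in a single call, as with model_file2/weights_file2 above.
policy.save_model("policy2.json", "policy2.weights.h5")

# Rebuild the network from the JSON architecture, then load weights into it.
restored = CNNPolicy.load_model("policy.json")
restored.model.load_weights("policy.weights.h5")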
Example #2
        def run_and_get_new_weights(init_weights, win0, win1):
            state = GameState(size=19)
            policy = CNNPolicy.load_model(
                os.path.join('test_data', 'minimodel.json'))
            policy.model.set_weights(init_weights)
            optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
            policy.model.compile(loss=log_loss, optimizer=optimizer)

            # Make moves on the state and get trainable (state, action) pairs from them.
            moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
            state_tensors = []
            action_tensors = []
            for m in moves:
                st_tensor, mv_tensor = _make_training_pair(
                    state, m, policy.preprocessor)
                state_tensors.append(st_tensor)
                action_tensors.append(mv_tensor)
                state.do_move(m)

            for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
                # Put even state/action pairs in game 0, odd ones in game 1.
                game_idx = i % 2
                optimizer.set_current_game(game_idx)
                is_last_move = i + 2 >= len(moves)
                if is_last_move:
                    if game_idx == 0:
                        optimizer.set_result(game_idx, win0)
                    else:
                        optimizer.set_result(game_idx, win1)
                # train_on_batch accumulates gradients, and should only cause a change to parameters
                # on the first call after the final set_result() call
                policy.model.train_on_batch(s, a)
            return policy.model.get_weights()
Example #3
    def test_probabilistic_player(self):
        gs = GameState()
        policy = CNNPolicy(["board", "ones", "turns_since"])
        player = ProbabilisticPolicyPlayer(policy)
        for i in range(20):
            move = player.get_move(gs)
            self.assertIsNotNone(move)
            gs.do_move(move)
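
The same get_move/do_move loop drives full self-play games, which is how the reinforcement-learning pipeline in Example #10 below generates its training games. A small sketch with assumed import paths and an arbitrary move budget:

from AlphaGo.go import GameState                   # assumed import path
from AlphaGo.models.policy import CNNPolicy        # assumed import path
from AlphaGo.ai import ProbabilisticPolicyPlayer   # assumed import path

policy = CNNPolicy(["board", "ones", "turns_since"])
player = ProbabilisticPolicyPlayer(policy)  # one player drives both colors

gs = GameState()
history = []
for _ in range(50):
    move = player.get_move(gs)
    if move is None:
        # Example #5 shows GreedyPolicyPlayer.get_move returning None when no move
        # is available; assume the probabilistic player behaves the same way.
        break
    history.append(move)
    gs.do_move(move)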
Example #4
    def testApplyAndResetOnGamesFinished(self):
        policy = CNNPolicy.load_model(
            os.path.join('test_data', 'minimodel.json'))
        state = GameState(size=19)
        optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
        policy.model.compile(loss=log_loss, optimizer=optimizer)

        # Helper to check initial conditions of the optimizer.
        def assertOptimizerInitialConditions():
            for v in optimizer.gradient_sign:
                self.assertEqual(K.eval(v), 0)
            self.assertEqual(K.eval(optimizer.running_games), 2)

        initial_parameters = policy.model.get_weights()

        def assertModelEffect(changed):
            any_change = False
            for cur, init in zip(policy.model.get_weights(),
                                 initial_parameters):
                if not np.allclose(init, cur):
                    any_change = True
                    break
            self.assertEqual(any_change, changed)

        assertOptimizerInitialConditions()

        # Make moves on the state and get trainable (state, action) pairs from them.
        state_tensors = []
        action_tensors = []
        moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
        for m in moves:
            st_tensor, mv_tensor = _make_training_pair(
                state, m, policy.preprocessor)
            state_tensors.append(st_tensor)
            action_tensors.append(mv_tensor)
            state.do_move(m)

        for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
            # Even moves in game 0, odd moves in game 1
            game_idx = i % 2
            optimizer.set_current_game(game_idx)
            is_last_move = i + 2 >= len(moves)
            if is_last_move:
                # Mark game 0 as a win and game 1 as a loss.
                optimizer.set_result(game_idx, game_idx == 0)
            else:
                # Games not finished yet; assert no change to optimizer state.
                assertOptimizerInitialConditions()
            # train_on_batch accumulates gradients, and should only cause a change to parameters
            # on the first call after the final set_result() call
            policy.model.train_on_batch(s, a)
            if i + 1 < len(moves):
                assertModelEffect(changed=False)
            else:
                assertModelEffect(changed=True)
        # Once both games finished, the last call to train_on_batch() should have triggered a reset
        # to the optimizer parameters back to initial conditions.
        assertOptimizerInitialConditions()
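
The assertions above pin down the optimizer's contract: gradients accumulate across train_on_batch calls, each game's accumulated gradient is signed by set_result (win or loss), the weights change only on the first train_on_batch after every running game has a result, and the optimizer then resets itself. A condensed sketch of that usage pattern; the import paths are assumptions, and the sequential per-game loop is for illustration only (the tests interleave two games instead):

import os
from AlphaGo.go import GameState                              # assumed import path
from AlphaGo.models.policy import CNNPolicy                   # assumed import path
from AlphaGo.training.reinforcement_policy_trainer import (   # assumed import path
    BatchedReinforcementLearningSGD, log_loss, _make_training_pair)

policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)  # ng parallel games
policy.model.compile(loss=log_loss, optimizer=optimizer)

# Two toy games built from the moves used in the tests: game 0 won, game 1 lost.
games = [
    ([(2, 2), (3, 17), (4, 10)], True),
    ([(16, 16), (16, 2), (10, 3)], False),
]
for game_idx, (moves, won) in enumerate(games):
    optimizer.set_current_game(game_idx)
    state = GameState(size=19)
    for i, m in enumerate(moves):
        s, a = _make_training_pair(state, m, policy.preprocessor)
        state.do_move(m)
        if i == len(moves) - 1:
            # Sign this game's accumulated gradient before its final batch.
            optimizer.set_result(game_idx, won)
        # Accumulates gradients; the weights change only on the first call made
        # after the final set_result() across all ng games, then everything resets.
        policy.model.train_on_batch(s, a)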
Example #5
    def test_sensible_greedy(self):
        gs = GameState()
        policy = CNNPolicy(["board", "ones", "turns_since"])
        player = GreedyPolicyPlayer(policy)
        empty = (10, 10)
        for x in range(19):
            for y in range(19):
                if (x, y) != empty:
                    gs.do_move((x, y), go.BLACK)
        gs.current_player = go.BLACK
        self.assertIsNone(player.get_move(gs))
Example #6
    def test_output_size(self):
        policy19 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=19)
        output = policy19.forward(policy19.preprocessor.state_to_tensor(GameState(19)))
        self.assertEqual(output.shape, (1, 19 * 19))

        policy13 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=13)
        output = policy13.forward(policy13.preprocessor.state_to_tensor(GameState(13)))
        self.assertEqual(output.shape, (1, 13 * 13))
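
The forward pass returns a single flat row of length board * board. A tiny sketch of turning that vector back into a board-shaped probability map; note that the row-major layout of the flattened output is an assumption for illustration, not something the test above verifies:

import numpy as np
from AlphaGo.go import GameState              # assumed import path
from AlphaGo.models.policy import CNNPolicy   # assumed import path

policy19 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=19)
flat = policy19.forward(policy19.preprocessor.state_to_tensor(GameState(19)))  # shape (1, 361)

# Assumes the 361 outputs are laid out row-major over board coordinates.
prob_map = np.asarray(flat).reshape(19, 19)
best_coord = np.unravel_index(int(np.argmax(flat)), (19, 19))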
Example #7
    def testGradientDirectionChangesWithGameResult(self):
        def run_and_get_new_weights(init_weights, win0, win1):
            state = GameState(size=19)
            policy = CNNPolicy.load_model(
                os.path.join('test_data', 'minimodel.json'))
            policy.model.set_weights(init_weights)
            optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
            policy.model.compile(loss=log_loss, optimizer=optimizer)

            # Make moves on the state and get trainable (state, action) pairs from them.
            moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
            state_tensors = []
            action_tensors = []
            for m in moves:
                st_tensor, mv_tensor = _make_training_pair(
                    state, m, policy.preprocessor)
                state_tensors.append(st_tensor)
                action_tensors.append(mv_tensor)
                state.do_move(m)

            for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
                # Put even state/action pairs in game 0, odd ones in game 1.
                game_idx = i % 2
                optimizer.set_current_game(game_idx)
                is_last_move = i + 2 >= len(moves)
                if is_last_move:
                    if game_idx == 0:
                        optimizer.set_result(game_idx, win0)
                    else:
                        optimizer.set_result(game_idx, win1)
                # train_on_batch accumulates gradients, and should only cause a change to parameters
                # on the first call after the final set_result() call
                policy.model.train_on_batch(s, a)
            return policy.model.get_weights()

        policy = CNNPolicy.load_model(
            os.path.join('test_data', 'minimodel.json'))
        initial_parameters = policy.model.get_weights()
        # Cases 1 and 2 have identical starting models and identical (state, action) pairs,
        # but they differ in who won the games.
        parameters1 = run_and_get_new_weights(initial_parameters, True, False)
        parameters2 = run_and_get_new_weights(initial_parameters, False, True)

        # Changes in case 1 should be equal and opposite to changes in case 2,
        # allowing 0.1% relative tolerance.
        for (i, p1, p2) in zip(initial_parameters, parameters1, parameters2):
            diff1 = p1 - i
            diff2 = p2 - i
            npt.assert_allclose(diff1, -diff2, rtol=1e-3)
Example #8
    def test_batch_eval_state(self):
        policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
        results = policy.batch_eval_state([GameState(), GameState()])
        self.assertEqual(len(results), 2)  # one result per GameState
        self.assertEqual(len(results[0]), 361)  # each one has 361 (move, prob) pairs
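
Because batch_eval_state returns one list of (move, prob) pairs per GameState, picking the most likely move from a result is a one-liner. A small sketch with assumed import paths:

from AlphaGo.go import GameState              # assumed import path
from AlphaGo.models.policy import CNNPolicy   # assumed import path

policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
results = policy.batch_eval_state([GameState(), GameState()])

# Each result is a list of (move, probability) pairs; take the highest-probability move.
best_move, best_prob = max(results[0], key=lambda pair: pair[1])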
Example #9
    def test_default_policy(self):
        policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
        policy.eval_state(GameState())
Example #10
                        "learning policy weights.", default="d:\ps\club\go\models\tomRL.19.hdf5")
    parser.add_argument("model_path", help="Path to network architecture file.", default="d:\ps\club\go\models\46feats_model_0515.json")
    parser.add_argument("--out_pth", "-o", help="Path to where the training "
                        "pairs will be saved. Default: None", default="d:\ps\club\go\gen_value_training_out\outFile.h5")
    parser.add_argument("--load_from_file", help="Path to HDF5 file to continue from."
                        " Default: None", default=None)
    parser.add_argument(
        "--n_training_pairs", help="Number of training pairs to generate. "
        "(Default: 10)", type=int, default=10)
    parser.add_argument(
        "--batch_size", help="Number of games to run in parallel. "
        "(Default: 2)", type=int, default=2)
    parser.add_argument(
        "--board_size", help="Board size (int). "
        "(Default: 19)", type=int, default=19)
    args = parser.parse_args()

    # Load architecture and weights from file
    policy_SL = CNNPolicy.load_model(args.model_path)
    features = policy_SL.preprocessor.feature_list
    if "color" not in features:
        features.append("color")
    policy_SL.model.load_weights(args.SL_weights_path)
    policy_RL = CNNPolicy.load_model(args.model_path)
    policy_RL.model.load_weights(args.RL_weights_path)
    # Create player objects for the RL and SL policies (each plays against itself)
    player_RL = ProbabilisticPolicyPlayer(policy_RL)
    player_SL = ProbabilisticPolicyPlayer(policy_SL)
    run(player_RL, player_SL, args.out_pth, args.n_training_pairs,
        args.batch_size, args.board_size, features)
def run_training(cmd_line_args=None):
    import argparse
    parser = argparse.ArgumentParser(
        description=
        'Perform reinforcement learning to improve given policy network. Second phase of pipeline.'
    )  # noqa: E501
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument(
        "initial_weights",
        help=
        "Path to HDF5 file with inital weights (i.e. result of supervised training)."
    )  # noqa: E501
    parser.add_argument(
        "out_directory",
        help=
        "Path to folder where the model params and metadata will be saved after each epoch."
    )  # noqa: E501
    parser.add_argument("--learning-rate",
                        help="Keras learning rate (Default: 0.001)",
                        type=float,
                        default=0.001)  # noqa: E501
    parser.add_argument(
        "--policy-temp",
        help=
        "Distribution temperature of players using policies (Default: 0.67)",
        type=float,
        default=0.67)  # noqa: E501
    parser.add_argument(
        "--save-every",
        help="Save policy as a new opponent every n batches (Default: 500)",
        type=int,
        default=500)  # noqa: E501
    parser.add_argument("--game-batch",
                        help="Number of games per mini-batch (Default: 20)",
                        type=int,
                        default=20)  # noqa: E501
    parser.add_argument("--move-limit",
                        help="Maximum number of moves per game",
                        type=int,
                        default=500)  # noqa: E501
    parser.add_argument(
        "--iterations",
        help="Number of training batches/iterations (Default: 10000)",
        type=int,
        default=10000)  # noqa: E501
    parser.add_argument("--resume",
                        help="Load latest weights in out_directory and resume",
                        default=False,
                        action="store_true")  # noqa: E501
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")  # noqa: E501
    # TODO: baseline function, default `lambda state: 0`. It should receive either file
    # paths to a model JSON and weights, or None, in which case the default baseline of 0 is used.
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory,
                                           "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print "creating output directory {}".format(args.out_directory)
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of weights file, "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights,
                 os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print "copied {} to {}".format(
                args.initial_weights,
                os.path.join(args.out_directory, ZEROTH_FILE))
        player_weights = ZEROTH_FILE
    else:
        # if resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path
        args.initial_weights = os.path.join(
            args.out_directory, os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print "Resuming with weights {}".format(args.initial_weights)
        player_weights = os.path.basename(args.initial_weights)

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy,
                                       temperature=args.policy_temp,
                                       move_limit=args.move_limit)

    # Different opponents are obtained simply by loading new weights into 'opp_policy'
    # (i.e. 'opponent.policy.model'); the 'opponent' object itself is reused.
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy,
                                         temperature=args.policy_temp,
                                         move_limit=args.move_limit)

    if args.verbose:
        print "created player and opponent with temperature {}".format(
            args.policy_temp)

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            "opponents":
            [ZEROTH_FILE
             ],  # which weights from which to sample an opponent each batch
            "win_ratio":
            {}  # map from player to tuple of (opponent, win ratio) Useful for
            # validating in lieu of 'accuracy/loss'
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    # Append args of current run to history of full command args.
    metadata["cmd_line_args"] = metadata.get("cmd_line_args",
                                             []).append(vars(args))

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f, sort_keys=True, indent=2)

    optimizer = BatchedReinforcementLearningSGD(lr=args.learning_rate,
                                                ng=args.game_batch)
    player.policy.model.compile(loss=log_loss, optimizer=optimizer)
    for i_iter in xrange(1, args.iterations + 1):
        # Randomly choose an opponent from the pool (possibly a past self) and play
        # game_batch games against it.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)

        # Load new weights into opponent's network, but keep the same opponent object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print "Batch {}\tsampled opponent is {}".format(
                i_iter, opp_weights)

        # Run games (and learn from results). Keep track of the win ratio vs
        # each opponent over time.
        win_ratio = run_n_games(optimizer, player, opponent, args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)

        # Save all intermediate models.
        player_weights = "weights.%05d.hdf5" % i_iter
        player.policy.model.save_weights(
            os.path.join(args.out_directory, player_weights))

        # Add the current player to the pool of opponents every save_every iterations.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
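
Because run_training accepts its arguments through the optional cmd_line_args parameter as well as from sys.argv, it can be driven from another script or a test in the same module. A hedged example invocation; all paths and the small batch/iteration counts are hypothetical:

# Hypothetical paths; the flags match the parser defined above.
run_training([
    "models/policy.json",            # model_json
    "weights/weights.00000.hdf5",    # initial_weights from supervised training
    "training_output",               # out_directory (created if it does not exist)
    "--game-batch", "2",
    "--iterations", "10",
    "--save-every", "5",
    "--verbose",
])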