def test_save_load(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

    model_file = 'TESTPOLICY.json'
    weights_file = 'TESTWEIGHTS.h5'
    model_file2 = 'TESTPOLICY2.json'
    weights_file2 = 'TESTWEIGHTS2.h5'

    # test saving model/weights separately
    policy.save_model(model_file)
    policy.model.save_weights(weights_file, overwrite=True)
    # test saving them together
    policy.save_model(model_file2, weights_file2)

    copypolicy = CNNPolicy.load_model(model_file)
    copypolicy.model.load_weights(weights_file)

    copypolicy2 = CNNPolicy.load_model(model_file2)

    for w1, w2 in zip(copypolicy.model.get_weights(), copypolicy2.model.get_weights()):
        self.assertTrue(np.all(w1 == w2))

    os.remove(model_file)
    os.remove(weights_file)
    os.remove(model_file2)
    os.remove(weights_file2)
def run_and_get_new_weights(init_weights, win0, win1):
    state = GameState(size=19)
    policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
    policy.model.set_weights(init_weights)

    optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
    policy.model.compile(loss=log_loss, optimizer=optimizer)

    # Make moves on the state and get trainable (state, action) pairs from them.
    moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
    state_tensors = []
    action_tensors = []
    for m in moves:
        (st_tensor, mv_tensor) = _make_training_pair(state, m, policy.preprocessor)
        state_tensors.append(st_tensor)
        action_tensors.append(mv_tensor)
        state.do_move(m)

    for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
        # Put even state/action pairs in game 0, odd ones in game 1.
        game_idx = i % 2
        optimizer.set_current_game(game_idx)
        is_last_move = i + 2 >= len(moves)
        if is_last_move:
            if game_idx == 0:
                optimizer.set_result(game_idx, win0)
            else:
                optimizer.set_result(game_idx, win1)
        # train_on_batch accumulates gradients, and should only cause a change to
        # parameters on the first call after the final set_result() call.
        policy.model.train_on_batch(s, a)
    return policy.model.get_weights()
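# The helper _make_training_pair used above is not shown in this file. Below is a minimal
# sketch of what it might look like, assuming the preprocessor exposes state_to_tensor()
# and that actions are encoded as one-hot vectors over the flattened board; the name,
# shapes, and the GameState.size attribute are assumptions for illustration, not the
# project's definitive implementation.
def _make_training_pair_sketch(st, mv, preprocessor):
    # Convert the current state into the network's input tensor.
    st_tensor = preprocessor.state_to_tensor(st)
    # One-hot encode the chosen move over the flattened (size * size) board.
    mv_tensor = np.zeros((1, st.size * st.size))
    mv_tensor[0, mv[0] * st.size + mv[1]] = 1
    return (st_tensor, mv_tensor)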
def test_probabilistic_player(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = ProbabilisticPolicyPlayer(policy)
    for i in range(20):
        move = player.get_move(gs)
        self.assertIsNotNone(move)
        gs.do_move(move)
def testApplyAndResetOnGamesFinished(self):
    policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
    state = GameState(size=19)
    optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
    policy.model.compile(loss=log_loss, optimizer=optimizer)

    # Helper to check initial conditions of the optimizer.
    def assertOptimizerInitialConditions():
        for v in optimizer.gradient_sign:
            self.assertEqual(K.eval(v), 0)
        self.assertEqual(K.eval(optimizer.running_games), 2)

    initial_parameters = policy.model.get_weights()

    # Helper to check whether the model parameters have changed from their initial values.
    def assertModelEffect(changed):
        any_change = False
        for cur, init in zip(policy.model.get_weights(), initial_parameters):
            if not np.allclose(init, cur):
                any_change = True
                break
        self.assertEqual(any_change, changed)

    assertOptimizerInitialConditions()

    # Make moves on the state and get trainable (state, action) pairs from them.
    state_tensors = []
    action_tensors = []
    moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
    for m in moves:
        (st_tensor, mv_tensor) = _make_training_pair(state, m, policy.preprocessor)
        state_tensors.append(st_tensor)
        action_tensors.append(mv_tensor)
        state.do_move(m)

    for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
        # Even moves in game 0, odd moves in game 1.
        game_idx = i % 2
        optimizer.set_current_game(game_idx)
        is_last_move = i + 2 >= len(moves)
        if is_last_move:
            # Mark game 0 as a win and game 1 as a loss.
            optimizer.set_result(game_idx, game_idx == 0)
        else:
            # Games not finished yet; assert no change to optimizer state.
            assertOptimizerInitialConditions()
        # train_on_batch accumulates gradients, and should only cause a change to
        # parameters on the first call after the final set_result() call.
        policy.model.train_on_batch(s, a)
        if i + 1 < len(moves):
            assertModelEffect(changed=False)
        else:
            assertModelEffect(changed=True)
    # Once both games are finished, the last call to train_on_batch() should have reset
    # the optimizer state back to its initial conditions.
    assertOptimizerInitialConditions()
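# log_loss, used to compile the model in these tests, is imported from the reinforcement
# learning trainer and not shown here. Below is a minimal sketch of a REINFORCE-style log
# loss, assuming the Keras backend is imported as K and that y_true is the one-hot action
# actually taken; this is an illustrative assumption, not the project's exact definition.
def log_loss_sketch(y_true, y_pred):
    # Negative log-probability of the chosen action, clipped for numerical stability.
    return -K.sum(y_true * K.log(K.clip(y_pred, K.epsilon(), 1.0)), axis=-1)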
def test_sensible_greedy(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = GreedyPolicyPlayer(policy)

    # Fill the board with black stones except for a single empty point; the only remaining
    # move is not "sensible", so the player should return None (i.e. pass).
    empty = (10, 10)
    for x in range(19):
        for y in range(19):
            if (x, y) != empty:
                gs.do_move((x, y), go.BLACK)
    gs.current_player = go.BLACK
    self.assertIsNone(player.get_move(gs))
def test_output_size(self):
    policy19 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=19)
    output = policy19.forward(policy19.preprocessor.state_to_tensor(GameState(19)))
    self.assertEqual(output.shape, (1, 19 * 19))

    policy13 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=13)
    output = policy13.forward(policy13.preprocessor.state_to_tensor(GameState(13)))
    self.assertEqual(output.shape, (1, 13 * 13))
def testGradientDirectionChangesWithGameResult(self):

    def run_and_get_new_weights(init_weights, win0, win1):
        state = GameState(size=19)
        policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
        policy.model.set_weights(init_weights)

        optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
        policy.model.compile(loss=log_loss, optimizer=optimizer)

        # Make moves on the state and get trainable (state, action) pairs from them.
        moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
        state_tensors = []
        action_tensors = []
        for m in moves:
            (st_tensor, mv_tensor) = _make_training_pair(state, m, policy.preprocessor)
            state_tensors.append(st_tensor)
            action_tensors.append(mv_tensor)
            state.do_move(m)

        for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
            # Put even state/action pairs in game 0, odd ones in game 1.
            game_idx = i % 2
            optimizer.set_current_game(game_idx)
            is_last_move = i + 2 >= len(moves)
            if is_last_move:
                if game_idx == 0:
                    optimizer.set_result(game_idx, win0)
                else:
                    optimizer.set_result(game_idx, win1)
            # train_on_batch accumulates gradients, and should only cause a change to
            # parameters on the first call after the final set_result() call.
            policy.model.train_on_batch(s, a)
        return policy.model.get_weights()

    policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
    initial_parameters = policy.model.get_weights()
    # Cases 1 and 2 have identical starting models and identical (state, action) pairs,
    # but they differ in who won the games.
    parameters1 = run_and_get_new_weights(initial_parameters, True, False)
    parameters2 = run_and_get_new_weights(initial_parameters, False, True)
    # Changes in case 1 should be equal and opposite to changes in case 2, allowing a
    # 0.1% relative difference in precision.
    for (i, p1, p2) in zip(initial_parameters, parameters1, parameters2):
        diff1 = p1 - i
        diff2 = p2 - i
        npt.assert_allclose(diff1, -diff2, rtol=1e-3)
def test_batch_eval_state(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
    results = policy.batch_eval_state([GameState(), GameState()])
    self.assertEqual(len(results), 2)  # one result per GameState
    self.assertEqual(len(results[0]), 361)  # each one has 361 (move, prob) pairs
def test_default_policy(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
    policy.eval_state(GameState())
"learning policy weights.", default="d:\ps\club\go\models\tomRL.19.hdf5") parser.add_argument("model_path", help="Path to network architecture file.", default="d:\ps\club\go\models\46feats_model_0515.json") parser.add_argument("--out_pth", "-o", help="Path to where the training " "pairs will be saved. Default: None", default="d:\ps\club\go\gen_value_training_out\outFile.h5") parser.add_argument("--load_from_file", help="Path to HDF5 file to continue from." " Default: None", default=None) parser.add_argument( "--n_training_pairs", help="Number of training pairs to generate. " "(Default: 10)", type=int, default=10) parser.add_argument( "--batch_size", help="Number of games to run in parallel. " "(Default: 2)", type=int, default=2) parser.add_argument( "--board_size", help="Board size (int). " "(Default: 19)", type=int, default=19) args = parser.parse_args() # Load architecture and weights from file policy_SL = CNNPolicy.load_model(args.model_path) features = policy_SL.preprocessor.feature_list if "color" not in features: features.append("color") policy_SL.model.load_weights(args.SL_weights_path) policy_RL = CNNPolicy.load_model(args.model_path) policy_RL.model.load_weights(args.RL_weights_path) # Create player object that plays against itself (for both RL and SL phases) player_RL = ProbabilisticPolicyPlayer(policy_RL) player_SL = ProbabilisticPolicyPlayer(policy_SL) run(player_RL, player_SL, args.out_pth, args.n_training_pairs, args.batch_size, args.board_size, features)
def run_training(cmd_line_args=None):
    import argparse
    parser = argparse.ArgumentParser(
        description='Perform reinforcement learning to improve given policy network. Second phase of pipeline.')  # noqa: E501
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument("initial_weights",
                        help="Path to HDF5 file with initial weights (i.e. result of supervised training).")  # noqa: E501
    parser.add_argument("out_directory",
                        help="Path to folder where the model params and metadata will be saved after each epoch.")  # noqa: E501
    parser.add_argument("--learning-rate", help="Keras learning rate (Default: 0.001)",
                        type=float, default=0.001)
    parser.add_argument("--policy-temp",
                        help="Distribution temperature of players using policies (Default: 0.67)",
                        type=float, default=0.67)
    parser.add_argument("--save-every",
                        help="Save policy as a new opponent every n batches (Default: 500)",
                        type=int, default=500)
    parser.add_argument("--game-batch", help="Number of games per mini-batch (Default: 20)",
                        type=int, default=20)
    parser.add_argument("--move-limit", help="Maximum number of moves per game (Default: 500)",
                        type=int, default=500)
    parser.add_argument("--iterations",
                        help="Number of training batches/iterations (Default: 10000)",
                        type=int, default=10000)
    parser.add_argument("--resume", help="Load latest weights in out_directory and resume",
                        default=False, action="store_true")
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    # Baseline function (TODO) default lambda state: 0 (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory, "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print "creating output directory {}".format(args.out_directory)
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of the weights file, "weights.00000.hdf5", in the output directory
        copyfile(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print "copied {} to {}".format(args.initial_weights,
                                           os.path.join(args.out_directory, ZEROTH_FILE))
        player_weights = ZEROTH_FILE
    else:
        # if resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path
        args.initial_weights = os.path.join(args.out_directory,
                                            os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(args.initial_weights))
        elif args.verbose:
            print "Resuming with weights {}".format(args.initial_weights)
        player_weights = os.path.basename(args.initial_weights)

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp,
                                       move_limit=args.move_limit)

    # Different opponents come from simply changing the weights of 'opponent.policy.model'.
    # That is, only 'opp_policy' needs to be changed, and 'opponent' will change with it.
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy, temperature=args.policy_temp,
                                         move_limit=args.move_limit)

    if args.verbose:
        print "created player and opponent with temperature {}".format(args.policy_temp)

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            # weights from which to sample an opponent each batch
            "opponents": [ZEROTH_FILE],
            # map from player to tuple of (opponent, win ratio); useful for
            # validating in lieu of 'accuracy/loss'
            "win_ratio": {}
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    # Append args of current run to history of full command args.
    # Note: list.append() returns None, so assign the list first and then append to it.
    metadata["cmd_line_args"] = metadata.get("cmd_line_args", [])
    metadata["cmd_line_args"].append(vars(args))

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f, sort_keys=True, indent=2)

    optimizer = BatchedReinforcementLearningSGD(lr=args.learning_rate, ng=args.game_batch)
    player.policy.model.compile(loss=log_loss, optimizer=optimizer)
    for i_iter in xrange(1, args.iterations + 1):
        # Randomly choose an opponent from the pool (possibly self), and play
        # game_batch games against them.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)

        # Load new weights into opponent's network, but keep the same opponent object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print "Batch {}\tsampled opponent is {}".format(i_iter, opp_weights)

        # Run games (and learn from results). Keep track of the win ratio vs
        # each opponent over time.
        win_ratio = run_n_games(optimizer, player, opponent, args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)

        # Save all intermediate models.
        player_weights = "weights.%05d.hdf5" % i_iter
        player.policy.model.save_weights(os.path.join(args.out_directory, player_weights))

        # Add player to the pool of opponents once in a while.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
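# A minimal entry-point sketch, assuming this module is run directly as a script (the guard
# below and the script/file names in the example command are assumptions, not part of the
# original source). The flags match the arguments defined in run_training() above:
#
#   python reinforcement_policy_trainer.py model.json weights.00000.hdf5 training_output \
#       --learning-rate 0.001 --game-batch 20 --save-every 500 --iterations 10000 --verbose
if __name__ == '__main__':
    run_training()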