Example No. 1
    def test_save_load(self):
        policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

        model_file = "TESTPOLICY.json"
        weights_file = "TESTWEIGHTS.h5"
        model_file2 = "TESTPOLICY2.json"
        weights_file2 = "TESTWEIGHTS2.h5"

        # test saving model/weights separately
        policy.save_model(model_file)
        policy.model.save_weights(weights_file, overwrite=True)
        # test saving them together
        policy.save_model(model_file2, weights_file2)

        copypolicy = CNNPolicy.load_model(model_file)
        copypolicy.model.load_weights(weights_file)

        copypolicy2 = CNNPolicy.load_model(model_file2)

        for w1, w2 in zip(copypolicy.model.get_weights(), copypolicy2.model.get_weights()):
            self.assertTrue(np.all(w1 == w2))

        os.remove(model_file)
        os.remove(weights_file)
        os.remove(model_file2)
        os.remove(weights_file2)
        def run_and_get_new_weights(init_weights, winners, game):

            # Create "mock" states that end after 2 moves with a predetermined winner.
            states = [MockState(winner, 2, size=19) for winner in winners]

            policy1 = CNNPolicy.load_model(
                os.path.join('tests', 'test_data', 'minimodel_policy.json'))
            policy2 = CNNPolicy.load_model(
                os.path.join('tests', 'test_data', 'minimodel_policy.json'))
            policy1.model.set_weights(init_weights)
            optimizer = SGD(lr=0.001)
            policy1.model.compile(loss=log_loss, optimizer=optimizer)

            learner = MockPlayer(policy1, game)
            opponent = MockPlayer(policy2, game)

            # Run RL training
            run_n_games(optimizer,
                        0.001,
                        learner,
                        opponent,
                        2,
                        mock_states=states)

            return policy1.model.get_weights()
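
These RL tests compile the policy model with a custom log_loss objective imported from the project's reinforcement-learning trainer, which is not shown in these excerpts. A minimal sketch of what such an objective might look like, assuming the Keras backend API:

from keras import backend as K

def log_loss(y_true, y_pred):
    # Negative log-likelihood of the action(s) encoded in y_true, with y_pred
    # clipped away from 0 and 1 to keep the logarithm finite.
    return -y_true * K.log(K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()))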
Example No. 3
    def test_save_load(self):
        policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

        model_file = 'TESTPOLICY.json'
        weights_file = 'TESTWEIGHTS.h5'
        model_file2 = 'TESTPOLICY2.json'
        weights_file2 = 'TESTWEIGHTS2.h5'

        # test saving model/weights separately
        policy.save_model(model_file)
        policy.model.save_weights(weights_file, overwrite=True)
        # test saving them together
        policy.save_model(model_file2, weights_file2)

        copypolicy = CNNPolicy.load_model(model_file)
        copypolicy.model.load_weights(weights_file)

        copypolicy2 = CNNPolicy.load_model(model_file2)

        for w1, w2 in zip(copypolicy.model.get_weights(), copypolicy2.model.get_weights()):
            self.assertTrue(np.all(w1 == w2))

        os.remove(model_file)
        os.remove(weights_file)
        os.remove(model_file2)
        os.remove(weights_file2)
        def test_game_decrease(game):

            # Create "mock" state that ends after 20 moves with the learner losing
            lose_state = [MockState(go.WHITE, 20, size=19)]
            policy1 = CNNPolicy.load_model(
                os.path.join('tests', 'test_data', 'minimodel_policy.json'))
            policy2 = CNNPolicy.load_model(
                os.path.join('tests', 'test_data', 'minimodel_policy.json'))
            learner = MockPlayer(policy1, game)
            opponent = MockPlayer(policy2, game)
            optimizer = SGD(lr=0.001)
            policy1.model.compile(loss=log_loss, optimizer=optimizer)

            # Get initial (before learning) move probabilities for all moves made by black
            init_move_probs = get_sgf_move_probs(game, policy1, go.BLACK)
            init_probs = [prob for (mv, prob) in init_move_probs]

            # Run RL training
            run_n_games(optimizer,
                        0.001,
                        learner,
                        opponent,
                        1,
                        mock_states=lose_state)

            # Get new move probabilities for black's moves having finished 1 round of training
            new_move_probs = get_sgf_move_probs(game, policy1, go.BLACK)
            new_probs = [prob for (mv, prob) in new_move_probs]

            # Assert that, on average, move probabilities for black decreased having lost.
            self.assertTrue(
                sum((new_probs[i] - init_probs[i]) for i in range(10)) < 0)
        def test_game_gradient(game):

            policy = CNNPolicy.load_model(
                os.path.join('tests', 'test_data', 'minimodel_policy.json'))
            initial_parameters = policy.model.get_weights()
            # Cases 1 and 2 have identical starting models and identical (state, action) pairs,
            # but they differ in who won the games.
            parameters1 = run_and_get_new_weights(initial_parameters,
                                                  [go.BLACK, go.WHITE], game)
            parameters2 = run_and_get_new_weights(initial_parameters,
                                                  [go.WHITE, go.BLACK], game)

            # Assert that some parameters changed.
            any_change_1 = any(
                not np.array_equal(i, p1)
                for (i, p1) in zip(initial_parameters, parameters1))
            any_change_2 = any(
                not np.array_equal(i, p2)
                for (i, p2) in zip(initial_parameters, parameters2))
            self.assertTrue(any_change_1)
            self.assertTrue(any_change_2)

            # Changes in case 1 should be equal and opposite to changes in case 2. Allowing 0.1%
            # difference in precision.
            for (i, p1, p2) in zip(initial_parameters, parameters1,
                                   parameters2):
                diff1 = p1 - i
                diff2 = p2 - i
                npt.assert_allclose(diff1, -diff2, rtol=1e-3, atol=1e-11)
        def test_game_run_N(game):
            policy1 = CNNPolicy.load_model(os.path.join('tests', 'test_data', 'minimodel.json'))
            policy2 = CNNPolicy.load_model(os.path.join('tests', 'test_data', 'minimodel.json'))
            learner = MockPlayer(policy1, game)
            opponent = MockPlayer(policy2, game)
            optimizer = SGD()
            init_weights = policy1.model.get_weights()
            policy1.model.compile(loss=log_loss, optimizer=optimizer)

            # Run RL training
            run_n_games(optimizer, learner, opponent, 2)

            # Get new weights for comparison
            trained_weights = policy1.model.get_weights()

            # Assert that some parameters changed.
            any_change = any(not np.array_equal(i, t)
                             for (i, t) in zip(init_weights, trained_weights))
            self.assertTrue(any_change)
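
A note on the equal-and-opposite assertion in test_game_gradient: run_n_games performs a REINFORCE-style update in which the per-game reward is +1 for a win and -1 for a loss, so a single SGD step moves the parameters by roughly lr * reward * grad(log pi(action | state)). Swapping the winners of two otherwise identical games flips the sign of the reward for the same (state, action) pairs and therefore negates the parameter changes, up to the floating-point tolerance allowed by the rtol/atol arguments of npt.assert_allclose.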
Example No. 7
	def test_save_load(self):
		policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

		model_file = 'TESTPOLICY.json'
		weights_file = 'TESTWEIGHTS.h5'

		policy.save_model(model_file)
		policy.model.save_weights(weights_file)

		copypolicy = CNNPolicy.load_model(model_file)
		copypolicy.model.load_weights(weights_file)

		os.remove(model_file)
		os.remove(weights_file)
Example No. 8
    def test_save_load(self):
        policy = CNNPolicy(
            ["board", "liberties", "sensibleness", "capture_size"])

        model_file = 'TESTPOLICY.json'
        weights_file = 'TESTWEIGHTS.h5'

        policy.save_model(model_file)
        policy.model.save_weights(weights_file)

        copypolicy = CNNPolicy.load_model(model_file)
        copypolicy.model.load_weights(weights_file)

        os.remove(model_file)
        os.remove(weights_file)
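
Outside of the test harness, the same save/load round trip looks roughly like the sketch below; the AlphaGo.models.policy and AlphaGo.go import paths, and the eval_state/GameState usage, are assumptions based on how this project is typically laid out rather than something shown in these excerpts.

from AlphaGo.models.policy import CNNPolicy
from AlphaGo.go import GameState

policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
policy.save_model("policy.json", "policy.hdf5")   # architecture plus weights
restored = CNNPolicy.load_model("policy.json")    # rebuild architecture from JSON
restored.model.load_weights("policy.hdf5")        # then restore the weights
probs = restored.eval_state(GameState(size=19))   # list of (move, probability) pairs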
Example No. 9
def run_training(cmd_line_args=None):
	"""Run training. command-line args may be passed in as a list
	"""
	import argparse
	parser = argparse.ArgumentParser(description='Perform supervised training on a policy network.')
	# required args
	parser.add_argument("model", help="Path to a JSON model file (i.e. from CNNPolicy.save_model())")
	parser.add_argument("train_data", help="A .h5 file of training data")
	parser.add_argument("out_directory", help="directory where metadata and weights will be saved")
	# frequently used args
	parser.add_argument("--minibatch", "-B", help="Size of training data minibatches. Default: 16", type=int, default=16)
	parser.add_argument("--epochs", "-E", help="Total number of iterations on the data. Default: 10", type=int, default=10)
	parser.add_argument("--epoch-length", "-l", help="Number of training examples considered 'one epoch'. Default: # training data", type=int, default=None)
	parser.add_argument("--learning-rate", "-r", help="Learning rate - how quickly the model learns at first. Default: .03", type=float, default=.03)
	parser.add_argument("--decay", "-d", help="The rate at which learning decreases. Default: .0001", type=float, default=.0001)
	parser.add_argument("--verbose", "-v", help="Turn on verbose mode", default=False, action="store_true")
	# slightly fancier args
	parser.add_argument("--weights", help="Name of a .h5 weights file (in the output directory) to load to resume training", default=None)
	parser.add_argument("--train-val-test", help="Fraction of data to use for training/val/test. Must sum to 1. Invalid if restarting training", nargs=3, type=float, default=[0.93, .05, .02])
	# TODO - an argument to specify which transformations to use, put it in metadata

	if cmd_line_args is None:
		args = parser.parse_args()
	else:
		args = parser.parse_args(cmd_line_args)

	# TODO - what follows here should be refactored into a series of small functions

	resume = args.weights is not None

	if args.verbose:
		if resume:
			print "trying to resume from %s with weights %s" % (args.out_directory, os.path.join(args.out_directory, args.weights))
		else:
			if os.path.exists(args.out_directory):
				print "directory %s exists. any previous data will be overwritten" % args.out_directory
			else:
				print "starting fresh output directory %s" % args.out_directory

	# load model from json spec
	model = CNNPolicy.load_model(args.model).model
	if resume:
		model.load_weights(os.path.join(args.out_directory, args.weights))

	# TODO - (waiting on game_converter) verify that features of model match features of training data
	dataset = h5.File(args.train_data)
	n_total_data = len(dataset["states"])
	n_train_data = int(args.train_val_test[0] * n_total_data)
	n_val_data = int(args.train_val_test[1] * n_total_data)
	# n_test_data = n_total_data - (n_train_data + n_val_data)

	if args.verbose:
		print "datset loaded"
		print "\t%d total samples" % n_total_data
		print "\t%d training samples" % n_train_data
		print "\t%d validaion samples" % n_val_data

	# ensure output directory is available
	if not os.path.exists(args.out_directory):
		os.makedirs(args.out_directory)

	# create metadata file and the callback object that will write to it
	meta_file = os.path.join(args.out_directory, "metadata.json")
	meta_writer = MetadataWriterCallback(meta_file)
	# load prior data if it already exists
	if os.path.exists(meta_file) and resume:
		with open(meta_file, "r") as f:
			meta_writer.metadata = json.load(f)
		if args.verbose:
			print "previous metadata loadeda: %d epochs. new epochs will be appended." % len(meta_writer.metadata["epochs"])
	elif args.verbose:
		print "starting with empty metadata"
	# the MetadataWriterCallback only sets 'epoch' and 'best_epoch'. We can add in anything else we like here
	# TODO - model and train_data are saved in meta_file; check that they match (and make args optional when restarting?)
	meta_writer.metadata["training_data"] = args.train_data
	meta_writer.metadata["model_file"] = args.model

	# create ModelCheckpoint to save weights every epoch
	checkpoint_template = os.path.join(args.out_directory, "weights.{epoch:05d}.hdf5")
	checkpointer = ModelCheckpoint(checkpoint_template)

	# load precomputed random-shuffle indices or create them
	# TODO - save each train/val/test indices separately so there's no danger of
	# changing args.train_val_test when resuming
	shuffle_file = os.path.join(args.out_directory, "shuffle.npz")
	if os.path.exists(shuffle_file) and resume:
		with open(shuffle_file, "r") as f:
			shuffle_indices = np.load(f)
		if args.verbose:
			print "loading previous data shuffling indices"
	else:
		# create shuffled indices
		shuffle_indices = np.random.permutation(n_total_data)
		with open(shuffle_file, "w") as f:
			np.save(f, shuffle_indices)
		if args.verbose:
			print "created new data shuffling indices"
	# training indices are the first consecutive set of shuffled indices, val next, then test gets the remainder
	train_indices = shuffle_indices[0:n_train_data]
	val_indices = shuffle_indices[n_train_data:n_train_data + n_val_data]
	# test_indices = shuffle_indices[n_train_data + n_val_data:]

	# create dataset generators
	train_data_generator = shuffled_hdf5_batch_generator(
		dataset["states"],
		dataset["actions"],
		train_indices,
		args.minibatch,
		BOARD_TRANSFORMATIONS)
	val_data_generator = shuffled_hdf5_batch_generator(
		dataset["states"],
		dataset["actions"],
		val_indices,
		args.minibatch,
		BOARD_TRANSFORMATIONS)

	sgd = SGD(lr=args.learning_rate, decay=args.decay)
	model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

	samples_per_epoch = args.epoch_length or n_train_data

	if args.verbose:
		print "STARTING TRAINING"

	model.fit_generator(
		generator=train_data_generator,
		samples_per_epoch=samples_per_epoch,
		nb_epoch=args.epochs,
		callbacks=[checkpointer, meta_writer],
		validation_data=val_data_generator,
		nb_val_samples=n_val_data)
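
Because run_training accepts its command-line arguments as a list, this supervised-training entry point can also be driven programmatically; a sketch with illustrative file paths:

run_training([
    "policy_model.json",   # model spec produced by CNNPolicy.save_model()
    "training_data.h5",    # HDF5 file of (state, action) training pairs
    "training_output",     # metadata.json and per-epoch weights are written here
    "--minibatch", "16",
    "--epochs", "10",
    "--learning-rate", "0.03",
    "--verbose",
])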
Example No. 10
def train(metadata, out_directory, verbose, weight_file, meta_file):
    # set resume
    resume = weight_file is not None

    # load model from json spec
    policy = CNNPolicy.load_model(metadata["model_file"])
    model_features = policy.preprocessor.get_feature_list()
    model = policy.model
    # load weights
    if resume:
        model.load_weights(
            os.path.join(out_directory, FOLDER_WEIGHT, weight_file))

    # features of training data
    dataset = h5.File(metadata["training_data"])

    # Verify that dataset's features match the model's expected features.
    validate_feature_planes(verbose, dataset, model_features)

    # create metadata file and the callback object that will write to it
    # and save the model at the same time
    # the MetadataWriterCallback only sets 'epoch', 'best_epoch' and 'current_batch'.
    # We can add in anything else we like here
    meta_writer = EpochDataSaverCallback(meta_file, out_directory, metadata)

    # get train/validation/test indices
    train_indices, val_indices, test_indices \
        = load_train_val_test_indices(verbose, metadata['symmetries'], len(dataset["states"]),
                                      metadata["batch_size"], out_directory)

    # create dataset generators
    train_data_generator = threading_shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], train_indices,
        metadata["batch_size"], metadata)
    val_data_generator = threading_shuffled_hdf5_batch_generator(
        dataset["states"],
        dataset["actions"],
        val_indices,
        metadata["batch_size"],
        validation=True)

    # check if step decay has to be applied
    if metadata["decay_every"] is None:
        # use normal decay without momentum
        lr_scheduler_callback = LrDecayCallback(metadata)
    else:
        # use step decay
        lr_scheduler_callback = LrStepDecayCallback(metadata, verbose)

    sgd = SGD(lr=metadata["learning_rate"])
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=["accuracy"])

    if verbose:
        print("STARTING TRAINING")

    # check that remaining epochs > 0
    if metadata["epochs"] <= len(metadata["epoch_logs"]):
        raise ValueError("No more epochs to train!")

    model.fit_generator(
        generator=train_data_generator,
        steps_per_epoch=(metadata["epoch_length"] // metadata["batch_size"]),
        epochs=(metadata["epochs"] - len(metadata["epoch_logs"])),
        callbacks=[meta_writer, lr_scheduler_callback],
        validation_data=val_data_generator,
        validation_steps=(len(val_indices) // metadata["batch_size"]))
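
train() pulls its configuration from the metadata dict. Judging by the keys accessed above, it expects at least the following fields; the values below are purely illustrative, not defaults from the project:

metadata = {
    "model_file": "policy_model.json",     # passed to CNNPolicy.load_model()
    "training_data": "training_data.h5",   # HDF5 dataset of states/actions
    "batch_size": 16,
    "symmetries": ["noop", "rot90", "rot180", "rot270"],
    "learning_rate": 0.03,
    "decay_every": None,                   # None -> LrDecayCallback, int -> LrStepDecayCallback
    "epoch_length": 160000,                # samples per epoch
    "epochs": 10,                          # total epochs to reach
    "epoch_logs": [],                      # one entry per epoch already completed
}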
def run_training(cmd_line_args=None):
	parser = argparse.ArgumentParser(description='Perform reinforcement learning to improve given policy network. Second phase of pipeline.')
	parser.add_argument("model_json", help="Path to policy model JSON.")
	parser.add_argument("initial_weights", help="Path to HDF5 file with inital weights (i.e. result of supervised training).")
	parser.add_argument("out_directory", help="Path to folder where the model params and metadata will be saved after each epoch.")
	parser.add_argument("--learning-rate", help="Keras learning rate (Default: .03)", type=float, default=.03)
	parser.add_argument("--policy-temp", help="Distribution temperature of players using policies (Default: 0.67)", type=float, default=0.67)
	parser.add_argument("--save-every", help="Save policy as a new opponent every n batches (Default: 500)", type=int, default=500)
	parser.add_argument("--game-batch", help="Number of games per mini-batch (Default: 20)", type=int, default=20)
	parser.add_argument("--iterations", help="Number of training batches/iterations (Default: 10000)", type=int, default=10000)
	parser.add_argument("--resume", help="Load latest weights in out_directory and resume", default=False, action="store_true")
	parser.add_argument("--verbose", "-v", help="Turn on verbose mode", default=False, action="store_true")
	# Baseline function (TODO) default lambda state: 0  (receives either file
	# paths to JSON and weights or None, in which case it uses default baseline 0)
	if cmd_line_args is None:
		args = parser.parse_args()
	else:
		args = parser.parse_args(cmd_line_args)

	ZEROTH_FILE = "weights.00000.hdf5"

	if args.resume:
		if not os.path.exists(os.path.join(args.out_directory, "metadata.json")):
			raise ValueError("Cannot resume without existing output directory")

	if not os.path.exists(args.out_directory):
		if args.verbose:
			print ("creating output directory {}".format(args.out_directory))
		os.makedirs(args.out_directory)

	if not args.resume:
		# make a copy of weights file, "weights.00000.hdf5" in the output directory
		copyfile(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE))
		if args.verbose:
			print ("copied {} to {}".format(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE)))
		player_weights = ZEROTH_FILE
	else:
		# if resuming, we expect initial_weights to be just a "weights.#####.hdf5" file, not a full path
		args.initial_weights = os.path.join(args.out_directory, os.path.basename(args.initial_weights))
		if not os.path.exists(args.initial_weights):
			raise ValueError("Cannot resume; weights {} do not exist".format(args.initial_weights))
		elif args.verbose:
			print ("Resuming with weights {}".format(args.initial_weights))
		player_weights = os.path.basename(args.initial_weights)

	# Set initial conditions
	policy = CNNPolicy.load_model(args.model_json)
	policy.model.load_weights(args.initial_weights)
	player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp)
	features = policy.preprocessor.feature_list

	# different opponents come from simply changing the weights of
	# opponent.policy.model "behind the scenes"
	opp_policy = CNNPolicy.load_model(args.model_json)
	opponent = ProbabilisticPolicyPlayer(opp_policy, temperature=args.policy_temp)

	if args.verbose:
		print ("created player and opponent with temperature {}".format(args.policy_temp))

	if not args.resume:
		metadata = {
			"model_file": args.model_json,
			"init_weights": args.initial_weights,
			"learning_rate": args.learning_rate,
			"temperature": args.policy_temp,
			"game_batch": args.game_batch,
			"opponents": [ZEROTH_FILE],  # which weights from which to sample an opponent each batch
			"win_ratio": {}  # map from player to tuple of (opponent, win ratio) Useful for validating in lieu of 'accuracy/loss'
		}
	else:
		with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
			metadata = json.load(f)

	def save_metadata():
		with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
			json.dump(metadata, f)

	# Set SGD and compile
	sgd = SGD(lr=args.learning_rate)
	player.policy.model.compile(loss='binary_crossentropy', optimizer=sgd)
	board_size = player.policy.model.input_shape[-1]
	for i_iter in range(1, args.iterations + 1):
		# Train mini-batches by randomly choosing opponent from pool (possibly self)
		# and playing game_batch games against them
		opp_weights = np.random.choice(metadata["opponents"])
		opp_path = os.path.join(args.out_directory, opp_weights)
		# load new weights into opponent, but otherwise it's the same
		opponent.policy.model.load_weights(opp_path)
		if args.verbose:
			print ("Batch {}\tsampled opponent is {}".format(i_iter, opp_weights))
		# Make training pairs and do RL
		X_list, y_list, winners = make_training_pairs(player, opponent, features, args.game_batch, board_size)
		win_ratio = np.sum(np.array(winners) == 1) / float(args.game_batch)
		metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)
		train_batch(player, X_list, y_list, winners, args.learning_rate)
		# Save intermediate models
		player_weights = "weights.%05d.hdf5" % i_iter
		player.policy.model.save_weights(os.path.join(args.out_directory, player_weights))
		# add player to the pool of opponents once in a while
		if i_iter % args.save_every == 0:
			metadata["opponents"].append(player_weights)
		save_metadata()
Example No. 12
def run_training(cmd_line_args=None):
    parser = argparse.ArgumentParser(
        description=
        'Perform reinforcement learning to improve given policy network. Second phase of pipeline.'
    )
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument(
        "initial_weights",
        help=
        "Path to HDF5 file with inital weights (i.e. result of supervised training)."
    )
    parser.add_argument(
        "out_directory",
        help=
        "Path to folder where the model params and metadata will be saved after each epoch."
    )
    parser.add_argument("--learning-rate",
                        help="Keras learning rate (Default: .03)",
                        type=float,
                        default=.03)
    parser.add_argument(
        "--policy-temp",
        help=
        "Distribution temperature of players using policies (Default: 0.67)",
        type=float,
        default=0.67)
    parser.add_argument(
        "--save-every",
        help="Save policy as a new opponent every n batches (Default: 500)",
        type=int,
        default=500)
    parser.add_argument("--game-batch",
                        help="Number of games per mini-batch (Default: 20)",
                        type=int,
                        default=20)
    parser.add_argument(
        "--iterations",
        help="Number of training batches/iterations (Default: 10000)",
        type=int,
        default=10000)
    parser.add_argument("--resume",
                        help="Load latest weights in out_directory and resume",
                        default=False,
                        action="store_true")
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")
    # Baseline function (TODO) default lambda state: 0  (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory,
                                           "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of weights file, "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights,
                 os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(
                args.initial_weights,
                os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
    else:
        # if resuming, we expect initial_weights to be just a "weights.#####.hdf5" file, not a full path
        args.initial_weights = os.path.join(
            args.out_directory, os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp)
    features = policy.preprocessor.feature_list

    # different opponents come from simply changing the weights of
    # opponent.policy.model "behind the scenes"
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy,
                                         temperature=args.policy_temp)

    if args.verbose:
        print("created player and opponent with temperature {}".format(
            args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            "opponents":
            [ZEROTH_FILE
             ],  # which weights from which to sample an opponent each batch
            "win_ratio": {
            }  # map from player to tuple of (opponent, win ratio) Useful for validating in lieu of 'accuracy/loss'
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f)

    # Set SGD and compile
    sgd = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss='binary_crossentropy', optimizer=sgd)
    board_size = player.policy.model.input_shape[-1]
    for i_iter in range(1, args.iterations + 1):
        # Train mini-batches by randomly choosing opponent from pool (possibly self)
        # and playing game_batch games against them
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)
        # load new weights into opponent, but otherwise it's the same
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(
                i_iter, opp_weights))
        # Make training pairs and do RL
        X_list, y_list, winners = make_training_pairs(player, opponent,
                                                      features,
                                                      args.game_batch,
                                                      board_size)
        win_ratio = np.sum(np.array(winners) == 1) / float(args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)
        train_batch(player, X_list, y_list, winners, args.learning_rate)
        # Save intermediate models
        player_weights = "weights.%05d.hdf5" % i_iter
        player.policy.model.save_weights(
            os.path.join(args.out_directory, player_weights))
        # add player to the pool of opponents once in a while
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
def handle_arguments(cmd_line_args=None):
    """Run generate data. command-line args may be passed in as a list
    """

    import argparse
    parser = argparse.ArgumentParser(
        description='Play games used for training the '
        'value network (third phase of the pipeline). '
        'The final policy from the RL phase plays '
        'against itself, and training pairs for the value '
        'network are generated from the outcome of each '
        'game, following an off-policy, uniform-random move.')
    # required arguments
    parser.add_argument(
        "SL_weights_path",
        help="Path to file with supervised learning policy weights."
    )  # noqa: E501
    parser.add_argument(
        "RL_weights_path",
        help="Path to file with reinforcement learning policy weights."
    )  # noqa: E501
    parser.add_argument("model_path",
                        help="Path to network architecture file.")
    # optional arguments
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")  # noqa: E501
    parser.add_argument(
        "--outfile",
        "-o",
        help="Destination to write data (hdf5 file) Default: " +
        DEFAULT_FILE_NAME,
        default=DEFAULT_FILE_NAME)  # noqa: E501
    parser.add_argument(
        "--sgf-path",
        help="If set all sgf will be saved here. Default: None",
        default=None)  # noqa: E501
    parser.add_argument(
        "--n-training-pairs",
        help="Number of training pairs to generate. Default: " +
        str(DEFAULT_N_TRAINING_PAIRS),
        type=int,
        default=DEFAULT_N_TRAINING_PAIRS)  # noqa: E501
    parser.add_argument("--batch-size",
                        help="Number of games to run in parallel. Default: " +
                        str(DEFAULT_BATCH_SIZE),
                        type=int,
                        default=DEFAULT_BATCH_SIZE)  # noqa: E501
    parser.add_argument(
        "--features",
        "-f",
        help=
        "Comma-separated list of features to compute and store or 'all'. Default: all",
        default='all')  # noqa: E501
    parser.add_argument(
        "--sl-temperature",
        help="Distribution temperature of players using SL policies. Default: "
        + str(DEFAULT_TEMPERATURE_SL),
        type=float,
        default=DEFAULT_TEMPERATURE_SL)  # noqa: E501
    parser.add_argument(
        "--rl-temperature",
        help="Distribution temperature of players using RL policies. Default: "
        + str(DEFAULT_TEMPERATURE_RL),
        type=float,
        default=DEFAULT_TEMPERATURE_RL)  # noqa: E501

    # show help or parse arguments
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # list with features used for value network
    # features = policy_SL.preprocessor.feature_list
    if args.features.lower() == 'all':
        features = [
            "board", "ones", "turns_since", "liberties", "capture_size",
            "self_atari_size", "liberties_after", "ladder_capture",
            "ladder_escape", "sensibleness", "color"
        ]
    else:
        features = args.features.split(",")

    # always add color feature
    if "color" not in features:
        features.append("color")

    # Load SL architecture and weights from file
    policy_SL = CNNPolicy.load_model(args.model_path)
    policy_SL.model.load_weights(args.SL_weights_path)
    # create SL player
    player_SL = ProbabilisticPolicyPlayer(policy_SL,
                                          temperature=args.sl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # Load RL architecture and weights from file
    policy_RL = CNNPolicy.load_model(args.model_path)
    policy_RL.model.load_weights(args.RL_weights_path)
    # Create RL player
    # TODO is it better to use greedy player?
    player_RL = ProbabilisticPolicyPlayer(policy_RL,
                                          temperature=args.rl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # check if folder exists
    if args.sgf_path is not None and not os.path.exists(args.sgf_path):
        os.makedirs(args.sgf_path)

    # generate data
    generate_data(player_RL, player_SL, args.outfile, args.n_training_pairs,
                  args.batch_size, policy_SL.model.input_shape[-1], features,
                  args.verbose, args.sgf_path)
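
As with the other entry points, handle_arguments can be called with an argument list instead of reading sys.argv; a sketch with illustrative file names:

handle_arguments([
    "sl_policy_weights.hdf5",   # SL_weights_path
    "rl_policy_weights.hdf5",   # RL_weights_path
    "policy_model.json",        # model_path (network architecture)
    "--outfile", "value_training_pairs.h5",
    "--n-training-pairs", "1000",
    "--batch-size", "16",
    "--verbose",
])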
Example No. 14
def run_training(cmd_line_args=None):
    import argparse
    parser = argparse.ArgumentParser(
        description=
        'Perform reinforcement learning to improve given policy network. Second phase of pipeline.'
    )  # noqa: E501
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument(
        "initial_weights",
        help=
        "Path to HDF5 file with inital weights (i.e. result of supervised training)."
    )  # noqa: E501
    parser.add_argument(
        "out_directory",
        help=
        "Path to folder where the model params and metadata will be saved after each epoch."
    )  # noqa: E501
    parser.add_argument("--learning-rate",
                        help="Keras learning rate (Default: 0.001)",
                        type=float,
                        default=0.001)  # noqa: E501
    parser.add_argument(
        "--policy-temp",
        help=
        "Distribution temperature of players using policies (Default: 0.67)",
        type=float,
        default=0.67)  # noqa: E501
    parser.add_argument(
        "--save-every",
        help="Save policy as a new opponent every n batches (Default: 500)",
        type=int,
        default=500)  # noqa: E501
    parser.add_argument(
        "--record-every",
        help="Save learner's weights every n batches (Default: 1)",
        type=int,
        default=1)  # noqa: E501
    parser.add_argument("--game-batch",
                        help="Number of games per mini-batch (Default: 20)",
                        type=int,
                        default=20)  # noqa: E501
    parser.add_argument("--move-limit",
                        help="Maximum number of moves per game",
                        type=int,
                        default=500)  # noqa: E501
    parser.add_argument(
        "--iterations",
        help="Number of training batches/iterations (Default: 10000)",
        type=int,
        default=10000)  # noqa: E501
    parser.add_argument("--resume",
                        help="Load latest weights in out_directory and resume",
                        default=False,
                        action="store_true")  # noqa: E501
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")  # noqa: E501
    # Baseline function (TODO) default lambda state: 0  (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory,
                                           "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of weights file, "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights,
                 os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(
                args.initial_weights,
                os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
        iter_start = 1
    else:
        # if resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path
        if not re.match(r"weights\.\d{5}\.hdf5", args.initial_weights):
            raise ValueError(
                "Expected to resume from weights file with name 'weights.#####.hdf5'"
            )
        args.initial_weights = os.path.join(
            args.out_directory, os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)
        iter_start = 1 + int(player_weights[8:13])

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy,
                                       temperature=args.policy_temp,
                                       move_limit=args.move_limit)

    # different opponents come from simply changing the weights of 'opponent.policy.model'. That
    # is, only 'opp_policy' needs to be changed, and 'opponent' will change.
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy,
                                         temperature=args.policy_temp,
                                         move_limit=args.move_limit)

    if args.verbose:
        print("created player and opponent with temperature {}".format(
            args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            "opponents":
            [ZEROTH_FILE
             ],  # which weights from which to sample an opponent each batch
            "win_ratio":
            {}  # map from player to tuple of (opponent, win ratio) Useful for
            # validating in lieu of 'accuracy/loss'
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    # Append args of current run to history of full command args.
    metadata["cmd_line_args"] = metadata.get("cmd_line_args", [])
    metadata["cmd_line_args"].append(vars(args))

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f, sort_keys=True, indent=2)

    optimizer = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss=log_loss, optimizer=optimizer)
    for i_iter in range(iter_start, args.iterations + 1):
        # Note that player_weights will only be saved as a file every args.record_every iterations.
        # Regardless, player_weights enters into the metadata to keep track of the win ratio over
        # time.
        player_weights = "weights.%05d.hdf5" % i_iter

        # Randomly choose an opponent from the pool (possibly self) and play
        # game_batch games against it.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)

        # Load new weights into opponent's network, but keep the same opponent object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(
                i_iter, opp_weights))

        # Run games (and learn from results). Keep track of the win ratio vs each opponent over
        # time.
        win_ratio = run_n_games(optimizer, args.learning_rate, player,
                                opponent, args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)

        # Save intermediate models.
        if i_iter % args.record_every == 0:
            player.policy.model.save_weights(
                os.path.join(args.out_directory, player_weights))

        # Add player to the pool of opponents once in a while.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
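
Starting a fresh RL run and later resuming it with this entry point would look roughly like the sketch below (paths illustrative). Note that with --resume, initial_weights must be a bare 'weights.#####.hdf5' file name that already exists in out_directory.

# fresh run: the supervised weights are copied in as weights.00000.hdf5
run_training(["policy_model.json", "sl_weights.hdf5", "rl_output",
              "--game-batch", "20", "--iterations", "10000", "--verbose"])

# resume from a checkpoint previously written to rl_output/
run_training(["policy_model.json", "weights.00500.hdf5", "rl_output",
              "--resume", "--verbose"])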
def run_training(cmd_line_args=None):
    """Run training. command-line args may be passed in as a list
	"""
    import argparse
    parser = argparse.ArgumentParser(
        description='Perform supervised training on a policy network.')
    # required args
    parser.add_argument(
        "model",
        help="Path to a JSON model file (i.e. from CNNPolicy.save_model())")
    parser.add_argument("train_data", help="A .h5 file of training data")
    parser.add_argument(
        "out_directory",
        help="directory where metadata and weights will be saved")
    # frequently used args
    parser.add_argument("--minibatch",
                        "-B",
                        help="Size of training data minibatches. Default: 16",
                        type=int,
                        default=16)
    parser.add_argument(
        "--epochs",
        "-E",
        help="Total number of iterations on the data. Default: 10",
        type=int,
        default=10)
    parser.add_argument(
        "--epoch-length",
        "-l",
        help=
        "Number of training examples considered 'one epoch'. Default: # training data",
        type=int,
        default=None)
    parser.add_argument(
        "--learning-rate",
        "-r",
        help=
        "Learning rate - how quickly the model learns at first. Default: .03",
        type=float,
        default=.03)
    parser.add_argument(
        "--decay",
        "-d",
        help="The rate at which learning decreases. Default: .0001",
        type=float,
        default=.0001)
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")
    # slightly fancier args
    parser.add_argument(
        "--weights",
        help=
        "Name of a .h5 weights file (in the output directory) to load to resume training",
        default=None)
    parser.add_argument(
        "--train-val-test",
        help=
        "Fraction of data to use for training/val/test. Must sum to 1. Invalid if restarting training",
        nargs=3,
        type=float,
        default=[0.93, .05, .02])
    parser.add_argument(
        "--symmetries",
        help=
        "Comma-separated list of transforms, subset of noop,rot90,rot180,rot270,fliplr,flipud,diag1,diag2",
        default='noop,rot90,rot180,rot270,fliplr,flipud,diag1,diag2')
    # TODO - an argument to specify which transformations to use, put it in metadata

    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # TODO - what follows here should be refactored into a series of small functions

    resume = args.weights is not None

    if args.verbose:
        if resume:
            print("trying to resume from %s with weights %s" %
                  (args.out_directory,
                   os.path.join(args.out_directory, args.weights)))
        else:
            if os.path.exists(args.out_directory):
                print(
                    "directory %s exists. any previous data will be overwritten"
                    % args.out_directory)
            else:
                print("starting fresh output directory %s" %
                      args.out_directory)

    # load model from json spec
    model = CNNPolicy.load_model(args.model).model
    if resume:
        model.load_weights(os.path.join(args.out_directory, args.weights))

    # TODO - (waiting on game_converter) verify that features of model match features of training data
    dataset = h5.File(args.train_data)
    n_total_data = len(dataset["states"])
    n_train_data = int(args.train_val_test[0] * n_total_data)
    n_val_data = int(args.train_val_test[1] * n_total_data)
    # n_test_data = n_total_data - (n_train_data + n_val_data)

    if args.verbose:
        print("datset loaded")
        print("\t%d total samples" % n_total_data)
        print("\t%d training samples" % n_train_data)
        print("\t%d validaion samples" % n_val_data)

    # ensure output directory is available
    if not os.path.exists(args.out_directory):
        os.makedirs(args.out_directory)

    # create metadata file and the callback object that will write to it
    meta_file = os.path.join(args.out_directory, "metadata.json")
    meta_writer = MetadataWriterCallback(meta_file)
    # load prior data if it already exists
    if os.path.exists(meta_file) and resume:
        with open(meta_file, "r") as f:
            meta_writer.metadata = json.load(f)
        if args.verbose:
            print(
                "previous metadata loaded: %d epochs. new epochs will be appended."
                % len(meta_writer.metadata["epochs"]))
    elif args.verbose:
        print("starting with empty metadata")
    # the MetadataWriterCallback only sets 'epoch' and 'best_epoch'. We can add in anything else we like here
    # TODO - model and train_data are saved in meta_file; check that they match (and make args optional when restarting?)
    meta_writer.metadata["training_data"] = args.train_data
    meta_writer.metadata["model_file"] = args.model

    # create ModelCheckpoint to save weights every epoch
    checkpoint_template = os.path.join(args.out_directory,
                                       "weights.{epoch:05d}.hdf5")
    checkpointer = ModelCheckpoint(checkpoint_template)

    # load precomputed random-shuffle indices or create them
    # TODO - save each train/val/test indices separately so there's no danger of
    # changing args.train_val_test when resuming
    shuffle_file = os.path.join(args.out_directory, "shuffle.npz")
    if os.path.exists(shuffle_file) and resume:
        with open(shuffle_file, "r") as f:
            shuffle_indices = np.load(f)
        if args.verbose:
            print("loading previous data shuffling indices")
    else:
        # create shuffled indices
        shuffle_indices = np.random.permutation(n_total_data)
        with open(shuffle_file, "w") as f:
            np.save(f, shuffle_indices)
        if args.verbose:
            print("created new data shuffling indices")
    # training indices are the first consecutive set of shuffled indices, val next, then test gets the remainder
    train_indices = shuffle_indices[0:n_train_data]
    val_indices = shuffle_indices[n_train_data:n_train_data + n_val_data]
    # test_indices = shuffle_indices[n_train_data + n_val_data:]

    symmetries = [
        BOARD_TRANSFORMATIONS[name]
        for name in args.symmetries.strip().split(",")
    ]

    # create dataset generators
    train_data_generator = shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], train_indices, args.minibatch,
        symmetries)
    val_data_generator = shuffled_hdf5_batch_generator(dataset["states"],
                                                       dataset["actions"],
                                                       val_indices,
                                                       args.minibatch,
                                                       symmetries)

    sgd = SGD(lr=args.learning_rate, decay=args.decay)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=["accuracy"])

    samples_per_epoch = args.epoch_length or n_train_data

    if args.verbose:
        print("STARTING TRAINING")

    model.fit_generator(generator=train_data_generator,
                        samples_per_epoch=samples_per_epoch,
                        nb_epoch=args.epochs,
                        callbacks=[checkpointer, meta_writer],
                        validation_data=val_data_generator,
                        nb_val_samples=n_val_data)
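
The --symmetries names are looked up in a BOARD_TRANSFORMATIONS mapping defined alongside the training script, which is not shown in these examples. A sketch of such a table over numpy feature planes, as an assumption consistent with the names accepted above:

import numpy as np

BOARD_TRANSFORMATIONS = {
    "noop": lambda feature: feature,
    "rot90": lambda feature: np.rot90(feature, 1),
    "rot180": lambda feature: np.rot90(feature, 2),
    "rot270": lambda feature: np.rot90(feature, 3),
    "fliplr": lambda feature: np.fliplr(feature),
    "flipud": lambda feature: np.flipud(feature),
    "diag1": lambda feature: np.transpose(feature),
    "diag2": lambda feature: np.fliplr(np.rot90(feature, 1)),
}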