def main(argv):
    del argv

    # Create the game to use, and a loss calculator for it
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    loss_calculator = exploitability_descent.LossCalculator(game)

    # Build the network
    num_hidden = FLAGS.num_hidden
    num_layers = FLAGS.num_layers
    regularizer = tf.keras.regularizers.l2(l=FLAGS.regularizer_scale)
    layer = tf.constant(loss_calculator.tabular_policy.state_in, tf.float64)
    for _ in range(num_layers):
        layer = tf.layers.dense(layer,
                                num_hidden,
                                activation=tf.nn.relu,
                                kernel_regularizer=regularizer)
    layer = tf.layers.dense(layer,
                            game.num_distinct_actions(),
                            kernel_regularizer=regularizer)
    tabular_policy = loss_calculator.masked_softmax(layer)

    # Build the loss - exploitability descent loss plus regularizer loss
    nash_conv, loss = loss_calculator.loss(tabular_policy)
    loss += tf.losses.get_regularization_loss()

    # Use a simple gradient descent optimizer
    learning_rate = tf.placeholder(tf.float64, (), name="learning_rate")
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer_step = optimizer.minimize(loss)

    # Training loop
    with tf.train.MonitoredTrainingSession() as sess:
        for step in range(FLAGS.num_steps):
            t0 = time.time()
            nash_conv_value, _ = sess.run(
                [nash_conv, optimizer_step],
                feed_dict={learning_rate: FLAGS.init_lr / np.sqrt(1 + step)})
            t1 = time.time()
            # Optionally log our progress
            if step % FLAGS.print_freq == 0:
                logging.info("step=%d nash_conv=%g time per step=%.4f", step,
                             nash_conv_value, t1 - t0)
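
The example above assumes a script preamble that is not shown. Below is a minimal sketch, under TF1-style assumptions, of the imports and absl flag definitions it appears to rely on; the flag defaults are illustrative guesses, not values taken from the original.

# Minimal sketch of the assumed preamble; flag defaults are illustrative only.
import time

from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow.compat.v1 as tf

from open_spiel.python.algorithms import exploitability_descent
import pyspiel

tf.disable_v2_behavior()  # The code uses TF1 graph-mode APIs (tf.placeholder, tf.layers).

FLAGS = flags.FLAGS
flags.DEFINE_string("game_name", "kuhn_poker", "Game to solve (default is a guess).")
flags.DEFINE_integer("num_steps", 10000, "Number of optimization steps.")
flags.DEFINE_integer("print_freq", 100, "How often to log NashConv.")
flags.DEFINE_float("init_lr", 0.1, "Initial learning rate, decayed as 1/sqrt(step).")
flags.DEFINE_float("regularizer_scale", 0.001, "Scale of the L2 weight regularizer.")
flags.DEFINE_integer("num_hidden", 64, "Hidden units per layer.")
flags.DEFINE_integer("num_layers", 1, "Number of hidden layers.")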
Example #2
def main(argv):
    del argv

    # Create the game to use, and a loss calculator for it
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    loss_calculator = exploitability_descent.LossCalculator(game)

    # Build the network
    num_hidden = FLAGS.num_hidden
    num_layers = FLAGS.num_layers
    regularizers = {
        "w": tf.contrib.layers.l2_regularizer(scale=FLAGS.regularizer_scale)
    }
    x = tf.constant(loss_calculator.tabular_policy.state_in, tf.float64)
    for _ in range(num_layers):
        layer = snt.Linear(num_hidden, regularizers=regularizers)
        x = tf.nn.relu(layer(x))
    layer = snt.Linear(game.num_distinct_actions(), regularizers=regularizers)
    tabular_policy = loss_calculator.masked_softmax(layer(x))

    # Build the loss - exploitability descent loss plus regularizer loss
    nash_conv, loss = loss_calculator.loss(tabular_policy)
    graph_regularizers = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss += tf.reduce_sum(graph_regularizers)

    # Use a simple gradient descent optimizer
    learning_rate = tf.placeholder(tf.float64, (), name="learning_rate")
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer_step = optimizer.minimize(loss)

    # Training loop
    with tf.train.MonitoredTrainingSession() as sess:
        for step in range(FLAGS.num_steps):
            t0 = time.time()
            nash_conv_value, _ = sess.run(
                [nash_conv, optimizer_step],
                feed_dict={learning_rate: FLAGS.init_lr / np.sqrt(1 + step)})
            t1 = time.time()
            # Optionally log our progress
            if step % FLAGS.print_freq == 0:
                logging.info("step=%d nash_conv=%g time per step=%.4f", step,
                             nash_conv_value, t1 - t0)
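
This second variant builds the same network with Sonnet linear modules and sums the graph's REGULARIZATION_LOSSES collection instead of calling tf.losses.get_regularization_loss(). Since it relies on tf.contrib (removed in TensorFlow 2.x), it presumably needs TensorFlow 1.x and a Sonnet 1.x release; a sketch of the extra imports, with the version constraints as assumptions:

# Extra dependencies assumed by the Sonnet variant; version pins are assumptions.
import sonnet as snt     # dm-sonnet 1.x exposes snt.Linear(output_size, regularizers={"w": ...})
import tensorflow as tf  # TensorFlow 1.x is needed for tf.contrib.layers.l2_regularizer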
Example #3
def main(argv):
    del argv

    N_epochs = FLAGS.N_epochs
    ppo_eps = FLAGS.ppo_eps
    min_policy_eps = FLAGS.min_policy_eps

    tf.set_random_seed(FLAGS.seed)

    # Create the game to use, and a loss calculator for it
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    loss_calculator = exploitability_descent.LossCalculator(
        game, FLAGS.actorcritic_method)

    # Build the network
    num_hidden = FLAGS.num_hidden
    num_layers = FLAGS.num_layers
    regularizer = tf.keras.regularizers.l2(l=FLAGS.regularizer_scale)
    layer = tf.constant(loss_calculator.tabular_policy.state_in, tf.float64)
    for _ in range(num_layers):
        layer = tf.layers.dense(layer,
                                num_hidden,
                                activation=tf.nn.relu,
                                kernel_regularizer=regularizer)
    layer = tf.layers.dense(layer,
                            game.num_distinct_actions(),
                            kernel_regularizer=regularizer)
    tabular_policy = loss_calculator.masked_softmax(layer)

    # Snapshot of the current policy, clipped away from zero and renormalized.
    # Wrapping it in a tf.Variable would freeze it at its initial value, so it
    # is computed directly from tabular_policy and evaluated with sess.run
    # before each PPO epoch loop, then fed back in via old_tabular_policy.
    old_tp = tf.nn.softmax(tf.clip_by_value(tabular_policy, min_policy_eps, 1.0),
                           axis=1)
    old_tabular_policy = tf.placeholder(tf.float64,
                                        tabular_policy.shape,
                                        name="old_tabular_policy")

    # Build the loss - exploitability descent loss plus regularizer loss
    nash_conv, loss = loss_calculator.loss(tabular_policy, old_tabular_policy,
                                           ppo_eps, min_policy_eps)

    loss += tf.losses.get_regularization_loss()

    # Use a simple gradient descent optimizer
    learning_rate = tf.placeholder(tf.float64, (), name="learning_rate")
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer_step = optimizer.minimize(loss)

    # Training loop
    nash_conv_values = []
    steps = []
    with tf.train.MonitoredTrainingSession() as sess:
        for step in range(FLAGS.num_steps):
            t0 = time.time()
            if FLAGS.actorcritic_method == 'PPO':
                old_tp_for_epoch = sess.run(old_tp)
                for _ in range(N_epochs):
                    nash_conv_value, _ = sess.run(
                        [nash_conv, optimizer_step],
                        feed_dict={
                            learning_rate: FLAGS.init_lr / np.sqrt(1 + step),
                            old_tabular_policy: old_tp_for_epoch,
                        })
            else:
                nash_conv_value, _ = sess.run(
                    [nash_conv, optimizer_step],
                    feed_dict={learning_rate: FLAGS.init_lr / np.sqrt(1 + step)})
            t1 = time.time()
            # Optionally log our progress
            if step % FLAGS.print_freq == 0:
                logging.info("step=%d nash_conv=%g time per step=%.4f", step,
                             nash_conv_value, t1 - t0)
                steps.append(step)
                nash_conv_values.append(nash_conv_value)

        # Save the logged results.
        if FLAGS.actorcritic_method == 'PPO':
            args_list = [
                str(FLAGS.game_name),
                str(FLAGS.num_steps),
                str(FLAGS.print_freq),
                str(FLAGS.init_lr),
                str(FLAGS.regularizer_scale),
                str(FLAGS.num_hidden),
                str(FLAGS.num_layers),
                str(FLAGS.actorcritic_method),
                str(FLAGS.N_epochs),
                str(FLAGS.ppo_eps),
                str(FLAGS.min_policy_eps)
            ]
        else:
            args_list = [
                str(FLAGS.game_name),
                str(FLAGS.num_steps),
                str(FLAGS.print_freq),
                str(FLAGS.init_lr),
                str(FLAGS.regularizer_scale),
                str(FLAGS.num_hidden),
                str(FLAGS.num_layers),
                str(FLAGS.actorcritic_method)
            ]
        output_folder = os.path.join(FLAGS.results_folder, '_'.join(args_list))
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        output_filename = 'seed' + str(FLAGS.seed) + '.npz'
        np.savez(os.path.join(output_folder, output_filename),
                 nash_conv_values=np.array(nash_conv_values),
                 steps=np.array(steps))
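
The PPO variant adds several flags, passes the method name to LossCalculator (a modified LossCalculator signature compared with the first two examples), and writes the logged (step, NashConv) pairs to an .npz file. A sketch of the additional flag definitions and the entry point it presumably uses; the flag names come from the code above, while the defaults are illustrative guesses.

# Extra flags for the PPO variant; defaults are illustrative assumptions.
import os  # used for the results path

flags.DEFINE_string("actorcritic_method", "PPO", "Loss variant passed to LossCalculator.")
flags.DEFINE_integer("N_epochs", 4, "Inner PPO epochs per outer step.")
flags.DEFINE_float("ppo_eps", 0.2, "PPO ratio-clipping parameter.")
flags.DEFINE_float("min_policy_eps", 1e-6, "Lower clip applied to the old policy.")
flags.DEFINE_integer("seed", 0, "TensorFlow random seed.")
flags.DEFINE_string("results_folder", "results", "Directory for the saved .npz results.")

if __name__ == "__main__":
    app.run(main)

A hypothetical invocation (script name assumed): python exploitability_descent_ppo.py --game_name=kuhn_poker --actorcritic_method=PPO --seed=0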