Example #1
def test_SimpleTeamSoccerGame():
    game = SimpleTeamSoccerGame.SimpleTeamSoccerGame()
    agents = [SimpleTeamSoccerGame.DumbAgent()] * 4
    result = GameLoop.play_game(game, agents)
    print "result =\n{}\n".format(result)
    Replayer.show(result)
Example #2
def train(game_type, agent_type, annealer=None):
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    agent = agent_type(sess)
    agents = [agent] * game_type.get_num_agents()

    # hyperparameters
    gamma = 0.98
    gae_lambda = 0.95
    learning_rate = 0.0003
    #learning_rate = 0.03
    #learning_rate = 0.001
    minibatch_size = 32  # experiences in each training batch
    #minibatch_size = 1000  # experiences in each training batch
    #value_loss_coef = 0.3
    value_loss_coef = 1.0
    hyper_string = "a2c_lr{}_vc{}_mb{}".format(learning_rate, value_loss_coef, minibatch_size)

    # set up the training operations in tensorflow
    # train policy by gradient descent on -[(log prob chosen action) * (advantage)]
    log_p_chosen_action_op = agent.get_log_p_chosen_action_op()
    advantage_ph = tf.placeholder(tf.float32, shape=[None], name='advantage')  # shape: (batch size,)
    policy_loss = -tf.reduce_sum(advantage_ph * log_p_chosen_action_op)
    # train value function by gradient descent on [(value est) - (cum future reward)] ** 2
    reward_ph = tf.placeholder(tf.float32, shape=[None], name='reward')  # shape: (batch size,)
    value_op = agent.get_value_op()
    value_sq_err = tf.square(reward_ph - value_op)
    value_mse_sum = tf.summary.scalar("value_mse", tf.reduce_mean(value_sq_err))
    value_loss = tf.reduce_sum(value_sq_err)
    # train on a combined loss
    total_loss = policy_loss + value_loss_coef * value_loss
    learning_rate_ph = tf.placeholder(tf.float32)
    train_op = tf.train.AdamOptimizer(learning_rate_ph).minimize(total_loss)
    #train_op = tf.train.GradientDescentOptimizer(learning_rate_ph).minimize(total_loss)

    sess.run(tf.global_variables_initializer())

    exp_buf = []
    rew_buf = []
    adv_buf = []

    prr = PeriodicReplayWriter(game_type=game_type, agents=agents, period=50, outdir="/home/greg/coding/ML/rlgames/replays")

    merged_sum_op = tf.summary.merge_all()
    log_dir = os.path.join("/home/greg/coding/ML/rl/logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "_" + hyper_string)
    sum_wri = tf.summary.FileWriter(log_dir, graph=sess.graph, flush_secs=5)

    saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=5, keep_checkpoint_every_n_hours=0.5)
    ckpt_dir = os.path.join("/home/greg/coding/ML/rl", "checkpoints")
    frames_between_ckpts = 5000
    #train_frames = Checkpoint.optionally_restore_from_checkpoint(sess, saver, ckpt_dir)
    train_frames = 0
    last_ckpt_frame = train_frames

    print "WARNING: only training on 0th agent's experiences"

    ep_num = 0
    while True:
        if annealer:
            annealer.frame(train_frames)

        prr.maybe_write(ep_num)

        # Play a game and remember the experiences and rewards.
        # If there's more than one player, the same agent is used for all of them,
        # so the agent had better not be something with state like an RNN.
        # NOTE: per the warning above, only the 0th agent's episode is actually
        # used for training at the moment (see the [:1] slice below).
        game = game_type()
        result = GameLoop.play_game(game, agents)
        #print "len(result.episodes) = {}".format(len(result.episodes))
        for ep in result.episodes[:1]:
            exp_buf.extend(ep.experiences)
            ep_rewards = ep.compute_cum_discounted_future_rewards(gamma=gamma)
            ep_undisc_rewards = ep.compute_cum_discounted_future_rewards(gamma=1.0)
            ep_advs = ep.compute_generalized_advantage_ests(gamma, gae_lambda)
            rew_buf.extend(ep_rewards)
            adv_buf.extend(ep_advs)
            #print "ep_rewards =\n{}".format(ep_rewards)
            #print "ep_advs =\n{}".format(ep_advs)
            #print "exp_buf =\n{}".format(exp_buf)
            #print "rew_buf =\n{}".format(rew_buf)
            #print "adv_buf =\n{}".format(adv_buf)

            sum_wri.add_summary(make_summary("disc_rew", ep_rewards[0]), global_step=train_frames)
            sum_wri.add_summary(make_summary("undisc_rew", ep_undisc_rewards[0]), global_step=train_frames)
            sum_wri.add_summary(make_summary("init_value_est", ep.experiences[0].value_est), global_step=train_frames)
            sum_wri.add_summary(make_summary("init_value_mse", (ep.experiences[0].value_est - ep_rewards[0])**2), global_step=train_frames)
        sum_wri.add_summary(make_summary("game_length", len(result.episodes[0].experiences)), global_step=train_frames)
        sum_wri.add_summary(make_summary("total_undisc_rew", sum(sum(exp.reward for exp in ep.experiences) for ep in result.episodes)), global_step=train_frames)

        # train:
        while len(exp_buf) >= minibatch_size:
            # all this slicing is slow, but whatever
            batch_exps = exp_buf[:minibatch_size]
            batch_rews = rew_buf[:minibatch_size]
            batch_advs = adv_buf[:minibatch_size]
            #print "batch_exps =\n{}".format(batch_exps)
            #print "batch_rews =\n{}".format(batch_rews)
            #print "batch_advs =\n{}".format(batch_advs)
            # create a feed dict that will plug the state and chosen action
            # into the agent's network
            feed_dict = {
                reward_ph: batch_rews,
                advantage_ph: batch_advs,
                learning_rate_ph: learning_rate,
            }
            feed_dict.update(agent.make_train_feed_dict(batch_exps))
            exp_buf = exp_buf[minibatch_size:]  # discard the experiences we used
            rew_buf = rew_buf[minibatch_size:]
            adv_buf = adv_buf[minibatch_size:]
            # do a step of gradient descent
            #print "debug before train:"
            #agent.print_debug_info()
            #print "train feed dict:\n{}".format(feed_dict)
            [_, sums] = sess.run([train_op, merged_sum_op], feed_dict=feed_dict)
            train_frames += minibatch_size
            sum_wri.add_summary(sums, global_step=train_frames)
            #print "debug after train:"
            #agent.print_debug_info()

            if train_frames - last_ckpt_frame >= frames_between_ckpts:
                saver.save(sess, os.path.join(ckpt_dir, "model.ckpt"), global_step=train_frames)
                last_ckpt_frame = train_frames

        ep_num += 1
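
The A2C loss above relies on two episode helpers from the author's codebase, compute_cum_discounted_future_rewards and compute_generalized_advantage_ests, whose implementations are not part of this listing. A rough standalone sketch of the standard recurrences they appear to compute (hypothetical function names, plain NumPy, and the value after the final step assumed to be zero; the real methods may handle bootstrapping differently):

import numpy as np

def cum_discounted_future_rewards(rewards, gamma):
    # returns[t] = rewards[t] + gamma * returns[t + 1]
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def generalized_advantage_ests(rewards, value_ests, gamma, gae_lambda):
    # GAE(lambda): adv[t] = delta[t] + gamma * lambda * adv[t + 1],
    # where delta[t] = rewards[t] + gamma * V(s[t+1]) - V(s[t])
    advs = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        next_value = value_ests[t + 1] if t + 1 < len(value_ests) else 0.0
        delta = rewards[t] + gamma * next_value - value_ests[t]
        running = delta + gamma * gae_lambda * running
        advs[t] = running
    return advs

With gae_lambda = 1.0 this reduces to (discounted return minus value estimate), and with gae_lambda = 0.0 to the one-step TD error, which is why the training code treats it as the advantage fed into the policy loss.
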
Example #3
def test_ShootGame():
    game = ShootGame.ShootGame()
    agent = ShootGame.DumbAgent()
    result = GameLoop.play_game(game, [agent])
    print "result =\n{}\n".format(result)
    Replayer.show(result)
Example #4
def test_NPlaceGame():
    game = NPlaceGame.NPlaceGame()
    agent = NPlaceGame.DumbAgent()
    result = GameLoop.play_game(game, [agent])
    print "result =\n{}\n".format(result)
    Replayer.show(result)
Example #5
def test_DumbGame():
    game = DumbGame.DumbGame()
    agent = DumbGame.DumbAgent()
    result = GameLoop.play_game(game, [agent])
    print result
Example #6
def train(game_type, agent_type, annealer=None):
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    agent = agent_type(sess)
    agents = [agent] * game_type.get_num_agents()

    # hyperparameters
    gamma = 0.99
    gae_lambda = 0.95
    learning_rate = 0.0003
    #    learning_rate = 0.0001
    epsilon = 0.1  # ppo parameter TODO: fiddle with me
    value_loss_coef = 0.3
    examples_per_iteration = 1000
    minibatch_size = 100  # experiences in each training batch
    epochs = 3
    hyper_string = "ppo_lr{}_ep{}_vc{}_eit{}_mb{}_ep{}".format(
        learning_rate, epsilon, value_loss_coef, examples_per_iteration,
        minibatch_size, epochs)

    # set up the training operations in tensorflow
    advantage_ph = tf.placeholder(tf.float32, shape=[None],
                                  name='advantage')  # shape: (batch size,)
    old_log_p_chosen_action_ph = tf.placeholder(
        tf.float32, shape=[None],
        name='old_log_p_chosen_action')  # shape: (batch size,)
    log_p_chosen_action_op = agent.get_log_p_chosen_action_op()
    p_ratio = tf.exp(log_p_chosen_action_op - old_log_p_chosen_action_ph)
    clipped_p_ratio = tf.clip_by_value(p_ratio, 1.0 - epsilon, 1.0 + epsilon)
    policy_loss = -tf.reduce_sum(
        tf.minimum(advantage_ph * p_ratio, advantage_ph * clipped_p_ratio))
    # train value function by gradient descent on [(value est) - (cum future reward)] ** 2
    reward_ph = tf.placeholder(tf.float32, shape=[None],
                               name='reward')  # shape: (batch size,)
    value_op = agent.get_value_op()
    value_sq_err = tf.square(reward_ph - value_op)
    value_loss = tf.reduce_sum(value_sq_err)
    value_mse_sum = tf.summary.scalar(
        "value_mse", tf.reduce_mean(value_sq_err))
    # put policy and value loss together to get total loss
    total_loss = policy_loss + value_loss_coef * value_loss  # could optionally add an entropy loss to encourage exploration
    learning_rate_ph = tf.placeholder(tf.float32, name="learning_rate")
    train_op = tf.train.AdamOptimizer(learning_rate_ph).minimize(total_loss)

    sess.run(tf.global_variables_initializer())

    exp_buf = []
    rew_buf = []
    adv_buf = []

    prr = PeriodicReplayWriter(game_type=game_type,
                               agents=agents,
                               period=10,
                               outdir="/home/greg/coding/ML/rlgames/replays")

    merged_sum_op = tf.summary.merge_all()
    log_dir = os.path.join(
        "/home/greg/coding/ML/rlgames/logs",
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "_" + hyper_string)
    sum_wri = tf.summary.FileWriter(log_dir, graph=sess.graph, flush_secs=5)

    saver = tf.train.Saver(tf.trainable_variables(),
                           max_to_keep=5,
                           keep_checkpoint_every_n_hours=0.5)
    ckpt_dir = os.path.join("/home/greg/coding/ML/rlgames", "checkpoints")
    steps_between_ckpts = 5000
    step = Checkpoint.optionally_restore_from_checkpoint(sess, saver, ckpt_dir)
    last_ckpt_step = step

    game_frames = 0
    train_frames = 0
    time_tracker = TimeTracker()

    iteration = 0
    while True:
        if annealer:
            annealer.frame(step)

        time_tracker.start("prr")
        prr.maybe_write(iteration)
        time_tracker.end("prr")

        sampler = Sampler()  # stores the examples we'll use in this iteration

        # play games until we have enough examples to do a round of optimization
        print "iteration {}: playing games...".format(iteration)
        while sampler.num_examples < examples_per_iteration:
            # Play a game and remember the experiences and rewards.
            # If there's more than one player, the same agent is used for all of them,
            # so the agent had better not be something with state like an RNN.
            # The experiences of all players are then used for training.
            time_tracker.start("game")
            game = game_type()
            result = GameLoop.play_game(game, agents)
            time_tracker.end("game")
            game_frames += len(result.episodes[0].experiences)

            #print result.episodes[0]
            for ep in result.episodes:
                # remember each frame as an example to train on later
                ep_rewards = ep.compute_cum_discounted_future_rewards(
                    gamma=gamma)
                ep_advs = ep.compute_generalized_advantage_ests(
                    gamma, gae_lambda)
                ep_log_p_actions = np.array(
                    [exp.log_p_action for exp in ep.experiences])
                ep_feed_dict = {
                    reward_ph: ep_rewards,
                    advantage_ph: ep_advs,
                    old_log_p_chosen_action_ph: ep_log_p_actions
                }
                ep_feed_dict.update(agent.make_train_feed_dict(ep.experiences))
                sampler.add_examples(ep_feed_dict)

                # record some stats
                ep_undisc_rewards = ep.compute_cum_discounted_future_rewards(
                    gamma=1.0)
                sum_wri.add_summary(make_summary("disc_rew", ep_rewards[0]),
                                    global_step=step)
                sum_wri.add_summary(make_summary("undisc_rew",
                                                 ep_undisc_rewards[0]),
                                    global_step=step)
                sum_wri.add_summary(make_summary("init_value_est",
                                                 ep.experiences[0].value_est),
                                    global_step=step)
                sum_wri.add_summary(make_summary("init_value_mse",
                                                 (ep.experiences[0].value_est -
                                                  ep_rewards[0])**2),
                                    global_step=step)
            sum_wri.add_summary(make_summary(
                "game_length", len(result.episodes[0].experiences)),
                                global_step=step)
            sum_wri.add_summary(make_summary(
                "total_undisc_rew",
                sum(
                    sum(exp.reward for exp in ep.experiences)
                    for ep in result.episodes)),
                                global_step=step)

        # do a few epochs of optimization on the examples
        print "iteration {}: starting training...".format(iteration)
        time_tracker.start("train")
        for epoch in range(epochs):
            for mb_i, minibatch_fd in enumerate(
                    sampler.get_minibatches(minibatch_size)):
                #print "begin iteration {} epoch {} minibatch {}".format(iteration, epoch, mb_i)
                minibatch_fd[learning_rate_ph] = learning_rate
                #print "minibatch_fd =\n{}".format(minibatch_fd)
                #print "debug before train step:"
                #agent.print_debug_info()
                [_, sums] = sess.run([train_op, merged_sum_op],
                                     feed_dict=minibatch_fd)
                #print "debug after train step:"
                #agent.print_debug_info()
                sum_wri.add_summary(sums, global_step=step)
                step += minibatch_size
                train_frames += minibatch_size
        time_tracker.end("train")

        print "iteration {}: finished training.".format(iteration)
        iteration += 1
        game_seconds = time_tracker.part_seconds["game"]
        train_seconds = time_tracker.part_seconds["train"]
        prr_seconds = time_tracker.part_seconds["prr"]
        total_seconds = time_tracker.get_total_seconds()
        other_seconds = total_seconds - train_seconds - game_seconds - prr_seconds
        print "game  frames = {}  game  seconds = {:.1f}s  game  frames per second = {:.1f}".format(
            game_frames, game_seconds, game_frames / game_seconds)
        print "train frames = {}  train seconds = {:.1f}s  train frames per second = {:.1f}".format(
            train_frames, train_seconds, train_frames / train_seconds)
        print "total time = {:.1f}s  game {:.1f}% train {:.1f}% prr {:.1f}% other {:.1f}%".format(
            total_seconds, 100 * game_seconds / total_seconds,
            100 * train_seconds / total_seconds,
            100 * prr_seconds / total_seconds,
            100 * other_seconds / total_seconds)

        if step - last_ckpt_step >= steps_between_ckpts:
            saver.save(sess,
                       os.path.join(ckpt_dir, "model.ckpt"),
                       global_step=step)
            last_ckpt_step = step
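
The p_ratio / clipped_p_ratio / tf.minimum construction above is the standard PPO clipped surrogate objective: the probability ratio between the new and old policy is clipped to [1 - epsilon, 1 + epsilon], and the more pessimistic of the clipped and unclipped terms is kept. A tiny NumPy sketch (illustrative only, not part of the training graph) of how the clipping bounds the update for a single action:

import numpy as np

def ppo_clip_objective(log_p_new, log_p_old, advantage, epsilon=0.1):
    # ratio = pi_new(a|s) / pi_old(a|s), computed in log space for numerical stability
    ratio = np.exp(log_p_new - log_p_old)
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # the surrogate keeps the smaller (more pessimistic) term;
    # the training loop minimizes its negative, i.e. policy_loss above
    return np.minimum(ratio * advantage, clipped * advantage)

# With a positive advantage the ratio's contribution is capped at 1 + epsilon,
# so even if the new policy makes the action twice as likely, the objective
# only credits a factor of 1.1:
print ppo_clip_objective(np.log(2.0), np.log(1.0), advantage=1.0)  # -> 1.1
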
Example #7
    def play_and_write(self, out_fn):
        GameLoop.play_game(self.game_type(), self.agents).save(out_fn)
        print "saved {}".format(out_fn)