Example #1
def train(num_episodes=1000,
          save_every=100,
          checkpoint_dir="checkpoints",
          tensorboard_dir="tensorboard",
          tboard_every=10,
          find_target_prop=0):
    pol = Policy()
    writer = tf.contrib.summary.create_file_writer(tensorboard_dir)
    for j in range(1, num_episodes + 1):
        random_secret = random.randint(0, config.max_guesses - 1)
        e = Episode(pol, random_secret, find_target_prop, True)
        history = e.generate()

        print("Episode:{}, length: {}".format(j, len(history)))

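        # G plays the role of the return: it starts at -1 for the final guess
        # and is decremented for each earlier guess, so actions in longer
        # episodes receive a more negative return.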
        G = -1

        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=config.reinforce_alpha * G)

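        # Walk the episode backwards, taking one policy-gradient step per
        # action; the learning-rate hack below scales each step by the
        # running return G.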
        for i in reversed(range(1, len(history))):
            history_so_far = history[:i]
            next_action, _ = history[i]
            with tfe.GradientTape() as tape:
                action_logits = pol(history_so_far, with_softmax=False)
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(tf.convert_to_tensor([next_action]),
                                      config.max_guesses),
                    logits=action_logits)

            grads = tape.gradient(loss, pol.variables)
            optimizer.apply_gradients(zip(grads, pol.variables))

            G -= 1
            optimizer._learning_rate = G * config.reinforce_alpha
            optimizer._learning_rate_tensor = None
            # hack. Should be able to pass a callable as learning_rate, see
            # https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer#args
            # can I perhaps submit a PR to fix this bug?

            sys.stdout.write("{}/{}\r".format(len(history) - i, len(history)))

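        # Periodically checkpoint the policy's variables (and always after
        # the final episode).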
        if j % save_every == 0 or j == num_episodes:
            saver = tfe.Saver(pol.named_variables)
            save_path = os.path.join(
                checkpoint_dir,
                "episode{}".format(str(j).zfill(len(str(num_episodes)))))
            saver.save(save_path)

        if j % tboard_every == 0:
            with writer.as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar('total_return',
                                              tf.convert_to_tensor([G]),
                                              step=j)
    return pol
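
The loop above changes the learning rate on every step by poking the optimizer's private attributes (see the comment in the code). A minimal alternative sketch, not from the original source: because the plain SGD update is linear in the gradient, the return G can instead be folded into the loss inside the tape, leaving the learning rate constant. It assumes the same pol, history, and config objects used in the function above.

optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=config.reinforce_alpha)

G = -1
for i in reversed(range(1, len(history))):
    history_so_far = history[:i]
    next_action, _ = history[i]
    with tfe.GradientTape() as tape:
        action_logits = pol(history_so_far, with_softmax=False)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.one_hot(tf.convert_to_tensor([next_action]),
                              config.max_guesses),
            logits=action_logits)
        # For vanilla SGD, scaling the loss by G is equivalent to scaling
        # the learning rate by G as the original code does.
        loss = G * cross_entropy
    grads = tape.gradient(loss, pol.variables)
    optimizer.apply_gradients(zip(grads, pol.variables))
    G -= 1
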
Example #2
        state = self.lstm.zero_state(1, tf.float32)

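        # Embed each (guess, feedback) pair and feed it through the LSTM,
        # threading the recurrent state across the whole game history.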
        for guess, feedback in game_state:
            guess_tensor = tf.reshape(tf.convert_to_tensor(guess), (1,))
            feedback_tensor = tf.reshape(tf.convert_to_tensor(feedback), (1,))
            guess_embedded = self.guess_embedding(guess_tensor)
            feedback_embedded = self.feedback_embedding(feedback_tensor)

            combined_embedded = tf.concat([guess_embedded, feedback_embedded],
                                          axis=-1)
            # can I do multiple inputs to the LSTM instead of concatenating?

            output, state = self.lstm(combined_embedded, state)

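        # Map the final LSTM output to one logit per possible guess.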
        logits = self.dense(output)
        if with_softmax:
            return tf.nn.softmax(logits)
        return logits


if __name__ == "__main__":
    from episode import Episode
    import numpy as np
    np.random.seed(123)
    p = Policy()
    e = Episode(p, "0000")
    x = p(e.generate())
    print(x.numpy())

Example #3
        state = self.lstm.zero_state(1, tf.float32)

        for guess, feedback in game_state:
            guess_tensor = tf.reshape(tf.convert_to_tensor(guess), (1, ))
            feedback_tensor = tf.reshape(tf.convert_to_tensor(feedback), (1, ))
            guess_embedded = self.guess_embedding(guess_tensor)
            feedback_embedded = self.feedback_embedding(feedback_tensor)

            combined_embedded = tf.concat([guess_embedded, feedback_embedded],
                                          axis=-1)
            # can I do multiple inputs to the LSTM instead of concatenating?

            output, state = self.lstm(combined_embedded, state)

        logits = self.dense(output)
        if with_softmax:
            return tf.nn.softmax(logits)
        return logits


if __name__ == "__main__":
    from episode import Episode
    import numpy as np
    np.random.seed(123)
    p = Policy()
    print("lstm")
    print(p.lstm)
    e = Episode(p, "0000", find_target_proba=0.3, train=True)
    g = e.generate()
    print(g)
    x = p(g)
    print(x)