Example #1
	def __init__(self, rootDir, batchSize, seqLen):
		self.batchSize = batchSize
		self.seqLen = seqLen
		self.rootDir = rootDir
		# load data, either shakespeare, or the Python source of Tensorflow itself
		self.textdir = self.rootDir + "/*.txt"
		#shakedir = "../tensorflow/**/*.py"
		self.codetext, self.valitext, self.bookranges = txt.read_data_files(self.textdir, validation=True)
		self.epoch_size = len(self.codetext) // (self.batchSize * self.seqLen)

		# display some stats on the data
		txt.print_data_stats(len(self.codetext), len(self.valitext), self.epoch_size)
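		# Note: epoch_size is the number of training steps per epoch; each step
		# consumes batchSize * seqLen characters, so e.g. a 3,000,000-character
		# corpus with batchSize=100 and seqLen=30 gives 3,000,000 // 3,000 = 1,000 steps.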
Example #2
def validate_on_network(auth):
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(auth + '.meta')
        new_saver.restore(sess, auth)
        # 'directory', INTERNALSIZE and NLAYERS are module-level values defined elsewhere in the source file.
        valitext, _, __ = my_txtutils.read_data_files(directory, validation=False)

        VALI_SEQLEN = 1 * 64  # Sequence length for validation. State will be wrong at the start of each sequence.
        bsize = len(valitext) // VALI_SEQLEN
        vali_x, vali_y, _ = next(
            my_txtutils.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1))  # all data in 1 batch
        vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
        feed_dict = {'inputs/X:0': vali_x, 'target/Y_:0': vali_y, 'model/pkeep:0': 1.0,
                     'hidden_state/Hin:0': vali_nullstate, 'model/batchsize:0': bsize}

        ls, acc = sess.run(["display_data/batchloss:0", "display_data/accuracy:0"], feed_dict=feed_dict)
        my_txtutils.print_validation_stats(ls, acc)
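
# A hypothetical usage sketch (not part of the original source): pass the checkpoint
# prefix written by tf.train.Saver, e.g. the value returned by saver.save().
# 'directory', INTERNALSIZE and NLAYERS must match the values used at training time.
validate_on_network("checkpoints/rnn_train_1495455686-150000")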
Example #3
#         A good choice of parameters ensures that the testing and validation curves stay close
#         To see the curves drift apart ("overfitting") try to use an insufficient amount of
#         training data (shakedir = "shakespeare/t*.txt" for example)
#
SEQLEN = 30
BATCHSIZE = 100
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 1.0    # no dropout

# load data, either shakespeare, or the Python source of Tensorflow itself
shakedir = "dickens/*.txt"
# shakedir = "../tensorflow/**/*.py"
codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=False)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                 # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
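
# For intuition: tf.one_hot above maps each uint8 character code to a row with a
# single 1.0. A minimal NumPy sketch of the same transformation (illustrative
# only, not part of the training script):
import numpy as np

def one_hot(x, depth):
    # Index rows of the identity matrix: code c becomes a length-'depth' row with a 1.0 at position c.
    return np.eye(depth, dtype=np.float32)[x]

# one_hot(np.array([[2, 0]]), 4) -> [[[0., 0., 1., 0.], [1., 0., 0., 0.]]]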
Example #4
#   Training and experimentation (default):
#         Keep validation enabled
#         You can now play with the parameters and follow the effects in Tensorboard
#         A good choice of parameters ensures that the testing and validation curves stay close
#         To see the curves drift apart ("overfitting") try to use an insufficient amount of
#         training data
#
SEQLEN = 200
BATCHSIZE = 80
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout

bibledir = "bible/*.txt"
codetext, valitext, bookranges = txt.read_data_files(bibledir, validation=True)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                 # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
Example #5
BATCH_SIZE = 100
ALPHA_SIZE = txt.ALPHASIZE
INTERNAL_SIZE = 512
NLAYERS = 3

learning_rate = 0.001
dropout_pkeep = 0.5  # 50% dropout

# Load training data
# Shakespeare
# train_data = "shakespeare/*.txt"

# Java code
train_data = "../Desktop/android/frameworks/**/*.java"

codetxt, valitext, bookranges = txt.read_data_files(train_data,
                                                    validation=True)

# Model
lr = tf.placeholder(tf.float32, name='lr')
pkeep = tf.placeholder(tf.float32, name='pkeep')
batchsize = tf.placeholder(tf.int32, name='batchsize')

# Input
X = tf.placeholder(tf.uint8, [None, None], name='X')  # [BATCH_SIZE, SEQ_LEN]
Xo = tf.one_hot(X, ALPHA_SIZE, 1.0, 0.0)  # [BATCH_SIZE, SEQ_LEN, ALPHA_SIZE]

# Output
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [BATCH_SIZE, SEQ_LEN]
Yo_ = tf.one_hot(Y_, ALPHA_SIZE, 1.0, 0.0)  # [BATCH_SIZE, SEQ_LEN, ALPHA_SIZE]

# Input state
Hin = tf.placeholder(tf.float32, [None, INTERNAL_SIZE * NLAYERS], name='Hin')  # [BATCH_SIZE, INTERNAL_SIZE * NLAYERS]
Example #6
#         training data (input_dir = "shakespeare/t*.txt" for example)
#
SEQLEN = 30
BATCHSIZE = 200
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8  # some dropout
epoch_count = 13
project_name = "shakespeare"

# load data, from the project subfolder or the Python source of Tensorflow itself
input_dir = "{}/*.txt".format(project_name)
#input_dir = "../tensorflow/**/*.py"
codetext, valitext, bookranges = txt.read_data_files(input_dir,
                                                     validation=True)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')  # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
Example #7
SEQ_LEN = 64  # Number of characters per sequence

NUM_EPOCHS = 50  # Number of epochs
BATCH_SIZE = 250  # Sequences per batch
ALPHA_SIZE = txt.ALPHASIZE  # Size of the character alphabet (used by the one-hot layers below)
NUM_OF_GRUS = 1024  # Number of GRU cells per layer
NUM_LAYERS = 3  # How many layers deep we are going

SET_LR = 0.001  # Small fixed learning rate
SET_PKEEP = 0.75  # Dropping 25% of neurons

# Seed our random number generator
tf.set_random_seed(0)

# Load our Star Wars Scripts.
filedir = "StarWarsScripts/*.txt"
traintext, validtext, scriptranges = txt.read_data_files(filedir,
                                                         validation=True)

# Print out information about our data
size_of_epoch = len(traintext) // (BATCH_SIZE * SEQ_LEN)
txt.print_data_stats(len(traintext), len(validtext), size_of_epoch)

# Create our TensorFlow Graph.
batchsize = tf.placeholder(tf.int32, name='batchsize')
lr = tf.placeholder(tf.float32, name='lr')
pkeep = tf.placeholder(tf.float32, name='pkeep')
X = tf.placeholder(tf.uint8, [None, None], name='X')  # Input vector
Xo = tf.one_hot(X, ALPHA_SIZE, 1.0, 0.0)  # One-hot: a vector of size ALPHA_SIZE, all 0 except at the character's index
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # Output tensor
Yo_ = tf.one_hot(Y_, ALPHA_SIZE, 1.0, 0.0)  # One-hot encode the expected outputs as well
Example #8
#         You can now play with the parameters and follow the effects in Tensorboard
#         A good choice of parameters ensures that the testing and validation curves stay close
#         To see the curves drift apart ("overfitting") try to use an insufficient amount of
#         training data
#
SEQLEN = 30
BATCHSIZE = 200
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout

# validation = True
validation = False
codetext, valitext, fileranges = txt.read_data_files(args.globby, validation)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                 # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
Example #9
timee = datetime.datetime.now()

# ADAPTED FROM 'DEEP LEARNING WITHOUT A PHD' FROM GOOGLE
# model parameters

BATCHSIZE = 200
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
SEQLEN = 30
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout

# load data
docdir = "all/*.txt"
code_text, validation_text, bookranges = txt.read_data_files(docdir, validation=True)

# display some stats on the data
epoch_size = len(code_text) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(code_text), len(validation_text), epoch_size)

lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize') # batch size parameter

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                 # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
# expected outputs = same sequence shifted by 1 since we are trying to predict the next character
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [ BATCHSIZE, SEQLEN ]
Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)               # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
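
# As the comment above says, the targets are the inputs shifted by one character.
# A toy illustration of that pairing (not from the script):
text = "HELLO"
codes = [ord(c) for c in text]
x_example = codes[:-1]   # inputs:  H E L L
y_example = codes[1:]    # targets: E L L O  (the next character at each position)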
Example #10
def main(_):

    # load data, either shakespeare, or the Python source of Tensorflow itself
    shakedir = FLAGS.text_dir
    # shakedir = "../tensorflow/**/*.py"
    codetext, valitext, bookranges = txt.read_data_files(shakedir,
                                                         validation=True)

    # display some stats on the data
    epoch_size = len(codetext) // (FLAGS.train_batch_size * FLAGS.seqlen)
    txt.print_data_stats(len(codetext), len(valitext), epoch_size)

    #
    # the model (see FAQ in README.md)
    #
    lr = tf.placeholder(tf.float32, name='lr')  # learning rate
    pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
    batchsize = tf.placeholder(tf.int32, name='batchsize')

    # inputs
    X = tf.placeholder(tf.uint8, [None, None],
                       name='X')  # [ BATCHSIZE, FLAGS.seqlen ]
    Xo = tf.one_hot(X, ALPHASIZE, 1.0,
                    0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # expected outputs = same sequence shifted by 1 since we are trying to predict the next character
    Y_ = tf.placeholder(tf.uint8, [None, None],
                        name='Y_')  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0,
                     0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # input state
    Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS],
                         name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS]

    # using a NLAYERS=3 layers of GRU cells, unrolled FLAGS.seqlen=30 times
    # dynamic_rnn infers FLAGS.seqlen from the size of the inputs Xo

    onecell = rnn.GRUCell(INTERNALSIZE)
    dropcell = rnn.DropoutWrapper(onecell, input_keep_prob=pkeep)
    multicell = rnn.MultiRNNCell([dropcell] * NLAYERS, state_is_tuple=False)
    multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)
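    # Note: reusing a single wrapped cell object across layers ([dropcell] * NLAYERS)
    # is the TF 1.0-era idiom this code targets; later TF 1.x releases require a
    # distinct cell instance per layer, e.g.
    # rnn.MultiRNNCell([rnn.DropoutWrapper(rnn.GRUCell(INTERNALSIZE), input_keep_prob=pkeep) for _ in range(NLAYERS)], state_is_tuple=False).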
    Yr, H = tf.nn.dynamic_rnn(multicell,
                              Xo,
                              dtype=tf.float32,
                              initial_state=Hin)
    # Yr: [ BATCHSIZE, FLAGS.seqlen, INTERNALSIZE ]
    # H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] # this is the last state in the sequence

    H = tf.identity(H, name='H')  # just to give it a name

    # Softmax layer implementation:
    # Flatten the first two dimension of the output [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ] => [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    # then apply softmax readout layer. This way, the weights and biases are shared across unrolled time steps.
    # From the readout point of view, a value coming from a cell or a minibatch is the same thing

    Yflat = tf.reshape(
        Yr, [-1, INTERNALSIZE])  # [ BATCHSIZE x FLAGS.seqlen, INTERNALSIZE ]
    Ylogits = layers.linear(
        Yflat, ALPHASIZE)  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Yflat_ = tf.reshape(
        Yo_, [-1, ALPHASIZE])  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x FLAGS.seqlen ]
    loss = tf.reshape(loss, [batchsize, -1])  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo = tf.nn.softmax(Ylogits,
                       name='Yo')  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Y = tf.argmax(Yo, 1)  # [ BATCHSIZE x FLAGS.seqlen ]
    Y = tf.reshape(Y, [batchsize, -1], name="Y")  # [ BATCHSIZE, FLAGS.seqlen ]
    train_step = tf.train.AdamOptimizer(lr).minimize(loss)

    # stats for display
    seqloss = tf.reduce_mean(loss, 1)
    batchloss = tf.reduce_mean(seqloss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
    loss_summary = tf.summary.scalar("batch_loss", batchloss)
    acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
    summaries = tf.summary.merge([loss_summary, acc_summary])

    # Init Tensorboard stuff. This will save Tensorboard information into a different
    # folder at each run named 'log/<timestamp>/'. Two sets of data are saved so that
    # you can compare training and validation curves visually in Tensorboard.
    timestamp = str(math.trunc(time.time()))
    summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.summaries_dir, timestamp + "-training"))
    validation_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.summaries_dir, timestamp + "-validation"))

    # Init for saving models. They will be saved into a directory named 'checkpoints'.
    # Only the last checkpoint is kept.
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.mkdir(FLAGS.checkpoint_dir)
    saver = tf.train.Saver(max_to_keep=1)

    # for display: init the progress bar
    DISPLAY_FREQ = 50
    _50_BATCHES = DISPLAY_FREQ * FLAGS.train_batch_size * FLAGS.seqlen
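    # Note: 'step' advances by train_batch_size * seqlen per minibatch (see the end
    # of the training loop), so step % _50_BATCHES == 0 fires once every DISPLAY_FREQ batches.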
    progress = txt.Progress(DISPLAY_FREQ,
                            size=111 + 2,
                            msg="Training on next " + str(DISPLAY_FREQ) +
                            " batches")

    # init
    istate = np.zeros([FLAGS.train_batch_size,
                       INTERNALSIZE * NLAYERS])  # initial zero input state
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    step = 0

    # training loop
    for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext,
                                                    FLAGS.train_batch_size,
                                                    FLAGS.seqlen,
                                                    nb_epochs=1000):

        # train on one minibatch
        feed_dict = {
            X: x,
            Y_: y_,
            Hin: istate,
            lr: FLAGS.learning_rate,
            pkeep: FLAGS.dropout_pkeep,
            batchsize: FLAGS.train_batch_size
        }
        _, y, ostate, smm = sess.run([train_step, Y, H, summaries],
                                     feed_dict=feed_dict)

        # save training data for Tensorboard
        summary_writer.add_summary(smm, step)

        # display a visual validation of progress (every 50 batches)
        if step % _50_BATCHES == 0:
            feed_dict = {
                X: x,
                Y_: y_,
                Hin: istate,
                pkeep: 1.0,
                batchsize: FLAGS.train_batch_size
            }  # no dropout for validation
            y, l, bl, acc = sess.run([Y, seqloss, batchloss, accuracy],
                                     feed_dict=feed_dict)
            txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc,
                                                  epoch_size, step, epoch)

        # run a validation step every 50 batches
        # The validation text should be a single sequence but that's too slow (1s per 1024 chars!),
        # so we cut it up and batch the pieces (slightly inaccurate)
        # tested: validating with 5K sequences instead of 1K is only slightly more accurate, but a lot slower.
        if step % _50_BATCHES == 0 and len(valitext) > 0:
            VALI_SEQLEN = 1 * 1024  # Sequence length for validation. State will be wrong at the start of each sequence.
            bsize = len(valitext) // VALI_SEQLEN
            txt.print_validation_header(len(codetext), bookranges)
            vali_x, vali_y, _ = next(
                txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN,
                                            1))  # all data in 1 batch
            vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
            feed_dict = {
                X: vali_x,
                Y_: vali_y,
                Hin: vali_nullstate,
                pkeep: 1.0,  # no dropout for validation
                batchsize: bsize
            }
            ls, acc, smm = sess.run([batchloss, accuracy, summaries],
                                    feed_dict=feed_dict)
            txt.print_validation_stats(ls, acc)
            # save validation data for Tensorboard
            validation_writer.add_summary(smm, step)

        # display a short text generated with the current weights and biases (every 150 batches)
        if step // 3 % _50_BATCHES == 0:
            txt.print_text_generation_header()
            ry = np.array([[txt.convert_from_alphabet(ord("K"))]])
            rh = np.zeros([1, INTERNALSIZE * NLAYERS])
            for k in range(1000):
                ryo, rh = sess.run([Yo, H],
                                   feed_dict={
                                       X: ry,
                                       pkeep: 1.0,
                                       Hin: rh,
                                       batchsize: 1
                                   })
                rc = txt.sample_from_probabilities(
                    ryo, topn=10 if epoch <= 1 else 2)
                print(chr(txt.convert_to_alphabet(rc)), end="")
                ry = np.array([[rc]])
            txt.print_text_generation_footer()

        # save a checkpoint (every 500 batches)
        if step // 10 % _50_BATCHES == 0:
            saver.save(sess,
                       FLAGS.checkpoint_dir + '/rnn_train_' + timestamp,
                       global_step=step)

        # display progress bar
        progress.step(reset=step % _50_BATCHES == 0)

        # loop state around
        istate = ostate
        step += FLAGS.train_batch_size * FLAGS.seqlen
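
        # Note how the recurrent state is threaded through the loop: 'ostate' from one
        # minibatch is fed back in as 'Hin' for the next. This relies on
        # txt.rnn_minibatch_sequencer keeping each batch row on a contiguous stretch
        # of text across successive minibatches.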
Example #11
# option in tf.nn.rnn_cell.MultiRNNCell. This option is enabled by default.
# It produces faster code (by ~10%) but handling the state as a tuple is bit
# more cumbersome. Search for comments containing "state_is_tuple=True" for
# details.

SEQLEN = 30
BATCHSIZE = 100
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate

# load data, either shakespeare, or the Python source of Tensorflow itself
shakedir = "shakespeare/*.txt"
charlesdir = "charles/*.txt"
codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=False)
codetext2, valitext2, bookranges2 = txt.read_data_files(charlesdir, validation=False)


# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model
#
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ]
Example #12
print("-"*80)
print("Model parameters:")
print("Seqlen        {:>5d}".format(SEQLEN))
print("BatchSize     {:>5d}".format(BATCHSIZE))
print("AlphaSize     {:>5d}".format(ALPHASIZE))
print("InternalSize  {:>5d}".format(INTERNALSIZE))
print("NLayers       {:>5d}".format(NLAYERS))
print("Learning Rate {:>5.3f}".format(learning_rate))
print("Dropout PKeep {:>5.1f}".format(dropout_pkeep))
print("Num. epochs   {:>5d}".format(args.epochs))
print("-"*80)
print("")

read_data_time = datetime.now()
codetext, valitext, bookranges = txt.read_data_files(args.input_dir, validation=args.novalidate,
                                                     validation_percentage=args.validation_percentage)
print_elapsed_time("reading data files", read_data_time)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ]