def __init__(self, rootDir, batchSize, seqLen):
    self.batchSize = batchSize
    self.seqLen = seqLen
    self.rootDir = rootDir
    # load data, either shakespeare, or the Python source of Tensorflow itself
    self.textdir = self.rootDir + "/*.txt"
    # shakedir = "../tensorflow/**/*.py"
    self.codetext, self.valitext, self.bookranges = txt.read_data_files(self.textdir, validation=True)
    self.epoch_size = len(self.codetext) // (self.batchSize * self.seqLen)
    # display some stats on the data
    txt.print_data_stats(len(self.codetext), len(self.valitext), self.epoch_size)
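# Worked example of the epoch_size arithmetic above, with illustrative numbers:
# for 6,000,000 characters of training text, batchSize=200 and seqLen=30,
# epoch_size = 6000000 // (200 * 30) = 1000 minibatches per epoch.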
def validate_on_network(auth):
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(auth + '.meta')
        new_saver.restore(sess, auth)
        valitext, _, __ = my_txtutils.read_data_files(directory, validation=False)
        VALI_SEQLEN = 1 * 64  # Sequence length for validation. State will be wrong at the start of each sequence.
        bsize = len(valitext) // VALI_SEQLEN
        vali_x, vali_y, _ = next(my_txtutils.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1))  # all data in 1 batch
        vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
        feed_dict = {'inputs/X:0': vali_x,
                     'target/Y_:0': vali_y,
                     'model/pkeep:0': 1.0,
                     'hidden_state/Hin:0': vali_nullstate,
                     'model/batchsize:0': bsize}
        ls, acc = sess.run(["display_data/batchloss:0", "display_data/accuracy:0"], feed_dict=feed_dict)
        my_txtutils.print_validation_stats(ls, acc)
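# A minimal usage sketch for the function above. The checkpoint prefix below is
# hypothetical; pass whatever path tf.train.Saver produced, without the '.meta'
# suffix. The function also reads `directory`, INTERNALSIZE and NLAYERS from
# module scope, and those must match the trained model.
directory = "shakespeare/*.txt"   # hypothetical data glob
INTERNALSIZE, NLAYERS = 512, 3
validate_on_network("checkpoints/rnn_train_1495455686-150000")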
# A good choice of parameters ensures that the testing and validation curves stay close
# To see the curves drift apart ("overfitting") try to use an insufficient amount of
# training data (shakedir = "shakespeare/t*.txt" for example)
#
SEQLEN = 30
BATCHSIZE = 100
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 1.0    # no dropout

# load data, either shakespeare, or the Python source of Tensorflow itself
shakedir = "dickens/*.txt"
# shakedir = "../tensorflow/**/*.py"
codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=False)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')               # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')         # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
# Training and experimentation (default):
# Keep validation enabled
# You can now play with the parameters and follow the effects in Tensorboard
# A good choice of parameters ensures that the testing and validation curves stay close
# To see the curves drift apart ("overfitting") try to use an insufficient amount of
# training data
#
SEQLEN = 200
BATCHSIZE = 80
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout

bibledir = "bible/*.txt"
codetext, valitext, bookranges = txt.read_data_files(bibledir, validation=True)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')               # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')         # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
BATCH_SIZE = 100
ALPHA_SIZE = txt.ALPHASIZE
INTERNAL_SIZE = 512
NLAYERS = 3
learning_rate = 0.001
dropout_pkeep = 0.5  # 50% dropout

# Load training data
# Shakespeare
# train_data = "shakespeare/*.txt"
# Java code
train_data = "../Desktop/android/frameworks/**/*.java"
codetxt, valitext, bookranges = txt.read_data_files(train_data, validation=True)

# Model
lr = tf.placeholder(tf.float32, name='lr')
pkeep = tf.placeholder(tf.float32, name='pkeep')
batchsize = tf.placeholder(tf.int32, name='batchsize')

# Input
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [BATCH_SIZE, SEQ_LEN]
Xo = tf.one_hot(X, ALPHA_SIZE, 1.0, 0.0)                # [BATCH_SIZE, SEQ_LEN, ALPHA_SIZE]

# Output
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [BATCH_SIZE, SEQ_LEN]
Yo_ = tf.one_hot(Y_, ALPHA_SIZE, 1.0, 0.0)              # [BATCH_SIZE, SEQ_LEN, ALPHA_SIZE]

# Input state
# training data (input_dir = "shakespeare/t*.txt" for example)
#
SEQLEN = 30
BATCHSIZE = 200
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout
epoch_count = 13
project_name = "shakespeare"

# load data, from the project subfolder or the Python source of Tensorflow itself
input_dir = "{}/*.txt".format(project_name)
# input_dir = "../tensorflow/**/*.py"
codetext, valitext, bookranges = txt.read_data_files(input_dir, validation=True)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')               # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')         # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
SEQ_LEN = 64        # Number of characters per sequence
NUM_EPOCHS = 50     # Number of epochs
BATCH_SIZE = 250    # Sequences per batch
NUM_OF_GRUS = 1024  # Number of GRU cells per layer
NUM_LAYERS = 3      # How many layers deep we are going
SET_LR = 0.001      # Small fixed learning rate
SET_PKEEP = 0.75    # Dropping 25% of neurons
ALPHA_SIZE = txt.ALPHASIZE  # Size of the character alphabet

# Seed our random number generator
tf.set_random_seed(0)

# Load our Star Wars Scripts.
filedir = "StarWarsScripts/*.txt"
traintext, validtext, scriptranges = txt.read_data_files(filedir, validation=True)

# Print out information about our data
size_of_epoch = len(traintext) // (BATCH_SIZE * SEQ_LEN)
txt.print_data_stats(len(traintext), len(validtext), size_of_epoch)

# Create our TensorFlow Graph.
batchsize = tf.placeholder(tf.int32, name='batchsize')
lr = tf.placeholder(tf.float32, name='lr')
pkeep = tf.placeholder(tf.float32, name='pkeep')
X = tf.placeholder(tf.uint8, [None, None], name='X')    # Input vector
Xo = tf.one_hot(X, ALPHA_SIZE, 1.0, 0.0)                # One-hot: a vector of size ALPHA_SIZE, all zeros except at the character's index
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # Output tensor
Yo_ = tf.one_hot(Y_, ALPHA_SIZE, 1.0, 0.0)              # One-hot encode the output as well
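# A tiny illustration of the one-hot encoding above. If ALPHA_SIZE were 5 (a
# made-up size for readability), character index 2 would become:
#   tf.one_hot([2], 5, 1.0, 0.0)  ->  [[0., 0., 1., 0., 0.]]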
# You can now play with the parameters and follow the effects in Tensorboard
# A good choice of parameters ensures that the testing and validation curves stay close
# To see the curves drift apart ("overfitting") try to use an insufficient amount of
# training data
#
SEQLEN = 30
BATCHSIZE = 200
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout

# validation = True
validation = False
codetext, valitext, fileranges = txt.read_data_files(args.globby, validation)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model (see FAQ in README.md)
#
lr = tf.placeholder(tf.float32, name='lr')               # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')         # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
timee = datetime.datetime.now()

# ADAPTED FROM 'DEEP LEARNING WITHOUT A PHD' FROM GOOGLE

# model parameters
BATCHSIZE = 200
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
SEQLEN = 30
learning_rate = 0.001  # fixed learning rate
dropout_pkeep = 0.8    # some dropout

# load data
docdir = "all/*.txt"
code_text, validation_text, bookranges = txt.read_data_files(docdir, validation=True)

# display some stats on the data
epoch_size = len(code_text) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(code_text), len(validation_text), epoch_size)

lr = tf.placeholder(tf.float32, name='lr')               # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')         # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')   # batch size parameter

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                  # [ BATCHSIZE, SEQLEN, ALPHASIZE ]

# expected outputs = same sequence shifted by 1 since we are trying to predict the next character
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')   # [ BATCHSIZE, SEQLEN ]
Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)                # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
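# A minimal sketch of the input/target relationship described above: the target
# is the input shifted left by one character, so the model learns to predict
# the next character. The string is illustrative only.
sample = "HELLO WORLD"
x_example = [ord(c) for c in sample[:-1]]  # "HELLO WORL"
y_example = [ord(c) for c in sample[1:]]   # "ELLO WORLD"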
def main(_):
    # load data, either shakespeare, or the Python source of Tensorflow itself
    shakedir = FLAGS.text_dir
    # shakedir = "../tensorflow/**/*.py"
    codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=True)

    # display some stats on the data
    epoch_size = len(codetext) // (FLAGS.train_batch_size * FLAGS.seqlen)
    txt.print_data_stats(len(codetext), len(valitext), epoch_size)

    #
    # the model (see FAQ in README.md)
    #
    lr = tf.placeholder(tf.float32, name='lr')               # learning rate
    pkeep = tf.placeholder(tf.float32, name='pkeep')         # dropout parameter
    batchsize = tf.placeholder(tf.int32, name='batchsize')

    # inputs
    X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, FLAGS.seqlen ]
    Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # expected outputs = same sequence shifted by 1 since we are trying to predict the next character
    Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')   # [ BATCHSIZE, FLAGS.seqlen ]
    Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)                # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # input state
    Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS], name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS ]

    # using NLAYERS=3 layers of GRU cells, unrolled FLAGS.seqlen=30 times
    # dynamic_rnn infers FLAGS.seqlen from the size of the inputs Xo
    onecell = rnn.GRUCell(INTERNALSIZE)
    dropcell = rnn.DropoutWrapper(onecell, input_keep_prob=pkeep)
    multicell = rnn.MultiRNNCell([dropcell] * NLAYERS, state_is_tuple=False)
    multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)
    Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin)
    # Yr: [ BATCHSIZE, FLAGS.seqlen, INTERNALSIZE ]
    # H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] # this is the last state in the sequence
    H = tf.identity(H, name='H')  # just to give it a name

    # Softmax layer implementation:
    # Flatten the first two dimension of the output [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ] => [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    # then apply softmax readout layer. This way, the weights and biases are shared across unrolled time steps.
    # From the readout point of view, a value coming from a cell or a minibatch is the same thing
    Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])    # [ BATCHSIZE x FLAGS.seqlen, INTERNALSIZE ]
    Ylogits = layers.linear(Yflat, ALPHASIZE)     # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])     # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x FLAGS.seqlen ]
    loss = tf.reshape(loss, [batchsize, -1])      # [ BATCHSIZE, FLAGS.seqlen ]
    Yo = tf.nn.softmax(Ylogits, name='Yo')        # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Y = tf.argmax(Yo, 1)                          # [ BATCHSIZE x FLAGS.seqlen ]
    Y = tf.reshape(Y, [batchsize, -1], name="Y")  # [ BATCHSIZE, FLAGS.seqlen ]
    train_step = tf.train.AdamOptimizer(lr).minimize(loss)

    # stats for display
    seqloss = tf.reduce_mean(loss, 1)
    batchloss = tf.reduce_mean(seqloss)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
    loss_summary = tf.summary.scalar("batch_loss", batchloss)
    acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
    summaries = tf.summary.merge([loss_summary, acc_summary])

    # Init Tensorboard stuff. This will save Tensorboard information into a different
    # folder at each run named 'log/<timestamp>/'. Two sets of data are saved so that
    # you can compare training and validation curves visually in Tensorboard.
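    # A shape walkthrough of the flattening trick above, using illustrative
    # numbers (batch 200, seqlen 30, INTERNALSIZE 512, ALPHASIZE 98):
    #   Yr      : [200, 30, 512]  output of dynamic_rnn
    #   Yflat   : [6000, 512]     batch and time merged so readout weights are shared
    #   Ylogits : [6000, 98]      one logit vector per character position
    #   loss    : [200, 30]       reshaped back per sequence for the display stats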
    timestamp = str(math.trunc(time.time()))
    summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, timestamp + "-training"))
    validation_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, timestamp + "-validation"))

    # Init for saving models. They will be saved into a directory named 'checkpoints'.
    # Only the last checkpoint is kept.
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.mkdir(FLAGS.checkpoint_dir)
    saver = tf.train.Saver(max_to_keep=1)

    # for display: init the progress bar
    DISPLAY_FREQ = 50
    _50_BATCHES = DISPLAY_FREQ * FLAGS.train_batch_size * FLAGS.seqlen
    progress = txt.Progress(DISPLAY_FREQ, size=111 + 2,
                            msg="Training on next " + str(DISPLAY_FREQ) + " batches")

    # init
    istate = np.zeros([FLAGS.train_batch_size, INTERNALSIZE * NLAYERS])  # initial zero input state
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    step = 0

    # training loop
    for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext, FLAGS.train_batch_size, FLAGS.seqlen, nb_epochs=1000):

        # train on one minibatch
        feed_dict = {X: x, Y_: y_, Hin: istate, lr: FLAGS.learning_rate,
                     pkeep: FLAGS.dropout_pkeep, batchsize: FLAGS.train_batch_size}
        _, y, ostate, smm = sess.run([train_step, Y, H, summaries], feed_dict=feed_dict)

        # save training data for Tensorboard
        summary_writer.add_summary(smm, step)

        # display a visual validation of progress (every 50 batches)
        if step % _50_BATCHES == 0:
            feed_dict = {X: x, Y_: y_, Hin: istate, pkeep: 1.0,
                         batchsize: FLAGS.train_batch_size}  # no dropout for validation
            y, l, bl, acc = sess.run([Y, seqloss, batchloss, accuracy], feed_dict=feed_dict)
            txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc, epoch_size, step, epoch)

        # run a validation step every 50 batches
        # The validation text should be a single sequence but that's too slow (1s per 1024 chars!),
        # so we cut it up and batch the pieces (slightly inaccurate)
        # tested: validating with 5K sequences instead of 1K is only slightly more accurate, but a lot slower.
        if step % _50_BATCHES == 0 and len(valitext) > 0:
            VALI_SEQLEN = 1 * 1024  # Sequence length for validation. State will be wrong at the start of each sequence.
            bsize = len(valitext) // VALI_SEQLEN
            txt.print_validation_header(len(codetext), bookranges)
            vali_x, vali_y, _ = next(txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1))  # all data in 1 batch
            vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
            feed_dict = {X: vali_x, Y_: vali_y, Hin: vali_nullstate,
                         pkeep: 1.0,  # no dropout for validation
                         batchsize: bsize}
            ls, acc, smm = sess.run([batchloss, accuracy, summaries], feed_dict=feed_dict)
            txt.print_validation_stats(ls, acc)
            # save validation data for Tensorboard
            validation_writer.add_summary(smm, step)

        # display a short text generated with the current weights and biases (every 150 batches)
        if step // 3 % _50_BATCHES == 0:
            txt.print_text_generation_header()
            ry = np.array([[txt.convert_from_alphabet(ord("K"))]])
            rh = np.zeros([1, INTERNALSIZE * NLAYERS])
            for k in range(1000):
                ryo, rh = sess.run([Yo, H], feed_dict={X: ry, pkeep: 1.0, Hin: rh, batchsize: 1})
                rc = txt.sample_from_probabilities(ryo, topn=10 if epoch <= 1 else 2)
                print(chr(txt.convert_to_alphabet(rc)), end="")
                ry = np.array([[rc]])
            txt.print_text_generation_footer()

        # save a checkpoint (every 500 batches)
        if step // 10 % _50_BATCHES == 0:
            saver.save(sess, FLAGS.checkpoint_dir + '/rnn_train_' + timestamp, global_step=step)

        # display progress bar
        progress.step(reset=step % _50_BATCHES == 0)

        # loop state around
        istate = ostate
        step += FLAGS.train_batch_size * FLAGS.seqlen
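# A minimal sketch of the top-n sampling used in the generation loop above:
# keep only the n most probable characters, renormalize, then draw one. This is
# what txt.sample_from_probabilities is assumed to do here.
import numpy as np

def sample_top_n(probabilities, topn=2):
    p = np.squeeze(probabilities).copy()
    p[np.argsort(p)[:-topn]] = 0   # zero out all but the topn most likely entries
    p = p / np.sum(p)              # renormalize into a distribution
    return np.random.choice(len(p), p=p)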
# option in tf.nn.rnn_cell.MultiRNNCell. This option is enabled by default.
# It produces faster code (by ~10%) but handling the state as a tuple is a bit
# more cumbersome. Search for comments containing "state_is_tuple=True" for
# details.
SEQLEN = 30
BATCHSIZE = 100
ALPHASIZE = txt.ALPHASIZE
INTERNALSIZE = 512
NLAYERS = 3
learning_rate = 0.001  # fixed learning rate

# load data, either shakespeare, or the Python source of Tensorflow itself
shakedir = "shakespeare/*.txt"
charlesdir = "charles/*.txt"
codetext, valitext, bookranges = txt.read_data_files(shakedir, validation=False)
codetext2, valitext2, bookranges2 = txt.read_data_files(charlesdir, validation=False)

# display some stats on the data
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)

#
# the model
#
lr = tf.placeholder(tf.float32, name='lr')               # learning rate
batchsize = tf.placeholder(tf.int32, name='batchsize')

# inputs
X = tf.placeholder(tf.uint8, [None, None], name='X')     # [ BATCHSIZE, SEQLEN ]
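# Sketch of the difference the comment above describes, with illustrative sizes:
# with state_is_tuple=False the recurrent state is a single concatenated tensor
# of shape [BATCHSIZE, INTERNALSIZE * NLAYERS]; with state_is_tuple=True it is
# a tuple of NLAYERS tensors, each of shape [BATCHSIZE, INTERNALSIZE].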
print("-"*80) print("Model parameters:") print("Seqlen {:>5d}".format(SEQLEN)) print("BatchSize {:>5d}".format(BATCHSIZE)) print("AlphaSize {:>5d}".format(ALPHASIZE)) print("InternalSize {:>5d}".format(INTERNALSIZE)) print("NLayers {:>5d}".format(NLAYERS)) print("Learning Rate {:>5.3f}".format(learning_rate)) print("Dropout PKeep {:>5.1f}".format(dropout_pkeep)) print("Num. epochs {:>5d}".format(args.epochs)) print("-"*80) print("") read_data_time = datetime.now() codetext, valitext, bookranges = txt.read_data_files(args.input_dir, validation=args.novalidate, validation_percentage = args.validation_percentage) print_elapsed_time("reading data files", read_data_time) # display some stats on the data epoch_size = len(codetext) // (BATCHSIZE * SEQLEN) txt.print_data_stats(len(codetext), len(valitext), epoch_size) # # the model (see FAQ in README.md) # lr = tf.placeholder(tf.float32, name='lr') # learning rate pkeep = tf.placeholder(tf.float32, name='pkeep') # dropout parameter batchsize = tf.placeholder(tf.int32, name='batchsize') # inputs X = tf.placeholder(tf.uint8, [None, None], name='X') # [ BATCHSIZE, SEQLEN ]