def plr_slr(seq_len_list):
    """Given a list of sequence lengths, calculate the throughput of an
    LS-LSTM, an SRU, a QRNN(2), and a QRNN(10) using the parallel kernel
    (PLR) as opposed to the serial one (SLR). The batch size is chosen so
    that batch_size * seq_len == 65536 for every sequence length."""
    import tensorflow as tf
    import numpy as np
    import math
    import time
    from layers_new import linear_surrogate_lstm
    from layers_new import s_linear_surrogate_lstm
    from layers_new import SRU
    from layers_new import s_SRU
    from layers_new import QRNN
    from layers_new import s_QRNN

    throughput_list = []
    #TODO:
    #Make LS_LSTM with PLR
    #Make SRU with PLR
    #Make QRNN with PLR
    #Make LS_LSTM with SLR
    #Make SRU with SLR
    #Make QRNN with SLR
    for seq_len in seq_len_list:
        #First generate the LS-LSTM (PLR) and work out the throughput
        tf.reset_default_graph()
        n_hidden = 256
        n_classes = 2
        n_steps = seq_len
        batch_size = 65536 // seq_len  #Keep batch_size * seq_len constant
        bs = batch_size
        print("Batch size is {} and sequence length is {}".format(bs, seq_len))
        n_input = 24
        n_layers = 2
        forget_gate_init = 1.0  # = 1/(n_in). We use uniform p(x)

        #Training parameters
        sn = 1.0 / math.sqrt(n_hidden)
        learning_rate = 0.001
        training_iters = 5000000

        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_hidden, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = linear_surrogate_lstm(x, n_hidden, name='ls-lstm')
        outputs = linear_surrogate_lstm(layer1, n_hidden, name='ls-lstm2')
        pred = tf.matmul(outputs[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        ls_lstm_tp = (bs * n_steps) / np.mean(times)

        #Now the LS-LSTM with the serial kernel (SLR)
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_hidden, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = s_linear_surrogate_lstm(x, n_hidden, name='ls-lstm')
        output = s_linear_surrogate_lstm(layer1, n_hidden, name='ls-lstm2')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        s_ls_lstm_tp = (bs * n_steps) / np.mean(times)

        #SRU with the parallel kernel
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_input, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = SRU(x, name='SRU_1')
        output = SRU(layer1, name='SRU_2')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        sru_tp = (bs * n_steps) / np.mean(times)

        #SRU with the serial kernel
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_input, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = s_SRU(x, name='s_SRU_1')
        output = s_SRU(layer1, name='s_SRU_2')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        s_sru_tp = (bs * n_steps) / np.mean(times)

        #QRNN with filter width 2, parallel kernel
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_input, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = QRNN(x, 2, name='QRNN_1')
        output = QRNN(layer1, 2, name='QRNN_2')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        qrnn_2_tp = (bs * n_steps) / np.mean(times)

        #QRNN with filter width 2, serial kernel
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_input, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = s_QRNN(x, 2, name='s_QRNN_3')
        output = s_QRNN(layer1, 2, name='s_QRNN_4')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        s_qrnn_2_tp = (bs * n_steps) / np.mean(times)
        print(np.mean(times))
        print(np.std(times))

        #QRNN with filter width 10, parallel kernel
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_input, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = QRNN(x, 10, name='QRNN_2')
        output = QRNN(layer1, 10, name='QRNN_6')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        qrnn_10_tp = (bs * n_steps) / np.mean(times)

        #QRNN with filter width 10, serial kernel
        tf.reset_default_graph()
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_input, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = s_QRNN(x, 10, name='s_QRNN_7')
        output = s_QRNN(layer1, 10, name='s_QRNN_8')
        pred = tf.matmul(output[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and time forward passes
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    step += 1
                    start = time.time()
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    finish = time.time()
                    times.append(finish - start)
        s_qrnn_10_tp = (bs * n_steps) / np.mean(times)

        throughput_list.append([
            ls_lstm_tp, s_ls_lstm_tp, sru_tp, s_sru_tp, qrnn_2_tp,
            s_qrnn_2_tp, qrnn_10_tp, s_qrnn_10_tp
        ])
    return throughput_list
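
#The build-then-time loop above is repeated verbatim for each of the eight
#models. A minimal sketch of how it could be factored out is given below;
#`build_graph` is a hypothetical callable (not part of the original code)
#that constructs a model on a fresh graph and returns its (x, y, pred)
#tensors.
def _time_forward_passes(build_graph, n_steps, batch_size, n_input, n_classes,
                         n_iters=10):
    """Time n_iters forward passes of a freshly built model and return the
    throughput in elements per second, mirroring the loop in plr_slr."""
    import tensorflow as tf
    import numpy as np
    import time
    tf.reset_default_graph()
    x, y, pred = build_graph()
    init = tf.global_variables_initializer()
    x_in = np.random.random((n_steps, batch_size, n_input))
    y_in = np.random.random((batch_size, n_classes))
    times = []
    with tf.device("gpu:0"):
        with tf.Session() as sess:
            sess.run(init)
            for _ in range(n_iters):
                sess.run(pred, feed_dict={x: x_in, y: y_in})  #untimed pass
                start = time.time()
                sess.run(pred, feed_dict={x: x_in, y: y_in})
                times.append(time.time() - start)
    return (batch_size * n_steps) / np.mean(times)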
def random_test(bs_seq_len_list):
    """Given a list of pairs (batch_size, seq_len), calculate the throughput
    of an LS-LSTM vs a cuDNN LSTM on random data."""
    import tensorflow as tf
    import numpy as np
    import math
    import time
    from layers_new import linear_surrogate_lstm

    ls_lstm_throughput_dict = {}
    cudnn_throughput_dict = {}
    for bs, seq_len in bs_seq_len_list:
        #First generate the LS-LSTM and work out the throughput
        tf.reset_default_graph()
        n_hidden = 234
        n_classes = 2
        n_steps = seq_len
        batch_size = bs
        n_input = 4
        n_layers = 2
        forget_gate_init = 1.0  # = 1/(n_in). We use uniform p(x)
        sn = 1.0 / math.sqrt(n_hidden)

        #Training parameters
        learning_rate = 0.001
        training_iters = 5000000

        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])
        W1 = tf.get_variable('W1',
                             initializer=tf.random_normal([n_hidden, n_classes]),
                             dtype='float')
        b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                             dtype='float')
        layer1 = linear_surrogate_lstm(x, n_hidden, name='ls-lstm')
        outputs = linear_surrogate_lstm(layer1, n_hidden, name='ls-lstm2')
        pred = tf.matmul(outputs[-1], W1) + b1

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        #Initialise the model and evaluate
        step = 0
        times = []
        x_in = np.random.random((n_steps, batch_size, n_input))
        y_in = np.random.random((batch_size, n_classes))
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    #Do a few iters to warm up
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    if step > 3:
                        start = time.time()
                        out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                        finish = time.time()
                        times.append(finish - start)
                    step += 1
        ls_lstm_throughput_dict[(bs, n_steps)] = (bs * n_steps) / np.mean(times)

        #----------------------------------------------------------------------
        #Now we do the cuDNN LSTM
        tf.reset_default_graph()

        #Generate the lstm hook to CUDA
        model = tf.contrib.cudnn_rnn.CudnnLSTM(n_layers, n_hidden, n_input)

        # tf Graph input
        x = tf.placeholder("float", [n_steps, batch_size, n_input])
        y = tf.placeholder("float", [batch_size, n_classes])

        #Define weights & rnn initial states
        weights = {
            'out': tf.Variable(tf.random_normal([n_hidden, n_classes]),
                               dtype='float')
        }
        biases = {
            'out': tf.Variable(tf.random_normal([n_classes]), dtype='float')
        }
        #Initial state of the LSTM at each batch; we don't let this be trained.
        input_h = {
            'out': tf.Variable(tf.zeros([n_layers, batch_size, n_hidden]),
                               dtype='float', trainable=False)
        }
        input_c = {
            'out': tf.Variable(tf.zeros([n_layers, batch_size, n_hidden]),
                               dtype='float', trainable=False)
        }

        #Initialise all weights & biases for the cudnnlstm: set weights
        #according to Glorot. There are eight weights and eight biases per
        #layer in the LSTM, described in
        #http://docs.nvidia.com/deeplearning/sdk/cudnn-user-guide/index.html#cudnnRNNMode_t
        #There are two biases which sum to give the biases in the canonical
        #form of the LSTM. This seems redundant - I'm not sure why CUDA is
        #implemented in this way.
        weight_list = []
        bias_list = []
        for n in range(4):
            weight_list.append(np.float32(
                np.random.uniform(low=-sn, high=sn, size=[n_hidden, n_input])))
        for n in range(4, 8):
            weight_list.append(np.float32(
                np.random.uniform(low=-sn, high=sn, size=[n_hidden, n_hidden])))
        if n_layers == 2:
            for n in range(4):
                weight_list.append(np.float32(
                    np.random.uniform(low=-sn, high=sn,
                                      size=[n_hidden, n_hidden])))
            for n in range(4, 8):
                weight_list.append(np.float32(
                    np.random.uniform(low=-sn, high=sn,
                                      size=[n_hidden, n_hidden])))
        for n in range(8):
            bias_list.append(np.float32(np.zeros([n_hidden])))
        if n_layers == 2:
            for n in range(8):
                bias_list.append(np.float32(np.zeros([n_hidden])))
        #Set the recurrent forget-gate biases of both layers
        bias_list[13] = np.float32(forget_gate_init * np.ones([n_hidden]))
        bias_list[5] = np.float32(forget_gate_init * np.ones([n_hidden]))

        #Initialize the opaque parameter buffer used to handle the cudnnlstm
        #params. If we try to pass the canonical_to_params tensor through the
        #call graph, we fail because the size must be known statically. The
        #easiest way to get around this (though hacky) is to get the values
        #out by casting to an np array and then initialising a tensor with
        #those values.
        params_size_t = ((n_input * n_hidden * 4) +
                         (n_hidden * n_hidden * 4) +
                         (n_hidden * 2 * 4))  #Expected single-layer size (unused below)
        flat_params = model.canonical_to_params(weight_list, bias_list)
        flat_params_as_ndarray = tf.Session().run(flat_params)
        params = {
            'out': tf.get_variable('param_buffer',
                                   initializer=tf.constant(flat_params_as_ndarray))
        }

        #Generate network
        outputs, states1, states2 = model(is_training=True,
                                          input_data=x,
                                          input_h=input_h['out'],
                                          input_c=input_c['out'],
                                          params=params['out'])
        # Linear activation, using rnn inner loop on last output
        pred = tf.matmul(outputs[-1], weights['out']) + biases['out']

        #Evaluate network and run Adam
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
        optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
        raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
        gradients = raw_gradients
        optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
        init = tf.global_variables_initializer()

        step = 0
        times = []
        with tf.device("gpu:0"):
            with tf.Session() as sess:
                sess.run(init)
                while step < 10:
                    out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                    #Warm up with a few iters first
                    if step > 3:
                        start = time.time()
                        out = sess.run(pred, feed_dict={x: x_in, y: y_in})
                        finish = time.time()
                        times.append(finish - start)
                    step += 1
        cudnn_throughput_dict[(bs, n_steps)] = (bs * n_steps) / np.mean(times)
    return cudnn_throughput_dict, ls_lstm_throughput_dict
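
#The magic indices 5 and 13 in random_test pick out the recurrent
#forget-gate biases: cuDNN stores eight bias vectors per LSTM layer, four
#for the input weights and four for the recurrent weights, each group in
#(i, f, c, o) gate order per the cuDNN user guide. The helper below is not
#part of the original code; it just makes that arithmetic explicit.
def _recurrent_forget_bias_index(layer):
    """Index of a layer's recurrent forget-gate bias in the canonical cuDNN
    bias list: skip 8 biases per earlier layer, then the 4 input-weight
    biases, then the input gate."""
    return 8 * layer + 4 + 1

assert _recurrent_forget_bias_index(0) == 5
assert _recurrent_forget_bias_index(1) == 13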
def ls_lstm(n_steps=1024, n_hidden=1024, n_input=128, batch_size=8,
            n_layers=1, n_converge=5):
    import tensorflow as tf
    import numpy as np
    import math
    import time
    import os
    from layers_new import linear_surrogate_lstm

    #Network parameters
    tf.reset_default_graph()
    n_classes = 2
    sn = 1.0 / math.sqrt(n_hidden)  #Glorot initialisation, var(p(x))
    forget_gate_init = 5.0
    clip = 20  #We use gradient clipping to stop the gradient exploding
               #initially for the much larger networks

    #Training parameters
    learning_rate = 0.0001
    training_iters = 5000000
    display_step = 10
    id_num = np.random.uniform(0, 50)  #To distinguish from other runs of
                                       #identical models

    #Initialise variables
    # tf Graph input
    x = tf.placeholder("float", [n_steps, batch_size, n_input])
    y = tf.placeholder("float", [batch_size, n_classes])

    #Define weights & rnn initial states
    W1 = tf.get_variable('W1',
                         initializer=tf.random_normal([n_hidden, n_classes]),
                         dtype='float')
    b1 = tf.get_variable('b1', initializer=tf.zeros([n_classes]),
                         dtype='float')

    #Generate network
    layer1 = linear_surrogate_lstm(x, n_hidden, name='ls-lstm')
    outputs = linear_surrogate_lstm(layer1, n_hidden, name='ls-lstm2')
    # Linear activation, using rnn inner loop last output
    pred = tf.matmul(outputs[-1], W1) + b1

    #Evaluate network, run Adam and clip gradients
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
    optimizer_0 = tf.train.AdamOptimizer(learning_rate=learning_rate)
    raw_gradients, variables = zip(*optimizer_0.compute_gradients(cost))
    gradients, _ = tf.clip_by_global_norm(raw_gradients, clip)
    optimizer = optimizer_0.apply_gradients(zip(gradients, variables))
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('cost', cost)
    tf.summary.scalar('acc', accuracy)
    merged = tf.summary.merge_all()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    start = time.time()
    acc_list = [0] * n_converge
    if not os.path.exists('./LS_LSTM_' + str(n_steps) + '_steps_model_'):
        os.makedirs('./LS_LSTM_' + str(n_steps) + '_steps_model_')
    if not os.path.exists('./LS_LSTM_' + str(n_steps) + '_steps_log_'):
        os.makedirs('./LS_LSTM_' + str(n_steps) + '_steps_log_')

    with tf.device("gpu:0"):
        with tf.Session() as sess:
            sess.run(init)
            step = 1
            test_writer = tf.summary.FileWriter(
                './LS_LSTM_' + str(n_steps) + '_steps_log_', sess.graph)
            # Keep training until we reach max iterations
            while step * batch_size < training_iters:
                if batch_size == 1:
                    batch_x, batch_y = gen_2b_data_1(n_steps - 1, n_input - 1)
                else:
                    batch_x, batch_y = gen_2b_data(n_steps - 1, n_input - 1,
                                                   batch_size)
                # Run optimization op (backprop)
                sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
                if step % display_step == 0:
                    # Calculate batch accuracy and loss, and log a summary
                    acc = sess.run(accuracy,
                                   feed_dict={x: batch_x, y: batch_y})
                    loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
                    summary = sess.run(merged,
                                       feed_dict={x: batch_x, y: batch_y})
                    test_writer.add_summary(summary, step)
                    print("Iter " + str(step) + ", Minibatch Loss= " +
                          "{:.6f}".format(loss) + ", Training Accuracy= " +
                          "{:.5f}".format(acc))
                    if step % (display_step * 10) == 0:
                        #Save the model every so often
                        saver.save(sess,
                                   './LS_LSTM_' + str(n_steps) +
                                   '_steps_model_',
                                   global_step=step)
                    if acc_list == [1.0] * n_converge:
                        print("Converged after {} iterations and {} "
                              "seconds".format(step, time.time() - start))
                        break
                    else:
                        acc_list.append(acc)
                        acc_list.pop(0)
                step += 1
            print("Optimization Finished!")
    return step, time.time() - start
#Pre-activation params, specified separately so that the forget-gate bias
#can be set on its own:
# with tf.variable_scope('pre_fc'):
#     W = tf.get_variable('W',
#                         initializer=tf.random_uniform(
#                             [n_input + n_hidden, 4 * n_hidden],
#                             minval=-sn, maxval=sn),
#                         dtype='float')
#     init = tf.constant(forget_gate_init * np.ones((n_hidden)),
#                        dtype='float32')
#     f_bias = tf.get_variable('f_bias', initializer=init, dtype='float')
#     other_bias = tf.get_variable('other_bias',
#                                  initializer=tf.zeros([3 * n_hidden]),
#                                  dtype='float')
#     b = tf.get_variable('b',
#                         initializer=tf.concat([f_bias, other_bias], axis=0),
#                         dtype='float')
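
#A minimal usage sketch (not part of the original file). The argument values
#are illustrative assumptions; all three functions assume a GPU and the
#layers_new module on the path, and ls_lstm additionally needs the
#gen_2b_data / gen_2b_data_1 helpers defined elsewhere in this repo.
if __name__ == '__main__':
    #Parallel- vs serial-kernel throughput at a few sequence lengths
    print(plr_slr([256, 1024, 4096]))
    #LS-LSTM vs cuDNN LSTM throughput on random data
    print(random_test([(32, 256), (64, 512)]))
    #Train a small LS-LSTM to convergence; returns (iterations, seconds)
    print(ls_lstm(n_steps=128, n_hidden=128, n_input=24, batch_size=8))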