def compute_ler(self, decode_op_main, decode_op_sub, labels_main, labels_sub):
    """Operation for computing LER (Label Error Rate).
    Args:
        decode_op_main: operation for decoding of the main task
        decode_op_sub: operation for decoding of the sub task
        labels_main: A SparseTensor of target labels in the main task
        labels_sub: A SparseTensor of target labels in the sub task
    Returns:
        ler_op_main: operation for computing LER of the main task
        ler_op_sub: operation for computing LER of the sub task
    """
    # Compute LER (normalized by label length)
    ler_op_main = tf.reduce_mean(tf.edit_distance(
        decode_op_main, labels_main, normalize=True))
    ler_op_sub = tf.reduce_mean(tf.edit_distance(
        decode_op_sub, labels_sub, normalize=True))

    # Add scalar summaries for the snapshot of LER
    self.summaries_train.append(tf.summary.scalar(
        'ler_main_train', ler_op_main))
    self.summaries_train.append(tf.summary.scalar(
        'ler_sub_train', ler_op_sub))
    self.summaries_dev.append(tf.summary.scalar(
        'ler_main_dev', ler_op_main))
    self.summaries_dev.append(tf.summary.scalar(
        'ler_sub_dev', ler_op_sub))
    return ler_op_main, ler_op_sub
def _get_testing(rnn_logits, sequence_length, label, label_length):
    """Create ops for testing (all scalars):
       loss: CTC loss function value,
       label_error: batch-normalized edit distance on beam search max
       sequence_error: batch-normalized sequence error rate
    """
    with tf.name_scope("train"):
        loss = model.ctc_loss_layer(rnn_logits, label, sequence_length)
    with tf.name_scope("test"):
        predictions, _ = tf.nn.ctc_beam_search_decoder(rnn_logits,
                                                       sequence_length,
                                                       beam_width=128,
                                                       top_paths=1,
                                                       merge_repeated=True)
        hypothesis = tf.cast(predictions[0], tf.int32)  # for edit_distance
        label_errors = tf.edit_distance(hypothesis, label, normalize=False)
        sequence_errors = tf.count_nonzero(label_errors, axis=0)
        total_label_error = tf.reduce_sum(label_errors)
        total_labels = tf.reduce_sum(label_length)
        label_error = tf.truediv(total_label_error,
                                 tf.cast(total_labels, tf.float32),
                                 name='label_error')
        sequence_error = tf.truediv(tf.cast(sequence_errors, tf.int32),
                                    tf.shape(label_length)[0],
                                    name='sequence_error')
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('label_error', label_error)
        tf.summary.scalar('sequence_error', sequence_error)
    return loss, label_error, sequence_error
def _testEditDistanceST(self, hypothesis_st, truth_st, normalize,
                        expected_output, expected_shape,
                        expected_err_re=None):
    edit_distance = tf.edit_distance(
        hypothesis=hypothesis_st, truth=truth_st, normalize=normalize)

    if expected_err_re is None:
        self.assertEqual(edit_distance.get_shape(), expected_shape)
        output = edit_distance.eval()
        self.assertAllClose(output, expected_output)
    else:
        with self.assertRaisesOpError(expected_err_re):
            edit_distance.eval()
def test_edit_distance():
    graph = tf.Graph()
    with graph.as_default():
        truth = tf.sparse_placeholder(tf.int32)
        hyp = tf.sparse_placeholder(tf.int32)
        editDist = tf.edit_distance(hyp, truth, normalize=False)

    with tf.Session(graph=graph) as session:
        truthTest = sparse_tensor_feed([[0, 1, 2], [0, 1, 2, 3, 4]])
        hypTest = sparse_tensor_feed([[3, 4, 5], [0, 1, 2, 2]])
        feedDict = {truth: truthTest, hyp: hypTest}
        dist = session.run([editDist], feed_dict=feedDict)
        print(dist)
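# `sparse_tensor_feed` is not defined in the snippet above. A minimal sketch of
# such a helper, assuming it turns a list of label sequences into the
# (indices, values, dense_shape) triple that a tf.sparse_placeholder accepts:
import numpy as np

def sparse_tensor_feed(sequences):
    # One (batch, position) index per label value, in COO format.
    indices, values = [], []
    for batch_idx, seq in enumerate(sequences):
        for pos, val in enumerate(seq):
            indices.append([batch_idx, pos])
            values.append(val)
    dense_shape = [len(sequences), max(len(s) for s in sequences)]
    return (np.asarray(indices, dtype=np.int64),
            np.asarray(values, dtype=np.int32),
            np.asarray(dense_shape, dtype=np.int64))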
def _testEditDistance(self, hypothesis, truth, normalize, expected_output,
                      expected_err_re=None):
    # hypothesis and truth are (index, value, shape) tuples
    hypothesis_st = tf.SparseTensor(*[ConstantOf(x) for x in hypothesis])
    truth_st = tf.SparseTensor(*[ConstantOf(x) for x in truth])
    edit_distance = tf.edit_distance(
        hypothesis=hypothesis_st, truth=truth_st, normalize=normalize)

    with self.test_session():
        if expected_err_re is None:
            # Shape inference figures out the shape from the shape variables.
            # Note: zip() must be materialized before slicing in Python 3.
            expected_shape = [
                max(h, t) for h, t in zip(hypothesis[2], truth[2])
            ][:-1]
            self.assertEqual(edit_distance.get_shape(), expected_shape)
            output = edit_distance.eval()
            self.assertAllClose(output, expected_output)
        else:
            with self.assertRaisesOpError(expected_err_re):
                edit_distance.eval()
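# `ConstantOf` is likewise not shown above. A plausible sketch, assuming it
# merely wraps each (index, value, shape) component as a constant tensor:
import numpy as np

def ConstantOf(x):
    return tf.constant(np.asarray(x))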
def compute_ler(self, decode_op, labels):
    """Operation for computing LER (Label Error Rate).
    Args:
        decode_op: operation for decoding
        labels: A SparseTensor of target labels
    Returns:
        ler_op: operation for computing LER
    """
    # Compute LER (normalized by label length)
    ler_op = tf.reduce_mean(tf.edit_distance(
        decode_op, labels, normalize=True))

    # Add a scalar summary for the snapshot of LER
    self.summaries_train.append(tf.summary.scalar('ler_train', ler_op))
    self.summaries_dev.append(tf.summary.scalar('ler_dev', ler_op))
    return ler_op
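# A minimal sketch of how a `decode_op` for compute_ler is typically produced
# (the names `logits` and `seq_len` are assumptions, not from the original):
# the CTC decoder returns int64 SparseTensors, so cast to match int32 labels.
decoded, _ = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)
decode_op = tf.cast(decoded[0], tf.int32)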
def sequence_edit_distance(predictions,
                           labels,
                           weights_fn=common_layers.weights_nonzero):
    """Average edit distance, ignoring padding 0s.

    The score returned is the edit distance divided by the total length of
    reference truth, and the weight returned is the total length of the truth.

    Args:
        predictions: Tensor of shape [`batch_size`, `length`, 1, 1,
            `num_classes`] and type tf.float32 representing the logits,
            0-padded.
        labels: Tensor of shape [`batch_size`, `length`, 1, 1] and type
            tf.int32 representing the labels of same length as logits and
            0-padded.
        weights_fn: ignored. The weights returned are the total length of the
            ground truth labels, excluding 0-paddings.

    Returns:
        (edit distance / reference length, reference length)

    Raises:
        ValueError: if weights_fn is not common_layers.weights_nonzero.
    """
    if weights_fn is not common_layers.weights_nonzero:
        raise ValueError("Only weights_nonzero can be used for this metric.")

    with tf.variable_scope("edit_distance", values=[predictions, labels]):
        # Transform logits into sequence classes by taking max at every step.
        predictions = tf.to_int32(
            tf.squeeze(tf.argmax(predictions, axis=-1), axis=(2, 3)))
        nonzero_idx = tf.where(tf.not_equal(predictions, 0))
        sparse_outputs = tf.SparseTensor(
            nonzero_idx,
            tf.gather_nd(predictions, nonzero_idx),
            tf.shape(predictions, out_type=tf.int64))
        labels = tf.squeeze(labels, axis=(2, 3))
        nonzero_idx = tf.where(tf.not_equal(labels, 0))
        label_sparse_outputs = tf.SparseTensor(
            nonzero_idx,
            tf.gather_nd(labels, nonzero_idx),
            tf.shape(labels, out_type=tf.int64))
        distance = tf.reduce_sum(
            tf.edit_distance(sparse_outputs, label_sparse_outputs,
                             normalize=False))
        reference_length = tf.to_float(common_layers.shape_list(nonzero_idx)[0])
        return distance / reference_length, reference_length
def loss(self):
    """
    Define the loss.
    :return:
    """
    # Use CTC loss
    with tf.name_scope('loss'):  # loss
        self.avg_loss = tf.reduce_mean(
            ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
        tf.summary.scalar('loss', self.avg_loss)
    # [optimizer]
    with tf.name_scope('train'):  # training step
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(self.avg_loss)
    with tf.name_scope("decode"):
        self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            self.logits, self.seq_length, merge_repeated=False)
    with tf.name_scope("accuracy"):
        self.distance = tf.edit_distance(
            tf.cast(self.decoded[0], tf.int32), self.text)
        # Compute the label error rate (accuracy)
        self.label_err = tf.reduce_mean(self.distance,
                                        name='label_error_rate')
        tf.summary.scalar('accuracy', self.label_err)
def compute_ler(self, labels_true, labels_pred):
    """Operation for computing LER (Label Error Rate).
    Args:
        labels_true: A SparseTensor of target labels
        labels_pred: A SparseTensor of predicted labels
    Returns:
        ler_op: operation for computing LER
    """
    # Compute LER (normalized by label length)
    ler_op = tf.reduce_mean(tf.edit_distance(
        labels_pred, labels_true, normalize=True))
    # TODO: consider <EOS>

    # Add a scalar summary for the snapshot of LER
    # with tf.name_scope("ler"):
    #     self.summaries_train.append(tf.summary.scalar(
    #         'ler_train', ler_op))
    #     self.summaries_dev.append(tf.summary.scalar(
    #         'ler_dev', ler_op))
    # TODO: this errors out because the feed_dict timing differs;
    #       should it run before global_step is updated?
    return ler_op
def _build_graph(self, inputs):
    feat, labelidx, labelvalue, labelshape, seqlen = inputs
    label = tf.SparseTensor(labelidx, labelvalue, labelshape)

    cell = tf.contrib.rnn.BasicLSTMCell(num_units=HIDDEN)
    cell = tf.contrib.rnn.MultiRNNCell([cell] * NLAYER)

    initial = cell.zero_state(tf.shape(feat)[0], tf.float32)
    outputs, last_state = tf.nn.dynamic_rnn(cell, feat, seqlen, initial,
                                            dtype=tf.float32, scope='rnn')

    # o: b x t x HIDDEN
    output = tf.reshape(outputs, [-1, HIDDEN])  # (Bxt) x rnnsize
    logits = FullyConnected('fc', output, NR_CLASS, nl=tf.identity,
                            W_init=tf.truncated_normal_initializer(stddev=0.01))
    logits = tf.reshape(logits, (BATCH, -1, NR_CLASS))

    loss = tf.nn.ctc_loss(label, logits, seqlen, time_major=False)
    self.cost = tf.reduce_mean(loss, name='cost')

    logits = tf.transpose(logits, [1, 0, 2])
    isTrain = get_current_tower_context().is_training
    if isTrain:
        # beam search is too slow to run during training
        predictions = tf.to_int32(
            tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0])
    else:
        predictions = tf.to_int32(
            tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0])
    err = tf.edit_distance(predictions, label, normalize=True)
    err.set_shape([None])
    err = tf.reduce_mean(err, name='error')
    summary.add_moving_summary(err, self.cost)
def runCTC(batch):
    INPUT_PATH = '../TRAIN/All/mfcc/'  # directory of MFCC nFeatures x nFrames 2-D array .npy files
    TARGET_PATH = '../TRAIN/All/phone_y/'  # directory of nPhonemes 1-D array .npy files

    #### Learning parameters
    learningRate = 0.001
    momentum = 0.9
    nEpochs = 300
    batchSize = batch.shape[1]

    #### Network parameters
    nFeatures = 39  # 12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    nClasses = 30  # 39 phonemes, plus the "blank" for CTC

    #### Load data
    print('Loading data')
    with open('TIMIT_data_prepared_for_CTC.pkl', 'rb') as f:
        data = pickle.load(f)
    input_list = batch
    charmap = data['chars']
    print(charmap)
    charmap.append('_')
    # batchedData, maxTimeSteps = data_lists_to_batches(input_list, target_list, batchSize)
    maxTimeSteps = 776
    totalN = len(input_list)

    #### Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():
        #### NOTE: try variable-steps inputs and dynamic bidirectional rnn,
        #### when it's implemented in tensorflow

        #### Graph input
        inputX = tf.placeholder(tf.float32,
                                shape=(maxTimeSteps, batchSize, nFeatures))
        # Prep input data to fit requirements of rnn.bidirectional_rnn:
        # reshape to 2-D tensor (nTimeSteps*batchSize, nFeatures)
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        # Split to get a list of 'n_steps' tensors of shape (batch_size, n_hidden)
        inputList = tf.split(0, maxTimeSteps, inputXrs)
        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=(batchSize))

        #### Weights & biases
        weightsOutH1 = tf.Variable(
            tf.truncated_normal([2, nHidden], stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        weightsOutH2 = tf.Variable(
            tf.truncated_normal([2, nHidden], stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(
            tf.truncated_normal([nHidden, nClasses], stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        #### Network
        forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList,
                                       dtype=tf.float32, scope='BDLSTM_H1')
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) + biasesOutH1
                 for t in fbH1rs]
        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        #### Optimizing
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

        #### Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
        predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(
            tf.edit_distance(predictions, targetY, normalize=False)) / \
            tf.to_float(tf.size(targetY.values))

    #### Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        feedDict = {inputX: batch, seqLengths: (np.ones([batchSize]) * 776)}
        logit = session.run([logits3d], feed_dict=feedDict)
    return logit
def create_cer(sparse_decoded, sparse_targets):
    return tf.edit_distance(tf.cast(sparse_decoded, tf.int32),
                            sparse_targets, normalize=True)
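# Usage sketch for create_cer, assuming a time-major `logits` tensor, a length
# vector `seq_len`, and an int32 SparseTensor `sparse_targets` already exist:
decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)
cer_per_example = create_cer(decoded[0], sparse_targets)  # shape [batch]
mean_cer = tf.reduce_mean(cer_per_example)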
import tensorflow as tf

sess = tf.Session()

#----------------------------------
# First compute the edit distance between 'bear' and 'beers'
hypothesis = list('bear')
truth = list('beers')
# Note: the dense shapes must cover the indices ([1, 1, 1] would be too small).
h1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3]],
                     hypothesis,
                     [1, 1, 4])
t1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4]],
                     truth,
                     [1, 1, 5])
print(sess.run(tf.edit_distance(h1, t1, normalize=False)))  # [[2.]]

#----------------------------------
# Compute the edit distance between ('bear', 'beer') and 'beers':
hypothesis2 = list('bearbeer')
truth2 = list('beersbeers')
h2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3]],
                     hypothesis2,
                     [1, 2, 4])
t2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3], [0, 1, 4]],
                     truth2,
                     [1, 2, 5])
print(sess.run(tf.edit_distance(h2, t2, normalize=True)))  # [[0.4 0.2]]

#----------------------------------
outputs = tf.reshape(outputs1, [-1, num_hidden])
logits0 = tf.matmul(tf.nn.dropout(outputs, keep_prob), W) + b
logits1 = tf.reshape(logits0, [batch_s, -1, num_classes])
logits = tf.transpose(logits1, (1, 0, 2))
logits = tf.cast(logits, tf.float32)

loss = tf.nn.ctc_loss(labels, logits, seq_len)
cost = tf.reduce_mean(loss)

width1_decoded, width1_log_prob = tf.nn.ctc_beam_search_decoder(
    logits, seq_len, merge_repeated=False, beam_width=1)
decoded, log_prob = tf.nn.ctc_beam_search_decoder(
    logits, seq_len, merge_repeated=False)
width1_acc = tf.reduce_mean(
    tf.edit_distance(tf.cast(width1_decoded[0], tf.int32), labels))
acc = tf.reduce_mean(
    tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))

saver = tf.train.Saver(max_to_keep=1)

result = 0
imgFiles = glob.glob(os.path.join("test_img", "*"))
imgFiles.sort()
txtFiles = glob.glob(os.path.join("test_txt", "*"))
txtFiles.sort()
for i in range(len(imgFiles)):
    goldLines = []
    fin = open(txtFiles[i])
    lines = fin.readlines()
    fin.close()
    for j in range(len(lines)):
        goldLines.append(lines[j])
def train():
    global_step = tf.Variable(0, trainable=False)
    # learning_rate = tf.train.exponential_decay(LEARNING_RATE_INITIAL,
    #                                            global_step,
    #                                            LEARNING_RATE_DECAY_STEPS,
    #                                            LEARNING_RATE_DECAY_FACTOR,
    #                                            staircase=True, name="learning_rate")
    # Decided that a hand-tuned learning rate is more reliable.
    curr_learning_rate = 1e-5
    learning_rate = tf.placeholder(tf.float32, shape=[])

    logits, inputs, labels, seq_len, keep_prob = neural_networks()

    # If time_major == True (default), this will be a Tensor shaped:
    # [max_time x batch_size x num_classes].
    # Returns a 1-D float Tensor, size [batch], containing the negative
    # log probabilities.
    loss = tf.nn.ctc_loss(labels=labels, inputs=logits, sequence_length=seq_len)
    cost = tf.reduce_mean(loss, name="cost")

    # Converged poorly:
    # optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=MOMENTUM).minimize(cost, global_step=global_step)

    # Gradient clipping to [-1, 1] (didn't seem to help much either):
    # grads_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # grads_and_vars = grads_optimizer.compute_gradients(loss)
    # capped_grads_and_vars = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in grads_and_vars]
    # gradients, variables = zip(*grads_optimizer.compute_gradients(loss))
    # gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    # capped_grads_and_vars = zip(gradients, variables)
    # capped_grads_and_vars = [(tf.clip_by_norm(g, 5), v) for g, v in grads_and_vars]
    # optimizer = grads_optimizer.apply_gradients(capped_grads_and_vars, global_step=global_step)

    # Minimize the loss
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cost, global_step=global_step)

    # The ctc_greedy_decoder is a special case of the ctc_beam_search_decoder
    # with top_paths=1 (but that decoder is faster for this special case).
    # decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len, merge_repeated=False)
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len,
                                                      beam_width=10,
                                                      merge_repeated=False)
    # decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)

    acc = tf.reduce_sum(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), labels, normalize=False))
    acc = 1 - acc / tf.to_float(tf.size(labels.values))

    init = tf.global_variables_initializer()

    def report_accuracy(decoded_list, test_labels):
        original_list = decode_sparse_tensor(test_labels)
        detected_list = decode_sparse_tensor(decoded_list)
        if len(original_list) != len(detected_list):
            print("len(original_list)", len(original_list),
                  "len(detected_list)", len(detected_list),
                  " test and detect length doesn't match")
        print("T/F: original(length) <-------> detected(length)")
        _acc = 0.
        for idx in range(min(len(original_list), len(detected_list))):
            number = original_list[idx]
            detect_number = detected_list[idx]
            hit = (number == detect_number)
            print("%6s" % hit, list_to_chars(number), "(", len(number), ")")
            print("%6s" % "", list_to_chars(detect_number), "(", len(detect_number), ")")
            # Compute the Levenshtein ratio
            import Levenshtein
            _acc += Levenshtein.ratio(list_to_chars(number),
                                      list_to_chars(detect_number))
        print("Test Accuracy:", _acc / len(original_list))

    def do_report():
        test_inputs, test_labels, test_seq_len = get_next_batch(TEST_BATCH_SIZE)
        test_feed = {
            inputs: test_inputs,
            labels: test_labels,
            seq_len: test_seq_len,
            keep_prob: 1.0
        }
        dd = session.run(decoded[0], test_feed)
        report_accuracy(dd, test_labels)

    def restore(sess):
        curr_dir = os.path.dirname(__file__)
        model_dir = os.path.join(curr_dir, "model_ascii_res_lstm")
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)
        saver_prefix = os.path.join(model_dir, "model.ckpt")
        ckpt = tf.train.get_checkpoint_state(model_dir)
        saver = tf.train.Saver(max_to_keep=5)
        if ckpt and ckpt.model_checkpoint_path:
            print("Restore Model ...")
            saver.restore(sess, ckpt.model_checkpoint_path)
        return saver, model_dir, saver_prefix

    with tf.Session() as session:
        session.run(init)
        saver, model_dir, checkpoint_path = restore(session)  # tf.train.Saver(tf.global_variables(), max_to_keep=100)
        while True:
            train_cost = 0
            for batch in range(BATCHES):
                start = time.time()
                train_inputs, train_labels, train_seq_len = get_next_batch(BATCH_SIZE)
                feed = {
                    inputs: train_inputs,
                    labels: train_labels,
                    seq_len: train_seq_len,
                    keep_prob: 0.95,
                    learning_rate: curr_learning_rate
                }
                # l = session.run(layer, feed)
                # print(train_inputs.shape)
                # print(l.shape)
                # print(train_seq_len[0])
                b_acc, b_loss, b_labels, b_logits, b_seq_len, b_cost, steps, b_learning_rate, _ = \
                    session.run([acc, loss, labels, logits, seq_len, cost,
                                 global_step, learning_rate, optimizer], feed)
                train_cost += b_cost * BATCH_SIZE
                seconds = round(time.time() - start, 2)
                print("step:", steps, "cost:", b_cost, "batch seconds:", seconds,
                      "acc:", b_acc, "width:", train_seq_len[0])
                if np.isnan(b_cost) or np.isinf(b_cost):
                    print("Error: cost is nan or inf")
                    train_labels_list = decode_sparse_tensor(train_labels)
                    for i, train_label in enumerate(train_labels_list):
                        print(i, list_to_chars(train_label))
                    return
                if seconds > 60:
                    print('Exit for long time')
                    return
                if steps > 0 and steps % REPORT_STEPS == 0:
                    do_report()
                    saver.save(session, checkpoint_path, global_step=steps)
with tf.name_scope('decoder'):
    decode, log_prob = tf.nn.ctc_beam_search_decoder(
        inputs=logits, sequence_length=seq_len, merge_repeated=True)

targets = tf.sparse_placeholder(tf.int32, [None, None], name="target")

with tf.name_scope('loss'):
    ctc_loss = tf.nn.ctc_loss(labels=targets, inputs=logits,
                              sequence_length=seq_len)
    avg_loss = tf.reduce_mean(ctc_loss)
    tf.summary.histogram("avg_loss", avg_loss)

with tf.name_scope('accuracy'):
    distance = tf.edit_distance(tf.cast(decode[0], tf.int32), targets)
    ler = tf.reduce_mean(distance, name='label_error_rate')

with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(learning_rate=alpha, beta1=beta1,
                                       beta2=beta2, epsilon=epsilon)
    optimizer = optimizer.minimize(avg_loss)

elapsed_time = timer() - start
print("Elapsed time : " + str(elapsed_time))


def run_model(sess, client):
def train_shadownet(dataset_dir, weights_path=None):
    """
    :param dataset_dir:
    :param weights_path:
    :return:
    """
    # decode the tf records to get the training data
    decoder = data_utils.TextFeatureIO().reader
    images, labels, imagenames = decoder.read_features(
        ops.join(dataset_dir, 'train_feature.tfrecords'), num_epochs=None)
    inputdata, input_labels, input_imagenames = tf.train.shuffle_batch(
        tensors=[images, labels, imagenames], batch_size=32,
        capacity=1000 + 2 * 32, min_after_dequeue=100, num_threads=1)

    inputdata = tf.cast(x=inputdata, dtype=tf.float32)

    # initialize the net model
    shadownet = crnn_model.ShadowNet(phase='Train', hidden_nums=256,
                                     layers_nums=2, seq_length=25,
                                     num_classes=37)

    with tf.variable_scope('shadow', reuse=False):
        net_out = shadownet.build_shadownet(inputdata=inputdata)

    cost = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out,
                                         sequence_length=25 * np.ones(32)))

    decoded, log_prob = tf.nn.ctc_beam_search_decoder(net_out,
                                                      25 * np.ones(32),
                                                      merge_repeated=False)

    sequence_dist = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), input_labels))

    global_step = tf.Variable(0, name='global_step', trainable=False)

    starter_learning_rate = config.cfg.TRAIN.LEARNING_RATE
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               config.cfg.TRAIN.LR_DECAY_STEPS,
                                               config.cfg.TRAIN.LR_DECAY_RATE,
                                               staircase=True)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=learning_rate).minimize(loss=cost,
                                                  global_step=global_step)

    # Set tf summary
    tboard_save_path = 'tboard/shadownet'
    if not ops.exists(tboard_save_path):
        os.makedirs(tboard_save_path)
    tf.summary.scalar(name='Cost', tensor=cost)
    tf.summary.scalar(name='Learning_Rate', tensor=learning_rate)
    tf.summary.scalar(name='Seq_Dist', tensor=sequence_dist)
    merge_summary_op = tf.summary.merge_all()

    # Set saver configuration
    saver = tf.train.Saver()
    model_save_dir = 'model/shadownet'
    if not ops.exists(model_save_dir):
        os.makedirs(model_save_dir)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'shadownet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # Set sess configuration
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.per_process_gpu_memory_fraction = config.cfg.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = config.cfg.TRAIN.TF_ALLOW_GROWTH

    sess = tf.Session(config=sess_config)

    summary_writer = tf.summary.FileWriter(tboard_save_path)
    summary_writer.add_graph(sess.graph)

    # Set the training parameters
    train_epochs = config.cfg.TRAIN.EPOCHS

    with sess.as_default():
        if weights_path is None:
            logger.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            logger.info('Restore model from {:s}'.format(weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        for epoch in range(train_epochs):
            _, c, seq_distance, preds, gt_labels, summary = sess.run(
                [optimizer, cost, sequence_dist, decoded, input_labels,
                 merge_summary_op])

            # calculate the precision
            preds = decoder.sparse_tensor_to_str(preds[0])
            gt_labels = decoder.sparse_tensor_to_str(gt_labels)

            accuracy = []
            for index, gt_label in enumerate(gt_labels):
                pred = preds[index]
                total_count = len(gt_label)
                correct_count = 0
                try:
                    for i, tmp in enumerate(gt_label):
                        if tmp == pred[i]:
                            correct_count += 1
                except IndexError:
                    continue
                finally:
                    try:
                        accuracy.append(correct_count / total_count)
                    except ZeroDivisionError:
                        if len(pred) == 0:
                            accuracy.append(1)
                        else:
                            accuracy.append(0)
            accuracy = np.mean(np.array(accuracy).astype(np.float32), axis=0)

            # if epoch % config.cfg.TRAIN.DISPLAY_STEP == 0:
            logger.info('Epoch: {:d} cost= {:9f} seq distance= {:9f} train accuracy= {:9f}'.format(
                epoch + 1, c, seq_distance, accuracy))

            summary_writer.add_summary(summary=summary, global_step=epoch)
            saver.save(sess=sess, save_path=model_save_path, global_step=epoch)

        coord.request_stop()
        coord.join(threads=threads)

    sess.close()
    return
def build(self, ctc_beam_search=False, decay_steps=8000, decay_rate=0.7):
    '''Build all necessary ops into the object's tensorflow graph'''
    if self.built:
        raise RuntimeError("Graph has already been built! Please reset.")

    self.rate = tf.placeholder(tf.float32, shape=[])
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(self.rate, global_step,
                                               decay_steps, decay_rate,
                                               staircase=False)

    self.features = tf.placeholder(tf.float32, [None, None])
    self.speaker = tf.placeholder(tf.int32, [None])
    self.targets = tf.sparse_placeholder(tf.int32)

    n_windows = tf.shape(self.features)[0] - self.n_frames
    c_logit_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    i_logit_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    d_vec_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    window_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    loop_vars = [
        0, n_windows, self.features, c_logit_array, i_logit_array,
        window_array, d_vec_array
    ]

    # define loop that applies the feedforward model over a frame sequence
    def cond(t, t_stop, *args):
        # stop iterating when the full frame sequence has been encoded
        return t < t_stop

    def body(t, t_stop, features, char_logits, id_logits, windows, d_vecs):
        n_per_branch = self.n_layers - self.n_shared
        shared = ['shared_layer_' + str(n) for n in range(self.n_shared)]
        char_scopes = ['char_layer_' + str(n) for n in range(n_per_branch)]
        id_scopes = ['id_layer_' + str(n) for n in range(n_per_branch)]
        char_out_scope = 'char_output'
        id_out_scope = 'id_output'

        # slice window out of feature array and flatten it
        inp = self.features[t:t + self.n_frames, :]
        windows = windows.write(t, tf.reshape(inp, [1, self.size_in]))
        x = tf.reshape(inp, [1, self.size_in])

        # build and stack shared feedforward layers
        for i, scope in enumerate(shared):
            size_in = self.size_in if i < 1 else self.n_per_layer
            x = self.ff_layer(x, size_in, self.n_per_layer, scope)
        if n_per_branch > 0:
            x_char = self.build_ff_branch(x, char_scopes)
            x_id = self.build_ff_branch(x, id_scopes)
        else:
            x_char = x
            x_id = x

        # build output layers for each task
        char_out = self.ff_layer(x_char, self.n_per_layer, self.n_chars,
                                 char_out_scope, logits=True)
        id_out = self.ff_layer(x_id, self.n_per_layer, self.n_speakers,
                               id_out_scope, logits=True)

        # accumulate logit values for each window
        char_logits = char_logits.write(t, char_out)
        id_logits = id_logits.write(t, id_out)
        # accumulate ID d-vectors
        d_vecs = d_vecs.write(t, x_id)
        return [
            t + 1, t_stop, features, char_logits, id_logits, windows, d_vecs
        ]

    # note that because there are no dependencies between time steps, we
    # can run the loop iterations in parallel (doesn't make much of a diff)
    loop_output = tf.while_loop(cond, body, loop_vars, parallel_iterations=20)

    # use squeeze to create 2D instead of 3D arrays
    self.c_logits = loop_output[3].stack()  # can't squeeze b/c ctc loss
    self.i_logits = tf.squeeze(loop_output[4].stack())
    self.all_windows = tf.squeeze(loop_output[5].stack())
    self.d_vecs = tf.squeeze(loop_output[6].stack())

    n_windows = tf.expand_dims(n_windows, 0)
    char_loss = tf.nn.ctc_loss(self.targets, self.c_logits, n_windows,
                               preprocess_collapse_repeated=False,
                               ctc_merge_repeated=True,
                               ignore_longer_outputs_than_inputs=True,
                               time_major=True)
    id_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.i_logits, labels=self.speaker)

    # TODO: figure out a good weighting scheme for combining losses
    self.cost = tf.reduce_sum(id_loss) + tf.reduce_sum(char_loss)

    # build the loss and an op for doing parameter updates
    tvars = tf.trainable_variables()
    grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(grads, 5.0)  # avoid explosions
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    self.train_step = optimizer.apply_gradients(zip(grads, tvars),
                                                global_step=global_step)

    self.speaker_decode = tf.argmax(self.i_logits, axis=-1,
                                    output_type=tf.int32)
    if ctc_beam_search:
        self.char_decode, _ = tf.nn.ctc_beam_search_decoder(
            self.c_logits, n_windows)
    else:
        self.char_decode, _ = tf.nn.ctc_greedy_decoder(
            self.c_logits, n_windows)
    self.ler = tf.reduce_mean(
        tf.edit_distance(tf.cast(self.char_decode[0], tf.int32),
                         self.targets))
    self.built = True
def build_graph(self, args, maxTimeSteps):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.inputX = tf.placeholder(
            tf.float32,
            shape=(maxTimeSteps, args.batch_size, args.num_feature))  # [maxL, 32, 39]
        inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
        # self.inputList = tf.split(0, maxTimeSteps, inputXrs)
        # convert inputXrs from [32*maxL, 39] to [32, maxL, 39]
        self.inputList = tf.split(inputXrs, maxTimeSteps, 0)
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                       self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
        self.config = {
            'name': args.model,
            'rnncell': self.cell_fn,
            'num_layer': args.num_layer,
            'num_hidden': args.num_hidden,
            'num_class': args.num_class,
            'activation': args.activation,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate,
            'keep prob': args.keep_prob,
            'batch size': args.batch_size
        }

        fbHrs = build_multi_dynamic_brnn(self.args, maxTimeSteps, self.inputX,
                                         self.cell_fn, self.seqLengths)
        with tf.name_scope('fc-layer'):
            with tf.variable_scope('fc'):
                weightsClasses = tf.Variable(
                    tf.truncated_normal([args.num_hidden, args.num_class],
                                        name='weightsClasses'))
                biasesClasses = tf.Variable(tf.zeros([args.num_class]),
                                            name='biasesClasses')
                logits = [
                    tf.matmul(t, weightsClasses) + biasesClasses for t in fbHrs
                ]
        logits3d = tf.stack(logits)
        self.loss = tf.reduce_mean(
            tf.nn.ctc_loss(self.targetY, logits3d, self.seqLengths))
        self.var_op = tf.global_variables()
        self.var_trainable_op = tf.trainable_variables()

        if args.grad_clip == -1:
            # do not apply gradient clipping
            self.optimizer = tf.train.AdamOptimizer(
                args.learning_rate).minimize(self.loss)
        else:
            # apply gradient clipping
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.loss, self.var_trainable_op),
                args.grad_clip)
            opti = tf.train.AdamOptimizer(args.learning_rate)
            self.optimizer = opti.apply_gradients(
                zip(grads, self.var_trainable_op))
        self.predictions = tf.to_int32(
            tf.nn.ctc_beam_search_decoder(logits3d, self.seqLengths,
                                          merge_repeated=False)[0][0])
        if args.level == 'cha':
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions, self.targetY,
                                 normalize=True))
        self.initial_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5,
                                    keep_checkpoint_every_n_hours=1)
        self.logfile = args.log_dir + str(
            datetime.datetime.strftime(datetime.datetime.now(),
                                       '%Y-%m-%d %H:%M:%S') +
            '.txt').replace(' ', '').replace('/', '')
def __init__(self, stochastic=False, use_slope=True,
             variational_dropout=False, vocabularySize=283, label_size=50,
             rnnSize=256, n_layers=3, dropout=0.5, zoneout=0.1,
             embedding_size=None, dtype=tf.float32, clip=0.35, k_width=3,
             name='hlstm', conv_filter=3, mid_filter=25, batch_size=128):
    self.rnnSize = rnnSize
    self.vocabularySize = vocabularySize
    self.outputSize = label_size
    self.stochastic = stochastic
    self.dtype = dtype
    self.dropout = dropout
    self.n_layers = n_layers
    self.clip = clip
    self.name = name
    self.use_slope = use_slope
    self.zoneout = zoneout
    self.batch_size = batch_size
    self.k_width = k_width
    self.conv_filter = conv_filter
    self.mid_filter = mid_filter
    f_bias = 0.0

    # placeholders
    self.x = tf.placeholder(tf.float32, [None, None, 40, 3], name='x')  # [batch, seq_len]
    self.label = tf.sparse_placeholder(tf.int32, name='label')  # [batch, seq_len]
    self.seq_len = tf.placeholder(tf.int32, [None], name='seq_len')  # [batch_size]
    self.is_train = tf.placeholder(tf.bool, [], name='train')
    self.lr = tf.placeholder(tf.float32, [], name='lr')

    dropout_p = tf.where(self.is_train, self.dropout, 1.0)
    dropout_p = tf.cast(dropout_p, dtype=self.dtype)

    # LSTM layers
    self.lstm_cells = []

    conv_filter_size = (self.conv_filter, self.conv_filter)
    h = tf.layers.conv2d(self.x, 32, conv_filter_size, (2, 2), 'same',
                         use_bias=False, name='conv0')
    h = tf.contrib.layers.batch_norm(h, center=True, scale=True,
                                     is_training=self.is_train, decay=0.9,
                                     epsilon=1e-3, scope='bn0')
    h = tf.nn.tanh(h, name='tanh0')

    h = tf.layers.conv2d(h, 32, conv_filter_size, (1, 2), 'same',
                         use_bias=False, name='conv1')
    h = tf.contrib.layers.batch_norm(h, center=True, scale=True,
                                     is_training=self.is_train, decay=0.9,
                                     epsilon=1e-3, scope='bn1')
    h = tf.nn.tanh(h, name='tanh1')

    time_convolution = 2
    _seq_len_char = self.seq_len
    _seq_len_word = tf.div(self.seq_len, 2)

    # reshape
    # ([0]: batch_size, [1]: seq_len, [2]*[3]: feature dimension)
    h_shape = tf.shape(h)
    h = tf.reshape(h, [h_shape[0], h_shape[1], 320])

    cell = LSTMCell(self.rnnSize,
                    initializer=tf.variance_scaling_initializer(
                        1.0, 'fan_out', 'uniform'))
    # cell = LSTMCell(self.rnnSize)
    cell = DropoutWrapper(cell, output_keep_prob=dropout_p,
                          variational_recurrent=variational_dropout,
                          dtype=self.dtype)
    self.lstm_cells.append(cell)

    for i in range(self.n_layers - 1):
        cell = LSTMCell(self.rnnSize,
                        initializer=tf.variance_scaling_initializer(
                            1.0, 'fan_out', 'uniform'),
                        forget_bias=f_bias)
        # cell = SRUCell(self.rnnSize, initializer=tf.variance_scaling_initializer(1.0, 'fan_out', 'uniform'))
        cell = DropoutWrapper(cell, output_keep_prob=dropout_p,
                              variational_recurrent=variational_dropout,
                              dtype=self.dtype)
        # cell = HLSTMCell(self.rnnSize)
        self.lstm_cells.append(cell)

    # self.h = []
    # self.gate = []
    with tf.variable_scope('lstm0'):
        _h, last_state = tf.nn.dynamic_rnn(cell=self.lstm_cells[0], inputs=h,
                                           dtype=self.dtype)
    lstm_input = _h
    for i in range(1, self.n_layers):
        with tf.variable_scope('lstm' + str(i)):
            output, last_state = tf.nn.dynamic_rnn(cell=self.lstm_cells[i],
                                                   inputs=lstm_input,
                                                   dtype=self.dtype)
            lstm_input = output
            if i == self.n_layers - 3:
                character_h = output
                lstm_input = tf.expand_dims(lstm_input, -2)
                conv_filter = tf.get_variable(
                    'lstm_time_conv_filter',
                    shape=[time_convolution, 1, self.rnnSize, 1],
                    trainable=True)
                lstm_input = tf.nn.depthwise_conv2d(
                    lstm_input, conv_filter,
                    [1, time_convolution, time_convolution, 1],
                    padding='SAME', name='lstm_time_conv')
                lstm_input = tf.squeeze(lstm_input, axis=[-2])
    lstm_output = lstm_input

    # character-ctc layer
    h_shape = tf.shape(character_h)
    output_h = tf.reshape(character_h, [-1, self.rnnSize])
    print(output_h)
    with tf.variable_scope('dense_character'):
        dense = tf.layers.dense(
            output_h, self.outputSize,
            kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    self.char_logit = tf.reshape(dense,
                                 [h_shape[0], h_shape[1], self.outputSize])
    self.char_loss = tf.nn.ctc_loss(inputs=self.char_logit,
                                    labels=self.label,
                                    sequence_length=_seq_len_char,
                                    time_major=False)
    self.char_loss = tf.reduce_mean(self.char_loss)
    train_loss = self.char_loss

    char_opt = tf.train.AdamOptimizer(self.lr)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        grad, var = zip(*char_opt.compute_gradients(train_loss))
        clipped_gradients, _ = tf.clip_by_global_norm(grad, clip)
        self.char_optimizer = char_opt.apply_gradients(
            zip(clipped_gradients, var))

    self.sentence, _ = tf.nn.ctc_greedy_decoder(
        tf.transpose(self.char_logit, (1, 0, 2)), _seq_len_char)
    self.cer = tf.reduce_mean(
        tf.edit_distance(tf.cast(self.sentence[0], tf.int32), self.label))

    # wordpiece-ctc layer
    h_shape = tf.shape(lstm_output)
    output_h = tf.reshape(lstm_output, [-1, self.rnnSize])
    print(output_h)
    with tf.variable_scope('dense_wordpiece'):
        dense = tf.layers.dense(
            output_h, self.vocabularySize + 1,
            kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    self.word_logit = tf.reshape(
        dense, [h_shape[0], h_shape[1], self.vocabularySize + 1])
    self.word_loss = tf.nn.ctc_loss(inputs=self.word_logit,
                                    labels=self.label,
                                    sequence_length=_seq_len_word,
                                    time_major=False)
    self.word_loss = tf.reduce_mean(self.word_loss)
    train_loss = self.word_loss

    word_opt = tf.train.AdamOptimizer(self.lr)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        grad, var = zip(*word_opt.compute_gradients(train_loss))
        clipped_gradients, _ = tf.clip_by_global_norm(grad, clip)
        self.word_optimizer = word_opt.apply_gradients(
            zip(clipped_gradients, var))

    self.word_sentence, _ = tf.nn.ctc_greedy_decoder(
        tf.transpose(self.word_logit, (1, 0, 2)), _seq_len_word)
    print(self.word_sentence)
    self.word_sentence = tf.sparse_tensor_to_dense(self.word_sentence[0],
                                                   default_value=2)
    # self.word_sentence = tf.sparse_tensor_to_dense(self.word_sentence.indices, self.word_sentence.shape, self.word_sentence.values, default_value=2)
    # self.word_distance = tf.reduce_mean(tf.edit_distance(tf.cast(word_sentence[0], tf.int32), self.wp_label))

    # last states to placeholder
    self.logsoftmax = tf.nn.log_softmax(self.word_logit)

    self.saver = tf.train.Saver()
def train():
    test_names, test_inputs, test_targets, test_seq_len = utils.get_data_set('valid.txt')
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(common.INITIAL_LEARNING_RATE,
                                               global_step,
                                               common.DECAY_STEPS,
                                               common.LEARNING_RATE_DECAY_FACTOR,
                                               staircase=True)
    logits, inputs, targets, seq_len, Wforward, Wbackward, b = model.get_train_model()
    loss = tf.nn.ctc_loss(logits, targets, seq_len)
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=common.MOMENTUM).minimize(
                                               cost, global_step=global_step)

    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len,
                                                      merge_repeated=False)
    acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    def do_report():
        test_feed = {
            inputs: test_inputs,
            targets: test_targets,
            seq_len: test_seq_len
        }
        dd, log_probs, accuracy = session.run([decoded[0], log_prob, acc],
                                              test_feed)
        accuracy = report_accuracy(dd, test_targets, test_names)
        save_path = saver.save(session, "models/ocr.model-" + str(accuracy),
                               global_step=steps)
        # decoded_list = decode_sparse_tensor(dd)

    def do_batch():
        feed = {
            inputs: train_inputs,
            targets: train_targets,
            seq_len: train_seq_len
        }
        b_cost, steps, _ = session.run([cost, global_step, optimizer], feed)
        if steps > 0 and steps % common.REPORT_STEPS == 0:
            do_report()
        return b_cost, steps

    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
        ckpt = tf.train.get_checkpoint_state("models")
        if ckpt and ckpt.model_checkpoint_path:
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("no checkpoint found")
            # Initialize the weights and biases
            init = tf.initialize_all_variables()
            session.run(init)
            saver = tf.train.Saver(tf.all_variables(), max_to_keep=100)

        for curr_epoch in xrange(num_epochs):
            print("Epoch.......", curr_epoch)
            train_cost = train_ler = 0
            for batch in xrange(common.BATCHES):
                start = time.time()
                train_names, train_inputs, train_targets, train_seq_len = \
                    utils.get_data_set('trainimg.txt',
                                       batch * common.BATCH_SIZE,
                                       (batch + 1) * common.BATCH_SIZE)
                print("get data time", time.time() - start)
                start = time.time()
                c, steps = do_batch()
                train_cost += c * common.BATCH_SIZE
                seconds = time.time() - start
                print("Step:", steps, ", batch seconds:", seconds)

            train_cost /= common.TRAIN_SIZE
            val_feed = {
                inputs: train_inputs,
                targets: train_targets,
                seq_len: train_seq_len
            }
            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step], feed_dict=val_feed)
            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(log.format(curr_epoch + 1, num_epochs, steps, train_cost,
                             train_ler, val_cost, val_ler,
                             time.time() - start, lr))
def build_graph(self, args, maxTimeSteps):
    self.maxTimeSteps = maxTimeSteps
    self.inputX = tf.placeholder(
        tf.float32, shape=[maxTimeSteps, args.batch_size, args.num_feature])
    # define tf.SparseTensor for ctc loss
    self.targetIxs = tf.placeholder(tf.int64)
    self.targetVals = tf.placeholder(tf.int32)
    self.targetShape = tf.placeholder(tf.int64)
    self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                   self.targetShape)
    self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
    self.config = {
        'name': args.model,
        'num_layer': args.num_layer,
        'num_hidden': args.num_hidden,
        'num_class': args.num_class,
        'activation': args.activation,
        'optimizer': args.optimizer,
        'learning rate': args.learning_rate,
        'keep prob': args.keep_prob,
        'batch size': args.batch_size
    }
    inputX = tf.reshape(self.inputX,
                        [args.batch_size, maxTimeSteps, args.num_feature, 1])
    print(inputX.get_shape())
    with tf.variable_scope("layer_conv1"):
        # shape of kernel: [filter_height, filter_width, in_channels, out_channels]
        kernel = tf.get_variable("kernel", shape=[3, 3, 1, 16],
                                 dtype=tf.float32)
        # shape of conv1: [batch, height, width, channels]
        conv1 = tf.nn.conv2d(inputX, kernel, (1, 1, 1, 1), padding='VALID')
        print(conv1.get_shape())

    output = conv1
    for layer_id in range(args.num_layer):
        vars_scope = "capsule_cnn_layer_" + str(layer_id + 1)
        # (self, num_capsules, num_channels, output_vector_len, layer_type='conv', vars_scope=None):
        capLayer = CapsuleLayer(4, 8, 2, layer_type='conv',
                                vars_scope=vars_scope)
        # (self, inputX, kernel_size, strides, routing=True, padding='VALID'):
        output = capLayer(output, [2, 2], (1, 1, 1, 1), args.num_iter)
        print(output.get_shape())

    # last dnn layer for classification
    vars_scope = "capsule_dnn_layer"
    capLayer = CapsuleLayer(8, 16, args.num_classes, layer_type='dnn',
                            vars_scope=vars_scope)
    logits3d = capLayer(output, [3, 3], (1, 1, 1, 1), args.num_iter)
    logits3d = tf.transpose(logits3d, perm=[1, 0, 2])
    self.loss = tf.reduce_mean(
        tf.nn.ctc_loss(self.targetY, logits3d, self.seqLengths))
    self.var_op = tf.global_variables()
    self.var_trainable_op = tf.trainable_variables()

    if args.grad_clip == -1:
        # do not apply gradient clipping
        self.optimizer = tf.train.AdamOptimizer(
            args.learning_rate).minimize(self.loss)
    else:
        # apply gradient clipping
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, self.var_trainable_op), args.grad_clip)
        opti = tf.train.AdamOptimizer(args.learning_rate)
        self.optimizer = opti.apply_gradients(
            zip(grads, self.var_trainable_op))
    self.predictions = tf.to_int32(
        tf.nn.ctc_beam_search_decoder(logits3d, self.seqLengths,
                                      merge_repeated=False)[0][0])
    if args.level == 'cha':
        self.errorRate = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targetY, normalize=True))
    self.initial_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5,
                                keep_checkpoint_every_n_hours=1)
def _get_cer(self, pred_chars: Tensor) -> None:
    # Compute the Character Error Rate (CER) per batch in the computing graph
    self._cer = tf.reduce_mean(tf.edit_distance(pred_chars, self._targets),
                               name='cer')
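# Sketch of how the two SparseTensor inputs to _get_cer could be built; the
# names `logits`, `seq_len`, and `dense_labels` (0-padded) are assumptions,
# not part of the class shown above:
decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)
pred_chars = tf.cast(decoded[0], tf.int32)
nonzero = tf.where(tf.not_equal(dense_labels, 0))
targets = tf.SparseTensor(nonzero,
                          tf.gather_nd(dense_labels, nonzero),
                          tf.shape(dense_labels, out_type=tf.int64))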
def train():
    # test_inputs, test_targets, test_seq_len = utils.get_data_set('LSTM2', 0, 128)  # 118100, 118200  # 120100, 120200  # IMGN1
    # GO2 120100, 120200
    S = 'train'
    m = CNNLSTM(S)
    m.build_graph()
    global_step = tf.train.get_or_create_global_step()  # tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                               global_step, DECAY_STEPS,
                                               LEARNING_RATE_DECAY_FACTOR,
                                               staircase=True)
    loss = tf.nn.ctc_loss(labels=m.labels, inputs=m.logits,
                          sequence_length=m.seq_len)
    cost = tf.reduce_mean(loss)
    # cost = model.ctc_loss_layer(logits, targets, seq_len)
    # optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=common.MOMENTUM).minimize(cost, global_step=global_step)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=0.9, beta2=0.999).minimize(
                                           loss, global_step=global_step)

    # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(m.logits, m.seq_len,
                                                      merge_repeated=False)

    # Accuracy: label error rate
    acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), m.labels))

    print('loading train data, please wait---------------------')
    train_feeder = DataIterator(data_dir=train_dir)
    print('get image: ', train_feeder.size)
    print('loading validation data, please wait---------------------')
    val_feeder = DataIterator(data_dir=val_dir)
    print('get image: ', val_feeder.size)

    num_train_samples = train_feeder.size  # 100000
    num_batches_per_epoch = int(num_train_samples / B_SIZE)  # number of batches
    shuffle_idx = np.random.permutation(num_train_samples)
    # -----------------------2nd - the dictionary--------------------------#
    num_val_samples = val_feeder.size
    num_batches_per_epoch_val = int(num_val_samples / B_SIZE)  # example: 10000/100
    shuffle_idx_val = np.random.permutation(num_val_samples)

    def do_report():
        # Report / save the model
        indexs_val = [
            shuffle_idx_val[i % num_val_samples]
            for i in range(batch * B_SIZE, (batch + 1) * B_SIZE)
        ]
        val_inputs, val_seq_len, val_labels = \
            val_feeder.input_index_generate_batch(indexs_val)
        # test_feed = {m.inputs: val_inputs, m.labels: val_labels, m.seq_len: val_seq_len}
        test_feed = {m.inputs: val_inputs, m.labels: val_labels}
        dd, log_probs, accuracy = session.run([decoded[0], log_prob, acc],
                                              test_feed)
        accuracy = report_accuracy(dd, val_labels)
        save_path = saver.save(session, "models/ocr.model-" + str(accuracy),
                               global_step=steps)
        # decoded_list = decode_sparse_tensor(dd)

    def do_batch():
        # Run one batch
        # feed = {m.inputs: train_inputs, m.labels: train_labels, m.seq_len: train_seq_len}
        feed = {m.inputs: train_inputs, m.labels: train_labels}
        b_cost, steps, _ = session.run([cost, global_step, optimizer], feed)
        # b_cost, steps, _ = session.run([m.cost, m.global_step, m.train_op], feed)
        if steps > 0 and steps % 10000 == 0:
            do_report()
        return b_cost, steps

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.30)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                          log_device_placement=True)) as session:
        ckpt = tf.train.get_checkpoint_state("models")
        # writer = tf.summary.FileWriter('log/', graph=session.graph)
        if ckpt and ckpt.model_checkpoint_path:
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("no checkpoint found")
            # Initialize the weights and biases
            # init = tf.initialize_all_variables()
            # session.run(init)
            # saver = tf.train.Saver(tf.all_variables(), max_to_keep=100)
            session.run(tf.global_variables_initializer())
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        for curr_epoch in xrange(num_epochs):
            # variables = tf.all_variables()
            # for i in variables:
            #     print(i.name)
            print("Epoch.......", curr_epoch)
            train_cost = train_ler = 0
            for batch in range(num_batches_per_epoch):
                # print(batch)
                start = time.time()
                # train_inputs, train_targets, train_seq_len = utils.get_data_set('GO1', batch * common.BATCH_SIZE, (batch + 1) * common.BATCH_SIZE)
                indexs = [
                    shuffle_idx[i % num_train_samples]
                    for i in range(batch * B_SIZE, (batch + 1) * B_SIZE)
                ]
                # print(indexs)
                train_inputs, train_seq_len, train_labels = \
                    train_feeder.input_index_generate_batch(indexs)
                # utils.get_data_set('LSTM2', batch * common.BATCH_SIZE, (batch + 1) * common.BATCH_SIZE)
                # print("get data time", time.time() - start)
                start = time.time()
                c, steps = do_batch()
                train_cost += c * B_SIZE
                seconds = time.time() - start
                # print("Step:", steps, ", batch seconds:", seconds)
                # prepare the data:
                # feed = {m.inputs: train_inputs, m.labels: train_targets, m.seq_len: train_seq_len}
                # summary_str, batch_cost, step, _ = session.run([m.merged_summay, m.cost, m.global_step, m.train_op], feed)

            train_cost /= B_SIZE
            indexs_val = [
                shuffle_idx_val[i % num_val_samples]
                for i in range(batch * B_SIZE, (batch + 1) * B_SIZE)
            ]
            val_inputs, val_seq_len, val_labels = \
                val_feeder.input_index_generate_batch(indexs_val)
            # val_feed = {m.inputs: val_inputs, m.labels: val_labels, m.seq_len: val_seq_len}
            val_feed = {m.inputs: val_inputs, m.labels: val_labels}
            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step], feed_dict=val_feed)
            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(log.format(curr_epoch + 1, num_epochs, steps, train_cost,
                             train_ler, val_cost, val_ler,
                             time.time() - start, lr))
def create(self, imageHeight, imageWidth, num_classes, evalFLAG):
    graph = tf.Graph()
    with graph.as_default():
        num_hidden = 256
        training = not evalFLAG

        with tf.name_scope('Inputs'):
            inputs = tf.placeholder(tf.float32,
                                    [None, imageHeight, imageWidth, 1],
                                    name='inputs')
            if evalFLAG:
                tf.summary.image('inputs', inputs, max_outputs=1)
            # seq_len should be fed with a list containing the real width of
            # the images before padding to obtain imageWidth
            seq_len = tf.placeholder(tf.int32, [None], name='seq_len')
            targets = tf.sparse_placeholder(tf.int32, name='targets')
            targets_len = tf.placeholder(tf.int32, name='targets_len')

        conv_keep_prob = 0.8
        lstm_keep_prob = 0.5

        # Layer 1
        with tf.name_scope('Layer_Conv_1'):
            h_conv1 = CNN(x=inputs, filters=16, kernel_size=[3, 3],
                          strides=[1, 1], name='conv1',
                          activation=tf.nn.leaky_relu, evalFLAG=evalFLAG,
                          initializer=tf.contrib.layers.xavier_initializer(uniform=False))
            h_pool1, seq_len_1, imageHeight, imageWidth = max_pool(
                h_conv1, [2, 2], seq_len, imageHeight, imageWidth, evalFLAG)
            h_pool1 = tf.layers.dropout(h_pool1, rate=0.0, training=training)

        # Layer 2
        with tf.name_scope('Layer_Conv_2'):
            h_conv2 = CNN(x=h_pool1, filters=32, kernel_size=[3, 3],
                          strides=[1, 1], name='conv2',
                          activation=tf.nn.leaky_relu, evalFLAG=evalFLAG,
                          initializer=tf.contrib.layers.xavier_initializer(uniform=False))
            h_pool2, seq_len_2, imageHeight, imageWidth = max_pool(
                h_conv2, [2, 2], seq_len_1, imageHeight, imageWidth, evalFLAG)
            h_pool2 = tf.layers.dropout(h_pool2, rate=(1 - conv_keep_prob),
                                        training=training)

        # Layer 3
        with tf.name_scope('Layer_Conv_3'):
            h_conv3 = CNN(x=h_pool2, filters=48, kernel_size=[3, 3],
                          strides=[1, 1], name='conv3',
                          activation=tf.nn.leaky_relu, evalFLAG=evalFLAG,
                          initializer=tf.contrib.layers.xavier_initializer(uniform=False))
            h_pool3, seq_len_3, imageHeight, imageWidth = max_pool(
                h_conv3, [2, 2], seq_len_2, imageHeight, imageWidth, evalFLAG)
            h_pool3 = tf.layers.dropout(h_pool3, rate=(1 - conv_keep_prob),
                                        training=training)

        # Layer 4
        with tf.name_scope('Layer_Conv_4'):
            h_conv4 = CNN(x=h_pool3, filters=64, kernel_size=[3, 3],
                          strides=[1, 1], name='conv4',
                          activation=tf.nn.leaky_relu, evalFLAG=evalFLAG,
                          initializer=tf.contrib.layers.xavier_initializer(uniform=False))
            h_pool4, seq_len_4, imageHeight, imageWidth = max_pool(
                h_conv4, [1, 1], seq_len_3, imageHeight, imageWidth, evalFLAG)
            h_pool4 = tf.layers.dropout(h_pool4, rate=(1 - conv_keep_prob),
                                        training=training)

        # Layer 5
        with tf.name_scope('Layer_Conv_5'):
            h_conv5 = CNN(x=h_pool4, filters=80, kernel_size=[3, 3],
                          strides=[1, 1], name='conv5',
                          activation=tf.nn.leaky_relu, evalFLAG=evalFLAG,
                          initializer=tf.contrib.layers.xavier_initializer(uniform=False))
            h_pool5, seq_len_5, imageHeight, imageWidth = max_pool(
                h_conv5, [1, 1], seq_len_4, imageHeight, imageWidth, evalFLAG)
            h_pool5 = tf.layers.dropout(h_pool5, rate=(1 - lstm_keep_prob),
                                        training=training)

        with tf.name_scope('Reshaping_step'):
            h_cw_concat = tf.transpose(h_pool5, (2, 0, 1, 3))
            h_cw_concat = tf.reshape(
                h_cw_concat, (int(imageWidth), -1, int(imageHeight * 80)))
            h_cw_concat = tf.transpose(h_cw_concat, (1, 0, 2))

        with tf.name_scope('Layer_BLSTM_1'):
            h_bilstm1 = bidirectionalLSTM(h_cw_concat, num_hidden, seq_len_5,
                                          '1', evalFLAG)
            h_bilstm1 = tf.concat(h_bilstm1, 2)
            h_bilstm1 = tf.layers.dropout(h_bilstm1,
                                          rate=(1 - lstm_keep_prob),
                                          training=training)

        with tf.name_scope('Layer_BLSTM_2'):
            h_bilstm2 = bidirectionalLSTM(h_bilstm1, num_hidden, seq_len_5,
                                          '2', evalFLAG)
            h_bilstm2 = tf.concat(h_bilstm2, 2)
            h_bilstm2 = tf.layers.dropout(h_bilstm2,
                                          rate=(1 - lstm_keep_prob),
                                          training=training)

        with tf.name_scope('Layer_BLSTM_3'):
            h_bilstm3 = bidirectionalLSTM(h_bilstm2, num_hidden, seq_len_5,
                                          '3', evalFLAG)
            h_bilstm3 = tf.concat(h_bilstm3, 2)
            h_bilstm3 = tf.layers.dropout(h_bilstm3,
                                          rate=(1 - lstm_keep_prob),
                                          training=training)

        with tf.name_scope('Layer_BLSTM_4'):
            h_bilstm4 = bidirectionalLSTM(h_bilstm3, num_hidden, seq_len_5,
                                          '4', evalFLAG)
            h_bilstm4 = tf.concat(h_bilstm4, 2)
            h_bilstm4 = tf.layers.dropout(h_bilstm4,
                                          rate=(1 - lstm_keep_prob),
                                          training=training)

        with tf.name_scope('Layer_BLSTM_5'):
            h_bilstm5 = bidirectionalLSTM(h_bilstm4, num_hidden, seq_len_5,
                                          '5', evalFLAG)
            h_bilstm5 = tf.concat(h_bilstm5, 2)
            h_bilstm5 = tf.layers.dropout(h_bilstm5,
                                          rate=(1 - lstm_keep_prob),
                                          training=training)

        with tf.name_scope('Layer_Linear') as ns:
            outputs = tf.transpose(h_bilstm5, (1, 0, 2))
            outputs = tf.reshape(outputs, (-1, 2 * num_hidden))
            logits = FNN(outputs, num_classes, ns, None, evalFLAG)

        with tf.name_scope('Logits'):
            logits = tf.reshape(logits, (int(imageWidth), -1, num_classes))

        seq_len_5 = tf.maximum(seq_len_5, targets_len)

        n_batches = tf.placeholder(tf.float32, name='n_batches')
        previousCost = tf.placeholder(tf.float32, name='previous_cost')

        with tf.name_scope('CTC_Loss'):
            loss = tf.nn.ctc_loss(targets, logits, seq_len_5,
                                  preprocess_collapse_repeated=False,
                                  ctc_merge_repeated=True)
            with tf.name_scope('total'):
                batch_cost = tf.reduce_mean(loss)
                cost = batch_cost / n_batches + previousCost
        tf.summary.scalar('CTC_loss', cost)

        with tf.name_scope('train'):
            train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Layer_Linear') + \
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='BLSTM[12345]') + \
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='conv[12345]')
            print(train_vars)
            learning_rate = tf.placeholder(tf.float32, name='learning_rate')
            optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(batch_cost)

        with tf.name_scope('predictions'):
            predictions, log_prob = tf.nn.ctc_beam_search_decoder(
                logits, seq_len_5, merge_repeated=False)

        with tf.name_scope('CER'):
            with tf.name_scope('Mean_CER_per_word'):
                previousEDnorm = tf.placeholder(tf.float32,
                                                name='previousEDnorm')
                EDnorm = tf.reduce_mean(
                    tf.edit_distance(tf.cast(predictions[0], tf.int32),
                                     targets,
                                     normalize=True)) / n_batches + previousEDnorm
                if evalFLAG:
                    tf.summary.scalar('EDnorm', EDnorm)

            with tf.name_scope('Absolute_CER_total_set'):
                setTotalChars = tf.placeholder(tf.float32,
                                               name='setTotalChars')
                previousEDabs = tf.placeholder(tf.float32,
                                               name='previousEDabs')
                errors = tf.edit_distance(tf.cast(predictions[0], tf.int32),
                                          targets, normalize=False)
                EDabs = tf.reduce_sum(errors) / setTotalChars + previousEDabs
                if evalFLAG:
                    tf.summary.scalar('EDabs', EDabs)

        ED = [EDnorm, EDabs]

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5,
                               keep_checkpoint_every_n_hours=24)

        transferred_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="BLSTM[12345]") + \
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="conv")
        transferred_vars_dict = dict([(var.op.name, var)
                                      for var in transferred_vars])
        transfer_saver = tf.train.Saver(transferred_vars_dict)

        merged = tf.summary.merge_all()

    return graph, [saver, transfer_saver], inputs, seq_len, targets, \
        targets_len, learning_rate, n_batches, setTotalChars, previousEDabs, \
        previousEDnorm, previousCost, optimizer, batch_cost, cost, errors, \
        ED, predictions, merged
def ler(y_true, y_pred, **kwargs):
    """Label Error Rate. For more information see 'tf.edit_distance'."""
    return tf.reduce_mean(tf.edit_distance(y_pred, y_true, **kwargs))
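# Usage sketch: both arguments must be SparseTensors, and extra keyword
# arguments are forwarded to tf.edit_distance. The values here are illustrative:
y_true_sp = tf.SparseTensor([[0, 0], [0, 1]], [1, 2], [1, 2])
y_pred_sp = tf.SparseTensor([[0, 0], [0, 1]], [1, 3], [1, 2])
ler_op = ler(y_true_sp, y_pred_sp, normalize=True)  # one substitution / length 2 -> 0.5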
def main(_): batch_size = FLAGS.batch_size # num_readers = 4 num_epochs = FLAGS.epoch checkpoint_dir = FLAGS.checkpoint_dir with tf.Graph().as_default(): # deploy_config = model_deploy.DeploymentConfig() # Create global_step. global_step = tf.placeholder(tf.int64, name='global_step') tr_file_names = [os.path.join("/mnt/sdb/mark/SynthText/", "synthtext_train.tfrecords")] te_file_names = [os.path.join("/mnt/sdb/mark/SynthText/", "synthtext_test.tfrecords")] sh_images, sh_labels, sh_length, sh_width = read_utils.inputs( filename=tr_file_names, batch_size=batch_size, num_epochs=num_epochs, preprocess=True) val_images, val_labels, val_length, val_width = read_utils.inputs( filename=te_file_names, batch_size=batch_size, num_epochs=10000*num_epochs, preprocess=True) # Build Model crnn = model.CRNNNet() with tf.variable_scope('crnn'): logits, seq_len = crnn.net(sh_images, sh_width, is_training=True, kp=1.0) tf.get_variable_scope().reuse_variables() val_logits, val_seq_len = crnn.net(val_images, val_width, is_training=False, kp=1.0) loss = crnn.losses(sh_labels, logits, seq_len) tf.summary.scalar("train/loss", loss) tf.summary.image("train/inputs", sh_images) val_loss = crnn.losses(val_labels, val_logits, val_seq_len) # TODO: BK-tree NN search decoded, log_prob = tf.nn.ctc_beam_search_decoder(tf.transpose(val_logits, perm=[1, 0, 2]), val_seq_len, merge_repeated=False) acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels, normalize=False)) acc_norm = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels)) val_loss_sum = tf.placeholder(tf.float32, name='val_loss_sum') val_acc_sum = tf.placeholder(tf.float32, name='val_acc_sum') val_acc_norm_sum = tf.placeholder(tf.float32, name='val_acc_norm_sum') tf.summary.scalar("test/val_loss", val_loss_sum) tf.summary.scalar("test/edit_distance", val_acc_sum) tf.summary.scalar("test/edit_distance_norm", val_acc_norm_sum) starter_learning_rate = FLAGS.learning_rate learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, FLAGS.lr_decay_step, FLAGS.lr_decay_rate, staircase=True) tf.summary.scalar("train/learning_rate",learning_rate) train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss=loss, var_list=train_vars) # Start Training with tf.Session(config=config) as sess: if FLAGS.debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess) save = tf.train.Saver(max_to_keep=50) raw_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='crnn/CRNN_net/') init_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='crnn/ResNet/') pretrain = tf.train.Saver({v.op.name.replace('crnn/ResNet/', ''): v for v in init_vars if v.op.name.find('Adam') == -1}) if not FLAGS.load: base_step = 0 init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) # Start input enqueue threads. 
else: try: base_step = int(FLAGS.ckpt_file.split('-')[-1]) except: base_step = 0 if FLAGS.mode == 'raw': # ckpt_file = 'model.ckpt-' + FLAGS.ckpt_step ckpt_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.ckpt_file) save.restore(sess, ckpt_path) sess.run(tf.local_variables_initializer()) elif FLAGS.mode == 'pretrain': ckpt_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.ckpt_file) init_op = tf.group(tf.local_variables_initializer(), tf.variables_initializer([v for v in init_vars if v.op.name.find('Adam') != -1] + raw_vars + [v for v in tf.global_variables() if v.op.name.find('crnn/') == -1])) sess.run(init_op) pretrain.restore(sess, ckpt_path) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) #sess = tf_debug.LocalCLIDebugWrapperSession(sess) merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='train/*')) val_merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='test/*')) file_writer = tf.summary.FileWriter(FLAGS.logdir, sess.graph) try: step = 0 while not coord.should_stop(): start_time = time.time() _, merged_t, tr_loss, lr, step, db_labels, db_images, db_logits = sess.run([optimizer, merged, loss, learning_rate, global_step, sh_labels, sh_images, logits], feed_dict={global_step: step}) duration = time.time() - start_time print("loss", tr_loss, "time", duration) file_writer.add_summary(merged_t, step) # Print an overview fairly often. if step % FLAGS.save_steps == 0 and step > 0: ####################################################### val_loss_s, val_acc_s, val_acc_norm_s = 0, 0, 0 for ite in range(FLAGS.sample_size): te_loss, te_acc, te_acc_norm = sess.run([val_loss, acc, acc_norm]) val_loss_s += te_loss val_acc_s += te_acc val_acc_norm_s += te_acc_norm val_loss_s /= FLAGS.sample_size val_acc_s /= FLAGS.sample_size val_acc_norm_s /= FLAGS.sample_size print('Step %d: loss %.3f acc %.3f %.3f (%.3f sec)' % (step, val_loss_s, val_acc_s, val_acc_norm_s, duration)) # Add summary val_sum = sess.run(val_merged, feed_dict={val_loss_sum: val_loss_s, val_acc_sum: val_acc_s, val_acc_norm_sum: val_acc_norm_s}) file_writer.add_summary(val_sum, step) save.save(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt'), global_step=step+base_step) step += 1 except tf.errors.OutOfRangeError: print('Done training for %d epochs, %d steps.' % (num_epochs, step)) finally: # When done, ask the threads to stop. coord.request_stop() # Wait for threads to finish. coord.join(threads)
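The script above keeps train and test summaries in separate merges by filtering on name scope, and routes batch-averaged validation numbers back into the graph through plain placeholders. A stripped-down sketch of that pattern; every name here is illustrative rather than taken from the script:

# Scope-filtered summaries plus placeholder-fed validation averages (sketch).
val_loss_avg = tf.placeholder(tf.float32, name='val_loss_avg')
tf.summary.scalar('test/val_loss', val_loss_avg)
train_merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='train/*'))
test_merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='test/*'))
# After averaging the validation loss over several batches in Python:
# writer.add_summary(sess.run(test_merged, {val_loss_avg: avg}), step)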
def create_graph_for_validation_ctc(pipeline, nnet_config):
    graph = dict()
    nnet_input = pipeline['nnet_input']
    graph['nnet_input'] = nnet_input
    sequence_length = pipeline['sequence_length']
    graph['sequence_length'] = sequence_length
    nnet_type = nnet_config.get('nnet_type')
    create_logits = get_create_logits(nnet_type)
    logits, encoder, reg_loss = create_logits(
        nnet_input=nnet_input,
        sequence_length=sequence_length,
        nnet_config=nnet_config,
    )
    graph['logits'] = logits
    # Convert from [batch, time, target] to [time, batch, target]
    logits = tf.transpose(logits, (1, 0, 2))
    nnet_target = pipeline['nnet_target']
    graph['raw_target'] = nnet_target
    # Convert the dense target (padded with -1) into a SparseTensor
    sparse_indices = tf.where(
        tf.not_equal(nnet_target, tf.constant(-1, dtype=tf.int64)))
    sparse_values = tf.gather_nd(params=nnet_target, indices=sparse_indices)
    dense_shape = tf.cast(x=tf.shape(nnet_target), dtype=tf.int64)
    sparse = tf.SparseTensor(indices=sparse_indices,
                             values=sparse_values,
                             dense_shape=dense_shape)
    sparse = tf.cast(x=sparse, dtype=tf.int32)
    nnet_target = sparse
    graph['nnet_target'] = nnet_target
    batch_size = tf.shape(sparse_values)[0]
    graph['size'] = batch_size
    loss = tf.nn.ctc_loss(labels=nnet_target,
                          inputs=logits,
                          sequence_length=sequence_length,
                          ignore_longer_outputs_than_inputs=True)
    loss = tf.reduce_sum(loss)
    tf.summary.scalar('loss', loss)
    graph['eval_loss'] = loss
    other_weights = 0
    other_loss = None
    for item in reg_loss:
        if item[0] is not None and item[1] is not None and item[1] > 0:
            other_weights += item[1]
            if other_loss is None:
                other_loss = item[0]
            else:
                other_loss += item[0]
    if other_loss is not None and other_weights != 0:
        # loss = (1 - other_weights) * loss + other_loss
        loss = loss + other_loss
    graph['loss'] = loss  # keep track of the total loss
    decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(
        inputs=logits, sequence_length=sequence_length, merge_repeated=True)
    dist = tf.reduce_sum(
        tf.edit_distance(tf.cast(decoded[0], tf.int64),
                         tf.cast(nnet_target, tf.int64),
                         normalize=False))
    graph['eval'] = dist
    global_step = tf.train.get_or_create_global_step()
    global_step = tf.assign(global_step, global_step + 1, name='global_step')
    graph['global_step'] = global_step
    summary = tf.summary.merge_all()
    graph['summary'] = summary
    for key, val in graph.items():
        tf.add_to_collection(key, val)
    return graph
def train_shadownet(dataset_dir, weights_path=None, decode: bool=False, num_threads=4): """ :param dataset_dir: :param weights_path: :param num_threads: Number of threads to use in tf.train.shuffle_batch :return: """ # Load config cfg = load_config().cfg # decode the tf records to get the training data decoder = data_utils.TextFeatureIO().reader input_images, input_labels, input_image_names = decoder.read_features(ops.join(dataset_dir, 'train_feature.tfrecords'), cfg.TRAIN.BATCH_SIZE, num_threads) # initialise the net model shadownet = crnn_model.ShadowNet(phase='Train', hidden_nums=cfg.ARCH.HIDDEN_UNITS, layers_nums=cfg.ARCH.HIDDEN_LAYERS, num_classes=len(decoder.char_dict) + 1) with tf.variable_scope('shadow', reuse=False): net_out = shadownet.build_shadownet(inputdata=input_images) cost = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out, sequence_length=cfg.ARCH.SEQ_LENGTH*np.ones(cfg.TRAIN.BATCH_SIZE))) decoded, log_prob = tf.nn.ctc_beam_search_decoder(net_out, cfg.ARCH.SEQ_LENGTH*np.ones(cfg.TRAIN.BATCH_SIZE), merge_repeated=False) sequence_dist = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), input_labels)) global_step = tf.Variable(0, name='global_step', trainable=False) starter_learning_rate = cfg.TRAIN.LEARNING_RATE learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, cfg.TRAIN.LR_DECAY_STEPS, cfg.TRAIN.LR_DECAY_RATE, staircase=True) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(loss=cost, global_step=global_step) # Setup TF summary tboard_save_path = 'tboard/shadownet' if not ops.exists(tboard_save_path): os.makedirs(tboard_save_path) tf.summary.scalar(name='Cost', tensor=cost) tf.summary.scalar(name='Learning_Rate', tensor=learning_rate) if decode: tf.summary.scalar(name='Seq_Dist', tensor=sequence_dist) merge_summary_op = tf.summary.merge_all() # Set saver configuration saver = tf.train.Saver() model_save_dir = cfg.PATH.CRNN_MODEL_SAVE_DIR if not ops.exists(model_save_dir): os.makedirs(model_save_dir) train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) model_name = 'shadownet_{:s}.ckpt'.format(str(train_start_time)) model_save_path = ops.join(model_save_dir, model_name) # Set sess configuration sess_config = tf.ConfigProto() sess_config.gpu_options.per_process_gpu_memory_fraction = cfg.TRAIN.GPU_MEMORY_FRACTION sess_config.gpu_options.allow_growth = cfg.TRAIN.TF_ALLOW_GROWTH sess = tf.Session(config=sess_config) summary_writer = tf.summary.FileWriter(tboard_save_path) summary_writer.add_graph(sess.graph) # Set the training parameters train_epochs = cfg.TRAIN.EPOCHS with sess.as_default(): if weights_path is None: logger.info('Training from scratch') init = tf.global_variables_initializer() sess.run(init) else: logger.info('Restore model from {:s}'.format(weights_path)) saver.restore(sess=sess, save_path=weights_path) cost_history = [np.inf] for epoch in range(train_epochs): if decode: _, c, seq_distance, predictions, labels, summary = sess.run([optimizer, cost, sequence_dist, decoded, input_labels, merge_summary_op]) labels = decoder.sparse_tensor_to_str(labels) predictions = decoder.sparse_tensor_to_str(predictions[0]) accuracy = compute_accuracy(labels, predictions) if epoch % cfg.TRAIN.DISPLAY_STEP == 0: logger.info('Epoch: {:d} cost= {:9f} seq distance= {:9f} train accuracy= {:9f}'.format( epoch + 1, c, seq_distance, accuracy)) else: _, c, summary = 
sess.run([optimizer, cost, merge_summary_op]) if epoch % cfg.TRAIN.DISPLAY_STEP == 0: logger.info('Epoch: {:d} cost= {:9f}'.format(epoch + 1, c)) cost_history.append(c) summary_writer.add_summary(summary=summary, global_step=epoch) saver.save(sess=sess, save_path=model_save_path, global_step=epoch) return np.array(cost_history[1:]) # Don't return the first np.inf
def model_fn(self, features, labels, mode, params): """ Model function for transformer. Args: features: float Tensor with shape [batch_size, T, H, W, C]. Input sequence. labels: string Tensor with shape [batch_size,]. Target labels. mode: Indicate train or eval or predict. params: dict. model parameters. Returns: tf.estimator.EstimatorSpec. """ #learning_rate = params.get('learning_rate', 0.001) in_training = mode == tf.estimator.ModeKeys.TRAIN video = features['video'] inputs_unpadded_length = features['unpadded_length'] if params.get('feature_extractor') == 'early_fusion': from .cnn_extractor import EarlyFusion2D as CnnExtractor elif params.get('feature_extractor') == 'res': from .cnn_extractor import ResNet as CnnExtractor else: from .cnn_extractor import LipNet as CnnExtractor feature_extractor = CnnExtractor(feature_len=params.get('hidden_size'), training=in_training, scope='cnn_feature_extractor') inputs = feature_extractor.build( video) # [batch_size, input_length, hidden_size] params.update({'pinyin_vocab_size': len(self.pinyin_dic)}) params.update({'viseme_vocab_size': len(self.viseme_dic)}) v_p_transformer = Transformer1(params, in_training, scope="v_p_transformer") label_params = params.copy() label_params.update({'vocab_size': len(self.label_dic)}) label_params.update({'target_vocab_size': len(self.label_dic)}) label_params.update({'scope': "v_c_transformer"}) label_params.update({'dic': self.label_dic}) if params.get('co_attention') == 1: v_c_transformer = Transformer_co1(label_params, in_training, scope="v_c_transformer") elif params.get('co_attention') == 2: v_c_transformer = Transformer_co2(label_params, in_training, scope="v_c_transformer") elif params.get('co_attention') == 3: v_c_transformer = Transformer_co3(label_params, in_training, scope="v_c_transformer") else: v_c_transformer = Transformer_co4(label_params, in_training, scope="v_c_transformer") viseme_labels = labels['viseme'] sparse_viseme, viseme_string = self.procesess_label( viseme_labels, self.viseme_dic) viseme_char_list_labels = label_util.string2char_list(viseme_string) pinyin_labels = tf.squeeze(labels['pinyin']) # [batch_size, ] pinyin_char_list_labels, pinyin_labels = self.preprocess_labels( pinyin_labels, self.pinyin_dic) # [batch_size, target_length] pinyin_string = self.id_to_string(pinyin_labels, self.pinyin_dic) label_targets = labels['label'] label_index, label_string = self.procesess_label( label_targets, self.label_dic) pinyin_logits, viseme_logits, encode, attention_bias = v_p_transformer( inputs, inputs_unpadded_length, pinyin_labels, viseme_labels) pinyin_sequence = tf.argmax(pinyin_logits, 2) viseme_sequence = tf.argmax(viseme_logits, 2) pinyin_embedded = tf.contrib.layers.embed_sequence( ids=pinyin_sequence, vocab_size=params["target_pinyin_vocab_size"], embed_dim=512) viseme_embedded = tf.contrib.layers.embed_sequence( ids=viseme_sequence, vocab_size=params["target_viseme_vocab_size"], embed_dim=512) # Calculate model loss. 
# xentropy contains the cross entropy loss of every nonpadding token in the # train pinyin_xentropy, pinyin_weights = metrics.padded_cross_entropy_loss( pinyin_logits, pinyin_labels, params["label_smoothing"], params["pinyin_vocab_size"]) pinyin_loss = tf.reduce_sum(pinyin_xentropy) / tf.reduce_sum( pinyin_weights) viseme_xentropy, viseme_weights = metrics.padded_cross_entropy_loss( viseme_logits, viseme_labels, params["label_smoothing"], params["viseme_vocab_size"]) viseme_loss = tf.reduce_sum(viseme_xentropy) / tf.reduce_sum( viseme_weights) #embedded = tf.concat([viseme_embedded, pinyin_embedded], 1) pinyin_unpadded_length = tf.cast( tf.count_nonzero(pinyin_sequence, 1, keepdims=True) - 1, tf.int32) viseme_unpadded_length = tf.cast( tf.count_nonzero(viseme_sequence, 1, keepdims=True) - 1, tf.int32) # label_logits = v_c_transformer(tf.cast(viseme_embedded, tf.float32), inputs_unpadded_length, label_targets) #label_logits = v_c_transformer(tf.cast(embedded, tf.float32), inputs_unpadded_length, encode, attention_bias,label_targets) label_logits = v_c_transformer(tf.cast(pinyin_embedded, tf.float32), pinyin_unpadded_length, tf.cast(viseme_embedded, tf.float32), viseme_unpadded_length, encode, attention_bias, label_targets) label_xentropy, label_weights = metrics.padded_cross_entropy_loss( label_logits, label_targets, params["label_smoothing"], label_params["vocab_size"]) label_loss = tf.reduce_sum(label_xentropy) / tf.reduce_sum( label_weights) loss = label_loss + pinyin_loss + viseme_loss if mode == tf.estimator.ModeKeys.TRAIN: train_op, metric_dict = get_train_op_and_metrics(loss, params) # if params["ckpt_path"] != "": # print('restore from: {}'.format(params["ckpt_path"])) # tf.train.init_from_checkpoint( # params["ckpt_path"], assignment_map={"/": "/"}) # Epochs can be quite long. This gives some intermediate information # in TensorBoard. metric_dict["minibatch_loss"] = loss record_scalars(metric_dict) pinyin_sequence = tf.argmax(pinyin_logits, 2) viseme_sequence = tf.argmax(viseme_logits, 2) label_sequence = tf.argmax(label_logits, 2) sparse_pinyin_prediction, pinyin_predicted_string = self.procesess_label( pinyin_sequence, self.pinyin_dic) pinyin_predicted_char_list = label_util.string2char_list( pinyin_predicted_string) sparse_viseme_prediction, viseme_predicted_string = self.procesess_label( viseme_sequence, self.viseme_dic) viseme_predicted_char_list = label_util.string2char_list( viseme_predicted_string) label_predicted_index, label_predicted_string = self.procesess_label( label_sequence, self.label_dic) ver = self.cal_pinyin_metrics(viseme_char_list_labels, viseme_predicted_char_list) per = self.cal_pinyin_metrics(pinyin_char_list_labels, pinyin_predicted_char_list) cer = tf.edit_distance(label_index, tf.cast(label_predicted_index, tf.int64)) logging_hook = tf.train.LoggingTensorHook( { 'loss': loss, 'ver': tf.reduce_mean(ver), 'per': tf.reduce_mean(per), 'cer': tf.reduce_mean(cer), }, every_n_iter=1, ) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, training_hooks=[logging_hook]) # Save loss as named tensor that will be logged with the logging hook. 
        tf.identity(loss, "cross_entropy")

        # eval
        if mode == tf.estimator.ModeKeys.EVAL:
            pinyin_logits, viseme_logits, encode, attention_bias = v_p_transformer(
                inputs, inputs_unpadded_length, None, None)
            label_logits = v_c_transformer(
                tf.cast(pinyin_embedded, tf.float32), pinyin_unpadded_length,
                tf.cast(viseme_embedded, tf.float32), viseme_unpadded_length,
                encode, attention_bias, None)
            predicted_pinyin = pinyin_logits['outputs']
            sparse_pinyin_prediction, pinyin_predicted_string = self.procesess_label(
                predicted_pinyin, self.pinyin_dic)
            pinyin_predicted_char_list = label_util.string2char_list(
                pinyin_predicted_string)
            predicted_viseme = viseme_logits['outputs']
            sparse_viseme_prediction, viseme_predicted_string = self.procesess_label(
                predicted_viseme, self.viseme_dic)
            viseme_predicted_char_list = label_util.string2char_list(
                viseme_predicted_string)
            label_predictions = label_logits['outputs']
            label_predicted_index, label_predicted_string = self.procesess_label(
                label_predictions, self.label_dic)
            ver = self.cal_pinyin_metrics(viseme_char_list_labels,
                                          viseme_predicted_char_list)
            per = self.cal_pinyin_metrics(pinyin_char_list_labels,
                                          pinyin_predicted_char_list)
            cer = tf.edit_distance(label_index,
                                   tf.cast(label_predicted_index, tf.int64))
            tf.summary.scalar('ver', tf.reduce_mean(ver))
            tf.summary.scalar('per', tf.reduce_mean(per))
            tf.summary.scalar('cer', tf.reduce_mean(cer))
            eval_metric_ops = {
                'ver': tf.metrics.mean(ver),
                'per': tf.metrics.mean(per),
                'cer': tf.metrics.mean(cer),
            }

            def custom_formatter(tensors):
                hook_list = ['predicted_sentence', 'sentence_label']
                ostrs = []
                for k, v in tensors.items():
                    if k in hook_list:
                        v = [str(vv, encoding='UTF8') for vv in v]
                    ostrs.append('{}: {}'.format(k, v))
                return '\n'.join(ostrs)

            logging_hook = tf.train.LoggingTensorHook(
                {
                    'loss': loss,
                    'ver': tf.reduce_mean(ver),
                    'per': tf.reduce_mean(per),
                    'cer': tf.reduce_mean(cer),
                    'viseme_labels': viseme_string[:5],
                    'pinyin_labels': pinyin_string[:5],
                    'sentence_label': label_string[:5],
                    'predicted_viseme': viseme_predicted_string[:5],
                    'predicted_pinyin': pinyin_predicted_string[:5],
                    'predicted_sentence': label_predicted_string[:5],
                },
                every_n_iter=10,
                formatter=custom_formatter)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": label_predictions},
                eval_metric_ops=eval_metric_ops,
                evaluation_hooks=[logging_hook])
def train():
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                               global_step,
                                               DECAY_STEPS,
                                               LEARNING_RATE_DECAY_FACTOR,
                                               staircase=True)
    logits, inputs, targets, seq_len, W, b = get_train_model()
    loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len)
    cost = tf.reduce_mean(loss)
    # optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=MOMENTUM).minimize(cost, global_step=global_step)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        loss, global_step=global_step)
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len,
                                                      merge_repeated=False)
    acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))
    init = tf.global_variables_initializer()

    def do_report():
        test_inputs, test_targets, test_seq_len = get_next_batch(BATCH_SIZE)
        test_feed = {
            inputs: test_inputs,
            targets: test_targets,
            seq_len: test_seq_len
        }
        dd, log_probs, accuracy = session.run([decoded[0], log_prob, acc], test_feed)
        report_accuracy(dd, test_targets)
        # decoded_list = decode_sparse_tensor(dd)

    def do_batch():
        train_inputs, train_targets, train_seq_len = get_next_batch(BATCH_SIZE)
        feed = {
            inputs: train_inputs,
            targets: train_targets,
            seq_len: train_seq_len
        }
        b_loss, b_targets, b_logits, b_seq_len, b_cost, steps, _ = session.run(
            [loss, targets, logits, seq_len, cost, global_step, optimizer], feed)
        # print(b_loss)
        # print(b_targets, b_logits, b_seq_len)
        print(b_cost, steps)
        if steps > 0 and steps % REPORT_STEPS == 0:
            do_report()
            # save_path = saver.save(session, "ocr.model", global_step=steps)
            # print(save_path)
        return b_cost, steps

    with tf.Session() as session:
        session.run(init)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        for curr_epoch in range(num_epochs):
            print("Epoch.......", curr_epoch)
            train_cost = train_ler = 0
            for batch in range(BATCHES):
                start = time.time()
                c, steps = do_batch()
                train_cost += c * BATCH_SIZE
                seconds = time.time() - start
                print("Step:", steps, ", batch seconds:", seconds)
            train_cost /= TRAIN_SIZE
            train_inputs, train_targets, train_seq_len = get_next_batch(BATCH_SIZE)
            val_feed = {
                inputs: train_inputs,
                targets: train_targets,
                seq_len: train_seq_len
            }
            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step], feed_dict=val_feed)
            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(log.format(curr_epoch + 1, num_epochs, steps, train_cost,
                             train_ler, val_cost, val_ler,
                             time.time() - start, lr))
def train_model(ENV, train_data=None, test_data=None, decode=False, file_decode=False): graph = tf.Graph() with graph.as_default(): # e.g: log filter bank or MFCC features # Has size [batch_size, max_stepsize, num_features], but the # batch_size and max_stepsize can vary along each step inputs = tf.placeholder(tf.float32, [None, None, num_features]) targets_idx = tf.placeholder(tf.int64) targets_val = tf.placeholder(tf.int32) targets_shape = tf.placeholder(tf.int64) targets = tf.SparseTensor(targets_idx, targets_val, targets_shape) # 1d array of size [batch_size] seq_len = tf.placeholder(tf.int32, [None]) # Weights & biases weight_classes = tf.Variable( tf.truncated_normal([num_hidden, num_classes], mean=0, stddev=0.1, dtype=tf.float32)) bias_classes = tf.Variable(tf.zeros([num_classes]), dtype=tf.float32) # Network forward_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, use_peepholes=True, state_is_tuple=True) backward_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, use_peepholes=True, state_is_tuple=True) stack_forward_cell = tf.nn.rnn_cell.MultiRNNCell([forward_cell] * num_layers, state_is_tuple=True) stack_backward_cell = tf.nn.rnn_cell.MultiRNNCell([backward_cell] * num_layers, state_is_tuple=True) outputs, _ = tf.nn.bidirectional_dynamic_rnn( stack_forward_cell, stack_backward_cell, inputs, sequence_length=seq_len, time_major=False, # [batch_size, max_time, num_hidden] dtype=tf.float32) inputs_shape = tf.shape(inputs) batch_size = inputs_shape[0] """ outputs_concate = tf.concat_v2(outputs, 2) outputs_concate = tf.reshape(outputs_concate, [-1, 2*num_hidden]) # logits = tf.matmul(outputs_concate, weight_classes) + bias_classes """ fw_output = tf.reshape(outputs[0], [-1, num_hidden]) bw_output = tf.reshape(outputs[1], [-1, num_hidden]) logits = tf.add( tf.add(tf.matmul(fw_output, weight_classes), tf.matmul(bw_output, weight_classes)), bias_classes) logits = tf.reshape(logits, [batch_size, -1, num_classes]) loss = tf.reduce_mean( ctc_ops.ctc_loss(logits, targets, seq_len, time_major=False)) optimizer = tf.train.MomentumOptimizer(learning_rate, momentum).minimize(loss) # Evaluating # decoded, log_prob = ctc_ops.ctc_greedy_decoder(tf.transpose(logits, perm=[1, 0, 2]), seq_len) decoded, log_prob = ctc_ops.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), seq_len) label_error_rate = tf.reduce_mean( tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as session: session.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.all_variables(), max_to_keep=0) if not decode: ckpt = tf.train.get_checkpoint_state(ENV.output) if ckpt: print('load', ckpt.model_checkpoint_path) saver.restore(session, ckpt.model_checkpoint_path) total_train_data = len(train_data) total_test_data = len(test_data) num_batch = total_train_data for curr_epoch in range(num_epochs): start = time.time() train_cost = 0 train_ler = 0 for i in range(num_batch - 1): feed = { inputs: train_data[i][0], targets_idx: train_data[i][1][0], targets_val: train_data[i][1][1], targets_shape: train_data[i][1][2], seq_len: train_data[i][2] } batch_cost, _ = session.run([loss, optimizer], feed) train_cost += batch_cost * batch_size train_ler += session.run(label_error_rate, feed_dict=feed) * batch_size log = "Epoch {}/{}, iter {}, batch_cost {}" logging.info( log.format(curr_epoch + 1, num_epochs, i, batch_cost)) train_cost /= num_batch train_ler /= num_batch saver.save(session, 
os.path.join(ENV.output, 'best.ckpt'), global_step=curr_epoch)
                feed_test = {
                    inputs: test_data[0][0],
                    targets_idx: test_data[0][1][0],
                    targets_val: test_data[0][1][1],
                    targets_shape: test_data[0][1][2],
                    seq_len: test_data[0][2]
                }
                test_cost, test_ler = session.run([loss, label_error_rate],
                                                  feed_dict=feed_test)
                log = "Epoch {}/{}, test_cost {}, test_ler {}"
                logging.info(log.format(curr_epoch + 1, num_epochs,
                                        test_cost, test_ler))
        else:
            ckpt = tf.train.get_checkpoint_state(ENV.model_path)
            print('load', ckpt.model_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)
            while True:
                if file_decode:
                    wav_file = input('Enter the wav file path:')
                else:
                    wav_file = 'temp.wav'
                    input('Press Enter to start...')
                    try:
                        sox = subprocess.Popen([
                            'sox', '-d', '-b', '16', '-c', '1', '-r', '16000',
                            'temp.wav'
                        ])
                        sox.communicate()
                    except KeyboardInterrupt:
                        os.kill(sox.pid, signal.SIGTERM)
                        if sox.poll() is None:
                            time.sleep(2)
                    print('Done recording')
                features = process_wav(wav_file)
                batch_features = np.array([features for i in range(16)])
                batch_seq_len = np.array([features.shape[0] for i in range(16)])
                print(batch_features.shape)
                feed = {inputs: batch_features, seq_len: batch_seq_len}
                d, oc = session.run([decoded[0], outputs], feed_dict=feed)
                dsp = d.shape
                res = []
                for label in d.values[:dsp[1]]:
                    for k, v in phoneme_set_39.items():
                        if v == label + 1:
                            res.append(k)
                print(res)
def _get_testing(rnn_logits, sequence_length, label, label_length): """Create ops for testing (all scalars): label_error: Normalized edit distance on beam search max sequence_error: Normalized sequence error rate """ with tf.name_scope("evaluate"): predictions, _ = tf.nn.ctc_beam_search_decoder(rnn_logits, sequence_length, beam_width=128, top_paths=1, merge_repeated=True) hypothesis = tf.cast(predictions[0], tf.int32) # for edit_distance # Per-sequence statistic num_label_errors = tf.edit_distance(hypothesis, label, normalize=False) # Per-batch summary counts batch_num_label_errors = tf.reduce_sum(num_label_errors) batch_num_sequence_errors = tf.count_nonzero(num_label_errors, axis=0) batch_num_labels = tf.reduce_sum(label_length) batch_size = tf.shape(label_length)[0] # Wide integer type casts (prefer unsigned, but truediv dislikes those) batch_num_label_errors = tf.cast(batch_num_label_errors, tf.int64) batch_num_sequence_errors = tf.cast(batch_num_sequence_errors, tf.int64) batch_num_labels = tf.cast(batch_num_labels, tf.int64) batch_size = tf.cast(batch_size, tf.int64) # Variables to tally across batches (all initially zero) # Make sure the variables are local so the Saver doesn't try to read them # from the saved model checkpoint var_collections = [tf.GraphKeys.LOCAL_VARIABLES] total_num_label_errors = tf.Variable(0, trainable=False, name='total_num_label_errors', dtype=tf.int64, collections=var_collections) total_num_sequence_errors = tf.Variable( 0, trainable=False, name='total_num_sequence_errors', dtype=tf.int64, collections=var_collections) total_num_labels = tf.Variable(0, trainable=False, name='total_num_labels', dtype=tf.int64, collections=var_collections) total_num_sequences = tf.Variable(0, trainable=False, name='total_num_sequences', dtype=tf.int64, collections=var_collections) # Create the "+=" update ops and group together as one update_label_errors = tf.assign_add(total_num_label_errors, batch_num_label_errors) update_num_labels = tf.assign_add(total_num_labels, batch_num_labels) update_sequence_errors = tf.assign_add(total_num_sequence_errors, batch_num_sequence_errors) update_num_sequences = tf.assign_add(total_num_sequences, batch_size) update_metrics = tf.group(update_label_errors, update_num_labels, update_sequence_errors, update_num_sequences) metrics = [ total_num_label_errors, total_num_labels, total_num_sequence_errors, total_num_sequences ] # Tensors to make final calculations label_error = tf.truediv(total_num_label_errors, total_num_labels, name='label_error') sequence_error = tf.truediv(total_num_sequence_errors, total_num_sequences, name='sequence_error') return label_error, sequence_error, update_metrics, metrics, predictions
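A sketch of how the ops returned above might be driven: run `update_metrics` once per test batch so the local tally variables accumulate, then read the ratio tensors at the end. The session and loop scaffolding here is assumed, not part of the original:

# Assumed driver loop for the streaming test metrics (illustrative).
sess.run(tf.local_variables_initializer())   # zero the local tally variables
for _ in range(num_test_batches):            # num_test_batches is hypothetical
    sess.run(update_metrics)
cer, ser = sess.run([label_error, sequence_error])
print('label error %.4f, sequence error %.4f' % (cer, ser))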
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.python.ops import math_ops


def cer(decoded, targets, targets_length):
    """Character error rate: normalized edit distance between the dense
    decoder output and the dense targets (converted to sparse form)."""
    greedy_decoded = tf.sparse.from_dense(decoded)
    sparse_targets = tf.cast(
        K.ctc_label_dense_to_sparse(
            targets, math_ops.cast(K.flatten(targets_length), dtype='int32')),
        'int32')
    return tf.edit_distance(tf.cast(greedy_decoded, tf.int32),
                            sparse_targets, normalize=True)
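A small driver for the function above, under the assumption that `decoded` is a dense [batch, time] integer tensor with zero padding and the targets are padded the same way; the tensors here are made up for illustration:

# Hypothetical dense inputs: one sample, labels (1, 2, 3) vs target (1, 2, 4).
decoded = tf.constant([[1, 2, 3]], dtype=tf.int64)
targets = tf.constant([[1, 2, 4]], dtype=tf.int32)
targets_length = tf.constant([3], dtype=tf.int32)
rate = cer(decoded, targets, targets_length)  # ~[0.333]: one substitution over three characters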
def _build_graph(self, inputs): l, labelidx, labelvalue, labelshape, seqlen = inputs tf.summary.image('input_img', l) label = tf.SparseTensor(labelidx, labelvalue, labelshape) l = tf.cast(l, tf.float32) l = l / 255.0 * 2 - 1 self.batch_size = tf.shape(l)[0] # cnn part with tf.variable_scope('cnn') as scope: feature_height = cfg.input_height for i, kernel_height in enumerate(cfg.cnn.kernel_heights): out_channel = cfg.cnn.channels[i] kernel_width = cfg.cnn.kernel_widths[i] stride = cfg.cnn.stride[i] l = Conv2D('conv.{}'.format(i), l, out_channel, (kernel_height, kernel_width), cfg.cnn.padding, stride=(1, stride)) if cfg.cnn.with_bn: l = BatchNorm('bn.{}'.format(i), l) l = tf.clip_by_value(l, 0, 20, "clipped_relu.{}".format(i)) if cfg.cnn.padding == "VALID": feature_height = feature_height - kernel_height + 1 seqlen = tf.cast( tf.ceil( (tf.cast(seqlen, tf.float32) - kernel_width + 1) / stride), tf.int32) else: seqlen = tf.cast( tf.ceil((tf.cast(seqlen, tf.float32)) / stride), tf.int32) feature_size = feature_height * out_channel # rnn part l = tf.transpose(l, perm=[0, 2, 1, 3]) l = tf.reshape(l, [self.batch_size, -1, feature_size]) if cfg.rnn.hidden_layers_no > 0: cell_fw = [ tf.nn.rnn_cell.BasicLSTMCell(cfg.rnn.hidden_size) for _ in range(cfg.rnn.hidden_layers_no) ] cell_bw = [ tf.nn.rnn_cell.BasicLSTMCell(cfg.rnn.hidden_size) for _ in range(cfg.rnn.hidden_layers_no) ] l = tf.contrib.rnn.stack_bidirectional_dynamic_rnn( cell_fw, cell_bw, l, dtype=tf.float32) feature_size = cfg.rnn.hidden_size # fc part l = tf.reshape(l[0], [-1, 2 * feature_size]) # l = tf.reshape(l, [-1, feature_size]) output = BatchNorm('bn', l) logits = FullyConnected( 'fc', output, cfg.label_size, nl=tf.identity, W_init=tf.truncated_normal_initializer(stddev=0.01)) logits = tf.reshape(logits, (self.batch_size, -1, cfg.label_size)) softmaxed_logits = tf.nn.softmax(logits, name='logits') # ctc output loss = tf.nn.ctc_loss(inputs=logits, labels=label, sequence_length=seqlen, ignore_longer_outputs_than_inputs=True, time_major=False) if cfg.hard_sample_mining: self.cost = hard_loss(loss, self.hard_sample_num, name='cost') else: self.cost = tf.reduce_mean(loss, name='cost') # prediction error logits = tf.transpose(logits, [1, 0, 2]) isTrain = get_current_tower_context().is_training predictions = tf.to_int32( tf.nn.ctc_greedy_decoder(inputs=logits, sequence_length=seqlen)[0][0]) # predictions = tf.to_int32(tf.nn.ctc_beam_search_decoder(inputs=logits, # sequence_length=seqlen)[0][0]) dense_pred = tf.sparse_tensor_to_dense(predictions, name="prediction") err = tf.edit_distance(predictions, label, normalize=True) err.set_shape([None]) err = tf.reduce_mean(err, name='error') summary.add_moving_summary(err, self.cost)
def run_ctc():
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features])

        # Here we use sparse_placeholder that will generate a
        # SparseTensor required by ctc_loss op.
        targets = tf.sparse_placeholder(tf.int32)

        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None])

        # Defining the cell (other RNN cell types can be swapped in here)
        cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

        # Stacking rnn cells
        stack = tf.contrib.rnn.MultiRNNCell([cell] * num_layers,
                                            state_is_tuple=True)

        # The second output is the last state and we will not use that
        outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

        shape = tf.shape(inputs)
        batch_s, max_time_steps = shape[0], shape[1]

        # Reshaping to apply the same weights over the timesteps
        outputs = tf.reshape(outputs, [-1, num_hidden])

        # Truncated normal with mean 0 and stdev=0.1
        # Tip: Try another initialization
        W = tf.Variable(tf.truncated_normal([num_hidden, num_classes], stddev=0.1))
        # Zero initialization
        # Tip: Is tf.zeros_initializer the same?
        b = tf.Variable(tf.constant(0., shape=[num_classes]))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        loss = tf.nn.ctc_loss(targets, logits, seq_len)
        cost = tf.reduce_mean(loss)

        optimizer = tf.train.MomentumOptimizer(learning_rate=0.005,
                                               momentum=0.9).minimize(cost)

        # Greedy decoding; tf.nn.ctc_beam_search_decoder is slower
        # but you'll get better results
        decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

        # Inaccuracy: label error rate
        ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets))

    files = find_files("/home/burak/Downloads/vctk-p225-small/wav48/p225")

    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        for curr_epoch in range(num_epochs):
            train_cost = train_ler = 0
            for batch in range(num_batches_per_epoch):
                filename = random.choice(files)
                txtfile = filename.replace("wav48", "txt")
                txtfile = txtfile.replace(".wav", ".txt")
                txt = open(txtfile).read()
                audio = read_audio_from_filename(filename, sample_rate)
                out = convert_inputs_to_ctc_format(audio, sample_rate, txt)
                train_inputs, train_targets, train_seq_len, original = out
                feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}
                batch_cost, _ = session.run([cost, optimizer], feed)
                train_ler += session.run(ler, feed_dict=feed)
                print('batch_cost', batch_cost, 'train_ler', train_ler)
                # Decoding
                d = session.run(decoded[0], feed_dict=feed)
                str_decoded = ''.join([chr(x) for x in np.asarray(d[1]) + FIRST_INDEX])
                # Replacing blank label to none
                str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
                # Replacing space label to space
                str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')
                print('Original: %s' % original)
                print('Decoded: %s' % str_decoded)
            if curr_epoch % 10 == 0:
                saver.save(session, mfile)
def prediction(logits, seq_length, label):
    logits = tf.transpose(logits, perm=[1, 0, 2])
    predict = tf.to_int32(
        tf.nn.ctc_beam_search_decoder(logits, seq_length,
                                      merge_repeated=False)[0][0])
    error = tf.reduce_sum(tf.edit_distance(predict, label, normalize=False)) \
        / tf.to_float(tf.size(label.values))
    return error
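This variant sums the raw (unnormalized) edit distances and divides by the total number of target characters, giving a corpus-level character error rate rather than a mean of per-sample rates. A sketch of the difference, reusing the names from the function above for illustration only:

# Two aggregation styles for the same edit distances (illustrative).
per_sample = tf.edit_distance(predict, label, normalize=True)   # one rate per sample
mean_of_rates = tf.reduce_mean(per_sample)                      # short samples weigh as much as long ones
raw = tf.edit_distance(predict, label, normalize=False)
corpus_rate = tf.reduce_sum(raw) / tf.to_float(tf.size(label.values))  # weighted by sample length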
# Swap dimensions to time major for CTC loss. logits = tf.transpose(logits, (1, 0, 2)) loss = ctc.ctc_loss(targets, logits, seq_len) cost = tf.reduce_mean(loss) # Record the loss tf.contrib.deprecated.scalar_summary('loss', cost) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True).minimize(cost) decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits, sequence_length=seq_len) # Label error rate using the edit distance between output and target ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) # Record the label error rate tf.contrib.deprecated.scalar_summary('label error rate', ler) saver = tf.train.Saver() merged = tf.contrib.deprecated.merge_all_summaries() train_writer = tf.summary.FileWriter('./summaries/train', graph) test_writer = tf.summary.FileWriter('./summaries/test', graph) def test_decoding(input_feed_dict, input_original): """ Runs the classifier on a feed dictionary and prints the decoded predictions. """ d = session.run(decoded, feed_dict=input_feed_dict)
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): self.inputs = tf.placeholder( tf.float32, [None, utils.image_width, utils.image_height, 1]) '''with tf.variable_scope('STN'): #Localisation net conv1_loc = slim.conv2d(self.inputs, 32, [3, 3], scope='conv1_loc') pool1_loc = slim.max_pool2d(conv1_loc, [2, 2], scope='pool1_loc') conv2_loc = slim.conv2d(pool1_loc, 64, [3, 3], scope='conv2_loc') pool2_loc = slim.max_pool2d(conv2_loc, [2, 2], scope='pool2_loc') pool2_loc_flat = slim.flatten(pool2_loc) fc1_loc = slim.fully_connected(pool2_loc_flat, 1024, scope='fc1_loc') fc2_loc = slim.fully_connected(fc1_loc, 128, scope='fc2_loc') W = tf.Variable(tf.zeros([128, 20])) b = tf.Variable(initial_value=[-1, -0.2, -0.5, -0.35, 0, -0.5, 0.5, -0.67, 1, -0.8, -1, 0.8, -0.5, 0.65, 0, 0.5, 0.5, 0.33, 1, 0.2], dtype=tf.float32) # fc3_loc=tf.layers.dense(fc2_loc,20,activation=tf.nn.tanh,kernel_initializer=tf.zeros_initializer) # fc3_loc = slim.fully_connected(fc2_loc, 8, activation_fn=tf.nn.tanh, scope='fc3_loc') # spatial transformer fc3_loc = tf.nn.tanh(tf.matmul(fc2_loc, W) + b) loc = tf.reshape(fc3_loc, [-1, 10, 2]) # spatial transformer s = np.array([[-0.95, -0.95], [-0.5, -0.95], [0, -0.95], [0.5, -0.95], [0.95, -0.95], [-0.95, 0.95], [-0.5, 0.95], [0, 0.95], [0.5, 0.95], [0.95,0.95]] * 256) s = tf.constant(s.reshape([256, 10, 2]), dtype=tf.float32) self.h_trans = stn(self.inputs, s, loc, (utils.image_width, utils.image_height))''' if FLAGS.Use_CRNN: with tf.variable_scope('CNN'): self.keep_prob_cv1 = tf.placeholder("float") self.keep_prob_cv2 = tf.placeholder("float") self.keep_prob_cv3 = tf.placeholder("float") self.keep_prob_cv4 = tf.placeholder("float") net = slim.conv2d(self.inputs, 64, [3, 3], scope='conv1') net = tf.nn.dropout(net, self.keep_prob_cv1) net = slim.max_pool2d(net, [2, 2], scope='pool1') net = slim.conv2d(net, 128, [3, 3], scope='conv2') net = slim.max_pool2d(net, [2, 2], scope='pool2') net = slim.conv2d(net, 256, [3, 3], activation_fn=None, scope='conv3') net = tf.layers.batch_normalization(net, training=is_training) net = tf.nn.relu(net) net = tf.nn.dropout(net, self.keep_prob_cv2) net = slim.conv2d(net, 256, [3, 3], scope='conv4') net = slim.max_pool2d(net, [2, 2], [1, 2], scope='pool3') net = slim.conv2d(net, 512, [3, 3], activation_fn=None, scope='conv5') net = tf.nn.dropout(net, self.keep_prob_cv3) net = tf.layers.batch_normalization(net, training=is_training) net = tf.nn.relu(net) net = slim.conv2d(net, 512, [3, 3], scope='conv6') net = slim.max_pool2d(net, [2, 2], [1, 2], scope='pool4') net = slim.conv2d(net, 512, [2, 2], padding='VALID', activation_fn=None, scope='conv7') net = tf.nn.dropout(net, self.keep_prob_cv4) net = tf.layers.batch_normalization(net, training=is_training) net = tf.nn.relu(net) self.cnn_time = net.get_shape().as_list()[1] self.num_feauture = 512 else: with tf.variable_scope('Dense_CNN'): nb_filter = 64 net = tf.layers.conv2d(self.inputs, nb_filter, 5, (2, 2), "SAME", use_bias=False) net, nb_filter = dense_block(net, 8, 8, nb_filter, is_training) net, nb_filter = transition_block(net, 128, is_training, pooltype=2) net, nb_filter = dense_block(net, 8, 8, nb_filter, is_training) net, nb_filter = transition_block(net, 128, is_training, pooltype=3) net, nb_filter = dense_block(net, 8, 8, nb_filter, is_training) #net, nb_filter = transition_block(net, 128, is_training, pooltype=3) print(net) #net = tf.layers.conv2d(net, nb_filter, 3, (1, 2), "SAME", use_bias=True) self.cnn_time = net.get_shape().as_list()[1] 
self.num_feauture = 4 * 192 temp_inputs = net with tf.variable_scope('BLSTM'): self.labels = tf.sparse_placeholder(tf.int32) self.seq_len = tf.placeholder(tf.int32, [None]) self.lstm_inputs = tf.reshape( temp_inputs, [-1, self.cnn_time, self.num_feauture]) outputs = stacked_bidirectional_rnn(tf.contrib.rnn.LSTMCell, FLAGS.num_hidden, 2, self.lstm_inputs, self.seq_len) shape = tf.shape(self.lstm_inputs) batch_s, max_timesteps = shape[0], 40 outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden * 2]) self.keep_prob_fc = tf.placeholder("float") h_fc1_drop = tf.nn.dropout(outputs, self.keep_prob_fc) W = tf.Variable(tf.truncated_normal( [FLAGS.num_hidden * 2, num_classes], stddev=0.1, dtype=tf.float32), name='W') b = tf.Variable( tf.constant(0., dtype=tf.float32, shape=[num_classes], name='b')) logits = tf.matmul(outputs, W) + b logits = tf.reshape(logits, [batch_s, -1, num_classes]) self.logits_before_ctc = tf.argmax(logits, 2) logits = tf.transpose(logits, (1, 0, 2)) self.global_step = tf.Variable(0, trainable=False) print( "###########################################################") print(self.labels) print(logits) print(self.seq_len) self.loss = tf.nn.ctc_loss(labels=self.labels, inputs=logits, sequence_length=self.seq_len) self.cost = tf.reduce_mean(self.loss) self.learning_rate = tf.train.exponential_decay( FLAGS.initial_learning_rate, self.global_step, FLAGS.decay_steps, FLAGS.decay_rate, staircase=True) self.optimizer = tf.train.MomentumOptimizer( learning_rate=self.learning_rate, momentum=FLAGS.momentum, use_nesterov=True).minimize(self.cost, global_step=self.global_step) self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder( logits, self.seq_len, merge_repeated=False) self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0], default_value=-1) self.lerr = tf.reduce_mean( tf.edit_distance(tf.cast(self.decoded[0], tf.int32), self.labels)) tf.summary.scalar('cost', self.cost) self.merged_summay = tf.summary.merge_all()
def CheckpointTest():
    # input_tensor holds the input audio. As analyzed earlier, its shape is
    # [batch_size, amax_stepsize, n_input + (2 * n_input * n_context)], where
    # batch_size is the batch length, amax_stepsize the number of time steps,
    # and n_input + (2 * n_input * n_context) the number of MFCC features.
    # batch_size varies, so it is set to None; the number of time steps also
    # differs between batches, so amax_stepsize is None as well.
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
    # targets holds the sparse tensor for the transcript of the audio,
    # so it is created with sparse_placeholder
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    # seq_length holds the time-step lengths of the current batch
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
    # keep_dropout is the dropout keep probability
    keep_dropout = tf.placeholder(tf.float32)

    # logits is the non-normalized output/activations from the last layer.
    # logits will be input for the loss function.
    # nn_model is from the import statement in the load_model function
    logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input,
                         n_context, words_size + 1, keep_dropout)

    # Compute the loss with CTC loss
    aa = ctc_ops.ctc_loss(targets, logits, seq_length)
    avg_loss = tf.reduce_mean(aa)

    # Optimizer
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)

    # CTC decoder
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_length,
                                                       merge_repeated=True)

    # Compute the edit distance
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')

    # Number of epochs
    epochs = 150
    # Directory for saving the model; create it if it does not exist yet
    savedir = "saver/"
    if not os.path.exists(savedir):
        os.mkdir(savedir)
    # Create the saver
    saver = tf.train.Saver(max_to_keep=1)
    # Create the session
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())
        # Restore the latest checkpoint if one exists; otherwise keep the
        # fresh initialization
        kpt = tf.train.latest_checkpoint(savedir)
        print("kpt:", kpt)
        startepo = 0
        if kpt is not None:
            saver.restore(sess, kpt)
            ind = kpt.find("-")
            startepo = int(kpt[ind + 1:])
        # The audio file to recognize
        wav_file = 'input.wav'
        source, source_lengths, sparse_labels = get_speech_file(wav_file, labels)
        feed2 = {
            input_tensor: source,
            targets: sparse_labels,
            seq_length: source_lengths,
            keep_dropout: 1.0
        }
        d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
        dense_decoded = tf.sparse_tensor_to_dense(
            d, default_value=-1).eval(session=sess)
        if len(dense_decoded) > 0:
            decoded_str = ndarray_to_text_ch(dense_decoded[0], words)
            print('Decoded: {}'.format(decoded_str))
import tensorflow as tf
import numpy as np

# Both SparseTensors hold a batch of two strings, so the first entry of
# dense_shape must be 2 (the original (1, 5) made the row-1 indices invalid).
x = tf.SparseTensor(
    [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]],
    ["s", "i", "a", "l", "u", "s", "i", "a", "l", "u"],
    (2, 5))
target = tf.SparseTensor(
    [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5],
     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5]],
    ["h", "a", "n", "d", "a", "l", "s", "y", "a", "l", "o", "m"],
    (2, 6))
ler = tf.edit_distance(x, target)

with tf.Session() as sess:
    _ler = sess.run(ler)
    print(_ler)
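With these shapes, `tf.edit_distance` defaults to normalize=True, so each row's raw distance is divided by the length of the corresponding truth row: "sialu" vs "handal" takes 5 edits (5/6 ≈ 0.83) and "sialu" vs "syalom" takes 3 (3/6 = 0.5). The run should therefore print approximately:

[0.8333334 0.5      ]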
def build_graph(self, args, maxTimeSteps): self.graph = tf.Graph() with self.graph.as_default(): # according to DeepSpeech2 paper, input is the spectrogram power of audio, but if you like, # you can also use mfcc feature as the input. self.inputX = tf.placeholder(tf.float32, shape=(maxTimeSteps, args.batch_size, args.num_feature)) inputXrs = tf.reshape( self.inputX, [args.batch_size, args.num_feature, maxTimeSteps, 1]) #self.inputList = tf.split(inputXrs, maxTimeSteps, 0) # convert inputXrs from [32*maxL,39] to [32,maxL,39] self.targetIxs = tf.placeholder(tf.int64) self.targetVals = tf.placeholder(tf.int32) self.targetShape = tf.placeholder(tf.int64) self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals, self.targetShape) self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size)) self.config = { 'name': args.model, 'rnncell': self.cell_fn, 'num_layer': args.num_layer, 'num_hidden': args.num_hidden, 'num_class': args.num_class, 'activation': args.activation, 'optimizer': args.optimizer, 'learning rate': args.learning_rate, 'keep prob': args.keep_prob, 'batch size': args.batch_size } output_fc = build_deepSpeech2(self.args, maxTimeSteps, self.inputX, self.cell_fn, self.seqLengths) self.loss = tf.reduce_mean( tf.nn.ctc_loss(self.targetY, output_fc, self.seqLengths)) self.var_op = tf.global_variables() self.var_trainable_op = tf.trainable_variables() if args.grad_clip == -1: # not apply gradient clipping self.optimizer = tf.train.AdamOptimizer( args.learning_rate).minimize(self.loss) else: # apply gradient clipping grads, _ = tf.clip_by_global_norm( tf.gradients(self.loss, self.var_trainable_op), args.grad_clip) opti = tf.train.AdamOptimizer(args.learning_rate) self.optimizer = opti.apply_gradients( zip(grads, self.var_trainable_op)) self.predictions = tf.to_int32( tf.nn.ctc_beam_search_decoder(output_fc, self.seqLengths, merge_repeated=False)[0][0]) if args.level == 'cha': self.errorRate = tf.reduce_sum( tf.edit_distance(self.predictions, self.targetY, normalize=True)) self.initial_op = tf.global_variables_initializer() self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5, keep_checkpoint_every_n_hours=1)
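Note that the snippet above sums the normalized per-sample distances, so `errorRate` grows with the batch size. If a batch-mean character error rate is wanted instead, a one-line variant (a sketch, reusing the snippet's own tensors) would be:

# Batch-mean CER instead of a batch-size-dependent sum (sketch).
self.meanErrorRate = tf.reduce_mean(
    tf.edit_distance(self.predictions, self.targetY, normalize=True))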
def train(): global_step = tf.Variable(0, trainable=False) learning_rate = tf.train.exponential_decay(common.INITIAL_LEARNING_RATE, global_step, common.DECAY_STEPS, common.LEARNING_RATE_DECAY_FACTOR, staircase=True) logits, inputs, targets, seq_len, W, b = model.get_train_model() loss = tf.nn.ctc_loss(targets, logits, seq_len) cost = tf.reduce_mean(loss) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=common.MOMENTUM).minimize(cost, global_step=global_step) # Option 2: tf.contrib.ctc.ctc_beam_search_decoder # (it's slower but you'll get better results) decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False) # Accuracy: label error rate acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) # Initializate the weights and biases init = tf.global_variables_initializer() def do_report(): test_feed = {inputs: test_inputs, targets: test_targets, seq_len: test_seq_len} dd, log_probs, accuracy = session.run([decoded[0], log_prob, acc], test_feed) report_accuracy(dd, test_targets) # decoded_list = decode_sparse_tensor(dd) def do_batch(): feed = {inputs: train_inputs, targets: train_targets, seq_len: train_seq_len} b_cost, steps, _ = session.run([cost, global_step, optimizer], feed) if steps > 0 and steps % common.REPORT_STEPS == 0: do_report() save_path = saver.save(session, "models/ocr.model", global_step=steps) #print(save_path) return b_cost, steps config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as session: session.run(init) saver = tf.train.Saver(tf.global_variables(), max_to_keep=100) for curr_epoch in range(num_epochs): # variables = tf.all_variables() # for i in variables: # print(i.name) print("Epoch.......", curr_epoch) train_cost = train_ler = 0 for batch in range(common.BATCHES): start = time.time() train_inputs, train_targets, train_seq_len = utils.get_data_set('train', batch * common.BATCH_SIZE, (batch + 1) * common.BATCH_SIZE) #print("get data time", time.time() - start) start = time.time() c, steps = do_batch() train_cost += c * common.BATCH_SIZE seconds = time.time() - start print("Step:", steps, ", batch seconds:", seconds) train_cost /= common.TRAIN_SIZE # train_ler /= common.TRAIN_SIZE val_feed = {inputs: train_inputs, targets: train_targets, seq_len: train_seq_len} val_cost, val_ler, lr, steps = session.run([cost, acc, learning_rate, global_step], feed_dict=val_feed) log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}" print(log.format(curr_epoch + 1, num_epochs, steps, train_cost, train_ler, val_cost, val_ler, time.time() - start, lr))
def crnn(self, max_width, batch_size): def BidirectionnalRNN(inputs, seq_len): """ Bidirectionnal LSTM Recurrent Neural Network part """ with tf.variable_scope(None, default_name="bidirectional-rnn-1"): # Forward lstm_fw_cell_1 = rnn.BasicLSTMCell(256) # Backward lstm_bw_cell_1 = rnn.BasicLSTMCell(256) inter_output, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell_1, lstm_bw_cell_1, inputs, seq_len, dtype=tf.float32) inter_output = tf.concat(inter_output, 2) with tf.variable_scope(None, default_name="bidirectional-rnn-2"): # Forward lstm_fw_cell_2 = rnn.BasicLSTMCell(256) # Backward lstm_bw_cell_2 = rnn.BasicLSTMCell(256) outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell_2, lstm_bw_cell_2, inter_output, seq_len, dtype=tf.float32) outputs = tf.concat(outputs, 2) return outputs def CNN(inputs): """ Convolutionnal Neural Network part """ # 64 / 3 x 3 / 1 / 1 conv1 = tf.layers.conv2d(inputs=inputs, filters = 64, kernel_size = (3, 3), padding = "same", activation=tf.nn.relu) # 2 x 2 / 1 pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) # 128 / 3 x 3 / 1 / 1 conv2 = tf.layers.conv2d(inputs=pool1, filters = 128, kernel_size = (3, 3), padding = "same", activation=tf.nn.relu) # 2 x 2 / 1 pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) # 256 / 3 x 3 / 1 / 1 conv3 = tf.layers.conv2d(inputs=pool2, filters = 256, kernel_size = (3, 3), padding = "same", activation=tf.nn.relu) # Batch normalization layer bnorm1 = tf.layers.batch_normalization(conv3) # 256 / 3 x 3 / 1 / 1 conv4 = tf.layers.conv2d(inputs=bnorm1, filters = 256, kernel_size = (3, 3), padding = "same", activation=tf.nn.relu) # 1 x 2 / 1 pool3 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=[1, 2], padding="same") # 512 / 3 x 3 / 1 / 1 conv5 = tf.layers.conv2d(inputs=pool3, filters = 512, kernel_size = (3, 3), padding = "same", activation=tf.nn.relu) # Batch normalization layer bnorm2 = tf.layers.batch_normalization(conv5) # 512 / 3 x 3 / 1 / 1 conv6 = tf.layers.conv2d(inputs=bnorm2, filters = 512, kernel_size = (3, 3), padding = "same", activation=tf.nn.relu) # 1 x 2 / 2 pool4 = tf.layers.max_pooling2d(inputs=conv6, pool_size=[2, 2], strides=[1, 2], padding="same") # 512 / 2 x 2 / 1 / 0 conv7 = tf.layers.conv2d(inputs=pool4, filters = 512, kernel_size = (2, 2), padding = "valid", activation=tf.nn.relu) return conv7 inputs = tf.placeholder(tf.float32, [batch_size, max_width, 32, 1]) # Our target output targets = tf.sparse_placeholder(tf.int32, name='targets') # The length of the sequence seq_len = tf.placeholder(tf.int32, [None], name='seq_len') cnn_output = CNN(inputs) reshaped_cnn_output = tf.reshape(cnn_output, [batch_size, -1, 512]) max_char_count = reshaped_cnn_output.get_shape().as_list()[1] crnn_model = BidirectionnalRNN(reshaped_cnn_output, seq_len) logits = tf.reshape(crnn_model, [-1, 512]) W = tf.Variable(tf.truncated_normal([512, config.NUM_CLASSES], stddev=0.1), name="W") b = tf.Variable(tf.constant(0., shape=[config.NUM_CLASSES]), name="b") logits = tf.matmul(logits, W) + b logits = tf.reshape(logits, [batch_size, -1, config.NUM_CLASSES]) # Final layer, the output of the BLSTM logits = tf.transpose(logits, (1, 0, 2)) # Loss and cost calculation loss = tf.nn.ctc_loss(targets, logits, seq_len) cost = tf.reduce_mean(loss) # Training step optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost) # The decoded answer decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len) dense_decoded = tf.sparse_tensor_to_dense(decoded[0], 
default_value=-1) # The error rate acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) init = tf.global_variables_initializer() return inputs, targets, seq_len, logits, dense_decoded, optimizer, acc, cost, max_char_count, init
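A minimal driver for the ops returned above, assuming the method is exposed on a model object; `batch_x`, `sparse_labels` (a tf.SparseTensorValue), and the chosen sizes are illustrative stand-ins rather than values from the snippet:

# Illustrative training step with the ops returned by crnn() (sketch).
inputs, targets, seq_len, logits, dense_decoded, optimizer, acc, cost, max_char_count, init = \
    model.crnn(max_width=100, batch_size=64)   # hypothetical sizes
with tf.Session() as sess:
    sess.run(init)
    feed = {inputs: batch_x,                   # [64, 100, 32, 1] image batch
            targets: sparse_labels,            # tf.SparseTensorValue of label ids
            seq_len: [max_char_count] * 64}
    _, batch_cost, batch_err = sess.run([optimizer, cost, acc], feed_dict=feed)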
def main(_): batch_size = FLAGS.batch_size # num_readers = 4 num_epochs = FLAGS.epoch checkpoint_dir = FLAGS.checkpoint_dir with tf.Graph().as_default(): # deploy_config = model_deploy.DeploymentConfig() # Create global_step. global_step = tf.Variable(0, name='global_step', trainable=False) tr_file_name = os.path.join("/mnt/sdb/mark/mjsyth", "mjsynth_train.tfrecords") te_file_name = os.path.join("/mnt/sdb/mark/mjsyth", "mjsynth_val.tfrecords") sh_images, sh_labels, sh_length= read_utils.inputs( filename=[tr_file_name], batch_size=batch_size, num_epochs=num_epochs) val_images, val_labels, val_length= read_utils.inputs( filename=[te_file_name], batch_size=batch_size, num_epochs=1000) # Build Model crnn = model.CRNNNet() with tf.variable_scope('crnn'): logits, seq_len = crnn.net(sh_images, is_training=True) tf.get_variable_scope().reuse_variables() val_logits, val_seq_len = crnn.net(val_images, is_training=False) loss = crnn.losses(sh_labels, logits, seq_len) tf.summary.scalar("train/loss", loss) val_loss = crnn.losses(val_labels, val_logits, val_seq_len) # TODO: BK-tree NN search decoded, log_prob = tf.nn.ctc_beam_search_decoder(tf.transpose(val_logits, perm=[1, 0, 2]), val_seq_len, merge_repeated=False) acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels, normalize=False)) acc_norm = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels)) val_loss_sum = tf.placeholder(tf.float32, name='val_loss_sum') val_acc_sum = tf.placeholder(tf.float32, name='val_acc_sum') val_acc_norm_sum = tf.placeholder(tf.float32, name='val_acc_norm_sum') tf.summary.scalar("test/val_loss", val_loss_sum) tf.summary.scalar("test/edit_distance", val_acc_sum) tf.summary.scalar("test/edit_distance_norm", val_acc_norm_sum) starter_learning_rate = FLAGS.learning_rate learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 500000, 0.5, staircase=True) tf.summary.scalar("train/learning_rate",learning_rate) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss=loss, global_step=global_step) # Start Training with tf.Session(config=config) as sess: save = tf.train.Saver(max_to_keep=50) if not FLAGS.load: init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) # Start input enqueue threads. else: # ckpt_file = 'model.ckpt-' + FLAGS.ckpt_step ckpt_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.ckpt_file) save.restore(sess, ckpt_path) sess.run(tf.local_variables_initializer()) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) #sess = tf_debug.LocalCLIDebugWrapperSession(sess) merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='train/*')) val_merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='test/*')) file_writer = tf.summary.FileWriter(FLAGS.logdir, sess.graph) try: while not coord.should_stop(): start_time = time.time() _, merged_t, tr_loss, lr, step, db_lables, db_images, db_logits = sess.run([optimizer, merged, loss, learning_rate, global_step, sh_labels, sh_images, logits]) duration = time.time() - start_time print("loss", tr_loss, "time", duration) file_writer.add_summary(merged_t, step) # Print an overview fairly often. 
if step % 10000 == 0: ####################################################### val_loss_s, val_acc_s, val_acc_norm_s = 0, 0, 0 for ite in range(FLAGS.sample_size): te_loss, te_acc, te_acc_norm = sess.run([val_loss, acc, acc_norm]) val_loss_s += te_loss val_acc_s += te_acc val_acc_norm_s += te_acc_norm val_loss_s /= FLAGS.sample_size val_acc_s /= FLAGS.sample_size val_acc_norm_s /= FLAGS.sample_size print('Step %d: loss %.3f acc %.3f %.3f (%.3f sec)' % (step, val_loss_s, val_acc_s, val_acc_norm_s, duration)) # Add summary val_sum = sess.run(val_merged, feed_dict={val_loss_sum: val_loss_s, val_acc_sum: val_acc_s, val_acc_norm_sum: val_acc_norm_s}) file_writer.add_summary(val_sum, step) save.save(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt'), global_step=step) except tf.errors.OutOfRangeError: print('Done training for %d epochs, %d steps.' % (num_epochs, step)) finally: # When done, ask the threads to stop. coord.request_stop() # Wait for threads to finish. coord.join(threads)
def _add_training_on_rnn(self, logits, grad_clip, learning_rate, lr_decay_factor,
                         sparse_labels, input_seq_lengths, prediction):
    """
    Build the training add-on of the Acoustic RNN

    This add-on offers ops that can be used to train the network:
      * self.learning_rate_decay_op : will decay the learning rate
      * self.acc_mean_loss_op : will compute the loss and accumulate it over multiple mini-batches
      * self.acc_mean_loss_zero_op : will reset the loss accumulator to 0
      * self.acc_error_rate_op : will compute the error rate and accumulate it over multiple mini-batches
      * self.acc_error_rate_zero_op : will reset the error_rate accumulator to 0
      * self.increase_mini_batch_op : will increase the mini-batch counter
      * self.mini_batch_zero_op : will reset the mini-batch counter
      * self.acc_gradients_zero_op : will reset the gradients
      * self.accumulate_gradients_op : will compute the gradients and accumulate them over multiple mini-batches
      * self.train_step_op : will clip the accumulated gradients and apply them on the RNN

    Parameters
    ----------
    :param logits: the output of the RNN before the beam search
    :param grad_clip: max gradient size (prevent exploding gradients)
    :param learning_rate: learning rate parameter fed to optimizer
    :param lr_decay_factor: decay factor of the learning rate
    :param sparse_labels: the labels in a sparse tensor
    :param input_seq_lengths: vector containing the length of each input from 'inputs'
    :param prediction: the predicted label given by the RNN

    Returns
    -------
    :returns: tensorflow variable keeping the current learning rate
    """
    # Define the variable for the learning rate
    learning_rate_var = tf.Variable(float(learning_rate), trainable=False,
                                    name='learning_rate')
    # Define an op to decrease the learning rate
    self.learning_rate_decay_op = learning_rate_var.assign(
        tf.multiply(learning_rate_var, lr_decay_factor))

    # Compute the CTC loss between the logits and the truth for each item of the batch
    with tf.name_scope('CTC'):
        ctc_loss = tf.nn.ctc_loss(sparse_labels, logits, input_seq_lengths,
                                  ignore_longer_outputs_than_inputs=True)
        # Compute the mean loss of the batch (only used to check on progression
        # in learning). Each item's loss is first normalized by the real length
        # of its input sequence, then averaged across the batch.
        mean_loss = tf.reduce_mean(
            tf.truediv(ctc_loss, tf.to_float(input_seq_lengths)))
        # Set an accumulator to sum the loss between mini-batches
        self.accumulated_mean_loss = tf.Variable(0.0, trainable=False)
        self.acc_mean_loss_op = self.accumulated_mean_loss.assign_add(mean_loss)
        self.acc_mean_loss_zero_op = self.accumulated_mean_loss.assign(
            tf.zeros_like(self.accumulated_mean_loss))

    # Compute the error between the logits and the truth
    with tf.name_scope('Error_Rate'):
        error_rate = tf.reduce_mean(
            tf.edit_distance(prediction, sparse_labels, normalize=True))
        # Set an accumulator to sum the error rate between mini-batches
        self.accumulated_error_rate = tf.Variable(0.0, trainable=False)
        self.acc_error_rate_op = self.accumulated_error_rate.assign_add(error_rate)
        self.acc_error_rate_zero_op = self.accumulated_error_rate.assign(
            tf.zeros_like(self.accumulated_error_rate))

    # Count mini-batches
    with tf.name_scope('Mini_batch'):
        # Set an accumulator to count the number of mini-batches in a batch.
        # Note: the variable is defined as float to avoid type conversion
        # errors when using tf.divide
        self.mini_batch = tf.Variable(0.0, trainable=False)
        self.increase_mini_batch_op = self.mini_batch.assign_add(1)
        self.mini_batch_zero_op = self.mini_batch.assign(
            tf.zeros_like(self.mini_batch))

    # Compute the gradients
    trainable_variables = tf.trainable_variables()
    with tf.name_scope('Gradients'):
        opt = tf.train.AdamOptimizer(learning_rate_var)
        gradients = opt.compute_gradients(ctc_loss, trainable_variables)
        # Define a list of variables to store the accumulated gradients between batches
        accumulated_gradients = [tf.Variable(tf.zeros_like(tv.initialized_value()),
                                             trainable=False)
                                 for tv in trainable_variables]
        # Define an op to reset the accumulated gradients
        self.acc_gradients_zero_op = [tv.assign(tf.zeros_like(tv))
                                      for tv in accumulated_gradients]
        # Define an op to add the gradients computed by the current mini-batch to
        # the accumulated-gradients variables
        self.accumulate_gradients_op = [accumulated_gradients[i].assign_add(gv[0])
                                        for i, gv in enumerate(gradients)]
        # Define an op to clip the accumulated gradients and apply them
        clipped_gradients, _norm = tf.clip_by_global_norm(accumulated_gradients, grad_clip)
        self.train_step_op = opt.apply_gradients(
            [(clipped_gradients[i], gv[1]) for i, gv in enumerate(gradients)],
            global_step=self.global_step)
    return learning_rate_var
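Taken together, these ops implement gradient accumulation: gradients from several mini-batches are summed into dedicated variables, then clipped and applied once. A sketch of a driver loop, where `model` is an object on which `_add_training_on_rnn` has been called, and `get_feed_dict()` and `mini_batches_per_batch` are hypothetical names, not from the original codebase:

# Hypothetical driver for the accumulate-then-apply ops defined above.
mini_batches_per_batch = 4

# Reset all accumulators before the "macro" batch
sess.run([model.acc_gradients_zero_op, model.acc_mean_loss_zero_op,
          model.acc_error_rate_zero_op, model.mini_batch_zero_op])
for _ in range(mini_batches_per_batch):
    # Each run accumulates gradients, loss, and error rate for one mini-batch
    sess.run([model.accumulate_gradients_op, model.acc_mean_loss_op,
              model.acc_error_rate_op, model.increase_mini_batch_op],
             feed_dict=get_feed_dict())
# Clip the accumulated gradients and apply them once
sess.run(model.train_step_op)
mean_loss, error_rate, n = sess.run([model.accumulated_mean_loss,
                                     model.accumulated_error_rate,
                                     model.mini_batch])
print('loss %.3f  LER %.3f' % (mean_loss / n, error_rate / n))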
def main(_):
    checkpoint_dir = FLAGS.checkpoint_dir
    with tf.Graph().as_default():
        # deploy_config = model_deploy.DeploymentConfig()
        # Create global_step.
        val_images = tf.placeholder(tf.float32, shape=[1, HEIGHT, WIDTH, 3], name='input_img')
        val_labels = tf.sparse_placeholder(tf.int32, name='input_labels')
        val_width = tf.placeholder(tf.int32, shape=[1], name='input_width')
        # indices = tf.placeholder(tf.int32, [None, 2])
        # values = tf.placeholder(tf.int32, [None])
        # shape = tf.placeholder(tf.int32, [2])
        # val_labels = tf.SparseTensor(indices, values, shape)

        # Build model
        crnn = model.CRNNNet()
        with tf.variable_scope('crnn'):
            val_logits, val_seq_len = crnn.net(val_images, val_width, is_training=False, kp=1.0)
        val_loss = crnn.losses(val_labels, val_logits, val_seq_len)
        # TODO: BK-tree NN search
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(val_logits, perm=[1, 0, 2]), val_seq_len, merge_repeated=False)
        # Mean raw edit distance, and mean edit distance normalized by label length
        acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels,
                                              normalize=False))
        acc_norm = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels))

        # Start evaluation
        with tf.Session(config=config) as sess:
            save = tf.train.Saver(max_to_keep=50)
            assert FLAGS.load  # this script only evaluates from a checkpoint
            if not FLAGS.load:
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                sess.run(init_op)
            else:
                # ckpt_file = 'model.ckpt-' + FLAGS.ckpt_step
                ckpt_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.ckpt_file)
                save.restore(sess, ckpt_path)
                print("Done loading checkpoint")
            # sess.run(tf.local_variables_initializer())

            with open(FLAGS.gt_file, 'r') as f:
                val_loss_s, val_acc_s, val_acc_norm_s = 0, 0, 0
                counter = 0
                hit = 0
                for line in f:
                    if FLAGS.dataset == 'ch4':
                        line = line.replace('\xef\xbb\xbf', '')  # strip UTF-8 BOM
                        line = line.replace('\r\n', '')
                        # parse each line: "<image>, \"<label>\""
                        img_file = line.split(', ')[0]
                        img_label = line.split(', ')[1][1:-1]
                        print(img_file, img_label)
                    if FLAGS.dataset == 'coco':
                        line = line.replace('\r\n', '')
                        line = line.replace('\n', '')
                        img_file = line.split(',')[0] + '.jpg'
                        if len(line) < 10:
                            continue
                        start = line.find(',')
                        img_label = line[start + 1:]
                        print(img_file, img_label)
                    if FLAGS.dataset == 'IC13':
                        line = line.replace('\xef\xbb\xbf', '')
                        line = line.replace('\r\n', '')
                        # parse each line
                        img_file = line.split(', ')[0]
                        img_label = line.split(', ')[1][1:-1]
                        # print(img_file, img_label)

                    img = Image.open(os.path.join(FLAGS.data_dir, img_file))
                    w, h = img.size
                    if w < h:
                        img = img.rotate(-90, expand=True)
                        w, h = img.size
                    # Scale to the model height, cap the width, and pad into a
                    # fixed-size canvas
                    ratio = HEIGHT / float(h)
                    if int(ratio * w) > WIDTH:
                        img = img.resize([WIDTH, HEIGHT])
                        actual_width = [WIDTH]
                    else:
                        img = img.resize([int(ratio * w), HEIGHT])
                        actual_width = [int(ratio * w)]
                    container = Image.new('RGB', (WIDTH, HEIGHT))
                    container.paste(img)
                    img = container
                    img = np.asarray(img, np.float32)
                    # img = img * (1. / 255) - 0.5
                    img /= 255.
                    img = mean_image_subtraction(img, [_R_MEAN, _G_MEAN, _B_MEAN])
                    img = np.expand_dims(img, axis=0)

                    str_label = img_label
                    if FLAGS.case_insensitive:
                        str_label = str_label.lower()
                    img_label = str2code(img_label)
                    if -1 in img_label:  # skip labels with out-of-charset characters
                        continue
                    print(img_file, str_label)

                    # Feed the label as a single-row sparse tensor
                    indices = [(0, i) for i in range(len(img_label))]
                    values = [c for c in img_label]
                    shape = [1, len(img_label)]

                    t1 = time.time()
                    output_label, te_acc, te_acc_norm = sess.run(
                        [decoded, acc, acc_norm],
                        feed_dict={val_images: img,
                                   val_labels: (indices, values, shape),
                                   val_width: actual_width})
                    t2 = time.time()
                    print(t2 - t1)

                    val_loss_s += 0  # per-image loss is not evaluated here
                    val_acc_s += te_acc
                    val_acc_norm_s += te_acc_norm
                    counter += 1

                    output_str = code2str(output_label[0].values)
                    print(img_file, output_str)
                    print(te_acc)
                    if FLAGS.case_insensitive:
                        output_str = output_str.lower()
                    if output_str == str_label:
                        hit += 1
                    print(hit)

                val_loss_s /= counter
                val_acc_s /= counter
                val_acc_norm_s /= counter
                pred_acc = hit / float(counter)
                print(hit, counter)
                print('loss %.3f edit dist %.3f %.3f acc %.3f' %
                      (val_loss_s, val_acc_s, val_acc_norm_s, pred_acc))
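The `str2code`/`code2str` helpers called above are not shown in this excerpt. Judging from their use (`-1` marks an out-of-charset character, and decoded integer ids are mapped back to text), they are plausibly simple charset lookups along these lines; the charset itself and both implementations are assumptions:

# Hypothetical charset helpers matching how str2code/code2str are used above.
CHARSET = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

def str2code(s):
    # str.find() returns -1 for characters missing from the charset
    return [CHARSET.find(c) for c in s]

def code2str(codes):
    return ''.join(CHARSET[int(c)] for c in codes)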
sess = tf.Session()
#----------------------------------
# First compute the edit distance between 'bear' and 'beers'
hypothesis = list('bear')
truth = list('beers')
# dense_shape must cover the index ranges: [batch, 1, max_len]
h1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3]],
                     hypothesis, [1, 1, 4])
t1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4]],
                     truth, [1, 1, 5])
print(sess.run(tf.edit_distance(h1, t1, normalize=False)))

#----------------------------------
# Compute the edit distance between ('bear','beer') and 'beers':
hypothesis2 = list('bearbeer')
truth2 = list('beersbeers')
h2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3]],
                     hypothesis2, [1, 2, 4])
t2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3], [0, 1, 4]],
                     truth2, [1, 2, 5])
print(sess.run(tf.edit_distance(h2, t2, normalize=True)))
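The SparseTensor literals above must be rewritten by hand for every new word pair. A more reusable pattern is to feed `tf.sparse_placeholder`s; the packing helper below is our own sketch, not part of the original snippet:

def create_sparse_vec(word_list):
    """Pack a batch of words into the [batch, 1, max_len] sparse layout used above."""
    indices = [[b, 0, c] for b, word in enumerate(word_list)
               for c in range(len(word))]
    chars = list(''.join(word_list))
    shape = [len(word_list), 1, max(len(word) for word in word_list)]
    return tf.SparseTensorValue(indices, chars, shape)

hyp_ph = tf.sparse_placeholder(dtype=tf.string)
truth_ph = tf.sparse_placeholder(dtype=tf.string)
edit_dist = tf.edit_distance(hyp_ph, truth_ph, normalize=True)
print(sess.run(edit_dist, feed_dict={
    hyp_ph: create_sparse_vec(['bear', 'bear']),
    truth_ph: create_sparse_vec(['beers', 'beer'])}))
# Normalized by truth length: [[0.4], [0.25]]  (2/5 and 1/4)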
W2 = weight_variable([2 * hidden_size, output_size])
b2 = bias_variable([output_size])

# n_batch, n_time_steps, n_features = l_in.input_var.shape
# Unnecessary in this version. Just collecting the info so that we can reshape
# the output back to the original shape.
l_reshape3 = tf.reshape(lstm_output_tr, [-1, 2 * hidden_size])
h_2 = tf.matmul(l_reshape3, W2) + b2
l_reshape4 = tf.reshape(h_2, [-1, output_size])
l_soft = tf.nn.softmax(l_reshape4)
l_soft_reshaped = tf.reshape(l_soft, [-1, n_time_steps, output_size])
l_soft_tr = tf.transpose(l_soft_reshaped, [1, 0, 2])  # time-major for CTC

# NB: tf.nn.ctc_loss takes labels first and expects unscaled logits;
# this snippet feeds softmax activations.
loss = tf.reduce_mean(tf.nn.ctc_loss(targets, l_soft_tr, seqLengths))
optimizer = tf.train.AdamOptimizer(learningRate).minimize(loss)
logitsMaxTest = tf.slice(tf.argmax(l_soft_reshaped, 2), [0, 0], [seqLengths[0], 1])
# The beam-search decoder also expects time-major inputs
predictions = tf.to_int32(ctc.ctc_beam_search_decoder(l_soft_tr, seqLengths)[0][0])
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targets, normalize=False)) / \
    tf.to_float(tf.size(targets.values))

def getminibatch(x, y, bs):
    perm = np.random.permutation(len(x))
    toselect = perm[:bs]
    batch = {}
    batch['x'] = np.array([x[i] for i in toselect])
    batch['ind'], batch['val'], batch['shape'] = \
        target_list_to_sparse_tensor([y[i] for i in toselect])
    batch['seqlen'] = np.zeros([bs])
    batch['seqlen'].fill(776)
    return batch

number_of_batches = 100
batch_size_var = 38
nEpochs = 100
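`target_list_to_sparse_tensor` is called but not defined in this excerpt. A minimal sketch consistent with how `batch['ind']`, `batch['val']`, and `batch['shape']` are consumed (the implementation is an assumption):

import numpy as np

def target_list_to_sparse_tensor(target_list):
    """Convert a list of label sequences into the (indices, values, shape)
    triple expected by a tf.SparseTensor feed."""
    indices, values = [], []
    for row, target in enumerate(target_list):
        for col, val in enumerate(target):
            indices.append([row, col])
            values.append(val)
    shape = [len(target_list), max(len(t) for t in target_list)]
    return (np.array(indices, dtype=np.int64),
            np.array(values, dtype=np.int32),
            np.array(shape, dtype=np.int64))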
def build_net():
    net = {}
    net['x'] = tf.placeholder(tf.float32, shape=[None, 40, 120, 4], name="X")
    net['y'] = tf.sparse_placeholder(tf.int32, name="Y")
    net['len'] = tf.placeholder(tf.int32, shape=[None])

    layer = net['x']
    layer = conv(layer, 32, 5, 2)
    layer = conv(layer, 64, 3, 1)
    layer = conv(layer, 128, 3, 1)
    layer = conv(layer, 128, 3, 2)
    layer = conv(layer, 256, 3, 1)
    layer = conv(layer, 256, 3, 1)
    layer = conv(layer, 512, 3, 2)
    layer = conv(layer, 512, 3, 1)
    layer = conv(layer, 1024, 3, 1)
    # layer = conv(layer, 512, (5, 1), (5, 1))
    logits = layer
    # print(logits.get_shape())

    # (?, 5, 15, 1024) -> (15, ?, 5, 1024)
    logits = tf.transpose(logits, (2, 0, 1, 3))
    # (15, ?, 5, 1024) -> (15 * ?, 5 * 1024)
    logits = tf.reshape(logits, (-1, 5120))
    # (15 * ?, 5120) -> (15 * ?, 512)
    logits = tf.layers.dense(logits, units=512,
                             activation=tf.nn.leaky_relu,
                             use_bias=True,
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                             bias_initializer=tf.constant_initializer(0.01))

    # # (15 * ?, 512) -> (15, ?, 512)
    # logits = tf.reshape(logits, (15, -1, 512))
    # # (15, ?, 512) -> (15, ?, 256)
    # rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in [256]]
    # multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
    # logits, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
    #                                   inputs=logits, dtype=tf.float32, time_major=True)
    # # (15, ?, 256) -> (15 * ?, 256)
    # logits = tf.reshape(logits, (-1, 256))

    # (15 * ?, 512) -> (15 * ?, n_class)
    logits = tf.layers.dense(logits, units=labels_units,
                             activation=None,
                             use_bias=True,
                             kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                             bias_initializer=tf.constant_initializer(0.01))
    # (15 * ?, n_class) -> (15, ?, n_class), time-major for CTC
    logits = tf.reshape(logits, (15, -1, labels_units))

    loss = tf.nn.ctc_loss(labels=net['y'], inputs=logits, sequence_length=net['len'],
                          ignore_longer_outputs_than_inputs=True)
    net['loss'] = tf.reduce_mean(loss)
    net['train_op'] = tf.train.AdamOptimizer(learning_rate=0.000005).minimize(net['loss'])

    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, net['len'], merge_repeated=False)
    net['decoded'] = decoded[0]
    # Despite the name, 'acc' is the mean normalized edit distance (lower is better)
    net['acc'] = tf.reduce_mean(tf.edit_distance(tf.cast(net['decoded'], tf.int32), net['y']))
    return net
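A sketch of driving the returned op dict; `next_batch()` and the step counts are placeholders for whatever input pipeline the original used:

net = build_net()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        # next_batch() is a hypothetical helper yielding images, a sparse
        # (indices, values, shape) label triple, and sequence lengths
        xs, sparse_ys, lens = next_batch()
        _, loss_val, dist = sess.run(
            [net['train_op'], net['loss'], net['acc']],
            feed_dict={net['x']: xs, net['y']: sparse_ys, net['len']: lens})
        if step % 100 == 0:
            print('step %d  ctc loss %.3f  edit distance %.3f' % (step, loss_val, dist))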
outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), axis=1) + biasesOutH1
         for t in fbH1rs]
print("building logits")
logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
print("len(outH1) %d" % len(outH1))

#### Optimizing
print("building loss")
logits3d = tf.stack(logits)
loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
out = tf.identity(loss, 'ctc_loss_mean')
optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

#### Evaluating
print("building evaluation")
logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
reduced_sum = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False))
errorRate = reduced_sum / tf.to_float(tf.size(targetY.values))
check_op = tf.add_check_numerics_ops()
print("done building graph")

#### Run session
with tf.Session(graph=graph) as session:
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter("/tmp/basic_new", session.graph)
    try:
        saver = tf.train.Saver()  # defaults to saving all variables
    except Exception:
        # tf.train.Saver() with no arguments was broken in TensorFlow 0.12
        saver = tf.train.Saver(tf.global_variables())
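Note the normalization choice in `errorRate`: the unnormalized edit-distance sum is divided by the total number of truth labels, giving a per-label error rate, whereas `normalize=True` would average per-sequence rates. A tiny contrast with made-up tensors:

import tensorflow as tf

# Two toy "sequences": hyp = [[1], [1]], truth = [[1], [2, 2]]
hyp = tf.SparseTensor(indices=[[0, 0], [1, 0]], values=[1, 1], dense_shape=[2, 1])
tru = tf.SparseTensor(indices=[[0, 0], [1, 0], [1, 1]], values=[1, 2, 2], dense_shape=[2, 2])
# Per-label, as above: (0 + 2) / 3 truth labels
per_label = (tf.reduce_sum(tf.edit_distance(hyp, tru, normalize=False))
             / tf.to_float(tf.size(tru.values)))
# Per-sequence: mean of (0/1, 2/2)
per_sequence = tf.reduce_mean(tf.edit_distance(hyp, tru, normalize=True))
with tf.Session() as s:
    print(s.run([per_label, per_sequence]))  # [0.6666667, 0.5]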
def __init__(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        with tf.variable_scope('weights'):
            self.weights = {
                'W_conv1': tf.get_variable(
                    'W_conv1', [10, 1, 1, 4],
                    initializer=tf.truncated_normal_initializer(stddev=0.1)),
                'W_conv2': tf.get_variable(
                    'W_conv2', [5, 1, 4, 8],
                    initializer=tf.truncated_normal_initializer(stddev=0.1)),
                'W_conv3': tf.get_variable(
                    'W_conv3', [3, 1, 8, 16],
                    initializer=tf.truncated_normal_initializer(stddev=0.1)),
            }
        with tf.variable_scope('biases'):
            self.biases = {
                'b_conv1': tf.get_variable(
                    'b_conv1', [4],
                    initializer=tf.constant_initializer(0, dtype=tf.float32)),
                'b_conv2': tf.get_variable(
                    'b_conv2', [8],
                    initializer=tf.constant_initializer(0, dtype=tf.float32)),
                'b_conv3': tf.get_variable(
                    'b_conv3', [16],
                    initializer=tf.constant_initializer(0, dtype=tf.float32)),
            }

        # input_x.shape: [batch_size, max_step, fea_dim]
        self.input_x = tf.placeholder(tf.float32, shape=[None, None, config.fea_dim],
                                      name="inputs_x")
        # input_y.shape: [batch_size, max_label_len]
        self.input_y = tf.placeholder(tf.int32, shape=[None, None], name="labels_y")
        # seq_len: [batch_size], number of frames per utterance
        self.seq_len = tf.placeholder(tf.int32, shape=[None], name="feature_len")
        # lab_len: [batch_size], number of labels per utterance
        self.lab_len = tf.placeholder(tf.int32, shape=[None], name="label_len")
        self.batch_size = tf.placeholder(tf.int32, [], name="batch_size")
        # training or testing flag
        self.is_train = tf.placeholder(tf.bool, None)
        self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        self.mu = tf.placeholder(tf.float32, shape=[config.fea_dim], name="mu")
        self.var = tf.placeholder(tf.float32, shape=[config.fea_dim], name="var")

        fea_norm = tf.nn.batch_normalization(self.input_x, self.mu, self.var,
                                             0, 2, 0.001, name="normalize")
        self.input_x_bn = fea_norm

        with tf.name_scope('cnn_net'):
            # x_data.shape: [batch_size, max_step, fea_dim, 1]
            self.x_data = tf.reshape(self.input_x_bn,
                                     [self.batch_size, -1, config.fea_dim, 1])

            # first convolution and pooling
            with tf.name_scope('conv1'):
                print('self.x_data:', self.x_data)
                conv1 = tf.nn.conv2d(self.x_data, self.weights['W_conv1'],
                                     strides=[1, 1, 1, 1], padding='SAME')
                h_conv1 = tf.nn.relu(tf.nn.bias_add(conv1, self.biases['b_conv1']))
                h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 3, 1, 1],
                                         strides=[1, 2, 1, 1], padding='SAME')
                print("h_pool1:", h_pool1)

            # second convolution and pooling
            with tf.name_scope('conv2'):
                conv2 = tf.nn.conv2d(h_pool1, self.weights['W_conv2'],
                                     strides=[1, 1, 1, 1], padding='SAME')
                h_conv2 = tf.nn.relu(tf.nn.bias_add(conv2, self.biases['b_conv2']))
                h_pool2 = tf.nn.max_pool(h_conv2, ksize=[1, 3, 1, 1],
                                         strides=[1, 2, 1, 1], padding='SAME')
                print("h_pool2:", h_pool2)

            # third convolution and pooling
            with tf.name_scope('conv3'):
                conv3 = tf.nn.conv2d(h_pool2, self.weights['W_conv3'],
                                     strides=[1, 1, 1, 1], padding='SAME')
                h_conv3 = tf.nn.relu(tf.nn.bias_add(conv3, self.biases['b_conv3']))
                h_pool3 = tf.nn.max_pool(h_conv3, ksize=[1, 3, 1, 1],
                                         strides=[1, 2, 1, 1], padding='SAME')
                print("h_pool3:", h_pool3)

            self.cnn_result = h_pool3  # [batch_size, frame_nums, fea_dim, 16]
            print("self.cnn_result:", self.cnn_result)
            shape = self.cnn_result.get_shape().as_list()
            print('shape:', shape)
            self.cnn_results = tf.reshape(self.cnn_result,
                                          [self.batch_size, -1, shape[2] * 16])
            print("self.cnn_results:", self.cnn_results)
            # three stride-2 poolings shrink the time axis by a factor of 8
            self.new_seq_len = tf.ceil(tf.to_float(self.seq_len) / 8)
            self.new_seq_len = tf.cast(self.new_seq_len, tf.int32)

        with tf.name_scope('lstm_net'):
            count = -1
            hidden_layer = []
            with tf.name_scope('lstm_layer'):
                for unit_num in config.lstm_hidden_size:
                    count = count + 1
                    with tf.name_scope('lstm_cell_' + str(count)):
                        lstm_cell = tf.contrib.rnn.LSTMCell(unit_num)
                        hidden_layer.append(lstm_cell)
            stack = tf.contrib.rnn.MultiRNNCell(hidden_layer, state_is_tuple=True)
            init_state = stack.zero_state(self.batch_size, dtype=tf.float32)
            outputs, last_states = tf.nn.dynamic_rnn(
                stack, self.cnn_results, self.new_seq_len,
                initial_state=init_state, dtype=tf.float32, time_major=False)
            print('outputs:', outputs)  # [batch_size, frame_nums, lstm_hidden_size[-1]]
            print('last_states:', last_states)
            # h_output = last_states[-1][-1]
            # [batch_size * frame_nums, lstm_hidden_size[-1]]
            h_output = tf.reshape(outputs, [-1, config.lstm_hidden_size[-1]])
            self.h_output = h_output
            print("self.h_output:", self.h_output)

        with tf.name_scope('dense_net'):
            # fully connected layer, f_dense.shape:
            # [batch_size * frame_nums, config.full_connect_layer_unit]
            f_dense = tf.contrib.layers.fully_connected(
                self.h_output, config.full_connect_layer_unit,
                activation_fn=None, scope='full_conn')
            if config.do_batchnorm:
                self.f_dense = tf.contrib.layers.batch_norm(
                    f_dense, decay=0.99, center=True, scale=True,
                    updates_collections=None, is_training=self.is_train, scope='bn')
            else:
                self.f_dense = f_dense
            # logits.shape: [batch_size * frame_nums, config.class_num]
            logits = tf.contrib.layers.fully_connected(
                self.f_dense, config.class_num, activation_fn=None, scope='logits')
            # [batch_size * frame_nums, class_num] -> [batch_size, frame_nums, class_num]
            self.logit = tf.reshape(logits, [self.batch_size, -1, config.class_num])
            # time-major for CTC: [frame_nums, batch_size, class_num]
            logits = tf.transpose(self.logit, (1, 0, 2))
            self.logits = logits
            print("logits:", self.logits)

        with tf.name_scope('accuracy'):
            self.global_step = tf.Variable(0, trainable=False)
            targets = tf.contrib.keras.backend.ctc_label_dense_to_sparse(
                self.input_y, self.lab_len)
            loss = tf.nn.ctc_loss(labels=targets, inputs=self.logits,
                                  sequence_length=self.new_seq_len)
            self.cost = tf.reduce_mean(loss)
            self.optimizer = tf.train.AdamOptimizer(
                config.initial_learning_rate).minimize(self.cost, self.global_step)
            self.decoded, log_prob = tf.nn.ctc_beam_search_decoder(
                self.logits, self.new_seq_len, merge_repeated=False)
            self.decoded_dense = tf.sparse_tensor_to_dense(
                self.decoded[0], default_value=(config.class_num - 1))
            # mean normalized edit distance between the decoded output and the targets
            dis = tf.edit_distance(tf.cast(self.decoded[0], tf.int32), targets)
            self.acc = tf.reduce_mean(dis)

        if config.out_model:
            saver = tf.train.Saver(max_to_keep=30)
            self.saver = saver
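A sketch of one training step against this graph; the wrapper name `Model` and all batch arrays (`batch_x`, `batch_y`, `feat_mean`, ...) are assumed for illustration:

model = Model()  # hypothetical wrapper class around the __init__ above
with tf.Session(graph=model.graph) as sess:
    sess.run(tf.global_variables_initializer())
    _, cost, dist = sess.run(
        [model.optimizer, model.cost, model.acc],
        feed_dict={model.input_x: batch_x,        # [batch, max_step, fea_dim]
                   model.input_y: batch_y,        # dense labels, padded
                   model.seq_len: batch_seq_len,  # frames per utterance
                   model.lab_len: batch_lab_len,  # labels per utterance
                   model.batch_size: batch_x.shape[0],
                   model.is_train: True,
                   model.keep_prob: 0.8,
                   model.mu: feat_mean, model.var: feat_var})
    print('cost %.3f  edit distance %.3f' % (cost, dist))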
# Now we can perform address matching
# Create graph
sess = tf.Session()

# Placeholders
test_address = tf.sparse_placeholder(dtype=tf.string)
test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)

# Declare zip code distance for a test zip and reference set
zip_dist = tf.square(tf.subtract(ref_zip, test_zip))

# Declare edit distance for address
address_dist = tf.edit_distance(test_address, ref_address, normalize=True)

# Create similarity scores
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))
zip_sim = tf.div(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))
address_sim = tf.subtract(1., address_dist)

# Combine distance functions
address_weight = 0.5
zip_weight = 1. - address_weight
weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)),
                      tf.multiply(zip_weight, zip_sim))

# Predict: get the entry with maximum similarity
top_match_index = tf.argmax(weighted_sim, 1)
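To run the matcher, the string addresses have to be fed as sparse character tensors, with the test address repeated once per reference entry. A minimal sketch; the helper and the data are made up for illustration, and `n` is assumed to be 2 here:

import numpy as np

def sparse_from_word_vec(word_vec):
    """Pack one address per batch entry into a [n, 1, max_len] sparse value."""
    indices = [[i, 0, j] for i, word in enumerate(word_vec) for j in range(len(word))]
    chars = list(''.join(word_vec))
    shape = [len(word_vec), 1, max(len(word) for word in word_vec)]
    return tf.SparseTensorValue(indices, chars, shape)

reference_addresses = ['102 elm street', '657 maple ave']  # made-up references
feed = {test_address: sparse_from_word_vec(['elm street 102'] * 2),
        test_zip: [[65432]],
        ref_address: sparse_from_word_vec(reference_addresses),
        ref_zip: np.array([[65432, 98765]])}
best = sess.run(top_match_index, feed_dict=feed)
print(reference_addresses[best[0]])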
h1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3]],
                     hypothesis, [1, 1, 4])
t1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4]],
                     truth, [1, 1, 5])
# print(sess.run(h1))
# print(sess.run(t1))
# Both inputs to tf.edit_distance are sparse tensors.
"""This operation takes variable-length sequences (`hypothesis` and `truth`),
each provided as a `SparseTensor`, and computes the Levenshtein distance.
You can normalize the edit distance by length of `truth` by setting
`normalize` to true."""
print(sess.run(tf.edit_distance(h1, t1, normalize=False)))

#----------------------------------
# Compute the edit distance between ('bear','beer') and 'beers':
hypothesis2 = list('bearbeer')
truth2 = list('beersbeers')
h2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3]],
                     hypothesis2, [1, 2, 4])
t2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3], [0, 1, 4]],
                     truth2, [1, 2, 5])
# normalize: boolean; if True, the Levenshtein distance is divided by the
# length of the truth sequence. Defaults to True.
# 'bear' vs 'beers': edit distance 2, truth length 5; normalized 2/5 = 0.4
# 'beer' vs 'beers': edit distance 1, truth length 5; normalized 1/5 = 0.2
print(sess.run(tf.edit_distance(h2, t2, normalize=True)))