def test_time_major(self):
    """Tests the time_major param.

    Checks that transposing the inputs and setting time_major=False
    yields the same loss as the default time-major layout.
    """
    # [max_time x batch_size x depth] tensor
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)

    inputs_t = constant_op.constant(inputs)
    # Transpose to a [batch_size x max_time x depth] tensor.
    inputs_t_transposed = constant_op.constant(inputs.transpose(1, 0, 2))

    with self.test_session(use_gpu=False) as sess:
        loss = ctc_ops.ctc_loss(
            inputs=inputs_t, labels=labels, sequence_length=seq_lens)
        loss_transposed = ctc_ops.ctc_loss(
            inputs=inputs_t_transposed,
            labels=labels,
            sequence_length=seq_lens,
            time_major=False)
        (tf_loss, tf_loss_transposed) = sess.run([loss, loss_transposed])
        self.assertAllEqual(tf_loss, tf_loss_transposed)
def define_logit_and_ctc(output_combined, targetY, seqLengths, nHiddenOutput,
                         nClass):
    W = tf.Variable(
        tf.truncated_normal([nHiddenOutput, nClass],
                            stddev=np.sqrt(2.0 / nHiddenOutput)))
    # Zero initialization. Tip: tf.zeros_initializer
    b = tf.Variable(tf.zeros([nClass]))

    batch_size = tf.shape(output_combined)[0]
    max_time = tf.shape(output_combined)[1]
    output_combined_reshape = tf.reshape(output_combined, [-1, nHiddenOutput])
    # Doing the affine projection
    logits = tf.matmul(output_combined_reshape, W) + b
    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_size, max_time, nClass])
    # Time major; this is convenient for the edit distance.
    logits = tf.transpose(logits, (1, 0, 2))

    loss_individual = ctc.ctc_loss(logits, targetY, seqLengths)
    loss_overall = tf.reduce_mean(loss_individual)

    # Decode with beam search.
    predictions = tf.to_int32(
        ctc.ctc_beam_search_decoder(logits, seqLengths)[0][0])
    errorRate_raw = tf.reduce_sum(
        tf.edit_distance(predictions, targetY, normalize=False))
    z_count_this = tf.size(targetY.values)
    errorRate_this_batch = errorRate_raw / tf.to_float(z_count_this)
    return ((loss_overall, loss_individual),
            (errorRate_raw, z_count_this, errorRate_this_batch),
            (logits, predictions))
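# A minimal usage sketch for define_logit_and_ctc above. The shapes, names,
# and placeholder wiring here are illustrative assumptions, not from the
# source; output_combined is assumed batch-major [batch, max_time, hidden]
# and targetY a SparseTensor of int32 labels, as the function body implies.
nHiddenOutput = 128   # assumed hidden size
nClass = 30           # assumed number of classes, incl. the CTC blank
output_combined = tf.placeholder(tf.float32, [None, None, nHiddenOutput])
targetY = tf.sparse_placeholder(tf.int32)
seqLengths = tf.placeholder(tf.int32, [None])

losses, error_stats, outputs = define_logit_and_ctc(
    output_combined, targetY, seqLengths, nHiddenOutput, nClass)
loss_overall, loss_individual = losses
errorRate_raw, z_count, errorRate_batch = error_stats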
def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
    random_seed.set_random_seed(5)

    batch_size = 8
    num_labels = 6
    label_length = 5
    num_frames = 12
    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
    labels = random_ops.random_uniform([batch_size, label_length],
                                       minval=0,
                                       maxval=num_labels - 1,
                                       dtype=dtypes.int64)

    label_lengths = random_ops.random_uniform([batch_size],
                                              minval=2,
                                              maxval=label_length,
                                              dtype=dtypes.int64)
    label_mask = array_ops.sequence_mask(label_lengths,
                                         maxlen=label_length,
                                         dtype=label_lengths.dtype)
    labels *= label_mask

    logit_lengths = [num_frames] * batch_size

    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
        tf_ctc_loss_labels, label_lengths)

    tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                      inputs=logits,
                                      sequence_length=logit_lengths,
                                      time_major=True)
    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

    # Shift the blank logits/labels to be somewhere in the middle.
    blank_index = 2
    shifted_logits = array_ops.concat([
        logits[:, :, :blank_index],
        logits[:, :, -1:],
        logits[:, :, blank_index:-1],
    ], axis=2)
    shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)

    ctc_loss = ctc_ops.ctc_loss_dense(labels=shifted_labels,
                                      logits=shifted_logits,
                                      label_length=label_lengths,
                                      logit_length=logit_lengths,
                                      blank_index=blank_index)
    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

    with self.cached_session() as sess:
        for _ in range(32):
            self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
            self.assertAllClose(*self.evaluate(
                [ctc_loss_grads, tf_nn_ctc_grads]),
                                rtol=2e-06,
                                atol=2e-06)
def loss(self):
    """Defines the loss, optimizer, decoder, and accuracy ops."""
    # CTC loss
    with tf.name_scope('loss'):
        self.avg_loss = tf.reduce_mean(
            ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
        tf.summary.scalar('loss', self.avg_loss)
    # [optimizer]
    with tf.name_scope('train'):
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.hyparam.learning_rate).minimize(self.avg_loss)
    with tf.name_scope("decode"):
        self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            self.logits, self.seq_length, merge_repeated=False)
    with tf.name_scope("ctc_beam_search_decode"):
        self.prob = tf.nn.softmax(self.logits, dim=0)
        # Keep the same layout as the decoder: [batch_size, time_step, n_character].
        self.prob = tf.transpose(self.prob, [1, 0, 2])
        self.decoder = LM_decoder(self.hyparam.alpha, self.hyparam.beta,
                                  self.hyparam.lang_model_path, self.words)
    with tf.name_scope("accuracy"):
        self.distance = tf.edit_distance(
            tf.cast(self.decoded[0], tf.int32), self.text)
        # Compute the label error rate (accuracy).
        self.label_err = tf.reduce_mean(self.distance,
                                        name='label_error_rate')
        tf.summary.scalar('accuracy', self.label_err)
def _testCTCLoss(self,
                 inputs,
                 seq_lens,
                 labels,
                 loss_truth,
                 grad_truth,
                 expected_err_re=None):
    self.assertEquals(len(inputs), len(grad_truth))

    inputs_t = constant_op.constant(inputs)

    with self.test_session(use_gpu=False) as sess:
        loss = ctc_ops.ctc_loss(
            inputs=inputs_t, labels=labels, sequence_length=seq_lens)
        grad = gradients_impl.gradients(loss, [inputs_t])[0]

        self.assertShapeEqual(loss_truth, loss)
        self.assertShapeEqual(grad_truth, grad)

        if expected_err_re is None:
            (tf_loss, tf_grad) = sess.run([loss, grad])
            self.assertAllClose(tf_loss, loss_truth, atol=1e-6)
            self.assertAllClose(tf_grad, grad_truth, atol=1e-6)
        else:
            with self.assertRaisesOpError(expected_err_re):
                sess.run([loss, grad])
def ctc_batch_cost(self, y_true, y_pred, input_length, label_length):
    """Runs CTC loss algorithm on each batch element.

    # Arguments
        y_true: tensor `(samples, max_string_length)`
            containing the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_pred`.
        label_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_true`.

    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    label_length = tf.to_int32(tf.squeeze(label_length, axis=-1))
    input_length = tf.to_int32(tf.squeeze(input_length, axis=-1))
    sparse_labels = tf.to_int32(
        K.ctc_label_dense_to_sparse(y_true, label_length))

    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)
    # Note: ignore_longer_outputs_than_inputs=True skips samples whose labels
    # cannot be aligned to the inputs; otherwise the loss becomes NaN for the
    # rest of the batch.
    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length,
                     ignore_longer_outputs_than_inputs=True), 1)
def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """FROM KERAS - MODIFIED FOR BATCH SIZE OF ONE.

    Runs CTC loss algorithm on each batch element.

    # Arguments
        y_true: tensor `(samples, max_string_length)`
            containing the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_pred`.
        label_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_true`.

    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    label_length = tf.to_int32(tf.squeeze(label_length, axis=1))
    input_length = tf.to_int32(tf.squeeze(input_length, axis=1))
    sparse_labels = tf.to_int32(
        K.ctc_label_dense_to_sparse(y_true, label_length))

    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon())

    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length), 1)
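# Hedged usage sketch for ctc_batch_cost above (TF1 graph mode). The dummy
# shapes and data are assumptions for illustration; the snippet's own imports
# (tf, K = keras.backend, ctc = TF's ctc ops) are reused as-is.
import numpy as np

batch_size, time_steps, num_categories, max_len = 1, 8, 5, 4
y_pred_ph = tf.placeholder(tf.float32, [None, time_steps, num_categories])
y_true_ph = tf.placeholder(tf.int32, [None, max_len])
input_len_ph = tf.placeholder(tf.int32, [None, 1])
label_len_ph = tf.placeholder(tf.int32, [None, 1])

loss = ctc_batch_cost(y_true_ph, y_pred_ph, input_len_ph, label_len_ph)

with tf.Session() as sess:
    # Random softmax-normalized predictions; the last class index is the blank.
    raw = np.random.rand(batch_size, time_steps, num_categories).astype(np.float32)
    probs = raw / raw.sum(axis=-1, keepdims=True)
    print(sess.run(loss, feed_dict={
        y_pred_ph: probs,
        y_true_ph: np.array([[0, 1, 2, 1]], dtype=np.int32),
        input_len_ph: np.array([[time_steps]], dtype=np.int32),
        label_len_ph: np.array([[max_len]], dtype=np.int32),
    }))  # -> shape (1, 1)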
def build_graph(self, args, maxTimeSteps):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.inputX = tf.placeholder(
            tf.float32,
            shape=(maxTimeSteps, args.batch_size, args.num_feature))  # [maxL, 32, 39]
        inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
        # Split [32*maxL, 39] into a list of maxL tensors of shape [32, 39].
        self.inputList = tf.split(inputXrs, maxTimeSteps, 0)
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                       self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
        depth = 10
        width = 8
        self.config = {
            'name': 'residual network',
            'num_layer': depth,
            'num_featuremap': width,
            'num_class': args.num_class,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate
        }
        inpt = tf.reshape(
            self.inputX, [args.batch_size, maxTimeSteps, args.num_feature, 1])
        conv_output = build_resnet(inpt, maxTimeSteps, depth, width,
                                   args.num_class)
        self.loss = tf.reduce_mean(
            ctc.ctc_loss(self.targetY, conv_output, self.seqLengths))
        self.optimizer = args.optimizer(args.learning_rate).minimize(self.loss)
        self.logitsMaxTest = tf.slice(
            tf.argmax(conv_output, 2), [0, 0], [self.seqLengths[0], 1])
        self.predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(conv_output, self.seqLengths)[0][0])
        self.errorRate = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targetY,
                             normalize=False)) / tf.to_float(
                                 tf.size(self.targetY.values))
        self.initial_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=2,
                                    keep_checkpoint_every_n_hours=1)
        self.logfile = args.log_dir + str(
            datetime.datetime.strftime(datetime.datetime.now(),
                                       '%Y-%m-%d %H:%M:%S') +
            '.txt').replace(' ', '').replace('/', '')
        self.var_op = tf.global_variables()
        self.var_trainable_op = tf.trainable_variables()
def loss(self):
    """Defines the loss."""
    # CTC loss
    with tf.name_scope('loss'):
        self.avg_loss = tf.reduce_mean(
            ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
        tf.summary.scalar('loss', self.avg_loss)
    # [optimizer]
    with tf.name_scope('train'):  # training step
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(self.avg_loss)
    with tf.name_scope("decode"):
        self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            self.logits, self.seq_length, merge_repeated=False)
    with tf.name_scope("accuracy"):
        self.distance = tf.edit_distance(
            tf.cast(self.decoded[0], tf.int32), self.text)
        # Compute the label error rate (accuracy).
        self.label_err = tf.reduce_mean(self.distance,
                                        name='label_error_rate')
        tf.summary.scalar('accuracy', self.label_err)
def my_ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """Runs CTC loss algorithm on each batch element.

    Arguments:
        y_true: tensor `(samples, max_string_length)` containing the truth
            labels.
        y_pred: tensor `(samples, time_steps, num_categories)` containing the
            prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_pred`.
        label_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_true`.

    Returns:
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    label_length = math_ops.to_int32(array_ops.squeeze(label_length, axis=-1))
    input_length = math_ops.to_int32(array_ops.squeeze(input_length, axis=-1))
    sparse_labels = math_ops.to_int32(
        my_ctc_label_dense_to_sparse(y_true, label_length))

    y_pred = math_ops.log(
        array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())

    return array_ops.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length,
                     ctc_merge_repeated=True), 1)
def testCtcLossDenseIsSameAsCtcLoss(self):
    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
        random_seed.set_random_seed(5)

        batch_size = 8
        num_labels = 6
        label_length = 5
        minimum_logits_length = 10
        num_frames = minimum_logits_length + batch_size
        logits = random_ops.random_uniform(
            [num_frames, batch_size, num_labels])
        labels = random_ops.random_uniform([batch_size, label_length],
                                           minval=1,
                                           maxval=num_labels,
                                           dtype=dtypes.int64)

        label_lengths = random_ops.random_uniform([batch_size],
                                                  minval=2,
                                                  maxval=label_length,
                                                  dtype=dtypes.int64)
        label_mask = array_ops.sequence_mask(label_lengths,
                                             maxlen=label_length,
                                             dtype=label_lengths.dtype)
        labels *= label_mask

        logit_lengths = math_ops.range(batch_size) + minimum_logits_length

        ctc_loss = ctc_ops.ctc_loss_dense(labels=labels,
                                          logits=logits,
                                          label_length=label_lengths,
                                          logit_length=logit_lengths)
        ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

        # Shift labels down by one (move blank from 0 to num_labels - 1).
        tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
        tf_nn_ctc_logits = array_ops.concat([
            logits[:, :, 1:],
            logits[:, :, 0:1],
        ], axis=2)

        tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
            tf_ctc_loss_labels, label_lengths)

        tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                          inputs=tf_nn_ctc_logits,
                                          sequence_length=logit_lengths,
                                          time_major=True)
        tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss,
                                                   [logits])[0]

        with self.cached_session() as sess:
            for _ in range(32):
                self.assertAllClose(
                    *self.evaluate([ctc_loss, tf_nn_ctc_loss]))
                self.assertAllClose(*self.evaluate(
                    [ctc_loss_grads, tf_nn_ctc_grads]),
                                    rtol=4e-06,
                                    atol=4e-06)
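# Hedged numpy illustration of the shift performed above: tf.nn.ctc_loss
# reserves blank = num_labels - 1, while ctc_loss_dense defaults to
# blank_index = 0, so the dense labels move down by one and the blank logit
# column moves from the front to the back. The values here are made up.
import numpy as np

num_labels = 6
dense_labels = np.array([[1, 2, 5],
                         [3, 1, 4]])                 # blank convention: index 0
shifted_labels = dense_labels - 1                    # blank convention: index num_labels - 1
logits = np.random.randn(12, 2, num_labels)          # [frames, batch, labels]
shifted_logits = np.concatenate([logits[:, :, 1:],   # real classes first
                                 logits[:, :, :1]],  # blank column last
                                axis=2)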
def ctc_loss(self, outputs, targets, seq_len, num_classes,
             initial_learning_rate, keep_prob=0.8, scopeN="l1-ctc_loss"):
    """Implements the CTC loss.

    @param outputs: [batch, h, w, channels]
    @param targets: sparse tensor
    @param seq_len: the lengths of the input sequences [batch]
    @param num_classes: the number of classes
    @param initial_learning_rate: learning rate
    @param keep_prob: if not None, apply dropout with this keep rate
    @param scopeN: the scope name

    @returns: list with [optimizer, cost, inaccuracy (label error rate),
        decoded output of the batch]
    """
    with tf.name_scope('Train'):
        with tf.variable_scope("ctc_loss-" + scopeN) as scope:
            W = tf.Variable(
                tf.truncated_normal([self.hidden * 2, num_classes],
                                    stddev=0.1))
            # Zero initialization
            b = tf.Variable(tf.constant(0., shape=[num_classes]))

        tf.summary.histogram('histogram-b-ctc', b)
        tf.summary.histogram('histogram-w-ctc', W)

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b
        if keep_prob is not None:
            logits = tf.nn.dropout(logits, keep_prob)

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [self.width, self.batch_size, num_classes])
        # logits = tf.transpose(logits, [1, 0, 2])

        with tf.name_scope('CTC-loss'):
            loss = ctc_ops.ctc_loss(logits, targets, seq_len)
            cost = tf.reduce_mean(loss)
        with tf.name_scope('Optimizer'):
            if self.optimizer == "ADAM":
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=initial_learning_rate,
                    name="AdamOptimizer").minimize(cost)
            elif self.optimizer == "RMSP":
                optimizer = tf.train.RMSPropOptimizer(
                    learning_rate=initial_learning_rate,
                    decay=self.decay,
                    momentum=self.momentum).minimize(cost)
            else:
                raise Exception(
                    "model type not supported: {}".format(self.optimizer))
        with tf.name_scope('Prediction'):
            if self.ctc_decoder == 'greedy':
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)
            elif self.ctc_decoder == 'beam_search':
                decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
                    logits, seq_len)
            else:
                raise Exception(
                    "model type not supported: {}".format(self.ctc_decoder))

            # Inaccuracy: label error rate
            ler = tf.reduce_mean(
                tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))
    return optimizer, cost, ler, decoded
def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
    random_seed.set_random_seed(5)

    batch_size = 8
    num_labels = 6
    label_length = 5
    num_frames = 12
    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
    labels = random_ops.random_uniform([batch_size, label_length],
                                       minval=1,
                                       maxval=num_labels,
                                       dtype=dtypes.int64)

    label_lengths = random_ops.random_uniform([batch_size],
                                              minval=2,
                                              maxval=label_length,
                                              dtype=dtypes.int64)
    label_mask = array_ops.sequence_mask(label_lengths,
                                         maxlen=label_length,
                                         dtype=label_lengths.dtype)
    labels *= label_mask

    logit_lengths = [num_frames] * batch_size

    ctc_loss = ctc_ops.ctc_loss_dense(
        labels=labels,
        logits=logits,
        label_length=label_lengths,
        logit_length=logit_lengths,
        unique=ctc_ops.ctc_unique_labels(labels))
    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

    # Shift labels down by one (move blank from 0 to num_labels - 1).
    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
    tf_nn_ctc_logits = array_ops.concat([
        logits[:, :, 1:],
        logits[:, :, 0:1],
    ], axis=2)

    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
        tf_ctc_loss_labels, label_lengths)

    tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                      inputs=tf_nn_ctc_logits,
                                      sequence_length=logit_lengths,
                                      time_major=True)
    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

    with self.cached_session() as sess:
        for _ in range(32):
            self.assertAllClose(*sess.run([ctc_loss, tf_nn_ctc_loss]))
            self.assertAllClose(*sess.run([ctc_loss_grads, tf_nn_ctc_grads]),
                                rtol=2e-06,
                                atol=2e-06)
def test_one_wav(wav_path, label_text):
    tf.reset_default_graph()
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # ctc_loss requires a SparseTensor, created with sparse_placeholder.
    targets = tf.sparse_placeholder(tf.int32, name='targets')  # text labels
    keep_dropout = tf.placeholder(tf.float32)
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')  # sequence lengths
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    logits = inference(input_tensor, words_size + 1, False, keep_dropout,
                       regularizer, tf.to_int64(seq_length))

    avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(
        targets, logits, seq_length)) + tf.add_n(tf.get_collection('losses'))
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)

    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            logits, seq_length, merge_repeated=False)
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy).
        ler = tf.reduce_mean(distance, name='label_error_rate')

    choose_cpkt = "BiRNN.cpkt-117"
    saver = tf.train.Saver(max_to_keep=5)
    re1 = ""
    re2 = ""
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, savedir + choose_cpkt)
        _, source, source_lengths, sparse_labels = next_batch(
            labels=label_text, wav_files=wav_path)
        feed = {
            input_tensor: source,
            targets: sparse_labels,
            seq_length: source_lengths,
            keep_dropout: 1.0
        }
        d, test_ler, batch_cost, _ = sess.run(
            [decoded[0], ler, avg_loss, optimizer], feed_dict=feed)
        dense_decoded = tf.sparse_tensor_to_dense(
            d, default_value=-1).eval(session=sess)
        dense_labels = base.sparse_tuple_to_texts_ch(sparse_labels, words)
        print('Label err rate: ', test_ler)
        for orig, decoded_arr in zip(dense_labels, dense_decoded):
            # convert to strings
            decoded_str = base.ndarray_to_text_ch(decoded_arr, words)
            decoded_str = decoded_str.strip().strip('龚')
            re1 = orig
            re2 = decoded_str
            print('Original: {}'.format(orig))
            print('Decoded: {}'.format(decoded_str))
    return re1, re2, test_ler
def setup_loss_function(self):
    with tf.name_scope("loss"):
        self.total_loss = ctc_ops.ctc_loss(
            self.targets, self.logits, self.seq_length,
            ignore_longer_outputs_than_inputs=True)
        self.avg_loss = tf.reduce_mean(self.total_loss)
        self.loss_summary = tf.summary.scalar("avg_loss", self.avg_loss)
        self.cost_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
        self.train_cost_op = tf.summary.scalar("train_avg_loss",
                                               self.cost_placeholder)
def testEmptyBatch(self):
    inputs = constant_op.constant([], dtype=dtypes.float32, shape=(1, 0, 2))
    sequence_lengths = constant_op.constant([], dtype=dtypes.int32)
    labels = sparse_tensor.SparseTensor(
        indices=constant_op.constant([], shape=(0, 2), dtype=dtypes.int64),
        values=constant_op.constant([], shape=(0,), dtype=dtypes.int32),
        dense_shape=[5, 5])

    with self.test_session(use_gpu=False) as sess:
        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                     "batch_size must not be 0"):
            sess.run(ctc_ops.ctc_loss(labels, inputs, sequence_lengths))
def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
        random_seed.set_random_seed(5)

        batch_size = 8
        num_labels = 6
        label_length = 5
        num_frames = 12
        logits = random_ops.random_uniform(
            [num_frames, batch_size, num_labels])
        labels = random_ops.random_uniform([batch_size, label_length],
                                           minval=0,
                                           maxval=num_labels - 1,
                                           dtype=dtypes.int64)

        label_lengths = random_ops.random_uniform([batch_size],
                                                  minval=2,
                                                  maxval=label_length,
                                                  dtype=dtypes.int64)
        label_mask = array_ops.sequence_mask(label_lengths,
                                             maxlen=label_length,
                                             dtype=label_lengths.dtype)
        labels *= label_mask

        logit_lengths = [num_frames] * batch_size

        ctc_loss = ctc_ops.ctc_loss_dense(labels=labels,
                                          logits=logits,
                                          label_length=label_lengths,
                                          logit_length=logit_lengths,
                                          blank_index=-1)
        ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

        tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
        tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
            tf_ctc_loss_labels, label_lengths)

        tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                          inputs=logits,
                                          sequence_length=logit_lengths,
                                          time_major=True)
        tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss,
                                                   [logits])[0]

        with self.cached_session() as sess:
            for _ in range(32):
                self.assertAllClose(
                    *self.evaluate([ctc_loss, tf_nn_ctc_loss]))
                self.assertAllClose(*self.evaluate(
                    [ctc_loss_grads, tf_nn_ctc_grads]),
                                    rtol=2e-06,
                                    atol=2e-06)
def ctc_lambda_func(self, args):
    y_pred, y_true, input_length, label_length = args
    label_length = math_ops.to_int32(array_ops.squeeze(label_length))
    input_length = math_ops.to_int32(array_ops.squeeze(input_length))
    sparse_labels = math_ops.to_int32(
        ctc_label_dense_to_sparse(y_true, label_length))
    y_pred = math_ops.log(
        array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)
    return array_ops.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length,
                     ignore_longer_outputs_than_inputs=True), 1)
def _ctc_loss_with_beam_search(logits, sparse_labels, seq_length, top_path=1,
                               merge_repeated=False):
    ctc_loss = math_ops.reduce_mean(
        ctc_ops.ctc_loss(sparse_labels, logits, seq_length))
    pre_label_tensors, log_prob = tf.nn.ctc_beam_search_decoder(
        logits, seq_length, merge_repeated=merge_repeated, top_paths=top_path)
    top1_label_tensor = math_ops.cast(pre_label_tensors[0], dtypes.int32)
    top1_ed = math_ops.reduce_mean(
        array_ops.edit_distance(top1_label_tensor, sparse_labels))
    return ctc_loss, top1_ed, pre_label_tensors, log_prob
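# Hedged usage sketch for _ctc_loss_with_beam_search (shapes are assumptions).
# It expects time-major logits, a SparseTensor of int32 labels, and per-example
# sequence lengths, mirroring tf.nn.ctc_loss's sparse interface.
num_frames, batch_size, num_classes = 20, 4, 10
logits_ph = tf.placeholder(tf.float32, [num_frames, batch_size, num_classes])
labels_ph = tf.sparse_placeholder(tf.int32)
lengths_ph = tf.placeholder(tf.int32, [batch_size])

loss, top1_edit_distance, decoded_paths, log_prob = _ctc_loss_with_beam_search(
    logits_ph, labels_ph, lengths_ph, top_path=1, merge_repeated=False)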
def ctc_batch(y_true, y_pred, input_length, label_length):
    label_length = tf.to_int32(tf.squeeze(label_length))
    input_length = tf.to_int32(tf.squeeze(input_length))
    sparse_labels = tf.to_int32(
        K.ctc_label_dense_to_sparse(y_true, label_length))
    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length,
                     ignore_longer_outputs_than_inputs=True), 1)
def testInvalidSecondGradient(self):
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    inputs_t = constant_op.constant(inputs)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)
    v = [1.0]

    with self.test_session(use_gpu=False):
        loss = ctc_ops.ctc_loss(inputs=inputs_t,
                                labels=labels,
                                sequence_length=seq_lens)
        # Taking the second gradient should fail, since it is not
        # yet supported.
        with self.assertRaisesRegexp(LookupError, "explicitly disabled"):
            _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
def backward(self):
    self.loss = ctc.ctc_loss(labels=self.y_,
                             inputs=self.logits,
                             sequence_length=self.seq_len)
    self.cost = tf.reduce_mean(self.loss)
    # Minimize the mean loss (the original minimized the unreduced per-example
    # loss tensor, which optimizes the sum instead of the mean).
    self.opt = tf.train.AdamOptimizer(0.05).minimize(self.cost)
    self.decoded, self.log_prob = \
        tf.nn.ctc_beam_search_decoder(self.logits,
                                      self.seq_len,
                                      merge_repeated=False)
    # Convert the sparse tensor to a dense one; sequences shorter than the
    # max length are padded with -1.
    self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0],
                                                   default_value=-1)
    self.acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(self.decoded[0], tf.int32), self.y_))
def build_graph(self, args, maxTimeSteps):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.inputX = tf.placeholder(
            tf.float32,
            shape=(maxTimeSteps, args.batch_size, args.num_feature))  # [maxL, 32, 39]
        self.inputXX = tf.reshape(
            self.inputX,
            shape=(args.batch_size, maxTimeSteps, args.num_feature))
        inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
        # self.inputList = tf.split(0, maxTimeSteps, inputXrs)  # convert inputXrs from [32*maxL, 39] to [32, maxL, 39]
        # self.inputnew = tf.reshape(self.inputX, [1, 0, 2])
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                       self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
        self.config = {
            'name': args.model,
            'rnncell': self.cell_fn,
            'num_layer': args.num_layer,
            'num_hidden': args.num_hidden,
            'num_class': args.num_class,
            'activation': args.activation,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate
        }

        # forward layer
        forwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
        # backward layer
        backwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
        # bi-directional layer
        fbH1, state = bidirectional_dynamic_rnn(forwardH1,
                                                backwardH1,
                                                self.inputXX,
                                                sequence_length=self.seqLengths,
                                                dtype=tf.float32,
                                                scope='BDRNN_H1')
        fbH1 = tf.concat(2, fbH1)
        print(fbH1.get_shape())
        shape = fbH1.get_shape().as_list()
        fbH1 = tf.reshape(fbH1, [shape[0] * shape[1], -1])  # seq*batch, feature
        fbH1_list = tf.split(0, shape[1], fbH1)
        logits = [
            build_forward_layer(t, [shape[2], args.num_class], kernel='linear')
            for t in fbH1_list
        ]
        logits3d = tf.pack(logits)
        self.loss = tf.reduce_mean(
            ctc.ctc_loss(logits3d, self.targetY, self.seqLengths))
        self.optimizer = tf.train.AdamOptimizer(
            args.learning_rate).minimize(self.loss)
        self.logitsMaxTest = tf.slice(
            tf.argmax(logits3d, 2), [0, 0], [self.seqLengths[0], 1])
        self.predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
        self.errorRate = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targetY,
                             normalize=False)) / tf.to_float(
                                 tf.size(self.targetY.values))
        self.initial_op = tf.initialize_all_variables()
        self.saver = tf.train.Saver(tf.all_variables(),
                                    max_to_keep=5,
                                    keep_checkpoint_every_n_hours=1)
        self.logfile = args.log_dir + str(
            datetime.datetime.strftime(datetime.datetime.now(),
                                       '%Y-%m-%d %H:%M:%S') +
            '.txt').replace(' ', '').replace('/', '')
        self.var_op = tf.all_variables()
        self.var_trainable_op = tf.trainable_variables()
def ctc_batch_cost(y_true, y_pred, input_length):
    """Runs CTC loss algorithm on each batch element.

    # Arguments
        y_true: tensor `(samples, max_string_length)`
            containing the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_pred`; e.g. `[time_step] * batch_size`, or
            `np.zeros([batch_size, 1])` with `input_length[i] = time_step`,
            where `time_step = img_w // pool_size`.

    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    input_length = tf.to_int32(tf.squeeze(input_length))
    sparse_labels = tf.to_int32(ctc_label_dense_to_sparse(y_true))
    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length), 1)
# see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
W = tf.Variable(tf.truncated_normal([num_hidden, num_classes], stddev=0.1))
# Zero initialization
# Tip: Is tf.zeros_initializer the same?
b = tf.Variable(tf.constant(0., shape=[num_classes]))

# Doing the affine projection
logits = tf.matmul(outputs, W) + b

# Reshaping back to the original shape
logits = tf.reshape(logits, [batch_s, -1, num_classes])

# Time major
logits = tf.transpose(logits, (1, 0, 2))

loss = ctc_ops.ctc_loss(targets, logits, seq_len)
cost = tf.reduce_mean(loss)

optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                       0.9).minimize(cost)

# Option 2: tf.contrib.ctc.ctc_beam_search_decoder
# (it's slower but you'll get better results)
decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)

# Inaccuracy: label error rate
ler = tf.reduce_mean(
    tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

saver = tf.train.Saver()
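# Hedged helper sketch: several snippets here feed a SparseTensor of targets
# (targets / targetY) built from Python label lists. This is one conventional
# way to build that (indices, values, dense_shape) triple; the helper name is
# illustrative, not from the source.
import numpy as np

def sparse_tuple_from(sequences, dtype=np.int32):
    """Builds the (indices, values, dense_shape) feed for a tf.SparseTensor."""
    indices, values = [], []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray(
        [len(sequences), max(len(s) for s in sequences)], dtype=np.int64)
    return indices, values, shape

# Example: two label sequences of different lengths.
# idx, vals, shp = sparse_tuple_from([[0, 1, 2], [3, 4]])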
def continue_train():
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # ctc_loss requires a SparseTensor, created with sparse_placeholder.
    targets = tf.sparse_placeholder(tf.int32, name='targets')  # text labels
    keep_dropout = tf.placeholder(tf.float32)
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')  # sequence lengths
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    logits = inference(input_tensor, words_size + 1, True, keep_dropout,
                       regularizer, tf.to_int64(seq_length))

    avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(
        targets, logits, seq_length)) + tf.add_n(tf.get_collection('losses'))
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)

    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            logits, seq_length, merge_repeated=False)
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy).
        ler = tf.reduce_mean(distance, name='label_error_rate')

    epochs = 1000
    # ckpt = tf.train.get_checkpoint_state(savedir)
    saver = tf.train.Saver(max_to_keep=5)
    # saver2 = tf.train.Saver(max_to_keep=5)  # create saver

    with tf.Session() as sess:
        choose_cpkt = "BiRNN.cpkt-204"
        sess.run(tf.global_variables_initializer())
        print_tensors_in_checkpoint_file(savedir + choose_cpkt, None, True)
        saver.restore(sess, savedir + choose_cpkt)
        # graph = tf.get_default_graph()
        # cur_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        startepo = 204
        train_start = time.time()
        for epoch in range(startepo, epochs):  # passes over the training set
            epoch_start = time.time()
            # if epoch < startepo:
            #     continue
            print("epoch start:", epoch + 1, "total epochs= ", epochs)
            ## run batch ##
            n_batches_per_epoch = int(np.ceil(len(labels) / batch_size))
            print("total loop ", n_batches_per_epoch, "in one epoch,",
                  batch_size, "items in one loop")

            train_cost = 0
            train_ler = 0
            next_idx = 0

            for batch in range(n_batches_per_epoch):  # batch_size items per loop
                # Fetch the next batch of data.
                next_idx, source, source_lengths, sparse_labels = next_batch(
                    labels, next_idx, batch_size)
                feed = {
                    input_tensor: source,
                    targets: sparse_labels,
                    seq_length: source_lengths,
                    keep_dropout: keep_dropout_rate
                }

                # Run avg_loss and optimizer.
                batch_cost, _ = sess.run([avg_loss, optimizer], feed_dict=feed)
                train_cost += batch_cost

                if (batch + 1) % 50 == 0:
                    print('loop:', batch + 1, 'Train cost: ',
                          train_cost / (batch + 1))
                    feed2 = {
                        input_tensor: source,
                        targets: sparse_labels,
                        seq_length: source_lengths,
                        keep_dropout: 1.0
                    }

                    d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
                    dense_decoded = tf.sparse_tensor_to_dense(
                        d, default_value=-1).eval(session=sess)
                    dense_labels = base.sparse_tuple_to_texts_ch(
                        sparse_labels, words)

                    counter = 0
                    print('Label err rate: ', train_ler)
                    duration = time.time() - train_start
                    print('cost time: {:.2f} min'.format(duration / 60))
                    for orig, decoded_arr in zip(dense_labels, dense_decoded):
                        # convert to strings
                        decoded_str = base.ndarray_to_text_ch(decoded_arr, words)
                        decoded_str = decoded_str.strip().strip('龚')
                        print(' file {}'.format(counter))
                        print('Original: {}'.format(orig))
                        print('Decoded: {}'.format(decoded_str))
                        counter = counter + 1
                    break

            epoch_duration = time.time() - epoch_start
            log = 'Epoch {}/{}, train_cost: {:.3f}, train_ler: {:.3f}, time: {:.2f} sec'
            print(log.format(epoch + 1, epochs, train_cost, train_ler,
                             epoch_duration))
            saver.save(sess, savedir + "BiRNN.cpkt", global_step=epoch + 1)
            print("save cpkt-%s complete." % (epoch + 1))
def CheckpointTest():
    # input_tensor holds the input audio data; its shape is
    # [batch_size, amax_stepsize, n_input + (2 * n_input * n_context)], where
    # batch_size is the batch length, amax_stepsize the number of time steps,
    # and n_input + (2 * n_input * n_context) the number of MFCC features.
    # batch_size varies, so it is set to None; since each batch has a
    # different number of time steps, amax_stepsize is also None.
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
    # targets holds the sparse encoding of the transcript for the audio.
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    # seq_length holds the time-step count of the current batch.
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
    # keep_dropout is the dropout keep rate.
    keep_dropout = tf.placeholder(tf.float32)

    # logits is the non-normalized output/activations from the last layer.
    # logits will be input for the loss function.
    # nn_model is from the import statement in the load_model function
    logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input,
                         n_context, words_size + 1, keep_dropout)

    # Compute the loss with CTC.
    aa = ctc_ops.ctc_loss(targets, logits, seq_length)
    avg_loss = tf.reduce_mean(aa)

    # Optimizer.
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)

    # CTC decoder.
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(
            logits, seq_length, merge_repeated=True)

    # Edit distance.
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy).
        ler = tf.reduce_mean(distance, name='label_error_rate')

    # Number of epochs.
    epochs = 150
    # Directory for saving the model.
    savedir = "saver/"
    # Create the directory if it does not exist.
    if os.path.exists(savedir) == False:
        os.mkdir(savedir)
    # Create the saver.
    saver = tf.train.Saver(max_to_keep=1)

    with tf.Session() as sess:
        # Initialize variables.
        sess.run(tf.global_variables_initializer())
        # If no checkpoint exists, keep the fresh initialization.
        kpt = tf.train.latest_checkpoint(savedir)
        print("kpt:", kpt)
        startepo = 0
        if kpt != None:
            saver.restore(sess, kpt)
            ind = kpt.find("-")
            startepo = int(kpt[ind + 1:])

        # The speech file to recognize.
        wav_file = 'input.wav'
        source, source_lengths, sparse_labels = get_speech_file(wav_file, labels)
        feed2 = {
            input_tensor: source,
            targets: sparse_labels,
            seq_length: source_lengths,
            keep_dropout: 1.0
        }

        d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
        dense_decoded = tf.sparse_tensor_to_dense(
            d, default_value=-1).eval(session=sess)
        if len(dense_decoded) > 0:
            decoded_str = ndarray_to_text_ch(dense_decoded[0], words)
            print('Decoded: {}'.format(decoded_str))
fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList,
                               dtype=tf.float32, scope='BDLSTM_H1')
fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
outH1 = [
    tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) + biasesOutH1
    for t in fbH1rs
]
logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

#### Optimizing
logits3d = tf.pack(logits)
loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

#### Evaluating
logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
predictions = tf.to_int32(
    ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
    tf.to_float(tf.size(targetY.values))

#### Run session
with tf.Session(graph=graph) as session:
    print('Initializing')
    tf.initialize_all_variables().run()
outputs = tf.reshape(outputs, [-1, num_lstm_hidden])

# Weights for regression layer.
W = tf.Variable(tf.truncated_normal([num_lstm_hidden, num_classes],
                                    stddev=0.1), name='W')
b = tf.Variable(tf.constant(0., shape=[num_classes]), name='b')

# Apply linear transform
logits = tf.matmul(outputs, W) + b

# Reshaping back to the original shape
logits = tf.reshape(logits, [batch_s, -1, num_classes])

# Swap dimensions to time major for CTC loss.
logits = tf.transpose(logits, (1, 0, 2))

loss = ctc.ctc_loss(targets, logits, seq_len)
cost = tf.reduce_mean(loss)

# Record the loss
tf.contrib.deprecated.scalar_summary('loss', cost)

optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=momentum,
                                       use_nesterov=True).minimize(cost)

decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits,
                                                sequence_length=seq_len)

# Label error rate using the edit distance between output and target
ler = tf.reduce_mean(
    tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

# Record the label error rate
tf.contrib.deprecated.scalar_summary('label error rate', ler)
input_tensor = tf.placeholder(
    tf.float32, [None, None, n_input + (2 * n_input * n_context)],
    name='input')  # speech log filter bank or MFCC features
# Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
targets = tf.sparse_placeholder(tf.int32, name='targets')  # text labels
# 1d array of size [batch_size]
seq_length = tf.placeholder(tf.int32, [None], name='seq_length')  # sequence lengths
keep_dropout = tf.placeholder(tf.float32)

# logits is the non-normalized output/activations from the last layer.
# logits will be input for the loss function.
# nn_model is from the import statement in the load_model function
logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input,
                     n_context, words_size + 1, keep_dropout)

# Call the CTC loss.
avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(targets, logits, seq_length))

# [optimizer]
learning_rate = 0.001
optimizer = tf.train.AdamOptimizer(
    learning_rate=learning_rate).minimize(avg_loss)

with tf.name_scope("decode"):
    decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
        logits, seq_length, merge_repeated=False)

with tf.name_scope("accuracy"):
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
    # Compute the label error rate (accuracy).
    ler = tf.reduce_mean(distance, name='label_error_rate')
def runCTC(batch):
    INPUT_PATH = '../TRAIN/All/mfcc/'  # directory of MFCC nFeatures x nFrames 2-D array .npy files
    TARGET_PATH = '../TRAIN/All/phone_y/'  # directory of nPhonemes 1-D array .npy files

    #### Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    nEpochs = 300
    batchSize = batch.shape[1]

    #### Network Parameters
    nFeatures = 39  # 12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    nClasses = 30  # 39 phonemes, plus the "blank" for CTC

    #### Load data
    print('Loading data')
    with open('TIMIT_data_prepared_for_CTC.pkl', 'rb') as f:
        data = pickle.load(f)
    input_list = batch
    charmap = data['chars']
    print(charmap)
    charmap.append('_')
    # batchedData, maxTimeSteps = data_lists_to_batches(input_list, target_list, batchSize)
    maxTimeSteps = 776
    totalN = len(input_list)

    #### Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():
        #### NOTE: try variable-steps inputs and dynamic bidirectional rnn,
        #### when it's implemented in tensorflow

        #### Graph input
        inputX = tf.placeholder(tf.float32,
                                shape=(maxTimeSteps, batchSize, nFeatures))
        # Prep input data to fit requirements of rnn.bidirectional_rnn
        # Reshape to 2-D tensor (nTimeSteps*batchSize, nFeatures)
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        # Split to get a list of 'n_steps' tensors of shape (batch_size, n_hidden)
        inputList = tf.split(0, maxTimeSteps, inputXrs)
        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=(batchSize))

        #### Weights & biases
        weightsOutH1 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        weightsOutH2 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(
            tf.truncated_normal([nHidden, nClasses],
                                stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        #### Network
        forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True,
                                      state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True,
                                       state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList,
                                       dtype=tf.float32, scope='BDLSTM_H1')
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [
            tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) +
            biasesOutH1 for t in fbH1rs
        ]
        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        #### Optimizing
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate,
                                               momentum).minimize(loss)

        #### Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                 [seqLengths[0], 1])
        predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(
            tf.edit_distance(predictions, targetY, normalize=False)) / \
            tf.to_float(tf.size(targetY.values))

    #### Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        feedDict = {inputX: batch, seqLengths: (np.ones([batchSize]) * 776)}
        logit = session.run([logits3d], feed_dict=feedDict)
    return logit
forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
print("building bidirectional_rnn ... SLOW!!!")
fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList,
                               dtype=tf.float32, scope='BDLSTM_H1')
print("done building rnn")
print("building fbH1rs ")
fbH1rs = [tf.reshape(t, [Size, 2, nHidden]) for t in fbH1]
print("building outH1 ")
outH1 = [
    tf.reduce_sum(tf.multiply(t, weightsOutH1), axis=1) + biasesOutH1
    for t in fbH1rs
]
print("building logits ")
logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
print("len(outH1) %d" % len(outH1))

#### Optimizing
print("building loss")
logits3d = tf.stack(logits)
loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
out = tf.identity(loss, 'ctc_loss_mean')
optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

#### Evaluating
print("building Evaluation")
logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
predictions = tf.to_int32(
    ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
reduced_sum = tf.reduce_sum(
    tf.edit_distance(predictions, targetY, normalize=False))
errorRate = reduced_sum / tf.to_float(tf.size(targetY.values))
check_op = tf.add_check_numerics_ops()
print("done building graph")

#### Run session
with tf.Session(graph=graph) as session: