def decode(self, predictions, seq_len, k):
    """Decode CTC network outputs with the decoder named by ``self.ctc_decoder``.

    Args:
        predictions: Network outputs, handed to the TensorFlow CTC decoder
            unchanged (presumably time-major logits -- confirm with caller).
        seq_len: Per-example sequence lengths, forwarded unchanged.
        k: Number of best paths requested from the beam-search decoder
            (``top_paths``); the greedy decoder does not use it.

    Returns:
        The ``decoded`` output of the selected decoder. The decoder's
        log-probability output is discarded.

    Raises:
        ValueError: If ``self.ctc_decoder`` is neither ``'greedy'`` nor
            ``'beam_search'``.
    """
    if self.ctc_decoder == 'greedy':
        decoded, _ = ctc_ops.ctc_greedy_decoder(predictions, seq_len)
    elif self.ctc_decoder == 'beam_search':
        decoded, _ = ctc_ops.ctc_beam_search_decoder(
            predictions, seq_len, top_paths=k)
    else:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working; the message now names the real problem.
        raise ValueError(
            "CTC decoder not supported: {}".format(self.ctc_decoder))
    return decoded
def setup_decoder(self):
    """Create the CTC decoding ops selected by ``self.beam_search_decoder``.

    ``'default'`` uses ``ctc_ops.ctc_beam_search_decoder`` and ``'greedy'``
    uses ``ctc_ops.ctc_greedy_decoder``. Either one is applied to
    ``self.logits`` / ``self.seq_length`` with ``merge_repeated=False`` and
    its two outputs are stored as ``self.decoded`` and ``self.log_prob``.
    Any other option only logs a warning and leaves both attributes unset.
    """
    with tf.name_scope("decode"):
        option = self.beam_search_decoder
        if option == 'default':
            decoder_fn = ctc_ops.ctc_beam_search_decoder
        elif option == 'greedy':
            decoder_fn = ctc_ops.ctc_greedy_decoder
        else:
            logging.warning("Invalid beam search decoder option selected!")
            return
        self.decoded, self.log_prob = decoder_fn(
            self.logits, self.seq_length, merge_repeated=False)
def ctc_decode(self, y_pred, input_length, greedy=True, beam_width=100,
               top_paths=1, merge_repeated=False):
    """Decode softmax outputs with a CTC greedy or beam-search decoder.

    # Arguments
        y_pred: tensor documented as `(samples, time_steps, num_categories)`
            holding the softmax output; it is transposed to time-major and
            converted to log space before decoding.
        input_length: tensor documented as `(samples, )` with the sequence
            length of every batch item; cast to int32.
        greedy: if truthy, use the greedy (best path) decoder; otherwise use
            beam search.
        beam_width: beam width, used only by beam search.
        top_paths: number of paths to return, used only by beam search.
        merge_repeated: forwarded to the beam-search decoder only.

    # Returns
        Tuple `(decoded_dense, log_prob)`:
            decoded_dense: list with one dense tensor per decoded path;
                positions without a label are filled with `-1`.
            log_prob: the log-probability output of the decoder used.
    """
    epsilon = 1e-7
    # The CTC decoders are given time-major inputs, hence the transpose.
    time_major = tf.transpose(y_pred, perm=[1, 0, 2])
    log_probs = tf_math_ops.log(time_major + epsilon)
    lengths = tf.cast(input_length, tf.int32)

    if not greedy:
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            inputs=log_probs,
            sequence_length=lengths,
            beam_width=beam_width,
            top_paths=top_paths,
            merge_repeated=merge_repeated)
    else:
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(
            inputs=log_probs, sequence_length=lengths)

    decoded_dense = [
        tf.sparse.to_dense(sparse_path, default_value=-1)
        for sparse_path in decoded
    ]
    return decoded_dense, log_prob
def build_graph(self, args, maxTimeSteps):
    """Build the residual-network CTC graph and expose its ops on ``self``.

    Args:
        args: Settings object. This method reads ``batch_size``,
            ``num_feature``, ``num_class``, ``learning_rate`` and
            ``optimizer`` (called with the learning rate to create the
            optimizer).
        maxTimeSteps: Number of time steps in the input placeholder.

    Side effects:
        Creates ``self.graph`` and stores on ``self``: the placeholders
        ``inputX``, ``targetIxs``, ``targetVals``, ``targetShape`` and
        ``seqLengths``; ``inputList``; the sparse target ``targetY``;
        ``config``; and the ops ``loss``, ``optimizer``, ``logitsMaxTest``,
        ``predictions``, ``errorRate``, ``initial_op``, ``saver``,
        ``var_op`` and ``var_trainable_op``.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        batch = args.batch_size
        n_feat = args.num_feature

        # Input is declared as (maxTimeSteps, batch_size, num_feature).
        self.inputX = tf.placeholder(
            tf.float32, shape=(maxTimeSteps, batch, n_feat))
        flat_inputs = tf.reshape(self.inputX, [-1, n_feat])
        # The flattened input cut into maxTimeSteps pieces along axis 0.
        self.inputList = tf.split(flat_inputs, maxTimeSteps, 0)

        # The sparse CTC target is fed as three separate components.
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(
            self.targetIxs, self.targetVals, self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=batch)

        n_layers, n_featuremaps = 10, 8
        self.config = {
            'name': 'residual network',
            'num_layer': n_layers,
            'num_featuremap': n_featuremaps,
            'num_class': args.num_class,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate,
        }

        # NOTE(review): this is a plain reshape of the (time, batch, feature)
        # placeholder, not a transpose -- confirm build_resnet expects that.
        net_input = tf.reshape(
            self.inputX, [batch, maxTimeSteps, n_feat, 1])
        net_output = build_resnet(
            net_input, maxTimeSteps, n_layers, n_featuremaps, args.num_class)

        ctc_losses = ctc.ctc_loss(self.targetY, net_output, self.seqLengths)
        self.loss = tf.reduce_mean(ctc_losses)
        trainer = args.optimizer(args.learning_rate)
        self.optimizer = trainer.minimize(self.loss)

        # Arg-max class per step, sliced to the first sequence's length.
        best_class = tf.argmax(net_output, 2)
        first_length = self.seqLengths[0]
        self.logitsMaxTest = tf.slice(best_class, [0, 0], [first_length, 1])

        beam_paths = ctc.ctc_beam_search_decoder(net_output, self.seqLengths)
        self.predictions = tf.to_int32(beam_paths[0][0])

        # Total (unnormalised) edit distance divided by the label count.
        total_distance = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targetY, normalize=False))
        label_count = tf.to_float(tf.size(self.targetY.values))
        self.errorRate = total_distance / label_count

        self.initial_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=2,
            keep_checkpoint_every_n_hours=1)
        self.var_op = tf.global_variables()
        self.var_trainable_op = tf.trainable_variables()
def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
    """Decode softmax outputs with a CTC greedy or beam-search decoder.

    # Arguments
        y_pred: tensor documented as `(samples, time_steps, num_categories)`
            holding the softmax output; it is transposed to time-major and
            converted to log space before decoding.
        input_length: tensor documented as `(samples, )` with the sequence
            length of every batch item; converted to int32.
        greedy: if truthy, use the greedy (best path) decoder; otherwise use
            beam search.
        beam_width: beam width, used only by beam search.
        top_paths: number of paths to return, used only by beam search.

    # Returns
        Tuple `(decoded_dense, log_prob)`:
            decoded_dense: list with one dense tensor per decoded path;
                positions without a label are filled with `-1`.
            log_prob: the log-probability output of the decoder used.

    Both decoders are called with `merge_repeated=False`.
    """
    time_major = tf.transpose(y_pred, perm=[1, 0, 2])
    log_probs = tf.log(time_major + 1e-8)
    lengths = tf.to_int32(input_length)

    if greedy:
        decoded, log_prob = ctc.ctc_greedy_decoder(
            inputs=log_probs,
            sequence_length=lengths,
            merge_repeated=False)
    else:
        decoded, log_prob = ctc.ctc_beam_search_decoder(
            inputs=log_probs,
            sequence_length=lengths,
            beam_width=beam_width,
            top_paths=top_paths,
            merge_repeated=False)

    decoded_dense = []
    for sparse_path in decoded:
        dense_path = tf.sparse_to_dense(
            sparse_path.indices,
            sparse_path.dense_shape,
            sparse_path.values,
            default_value=-1)
        decoded_dense.append(dense_path)
    return (decoded_dense, log_prob)
def build_graph(self, args, maxTimeSteps):
    """Build the bidirectional-RNN CTC graph and expose its ops on ``self``.

    Args:
        args: Settings object. This method reads ``batch_size``,
            ``num_feature``, ``model``, ``num_layer``, ``num_hidden``,
            ``num_class``, ``activation``, ``optimizer``, ``learning_rate``
            and ``log_dir``.
        maxTimeSteps: Number of time steps in the input placeholder.

    Side effects:
        Creates ``self.graph`` and stores on ``self``: the placeholders
        ``inputX``, ``targetIxs``, ``targetVals``, ``targetShape`` and
        ``seqLengths``; ``inputXX``; the sparse target ``targetY``;
        ``config``; the ops ``loss``, ``optimizer``, ``logitsMaxTest``,
        ``predictions``, ``errorRate``, ``initial_op``, ``saver``,
        ``var_op`` and ``var_trainable_op``; and the ``logfile`` path.
        Prints the static shape of the concatenated RNN output.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Input is declared as (maxTimeSteps, batch_size, num_feature).
        self.inputX = tf.placeholder(
            tf.float32,
            shape=(maxTimeSteps, args.batch_size, args.num_feature))
        # NOTE(review): this is a reshape, not a transpose, of the
        # (time, batch, feature) placeholder -- confirm the element order is
        # what the batch-major RNN below is meant to receive.
        self.inputXX = tf.reshape(
            self.inputX,
            shape=(args.batch_size, maxTimeSteps, args.num_feature))
        # Not referenced below; the op is still created as in the original.
        inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])

        # The sparse CTC target is fed as three separate components.
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(
            self.targetIxs, self.targetVals, self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))

        self.config = {
            'name': args.model,
            'rnncell': self.cell_fn,
            'num_layer': args.num_layer,
            'num_hidden': args.num_hidden,
            'num_class': args.num_class,
            'activation': args.activation,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate,
        }

        # Forward and backward cells of the single bidirectional layer.
        forwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
        backwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
        fbH1, state = bidirectional_dynamic_rnn(
            forwardH1,
            backwardH1,
            self.inputXX,
            sequence_length=self.seqLengths,
            dtype=tf.float32,
            scope='BDRNN_H1')
        # Join forward and backward outputs along the feature axis.
        fbH1 = tf.concat(2, fbH1)
        # Fix: the original printed the bound method `get_shape` rather than
        # the shape itself.
        print(fbH1.get_shape())
        shape = fbH1.get_shape().as_list()

        # Flatten the two leading axes, then cut into shape[1] pieces.
        # NOTE(review): confirm which of the two leading axes is time here;
        # the split count is shape[1].
        fbH1 = tf.reshape(fbH1, [shape[0] * shape[1], -1])
        fbH1_list = tf.split(0, shape[1], fbH1)
        logits = [
            build_forward_layer(t, [shape[2], args.num_class], kernel='linear')
            for t in fbH1_list
        ]
        logits3d = tf.pack(logits)

        self.loss = tf.reduce_mean(
            ctc.ctc_loss(logits3d, self.targetY, self.seqLengths))
        self.optimizer = tf.train.AdamOptimizer(
            args.learning_rate).minimize(self.loss)

        # Arg-max class per step, sliced to the first sequence's length.
        self.logitsMaxTest = tf.slice(
            tf.argmax(logits3d, 2), [0, 0], [self.seqLengths[0], 1])
        self.predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
        # Total (unnormalised) edit distance divided by the label count.
        self.errorRate = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targetY, normalize=False)
        ) / tf.to_float(tf.size(self.targetY.values))

        self.initial_op = tf.initialize_all_variables()
        self.saver = tf.train.Saver(
            tf.all_variables(),
            max_to_keep=5,
            keep_checkpoint_every_n_hours=1)
        # Log file name: log_dir + timestamp + '.txt', with spaces and
        # slashes stripped from the timestamp part.
        self.logfile = args.log_dir + str(
            datetime.datetime.strftime(
                datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') + '.txt'
        ).replace(' ', '').replace('/', '')
        self.var_op = tf.all_variables()
        self.var_trainable_op = tf.trainable_variables()
def ctc_decode(y_pred, input_length, max_output_length):
    """Decode softmax outputs with CTC beam search and pad the best path.

    Cut down from
    https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L4170

    # Arguments
        y_pred: tensor documented as `(samples, time_steps, num_categories)`
            holding the softmax output.
        input_length: tensor with the sequence length of each batch item;
            its last axis is squeezed away before use.
        max_output_length: int giving the max output sequence length.

    # Returns
        Dense tensor of the best beam-search path (beam width 10), with `-1`
        in positions that carry no label. When it has fewer than
        `max_output_length + 2` columns it is right-padded with `-1` up to
        that width; otherwise it is returned unchanged.
    """
    time_major = tf.transpose(y_pred, perm=[1, 0, 2])
    log_probs = tf.log(time_major + K.epsilon())
    lengths = tf.to_int32((tf.squeeze(input_length, axis=-1)))

    paths, _ = ctc_ops.ctc_beam_search_decoder(
        inputs=log_probs, sequence_length=lengths, beam_width=10)
    best = paths[0]
    dense = tf.sparse_to_dense(
        best.indices, best.dense_shape, best.values, default_value=-1)

    # The decoded width varies from call to call, so narrow results are
    # padded to one target width (2 extra columns of CTC leeway).
    target_width = max_output_length + 2
    width = tf.shape(dense)[-1]
    return tf.cond(
        tf.less(width, target_width),
        lambda: tf.pad(
            dense, [[0, 0], [0, target_width - width]], constant_values=-1),
        lambda: dense)
def ctc_complete_analysis_lambda_func(args, **arguments):
    """Compute the label error rate of CTC-decoded predictions.

    WARNING: the Keras backend must be TensorFlow (this is asserted).

    :param args: sequence ``(y_pred, labels, input_length, label_len)``
    :param arguments: must contain the keys ``greedy``, ``beam_width`` and
        ``top_paths``
    :return: ler = label error rate, as a float32 tensor obtained by passing
        the normalised edit distance through ``Kreshape_To1D``
    """
    y_pred, labels, input_length, label_len = args
    assert (K.backend() == 'tensorflow')

    # Time-major log-probabilities for the CTC decoders.
    log_probs = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
    seq_lengths = tf.to_int32(tf.squeeze(input_length))

    # All three options are looked up up front, so every key is required
    # even when greedy decoding ignores the beam settings.
    use_greedy = arguments['greedy']
    width = arguments['beam_width']
    n_paths = arguments['top_paths']

    if not use_greedy:
        decoded, log_prob = ctc.ctc_beam_search_decoder(
            inputs=log_probs,
            sequence_length=seq_lengths,
            beam_width=width,
            top_paths=n_paths)
    else:
        decoded, log_prob = ctc.ctc_greedy_decoder(
            inputs=log_probs, sequence_length=seq_lengths)

    # Only the first decoded path is scored against the labels.
    best_path = tf.cast(decoded[0], tf.float32)
    label_lengths = tf.cast(tf.squeeze(label_len), tf.int32)
    sparse_labels = K.ctc_label_dense_to_sparse(labels, label_lengths)
    distances = tf_edit_distance(best_path, sparse_labels, norm=True)
    return K.cast(Kreshape_To1D(distances), dtype='float32')
def loss(self):
    """Define the loss, training, decoding and error-rate ops.

    Reads ``self.text``, ``self.logits`` and ``self.seq_length`` and stores
    ``self.avg_loss``, ``self.optimizer``, ``self.decoded``,
    ``self.distance`` and ``self.label_err``. Also registers the scalar
    summaries ``'loss'`` and ``'accuracy'``.

    :return: None
    """
    # CTC loss, averaged.
    with tf.name_scope('loss'):
        ctc_losses = ctc_ops.ctc_loss(self.text, self.logits, self.seq_length)
        self.avg_loss = tf.reduce_mean(ctc_losses)
        tf.summary.scalar('loss', self.avg_loss)

    # Optimizer / training step.
    with tf.name_scope('train'):
        adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.optimizer = adam.minimize(self.avg_loss)

    with tf.name_scope("decode"):
        self.decoded, _ = ctc_ops.ctc_beam_search_decoder(
            self.logits, self.seq_length, merge_repeated=False)

    with tf.name_scope("accuracy"):
        best_path = tf.cast(self.decoded[0], tf.int32)
        self.distance = tf.edit_distance(best_path, self.text)
        # Label error rate; it is published under the summary tag 'accuracy'.
        self.label_err = tf.reduce_mean(
            self.distance, name='label_error_rate')
        tf.summary.scalar('accuracy', self.label_err)
def loss(self):
    """Define the loss, training, decoding and error-rate ops.

    Reads ``self.text``, ``self.logits`` and ``self.seq_length`` and stores
    ``self.avg_loss``, ``self.optimizer``, ``self.decoded``,
    ``self.distance`` and ``self.label_err``. Also registers the scalar
    summaries ``'loss'`` and ``'accuracy'``.

    :return: None
    """
    # CTC loss, averaged.
    with tf.name_scope('loss'):
        ctc_losses = ctc_ops.ctc_loss(self.text, self.logits, self.seq_length)
        self.avg_loss = tf.reduce_mean(ctc_losses)
        tf.summary.scalar('loss', self.avg_loss)

    # Optimizer / training step.
    with tf.name_scope('train'):
        adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.optimizer = adam.minimize(self.avg_loss)

    with tf.name_scope("decode"):
        self.decoded, _ = ctc_ops.ctc_beam_search_decoder(
            self.logits, self.seq_length, merge_repeated=False)

    with tf.name_scope("accuracy"):
        best_path = tf.cast(self.decoded[0], tf.int32)
        self.distance = tf.edit_distance(best_path, self.text)
        # Label error rate; it is published under the summary tag 'accuracy'.
        self.label_err = tf.reduce_mean(
            self.distance, name='label_error_rate')
        tf.summary.scalar('accuracy', self.label_err)
def build_graph(self, args, maxTimeSteps):
    """Build the multi-layer dynamic BRNN CTC graph and expose its ops.

    Args:
        args: Settings object. This method reads ``batch_size``,
            ``num_feature``, ``model``, ``num_layer``, ``num_hidden``,
            ``num_class``, ``activation``, ``optimizer``, ``learning_rate``,
            ``grad_clip`` and ``log_dir``.
        maxTimeSteps: Number of time steps in the input placeholder.

    Side effects:
        Creates ``self.graph`` and stores on ``self``: the placeholders
        ``inputX``, ``targetIxs``, ``targetVals``, ``targetShape`` and
        ``seqLengths``; ``inputList``; the sparse target ``targetY``;
        ``config``; the ops ``loss``, ``var_op``, ``var_trainable_op``,
        ``optimizer``, ``logitsMaxTest``, ``predictions``, ``errorRate``,
        ``initial_op`` and ``saver``; and the ``logfile`` path.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        batch = args.batch_size
        n_feat = args.num_feature

        # Input is declared as (maxTimeSteps, batch_size, num_feature).
        self.inputX = tf.placeholder(
            tf.float32, shape=(maxTimeSteps, batch, n_feat))
        flat_inputs = tf.reshape(self.inputX, [-1, n_feat])
        # The flattened input cut into maxTimeSteps pieces along axis 0.
        self.inputList = tf.split(flat_inputs, maxTimeSteps, 0)

        # The sparse CTC target is fed as three separate components.
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(
            self.targetIxs, self.targetVals, self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=batch)

        self.config = {
            'name': args.model,
            'rnncell': self.cell_fn,
            'num_layer': args.num_layer,
            'num_hidden': args.num_hidden,
            'num_class': args.num_class,
            'activation': args.activation,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate,
        }

        # NOTE(review): the RNN builder receives self.args, not the `args`
        # parameter -- confirm the two are the same object.
        rnn_outputs = build_multi_dynamic_brnn(
            self.args, maxTimeSteps, self.inputX, self.cell_fn,
            self.seqLengths)

        with tf.name_scope('fc-layer'):
            with tf.variable_scope('fc'):
                # The name is attached to the initial-value op, not to the
                # Variable; kept as-is so existing variable names stay put.
                class_weights = tf.Variable(
                    tf.truncated_normal(
                        [args.num_hidden, args.num_class],
                        name='weightsClasses'))
                class_biases = tf.Variable(
                    tf.zeros([args.num_class]), name='biasesClasses')
                logits = []
                for step_output in rnn_outputs:
                    logits.append(
                        tf.matmul(step_output, class_weights) + class_biases)
        logits3d = tf.stack(logits)

        ctc_losses = ctc.ctc_loss(self.targetY, logits3d, self.seqLengths)
        self.loss = tf.reduce_mean(ctc_losses)
        self.var_op = tf.global_variables()
        self.var_trainable_op = tf.trainable_variables()

        if args.grad_clip != -1:
            # Clip gradients by global norm before applying them.
            raw_grads = tf.gradients(self.loss, self.var_trainable_op)
            clipped, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
            adam = tf.train.AdamOptimizer(args.learning_rate)
            self.optimizer = adam.apply_gradients(
                zip(clipped, self.var_trainable_op))
        else:
            # grad_clip == -1 means no gradient clipping.
            self.optimizer = tf.train.AdamOptimizer(
                args.learning_rate).minimize(self.loss)

        # Arg-max class per step, sliced to the first sequence's length.
        best_class = tf.argmax(logits3d, 2)
        first_length = self.seqLengths[0]
        self.logitsMaxTest = tf.slice(best_class, [0, 0], [first_length, 1])

        beam_paths = ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)
        self.predictions = tf.to_int32(beam_paths[0][0])

        # Total (unnormalised) edit distance divided by the label count.
        total_distance = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targetY, normalize=False))
        label_count = tf.to_float(tf.size(self.targetY.values))
        self.errorRate = total_distance / label_count

        self.initial_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=5,
            keep_checkpoint_every_n_hours=1)

        # Log file name: log_dir + timestamp + '.txt', with spaces and
        # slashes stripped from the timestamp part.
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        file_name = (stamp + '.txt').replace(' ', '').replace('/', '')
        self.logfile = args.log_dir + file_name
# Swap the first two axes of the LSTM outputs.
lstm_output_tr = tf.transpose(lstm_outputs_re, perm=[1, 0, 2])

# Output projection parameters.
W2 = weight_variable([2 * hidden_size, output_size])
b2 = bias_variable([output_size])

# Flatten, project to the output size and apply a softmax per row.
l_reshape3 = tf.reshape(lstm_output_tr, shape=[-1, 2 * hidden_size])
h_2 = tf.matmul(l_reshape3, W2) + b2
l_reshape4 = tf.reshape(h_2, shape=[-1, output_size])
l_soft = tf.nn.softmax(l_reshape4)

# Restore the sequence axis, then swap the first two axes for the CTC loss.
l_soft_reshaped = tf.reshape(l_soft, shape=[-1, n_time_steps, output_size])
l_soft_tr = tf.transpose(l_soft_reshaped, perm=[1, 0, 2])

_ctc_losses = tf.nn.ctc_loss(l_soft_tr, targets, seqLengths)
loss = tf.reduce_mean(_ctc_losses)
optimizer = tf.train.AdamOptimizer(learningRate).minimize(loss)

# NOTE(review): the loss is fed l_soft_tr while the arg-max and the decoder
# below are fed the un-transposed l_soft_reshaped -- confirm that is intended.
_best_class = tf.argmax(l_soft_reshaped, 2)
logitsMaxTest = tf.slice(_best_class, [0, 0], [seqLengths[0], 1])
_beam_paths = ctc.ctc_beam_search_decoder(l_soft_reshaped, seqLengths)
predictions = tf.to_int32(_beam_paths[0][0])

# Total (unnormalised) edit distance divided by the label count.
_total_distance = tf.reduce_sum(
    tf.edit_distance(predictions, targets, normalize=False))
errorRate = _total_distance / tf.to_float(tf.size(targets.values))


def getminibatch(x, y, bs):
    """Return a random mini-batch of ``bs`` examples drawn from ``x`` / ``y``.

    The result is a dict with keys ``'x'`` (array of the chosen inputs),
    ``'ind'``, ``'val'`` and ``'shape'`` (the three values returned by
    ``target_list_to_sparse_tensor`` for the chosen targets) and ``'seqlen'``
    (a float array of length ``bs`` filled with 776).
    """
    chosen = np.random.permutation(len(x))[:bs]
    picked_inputs = np.array([x[idx] for idx in chosen])
    indices, values, dense_shape = target_list_to_sparse_tensor(
        [y[idx] for idx in chosen])
    return {
        'x': picked_inputs,
        'ind': indices,
        'val': values,
        'shape': dense_shape,
        # Every sequence is given the same fixed length.
        'seqlen': np.full([bs], 776.0),
    }


number_of_batches = 100
batch_size_var = 38
nEpochs = 100
logits = tf.reshape(logits, [batch_s, -1, num_classes]) # Swap dimensions to time major for CTC loss. logits = tf.transpose(logits, (1, 0, 2)) loss = ctc.ctc_loss(targets, logits, seq_len) cost = tf.reduce_mean(loss) # Record the loss tf.contrib.deprecated.scalar_summary('loss', cost) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True).minimize(cost) decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits, sequence_length=seq_len) # Label error rate using the edit distance between output and target ler = tf.reduce_mean( tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) # Record the label error rate tf.contrib.deprecated.scalar_summary('label error rate', ler) saver = tf.train.Saver() merged = tf.contrib.deprecated.merge_all_summaries() train_writer = tf.summary.FileWriter('./summaries/train', graph) test_writer = tf.summary.FileWriter('./summaries/test', graph) def test_decoding(input_feed_dict, input_original):
def train(audio_processer, num_inputs, num_classes, model_architecture,
          model_size_info, learning_rate, training_steps, batch_size,
          aligning, eval_step_interval, output_dir):
    """Train the CTC speech model, evaluate it periodically, then test it.

    Args:
        audio_processer: Data source. This function calls its
            ``get_max_step``, ``get_batch_count`` and ``get_data`` methods;
            ``get_data`` results are indexed as ``[0]`` inputs, ``[1]``
            sparse labels and ``[2]`` sequence lengths.
        num_inputs: Size of the last axis of the input placeholder.
        num_classes: Passed to ``prepare_model_settings``.
        model_architecture: Passed to ``create_model``.
        model_size_info: Passed to ``create_model``.
        learning_rate: Indexable with two entries; entry 0 is used while
            ``training_step <= training_steps[0]``, entry 1 afterwards.
        training_steps: Sequence of step counts; their sum is the total
            number of outer training iterations.
        batch_size: Number of examples requested per batch.
        aligning: Forwarded to ``get_max_step`` and ``get_data``.
        eval_step_interval: A checkpoint is saved and one random dev batch
            is evaluated every this many training steps.
        output_dir: Prefix for the ``train/`` checkpoint and log paths.

    Side effects:
        Creates a TensorFlow session, restores the latest checkpoint under
        ``output_dir + 'train/'`` when one exists, writes checkpoints and a
        graph summary, and prints progress to stdout.
    """
    X = tf.placeholder(
        dtype=tf.float32,
        shape=[None, audio_processer.get_max_step(aligning), num_inputs],
        name='input_tensor')
    sequence_len = tf.placeholder(
        dtype=tf.int32, shape=[None], name='sequence_len')
    Y = tf.sparse_placeholder(dtype=tf.int32, name='output_tensor')

    model_settings = prepare_model_settings(20, num_classes)
    logits, dropout_prob = create_model(
        X, sequence_len, model_settings, model_architecture,
        model_size_info, True)

    with tf.name_scope('loss'):
        avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(Y, logits, sequence_len))
        tf.summary.scalar('loss', avg_loss)

    with tf.name_scope('train'):
        # The learning rate is fed per step so it can change mid-training.
        learning_rate_input = tf.placeholder(
            tf.float32, [], name='learning_rate_input')
        train_step = tf.train.AdamOptimizer(
            learning_rate=learning_rate_input).minimize(avg_loss)

    with tf.name_scope("decoder"):
        decoder, _ = ctc_ops.ctc_beam_search_decoder(
            logits, sequence_len, merge_repeated=False)
        # Built once here: creating this op inside the test loop added new
        # nodes to the graph on every batch.
        dense_decoder = tf.sparse_tensor_to_dense(
            decoder[0], default_value=-1)

    with tf.name_scope("accuracy"):
        # Mean edit distance; it is published under the tag 'accuracy'.
        evaluation_step = tf.reduce_mean(
            tf.edit_distance(tf.cast(decoder[0], tf.int32), Y))
        tf.summary.scalar('accuracy', evaluation_step)

    if tf.test.gpu_device_name():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    else:
        sess = tf.InteractiveSession()

    saver = tf.train.Saver(max_to_keep=2)
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.latest_checkpoint(output_dir + 'train/')
    if ckpt:
        saver.restore(sess, ckpt)

    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(
        output_dir + 'train/logs/', sess.graph)

    num_train_batches = audio_processer.get_batch_count(batch_size, 'train')
    num_dev_batches = audio_processer.get_batch_count(batch_size, 'dev')
    total_training_step = sum(training_steps)

    for training_step in range(1, total_training_step + 1):
        total_train_loss = 0
        epoch_start = time.time()
        learning_rate_value = learning_rate[
            1 if training_step > training_steps[0] else 0]

        for batch in range(num_train_batches):
            data = audio_processer.get_data(
                batch * batch_size, batch_size, 'train', aligning)
            loss, _ = sess.run(
                [avg_loss, train_step],
                feed_dict={
                    X: data[0],
                    Y: data[1],
                    sequence_len: data[2],
                    learning_rate_input: learning_rate_value,
                    dropout_prob: 0.95
                })
            total_train_loss += loss

        time_cost = time.time() - epoch_start
        print('training step: %d/%d, train loss: %g, time cost: %.2fs' %
              (training_step, total_training_step,
               total_train_loss / num_train_batches, time_cost))

        if training_step % eval_step_interval == 0:
            saver.save(sess,
                       output_dir + "train/speech-model.ckpt",
                       global_step=training_step)
            # Evaluate on one randomly chosen dev batch.
            rand_batch = random.randint(0, num_dev_batches - 1)
            data = audio_processer.get_data(
                rand_batch * batch_size, batch_size, 'dev', aligning)
            dev_accuracy = sess.run(
                evaluation_step,
                feed_dict={
                    X: data[0],
                    Y: data[1],
                    sequence_len: data[2],
                    dropout_prob: 1.0
                })
            print('WER: %.2f, training step: %d/%d' %
                  (dev_accuracy, training_step, total_training_step))

    total_test_accuracy = 0
    num_test_batches = audio_processer.get_batch_count(batch_size, 'test')
    # NOTE(review): this loop starts at batch 1 while the training loop
    # starts at batch 0 -- confirm the first test batch is skipped on purpose.
    for batch in range(1, num_test_batches + 1):
        data = audio_processer.get_data(
            batch * batch_size, batch_size, 'test', aligning)
        dense_decodes, accuracy = sess.run(
            [dense_decoder, evaluation_step],
            feed_dict={
                X: data[0],
                Y: data[1],
                sequence_len: data[2],
                dropout_prob: 1.0
            })
        total_test_accuracy += accuracy
        # Fix: use the labels of the batch just decoded (`data`); the
        # original read `test_data`, which this function never defines.
        # NOTE(review): `lexicon` is not a parameter here -- presumably a
        # module-level name; confirm.
        dense_labels = sparse_tuple_to_string(data[1], lexicon)
        # Only the first example of each batch is printed.
        for orig, decode_array in zip(dense_labels, dense_decodes):
            decoded_str = trans_array_to_string(decode_array, lexicon)
            print('语音原始文本: {}'.format(orig))
            print('识别出来的文本: {}'.format(decoded_str))
            break

    # NOTE(review): this prints the sum over test batches, not their mean.
    print('Final WER: %.2f, train steps: %d' %
          (total_test_accuracy, total_training_step))
def continue_train():
    """Rebuild the BiRNN CTC graph, restore a checkpoint and keep training.

    Takes no arguments; it relies on module-level names (``n_input``,
    ``n_context``, ``words_size``, ``REGULARIZATION_RATE``, ``savedir``,
    ``labels``, ``batch_size``, ``keep_dropout_rate``, ``words`` and the
    helpers it calls). The checkpoint ``"BiRNN.cpkt-204"`` under ``savedir``
    is restored and training runs from epoch index 204 up to 1000, saving a
    checkpoint after every epoch and printing progress to stdout.
    """
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # ctc_loss needs a SparseTensor, hence the sparse placeholder (the text).
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    keep_dropout = tf.placeholder(tf.float32)
    # Sequence lengths.
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')

    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    logits = inference(input_tensor, words_size + 1, True, keep_dropout,
                       regularizer, tf.to_int64(seq_length))

    # Mean CTC loss plus everything collected under 'losses'.
    avg_loss = tf.reduce_mean(
        ctc_ops.ctc_loss(targets, logits, seq_length)
    ) + tf.add_n(tf.get_collection('losses'))

    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)

    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            logits, seq_length, merge_repeated=False)
        # Built once here: creating this op inside the training loop added
        # new nodes to the graph every time progress was reported.
        dense_decoded_op = tf.sparse_tensor_to_dense(
            decoded[0], default_value=-1)

    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Label error rate (reported as "accuracy").
        ler = tf.reduce_mean(distance, name='label_error_rate')

    epochs = 1000
    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        choose_cpkt = "BiRNN.cpkt-204"
        sess.run(tf.global_variables_initializer())
        print_tensors_in_checkpoint_file(savedir + choose_cpkt, None, True)
        saver.restore(sess, savedir + choose_cpkt)

        # Matches the step suffix of the checkpoint restored above.
        startepo = 204
        train_start = time.time()
        for epoch in range(startepo, epochs):  # passes over the sample set
            epoch_start = time.time()
            print("epoch start:", epoch + 1, "total epochs= ", epochs)

            n_batches_per_epoch = int(np.ceil(len(labels) / batch_size))
            print("total loop ", n_batches_per_epoch, "in one epoch,",
                  batch_size, "items in one loop")

            train_cost = 0
            train_ler = 0
            next_idx = 0
            for batch in range(n_batches_per_epoch):
                # Fetch the next batch_size examples.
                next_idx, source, source_lengths, sparse_labels = next_batch(
                    labels, next_idx, batch_size)
                feed = {
                    input_tensor: source,
                    targets: sparse_labels,
                    seq_length: source_lengths,
                    keep_dropout: keep_dropout_rate
                }
                # Run the loss and one optimizer step.
                batch_cost, _ = sess.run([avg_loss, optimizer],
                                         feed_dict=feed)
                train_cost += batch_cost

                if (batch + 1) % 50 == 0:
                    print('loop:', batch + 1, 'Train cost: ',
                          train_cost / (batch + 1))
                    # Same batch again, with dropout disabled.
                    feed2 = {
                        input_tensor: source,
                        targets: sparse_labels,
                        seq_length: source_lengths,
                        keep_dropout: 1.0
                    }
                    dense_decoded, train_ler = sess.run(
                        [dense_decoded_op, ler], feed_dict=feed2)
                    dense_labels = base.sparse_tuple_to_texts_ch(
                        sparse_labels, words)
                    counter = 0
                    print('Label err rate: ', train_ler)
                    duration = time.time() - train_start
                    print('cost time: {:.2f} min'.format(duration / 60))
                    # Only the first example of the batch is printed.
                    for orig, decoded_arr in zip(dense_labels, dense_decoded):
                        # Convert to strings.
                        decoded_str = base.ndarray_to_text_ch(
                            decoded_arr, words)
                        decoded_str = decoded_str.strip().strip('龚')
                        print(' file {}'.format(counter))
                        print('Original: {}'.format(orig))
                        print('Decoded: {}'.format(decoded_str))
                        counter = counter + 1
                        break

            epoch_duration = time.time() - epoch_start
            log = ('Epoch {}/{}, train_cost: {:.3f}, train_ler: {:.3f}, '
                   'time: {:.2f} sec')
            print(log.format(epoch + 1, epochs, train_cost, train_ler,
                             epoch_duration))
            saver.save(sess, savedir + "BiRNN.cpkt", global_step=epoch + 1)
            print("save cpkt-%s complete." % (epoch + 1))
# The logits are the input of the loss function.
# nn_model comes from the import statement in the load_model function.
_seq_length_64 = tf.to_int64(seq_length)
logits = BiRNN_model(input_tensor, _seq_length_64, n_input, n_context,
                     words_size + 1, keep_dropout)

# CTC loss, averaged.
_ctc_losses = ctc_ops.ctc_loss(targets, logits, seq_length)
avg_loss = tf.reduce_mean(_ctc_losses)

# Optimizer.
learning_rate = 0.001
_adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = _adam.minimize(avg_loss)

with tf.name_scope("decode"):
    decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
        logits, seq_length, merge_repeated=False)

with tf.name_scope("accuracy"):
    _best_path = tf.cast(decoded[0], tf.int32)
    distance = tf.edit_distance(_best_path, targets)
    # Label error rate (reported as "accuracy").
    ler = tf.reduce_mean(distance, name='label_error_rate')

epochs = 100
savedir = "log/yuyinchalltest/"
# Create the saver.
saver = tf.train.Saver(max_to_keep=1)

# Create the session and initialise all variables from scratch.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
def _create_graph(self, num_hidden, batch_size, max_time_steps, num_features,
                  conv_depth, num_classes):
    """Build the conv + LSTM CTC graph and expose its ops on ``self``.

    Args:
        num_hidden: Number of LSTM units.
        batch_size: First dimension of the input and length placeholders.
        max_time_steps: Second dimension of the input placeholder.
        num_features: Third dimension of the input placeholder.
        conv_depth: Depth argument of both ``conv2d`` layers.
        num_classes: Size of the output projection.

    Side effects:
        Creates ``self.graph`` and stores on ``self``: ``inputs``,
        ``targets``, ``seq_len``, ``loss``, ``logitsMaxTest``,
        ``optimizer``, ``predictions``, ``error_rate`` and
        ``merged_summaries``. Registers the scalar summaries ``'loss'`` and
        ``'error_rate'``.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Features, declared as [batch_size, max_time_steps, num_features].
        self.inputs = tf.placeholder(
            tf.float32, [batch_size, max_time_steps, num_features])
        # ctc_loss needs a SparseTensor, hence the sparse placeholder.
        self.targets = tf.sparse_placeholder(tf.int32)
        # 1-D tensor of size [batch_size].
        self.seq_len = tf.placeholder(tf.int32, [batch_size])

        lstm_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)

        # Two 3-wide ReLU convolutions over the input with a channel axis.
        conv_input = pt.wrap(tf.expand_dims(self.inputs, -1))
        conv_first = conv_input.conv2d(
            3, conv_depth, activation_fn=tf.nn.relu)
        conv_second = conv_first.conv2d(
            3, conv_depth, activation_fn=tf.nn.relu)
        rnn_input = tf.reshape(
            conv_second,
            [batch_size, max_time_steps, num_features * conv_depth])

        # The second return value (final state) is not used.
        rnn_output, _ = tf.nn.dynamic_rnn(
            lstm_cell, rnn_input, self.seq_len, dtype=tf.float32)

        input_shape = tf.shape(self.inputs)
        batch_s, max_timesteps = input_shape[0], input_shape[1]

        # Flatten so the same weights are applied at every time step.
        flat_output = tf.reshape(rnn_output, [-1, num_hidden])

        # Truncated-normal weights (stddev 0.1) and zero biases.
        weights = tf.Variable(
            tf.truncated_normal([num_hidden, num_classes], stddev=0.1))
        biases = tf.Variable(tf.constant(0., shape=[num_classes]))

        # Affine projection, back to three axes, then time-major.
        flat_logits = tf.matmul(flat_output, weights) + biases
        batch_major = tf.reshape(flat_logits, [batch_s, -1, num_classes])
        logits = tf.transpose(batch_major, (1, 0, 2))

        ctc_losses = ctc.ctc_loss(logits, self.targets, self.seq_len)
        self.loss = tf.reduce_mean(ctc_losses)

        # Arg-max class per step, sliced to the first sequence's length.
        best_class = tf.argmax(logits, 2)
        first_length = self.seq_len[0]
        self.logitsMaxTest = tf.slice(best_class, [0, 0], [first_length, 1])

        self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)

        beam_paths = ctc.ctc_beam_search_decoder(logits, self.seq_len)
        self.predictions = tf.to_int32(beam_paths[0][0])

        # Total (unnormalised) edit distance divided by the label count.
        total_distance = tf.reduce_sum(
            tf.edit_distance(self.predictions, self.targets, normalize=False))
        label_count = tf.to_float(tf.size(self.targets.values))
        self.error_rate = total_distance / label_count

        tf.scalar_summary('loss', self.loss)
        tf.scalar_summary('error_rate', self.error_rate)
        self.merged_summaries = tf.merge_all_summaries()
def train_model(train_data=None, test_data=None, decode=False,
                file_decode=False):
    """Build the bidirectional LSTM CTC model, then train it or decode audio.

    Args:
        train_data: Sequence of training batches; each batch is indexed as
            ``[0]`` inputs, ``[1]`` a sparse-target triple (indices, values,
            shape) and ``[2]`` sequence lengths. Used when ``decode`` is
            false.
        test_data: Sequence of test batches in the same layout; only
            ``test_data[0]`` is evaluated, once per epoch.
        decode: When false, train for ``num_epochs`` epochs (resuming from a
            checkpoint under ``ENV.output`` if one exists). When true,
            restore the checkpoint under ``P.MODEL_PATH`` and decode wav
            files in an endless loop.
        file_decode: In decode mode, ask for a wav path when true; otherwise
            record ``temp.wav`` with ``sox`` until interrupted.

    Side effects:
        Creates a session, saves a checkpoint after every epoch, logs
        progress, and in decode mode reads from stdin and prints the
        decoded phoneme keys.
    """
    graph = tf.Graph()
    with graph.as_default():
        # Features (e.g. log filter bank or MFCC), declared as
        # [batch_size, max_stepsize, num_features]; the first two may vary.
        inputs = tf.placeholder(tf.float32, [None, None, num_features])

        # The sparse CTC target is fed as three separate components.
        targets_idx = tf.placeholder(tf.int64)
        targets_val = tf.placeholder(tf.int32)
        targets_shape = tf.placeholder(tf.int64)
        targets = tf.SparseTensor(targets_idx, targets_val, targets_shape)

        # 1-D tensor of size [batch_size].
        seq_len = tf.placeholder(tf.int32, [None])

        # Output projection weights and biases.
        weight_classes = tf.Variable(
            tf.truncated_normal([num_hidden, num_classes],
                                mean=0, stddev=0.1, dtype=tf.float32))
        bias_classes = tf.Variable(tf.zeros([num_classes]), dtype=tf.float32)

        # Network: stacked forward and backward LSTM cells.
        forward_cell = tf.nn.rnn_cell.LSTMCell(
            num_hidden, use_peepholes=True, state_is_tuple=True)
        backward_cell = tf.nn.rnn_cell.LSTMCell(
            num_hidden, use_peepholes=True, state_is_tuple=True)
        stack_forward_cell = tf.nn.rnn_cell.MultiRNNCell(
            [forward_cell] * num_layers, state_is_tuple=True)
        stack_backward_cell = tf.nn.rnn_cell.MultiRNNCell(
            [backward_cell] * num_layers, state_is_tuple=True)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            stack_forward_cell,
            stack_backward_cell,
            inputs,
            sequence_length=seq_len,
            time_major=False,  # [batch_size, max_time, num_hidden]
            dtype=tf.float32)

        inputs_shape = tf.shape(inputs)
        # Graph tensor holding the dynamic batch size; it is only used
        # inside the graph (see the training loop for the numeric size).
        batch_size = inputs_shape[0]

        # Both directions share one projection matrix and are summed.
        fw_output = tf.reshape(outputs[0], [-1, num_hidden])
        bw_output = tf.reshape(outputs[1], [-1, num_hidden])
        logits = tf.add(
            tf.add(tf.matmul(fw_output, weight_classes),
                   tf.matmul(bw_output, weight_classes)),
            bias_classes)
        logits = tf.reshape(logits, [batch_size, -1, num_classes])

        loss = tf.reduce_mean(
            ctc_ops.ctc_loss(logits, targets, seq_len, time_major=False))
        optimizer = tf.train.MomentumOptimizer(
            learning_rate, momentum).minimize(loss)

        # Evaluation: the decoder is given time-major logits.
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            tf.transpose(logits, perm=[1, 0, 2]), seq_len)
        label_error_rate = tf.reduce_mean(
            tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    with tf.Session(graph=graph,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)

        if not decode:
            ckpt = tf.train.get_checkpoint_state(ENV.output)
            if ckpt:
                print('load', ckpt.model_checkpoint_path)
                saver.restore(session, ckpt.model_checkpoint_path)

            total_train_data = len(train_data)
            total_test_data = len(test_data)
            num_batch = total_train_data

            for curr_epoch in range(num_epochs):
                start = time.time()
                train_cost = 0
                train_ler = 0
                # NOTE(review): range(num_batch-1) never visits the last
                # batch -- confirm that is intended.
                for i in range(num_batch-1):
                    feed = {
                        inputs: train_data[i][0],
                        targets_idx: train_data[i][1][0],
                        targets_val: train_data[i][1][1],
                        targets_shape: train_data[i][1][2],
                        seq_len: train_data[i][2]
                    }
                    batch_cost, _ = session.run([loss, optimizer], feed)
                    # Fix: weight by the numeric batch size. The original
                    # multiplied by the `batch_size` graph tensor, which
                    # added ops on every step and made the sums tensors.
                    examples_in_batch = len(feed[seq_len])
                    train_cost += batch_cost*examples_in_batch
                    train_ler += session.run(
                        label_error_rate, feed_dict=feed)*examples_in_batch
                    log = "Epoch {}/{}, iter {}, batch_cost {}"
                    logging.info(
                        log.format(curr_epoch+1, num_epochs, i, batch_cost))
                train_cost /= num_batch
                train_ler /= num_batch

                saver.save(session, os.path.join(P.OUTPUT, 'best.ckpt'),
                           global_step=curr_epoch)

                # Fix: the target shape now comes from the test batch too;
                # the original took it from train_data[0].
                feed_test = {
                    inputs: test_data[0][0],
                    targets_idx: test_data[0][1][0],
                    targets_val: test_data[0][1][1],
                    targets_shape: test_data[0][1][2],
                    seq_len: test_data[0][2]
                }
                test_cost, test_ler = session.run(
                    [loss, label_error_rate], feed_dict=feed_test)
                log = "Epoch {}/{}, test_cost {}, test_ler {}"
                logging.info(
                    log.format(curr_epoch+1, num_epochs, test_cost, test_ler))
        else:
            # DECODE
            ckpt = tf.train.get_checkpoint_state(P.MODEL_PATH)
            print('load', ckpt.model_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)

            while True:
                # Prepare the input file.
                if file_decode:
                    wav_file = raw_input('Enter the wav file path:')
                else:
                    wav_file = 'temp.wav'
                    raw_input('Press Enter to start...')
                    try:
                        # Record 16-bit mono 16 kHz audio from the default
                        # device until interrupted with Ctrl-C.
                        sox = subprocess.Popen(
                            ['sox', '-d', '-b', '16', '-c', '1',
                             '-r', '16000', 'temp.wav'])
                        sox.communicate()
                    except KeyboardInterrupt:
                        os.kill(sox.pid, signal.SIGTERM)
                        if sox.poll() is None:
                            time.sleep(2)
                    print('Done recording')

                features = process_wav(wav_file)
                # The single utterance is repeated 16 times to form a batch.
                batch_features = np.array([features for i in range(16)])
                batch_seq_len = np.array(
                    [features.shape[0] for i in range(16)])
                print(batch_features.shape)

                feed = {
                    inputs: batch_features,
                    seq_len: batch_seq_len
                }
                d, oc = session.run([decoded[0], outputs], feed_dict=feed)
                dsp = d.shape  # e.g. [16 86]
                res = []
                # Parenthesised so the line parses on Python 2 and 3 alike.
                print(size(oc))
                # Look up the phoneme key whose id is label + 1.
                for label in d.values[:dsp[1]]:
                    for k, v in phoneme_set_39.items():
                        if v == label + 1:
                            res.append(k)
                print(res)
backwardH1 = tf.contrib.rnn.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True) fbH1, _, _ = tf.contrib.rnn.static_bidirectional_rnn(forwardH1, backwardH1, inputList, dtype=tf.float32, scope='BDLSTM_H1') fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1] outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), reduction_indices=1) + biasesOutH1 for t in fbH1rs] logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1] ####Optimizing logits3d = tf.stack(logits) loss = tf.reduce_mean(ctc.ctc_loss(targetY, logits3d, seqLengths)) optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss) ####Evaluating logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1]) predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0]) errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \ tf.to_float(tf.size(targetY.values)) ####Run session with tf.Session(graph=graph) as session: print('Initializing') tf.global_variables_initializer().run() for epoch in range(nEpochs): print('Epoch', epoch+1, '...') batchErrors = np.zeros(len(batchedData)) batchRandIxs = np.random.permutation(len(batchedData)) #randomize batch order for batch, batchOrigI in enumerate(batchRandIxs): batchInputs, batchTargetSparse, batchSeqLengths = batchedData[batchOrigI] batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse feedDict = {inputX: batchInputs, targetIxs: batchTargetIxs, targetVals: batchTargetVals,
def train(train_sample_files, train_vector_labels, test_sample_files,
          test_vector_labels, lexicon, num_inputs, num_contexts,
          training_steps, learning_rate, batch_size, summaries_dir, train_dir,
          eval_step_interval, model_architecture, model_size_info):
    """Build the CTC graph, train it, and print the edit-distance metric.

    The function builds placeholders, the model (via ``create_model``), the
    CTC loss, an Adam training op, a beam-search decoder and an edit-distance
    metric.  It restores the latest checkpoint found in ``train_dir`` (if any),
    then runs ``training_steps`` passes over the full training batches.  Every
    ``eval_step_interval`` passes it saves a checkpoint and prints the metric
    averaged over the full test batches; after training it evaluates once
    more and prints the first reference/decoded pair of every test batch.

    Args:
        train_sample_files: training samples; only ``len()`` is used here,
            the items themselves are consumed by ``get_next_batches``.
        train_vector_labels: labels passed through to ``get_next_batches``.
        test_sample_files: test samples (same handling as the training ones).
        test_vector_labels: test labels passed to ``get_next_batches``.
        lexicon: symbol table; ``len(lexicon) + 1`` is used as the number of
            output classes and it is forwarded to the text helpers.
        num_inputs: number of features per frame.
        num_contexts: number of context frames on each side; the input depth
            is ``num_inputs + 2 * num_inputs * num_contexts``.
        training_steps: number of passes over the training batches.
        learning_rate: learning rate of the Adam optimizer.
        batch_size: number of samples per batch; a trailing partial batch is
            dropped (integer division).
        summaries_dir: directory the graph is written to for TensorBoard.
        train_dir: checkpoint directory.
        eval_step_interval: evaluate and checkpoint every this many passes.
        model_architecture: forwarded to ``create_model``.
        model_size_info: forwarded to ``create_model``.

    Returns:
        None.  Progress is reported with ``print``.
    """
    # Local import: this block cannot see (or edit) the file's import header.
    import os

    use_gpu = False
    device_name = tf.test.gpu_device_name()
    if not device_name:
        warnings.warn(
            'No GPU found. Please use a GPU to train your neural network.')
    else:
        use_gpu = True
        print('Found GPU at: {}'.format(device_name))

    X = tf.placeholder(
        dtype=tf.float32,
        shape=[None, None, num_inputs + (2 * num_inputs * num_contexts)],
        name='input')
    sequence_len = tf.placeholder(dtype=tf.int32, shape=[None],
                                  name='sequence_len')
    Y = tf.sparse_placeholder(dtype=tf.int32)

    # One extra class on top of the lexicon (presumably the CTC blank).
    num_character = len(lexicon) + 1
    model_settings = prepare_model_settings(20, num_character, use_gpu)
    logits, dropout_prob = create_model(X, sequence_len, model_settings,
                                        model_architecture, model_size_info,
                                        True)

    with tf.name_scope('loss'):
        avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(Y, logits, sequence_len))
        tf.summary.scalar('loss', avg_loss)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(avg_loss)

    with tf.name_scope("decoder"):
        decoder, _ = ctc_ops.ctc_beam_search_decoder(logits, sequence_len,
                                                     merge_repeated=False)
        # Built once here instead of once per test batch, so evaluating does
        # not keep adding conversion ops to the graph.
        dense_decoded = tf.sparse_tensor_to_dense(decoder[0],
                                                  default_value=-1)

    with tf.name_scope("accuracy"):
        # Despite the summary tag this is a mean edit distance (an error
        # rate, lower is better); the tag is kept for existing dashboards.
        evaluation_step = tf.reduce_mean(
            tf.edit_distance(tf.cast(decoder[0], tf.int32), Y))
        tf.summary.scalar('accuracy', evaluation_step)

    if use_gpu:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    else:
        sess = tf.InteractiveSession()

    saver = tf.train.Saver(max_to_keep=1)
    sess.run(tf.global_variables_initializer())

    ckpt = tf.train.latest_checkpoint(train_dir)
    if ckpt is not None:
        saver.restore(sess, ckpt)

    # The merged summary op is created but not written per step; only the
    # graph itself is exported through the writer.
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(summaries_dir, sess.graph)

    num_train_batches = len(train_sample_files) // batch_size
    num_test_batches = len(test_sample_files) // batch_size

    def _evaluate(show_samples):
        """Return the metric averaged over all full test batches."""
        total = 0
        for test_batch in range(num_test_batches):
            sparse_labels, batch_samples, num_steps = get_next_batches(
                batch_size * test_batch, test_sample_files,
                test_vector_labels, num_contexts, batch_size)
            feed = {
                X: batch_samples,
                Y: sparse_labels,
                sequence_len: num_steps,
                dropout_prob: 1.0
            }
            if not show_samples:
                # sess.run (not Tensor.eval) so this also works when `sess`
                # is a plain tf.Session that is not the default session.
                total += sess.run(evaluation_step, feed_dict=feed)
                continue

            dense_decodes, batch_error = sess.run(
                [dense_decoded, evaluation_step], feed_dict=feed)
            total += batch_error
            dense_labels = trans_tuple_to_texts(sparse_labels, lexicon)
            for orig, decode_array in zip(dense_labels, dense_decodes):
                decoded_str = trans_array_to_text(decode_array, lexicon)
                print('语音原始文本: {}'.format(orig))
                print('识别出来的文本: {}'.format(decoded_str))
                # Only the first utterance of each batch is shown.
                break
        return total / num_test_batches

    try:
        for training_step in range(training_steps):
            total_loss = 0
            for train_batch in range(num_train_batches):
                sparse_labels, batch_samples, num_steps = get_next_batches(
                    batch_size * train_batch, train_sample_files,
                    train_vector_labels, num_contexts, batch_size)
                loss, _ = sess.run(
                    [avg_loss, train_step],
                    feed_dict={
                        X: batch_samples,
                        Y: sparse_labels,
                        sequence_len: num_steps,
                        dropout_prob: 0.95
                    })
                total_loss += loss

            print('training step: %d/%d, loss: %g' %
                  (training_step + 1, training_steps,
                   total_loss / num_train_batches))

            if (training_step + 1) % eval_step_interval == 0:
                # os.path.join keeps the checkpoint inside train_dir even
                # when train_dir has no trailing separator, so that
                # latest_checkpoint(train_dir) can find it on the next run.
                saver.save(sess, os.path.join(train_dir, "speech.ckpt"),
                           global_step=training_step)
                print('WER: %.2f, training step: %d/%d' %
                      (_evaluate(False), training_step + 1, training_steps))

        print('Final WER: %.2f, train steps: %d' %
              (_evaluate(True), training_steps))
    finally:
        # Release the event file and the (GPU) memory held by the session.
        train_writer.close()
        sess.close()
def get_eval(logits3d, target_y, seq_lens):
    """Build the evaluation ops for a CTC model.

    Args:
        logits3d: rank-3 logits tensor; it is handed unchanged to
            ``ctc.ctc_beam_search_decoder`` (assumed time-major, i.e.
            ``(max_time, batch, classes)`` - TODO confirm with the caller).
        target_y: sparse tensor of reference labels.
        seq_lens: per-example sequence lengths.

    Returns:
        A pair ``(error, logits_test)``: ``error`` is the summed,
        un-normalized edit distance between the top beam-search path and
        ``target_y`` divided by the number of stored target values, and
        ``logits_test`` is a ``[seq_lens[0], 1]`` slice of the per-step
        arg-max class indices.
    """
    best_classes = tf.argmax(logits3d, 2)
    logits_test = tf.slice(best_classes, [0, 0], [seq_lens[0], 1])

    decoded_paths = ctc.ctc_beam_search_decoder(logits3d, seq_lens)[0]
    predictions = tf.to_int32(decoded_paths[0])

    distances = tf.edit_distance(predictions, target_y, normalize=False)
    total_edits = tf.reduce_sum(distances)
    label_count = tf.to_float(tf.size(target_y.values))

    error = total_edits / label_count
    return error, logits_test
def runCTC(batch):
    """Run the bidirectional-LSTM CTC network forward on one input batch.

    The graph is rebuilt on every call.  Parameters are restored from the
    latest checkpoint under ``/users/TeamASR/models`` when one exists,
    otherwise freshly initialised variables are used.

    Args:
        batch: array with a ``.shape`` of ``(776, batch_size, 39)``, i.e.
            ``(maxTimeSteps, batchSize, nFeatures)`` as declared by the
            input placeholder.

    Returns:
        A one-element list holding the evaluated logits array of shape
        ``(776, batch_size, 30)``.

    Raises:
        ValueError: if ``batch`` does not have the shape the graph expects.
    """
    ####Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    batchSize = batch.shape[1]

    ####Network Parameters
    nFeatures = 39  #12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    # Width of the output layer (CTC blank included).  NOTE(review): the
    # inherited comment said "39 phonemes, plus the blank", which does not
    # match 30 - confirm against the label set used for the checkpoint.
    nClasses = 30
    maxTimeSteps = 776

    # Fail before building the graph rather than at feed time.
    if tuple(batch.shape) != (maxTimeSteps, batchSize, nFeatures):
        raise ValueError(
            "batch must have shape (%d, batch_size, %d), got %r" %
            (maxTimeSteps, nFeatures, tuple(batch.shape)))

    ####Load data
    print('Loading data')
    # NOTE(review): pickle.load can execute arbitrary code; only use a
    # trusted data file here.
    with open('TIMIT_data_prepared_for_CTC.pkl', 'rb') as f:
        data = pickle.load(f)
    charmap = data['chars']
    print(charmap)
    charmap.append('_')

    ####Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():
        # IMPORTANT: do not add, remove or reorder tf.Variable creations
        # below.  The variables are unnamed, so their checkpoint names
        # ("Variable", "Variable_1", ...) depend on creation order.

        ####Graph input
        inputX = tf.placeholder(tf.float32,
                                shape=(maxTimeSteps, batchSize, nFeatures))
        # Reshape to 2-D (nTimeSteps*batchSize, nFeatures), then split into
        # a list of maxTimeSteps tensors as required by bidirectional_rnn.
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        inputList = tf.split(0, maxTimeSteps, inputXrs)

        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=[batchSize])

        ####Weights & biases
        weightsOutH1 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        # The H2 variables are not used by the network below; they are kept
        # only so the automatic variable names stay unchanged.
        weightsOutH2 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(
            tf.truncated_normal([nHidden, nClasses],
                                stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        ####Network
        forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True,
                                      state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True,
                                       state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList,
                                       dtype=tf.float32, scope='BDLSTM_H1')
        # Separate the two directions, then combine them with a learned
        # per-unit weighting.
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [
            tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) +
            biasesOutH1 for t in fbH1rs
        ]
        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        ####Optimizing
        # Training/evaluation ops are unused for inference but still built:
        # the optimizer creates slot variables that tf.train.Saver() picks up.
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate,
                                               momentum).minimize(loss)

        ####Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                 [seqLengths[0], 1])
        predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(
            tf.edit_distance(predictions, targetY, normalize=False)) / \
            tf.to_float(tf.size(targetY.values))

    ####Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()

        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())

        # Every sequence is treated as full length; int32 matches the
        # placeholder dtype instead of relying on a float-to-int cast.
        feedDict = {
            inputX: batch,
            seqLengths: np.full([batchSize], maxTimeSteps, dtype=np.int32)
        }
        logit = session.run([logits3d], feed_dict=feedDict)
        return logit
# Build the output projection, CTC loss, optimizer and evaluation ops on top
# of the bidirectional-LSTM outputs in `fbH1rs`, then open the session and
# create the summary writer and saver.
print("building outH1 ")
outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), axis=1) + biasesOutH1
         for t in fbH1rs]

print("building logits ")
logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
print("len(outH1) %d"% len(outH1))

####Optimizing
print("building loss")
logits3d = tf.stack(logits)
loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
# Gives the mean loss a stable tensor name so it can be looked up by name.
out = tf.identity(loss, 'ctc_loss_mean')
optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

####Evaluating
print("building Evaluation")
logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
predictions = tf.to_int32(
    ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
reduced_sum = tf.reduce_sum(
    tf.edit_distance(predictions, targetY, normalize=False))
errorRate = reduced_sum / tf.to_float(tf.size(targetY.values))

check_op = tf.add_check_numerics_ops()
print("done building graph")

####Run session
with tf.Session(graph=graph) as session:
    # The former try/except wrappers retried the exact same call in the
    # handler, so they could only re-raise the same error; call directly.
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter("/tmp/basic_new", session.graph)
    try:
        saver = tf.train.Saver()  # defaults to saving all variables
    except Exception:
        print("tf.train.Saver() broken in tensorflow 0.12")
        # Keep the name bound so later code can test for a missing saver
        # instead of failing with a NameError.
        saver = None
# Reshaping back to the original shape logits = tf.reshape(logits, [batch_s, -1, num_classes]) # Swap dimensions to time major for CTC loss. logits = tf.transpose(logits, (1, 0, 2)) loss = ctc.ctc_loss(targets, logits, seq_len) cost = tf.reduce_mean(loss) # Record the loss tf.contrib.deprecated.scalar_summary('loss', cost) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True).minimize(cost) decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits, sequence_length=seq_len) # Label error rate using the edit distance between output and target ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) # Record the label error rate tf.contrib.deprecated.scalar_summary('label error rate', ler) saver = tf.train.Saver() merged = tf.contrib.deprecated.merge_all_summaries() train_writer = tf.summary.FileWriter('./summaries/train', graph) test_writer = tf.summary.FileWriter('./summaries/test', graph) def test_decoding(input_feed_dict, input_original): """
def runCTC(batch):
    """Run the bidirectional-LSTM CTC network forward on one input batch.

    The graph is rebuilt on every call.  Parameters are restored from the
    latest checkpoint under ``/users/TeamASR/models`` when one exists,
    otherwise freshly initialised variables are used.

    Args:
        batch: array with a ``.shape`` of ``(776, batch_size, 39)``, i.e.
            ``(maxTimeSteps, batchSize, nFeatures)`` as declared by the
            input placeholder.

    Returns:
        A one-element list holding the evaluated logits array of shape
        ``(776, batch_size, 30)``.

    Raises:
        ValueError: if ``batch`` does not have the shape the graph expects.
    """
    ####Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    batchSize = batch.shape[1]

    ####Network Parameters
    nFeatures = 39  #12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    # Width of the output layer (CTC blank included).  NOTE(review): the
    # inherited comment said "39 phonemes, plus the blank", which does not
    # match 30 - confirm against the label set used for the checkpoint.
    nClasses = 30
    maxTimeSteps = 776

    # Fail before building the graph rather than at feed time.
    if tuple(batch.shape) != (maxTimeSteps, batchSize, nFeatures):
        raise ValueError(
            "batch must have shape (%d, batch_size, %d), got %r" %
            (maxTimeSteps, nFeatures, tuple(batch.shape)))

    ####Load data
    print('Loading data')
    # NOTE(review): pickle.load can execute arbitrary code; only use a
    # trusted data file here.
    with open('TIMIT_data_prepared_for_CTC.pkl', 'rb') as f:
        data = pickle.load(f)
    charmap = data['chars']
    print(charmap)
    charmap.append('_')

    ####Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():
        # IMPORTANT: do not add, remove or reorder tf.Variable creations
        # below.  The variables are unnamed, so their checkpoint names
        # ("Variable", "Variable_1", ...) depend on creation order.

        ####Graph input
        inputX = tf.placeholder(tf.float32,
                                shape=(maxTimeSteps, batchSize, nFeatures))
        # Reshape to 2-D (nTimeSteps*batchSize, nFeatures), then split into
        # a list of maxTimeSteps tensors as required by bidirectional_rnn.
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        inputList = tf.split(0, maxTimeSteps, inputXrs)

        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=[batchSize])

        ####Weights & biases
        weightsOutH1 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        # The H2 variables are not used by the network below; they are kept
        # only so the automatic variable names stay unchanged.
        weightsOutH2 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(
            tf.truncated_normal([nHidden, nClasses],
                                stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        ####Network
        forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True,
                                      state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True,
                                       state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList,
                                       dtype=tf.float32, scope='BDLSTM_H1')
        # Separate the two directions, then combine them with a learned
        # per-unit weighting.
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [
            tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) +
            biasesOutH1 for t in fbH1rs
        ]
        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        ####Optimizing
        # Training/evaluation ops are unused for inference but still built:
        # the optimizer creates slot variables that tf.train.Saver() picks up.
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate,
                                               momentum).minimize(loss)

        ####Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                 [seqLengths[0], 1])
        predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(
            tf.edit_distance(predictions, targetY, normalize=False)) / \
            tf.to_float(tf.size(targetY.values))

    ####Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()

        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())

        # Every sequence is treated as full length; int32 matches the
        # placeholder dtype instead of relying on a float-to-int cast.
        feedDict = {
            inputX: batch,
            seqLengths: np.full([batchSize], maxTimeSteps, dtype=np.int32)
        }
        logit = session.run([logits3d], feed_dict=feedDict)
        return logit
# Output layer: project the LSTM outputs to class scores, then build the CTC
# loss, the optimizer and the evaluation ops.
b2 = bias_variable([output_size])

# Flatten to 2-D for the matmul, then restore the (batch, time, classes)
# layout.  (The original n_batch/n_time_steps/n_features bookkeeping is
# unnecessary in this version.)
l_reshape3 = tf.reshape(lstm_output_tr, [-1, 2 * hidden_size])
h_2 = tf.matmul(l_reshape3, W2) + b2
l_reshape4 = tf.reshape(h_2, [-1, output_size])
# NOTE(review): the TensorFlow docs say ctc_loss applies softmax itself and
# expects unnormalized scores; feeding softmax output is kept as found -
# confirm whether it is intended before changing training behavior.
l_soft = tf.nn.softmax(l_reshape4)
l_soft_reshaped = tf.reshape(l_soft, [-1, n_time_steps, output_size])
# Time-major view (time, batch, classes) for the CTC ops.
l_soft_tr = tf.transpose(l_soft_reshaped, [1, 0, 2])

loss = tf.reduce_mean(tf.nn.ctc_loss(l_soft_tr, targets, seqLengths))
optimizer = tf.train.AdamOptimizer(learningRate).minimize(loss)

# The decoder takes time-major input and one sequence length per batch
# entry, the same layout the loss above uses.  The batch-major tensor was
# passed here before, which mismatches seqLengths.
logitsMaxTest = tf.slice(tf.argmax(l_soft_tr, 2), [0, 0], [seqLengths[0], 1])
predictions = tf.to_int32(
    ctc.ctc_beam_search_decoder(l_soft_tr, seqLengths)[0][0])
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targets, normalize=False)) / \
    tf.to_float(tf.size(targets.values))


def getminibatch(x, y, bs, seq_len=776):
    """Draw a random minibatch without replacement.

    Args:
        x: indexable collection of input examples.
        y: indexable collection of target sequences, aligned with ``x``.
        bs: requested batch size; if it exceeds ``len(x)`` every example is
            returned once.
        seq_len: sequence length reported for every selected example
            (defaults to 776, the value previously hard-coded).

    Returns:
        dict with keys ``'x'`` (array of the selected inputs), ``'ind'``,
        ``'val'``, ``'shape'`` (the three values returned by
        ``target_list_to_sparse_tensor`` for the selected targets) and
        ``'seqlen'`` (float array with one ``seq_len`` entry per selected
        example).
    """
    toselect = np.random.permutation(len(x))[:bs]

    batch = {}
    batch['x'] = np.array([x[i] for i in toselect])
    batch['ind'], batch['val'], batch['shape'] = target_list_to_sparse_tensor(
        [y[i] for i in toselect])
    # Sized by what was actually selected so it always lines up with 'x'.
    batch['seqlen'] = np.full([len(toselect)], seq_len, dtype=np.float64)
    return batch
biasesOutH1 for t in fbH1rs ] logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1] ####Optimizing logits3d = tf.pack(logits) loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths)) optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss) ####Evaluating logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1]) predictions = tf.to_int32( ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0]) errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \ tf.to_float(tf.size(targetY.values)) ####Run session with tf.Session(graph=graph) as session: print('Initializing') tf.initialize_all_variables().run() for epoch in range(nEpochs): print('Epoch', epoch + 1, '...') batchErrors = np.zeros(len(batchedData)) batchRandIxs = np.random.permutation( len(batchedData)) #randomize batch order for batch, batchOrigI in enumerate(batchRandIxs): batchInputs, batchTargetSparse, batchSeqLengths = batchedData[ batchOrigI]
def ctc_loss(self, outputs, targets, seq_len, num_classes,
             initial_learning_rate, keep_prob=0.8, scopeN="l1-ctc_loss"):
    """Add the CTC output layer, loss, optimizer and decoder to the graph.

    @param outputs: 2-D tensor that is matrix-multiplied with a
        ``[self.hidden * 2, num_classes]`` weight matrix; the product is
        reshaped to ``[self.width, self.batch_size, num_classes]``
    @param targets: sparse tensor with the reference labels
    @param seq_len: the length of the input sequences [batch]
    @param num_classes: the number of classes
    @param initial_learning_rate: learning rate
    @param keep_prob: keep probability of the dropout applied to the
        logits; pass None to skip dropout
    @param scopeN: suffix of the variable scope name
    @returns: tuple (optimizer op, mean CTC cost, label error rate,
        decoded output of the batch)
    @raises ValueError: if ``self.optimizer`` is not "ADAM"/"RMSP" or
        ``self.ctc_decoder`` is not 'greedy'/'beam_search'
    """
    # Validate the configuration first, so an unsupported setting does not
    # leave half-built variables and ops behind in the graph.  ValueError
    # subclasses Exception, so existing `except Exception` callers still
    # catch it.
    if self.optimizer not in ("ADAM", "RMSP"):
        raise ValueError("optimizer not supported: {}".format(
            self.optimizer))
    if self.ctc_decoder not in ('greedy', 'beam_search'):
        raise ValueError("ctc decoder not supported: {}".format(
            self.ctc_decoder))

    with tf.name_scope('Train'):
        with tf.variable_scope("ctc_loss-" + scopeN):
            W = tf.Variable(
                tf.truncated_normal([self.hidden * 2, num_classes],
                                    stddev=0.1))
            # Zero initialization
            b = tf.Variable(tf.constant(0., shape=[num_classes]))

        tf.summary.histogram('histogram-b-ctc', b)
        tf.summary.histogram('histogram-w-ctc', W)

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        if keep_prob is not None:
            logits = tf.nn.dropout(logits, keep_prob)

        # Reshaping back to the original shape; the rows are taken to be
        # ordered time-major already, so no transpose is applied.
        logits = tf.reshape(logits,
                            [self.width, self.batch_size, num_classes])

        with tf.name_scope('CTC-loss'):
            loss = ctc_ops.ctc_loss(logits, targets, seq_len)
            cost = tf.reduce_mean(loss)

        with tf.name_scope('Optimizer'):
            if self.optimizer == "ADAM":
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=initial_learning_rate,
                    name="AdamOptimizer").minimize(cost)
            else:  # "RMSP" (validated above)
                optimizer = tf.train.RMSPropOptimizer(
                    learning_rate=initial_learning_rate,
                    decay=self.decay,
                    momentum=self.momentum).minimize(cost)

        with tf.name_scope('Prediction'):
            if self.ctc_decoder == 'greedy':
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(
                    logits, seq_len)
            else:  # 'beam_search' (validated above)
                decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
                    logits, seq_len)

            # Inaccuracy: label error rate
            ler = tf.reduce_mean(
                tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    return optimizer, cost, ler, decoded
logits = tf.transpose(logits, [1, 0, 2]) return logits datasets = read_data_sets("./data/ldc93s1", train_batch_size, dev_batch_size, test_batch_size, n_input, n_context) audio, audio_lengths, labels = datasets.train.next_batch() logits = model(audio, audio_lengths, dropout_rate) loss = ctc_ops.ctc_loss(logits, labels, audio_lengths) avg_loss = tf.reduce_mean(loss) decoded, _ = ctc_ops.ctc_beam_search_decoder(logits, audio_lengths, merge_repeated=False) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon) optimize_op = optimizer.minimize(avg_loss) session = tf.Session(config=session_config) with session.as_default(): tf.initialize_all_variables().run() tf.train.start_queue_runners() datasets.start_queue_threads(session) for epoch in range(training_iters):
""" fw_output = tf.reshape(outputs[0], [-1, num_hidden]) bw_output = tf.reshape(outputs[1], [-1, num_hidden]) logits = tf.add( tf.add(tf.matmul(fw_output, weight_classes), tf.matmul(bw_output, weight_classes)), bias_classes) logits = tf.reshape(logits, [batch_size, -1, num_classes]) loss = tf.reduce_mean( ctc_ops.ctc_loss(logits, targets, seq_len, time_major=False)) optimizer = tf.train.MomentumOptimizer(learning_rate, momentum).minimize(loss) # Evaluating # decoded, log_prob = ctc_ops.ctc_greedy_decoder(tf.transpose(logits, perm=[1, 0, 2]), seq_len) decoded, log_prob = ctc_ops.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), seq_len) label_error_rate = tf.reduce_mean( tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as session: # 加载模型 ckpt = tf.train.get_checkpoint_state(model_folder) print('load', ckpt.model_checkpoint_path) saver = tf.train.Saver() saver.restore(session, ckpt.model_checkpoint_path) # 准备输入文件 filenames = glob(P.TEST_DATA)