def tower_loss(scope, feats, labels, seq_lens):
    """Calculate the total loss on a single tower running the deepSpeech model.

    This function builds the graph for computing the loss per tower (GPU).

    Args:
        scope: unique prefix string identifying the deepSpeech tower,
            e.g. 'tower_0'.
        feats: tensor of shape BxFxT holding the audio features
            (MFCCs or spectrogram).
        labels: sparse tensor holding the labels of each utterance.
        seq_lens: tensor of shape [batch_size] holding the sequence length
            per input utterance.

    Returns:
        Tensor containing the total loss for a batch of data.
    """
    # Build the inference graph.
    logits = deepSpeech.inference(feats, seq_lens, ARGS)

    # Build the portion of the graph calculating the losses. Note that we
    # assemble the total_loss using a custom function below.
    strided_seq_lens = tf.div(seq_lens, ARGS.temporal_stride)
    _ = deepSpeech.loss(logits, labels, strided_seq_lens)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses.
    for loss in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a
        # multi-GPU training session. This helps the clarity of
        # presentation on TensorBoard.
        loss_name = re.sub('%s_[0-9]*/' % helper_routines.TOWER_NAME,
                           '', loss.op.name)
        # Tag the raw loss with '(raw)' and give the moving-average
        # version of the loss the original loss name.
        tf.summary.scalar(loss_name + '(raw)', loss)
        tf.summary.scalar(loss_name, loss_averages.average(loss))

    # Without this control dependency, loss_averages_op would never run.
    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)

    return total_loss
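
# Illustrative sketch (assumption, not part of the original file): one way
# tower_loss() could be wired into a multi-GPU tower loop. ARGS.num_gpus and
# the average_gradients() helper are hypothetical names and may differ from
# the real training script; the pattern itself follows the usual TF1
# multi-tower recipe.
def build_multi_tower_grads(optimizer, feats, labels, seq_lens):
    """Sketch: build one tower per GPU and collect per-tower gradients."""
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
        for gpu in range(ARGS.num_gpus):  # ARGS.num_gpus assumed to exist
            with tf.device('/gpu:%d' % gpu):
                with tf.name_scope(
                        '%s_%d' % (helper_routines.TOWER_NAME, gpu)) as scope:
                    loss = tower_loss(scope, feats, labels, seq_lens)
                    # Reuse the model variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    tower_grads.append(optimizer.compute_gradients(loss))
    # average_gradients() is a hypothetical helper that averages the
    # per-tower (gradient, variable) lists into a single list.
    return average_gradients(tower_grads)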
def train():
    """Train deepSpeech for a number of steps.

    This function builds the set of ops required to construct the model
    and optimize its weights.
    """
    # Learning rate set up.
    learning_rate, global_step = set_learning_rate()

    # Create an optimizer that performs gradient descent.
    # optimizer = tf.train.AdamOptimizer(learning_rate)

    # Forward pass to compute the loss.
    freq_size = 161
    inputs = tf.placeholder(tf.float32, [ARGS.batch_size, freq_size, None])
    # CTC expects the labels as a sparse tensor of int32 label indices.
    targets = tf.sparse_placeholder(tf.int32)
    max_seqlen = tf.placeholder(tf.int32, [ARGS.batch_size])
    logits = deepSpeech.inference(inputs, max_seqlen, ARGS)

    # CTC loss.
    loss = deepSpeech.loss(logits, targets, max_seqlen)

    # Backward pass: compute, clip and apply the gradients.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # optimizer.minimize(loss)
    grads_and_vars = optimizer.compute_gradients(loss)
    clipped_grads_and_vars = [
        (tf.clip_by_value(grad, clip_value_min=-400, clip_value_max=400), var)
        for grad, var in grads_and_vars]
    apply_gradient_op = optimizer.apply_gradients(clipped_grads_and_vars,
                                                  global_step=global_step)
    train_op = apply_gradient_op

    g = tf.get_default_graph()
    with g.as_default(), tf.device('/device:GPU:0'):
        # Start running operations on the graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the
        # ops do not have GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=ARGS.log_device_placement))

        # No checkpoint restore: initialize all variables from scratch.
        print("Not restoring from a checkpoint; initializing all variables.")
        sess.run(tf.global_variables_initializer())

        # Start the queue runners.
        tf.train.start_queue_runners(sess)
        g.finalize()

        # Run the training loop.
        # run_train_loop(sess, (loss_op, train_op, summary_op), saver)
        run_train_loop(sess, (logits, loss, train_op))
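
# Illustrative sketch (assumption, not from the original file): how a single
# batch could be fed into the placeholders built by train(). The numpy arrays
# feats_batch/label_batch/len_batch are hypothetical inputs, with label_batch
# an (indices, values, dense_shape) tuple for the sparse CTC labels;
# run_train_loop() presumably does something similar internally.
def run_one_step(sess, loss, train_op, inputs, targets, max_seqlen,
                 feats_batch, label_batch, len_batch):
    """Sketch: feed one batch and run a single optimization step."""
    # feats_batch: float array of shape [batch_size, freq_size, max_time].
    # label_batch: (indices, values, dense_shape) tuple fed to the sparse
    #              targets placeholder.
    # len_batch:   int array of shape [batch_size] with per-utterance lengths.
    feed_dict = {inputs: feats_batch,
                 targets: label_batch,
                 max_seqlen: len_batch}
    _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
    return loss_value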
def tower_loss(sess, feats, labels, seq_lens):
    """Calculate the total loss on a single tower running the deepSpeech model.

    This function builds the graph for computing the loss per tower (GPU).

    Args:
        sess: TensorFlow session, passed through to deepSpeech.inference.
        feats: tensor of shape BxFxT holding the audio features
            (MFCCs or spectrogram).
        labels: sparse tensor holding the labels of each utterance.
        seq_lens: tensor of shape [batch_size] holding the sequence length
            per input utterance.

    Returns:
        Tensor containing the total loss for a batch of data.
    """
    # Build the inference graph.
    logits = deepSpeech.inference(sess, feats, seq_lens, ARGS)

    # Build the portion of the graph calculating the losses.
    total_loss = deepSpeech.loss(logits, labels, seq_lens)

    # Compute the moving average of all individual losses and the total loss.
    # loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    # loss_averages_op = loss_averages.apply([total_loss])

    # Attach a scalar summary to the total loss and to its averaged version.
    # loss_name = total_loss.op.name
    # Tag the raw loss with '(raw)' and give the moving-average
    # version of the loss the original loss name.
    # tf.summary.scalar(loss_name + '(raw)', total_loss)

    # Without this control dependency, loss_averages_op would never run.
    # with tf.control_dependencies([loss_averages_op]):
    #     total_loss = tf.identity(total_loss)

    return total_loss