def simple_toy_example_2():
    """Run warp-ctc on a tiny hand-written batch and print the costs.

    The batch holds 3 sequences of 2 time steps over a 5-symbol alphabet,
    with label 0 used as the blank.
    """
    blank_label = 0

    # Per-step class probabilities, written as (batches, time-steps, features).
    probs = np.array([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.1, 0.1, 0.1, 0.6, 0.1], [0.1, 0.1, 0.1, 0.1, 0.6]],
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.1, 0.6, 0.1]],
    ])
    # (batches, time-steps, features) -> (time-steps, batches, features).
    probs = np.transpose(probs, (1, 0, 2))
    # warp-ctc applies softmax to its input, so feed log-probabilities:
    # softmax(log(p)) == p when each row of p sums to 1. Passing `probs`
    # directly would score softmax(probs) instead of the rows written above.
    activations = np.log(probs)
    prob_lens = np.array([2, 2, 2])

    # Labels must be a single flat vector; a 2-D array such as
    # [[1, 2], [3, 4], [1, 3]] fails with "flat_labels is not a vector".
    labels = np.array([1, 2, 3, 4, 1, 3])
    label_lens = np.array([2, 2, 2])

    ctc_costs = warpctc_tensorflow.ctc(activations, labels, label_lens,
                                       prob_lens, blank_label=blank_label)
    with tf.Session() as sess:
        costs = sess.run(ctc_costs)
        print('CTC costs =', costs)
def simple_toy_example_1():
    """Evaluate warp-ctc on a fixed 5-step, 2-sequence, 6-symbol example.

    The blank symbol is index 5. The printed costs are expected to be
    [3.3421143650988143, 5.42262].
    """
    blank_label = 5

    # Indexed as [time step][batch entry][symbol].
    step_probs = np.array([
        [[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553],
         [0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508]],
        [[0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436],
         [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549]],
        [[0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688],
         [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456]],
        [[0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533],
         [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345]],
        [[0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107],
         [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]],
    ])
    # The values are passed in log space.
    log_activations = np.log(step_probs)
    activation_lens = np.array([5, 5])

    # Two label sequences stored back to back: the first has 5 labels, the
    # second has 4.
    flat_labels = np.array([
        0, 1, 2, 1, 0,
        0, 1, 1, 0,
    ])
    label_lens = np.array([5, 4])

    ctc_costs = warpctc_tensorflow.ctc(
        activations=log_activations,
        flat_labels=flat_labels,
        label_lengths=label_lens,
        input_lengths=activation_lens,
        blank_label=blank_label)
    with tf.Session() as sess:
        print('CTC costs =', sess.run(ctc_costs))
def __init__(self, input_dim=128, output_dim=104, learning_rate=0.001):
    """Build a two-layer bidirectional GRU model trained with warp-ctc.

    Creates the Keras model, the warp-ctc loss, an SGD update step and the
    backend functions `train_step` and `predict_step`.

    Args:
        input_dim: Size of the last axis of the input.
        output_dim: Number of units in the final time-distributed Dense layer.
        learning_rate: Learning rate handed to the SGD optimizer.
    """
    super(RNN, self).__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim

    # Variable-length input: (batch, time, input_dim).
    self.inp = Input(shape=(None, self.input_dim), name="Input")
    self.batch_norm = keras.layers.normalization.BatchNormalization()(
        self.inp)
    # First recurrent layer sums both directions; the second concatenates them.
    self.gru_1 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='he_normal', name='gru1'), merge_mode="sum")(self.batch_norm)
    self.gru_2 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='he_normal', name='gru2'), merge_mode="concat")(self.gru_1)
    # Linear (un-normalised) per-step scores; softmax is only applied for decoding.
    self.y_pred = TimeDistributed(
        Dense(self.output_dim, kernel_initializer='he_normal', name='dense2', activation='linear'))(self.gru_2)
    self.model = Model(inputs=self.inp, outputs=self.y_pred)
    self.model.summary()

    # Placeholders fed at training time; all three are 1-D int32.
    self.y_true = K.placeholder(name='y_true', ndim=1, dtype='int32')
    self.input_length = K.placeholder(name='input_length', ndim=1, dtype='int32')
    self.label_length = K.placeholder(name='label_length', ndim=1, dtype='int32')

    # y_pred is transposed to put the time axis first before the warp-ctc call.
    # NOTE(review): no blank_label is passed, so warp-ctc's default is used;
    # confirm it matches the blank index that K.ctc_decode assumes below.
    self.loss_out = K.mean(
        warpctc_tensorflow.ctc(tf.transpose(self.y_pred, perm=[1, 0, 2]), self.y_true, self.label_length, self.input_length))

    self.optimizer = keras.optimizers.SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=200)
    self.update = self.optimizer.get_updates(self.model.trainable_weights, [], loss=self.loss_out)

    # Decoding with the third positional argument set to True; [0][0] picks
    # the first decoded tensor from the result.
    self.network_output = K.ctc_decode(
        Activation('softmax')(self.y_pred), self.input_length, True)[0][0]

    # Runs one optimisation step and returns [loss, raw predictions].
    self.train_step = K.function([self.inp, self.y_true, self.input_length, self.label_length, K.learning_phase()], \
        [self.loss_out, self.y_pred], updates = self.update)
    # Per-step argmax over the class axis.
    self.test = K.argmax(self.y_pred, axis=2)
    self.predict_step = K.function(
        [self.inp, self.input_length, K.learning_phase()],
        [self.network_output])
def cost(self) -> tf.Tensor:
    """Return the warp-ctc loss summed over every element of its output."""
    # The blank symbol takes the index immediately after the vocabulary.
    blank_index = len(self.vocabulary)
    ctc_costs = warpctc.ctc(
        activations=self.logits,
        flat_labels=self.flat_labels,
        label_lengths=self.label_lengths,
        input_lengths=self.encoder.lengths,
        blank_label=blank_index)
    return tf.reduce_sum(ctc_costs)
def loss(logits, seq_length, labels, label_length):
    """Compute the network's mean CTC loss.

    Uses `tf.nn.ctc_loss` unless `FLAGS.use_warp_ctc` is set, in which case
    the warp-ctc implementation is used instead.

    Args:
        logits (tf.Tensor): 3D float Tensor of logits. The TensorFlow branch
            passes `time_major=True`, i.e. `[max_time, batch_size, num_classes]`.
        seq_length (tf.Tensor): 1D int32 vector, size `[batch_size]`.
            The sequence lengths.
        labels (tf.SparseTensor or tf.Tensor): Without warp-ctc, an int32
            SparseTensor where `labels.indices[i, :] == [b, t]` means
            `labels.values[i]` stores the id for (batch b, time t), with
            values in `[0, num_labels)`. With warp-ctc, an int32 dense Tensor
            version of the same labels, in which 0 entries are treated as
            padding.
        label_length (tf.Tensor): Length of each label within the batch.

    Returns:
        tf.Tensor: The result of `tf.reduce_mean` over the per-sequence losses.
    """
    if not FLAGS.use_warp_ctc:
        # https://www.tensorflow.org/api_docs/python/tf/nn/ctc_loss
        per_sequence = tf.nn.ctc_loss(labels=labels,
                                      inputs=logits,
                                      sequence_length=seq_length,
                                      preprocess_collapse_repeated=False,
                                      ctc_merge_repeated=True,
                                      time_major=True)
        return tf.reduce_mean(per_sequence)

    # warp-ctc wants all labels concatenated into one 1D vector.
    all_labels = tf.reshape(labels, [-1])

    # Strip the padding: partition 1 collects the zero entries and is
    # discarded; partition 0 keeps everything else.
    is_padding = tf.cast(tf.equal(all_labels, 0), tf.int32)
    kept_labels, _ = tf.dynamic_partition(all_labels, is_padding, 2)

    # The label lengths must be a 1D vector as well.
    lengths_1d = tf.reshape(label_length, [-1])

    # https://github.com/baidu-research/warp-ctc
    per_sequence = warp_ctc.ctc(activations=logits,
                                flat_labels=kept_labels,
                                label_lengths=lengths_1d,
                                input_lengths=seq_length,
                                blank_label=28)
    return tf.reduce_mean(per_sequence)
def createCtcCriterion(self):
    """Store the warp-ctc loss in `self.loss` and its mean in `self.cost`.

    Baidu's warp-ctc is used in place of the built-in `tf.nn.ctc_loss`;
    index 36 is passed as the blank label.
    """
    ctc_costs = warpctc_tensorflow.ctc(
        activations=self.result,
        flat_labels=self.lossTarget,
        label_lengths=self.targetSeqLengths,
        input_lengths=self.inputSeqLengths,
        blank_label=36)
    self.loss = ctc_costs
    self.cost = tf.reduce_mean(ctc_costs)
def _setup_loss(self, logits):
    """Return the mean warp-ctc loss for the current batch.

    NOTE(review): the `logits` argument is never read; the loss is built
    from `self.logits`. Confirm callers always pass that same tensor.
    """
    with tf.name_scope("loss"):
        # Input lengths are divided by the shrink factor before being
        # handed to warp-ctc.
        shrunk_lengths = tf.div(self.X_batch_len, self.shrink_factor)
        ctc_costs = warpctc_tensorflow.ctc(
            activations=self.logits,
            flat_labels=self.Y_batch.values,
            label_lengths=self.Y_batch_len,
            input_lengths=shrunk_lengths,
            blank_label=8)
        return tf.reduce_mean(ctc_costs)
def _run_ctc(self, activations, input_lengths, flat_labels, label_lengths,
             expected_costs, expected_gradients, use_gpu=False,
             expected_error=None):
    """Run the ctc op and compare its costs and gradients to expectations.

    Args:
        activations: Array fed to the op; must have the same shape as
            `expected_gradients`.
        input_lengths: Input lengths fed to the op.
        flat_labels: Flat label vector fed to the op.
        label_lengths: Label lengths fed to the op.
        expected_costs: Costs the op should produce.
        expected_gradients: Gradient of the costs w.r.t. `activations`.
        use_gpu: Whether the session is forced onto the GPU.
        expected_error: If given, running the op must raise an op error
            matching this value instead of producing results.
    """
    # assertEqual, not the deprecated alias assertEquals (removed in Python 3.12).
    self.assertEqual(activations.shape, expected_gradients.shape)

    activations_t = tf.constant(activations)
    input_lengths_t = tf.constant(input_lengths)
    flat_labels_t = tf.constant(flat_labels)
    label_lengths_t = tf.constant(label_lengths)

    costs = ctc(activations=activations_t,
                flat_labels=flat_labels_t,
                label_lengths=label_lengths_t,
                input_lengths=input_lengths_t)
    grad = tf.gradients(costs, [activations_t])[0]

    # Static shapes are checked before anything is executed.
    self.assertShapeEqual(expected_costs, costs)
    self.assertShapeEqual(expected_gradients, grad)

    log_dev_placement = False
    if not use_gpu:
        # Note: using use_gpu=False seems to not work;
        # it runs the GPU version instead, so GPUs are hidden explicitly.
        config = tf.ConfigProto(log_device_placement=log_dev_placement,
                                device_count={'GPU': 0})
    else:
        config = tf.ConfigProto(log_device_placement=log_dev_placement,
                                allow_soft_placement=False)

    with self.test_session(use_gpu=use_gpu, force_gpu=use_gpu,
                           config=config) as sess:
        if expected_error is None:
            (tf_costs, tf_grad) = sess.run([costs, grad])
            self.assertAllClose(tf_costs, expected_costs, atol=1e-6)
            self.assertAllClose(tf_grad, expected_gradients, atol=1e-6)
        else:
            with self.assertRaisesOpError(expected_error):
                # Kept as in the original: the op is run twice here.
                sess.run([costs, grad])
                sess.run([costs, grad])
def ctc_loss(self, logits, len_logits, labels, len_labels):
    """Build the CTC loss, optionally adding a policy-learning term.

    Uses warp-ctc when `self.args.model.use_wrapctc` is set and
    `tf.nn.ctc_loss` otherwise.

    No valid path found: it is possible that no valid path is found if the
    activations for the targets are zero.
    """
    with tf.name_scope("ctc_loss"):
        if self.args.model.use_wrapctc:
            import warpctc_tensorflow
            from tfTools.tfTools import get_indices

            # Gather the labels selected by the indices derived from the
            # label lengths into one flat vector.
            label_indices = get_indices(len_labels)
            gathered_labels = tf.gather_nd(labels, label_indices)
            # Swap the first two axes of the logits before the warp-ctc
            # call; the TensorFlow branch below uses time_major=False.
            swapped_logits = tf.transpose(logits, [1, 0, 2])
            ctc_loss = warpctc_tensorflow.ctc(
                activations=swapped_logits,
                flat_labels=gathered_labels,
                label_lengths=len_labels,
                input_lengths=len_logits,
                blank_label=self.args.dim_output)
        else:
            sparse_targets = dense_sequence_to_sparse(labels, len_labels)
            ctc_loss = tf.nn.ctc_loss(
                sparse_targets,
                logits,
                sequence_length=len_logits,
                ctc_merge_repeated=self.ctc_merge_repeated,
                ignore_longer_outputs_than_inputs=True,
                time_major=False)

    if self.args.model.policy_learning:
        from tfModels.regularization import policy_learning

        temperature = self.model.decoder.softmax_temperature
        num_outputs = self.dim_output
        decoded = self.ctc_decode(logits, len_logits)
        rl_loss = policy_learning(logits, len_logits, decoded, labels,
                                  len_labels, temperature, num_outputs,
                                  self.args)
        # The config value is used both as on/off switch and as weight.
        ctc_loss += self.args.model.policy_learning * rl_loss

    return ctc_loss
def build_loss(self):
    """Create the training loss and the dense beam-search decoding.

    Returns:
        A `(loss, dense_decoded)` pair: the mean warp-ctc loss (plus the
        collected regularization losses when `cfg.TRAIN.WEIGHT_DECAY > 0`)
        and the first beam-search result as a dense int32 tensor, with
        missing entries filled with 0.
    """
    seq_lengths = self.get_output('time_step_len')
    logits = self.get_output('logits')
    flat_labels = self.get_output('labels')
    label_lengths = self.get_output('labels_len')

    # NOTE(review): no blank_label is passed, so warp-ctc's default is used;
    # confirm it matches the blank index the TensorFlow decoder assumes.
    per_sequence = warpctc_tensorflow.ctc(activations=logits,
                                          flat_labels=flat_labels,
                                          label_lengths=label_lengths,
                                          input_lengths=seq_lengths)
    loss = tf.reduce_mean(per_sequence)

    # Only the decoded paths are needed; the log probabilities are dropped.
    decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                               seq_lengths,
                                               merge_repeated=True)
    best_path_dense = tf.sparse_tensor_to_dense(decoded[0], default_value=0)
    dense_decoded = tf.cast(best_path_dense, tf.int32)

    # Add the regularizer terms.
    if cfg.TRAIN.WEIGHT_DECAY > 0:
        reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n(reg_terms) + loss

    return loss, dense_decoded
def build_model(self):
    """Build the CudnnLSTM + warp-ctc training graph.

    Creates the step/epoch counters, reads the inputs from the data loader,
    stacks bidirectional CudnnLSTM layers, projects every time step to class
    logits and sets up the loss, beam-search prediction, CER and RMSProp
    train step. Input and training tensors are registered in the 'inputs'
    and 'train' collections.
    """
    # Helper Variables
    self.global_step_tensor = tf.Variable(0, trainable=False, name='global_step')
    self.global_step_inc = self.global_step_tensor.assign(self.global_step_tensor + 1)
    self.global_epoch_tensor = tf.Variable(0, trainable=False, name='global_epoch')
    self.global_epoch_inc = self.global_epoch_tensor.assign(self.global_epoch_tensor + 1)

    # Inputs to the network
    with tf.variable_scope('inputs'):
        self.x, y, self.length, self.lab_length = self.data_loader.get_input()
        # -1 is passed as the end-of-sequence token for the sparse conversion.
        self.y = tf.contrib.layers.dense_to_sparse(y, eos_token=-1)
        # Move the last input axis to the front; the RNN stack below treats
        # axis 1 as the batch axis (see the dropout noise shape).
        self.x = tf.transpose(self.x, [2, 0, 1])
        self.is_training = tf.placeholder(tf.bool, name='Training_flag')
    tf.add_to_collection('inputs', self.x)
    tf.add_to_collection('inputs', self.length)
    tf.add_to_collection('inputs', self.lab_length)
    tf.add_to_collection('inputs', y)
    tf.add_to_collection('inputs', self.is_training)

    # Network Architecture
    out_W = tf.Variable(tf.truncated_normal([2 * self.rnn_num_hidden, self.data_loader.num_classes],
                                            stddev=0.1), name='out_W')
    out_b = tf.Variable(tf.constant(0., shape=[self.data_loader.num_classes]), name='out_b')

    # RNN
    output = self.x
    with tf.variable_scope('MultiRNN', reuse=tf.AUTO_REUSE):
        for i in range(self.rnn_num_layers):
            lstm = tf.contrib.cudnn_rnn.CudnnLSTM(1, self.rnn_num_hidden, 'linear_input', 'bidirectional')
            output, state = lstm(output)
            # Dropout between layers only; the size-1 leading entry of the
            # noise shape shares one mask across axis 0.
            if i < self.rnn_num_layers - 1:
                output = tf.layers.dropout(output, self.rnn_dropout,
                                           noise_shape=tf.constant(
                                               value=[1, self.config.batch_size, 2 * self.rnn_num_hidden]),
                                           training=self.is_training)

    # Fully Connected
    with tf.name_scope('Dense'):
        output = tf.concat(output, 2)
        # Reshaping to apply the same weights over the timesteps
        output = tf.reshape(output, [-1, 2 * self.rnn_num_hidden])
        # Doing the affine projection
        logits = tf.matmul(output, out_W) + out_b
        # Reshaping back to the original shape. The flattened rows are in
        # (time, batch) order, so the batch axis must be restored as the
        # second axis. Reshaping to [batch, -1, classes] and transposing
        # afterwards would mix time steps of different sequences.
        self.logits = tf.reshape(logits, [-1, self.config.batch_size, self.data_loader.num_classes])

    with tf.variable_scope('loss-acc'):
        # The last class index is used as the CTC blank.
        self.loss = warpctc_tensorflow.ctc(self.logits, self.y.values, self.lab_length, self.length,
                                           self.data_loader.num_classes - 1)
        self.cost = tf.reduce_mean(self.loss)
        self.prediction = tf.nn.ctc_beam_search_decoder(self.logits, sequence_length=self.length,
                                                        merge_repeated=False)
        self.cer = self.calc_cer(self.prediction[0][0], self.y)

    with tf.variable_scope('train_step'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # NOTE(review): the optimizer is given self.loss rather than the
            # mean in self.cost; kept as in the original.
            self.train_step = tf.train.RMSPropOptimizer(learning_rate=self.config.learning_rate).minimize(
                self.loss, global_step=self.global_step_tensor)

    tf.add_to_collection('train', self.train_step)
    tf.add_to_collection('train', self.cost)
    tf.add_to_collection('train', self.cer)
def __init__(self, learning_rate=0.001):
    """Build a small CNN followed by two bidirectional GRUs, trained with warp-ctc.

    Creates the Keras model plus the backend functions `out`, `train_step`
    and `predict_step`.

    Args:
        learning_rate: Learning rate handed to the SGD optimizer.
    """
    conv_filters = 16
    kernel_size = (3, 3)
    pool_size = 2
    time_dense_size = 32
    rnn_size = 512
    img_h = 32
    act = 'relu'

    # Scalar int32 placeholder; it is divided by 4 before being passed to
    # self.res below.
    self.width = K.placeholder(name='width', ndim=0, dtype='int32')
    self.input_data = Input(name='the_input', shape=(None, img_h, 1), dtype='float32')

    # Two conv + 2x2 max-pool stages.
    self.inner = Conv2D(conv_filters, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv1')(self.input_data)
    self.inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(self.inner)
    self.inner = Conv2D(conv_filters, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv2')(self.inner)
    self.inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(self.inner)

    # self.res is defined elsewhere; presumably it reshapes the feature map
    # into a sequence whose feature size is `last_dim` -- TODO confirm.
    self.inner = Lambda(self.res, arguments={"last_dim": (img_h // (pool_size ** 2)) * conv_filters \
        , "width": self.width // 4})(self.inner)

    # cuts down input size going into RNN:
    self.inp = Dense(time_dense_size, activation=act, name='dense1')(self.inner)
    self.batch_norm = keras.layers.normalization.BatchNormalization()(
        self.inp)
    # First recurrent layer sums both directions; the second concatenates them.
    self.gru_1 = Bidirectional(GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1'), merge_mode="sum")(self.batch_norm)
    self.gru_2 = Bidirectional(GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2'), merge_mode="concat")(self.gru_1)
    # Linear per-step scores over 63 output units.
    self.y_pred = TimeDistributed(
        Dense(63, kernel_initializer='he_normal', name='dense2', activation='linear'))(self.gru_2)
    self.model = Model(inputs=self.input_data, outputs=self.y_pred)
    self.model.summary()

    # Returns the raw network output for a batch.
    self.out = K.function(
        [self.input_data, self.width, K.learning_phase()],
        [self.y_pred])

    # Placeholders fed at training time; all three are 1-D int32.
    self.y_true = K.placeholder(name='y_true', ndim=1, dtype='int32')
    self.input_length = K.placeholder(name='input_length', ndim=1, dtype='int32')
    self.label_length = K.placeholder(name='label_length', ndim=1, dtype='int32')

    # y_pred is transposed to put the time axis first before the warp-ctc call.
    # NOTE(review): no blank_label is passed, so warp-ctc's default is used;
    # confirm it matches the blank index that K.ctc_decode assumes below.
    self.loss_out = K.mean(
        warpctc_tensorflow.ctc(tf.transpose(self.y_pred, perm=[1, 0, 2]), self.y_true, self.label_length, self.input_length))

    self.optimizer = keras.optimizers.SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=200)
    self.update = self.optimizer.get_updates(self.model.trainable_weights, [], loss=self.loss_out)

    # Decoding with the third positional argument set to True; [0][0] picks
    # the first decoded tensor from the result.
    self.network_output = K.ctc_decode(
        Activation('softmax')(self.y_pred), self.input_length, True)[0][0]

    # Runs one optimisation step and returns [loss, raw predictions].
    self.train_step = K.function([self.input_data, self.width, self.y_true, self.input_length, self.label_length, K.learning_phase()], \
        [self.loss_out, self.y_pred], updates = self.update)
    # Per-step argmax over the class axis.
    self.test = K.argmax(self.y_pred, axis=2)
    self.predict_step = K.function([
        self.input_data, self.width, self.input_length,
        K.learning_phase()
    ], [self.network_output])
def __init__(self):
    """Build a CNN + bidirectional LSTM graph trained with warp-ctc.

    Everything is created inside a private `tf.Graph` stored in
    `self.graph`. The public handles are the placeholders (`inputs`,
    `labels`, `seq_len`, `label_len`, `input_keep_prob`,
    `output_keep_prob`) and the tensors/ops `loss`, `cost`, `optimizer`,
    `decoded`, `log_prob`, `dense_decoded` and `merged_summay`.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        # 4-D input whose last two axes are num_features and 3 channels;
        # the first two axes are left unspecified and may vary per step.
        self.inputs = tf.placeholder(tf.float32,
                                     [None, None, num_features, 3])

        # warp-ctc takes the labels as one flat int32 vector (not the
        # SparseTensor that tf.nn.ctc_loss requires).
        self.labels = tf.placeholder(tf.int32, [None])

        # 1d arrays
        # NOTE(review): seq_len is used after two 2x2 poolings; the caller
        # presumably feeds the already-reduced lengths -- confirm.
        self.seq_len = tf.placeholder(tf.int32, [None])
        self.label_len = tf.placeholder(tf.int32, [None])
        # Dropout keep probabilities for the LSTM cells.
        self.output_keep_prob = tf.placeholder("float")
        self.input_keep_prob = tf.placeholder("float")

        # CNN model: two 3x3 conv + ReLU + 2x2 max-pool stages
        # (3 -> 64 -> 128 channels).
        W_conv1 = weight_variable([3, 3, 3, 64])
        b_conv1 = bias_variable([64])
        h_conv1 = tf.nn.relu(conv2d(self.inputs, W_conv1) + b_conv1)
        h_pool1 = max_pool_2x2(h_conv1)

        W_conv2 = weight_variable([3, 3, 64, 128])
        b_conv2 = bias_variable([128])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)

        # Example h_pool2 shape from the original author: [64, 140, 7, 128].
        shape = tf.shape(h_pool2)
        # Only batch_s is used below.
        batch_s, max_timesteps, features_num = shape[0], shape[1], shape[2]
        # Merge the last two axes: reshape to
        # [batch_size, max_timesteps, num_features * 32].
        h_pool2 = tf.reshape(h_pool2, [batch_s, -1, num_features * 32])

        # Define bi-lstm cells with tensorflow
        # Forward direction cell
        lstm_fw_cell = tf.contrib.rnn.LSTMCell(FLAGS.num_hidden,
                                               forget_bias=1.0)
        # add dropout
        lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(
            cell=lstm_fw_cell,
            input_keep_prob=self.input_keep_prob,
            output_keep_prob=self.output_keep_prob)
        # Backward direction cell
        lstm_bw_cell = tf.contrib.rnn.LSTMCell(FLAGS.num_hidden,
                                               forget_bias=1.0)
        # add dropout
        lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(
            cell=lstm_bw_cell,
            input_keep_prob=self.input_keep_prob,
            output_keep_prob=self.output_keep_prob)

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
                                                     lstm_bw_cell,
                                                     h_pool2,
                                                     self.seq_len,
                                                     dtype=tf.float32)
        # combine backward and forward lstm cell outputs
        outputs = tf.concat(outputs, 2)

        # Reshaping to apply the same weights over the timesteps
        outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden * 2])

        # full connection layer
        W = tf.Variable(tf.truncated_normal(
            [FLAGS.num_hidden * 2, num_classes],
            stddev=0.1,
            dtype=tf.float32),
                        name='W')

        # Zero initialization. Note that name='b' is given to the constant,
        # not to the Variable.
        b = tf.Variable(
            tf.constant(0., dtype=tf.float32, shape=[num_classes],
                        name='b'))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        self.global_step = tf.Variable(0, trainable=False)

        # warp-ctc replaces tf.nn.ctc_loss here.
        # NOTE(review): no blank_label is passed, so warp-ctc's default is
        # used; confirm it matches the blank index the greedy decoder
        # below assumes.
        self.loss = warpctc_tensorflow.ctc(activations=logits,
                                           flat_labels=self.labels,
                                           label_lengths=self.label_len,
                                           input_lengths=self.seq_len)
        # L2 penalty on both conv kernels and the output weights, weighted
        # by 0.01 in the cost.
        self.regularizer = tf.nn.l2_loss(W_conv1) + tf.nn.l2_loss(
            W_conv2) + tf.nn.l2_loss(W)
        self.cost = tf.reduce_mean(self.loss) + 0.01 * self.regularizer

        # Adam with a fixed learning rate.
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=FLAGS.initial_learning_rate,
            beta1=FLAGS.beta1,
            beta2=FLAGS.beta2).minimize(self.cost,
                                        global_step=self.global_step)

        # Greedy decoding; the beam-search decoder is the slower
        # alternative that may give better results.
        self.decoded, self.log_prob = tf.nn.ctc_greedy_decoder(
            logits, self.seq_len, merge_repeated=True)

        # Dense version of the first decoded path, with -1 as fill value.
        self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0],
                                                       default_value=-1)

        # Only the cost is summarised; the label error rate is not computed.
        tf.summary.scalar('cost', self.cost)
        self.merged_summay = tf.summary.merge_all()
def build_model(self):
    """Build the CNN + CudnnLSTM + warp-ctc training graph.

    Creates the step/epoch counters, reads and centres the input images,
    runs five convolutional blocks and a stack of bidirectional CudnnLSTM
    layers, projects every time step to class logits and sets up the loss,
    beam-search prediction, CER and RMSProp train step. Input and training
    tensors are registered in the 'inputs' and 'train' collections.

    Batch normalisation follows `self.is_training`, so the moving statistics
    are updated through the UPDATE_OPS dependency of the train step.
    """
    # Helper Variables
    self.global_step_tensor = tf.Variable(0, trainable=False, name='global_step')
    self.global_step_inc = self.global_step_tensor.assign(
        self.global_step_tensor + 1)
    self.global_epoch_tensor = tf.Variable(0, trainable=False, name='global_epoch')
    self.global_epoch_inc = self.global_epoch_tensor.assign(
        self.global_epoch_tensor + 1)

    # Inputs to the network
    with tf.variable_scope('inputs'):
        self.x, y, self.length, self.lab_length = self.data_loader.get_input()
        # -1 is passed as the end-of-sequence token for the sparse conversion.
        self.y = tf.contrib.layers.dense_to_sparse(y, eos_token=-1)
        # Add a trailing channel axis of size 1.
        self.x = tf.expand_dims(self.x, 3)
        # Center Images: shift each one by half the gap between the size of
        # axis 2 and its own length; no shift along the other direction.
        x_shift = (tf.shape(self.x)[2] - self.length) / tf.constant(2)
        y_shift = tf.zeros_like(x_shift)
        translation_vector = tf.cast(tf.stack([x_shift, y_shift], axis=1),
                                     tf.float32)
        self.x = tf.contrib.image.translate(self.x, translation_vector)
        # Lengths after the CNN's down-sampling, rounded up. This is the
        # tensor stored in the 'inputs' collection; self.length is replaced
        # again after the CNN.
        self.length = tf.cast(
            tf.math.ceil(
                tf.math.divide(self.length,
                               tf.constant(self.reduce_factor))), tf.int32)
        batch_size = tf.shape(self.x)[0]
        self.is_training = tf.placeholder(tf.bool, name='Training_flag')
    tf.add_to_collection('inputs', self.x)
    tf.add_to_collection('inputs', self.length)
    tf.add_to_collection('inputs', self.lab_length)
    tf.add_to_collection('inputs', y)
    tf.add_to_collection('inputs', self.is_training)

    # Define CNN variables
    initializer = tf.contrib.layers.xavier_initializer_conv2d()
    out_W = tf.Variable(tf.truncated_normal(
        [2 * self.rnn_num_hidden, self.data_loader.num_classes],
        stddev=0.1), name='out_W')
    out_b = tf.Variable(tf.constant(0., shape=[self.data_loader.num_classes]),
                        name='out_b')

    def conv_block(inputs, index, in_depth, pool):
        """Dropout -> conv -> batch norm -> leaky ReLU (-> 2x2 max-pool)."""
        with tf.name_scope('CNN_Block_%d' % (index + 1)):
            # One dropout mask per example and channel, shared over both
            # spatial axes.
            noise_shape = tf.concat([
                tf.reshape(batch_size, [-1]),
                tf.constant(value=[1, 1, in_depth])
            ], 0)
            out = tf.layers.dropout(inputs,
                                    self.conv_dropouts[index],
                                    noise_shape=noise_shape,
                                    training=self.is_training)
            out = tf.layers.conv2d(out,
                                   self.conv_depths[index],
                                   self.conv_patch_sizes[index],
                                   padding='same',
                                   activation=None,
                                   kernel_initializer=initializer)
            # `training` must be passed explicitly: it defaults to False,
            # which would keep the layer in inference mode and never update
            # the moving statistics.
            out = tf.layers.batch_normalization(out,
                                                training=self.is_training)
            out = tf.nn.leaky_relu(out)
            if pool:
                out = tf.layers.max_pooling2d(out, 2, 2, padding='same')
        return out

    # CNNs: five blocks, only the first three are followed by pooling.
    features = self.x
    in_depth = 1  # channel axis added by expand_dims above
    for index in range(5):
        features = conv_block(features, index, in_depth, pool=index < 3)
        in_depth = self.conv_depths[index]

    # Bring axis 2 to the front and merge the remaining feature axes,
    # giving (time, batch, features) for the RNN.
    output = tf.transpose(features, [2, 0, 1, 3])
    output = tf.reshape(output, [
        -1, batch_size,
        (self.config.im_height // self.reduce_factor) * self.conv_depths[4]
    ])
    # Every sequence is given the full number of time steps.
    self.length = tf.tile(tf.expand_dims(tf.shape(output)[0], axis=0),
                          [batch_size])

    # RNN
    with tf.variable_scope('MultiRNN', reuse=tf.AUTO_REUSE):
        for i in range(self.rnn_num_layers):
            output = tf.layers.dropout(output,
                                       self.rnn_dropout,
                                       training=self.is_training)
            lstm = tf.contrib.cudnn_rnn.CudnnLSTM(1, self.rnn_num_hidden,
                                                  'linear_input',
                                                  'bidirectional')
            output, state = lstm(output)

    # Fully Connected
    with tf.name_scope('Dense'):
        output = tf.concat(output, 2)
        # Linear dropout
        output = tf.layers.dropout(output,
                                   self.linear_dropout,
                                   training=self.is_training)
        # Reshaping to apply the same weights over the timesteps
        output = tf.reshape(output, [-1, 2 * self.rnn_num_hidden])
        # Doing the affine projection
        logits = tf.matmul(output, out_W) + out_b
        # Reshaping back to the original (time, batch, classes) shape
        self.logits = tf.reshape(
            logits, [-1, batch_size, self.data_loader.num_classes])

    with tf.variable_scope('loss-acc'):
        # The last class index is used as the CTC blank.
        self.loss = warpctc_tensorflow.ctc(
            self.logits, self.y.values, self.lab_length, self.length,
            self.data_loader.num_classes - 1)
        self.cost = tf.reduce_mean(self.loss)
        self.prediction = tf.nn.ctc_beam_search_decoder(
            self.logits, sequence_length=self.length, merge_repeated=False)
        self.cer = self.calc_cer(self.prediction[0][0], self.y)

    with tf.variable_scope('train_step'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # NOTE(review): the optimizer is given self.loss rather than the
            # mean in self.cost; kept as in the original.
            self.train_step = tf.train.RMSPropOptimizer(
                learning_rate=self.config.learning_rate,
                decay=self.config.learning_rate_decay).minimize(
                    self.loss, global_step=self.global_step_tensor)

    tf.add_to_collection('train', self.train_step)
    tf.add_to_collection('train', self.cost)
    tf.add_to_collection('train', self.cer)
def __init__(self):
    """Build a stacked-LSTM graph trained with warp-ctc.

    Everything is created inside a private `tf.Graph` stored in
    `self.graph`. The public handles are the placeholders (`inputs`,
    `labels`, `seq_len`, `label_len`) and the tensors/ops `loss`, `cost`,
    `learning_rate`, `optimizer`, `decoded`, `log_prob` and
    `merged_summay`.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_stepsize, num_features], but the
        # batch_size and max_stepsize can vary along each step
        self.inputs = tf.placeholder(tf.float32,
                                     [None, None, num_features])

        # warp-ctc takes the labels as one flat int32 vector (not the
        # SparseTensor that tf.nn.ctc_loss requires).
        self.labels = tf.placeholder(tf.int32, [None])

        # 1d arrays
        self.seq_len = tf.placeholder(tf.int32, [None])
        self.label_len = tf.placeholder(tf.int32, [None])

        # Stacking rnn cells: FLAGS.num_layers LSTM cells, no dropout.
        stack = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
            for _ in range(FLAGS.num_layers)
        ],
                                            state_is_tuple=True)

        # The second output is the last state and we will not use that
        outputs, _ = tf.nn.dynamic_rnn(stack,
                                       self.inputs,
                                       self.seq_len,
                                       dtype=tf.float32)

        shape = tf.shape(self.inputs)
        # Only batch_s is used below.
        batch_s, max_timesteps = shape[0], shape[1]

        # Reshaping to apply the same weights over the timesteps
        outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden])

        # Truncated normal with mean 0 and stdev=0.1
        W = tf.Variable(tf.truncated_normal(
            [FLAGS.num_hidden, num_classes], stddev=0.1, dtype=tf.float32),
                        name='W')
        # Zero initialization. Note that name='b' is given to the constant,
        # not to the Variable.
        b = tf.Variable(
            tf.constant(0., dtype=tf.float32, shape=[num_classes],
                        name='b'))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        self.global_step = tf.Variable(0, trainable=False)

        # NOTE(review): no blank_label is passed, so warp-ctc's default is
        # used; confirm it matches the blank index the beam-search decoder
        # below assumes.
        self.loss = warpctc_tensorflow.ctc(activations=logits,
                                           flat_labels=self.labels,
                                           label_lengths=self.label_len,
                                           input_lengths=self.seq_len)
        self.cost = tf.reduce_mean(self.loss)

        # Staircase exponential decay driven by the global step.
        self.learning_rate = tf.train.exponential_decay(
            FLAGS.initial_learning_rate,
            self.global_step,
            FLAGS.decay_steps,
            FLAGS.decay_rate,
            staircase=True)

        # NOTE(review): the optimizer is given self.loss, not the mean in
        # self.cost that is summarised below -- confirm this is intended.
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate,
            beta1=FLAGS.beta1,
            beta2=FLAGS.beta2).minimize(self.loss,
                                        global_step=self.global_step)

        # Beam-search decoding (slower than the greedy decoder, but the
        # original author expects better results from it).
        self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder(
            logits, self.seq_len, merge_repeated=True)

        # Only the cost is summarised; the label error rate is not computed.
        tf.summary.scalar('cost', self.cost)
        self.merged_summay = tf.summary.merge_all()
def loss(logits, seq_lens, labels, label_lens):
    """Add the mean warp-ctc loss to the 'losses' collection and sum it.

    Returns:
        The sum (named 'total_loss') of everything in the 'losses'
        collection, including the CTC term registered by this call.
    """
    per_sequence = warpctc_tensorflow.ctc(
        activations=logits,
        flat_labels=labels,
        label_lengths=label_lens,
        input_lengths=seq_lens)
    mean_ctc = tf.reduce_mean(per_sequence)
    tf.add_to_collection('losses', mean_ctc)

    collected = tf.get_collection('losses')
    return tf.add_n(collected, name='total_loss')