def decoder_layer(input_ph,
                  num_layers: int,
                  num_units: List[int],
                  activation_list,
                  name: str = 'decoder',
                  use_batch_normalization: bool = True,
                  train_ph: bool = True,
                  use_tensorboard: bool = True,
                  keep_prob_list: List[float] = None,
                  tensorboard_scope: str = None):
    # Thin wrapper: the "decoder" is just a stack of dense layers built by
    # dense_multilayer with the same arguments.
    return dense_multilayer(input_ph, num_layers, num_units, name,
                            activation_list, use_batch_normalization, train_ph,
                            use_tensorboard, keep_prob_list, tensorboard_scope)
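# Hypothetical usage sketch (not part of the original code): applying
# decoder_layer on top of an encoder output of shape [batch, time, units].
# The layer sizes, activations and keep probabilities below are illustrative
# placeholders, not values taken from this repository.
def _example_decoder_layer_usage():
    import tensorflow as tf

    encoder_output = tf.placeholder(tf.float32, shape=[None, None, 256],
                                    name='encoder_output')
    decoded = decoder_layer(input_ph=encoder_output,
                            num_layers=2,
                            num_units=[128, 64],
                            activation_list=[tf.nn.relu, tf.nn.relu],
                            name='decoder',
                            use_batch_normalization=True,
                            train_ph=True,
                            use_tensorboard=True,
                            keep_prob_list=[0.8, 0.8],
                            tensorboard_scope='decoder')
    return decoded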
def model_fn(features, labels, mode, config, params):
    feature = features['feature']
    feat_len = features['feat_len']
    sparse_target = labels

    global_step = tf.train.get_global_step()

    with tf.name_scope("seq_len"):
        input_features_length = feat_len

    with tf.name_scope("input_features"):
        input_features = feature

    with tf.name_scope("input_labels"):
        input_labels = sparse_target

    # Optionally halve the time resolution of the features several times,
    # keeping the sequence lengths consistent (ceil division by 2).
    subsample_factor = params["num_reduce_by_half"]
    if subsample_factor is not None and subsample_factor > 0:
        for i in range(subsample_factor):
            input_features_length = tf.div(input_features_length, 2) + tf.cast(
                input_features_length % 2, dtype=tf.int32)
            input_features = input_features[:, ::2]

    if params['noise_stddev'] is not None and params['noise_stddev'] != 0.0:
        input_features = tf.keras.layers.GaussianNoise(
            stddev=params['noise_stddev'])(
                inputs=input_features,
                training=mode == tf.estimator.ModeKeys.TRAIN)

    rnn_input = tf.identity(input_features)
    with tf.name_scope("dense_layer_1"):
        rnn_input = dense_multilayer(
            input_ph=rnn_input,
            num_layers=params['num_dense_layers_1'],
            num_units=params['num_units_1'],
            name='dense_layer_1',
            activation_list=params['dense_activations_1'],
            use_batch_normalization=params['batch_normalization_1'],
            batch_normalization_trainable=params['batch_normalization_trainable_1'],
            train_ph=mode == tf.estimator.ModeKeys.TRAIN,
            use_tensorboard=True,
            keep_prob_list=params['keep_prob_1'],
            kernel_initializers=params['kernel_init_1'],
            bias_initializers=params['bias_init_1'],
            tensorboard_scope='dense_layer_1')

    with tf.name_scope("RNN_cell"):
        if params['is_bidirectional']:
            rnn_outputs = bidirectional_rnn(
                input_ph=rnn_input,
                seq_len_ph=input_features_length,
                num_layers=len(params['num_cell_units']),
                num_cell_units=params['num_cell_units'],
                activation_list=params['cell_activation'],
                use_tensorboard=True,
                tensorboard_scope='RNN',
                train_ph=mode == tf.estimator.ModeKeys.TRAIN,
                keep_prob_list=params['keep_prob_rnn'],
                use_batch_normalization=params["rnn_batch_normalization"] == True)
        else:
            rnn_outputs = unidirectional_rnn(
                input_ph=rnn_input,
                seq_len_ph=input_features_length,
                num_layers=len(params['num_cell_units']),
                num_cell_units=params['num_cell_units'],
                activation_list=params['cell_activation'],
                use_tensorboard=True,
                tensorboard_scope='RNN',
                train_ph=mode == tf.estimator.ModeKeys.TRAIN,
                keep_prob_list=params['keep_prob_rnn'],
                use_batch_normalization=params["rnn_batch_normalization"] == True)

    with tf.name_scope("dense_layer_2"):
        rnn_outputs = dense_multilayer(
            input_ph=rnn_outputs,
            num_layers=params['num_dense_layers_2'],
            num_units=params['num_units_2'],
            name='dense_layer_2',
            activation_list=params['dense_activations_2'],
            use_batch_normalization=params['batch_normalization_2'],
            batch_normalization_trainable=params['batch_normalization_trainable_2'],
            train_ph=mode == tf.estimator.ModeKeys.TRAIN,
            use_tensorboard=True,
            keep_prob_list=params['keep_prob_2'],
            kernel_initializers=params['kernel_init_2'],
            bias_initializers=params['bias_init_2'],
            tensorboard_scope='dense_layer_2',
            # batch_normalization_training=True
        )

    with tf.name_scope("dense_output"):
        dense_output_no_activation = dense_layer(
            input_ph=rnn_outputs,
            num_units=params['num_classes'],
            name='dense_output_no_activation',
            activation=None,
            use_batch_normalization=False,
            train_ph=False,
            use_tensorboard=True,
            keep_prob=1,
            tensorboard_scope='dense_output')

        dense_output = tf.nn.softmax(dense_output_no_activation,
                                     name='dense_output')
        tf.summary.histogram('dense_output', dense_output)

    with tf.name_scope("decoder"):
        # CTC decoders expect time-major inputs: [max_time, batch, num_classes].
        output_time_major = tf.transpose(dense_output, (1, 0, 2))
        if params['beam_width'] == 0:
            decoded, log_prob = tf.nn.ctc_greedy_decoder(output_time_major,
                                                         input_features_length,
                                                         merge_repeated=True)
        else:
            decoded, log_prob = tf.nn.ctc_beam_search_decoder(
                output_time_major,
                input_features_length,
                beam_width=params['beam_width'],
                top_paths=1,
                merge_repeated=False)

        dense_decoded = tf.sparse.to_dense(sp_input=decoded[0],
                                           validate_indices=True)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=dense_decoded)

    with tf.name_scope("loss"):
        rnn_loss = 0
        for var in tf.trainable_variables():
            if var.name.startswith('RNN_cell') and 'kernel' in var.name:
                rnn_loss += tf.nn.l2_loss(var)

        dense_loss = 0
        for var in tf.trainable_variables():
            if ((var.name.startswith('dense_layer') or
                 var.name.startswith('input_dense_layer')) and
                    'kernel' in var.name):
                dense_loss += tf.nn.l2_loss(var)

        loss = tf.nn.ctc_loss(input_labels, dense_output_no_activation,
                              input_features_length, time_major=False)
        logits_loss = tf.reduce_mean(tf.reduce_sum(loss))
        loss = logits_loss \
            + params['rnn_regularizer'] * rnn_loss \
            + params['dense_regularizer'] * dense_loss
        tf.summary.scalar('loss', loss)

    with tf.name_scope("label_error_rate"):
        # Inaccuracy: label error rate
        ler = tf.reduce_mean(
            tf.edit_distance(hypothesis=tf.cast(decoded[0], tf.int32),
                             truth=input_labels,
                             normalize=True))
        metrics = {
            'LER': tf.metrics.mean(ler),
        }
        tf.summary.scalar('label_error_rate', tf.reduce_mean(ler))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        "loss": loss,
        "ler": ler,
    }, every_n_iter=1)

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['use_learning_rate_decay']:
            learning_rate = tf.train.exponential_decay(
                params['learning_rate'],
                global_step,
                decay_steps=params['learning_rate_decay_steps'],
                decay_rate=params['learning_rate_decay'],
                staircase=True)
        else:
            learning_rate = params['learning_rate']

        if params['optimizer'] == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif params['optimizer'] == 'momentum' and params['momentum'] is not None:
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'] == 'rms':
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        else:
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        # Make sure batch-normalization statistics are updated before the
        # optimizer step runs.
        loss = tf.tuple([loss], control_inputs=tf.get_collection(
            tf.GraphKeys.UPDATE_OPS))[0]

        if params['clip_gradient'] != 0:
            grads = tf.gradients(loss, tf.trainable_variables())
            grads, _ = tf.clip_by_global_norm(grads, params['clip_gradient'])
            grads_and_vars = list(zip(grads, tf.trainable_variables()))
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(loss, global_step=global_step)

        train_logging_hook = tf.train.LoggingTensorHook(
            tensors={
                'loss': loss,
                'ler': tf.reduce_mean(ler),
                'learning_rate': tf.reduce_mean(learning_rate),
                # 'feal_len': feat_len,
                # 'feal_len2': input_features_length,
                # 'feal_len3': tf.shape(input_features),
            },
            every_n_secs=1)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[train_logging_hook],
                                          eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.EVAL:
        def _create_alignment_images_summary(outputs):
            images = outputs
            images = tf.expand_dims(images, -1)
            # Scale to range [0, 255]
            images -= 1
            images = -images
            images *= 255
            summary = tf.summary.image("alignment_images", images)
            return summary

        with tf.name_scope('alignment'):
            alignment_summary = _create_alignment_images_summary(dense_output)

        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=10,
            output_dir=os.path.join(config.model_dir, 'eval'),
            summary_op=alignment_summary)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            evaluation_hooks=[logging_hook, eval_summary_hook],
            eval_metric_ops=metrics)
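# Hypothetical wiring sketch (an assumption, not code from this repository):
# one way the CTC model_fn above could be plugged into a TF 1.x Estimator.
# `train_input_fn`, `eval_input_fn` and the contents of `params` are
# placeholders that would have to match what model_fn actually reads.
def _example_run_ctc_estimator(train_input_fn, eval_input_fn, params, model_dir):
    import tensorflow as tf

    run_config = tf.estimator.RunConfig(model_dir=model_dir,
                                        save_checkpoints_steps=1000)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       params=params,
                                       config=run_config)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=100000)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=None)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)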
def create_graph(self):
    with self.graph.as_default():
        self.tf_is_traing_pl = tf.placeholder_with_default(
            True, shape=(), name='is_training')

        with tf.name_scope("seq_len"):
            self.seq_len = tf.placeholder(tf.int32, shape=[None],
                                          name="sequence_length")

        with tf.name_scope("input_features"):
            self.input_feature = tf.placeholder(
                dtype=tf.float32,
                shape=[None, None, self.network_data.num_features],
                name="input")
            tf.summary.image('feature', [tf.transpose(self.input_feature)])

        with tf.name_scope("input_labels"):
            self.input_label = tf.sparse_placeholder(dtype=tf.int32,
                                                     shape=[None, None],
                                                     name="input_label")

        self.dense_layer_1 = tf.identity(self.input_feature)
        with tf.name_scope("dense_layer_1"):
            self.dense_layer_1 = dense_multilayer(
                input_ph=self.dense_layer_1,
                num_layers=self.network_data.num_dense_layers_1,
                num_units=self.network_data.num_dense_units_1,
                name='dense_layer_1',
                activation_list=self.network_data.dense_activations_1,
                use_batch_normalization=self.network_data.batch_normalization_1,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_dropout_1,
                kernel_initializers=self.network_data.kernel_init_1,
                bias_initializers=self.network_data.bias_init_1,
                tensorboard_scope='dense_layer_1')

        with tf.name_scope("RNN_1"):
            if self.network_data.is_bidirectional_1:
                self.rnn_outputs_1 = bidirectional_rnn(
                    input_ph=self.dense_layer_1,
                    seq_len_ph=self.seq_len,
                    num_layers=len(self.network_data.num_fw_cell_units_1),
                    num_fw_cell_units=self.network_data.num_fw_cell_units_1,
                    num_bw_cell_units=self.network_data.num_bw_cell_units_1,
                    name="RNN_1",
                    activation_fw_list=self.network_data.cell_fw_activation_1,
                    activation_bw_list=self.network_data.cell_bw_activation_1,
                    use_tensorboard=True,
                    tensorboard_scope='RNN_1',
                    output_size=self.network_data.rnn_output_sizes_1)
            else:
                self.rnn_outputs_1 = unidirectional_rnn(
                    input_ph=self.dense_layer_1,
                    seq_len_ph=self.seq_len,
                    num_layers=len(self.network_data.num_cell_units_1),
                    num_cell_units=self.network_data.num_cell_units_1,
                    name="RNN_1",
                    activation_list=self.network_data.cell_activation_1,
                    use_tensorboard=True,
                    tensorboard_scope='RNN_1',
                    output_size=self.network_data.rnn_output_sizes_1)

        with tf.name_scope("dense_layer_2"):
            self.dense_layer_2 = dense_multilayer(
                input_ph=self.rnn_outputs_1,
                num_layers=self.network_data.num_dense_layers_2,
                num_units=self.network_data.num_dense_units_2,
                name='dense_layer_2',
                activation_list=self.network_data.dense_activations_2,
                use_batch_normalization=self.network_data.batch_normalization_2,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_dropout_2,
                kernel_initializers=self.network_data.kernel_init_2,
                bias_initializers=self.network_data.bias_init_2,
                tensorboard_scope='dense_layer_2')

        with tf.name_scope("dense_output_1"):
            self.dense_output_no_activation_1 = dense_layer(
                input_ph=self.dense_layer_2,
                num_units=self.network_data.num_classes,
                name='dense_output_no_activation_1',
                activation=None,
                use_batch_normalization=False,
                train_ph=False,
                use_tensorboard=True,
                keep_prob=1,
                tensorboard_scope='dense_output_1')

            self.dense_output_1 = tf.nn.softmax(
                self.dense_output_no_activation_1, name='dense_output_1')
            tf.summary.histogram('dense_output_1', self.dense_output_1)

        with tf.name_scope("decoder_1"):
            self.output_time_major_1 = tf.transpose(self.dense_output_1,
                                                    (1, 0, 2))
            self.decoded_1, log_prob = self.network_data.decoder_function(
                self.output_time_major_1, self.seq_len)
            self.dense_decoded_1 = tf.sparse_to_dense(
                self.decoded_1[0].indices, self.decoded_1[0].dense_shape,
                self.decoded_1[0].values)

        with tf.name_scope("dense_layer_3"):
            self.dense_layer_3 = dense_multilayer(
                input_ph=self.dense_output_1,
                num_layers=self.network_data.num_dense_layers_3,
                num_units=self.network_data.num_dense_units_3,
                name='dense_layer_3',
                activation_list=self.network_data.dense_activations_3,
                use_batch_normalization=self.network_data.batch_normalization_3,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_dropout_3,
                kernel_initializers=self.network_data.kernel_init_3,
                bias_initializers=self.network_data.bias_init_3,
                tensorboard_scope='dense_layer_3')

        with tf.name_scope("RNN_2"):
            if self.network_data.is_bidirectional_2:
                self.rnn_outputs_2 = bidirectional_rnn(
                    input_ph=self.dense_layer_3,
                    seq_len_ph=self.seq_len,
                    num_layers=len(self.network_data.num_fw_cell_units_2),
                    num_fw_cell_units=self.network_data.num_fw_cell_units_2,
                    num_bw_cell_units=self.network_data.num_bw_cell_units_2,
                    name="RNN_2",
                    activation_fw_list=self.network_data.cell_fw_activation_2,
                    activation_bw_list=self.network_data.cell_bw_activation_2,
                    use_tensorboard=True,
                    tensorboard_scope='RNN_2',
                    output_size=self.network_data.rnn_output_sizes_2)
            else:
                self.rnn_outputs_2 = unidirectional_rnn(
                    input_ph=self.dense_layer_3,
                    seq_len_ph=self.seq_len,
                    num_layers=len(self.network_data.num_cell_units_2),
                    num_cell_units=self.network_data.num_cell_units_2,
                    name="RNN_2",
                    activation_list=self.network_data.cell_activation_2,
                    use_tensorboard=True,
                    tensorboard_scope='RNN_2',
                    output_size=self.network_data.rnn_output_sizes_2)

        with tf.name_scope("dense_layer_4"):
            self.dense_layer_4 = dense_multilayer(
                input_ph=self.rnn_outputs_2,
                num_layers=self.network_data.num_dense_layers_4,
                num_units=self.network_data.num_dense_units_4,
                name='dense_layer_4',
                activation_list=self.network_data.dense_activations_4,
                use_batch_normalization=self.network_data.batch_normalization_4,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_dropout_4,
                kernel_initializers=self.network_data.kernel_init_4,
                bias_initializers=self.network_data.bias_init_4,
                tensorboard_scope='dense_layer_4')

        with tf.name_scope("dense_output_2"):
            self.dense_output_no_activation_2 = dense_layer(
                input_ph=self.dense_layer_4,
                num_units=self.network_data.num_classes,
                name='dense_output_no_activation_2',
                activation=None,
                use_batch_normalization=False,
                train_ph=False,
                use_tensorboard=True,
                keep_prob=1,
                tensorboard_scope='dense_output_no_activation_2')

            self.dense_output_2 = tf.nn.softmax(
                self.dense_output_no_activation_2, name='dense_output_2')
            tf.summary.histogram('dense_output_2', self.dense_output_2)

        with tf.name_scope("decoder_2"):
            self.output_time_major_2 = tf.transpose(self.dense_output_2,
                                                    (1, 0, 2))
            self.decoded_2, log_prob = self.network_data.decoder_function(
                self.output_time_major_2, self.seq_len)
            self.dense_decoded_2 = tf.sparse_to_dense(
                self.decoded_2[0].indices, self.decoded_2[0].dense_shape,
                self.decoded_2[0].values)

        with tf.name_scope("loss"):
            rnn_loss = 0
            for var in tf.trainable_variables():
                if var.name.startswith('RNN_') and 'kernel' in var.name:
                    rnn_loss += tf.nn.l2_loss(var)

            dense_loss = 0
            for var in tf.trainable_variables():
                if var.name.startswith('dense_layer') and 'kernel' in var.name:
                    dense_loss += tf.nn.l2_loss(var)

            loss_1 = tf.nn.ctc_loss(self.input_label,
                                    self.dense_output_no_activation_1,
                                    self.seq_len,
                                    time_major=False)
            loss_2 = tf.nn.ctc_loss(self.input_label,
                                    self.dense_output_no_activation_2,
                                    self.seq_len,
                                    time_major=False)
            # The second (final) output dominates the loss; the first output
            # contributes as an auxiliary term with weight 0.3.
            self.logits_loss = tf.reduce_mean(tf.reduce_sum(loss_2)) \
                + 0.3 * tf.reduce_mean(tf.reduce_sum(loss_1))
            self.loss = self.logits_loss \
                + self.network_data.rnn_regularizer * rnn_loss \
                + self.network_data.dense_regularizer * dense_loss
            tf.summary.scalar('loss', self.loss)

        # define the optimizer
        with tf.name_scope("training"):
            self.training_op = self.network_data.optimizer.minimize(self.loss)

        with tf.name_scope("label_error_rate"):
            # Inaccuracy: label error rate
            self.ler = tf.reduce_mean(
                tf.edit_distance(hypothesis=tf.cast(self.decoded_2[0], tf.int32),
                                 truth=self.input_label,
                                 normalize=True))
            tf.summary.scalar('label_error_rate', tf.reduce_mean(self.ler))

        self.checkpoint_saver = tf.train.Saver(save_relative_paths=True)
        self.merged_summary = tf.summary.merge_all()
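# Hypothetical sketch of a decoder_function compatible with the calls
# `self.network_data.decoder_function(output_time_major, seq_len)` above
# (an assumption about how network_data is configured, modeled on the CTC
# decoders used in the Estimator model_fn earlier in this section). It must
# return a (decoded, log_prob) pair, which both tf.nn.ctc_greedy_decoder and
# tf.nn.ctc_beam_search_decoder provide.
def _example_ctc_decoder_function(beam_width=0):
    import tensorflow as tf

    def decoder_function(output_time_major, seq_len):
        if beam_width == 0:
            return tf.nn.ctc_greedy_decoder(output_time_major, seq_len,
                                            merge_repeated=True)
        return tf.nn.ctc_beam_search_decoder(output_time_major, seq_len,
                                             beam_width=beam_width,
                                             top_paths=1,
                                             merge_repeated=False)

    return decoder_function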
def create_graph(self):
    with self.graph.as_default():
        self.tf_is_traing_pl = tf.placeholder_with_default(
            True, shape=(), name='is_training')

        with tf.name_scope("seq_len"):
            self.seq_len = tf.placeholder(tf.int32, shape=[None],
                                          name="sequence_length")

        with tf.name_scope("input_features"):
            self.input_feature = tf.placeholder(
                dtype=tf.float32,
                shape=[None, None, self.network_data.num_features],
                name="input")
            tf.summary.image('feature', [tf.transpose(self.input_feature)])

        with tf.name_scope("input_labels"):
            self.input_label = tf.sparse_placeholder(dtype=tf.int32,
                                                     shape=[None, None],
                                                     name="input_label")

        self.dense_layer_1 = tf.identity(self.input_feature)
        with tf.name_scope("dense_layer_1"):
            self.dense_layer_1 = dense_multilayer(
                input_ph=self.dense_layer_1,
                num_layers=self.network_data.num_dense_layers_1,
                num_units=self.network_data.num_dense_units_1,
                name='dense_layer_1',
                activation_list=self.network_data.dense_activations_1,
                use_batch_normalization=self.network_data.batch_normalization_1,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_dropout_1,
                kernel_initializers=self.network_data.kernel_init_1,
                bias_initializers=self.network_data.bias_init_1,
                tensorboard_scope='dense_layer_1')

        with tf.name_scope("RNN_cell"):
            if self.network_data.is_bidirectional:
                self.rnn_outputs = bidirectional_rnn(
                    input_ph=self.dense_layer_1,
                    seq_len_ph=self.seq_len,
                    num_layers=len(self.network_data.num_fw_cell_units),
                    num_fw_cell_units=self.network_data.num_fw_cell_units,
                    num_bw_cell_units=self.network_data.num_bw_cell_units,
                    name="RNN_cell",
                    activation_fw_list=self.network_data.cell_fw_activation,
                    activation_bw_list=self.network_data.cell_bw_activation,
                    use_tensorboard=True,
                    tensorboard_scope='RNN',
                    output_size=self.network_data.rnn_output_sizes)
            else:
                self.rnn_outputs = unidirectional_rnn(
                    input_ph=self.dense_layer_1,
                    seq_len_ph=self.seq_len,
                    num_layers=len(self.network_data.num_cell_units),
                    num_cell_units=self.network_data.num_cell_units,
                    name="RNN_cell",
                    activation_list=self.network_data.cell_activation,
                    use_tensorboard=True,
                    tensorboard_scope='RNN',
                    output_size=self.network_data.rnn_output_sizes)

        with tf.name_scope("dense_layer_2"):
            self.dense_layer_2 = dense_multilayer(
                input_ph=self.rnn_outputs,
                num_layers=self.network_data.num_dense_layers_2,
                num_units=self.network_data.num_dense_units_2,
                name='dense_layer_2',
                activation_list=self.network_data.dense_activations_2,
                use_batch_normalization=self.network_data.batch_normalization_2,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_dropout_2,
                kernel_initializers=self.network_data.kernel_init_2,
                bias_initializers=self.network_data.bias_init_2,
                tensorboard_scope='dense_layer_2')

        with tf.name_scope("dense_output"):
            self.dense_output_no_activation = dense_layer(
                input_ph=self.rnn_outputs,
                num_units=self.network_data.num_classes,
                name='dense_output_no_activation',
                activation=None,
                use_batch_normalization=False,
                train_ph=False,
                use_tensorboard=True,
                keep_prob=1,
                tensorboard_scope='dense_output')

            self.dense_output = tf.nn.softmax(self.dense_output_no_activation,
                                              name='dense_output')
            tf.summary.histogram('dense_output', self.dense_output)

        with tf.name_scope("output_classes"):
            self.output_classes = tf.argmax(self.dense_output, 2)

        with tf.name_scope("loss"):
            rnn_loss = 0
            for var in tf.trainable_variables():
                if var.name.startswith('RNN_cell') and 'kernel' in var.name:
                    rnn_loss += tf.nn.l2_loss(var)

            dense_loss = 0
            for var in tf.trainable_variables():
                if var.name.startswith('dense_layer') and 'kernel' in var.name:
                    dense_loss += tf.nn.l2_loss(var)

            loss = tf.nn.ctc_loss(self.input_label,
                                  self.dense_output_no_activation,
                                  self.seq_len,
                                  time_major=False)
            self.logits_loss = tf.reduce_mean(tf.reduce_sum(loss))
            self.loss = self.logits_loss \
                + self.network_data.rnn_regularizer * rnn_loss \
                + self.network_data.dense_regularizer * dense_loss
            tf.summary.scalar('loss', self.loss)

        # define the optimizer
        with tf.name_scope("training"):
            self.training_op = self.network_data.optimizer.minimize(self.loss)

        with tf.name_scope("decoder"):
            self.output_time_major = tf.transpose(self.dense_output, (1, 0, 2))

            self.word_beam_search_module = tf.load_op_library(
                self.network_data.word_beam_search_path)
            # prepare information about the language (dictionary, characters
            # in the dataset, characters forming words)
            chars = ''.join(self.network_data.char_list)
            word_chars = open(
                self.network_data.word_char_list_path).read().splitlines()[0]
            corpus = open(self.network_data.corpus_path).read()

            # decode using the "Words" mode of word beam search
            self.decoded = self.word_beam_search_module.word_beam_search(
                self.output_time_major, self.network_data.beam_width,
                self.network_data.scoring_mode, self.network_data.smoothing,
                corpus.encode('utf8'), chars.encode('utf8'),
                word_chars.encode('utf8'))

        with tf.name_scope("label_error_rate"):
            # This is not the best way to compute the LER, but of the variants
            # tried this was the one that worked best.
            # Inaccuracy: label error rate
            dense_label = tf.sparse_to_dense(self.input_label.indices,
                                             self.input_label.dense_shape,
                                             self.input_label.values)

            # (self.network_data.num_classes - 1) is the blank index
            decoded_mask = tf.not_equal(self.decoded,
                                        self.network_data.num_classes - 1)
            decoded_mask.set_shape([None, None])
            decoded_mask = tf.boolean_mask(self.decoded, decoded_mask)

            label_mask = tf.not_equal(dense_label,
                                      self.network_data.num_classes - 1)
            label_mask.set_shape([None, None])
            label_mask = tf.boolean_mask(dense_label, label_mask)

            self.edit_distance = tf.edit_distance(
                hypothesis=tf.cast(
                    tf.contrib.layers.dense_to_sparse([decoded_mask]), tf.int32),
                truth=tf.cast(
                    tf.contrib.layers.dense_to_sparse([label_mask]), tf.int32),
                normalize=True)
            self.ler = tf.reduce_mean(self.edit_distance)
            tf.summary.scalar('label_error_rate', tf.reduce_mean(self.ler))

        self.checkpoint_saver = tf.train.Saver(save_relative_paths=True)
        self.merged_summary = tf.summary.merge_all()
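# Hypothetical training-step sketch (an assumption, not repository code):
# feeding the placeholders defined in the create_graph above for one batch.
# `model` is an instance of the class owning create_graph, `features` is a
# [batch, time, num_features] array, `seq_lens` a [batch] array, and
# `sparse_labels` a tf.SparseTensorValue built from the label sequences.
def _example_train_step(model, sess, features, seq_lens, sparse_labels):
    feed_dict = {
        model.input_feature: features,
        model.seq_len: seq_lens,
        model.input_label: sparse_labels,
        model.tf_is_traing_pl: True,
    }
    _, loss, ler = sess.run([model.training_op, model.loss, model.ler],
                            feed_dict=feed_dict)
    return loss, ler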
def create_graph(self,
                 use_tfrecords=False,
                 features_tensor=None,
                 labels_tensor=None,
                 features_len_tensor=None,
                 labels_len_tensor=None):
    with self.graph.as_default():
        self.tf_is_traing_pl = tf.placeholder_with_default(
            True, shape=(), name='is_training')

        with tf.name_scope("input_features"):
            if use_tfrecords:
                self.input_features = features_tensor
            else:
                self.input_features = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, None, self.network_data.num_features],
                    name="input_features")

        with tf.name_scope("input_features_length"):
            if use_tfrecords:
                self.input_features_length = features_len_tensor
            else:
                self.input_features_length = tf.placeholder(
                    dtype=tf.int32, shape=[None],
                    name='input_features_length')

        with tf.name_scope("input_labels"):
            if use_tfrecords:
                self.input_labels = labels_tensor
            else:
                self.input_labels = tf.placeholder(dtype=tf.int32,
                                                   shape=[None, None],
                                                   name='input_labels')

        with tf.name_scope("input_labels_length"):
            if use_tfrecords:
                self.input_labels_length = labels_len_tensor
            else:
                self.input_labels_length = tf.placeholder(
                    dtype=tf.int32, shape=[None], name='input_labels_length')

        self.max_label_length = tf.reduce_max(self.input_labels_length,
                                              name='max_label_length')
        self.max_features_length = tf.reduce_max(self.input_features_length,
                                                 name='max_features_length')
        self.batch_size = tf.shape(self.input_features)[0]
        self.global_step = tf.Variable(0, trainable=False, name='global_step')

        with tf.name_scope("embeddings"):
            self.embedding = tf.get_variable(
                name='embedding',
                shape=[
                    self.network_data.num_classes + 1,
                    self.network_data.num_embeddings
                ],
                dtype=tf.float32)
            self.label_embedding = tf.nn.embedding_lookup(
                params=self.embedding,
                ids=self.input_labels,
                name='label_embedding')

        with tf.name_scope("dense_layer_1"):
            self.dense_layer_1_out = dense_multilayer(
                input_ph=self.input_features,
                num_layers=self.network_data.num_dense_layers_1,
                num_units=self.network_data.num_units_1,
                name='dense_layer_1',
                activation_list=self.network_data.dense_activations_1,
                use_batch_normalization=self.network_data.batch_normalization_1,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_prob_1,
                kernel_initializers=self.network_data.kernel_init_1,
                bias_initializers=self.network_data.bias_init_1,
                tensorboard_scope='dense_layer_1')

        with tf.name_scope("listener"):
            self.listener_output, self.listener_out_len, self.listener_state = \
                bidirectional_pyramidal_rnn(
                    input_ph=self.dense_layer_1_out,
                    seq_len_ph=self.input_features_length,
                    num_layers=self.network_data.listener_num_layers,
                    num_units=self.network_data.listener_num_units,
                    name="listener",
                    activation_list=self.network_data.listener_activation_list,
                    use_tensorboard=True,
                    tensorboard_scope="listener",
                    keep_prob=self.network_data.listener_keep_prob_list,
                    train_ph=self.tf_is_traing_pl)

        with tf.variable_scope("attention"):
            cell, decoder_initial_state = attention_layer(
                input=self.listener_output,
                num_layers=self.network_data.attention_num_layers,
                rnn_units_list=list(
                    map(lambda x: 2 * x, self.network_data.listener_num_units)),
                rnn_activations_list=self.network_data.attention_activation_list,
                attention_units=self.network_data.attention_units,
                lengths=self.listener_out_len,
                batch_size=self.batch_size,
                input_state=self.listener_state,
                keep_prob=self.network_data.attention_keep_prob_list,
                train_ph=self.tf_is_traing_pl)

            self.logits, _, _ = attention_decoder(
                input_cell=cell,
                initial_state=decoder_initial_state,
                embedding=self.embedding,
                seq_embedding=self.label_embedding,
                seq_embedding_len=self.input_labels_length,
                output_projection=Dense(self.network_data.num_classes),
                max_iterations=self.max_label_length,
                sampling_prob=0.5,
                time_major=False,
                name="attention")

        with tf.name_scope("tile_batch"):
            if self.network_data.beam_width > 0:
                tiled_listener_output = tf.contrib.seq2seq.tile_batch(
                    self.listener_output,
                    multiplier=self.network_data.beam_width)
                tiled_listener_state = tf.contrib.seq2seq.tile_batch(
                    self.listener_state,
                    multiplier=self.network_data.beam_width)
                tiled_listener_out_len = tf.contrib.seq2seq.tile_batch(
                    self.listener_out_len,
                    multiplier=self.network_data.beam_width)
                tiled_batch_size = self.batch_size * self.network_data.beam_width
            else:
                tiled_listener_output = self.listener_output
                tiled_listener_state = self.listener_state
                tiled_listener_out_len = self.listener_out_len
                tiled_batch_size = self.batch_size

        self.projection_layer = Dense(self.network_data.num_classes,
                                      use_bias=True)

        with tf.variable_scope("attention", reuse=True):
            tiled_cell, tiled_decoder_initial_state = attention_layer(
                input=tiled_listener_output,
                num_layers=self.network_data.attention_num_layers,
                rnn_units_list=list(
                    map(lambda x: 2 * x, self.network_data.listener_num_units)),
                rnn_activations_list=self.network_data.attention_activation_list,
                attention_units=self.network_data.attention_units,
                lengths=tiled_listener_out_len,
                batch_size=tiled_batch_size,
                input_state=tuple(tiled_listener_state),
                keep_prob=None,
                train_ph=self.tf_is_traing_pl)

            start_tokens = tf.fill([self.batch_size], self.network_data.sos_id)

            if self.network_data.beam_width > 0:
                decoded_ids = beam_search_decoder(
                    input_cell=tiled_cell,
                    embedding=self.embedding,
                    initial_state=tiled_decoder_initial_state,
                    start_token=start_tokens,
                    end_token=self.network_data.eos_id,
                    beam_width=self.network_data.beam_width,
                    output_layer=self.projection_layer,
                    max_iterations=self.max_features_length,
                    name="attention",
                    time_major=False)
                decoded_ids = decoded_ids[:, :, 0]  # Most probable beam
            else:
                decoded_ids = greedy_decoder(
                    input_cell=tiled_cell,
                    embedding=self.embedding,
                    initial_state=tiled_decoder_initial_state,
                    start_token=start_tokens,
                    end_token=self.network_data.eos_id,
                    output_layer=self.projection_layer,
                    max_iterations=self.max_features_length,
                    name="attention",
                    time_major=False)

        with tf.name_scope('decoded_ids'):
            self.decoded_ids = tf.identity(decoded_ids, name='decoded_ids')

        with tf.name_scope("loss"):
            kernel_loss = 0
            for var in tf.trainable_variables():
                if var.name.startswith('dense_layer') and 'kernel' in var.name:
                    kernel_loss += tf.nn.l2_loss(var)

            for var in tf.trainable_variables():
                if var.name.startswith('listener') and 'kernel' in var.name:
                    kernel_loss += tf.nn.l2_loss(var)

            for var in tf.trainable_variables():
                if var.name.startswith('attention') and 'kernel' in var.name:
                    kernel_loss += tf.nn.l2_loss(var)

            target_weights = tf.sequence_mask(self.input_labels_length,
                                              self.max_label_length,
                                              dtype=tf.float32,
                                              name='mask')

            sequence_loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.input_labels,
                weights=target_weights,
                average_across_timesteps=True,
                average_across_batch=True)

            # Add the L2 kernel penalty to the sequence loss.
            self.loss = sequence_loss \
                + self.network_data.kernel_regularizer * kernel_loss

            tf.summary.scalar('sequence_loss', sequence_loss)
            tf.summary.scalar('loss', self.loss)

        with tf.name_scope("label_error_rate"):
            train_decoded_ids = tf.argmax(tf.nn.softmax(self.logits, axis=2),
                                          axis=2)
            self.train_ler = tf.reduce_mean(
                tf.edit_distance(
                    hypothesis=tf.contrib.layers.dense_to_sparse(
                        tf.cast(train_decoded_ids, tf.int32)),
                    truth=tf.contrib.layers.dense_to_sparse(self.input_labels),
                    normalize=True))

            self.ler = tf.reduce_mean(
                tf.edit_distance(
                    hypothesis=tf.contrib.layers.dense_to_sparse(
                        tf.cast(self.decoded_ids, tf.int32)),
                    truth=tf.contrib.layers.dense_to_sparse(self.input_labels),
                    normalize=True))

            tf.summary.scalar('label_error_rate', tf.reduce_mean(self.ler))
            tf.summary.scalar('train_label_error_rate',
                              tf.reduce_mean(self.train_ler))

        with tf.name_scope("training_op"):
            if self.network_data.use_learning_rate_decay:
                self.learning_rate = tf.train.exponential_decay(
                    self.network_data.learning_rate,
                    self.global_step,
                    decay_steps=self.network_data.learning_rate_decay_steps,
                    decay_rate=self.network_data.learning_rate_decay,
                    staircase=True)
            else:
                self.learning_rate = self.network_data.learning_rate

            opt = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=self.network_data.adam_beta1,
                beta2=self.network_data.adam_beta2,
                epsilon=self.network_data.adam_epsilon)

            if self.network_data.clip_norm > 0:
                grads, vs = zip(*opt.compute_gradients(self.loss))
                grads, _ = tf.clip_by_global_norm(grads,
                                                  self.network_data.clip_norm)
                self.train_op = opt.apply_gradients(
                    zip(grads, vs), global_step=self.global_step)
            else:
                self.train_op = self.network_data.optimizer.minimize(self.loss)

        self.checkpoint_saver = tf.train.Saver(save_relative_paths=True)
        self.merged_summary = tf.summary.merge_all()
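# Hypothetical input-pipeline sketch (an assumption, not repository code):
# one way to obtain the four tensors that create_graph(use_tfrecords=True, ...)
# expects from a TFRecord file. The feature keys ('feature', 'feat_len',
# 'target', 'target_len') and the stored layout are illustrative placeholders.
def _example_tfrecord_inputs(tfrecord_path, batch_size, num_features):
    import tensorflow as tf

    def _parse(example_proto):
        parsed = tf.parse_single_example(
            example_proto,
            features={
                'feature': tf.VarLenFeature(tf.float32),
                'feat_len': tf.FixedLenFeature([], tf.int64),
                'target': tf.VarLenFeature(tf.int64),
                'target_len': tf.FixedLenFeature([], tf.int64),
            })
        feat_len = tf.cast(parsed['feat_len'], tf.int32)
        # Features are assumed to be stored flattened as feat_len * num_features.
        feature = tf.reshape(tf.sparse.to_dense(parsed['feature']),
                             [feat_len, num_features])
        target = tf.cast(tf.sparse.to_dense(parsed['target']), tf.int32)
        target_len = tf.cast(parsed['target_len'], tf.int32)
        return feature, feat_len, target, target_len

    dataset = (tf.data.TFRecordDataset(tfrecord_path)
               .map(_parse)
               .padded_batch(batch_size,
                             padded_shapes=([None, num_features], [],
                                            [None], []))
               .repeat())
    feature, feat_len, target, target_len = \
        dataset.make_one_shot_iterator().get_next()
    return feature, feat_len, target, target_len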
def model_fn(features, labels, mode, config, params):
    input_features = features['feature']
    input_features_length = features['feat_len']

    subsample_factor = params["num_reduce_by_half"]
    if subsample_factor is not None and subsample_factor > 0:
        for i in range(subsample_factor):
            input_features_length = tf.div(input_features_length, 2) + tf.cast(
                input_features_length % 2, dtype=tf.int32)
            input_features = input_features[:, ::2]

    if params['noise_stddev'] is not None and params['noise_stddev'] != 0.0:
        input_features = tf.keras.layers.GaussianNoise(
            stddev=params['noise_stddev'])(
                inputs=input_features,
                training=mode == tf.estimator.ModeKeys.TRAIN)

    decoder_inputs = None
    targets = None
    targets_length = None
    global_step = tf.train.get_global_step()

    if mode != tf.estimator.ModeKeys.PREDICT:
        decoder_inputs = labels['targets_inputs']
        targets = labels['targets_outputs']
        targets_length = labels['target_len']

    with tf.name_scope("dense_layer_1"):
        input_features = dense_multilayer(
            input_ph=input_features,
            num_layers=params['num_dense_layers_1'],
            num_units=params['num_units_1'],
            name='dense_layer_1',
            activation_list=params['dense_activations_1'],
            use_batch_normalization=params['batch_normalization_1'],
            batch_normalization_trainable=params['batch_normalization_trainable_1'],
            train_ph=mode == tf.estimator.ModeKeys.TRAIN,
            use_tensorboard=True,
            keep_prob_list=params['keep_prob_1'],
            kernel_initializers=params['kernel_init_1'],
            bias_initializers=params['bias_init_1'],
            tensorboard_scope='dense_layer_1')

    with tf.variable_scope('listener'):
        listener_output, input_features_length, listener_state = \
            bidirectional_pyramidal_rnn(
                input_ph=input_features,
                seq_len_ph=input_features_length,
                num_layers=params['listener_num_layers'],
                num_units=params['listener_num_units'],
                name="listener",
                activation_list=params['listener_activation_list'],
                use_tensorboard=True,
                tensorboard_scope="listener",
                keep_prob=params['listener_keep_prob_list'],
                train_ph=mode == tf.estimator.ModeKeys.TRAIN)

    with tf.name_scope("dense_layer_2"):
        listener_output = dense_multilayer(
            input_ph=listener_output,
            num_layers=params['num_dense_layers_2'],
            num_units=params['num_units_2'],
            name='dense_layer_2',
            activation_list=params['dense_activations_2'],
            use_batch_normalization=params['batch_normalization_2'],
            batch_normalization_trainable=params['batch_normalization_trainable_2'],
            train_ph=mode == tf.estimator.ModeKeys.TRAIN,
            use_tensorboard=True,
            keep_prob_list=params['keep_prob_2'],
            kernel_initializers=params['kernel_init_2'],
            bias_initializers=params['bias_init_2'],
            tensorboard_scope='dense_layer_2')

    with tf.variable_scope('tile_batch'):
        batch_size = tf.shape(listener_output)[0]
        if mode == tf.estimator.ModeKeys.PREDICT and params['beam_width'] > 0:
            listener_output = tf.contrib.seq2seq.tile_batch(
                listener_output, multiplier=params['beam_width'])
            input_features_length = tf.contrib.seq2seq.tile_batch(
                input_features_length, multiplier=params['beam_width'])
            listener_state = tf.contrib.seq2seq.tile_batch(
                listener_state, multiplier=params['beam_width'])
            batch_size = batch_size * params['beam_width']

    with tf.variable_scope('attention'):
        attention_cell, attention_state = attention_layer(
            input=listener_output,
            lengths=input_features_length,
            num_layers=params['attention_num_layers'],
            attention_units=params['attention_units'],
            attention_size=params['attention_size'],
            attention_type=params['attention_type'],
            activation=params['attention_activation'],
            keep_prob=params['attention_keep_prob'],
            train_ph=mode == tf.estimator.ModeKeys.TRAIN,
            batch_size=batch_size,
            input_state=None,
            use_tensorboard=True,
            tensorboard_scope='attention_cell')

    with tf.variable_scope('speller'):
        def embedding_fn(ids):
            if params['num_embeddings'] != 0:
                target_embedding = tf.get_variable(
                    name='target_embedding',
                    shape=[params['num_classes'], params['num_embeddings']],
                    dtype=tf.float32,
                    initializer=tf.contrib.layers.xavier_initializer())
                return tf.nn.embedding_lookup(target_embedding, ids)
            else:
                return tf.one_hot(ids, params['num_classes'])

        projection_layer = tf.layers.Dense(params['num_classes'],
                                           use_bias=True,
                                           name='projection_layer')

        maximum_iterations = None
        if mode != tf.estimator.ModeKeys.TRAIN:
            max_source_length = tf.reduce_max(input_features_length)
            maximum_iterations = tf.to_int32(
                tf.round(tf.to_float(max_source_length) * 2))

        if mode == tf.estimator.ModeKeys.TRAIN:
            decoder_inputs = embedding_fn(decoder_inputs)
            decoder = attention_decoder(
                input_cell=attention_cell,
                initial_state=attention_state,
                embedding_fn=embedding_fn,
                seq_embedding=decoder_inputs,
                seq_embedding_len=targets_length,
                projection_layer=projection_layer,
                sampling_prob=params['sampling_probability'])
        elif mode == tf.estimator.ModeKeys.PREDICT and params['beam_width'] > 0:
            decoder = beam_search_decoder(
                input_cell=attention_cell,
                embedding=embedding_fn,
                start_token=params['sos_id'],
                end_token=params['eos_id'],
                initial_state=attention_state,
                beam_width=params['beam_width'],
                projection_layer=projection_layer,
                batch_size=batch_size)
        else:
            decoder = greedy_decoder(
                inputs=attention_cell,
                embedding=embedding_fn,
                start_token=params['sos_id'],
                end_token=params['eos_id'],
                initial_state=attention_state,
                projection_layer=projection_layer,
                batch_size=batch_size)

        decoder_outputs, final_context_state, final_sequence_length = \
            tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=maximum_iterations)

    with tf.name_scope('prediction'):
        if mode == tf.estimator.ModeKeys.PREDICT and params['beam_width'] > 0:
            logits = tf.no_op()
            sample_ids = decoder_outputs.predicted_ids
        else:
            logits = decoder_outputs.rnn_output
            sample_ids = tf.to_int32(tf.argmax(logits, -1))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'sample_ids': sample_ids}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    with tf.name_scope('metrics'):
        ler = edit_distance(sample_ids, targets, params['eos_id'],
                            None)  # params.mapping
        metrics = {
            'ler': tf.metrics.mean(ler),
        }
        tf.summary.scalar('ler', metrics['ler'][1])

    with tf.name_scope('loss'):
        kernel_loss = 0
        for var in tf.trainable_variables():
            if var.name.startswith('dense_layer') and 'kernel' in var.name:
                kernel_loss += tf.nn.l2_loss(var)

        attn_loss = attention_loss(
            logits=logits,
            targets=targets,
            logits_length=final_sequence_length,
            targets_length=targets_length,
            eos_id=params['eos_id'],
            train_ph=mode == tf.estimator.ModeKeys.TRAIN)

        loss = attn_loss + params['kernel_regularizer'] * kernel_loss

    if mode == tf.estimator.ModeKeys.EVAL:
        def _create_attention_images_summary(context_state):
            """Reference: https://github.com/tensorflow/nmt/blob/master/nmt/attention_model.py"""
            images = (context_state.alignment_history.stack())
            # Reshape to (batch, src_seq_len, tgt_seq_len, 1)
            images = tf.expand_dims(tf.transpose(images, [1, 2, 0]), -1)
            # Scale to range [0, 255]
            images -= 1
            images = -images
            images *= 255
            summary = tf.summary.image("attention_images", images)
            return summary

        with tf.name_scope('alignment'):
            attention_summary = _create_attention_images_summary(
                final_context_state)

        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=10,
            output_dir=os.path.join(config.model_dir, 'eval'),
            summary_op=attention_summary)

        logging_hook = tf.train.LoggingTensorHook(
            tensors={
                'ler': tf.reduce_mean(ler),
                # 'max_predictions': sample_ids[tf.argmax(ler)],
                # 'max_targets': targets[tf.argmax(ler)],
            },
            every_n_iter=10)

        return tf.estimator.EstimatorSpec(
            mode,
            loss=loss,
            eval_metric_ops=metrics,
            evaluation_hooks=[logging_hook, eval_summary_hook])

    with tf.name_scope('train'):
        if params['use_learning_rate_decay']:
            learning_rate = tf.train.exponential_decay(
                params['learning_rate'],
                global_step,
                decay_steps=params['learning_rate_decay_steps'],
                decay_rate=params['learning_rate_decay'],
                staircase=True)
        else:
            learning_rate = params['learning_rate']

        if params['optimizer'] == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif params['optimizer'] == 'momentum' and params['momentum'] is not None:
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'] == 'rms':
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        else:
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        # Run batch-normalization update ops before the optimizer step.
        loss = tf.tuple([loss], control_inputs=tf.get_collection(
            tf.GraphKeys.UPDATE_OPS))[0]

        if params['clip_gradient'] != 0:
            grads = tf.gradients(loss, tf.trainable_variables())
            grads, _ = tf.clip_by_global_norm(grads, params['clip_gradient'])
            grads_and_vars = list(zip(grads, tf.trainable_variables()))
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
        else:
            train_op = optimizer.minimize(loss, global_step=global_step)

        logging_hook = tf.train.LoggingTensorHook(
            tensors={
                'loss': loss,
                'ler': tf.reduce_mean(ler),
                'learning_rate': tf.reduce_mean(learning_rate),
                # 'feal_len': features['feat_len'],
                # 'feal_len2': input_features_length,
                # 'feal_len3': tf.shape(input_features),
            },
            every_n_secs=1)

        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[logging_hook],
                                          eval_metric_ops=metrics)
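# The metrics block above calls an `edit_distance(sample_ids, targets, eos_id,
# mapping)` helper that is defined elsewhere in the repository. The sketch
# below is only one plausible implementation (an assumption, not the
# repository's): it masks out everything from the first eos_id onwards,
# converts both sequences to sparse tensors and compares them with
# tf.edit_distance; the `mapping` argument is ignored here.
def _example_edit_distance(sample_ids, targets, eos_id, mapping=None):
    import tensorflow as tf

    def _mask_after_eos(ids):
        # Zero-out positions at and after the first eos_id in each sequence.
        is_eos = tf.cast(tf.equal(ids, eos_id), tf.int32)
        seen_eos = tf.cumsum(is_eos, axis=1)
        keep = tf.cast(tf.equal(seen_eos, 0), ids.dtype)
        return ids * keep

    hypothesis = tf.contrib.layers.dense_to_sparse(_mask_after_eos(sample_ids))
    truth = tf.contrib.layers.dense_to_sparse(_mask_after_eos(targets))
    return tf.edit_distance(hypothesis=tf.cast(hypothesis, tf.int32),
                            truth=tf.cast(truth, tf.int32),
                            normalize=True)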
def create_graph(self,
                 use_tfrecords=False,
                 features_tensor=None,
                 labels_tensor=None,
                 features_len_tensor=None):
    with self.graph.as_default():
        self.tf_is_traing_pl = tf.placeholder_with_default(
            True, shape=(), name='is_training')

        with tf.name_scope("seq_len"):
            if not use_tfrecords:
                self.input_features_length = tf.placeholder(
                    tf.int32, shape=[None], name="sequence_length")
            else:
                self.input_features_length = features_len_tensor

        with tf.name_scope("input_features"):
            if not use_tfrecords:
                self.input_features = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, None, self.network_data.num_features],
                    name="input")
            else:
                self.input_features = features_tensor

        with tf.name_scope("input_labels"):
            if not use_tfrecords:
                self.input_labels = tf.sparse_placeholder(
                    dtype=tf.int32, shape=[None, None], name="input_label")
            else:
                self.input_labels = labels_tensor

        self.rnn_input = tf.identity(self.input_features)
        with tf.name_scope("dense_layer_1"):
            self.rnn_input = dense_multilayer(
                input_ph=self.rnn_input,
                num_layers=self.network_data.num_dense_layers_1,
                num_units=self.network_data.num_units_1,
                name='dense_layer_1',
                activation_list=self.network_data.dense_activations_1,
                use_batch_normalization=self.network_data.batch_normalization_1,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_prob_1,
                kernel_initializers=self.network_data.kernel_init_1,
                bias_initializers=self.network_data.bias_init_1,
                tensorboard_scope='dense_layer_1')

        with tf.name_scope("RNN_cell"):
            if self.network_data.is_bidirectional:
                self.rnn_outputs = bidirectional_rnn(
                    input_ph=self.rnn_input,
                    seq_len_ph=self.input_features_length,
                    num_layers=len(self.network_data.num_fw_cell_units),
                    num_fw_cell_units=self.network_data.num_fw_cell_units,
                    num_bw_cell_units=self.network_data.num_bw_cell_units,
                    name="RNN_cell",
                    activation_fw_list=self.network_data.cell_fw_activation,
                    activation_bw_list=self.network_data.cell_bw_activation,
                    use_tensorboard=True,
                    tensorboard_scope='RNN',
                    output_size=self.network_data.rnn_output_sizes)
            else:
                self.rnn_outputs = unidirectional_rnn(
                    input_ph=self.rnn_input,
                    seq_len_ph=self.input_features_length,
                    num_layers=len(self.network_data.num_cell_units),
                    num_cell_units=self.network_data.num_cell_units,
                    name="RNN_cell",
                    activation_list=self.network_data.cell_activation,
                    use_tensorboard=True,
                    tensorboard_scope='RNN',
                    output_size=self.network_data.rnn_output_sizes)

        with tf.name_scope("dense_layer_2"):
            self.rnn_outputs = dense_multilayer(
                input_ph=self.rnn_outputs,
                num_layers=self.network_data.num_dense_layers_2,
                num_units=self.network_data.num_units_2,
                name='dense_layer_2',
                activation_list=self.network_data.dense_activations_2,
                use_batch_normalization=self.network_data.batch_normalization_2,
                train_ph=self.tf_is_traing_pl,
                use_tensorboard=True,
                keep_prob_list=self.network_data.keep_prob_2,
                kernel_initializers=self.network_data.kernel_init_2,
                bias_initializers=self.network_data.bias_init_2,
                tensorboard_scope='dense_layer_2')

        with tf.name_scope("dense_output"):
            self.dense_output_no_activation = dense_layer(
                input_ph=self.rnn_outputs,
                num_units=self.network_data.num_classes,
                name='dense_output_no_activation',
                activation=None,
                use_batch_normalization=False,
                train_ph=False,
                use_tensorboard=True,
                keep_prob=1,
                tensorboard_scope='dense_output')
            self.dense_output = tf.nn.softmax(self.dense_output_no_activation,
                                              name='dense_output')
            tf.summary.histogram('dense_output', self.dense_output)

        with tf.name_scope("loss"):
            rnn_loss = 0
            for var in tf.trainable_variables():
                if var.name.startswith('RNN_cell') and 'kernel' in var.name:
                    rnn_loss += tf.nn.l2_loss(var)

            dense_loss = 0
            for var in tf.trainable_variables():
                if ((var.name.startswith('dense_layer') or
                     var.name.startswith('input_dense_layer')) and
                        'kernel' in var.name):
                    dense_loss += tf.nn.l2_loss(var)

            loss = tf.nn.ctc_loss(self.input_labels,
                                  self.dense_output_no_activation,
                                  self.input_features_length,
                                  time_major=False)
            self.logits_loss = tf.reduce_mean(tf.reduce_sum(loss))
            self.loss = self.logits_loss \
                + self.network_data.rnn_regularizer * rnn_loss \
                + self.network_data.dense_regularizer * dense_loss
            tf.summary.scalar('loss', self.loss)

        # define the optimizer
        with tf.name_scope("training"):
            self.train_op = self.network_data.optimizer.minimize(self.loss)

        with tf.name_scope("decoder"):
            self.output_time_major = tf.transpose(self.dense_output, (1, 0, 2))
            self.decoded, log_prob = self.network_data.decoder_function(
                self.output_time_major, self.input_features_length)

        with tf.name_scope("label_error_rate"):
            # Inaccuracy: label error rate
            self.ler = tf.reduce_mean(
                tf.edit_distance(hypothesis=tf.cast(self.decoded[0], tf.int32),
                                 truth=self.input_labels,
                                 normalize=True))
            tf.summary.scalar('label_error_rate', tf.reduce_mean(self.ler))

        self.checkpoint_saver = tf.train.Saver(save_relative_paths=True)
        self.merged_summary = tf.summary.merge_all()
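# Hypothetical evaluation sketch (an assumption, not repository code): running
# the LER and merged summaries of the graph built above on a validation batch,
# with dropout disabled through the is_training placeholder. `model` is the
# object owning create_graph (built with use_tfrecords=False) and
# `sparse_labels` is a tf.SparseTensorValue for the reference transcriptions.
def _example_validate(model, sess, summary_writer, step,
                      features, seq_lens, sparse_labels):
    feed_dict = {
        model.input_features: features,
        model.input_features_length: seq_lens,
        model.input_labels: sparse_labels,
        model.tf_is_traing_pl: False,
    }
    ler, summary = sess.run([model.ler, model.merged_summary],
                            feed_dict=feed_dict)
    summary_writer.add_summary(summary, global_step=step)
    return ler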