def _read_rows(node, rows):
    """Read the rows listed in `rows` from `node`, one slice per contiguous run.

    Args:
        node: Array-like object supporting `node[start:stop]` slicing along its
            first axis and returning a NumPy array (here an HDF5 array node).
        rows: 1-D sequence of integer row indices.

    Returns:
        A NumPy array holding exactly `len(rows)` rows, in the order given.
    """
    rows = np.asarray(rows)
    if rows.size == 0:
        return node[0:0]
    # Split wherever two neighbouring indices are not adjacent, so that every
    # run can be fetched with a single slice and no unrelated rows are read.
    breaks = np.flatnonzero(np.diff(rows) != 1) + 1
    chunks = [node[int(run[0]):int(run[-1]) + 1]
              for run in np.split(rows, breaks)]
    return np.concatenate(chunks, axis=0)


def main(_):
    """Train and evaluate the speech/mouth networks with k-fold cross validation.

    Builds the graph (two networks from `nets_factory`, an L2 distance between
    their outputs, a contrastive loss and summaries), then for every fold
    produced by `KFold` trains for a fixed number of epochs on the training
    indices and scores the held-out indices, printing EER/AUC per fold and
    their average over all folds.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If `FLAGS.dataset_dir` is not set.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)

    # Number of cross-validation folds; also the divisor of the final averages.
    num_folds = 5

    # Load the dataset
    fileh = tables.open_file(
        '/home/sina/datasets/lip_read_features/lipread_train.hdf5', mode='r')
    try:
        graph = tf.Graph()
        with graph.as_default():
            ######################
            # Config model_deploy#
            ######################
            deploy_config = model_deploy.DeploymentConfig(
                num_clones=FLAGS.num_clones,
                clone_on_cpu=FLAGS.clone_on_cpu,
                replica_id=FLAGS.task,
                num_replicas=FLAGS.worker_replicas,
                num_ps_tasks=FLAGS.num_ps_tasks)

            # Create global_step
            global_step = tf.Variable(0, name='global_step', trainable=False)

            ###############################
            # Select and load the dataset #
            ###############################
            dataset = dataset_factory.get_dataset(
                FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

            # Get the mean vectors, reshaped so they broadcast over a batch.
            mean_mouth = np.load(
                '/home/sina/GITHUB/LIPREAD_PROJECT/data_preprocessing/mean_mouth.npy')
            mean_mouth = mean_mouth.reshape(
                (1, fileh.root.mouth.shape[1], fileh.root.mouth.shape[2],
                 fileh.root.mouth.shape[3]))

            mean_speech = np.load(
                '/home/sina/GITHUB/LIPREAD_PROJECT/data_preprocessing/mean_speech.npy')
            mean_speech = mean_speech.reshape(
                (1, fileh.root.speech.shape[1], fileh.root.speech.shape[2],
                 fileh.root.speech.shape[3]))

            ############################################
            ######### Cross Validation Section #########
            ############################################
            num_samples_per_epoch = fileh.root.label.shape[0]
            X = np.arange(num_samples_per_epoch)
            kf = KFold(n_splits=num_folds)

            ######################
            # Select the network #
            ######################
            network_speech_fn = nets_factory.get_network_fn(
                FLAGS.model_speech_name,
                num_classes=(dataset.num_classes - FLAGS.labels_offset),
                weight_decay=FLAGS.weight_decay,
                is_training=True)

            network_mouth_fn = nets_factory.get_network_fn(
                FLAGS.model_mouth_name,
                num_classes=(dataset.num_classes - FLAGS.labels_offset),
                weight_decay=FLAGS.weight_decay,
                is_training=True)

            #####################################
            # Select the preprocessing function #
            #####################################
            # TODO: Do some preprocessing if necessary.

            ##############################################################
            # Create a dataset provider that loads data from the dataset #
            ##############################################################
            with tf.device(deploy_config.inputs_device()):
                # Define the placeholders and create the batch tensors.
                # The batch tensors themselves are fed through feed_dict in the
                # loops below; the per-example placeholders are never fed.
                mouth = tf.placeholder(tf.float32, (47, 73, 9))
                speech = tf.placeholder(tf.float32, (13, 15, 1))
                label = tf.placeholder(tf.uint8, (1))

                # Create the batch tensors
                batch_speech, batch_mouth, batch_labels = tf.train.batch(
                    [speech, mouth, label],
                    batch_size=FLAGS.batch_size,
                    num_threads=FLAGS.num_preprocessing_threads,
                    capacity=5 * FLAGS.batch_size)

            ####################
            # Run the model    #
            ####################
            # Outputs of two networks
            logits_speech, end_points_speech = network_speech_fn(batch_speech)
            logits_mouth, end_points_mouth = network_mouth_fn(batch_mouth)

            #############################
            # Specify the loss function #
            #############################
            # Two distance metrics are defined:
            #   1 - distance_weighted: a weighted average of the distance
            #       between the two structures (built but not used by the loss).
            #   2 - distance_l2: the regular l2-norm of the two networks outputs.

            #### Weighted distance ######
            distance_vector = tf.subtract(logits_speech, logits_mouth, name=None)
            distance_weighted = slim.fully_connected(
                distance_vector, 1, activation_fn=tf.nn.sigmoid,
                normalizer_fn=None, scope='fc_weighted')

            #### Euclidean distance ####
            distance_l2 = tf.sqrt(
                tf.reduce_sum(
                    tf.pow(tf.subtract(logits_speech, logits_mouth), 2),
                    1, keep_dims=True))

            #### Contrastive loss #####
            loss = losses.contrastive_loss(batch_labels, distance_l2, margin=50)

            # Adding the accuracy metric
            # NOTE(review): `labels` is the argmax over an axis of size 1
            # (distance_l2 keeps dims), i.e. always 0, and never looks at
            # batch_labels. The op is only stored in a collection here; confirm
            # the intended definition before relying on it.
            with tf.name_scope('accuracy'):
                predictions = tf.to_int64(tf.sign(tf.sign(distance_l2 - 0.5) + 1))
                labels = tf.argmax(distance_l2, 1)
                accuracy = tf.reduce_mean(
                    tf.to_float(tf.equal(predictions, labels)))
                tf.add_to_collection('accuracy', accuracy)

            # Gather initial summaries.
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            # Add summaries for all end_points.
            for end_point in end_points_speech:
                x = end_points_speech[end_point]
                summaries.add(
                    tf.summary.histogram('activations_speech/' + end_point, x))
                summaries.add(
                    tf.summary.scalar('sparsity_speech/' + end_point,
                                      tf.nn.zero_fraction(x)))

            for end_point in end_points_mouth:
                x = end_points_mouth[end_point]
                summaries.add(
                    tf.summary.histogram('activations_mouth/' + end_point, x))
                summaries.add(
                    tf.summary.scalar('sparsity_mouth/' + end_point,
                                      tf.nn.zero_fraction(x)))

            # Add summaries for variables.
            for variable in slim.get_model_variables():
                summaries.add(tf.summary.histogram(variable.op.name, variable))

            #################################
            # Configure the moving averages #
            #################################
            if FLAGS.moving_average_decay:
                moving_average_variables = slim.get_model_variables()
                variable_averages = tf.train.ExponentialMovingAverage(
                    FLAGS.moving_average_decay, global_step)
            else:
                moving_average_variables, variable_averages = None, None

            #########################################
            # Configure the optimization procedure. #
            #########################################
            learning_rate = _configure_learning_rate(num_samples_per_epoch,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)

            # Add to parameters to summaries
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))
            summaries.add(tf.summary.scalar('global_step', global_step))

            if FLAGS.sync_replicas:
                # Wrap the optimizer *before* calling minimize so that the
                # object handed to SyncReplicasOptimizer is an optimizer and the
                # thing fetched in the training loop is an op.
                # NOTE(review): running a SyncReplicasOptimizer from a plain
                # tf.Session may need additional initialisation - confirm.
                optimizer = tf.train.SyncReplicasOptimizer(
                    opt=optimizer,
                    replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                    variable_averages=variable_averages,
                    variables_to_average=moving_average_variables,
                    replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                    total_num_replicas=FLAGS.worker_replicas)

            train_op = optimizer.minimize(loss)

            if FLAGS.moving_average_decay and not FLAGS.sync_replicas:
                # Moving-average update executed locally together with the
                # gradient step.
                train_op = tf.group(
                    train_op,
                    variable_averages.apply(moving_average_variables))

            summaries.add(tf.summary.scalar('eval/Loss', loss))
            summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            # Merge all summaries together.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        with tf.Session(graph=graph) as sess:
            # Initialization of the network.
            saver = tf.train.Saver(slim.get_variables_to_restore())
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            num_epoch = 3

            # # Restore the model
            # saver.restore(sess, '/home/sina/TRAIN_LIPREAD/train_logs-1366')

            # op to write logs to Tensorboard
            summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=graph)

            batch_size = FLAGS.batch_size
            step = 1
            EER_AVERAGE = 0.0
            AUC_AVERAGE = 0.0

            # NOTE(review): variables are initialised once, so later folds start
            # from weights already trained on earlier folds - confirm intended.
            for train_index, test_index in kf.split(X):
                num_batches_train_per_epoch = int(
                    train_index.shape[0] / batch_size)

                for epoch in range(num_epoch):
                    # Loop over all batches
                    for i in range(num_batches_train_per_epoch):
                        step += 1
                        # Take the batch from the fold's own index list so the
                        # held-out rows are never read and the last batch does
                        # not index past the end of train_index.
                        rows = train_index[i * batch_size:(i + 1) * batch_size]
                        speech = _read_rows(fileh.root.speech, rows)
                        mouth = _read_rows(fileh.root.mouth, rows)
                        label = _read_rows(fileh.root.label, rows)

                        # mean subtraction
                        speech = (speech - mean_speech) / 186.0
                        mouth = (mouth - mean_mouth) / 255.0

                        # global_step is fed because minimize() is called
                        # without it and therefore never increments it.
                        _, loss_value, score_dissimilarity, summary = sess.run(
                            [train_op, loss, distance_l2, summary_op],
                            feed_dict={
                                global_step: step,
                                batch_speech: speech,
                                batch_mouth: mouth,
                                batch_labels: label.reshape([batch_size, 1])
                            })
                        # `step` is unique across folds and epochs, unlike an
                        # index derived from the per-epoch batch counter.
                        summary_writer.add_summary(summary, step)

                        # Calculate ROC data
                        EER_train, AUC_train = calculate_roc.calculate_eer_auc(
                            label, score_dissimilarity)

                        print("Cross validation train, " + "Epoch " + str(epoch + 1) + ", Minibatch " + str(
                            i + 1) + " of %d " % num_batches_train_per_epoch + ", Minibatch Loss= " + \
                              "{:.6f}".format(loss_value) + ", EER= " + "{:.5f}".format(EER_train) + ", AUC= " + "{:.5f}".format(AUC_train))

                # Save the model
                saver.save(sess, FLAGS.train_dir, global_step=step)
                print('Training is finished!! ... ')

                ### CROSS VALIDATION TEST ############
                num_batches_test_per_epoch = int(
                    test_index.shape[0] / batch_size)
                score_dissimilarity_vector = np.zeros(
                    (batch_size * num_batches_test_per_epoch, 1))
                label_vector = np.zeros(
                    (batch_size * num_batches_test_per_epoch,))

                for i in range(num_batches_test_per_epoch):
                    rows = test_index[i * batch_size:(i + 1) * batch_size]
                    speech = _read_rows(fileh.root.speech, rows)
                    mouth = _read_rows(fileh.root.mouth, rows)
                    label = _read_rows(fileh.root.label, rows)

                    # mean subtraction
                    speech = (speech - mean_speech) / 186.0
                    mouth = (mouth - mean_mouth) / 255.0

                    # Evaluation only: the training op must NOT be fetched
                    # here, otherwise the model is updated on held-out data.
                    score_dissimilarity = sess.run(
                        distance_l2,
                        feed_dict={
                            batch_speech: speech,
                            batch_mouth: mouth,
                            batch_labels: label.reshape([batch_size, 1])
                        })

                    score_dissimilarity_vector[
                        i * batch_size:(i + 1) * batch_size] = score_dissimilarity
                    label_vector[
                        i * batch_size:(i + 1) * batch_size] = label.reshape(-1)
                    print("Cross validation test, " + "Minibatch " + str(
                        i + 1) + " of %d " % num_batches_test_per_epoch)

                # Calculate ROC data
                EER_test, AUC_test = calculate_roc.calculate_eer_auc(
                    label_vector, score_dissimilarity_vector)
                print("EER_test=", EER_test)
                print("AUC_test=", AUC_test)
                EER_AVERAGE += EER_test
                AUC_AVERAGE += AUC_test
                print(EER_AVERAGE, AUC_AVERAGE)

            print("EER_AVERAGE=", EER_AVERAGE / float(num_folds))
            print("AUC_AVERAGE=", AUC_AVERAGE / float(num_folds))
    finally:
        # Release the HDF5 handle even if graph construction or training fails.
        fileh.close()
def main(_):
    """Train the speech/mouth networks with online pair selection and test per epoch.

    Builds one tower per clone (`FLAGS.num_clones`), averages the tower
    gradients, and applies them together with an exponential moving average of
    the trainable variables. Each epoch iterates over `train_data` in batches,
    optionally filters the pairs of every batch by their current distance,
    runs one training step, and saves a checkpoint at the end of the epoch.
    After every epoch `test_data` is scored and EER/AUC/AP are reported as the
    mean over K consecutive chunks of the test scores.

    Args:
        _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        ######################
        # Config model_deploy#
        ######################

        # required from data; a trailing partial batch is dropped by int().
        num_samples_per_epoch = train_data['mouth'].shape[0]
        num_batches_per_epoch = int(num_samples_per_epoch / FLAGS.batch_size)

        num_samples_per_epoch_test = test_data['mouth'].shape[0]
        num_batches_per_epoch_test = int(
            num_samples_per_epoch_test / FLAGS.batch_size)

        # Create global_step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        #########################################
        # Configure the learning rate.          #
        #########################################
        learning_rate = _configure_learning_rate(num_samples_per_epoch,
                                                 global_step)
        opt = _configure_optimizer(learning_rate)

        ######################
        # Select the network #
        ######################
        is_training = tf.placeholder(tf.bool)

        network_speech_fn = nets_factory.get_network_fn(
            FLAGS.model_speech_name,
            num_classes=2,
            weight_decay=FLAGS.weight_decay,
            is_training=is_training)

        network_mouth_fn = nets_factory.get_network_fn(
            FLAGS.model_mouth_name,
            num_classes=2,
            weight_decay=FLAGS.weight_decay,
            is_training=is_training)

        #####################################
        # Select the preprocessing function #
        #####################################
        # TODO: Do some preprocessing if necessary.

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        # Define the placeholders that receive each batch.

        # Mouth spatial set
        INPUT_SEQ_LENGTH = 9
        INPUT_HEIGHT = 60
        INPUT_WIDTH = 100
        INPUT_CHANNELS = 1
        batch_mouth = tf.placeholder(tf.float32, shape=([
            None, INPUT_SEQ_LENGTH, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNELS
        ]))

        # Speech spatial set
        INPUT_SEQ_LENGTH_SPEECH = 15
        INPUT_HEIGHT_SPEECH = 40
        INPUT_WIDTH_SPEECH = 1
        INPUT_CHANNELS_SPEECH = 3
        batch_speech = tf.placeholder(tf.float32, shape=([
            None, INPUT_SEQ_LENGTH_SPEECH, INPUT_HEIGHT_SPEECH,
            INPUT_WIDTH_SPEECH, INPUT_CHANNELS_SPEECH
        ]))

        # Label
        batch_labels = tf.placeholder(tf.uint8, (None, 1))
        margin_imp_tensor = tf.placeholder(tf.float32, ())

        ################################
        ## Feed forwarding to network ##
        ################################
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for clone_idx in range(FLAGS.num_clones):
                with tf.device('/gpu:%d' % clone_idx):
                    with tf.name_scope('%s_%d' % ('tower', clone_idx)) as scope:
                        # Every tower reads the same placeholders. After the
                        # loop, `loss`, `distance_l2` and the end_points refer
                        # to the last tower built.

                        ########################################
                        ######## Outputs of two networks #######
                        ########################################
                        logits_speech, end_points_speech = network_speech_fn(
                            batch_speech)
                        logits_mouth, end_points_mouth = network_mouth_fn(
                            batch_mouth)

                        # # Uncomment if the output embedding is desired to be as |f(x)| = 1
                        # logits_speech = tf.nn.l2_normalize(logits_speech, dim=1, epsilon=1e-12, name=None)
                        # logits_mouth = tf.nn.l2_normalize(logits_mouth, dim=1, epsilon=1e-12, name=None)

                        #################################################
                        ########### Loss Calculation ####################
                        #################################################

                        ##### Euclidean distance ####
                        distance_l2 = tf.sqrt(
                            tf.reduce_sum(
                                tf.pow(
                                    tf.subtract(logits_speech, logits_mouth),
                                    2),
                                1,
                                keep_dims=True))

                        ##### Contrastive loss ######
                        loss = losses.contrastive_loss(
                            batch_labels,
                            distance_l2,
                            margin_imp=margin_imp_tensor,
                            scope=scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on
                        # this tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # Calculate the mean of each gradient.
        grads = average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        #################################################
        ########### Summary Section #####################
        #################################################

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Add summaries for all end_points.
        for end_point in end_points_speech:
            x = end_points_speech[end_point]
            summaries.add(
                tf.summary.scalar('sparsity_speech/' + end_point,
                                  tf.nn.zero_fraction(x)))

        for end_point in end_points_mouth:
            x = end_points_mouth[end_point]
            summaries.add(
                tf.summary.scalar('sparsity_mouth/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Add to parameters to summaries
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))
        summaries.add(tf.summary.scalar('global_step', global_step))
        summaries.add(tf.summary.scalar('eval/Loss', loss))
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    ######## Training #########
    ###########################

    with tf.Session(
            graph=graph,
            config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # Initialization of the network.
        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore, max_to_keep=20)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # # Restore the model
        # saver.restore(sess, '/home/sina/TRAIN_LIPREAD/train_logs-1366')

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=graph)

        #####################################
        ############## TRAIN ################
        #####################################
        for epoch in range(FLAGS.num_epochs):

            # Loop over all batches
            for batch_num in range(num_batches_per_epoch):
                start_idx = batch_num * FLAGS.batch_size
                end_idx = (batch_num + 1) * FLAGS.batch_size
                speech_train = train_data['speech'][start_idx:end_idx]
                mouth_train = train_data['mouth'][start_idx:end_idx]
                label_train = train_label[start_idx:end_idx]

                # # # Standardization for speech if necessary
                # speech_train = (speech_train - mean_speech) / std_speech
                #
                # # # Standardization for visual if necessary
                # mouth_train = (mouth_train - mean_mouth) / std_mouth

                #########################################################################
                ################## Online Pair Selection Algorithm ######################
                #########################################################################
                online_pair_selection = True
                if online_pair_selection:
                    # Distances under the current weights, in inference mode.
                    distance = sess.run(
                        distance_l2,
                        feed_dict={
                            is_training: False,
                            batch_speech: speech_train,
                            batch_mouth: mouth_train,
                            batch_labels: label_train.reshape(
                                [FLAGS.batch_size, 1])
                        })
                    hard_margin = 10

                    # Largest distance among the genuine (label == 1) pairs.
                    max_gen = 0
                    for j in range(label_train.shape[0]):
                        if label_train[j] == 1:
                            if max_gen < distance[j, 0]:
                                max_gen = distance[j, 0]

                    # Keep every genuine pair, and only the impostor
                    # (label == 0) pairs closer than max_gen + hard_margin.
                    label_keep = []
                    for j in range(label_train.shape[0]):
                        if label_train[j] == 0:
                            if distance[j, 0] < max_gen + hard_margin:
                                label_keep.append(j)
                        elif label_train[j] == 1:
                            label_keep.append(j)

                    #### Choosing the pairs ######
                    speech_train = speech_train[label_keep]
                    mouth_train = mouth_train[label_keep]
                    label_train = label_train[label_keep]

                ############################################
                #### Running the training operation ########
                _, loss_value, score_dissimilarity, summary, training_step = sess.run(
                    [train_op, loss, distance_l2, summary_op, global_step],
                    feed_dict={
                        is_training: True,
                        margin_imp_tensor: 100,
                        batch_speech: speech_train,
                        batch_mouth: mouth_train,
                        batch_labels: label_train.reshape(
                            [label_train.shape[0], 1])
                    })
                # Index the summary by the batch counter of this loop, not by
                # an index variable left over from an earlier, unrelated loop.
                summary_writer.add_summary(
                    summary, epoch * num_batches_per_epoch + batch_num)

                # The ROC computation can fail for some batches; report and
                # carry on. Exception (not a bare except) so that Ctrl-C and
                # interpreter exit are not swallowed.
                try:
                    # Calculation of ROC
                    EER, AUC, AP, fpr, tpr = calculate_roc.calculate_eer_auc_ap(
                        label_train, score_dissimilarity)

                    if (batch_num + 1) % FLAGS.log_every_n_steps == 0:
                        print("Epoch " + str(epoch + 1) + ", Minibatch " + str(
                            batch_num + 1) + " of %d " % num_batches_per_epoch + ", Minibatch Loss= " + \
                              "{:.6f}".format(loss_value) + ", EER= " + "{:.5f}".format(EER) + ", AUC= " + "{:.5f}".format(
                            AUC) + ", AP= " + "{:.5f}".format(AP) + ", contrib = %d pairs" % label_train.shape[0])
                except Exception:
                    print("Error: ", sys.exc_info()[0])
                    print("No contributing impostor pair!")

            # Save the model
            saver.save(sess, FLAGS.train_dir, global_step=training_step)

            ###################################################
            ############## TEST PER EACH EPOCH ################
            ###################################################
            score_dissimilarity_vector = np.zeros(
                (FLAGS.batch_size * num_batches_per_epoch_test, 1))
            label_vector = np.zeros(
                (FLAGS.batch_size * num_batches_per_epoch_test, 1))

            # Loop over all batches
            for i in range(num_batches_per_epoch_test):
                start_idx = i * FLAGS.batch_size
                end_idx = (i + 1) * FLAGS.batch_size
                speech_test = test_data['speech'][start_idx:end_idx]
                mouth_test = test_data['mouth'][start_idx:end_idx]
                label_test = test_label[start_idx:end_idx]
                # One (batch, 1) view used for both feeding and storing, so the
                # store below matches label_vector's column shape.
                label_test_column = label_test.reshape([FLAGS.batch_size, 1])

                # # # Uncomment if standardization is needed
                # # mean subtraction if necessary
                # speech_test = (speech_test - mean_speech) / std_speech
                # mouth_test = (mouth_test - mean_mouth) / std_mouth

                # Evaluation phase
                # margin_imp_tensor must be fed because `loss` is fetched; the
                # loss value itself is not used during testing.
                loss_value, score_dissimilarity = sess.run(
                    [loss, distance_l2],
                    feed_dict={
                        is_training: False,
                        margin_imp_tensor: 50,
                        batch_speech: speech_test,
                        batch_mouth: mouth_test,
                        batch_labels: label_test_column
                    })
                if (i + 1) % FLAGS.log_every_n_steps == 0:
                    print("TESTING: Epoch " + str(epoch + 1) + ", Minibatch " +
                          str(i + 1) + " of %d " % num_batches_per_epoch_test)
                score_dissimilarity_vector[
                    start_idx:end_idx] = score_dissimilarity
                label_vector[start_idx:end_idx] = label_test_column

            ##############################
            ##### K-fold validation ######
            ##############################
            # The test scores are cut into K consecutive chunks and the
            # metrics are averaged over the chunks.
            K = 10
            eer_folds = np.zeros((K, 1))
            auc_folds = np.zeros((K, 1))
            ap_folds = np.zeros((K, 1))
            batch_k_validation = int(label_vector.shape[0] / float(K))

            for i in range(K):
                chunk = slice(i * batch_k_validation,
                              (i + 1) * batch_k_validation)
                eer_folds[i, :], auc_folds[i, :], ap_folds[
                    i, :], fpr, tpr = calculate_roc.calculate_eer_auc_ap(
                        label_vector[chunk],
                        score_dissimilarity_vector[chunk])

            # Printing Equal Error Rate(EER), Area Under the Curve(AUC) and Average Precision(AP)
            print("TESTING: Epoch " + str(epoch + 1) + ", EER= " +
                  str(np.mean(eer_folds, axis=0)) + ", AUC= " +
                  str(np.mean(auc_folds, axis=0)) + ", AP= " +
                  str(np.mean(ap_folds, axis=0)))