def insight(self, data, spklist, batch_type="softmax", output_embeddings=False, aux_data=None):
    """Only used to debug the network."""
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(tf.local_variables_initializer())
    assert batch_type == "softmax" or batch_type == "end2end", "The batch_type can only be softmax or end2end"

    embeddings_val = None
    labels_val = None
    self.load()

    if output_embeddings:
        # If we want to output embeddings, the features should be loaded in order
        data_loader = KaldiDataSeqQueue(data, spklist,
                                        num_parallel=2,
                                        max_qsize=10,
                                        batch_size=self.params.num_speakers_per_batch * self.params.num_segments_per_speaker,
                                        min_len=self.params.min_segment_len,
                                        max_len=self.params.max_segment_len,
                                        shuffle=False)
        data_loader.start()

        tf.logging.info("Generate valid embeddings.")
        # In this mode, the embeddings and labels will be saved and output. It needs more memory and takes longer
        # to process these values.
        while True:
            try:
                features, labels = data_loader.fetch()
                valid_emb_val, valid_labels_val, endpoints_val = self.sess.run(
                    [self.embeddings, self.valid_labels, self.endpoints],
                    feed_dict={self.valid_features: features,
                               self.valid_labels: labels})

                # acc = np.sum(np.equal(np.argmax(endpoints_val['logits'], axis=1), labels).astype(np.float32)) / float(labels.shape[0])
                # print("Acc: %f" % acc)

                # Save the embeddings and labels
                if embeddings_val is None:
                    embeddings_val = valid_emb_val
                    labels_val = valid_labels_val
                else:
                    embeddings_val = np.concatenate((embeddings_val, valid_emb_val), axis=0)
                    labels_val = np.concatenate((labels_val, valid_labels_val), axis=0)
            except DataOutOfRange:
                break
        data_loader.stop()

    if batch_type == "softmax":
        data_loader = KaldiDataSeqQueue(data, spklist,
                                        num_parallel=2,
                                        max_qsize=10,
                                        batch_size=self.params.num_speakers_per_batch * self.params.num_segments_per_speaker * 10,
                                        min_len=self.params.min_segment_len,
                                        max_len=self.params.max_segment_len,
                                        shuffle=True)
    elif batch_type == "end2end":
        # The num_valid_speakers_per_batch and num_valid_segments_per_speaker are only required when
        # the end-to-end loss is used, since we switch the loss function from softmax to the
        # generalized e2e loss in that case.
        assert "num_valid_speakers_per_batch" in self.params.dict and "num_valid_segments_per_speaker" in self.params.dict, \
            "Valid parameters should be set if E2E loss is selected"
        data_loader = KaldiDataRandomQueue(data, spklist,
                                           num_parallel=2,
                                           max_qsize=10,
                                           num_speakers=self.params.num_valid_speakers_per_batch,
                                           num_segments=self.params.num_valid_segments_per_speaker,
                                           min_len=self.params.min_segment_len,
                                           max_len=self.params.max_segment_len,
                                           shuffle=True)
    else:
        raise ValueError("Unknown batch_type: %s" % batch_type)

    data_loader.start()
    while True:
        try:
            features, labels = data_loader.fetch()
            _, endpoints_val = self.sess.run([self.valid_ops["valid_loss_op"], self.endpoints],
                                             feed_dict={self.valid_features: features,
                                                        self.valid_labels: labels})
        except DataOutOfRange:
            break
    data_loader.stop()

    loss = self.sess.run(self.valid_ops["valid_loss"])
    tf.logging.info("Shorter segments are used to test the valid loss (%d-%d)"
                    % (self.params.min_segment_len, self.params.max_segment_len))
    tf.logging.info("Loss: %f" % loss)

    # while True:
    #     try:
    #         features, labels = data_loader.fetch()
    #         valid_ops, endpoints_val = self.sess.run([self.valid_ops, self.endpoints],
    #                                                  feed_dict={self.valid_features: features,
    #                                                             self.valid_labels: labels})
    #         loss = valid_ops["valid_loss"]
    #     except DataOutOfRange:
    #         break
    # data_loader.stop()
    # tf.logging.info("Loss: %f" % loss)

    # Accuracy on the last batch. Note: `np.equal(..., dtype=np.float)` raises in recent NumPy,
    # so cast explicitly instead.
    acc = np.sum(np.equal(np.argmax(endpoints_val['logits'], axis=1), labels).astype(np.float32)) / float(labels.shape[0])
    print("Acc: %f" % acc)

    # Deliberate breakpoint: this method exists to inspect the network interactively.
    import pdb
    pdb.set_trace()

    # from model.test_utils import softmax
    # with tf.variable_scope("softmax", reuse=True):
    #     test = tf.get_variable("output/kernel")
    #     test_val = self.sess.run(test)
    return loss, embeddings_val, labels_val
if os.path.isfile(os.path.join(model_dir, "learning_rate")):
    learning_rate_array = load_lr(os.path.join(model_dir, "learning_rate"))
    assert len(learning_rate_array) == start_epoch + 1, "Not enough learning rates in the learning_rate file."
else:
    learning_rate_array = [float(learning_rate)] * (start_epoch + 1)

dim = FeatureReader(args.train_dir).get_dim()
if "selected_dim" in params.dict:
    dim = params.selected_dim
with open(os.path.join(model_dir, "feature_dim"), "w") as f:
    f.write("%d\n" % dim)

num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

min_valid_loss = ValidLoss()
if os.path.isfile(os.path.join(model_dir, "valid_loss")):
    min_valid_loss = load_valid_loss(os.path.join(model_dir, "valid_loss"))

# The trainer is used to control the training process
if args.num_gpus == 1:
    trainer = Trainer(params, args.finetune_model, dim, num_total_train_speakers)
else:
    if (params.num_speakers_per_batch * params.num_segments_per_speaker) % args.num_gpus != 0:
def valid(self, data, spklist, batch_type="softmax", output_embeddings=False, aux_data=None):
    """Evaluate on the validation set.

    Args:
        data: The training data directory.
        spklist: The spklist is a file that maps speaker names to indices.
        batch_type: `softmax` or `end2end`. The batch is `softmax-like` or `end2end-like`.
                    If the batch is `softmax-like`, each sample is from a different speaker;
                    if the batch is `end2end-like`, the samples are from N speakers with M segments per speaker.
        output_embeddings: Set True to output the corresponding embeddings and labels of the valid set.
                           If output_embeddings is True, an additional valid metric (e.g. EER) should be
                           computed outside the function.
        aux_data: The auxiliary data directory.
    :return: valid_loss, embeddings and labels (None if output_embeddings is False).
    """
    # Initialization will reset all the variables in the graph.
    # The local variables also need to be initialized for the metrics functions.
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(tf.local_variables_initializer())
    assert batch_type == "softmax" or batch_type == "end2end", "The batch_type can only be softmax or end2end"

    curr_step = 0
    # Load the model. The valid function can only be called after training (of course...)
    if os.path.isfile(os.path.join(self.model, "checkpoint")):
        curr_step = self.load()
    else:
        tf.logging.info("[Warning] Cannot find model in %s. Random initialization is used in validation." % self.model)

    embeddings_val = None
    labels_val = None
    num_batches = 0

    if output_embeddings:
        # If we want to output embeddings, the features should be loaded in order
        data_loader = KaldiDataSeqQueue(data, spklist,
                                        num_parallel=2,
                                        max_qsize=10,
                                        batch_size=self.params.num_speakers_per_batch * self.params.num_segments_per_speaker,
                                        min_len=self.params.min_segment_len,
                                        max_len=self.params.max_segment_len,
                                        shuffle=False)
        data_loader.start()

        tf.logging.info("Generate valid embeddings.")
        # In this mode, the embeddings and labels will be saved and output. It needs more memory and takes longer
        # to process these values.
        while True:
            try:
                if num_batches % 100 == 0:
                    tf.logging.info("valid step: %d" % num_batches)
                features, labels = data_loader.fetch()
                valid_emb_val, valid_labels_val = self.sess.run(
                    [self.embeddings, self.valid_labels],
                    feed_dict={self.valid_features: features,
                               self.valid_labels: labels,
                               self.global_step: curr_step})

                # Save the embeddings and labels
                if embeddings_val is None:
                    embeddings_val = valid_emb_val
                    labels_val = valid_labels_val
                else:
                    embeddings_val = np.concatenate((embeddings_val, valid_emb_val), axis=0)
                    labels_val = np.concatenate((labels_val, valid_labels_val), axis=0)
                num_batches += 1
            except DataOutOfRange:
                break
        data_loader.stop()

    if batch_type == "softmax":
        data_loader = KaldiDataSeqQueue(data, spklist,
                                        num_parallel=2,
                                        max_qsize=10,
                                        batch_size=self.params.num_speakers_per_batch * self.params.num_segments_per_speaker,
                                        min_len=self.params.min_segment_len,
                                        max_len=self.params.max_segment_len,
                                        shuffle=True)
    elif batch_type == "end2end":
        # The num_valid_speakers_per_batch and num_valid_segments_per_speaker are only required when
        # the end-to-end loss is used, since we switch the loss function from softmax to the
        # generalized e2e loss in that case.
        assert "num_valid_speakers_per_batch" in self.params.dict and "num_valid_segments_per_speaker" in self.params.dict, \
            "Valid parameters should be set if E2E loss is selected"
        data_loader = KaldiDataRandomQueue(data, spklist,
                                           num_parallel=2,
                                           max_qsize=10,
                                           num_speakers=self.params.num_valid_speakers_per_batch,
                                           num_segments=self.params.num_valid_segments_per_speaker,
                                           min_len=self.params.min_segment_len,
                                           max_len=self.params.max_segment_len,
                                           shuffle=True)
    else:
        raise ValueError("Unknown batch_type: %s" % batch_type)

    data_loader.start()
    num_batches = 0
    for _ in range(self.params.valid_max_iterations):
        try:
            if num_batches % 100 == 0:
                tf.logging.info("valid step: %d" % num_batches)
            features, labels = data_loader.fetch()
            _ = self.sess.run(self.valid_ops["valid_loss_op"],
                              feed_dict={self.valid_features: features,
                                         self.valid_labels: labels,
                                         self.global_step: curr_step})
            num_batches += 1
        except DataOutOfRange:
            break
    data_loader.stop()

    loss, summary = self.sess.run([self.valid_ops["valid_loss"], self.valid_summary])
    # We only save the summary for the last batch.
    self.valid_summary_writer.add_summary(summary, curr_step)
    # The valid loss is averaged over all the batches.
    tf.logging.info("[Validation %d batches] valid loss: %f" % (num_batches, loss))

    # The output embeddings and labels can be used to compute EER or other metrics
    return loss, embeddings_val, labels_val
def train_tune_lr(self, data, spklist, tune_period=100, aux_data=None):
    """Tune the learning rate.

    According to: https://www.kdnuggets.com/2017/11/estimating-optimal-learning-rate-deep-neural-network.html

    Args:
        data: The training data directory.
        spklist: The spklist is a file that maps speaker names to indices.
        tune_period: How many steps per learning rate.
        aux_data: The auxiliary data directory.
    """
    # initialize all variables
    self.sess.run(tf.global_variables_initializer())

    # We need to load the model sometimes, since we may try to find the learning rate for fine-tuning.
    if os.path.isfile(os.path.join(self.model, "checkpoint")):
        self.load()

    data_loader = KaldiDataRandomQueue(data, spklist,
                                       num_parallel=self.params.num_parallel_datasets,
                                       max_qsize=self.params.max_queue_size,
                                       num_speakers=self.params.num_speakers_per_batch,
                                       num_segments=self.params.num_segments_per_speaker,
                                       min_len=self.params.min_segment_len,
                                       max_len=self.params.max_segment_len,
                                       shuffle=True)
    data_loader.start()

    # The learning rate normally varies from 1e-5 to 1.
    # Some common values:
    # 1. factor = 1.15
    #    tune_period = 200
    #    tune_times = 100
    init_learning_rate = 1e-5
    factor = 1.15
    tune_times = 100

    fp_lr = open(os.path.join(self.model, "learning_rate_tuning"), "w")
    for step in range(tune_period * tune_times):
        lr = init_learning_rate * (factor ** (step // tune_period))
        try:
            if step % tune_period == 0:
                train_ops = [self.train_ops, self.train_op, self.train_summary]
                # train_ops = [self.train_ops, self.train_op]
                start_time = time.time()
                features, labels = data_loader.fetch()
                train_val = self.sess.run(train_ops,
                                          feed_dict={self.train_features: features,
                                                     self.train_labels: labels,
                                                     self.global_step: 0,
                                                     self.learning_rate: lr})
                end_time = time.time()
                tf.logging.info(
                    "Epoch: step: %2d, time: %.4f s/step, lr: %f, raw loss: %f, total loss: %f"
                    % (step, end_time - start_time, lr, train_val[0]["raw_loss"], train_val[0]["loss"]))
                fp_lr.write("%d %f %f\n" % (step, lr, train_val[0]["loss"]))
                self.summary_writer.add_summary(train_val[-1], step)
            else:
                features, labels = data_loader.fetch()
                _ = self.sess.run(self.train_op,
                                  feed_dict={self.train_features: features,
                                             self.train_labels: labels,
                                             self.global_step: 0,
                                             self.learning_rate: lr})
        except DataOutOfRange:
            tf.logging.info("Finished reading features.")
            break
    data_loader.stop()
    fp_lr.close()
    return
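
# A hedged sketch of how the "learning_rate_tuning" file written above could be
# post-processed to pick a starting learning rate, following the linked kdnuggets
# recipe (choose the rate at which the smoothed loss is still falling fastest).
# The helper name, the smoothing window, and the selection rule are assumptions,
# not part of the repository.
import numpy as np

def suggest_learning_rate(tuning_file, smooth=3):
    """Return the learning rate at which the smoothed loss decreases fastest."""
    # Each line of the file is "step lr loss", as written by train_tune_lr above.
    records = np.loadtxt(tuning_file)
    lrs, losses = records[:, 1], records[:, 2]
    # Light smoothing before differentiating, so a single noisy batch does not dominate.
    smoothed = np.convolve(losses, np.ones(smooth) / smooth, mode="same")
    # The most negative gradient marks the steepest descent of the loss curve.
    return lrs[np.argmin(np.gradient(smoothed))]

# Example usage (hypothetical path):
#   print(suggest_learning_rate(os.path.join(model_dir, "learning_rate_tuning")))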
def train(self, data, spklist, learning_rate, aux_data=None):
    """Train the model.

    Args:
        data: The training data directory.
        spklist: The spklist is a file that maps speaker names to indices.
        learning_rate: The learning rate is passed by the main program. The main program can easily tune the
                       learning rate according to the validation accuracy or anything else.
        aux_data: The auxiliary data (maybe useful in child classes).
    """
    # initialize all variables
    self.sess.run(tf.global_variables_initializer())

    # curr_step is the real step the training is at.
    curr_step = 0
    # Load the model if we have one
    if os.path.isfile(os.path.join(self.model, "checkpoint")):
        curr_step = self.load()

    # The data loader
    data_loader = KaldiDataRandomQueue(data, spklist,
                                       num_parallel=self.params.num_parallel_datasets,
                                       max_qsize=self.params.max_queue_size,
                                       num_speakers=self.params.num_speakers_per_batch,
                                       num_segments=self.params.num_segments_per_speaker,
                                       min_len=self.params.min_segment_len,
                                       max_len=self.params.max_segment_len,
                                       shuffle=True)
    data_loader.start()

    epoch = int(curr_step / self.params.num_steps_per_epoch)
    for step in range(curr_step % self.params.num_steps_per_epoch, self.params.num_steps_per_epoch):
        try:
            if step % self.params.save_summary_steps == 0 or step % self.params.show_training_progress == 0:
                train_ops = [self.train_ops, self.train_op]
                if step % self.params.save_summary_steps == 0:
                    train_ops.append(self.train_summary)
                start_time = time.time()
                features, labels = data_loader.fetch()
                train_val = self.sess.run(train_ops,
                                          feed_dict={self.train_features: features,
                                                     self.train_labels: labels,
                                                     self.global_step: curr_step,
                                                     self.learning_rate: learning_rate})
                end_time = time.time()
                tf.logging.info(
                    "Epoch: [%2d] step: [%2d/%2d] time: %.4f s/step, raw loss: %f, total loss: %f"
                    % (epoch, step, self.params.num_steps_per_epoch,
                       end_time - start_time, train_val[0]["raw_loss"], train_val[0]["loss"]))
                if step % self.params.save_summary_steps == 0:
                    self.summary_writer.add_summary(train_val[-1], curr_step)
            else:
                # Only run the optimizer.
                features, labels = data_loader.fetch()
                _ = self.sess.run(self.train_op,
                                  feed_dict={self.train_features: features,
                                             self.train_labels: labels,
                                             self.global_step: curr_step,
                                             self.learning_rate: learning_rate})

            if step % self.params.save_checkpoints_steps == 0 and curr_step != 0:
                self.save(curr_step)
            curr_step += 1
        except DataOutOfRange:
            tf.logging.info("Finished reading features.")
            break

    data_loader.stop()
    self.save(curr_step)
    return
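
# A hedged sketch of how a main script might drive `train` and `valid` epoch by
# epoch, matching the docstring's note that the learning rate is controlled by
# the caller. `start_epoch`, `params.num_epochs`, `args.valid_dir`,
# `args.valid_spklist`, and the halving rule are illustrative assumptions, not
# the repository's run script.
best_valid_loss = float("inf")
for epoch in range(start_epoch, params.num_epochs):
    trainer.train(args.train_dir, args.train_spklist, learning_rate_array[epoch])
    valid_loss, _, _ = trainer.valid(args.valid_dir, args.valid_spklist)
    # E.g. halve the learning rate for the next epoch when the valid loss stops improving.
    next_lr = learning_rate_array[epoch] / 2.0 if valid_loss >= best_valid_loss else learning_rate_array[epoch]
    best_valid_loss = min(best_valid_loss, valid_loss)
    learning_rate_array.append(next_lr)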
def train(self, data, spklist, learning_rate, aux_data=None):
    """Train the model.

    Args:
        data: The training data directory.
        spklist: The spklist is a file that maps speaker names to indices.
        learning_rate: The learning rate is passed by the main program. The main program can easily tune the
                       learning rate according to the validation accuracy or anything else.
        aux_data: The auxiliary data (maybe useful in child classes).
    """
    # initialize all variables

    # graph = tf.get_default_graph()
    # kernel_six = graph.get_tensor_by_name('tdnn_svd6/tdnn6.5_dense/kernel:0')
    #
    # def get_semi_orthogonal(mat):
    #     # print(mat.shape)
    #     M = tf.transpose(mat)
    #     # M = mat
    #     I = tf.Variable(np.identity(M.shape[0]), dtype=tf.float32)
    #     for _ in range(10):
    #         P = tf.matmul(M, M, transpose_b=True)
    #         alpha2 = tf.divide(tf.trace(tf.matmul(P, P, transpose_b=True)), tf.trace(P))
    #         M = M - (1 / (2.0 * alpha2)) * tf.matmul(tf.subtract(P, alpha2 * I), M)
    #     P = tf.matmul(M, M, transpose_b=True)
    #     alpha2 = tf.divide(tf.trace(tf.matmul(P, P, transpose_b=True)), tf.trace(P))
    #     M = M / alpha2
    #     return tf.transpose(M)
    #
    # semi = get_semi_orthogonal(kernel_six)
    # semi_op = tf.assign(kernel_six, semi)

    self.sess.run(tf.global_variables_initializer())

    # curr_step is the real step the training is at.
    curr_step = 0
    # Load the model if we have one
    if os.path.isfile(os.path.join(self.model, "checkpoint")):
        curr_step = self.load()

    # The data loader
    data_loader = KaldiDataRandomQueue(data, spklist,
                                       num_parallel=self.params.num_parallel_datasets,
                                       max_qsize=self.params.max_queue_size,
                                       num_speakers=self.params.num_speakers_per_batch,
                                       num_segments=self.params.num_segments_per_speaker,
                                       min_len=self.params.min_segment_len,
                                       max_len=self.params.max_segment_len,
                                       shuffle=True)
    epoch = int(curr_step / self.params.num_steps_per_epoch)
    data_loader.start()

    for step in range(curr_step % self.params.num_steps_per_epoch, self.params.num_steps_per_epoch):
        try:
            # if step % 4 == 0:
            #     # Apply the SEMI-ORTHOGONAL constraint
            #     self.sess.run(semi_op)

            if step % self.params.save_summary_steps == 0 or step % self.params.show_training_progress == 0:
                train_ops = [self.train_ops, self.train_op]
                if step % self.params.save_summary_steps == 0:
                    train_ops.append(self.train_summary)
                start_time = time.time()
                features, labels = data_loader.fetch()
                train_val = self.sess.run(train_ops,
                                          feed_dict={self.train_features: features,
                                                     self.train_labels: labels,
                                                     self.global_step: curr_step,
                                                     self.learning_rate: learning_rate})
                end_time = time.time()
                tf.logging.info(
                    "Epoch: [%2d] step: [%2d/%2d] time: %.4f s/step, raw loss: %f, total loss: %f"
                    % (epoch, step, self.params.num_steps_per_epoch,
                       end_time - start_time, train_val[0]["raw_loss"], train_val[0]["loss"]))
                if step % self.params.save_summary_steps == 0:
                    self.summary_writer.add_summary(train_val[-1], curr_step)
            else:
                # Only run the optimizer.
                features, labels = data_loader.fetch()
                _ = self.sess.run(self.train_op,
                                  feed_dict={self.train_features: features,
                                             self.train_labels: labels,
                                             self.global_step: curr_step,
                                             self.learning_rate: learning_rate})

            if step % self.params.save_checkpoints_steps == 0 and curr_step != 0:
                self.save(curr_step)
            curr_step += 1
        except DataOutOfRange:
            tf.logging.info("Finished reading features.")
            break

    data_loader.stop()
    self.save(curr_step)
    return
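
# A minimal NumPy sketch of the update the commented-out TensorFlow block above
# applies to the factorized TDNN kernel: iteratively pushing M * M^T towards a
# scaled identity (a semi-orthogonal constraint). The stand-alone function form
# and its name are illustrative; the iteration and the final rescaling by
# alpha^2 mirror the commented code.
import numpy as np

def semi_orthogonal(mat, num_iters=10):
    """Return a copy of `mat` nudged towards satisfying M * M^T ~ alpha^2 * I."""
    m = mat.T.astype(np.float32)
    eye = np.identity(m.shape[0], dtype=np.float32)
    for _ in range(num_iters):
        p = np.dot(m, m.T)
        alpha2 = np.trace(np.dot(p, p.T)) / np.trace(p)
        m = m - (1.0 / (2.0 * alpha2)) * np.dot(p - alpha2 * eye, m)
    # Final rescaling, as in the commented block.
    p = np.dot(m, m.T)
    alpha2 = np.trace(np.dot(p, p.T)) / np.trace(p)
    m = m / alpha2
    return m.T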
# Disable GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

if __name__ == '__main__':
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)

    nnet_dir = os.path.join(args.model_dir, "nnet")
    config_json = os.path.join(args.model_dir, "nnet/config.json")
    if not os.path.isfile(config_json):
        sys.exit("Cannot find config.json in %s" % config_json)
    params = Params(config_json)

    # First, we need to extract the weights
    num_total_train_speakers = KaldiDataRandomQueue(os.path.dirname(args.spklist), args.spklist).num_total_speakers
    dim = FeatureReader(os.path.dirname(args.spklist)).get_dim()
    if "selected_dim" in params.dict:
        dim = params.selected_dim

    trainer = Trainer(params, args.model_dir, dim, num_total_train_speakers, single_cpu=True)
    trainer.build("valid")
    trainer.sess.run(tf.global_variables_initializer())
    trainer.sess.run(tf.local_variables_initializer())

    if not args.init:
        curr_step = trainer.load()
    else: