def train(self, epoch, data, spklist, aux_data=None):
    """Train the network for one epoch (PyTorch version).

    Args:
        epoch: The current epoch index, used only for logging and checkpointing.
        data: The training data directory.
        spklist: A file mapping speaker names to indices.
        aux_data: The auxiliary data directory (unused here; kept for child classes).
    """
    self.network.train()
    data_loader = KaldiDataRandomQueue(data, spklist,
                                       num_parallel=self.params.num_parallel_datasets,
                                       max_qsize=self.params.max_queue_size,
                                       num_speakers=self.params.num_speakers_per_batch,
                                       num_segments=self.params.num_segments_per_speaker,
                                       min_len=self.params.min_segment_len,
                                       max_len=self.params.max_segment_len,
                                       shuffle=True)
    data_loader.start()
    sum_loss, sum_samples = 0.0, 0
    # try/finally guarantees the loader threads are stopped even if a batch
    # fetch or a forward/backward pass raises (the original leaked them).
    try:
        # The original iterated from `curr_step % num_steps_per_epoch` with
        # curr_step always 0, i.e. a plain range over the epoch steps.
        for step in range(self.params.num_steps_per_epoch):
            features, labels = data_loader.fetch()
            sum_samples += len(features)
            features, labels = self.transform(features, labels)
            out, _ = self.network(features)
            # NOTE(review): calling empty_cache() every step is expensive;
            # kept to preserve the original memory behavior — confirm intent.
            torch.cuda.empty_cache()
            loss = self.loss_network(out, labels)
            # Weight by batch size so the logged value is a per-sample average.
            sum_loss += loss.item() * len(features)
            if step % self.params.show_training_process == 0:
                with open(os.path.join(self.model_log, "iter_loss_log"), 'a') as iter_f:
                    iter_f.write("Time:{}, Epoch:{}, Iter:{}, Loss:{}\n".format(
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                        epoch, step, sum_loss / sum_samples))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
    finally:
        data_loader.stop()
    with open(os.path.join(self.model_log, "epoch_loss_log"), 'a') as epoch_f:
        epoch_f.write("Time:{}, Epoch:{}, Loss:{}\n".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                                            epoch, sum_loss / sum_samples))
    # The checkpoint save was accidentally nested inside the log-file `with`
    # block in the original; it does not need the file handle.
    self.save(epoch=epoch, model=self.network, optimizer=self.optimizer)
예제 #2
0
    # NOTE(review): this span is the body of a fine-tuning entry point whose
    # `def` line is outside this chunk; code is kept byte-identical.
    # The model directory always has a folder named nnet
    model_dir = os.path.join(args.finetune_model, "nnet")

    # Set the random seed. The random operations may appear in data input, batch forming, etc.
    tf.set_random_seed(params.seed)
    random.seed(params.seed)
    np.random.seed(params.seed)

    # Feature dimension comes from the data unless the config overrides it.
    dim = FeatureReader(args.train_dir).get_dim()
    if "selected_dim" in params.dict:
        dim = params.selected_dim

    # Persist the feature dimension next to the model so later tools can read it.
    with open(os.path.join(model_dir, "feature_dim"), "w") as f:
        f.write("%d\n" % dim)

    num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
    tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

    # NOTE(review): min_valid_loss is assigned but never used in this span —
    # presumably consumed by code outside this chunk; verify before removing.
    min_valid_loss = ValidLoss()

    # The trainer is used to control the training process
    trainer = Trainer(params, args.finetune_model, dim, num_total_train_speakers)
    trainer.build("train")
    trainer.build("valid")

    # Load the pre-trained model and transfer to current model
    trainer.get_finetune_model(params.noload_var_list)

    # Sweep learning rates (see train_tune_lr) rather than run a full training.
    trainer.train_tune_lr(args.train_dir, args.train_spklist, args.tune_period)
    trainer.close()
    tf.logging.info("Finish tuning.")
예제 #3
0
    def insight(self,
                data,
                spklist,
                batch_type="softmax",
                output_embeddings=False,
                aux_data=None):
        """Just use to debug the network.

        Runs the valid graph over the data, reports the valid loss and the
        accuracy of the last batch, then drops into pdb so the endpoints can
        be inspected interactively.

        Args:
            data: The data directory.
            spklist: A file mapping speaker names to indices.
            batch_type: `softmax` or `end2end`; controls how batches are formed.
            output_embeddings: If True, also collect (in order) and return the
                               embeddings and labels of the whole set.
            aux_data: The auxiliary data directory (unused here).

        :return: loss, embeddings and labels (None if output_embeddings is False).
        """
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(tf.local_variables_initializer())
        assert batch_type == "softmax" or batch_type == "end2end", "The batch_type can only be softmax or end2end"

        embeddings_val = None
        labels_val = None

        self.load()

        if output_embeddings:
            # If we want to output embeddings, the features should be loaded in order
            data_loader = KaldiDataSeqQueue(
                data,
                spklist,
                num_parallel=2,
                max_qsize=10,
                batch_size=self.params.num_speakers_per_batch *
                self.params.num_segments_per_speaker,
                min_len=self.params.min_segment_len,
                max_len=self.params.max_segment_len,
                shuffle=False)
            data_loader.start()

            tf.logging.info("Generate valid embeddings.")
            # In this mode, the embeddings and labels will be saved and output. It needs more memory and takes longer
            # to process these values.
            while True:
                try:
                    features, labels = data_loader.fetch()
                    valid_emb_val, valid_labels_val, endpoints_val = self.sess.run(
                        [self.embeddings, self.valid_labels, self.endpoints],
                        feed_dict={
                            self.valid_features: features,
                            self.valid_labels: labels
                        })

                    # Accumulate the embeddings and labels across batches.
                    if embeddings_val is None:
                        embeddings_val = valid_emb_val
                        labels_val = valid_labels_val
                    else:
                        embeddings_val = np.concatenate(
                            (embeddings_val, valid_emb_val), axis=0)
                        labels_val = np.concatenate(
                            (labels_val, valid_labels_val), axis=0)
                except DataOutOfRange:
                    break
            data_loader.stop()

        if batch_type == "softmax":
            # Softmax-like batches: samples drawn sequentially, larger batch.
            data_loader = KaldiDataSeqQueue(
                data,
                spklist,
                num_parallel=2,
                max_qsize=10,
                batch_size=self.params.num_speakers_per_batch *
                self.params.num_segments_per_speaker * 10,
                min_len=self.params.min_segment_len,
                max_len=self.params.max_segment_len,
                shuffle=True)
        elif batch_type == "end2end":
            # The num_valid_speakers_per_batch and num_valid_segments_per_speaker are only required when
            # End2End loss is used. Since we switch the loss function to softmax generalized e2e loss
            # when the e2e loss is used.
            assert "num_valid_speakers_per_batch" in self.params.dict and "num_valid_segments_per_speaker" in self.params.dict, \
                "Valid parameters should be set if E2E loss is selected"
            data_loader = KaldiDataRandomQueue(
                data,
                spklist,
                num_parallel=2,
                max_qsize=10,
                num_speakers=self.params.num_valid_speakers_per_batch,
                num_segments=self.params.num_valid_segments_per_speaker,
                min_len=self.params.min_segment_len,
                max_len=self.params.max_segment_len,
                shuffle=True)
        else:
            # Unreachable due to the assert above, but keep a clear message.
            raise ValueError("The batch_type can only be softmax or end2end, got %s" % batch_type)

        data_loader.start()

        # Initialize so the accuracy block below can detect an empty data set
        # (the original raised NameError if no batch was ever fetched).
        labels = None
        endpoints_val = None
        while True:
            try:
                features, labels = data_loader.fetch()
                _, endpoints_val = self.sess.run(
                    [self.valid_ops["valid_loss_op"], self.endpoints],
                    feed_dict={
                        self.valid_features: features,
                        self.valid_labels: labels
                    })
            except DataOutOfRange:
                break
        data_loader.stop()
        loss = self.sess.run(self.valid_ops["valid_loss"])
        tf.logging.info(
            "Shorter segments are used to test the valid loss (%d-%d)" %
            (self.params.min_segment_len, self.params.max_segment_len))
        tf.logging.info("Loss: %f" % loss)

        # Accuracy is computed on the *last* fetched batch only.
        # `np.float` was removed in NumPy 1.24; use the builtin float instead.
        if endpoints_val is not None:
            acc = np.sum(
                np.equal(np.argmax(endpoints_val['logits'], axis=1),
                         labels,
                         dtype=float)) / float(labels.shape[0])
            print("Acc: %f" % acc)

        # Intentional breakpoint: this method exists for interactive debugging.
        import pdb
        pdb.set_trace()
        return loss, embeddings_val, labels_val
예제 #4
0
    def valid(self,
              data,
              spklist,
              batch_type="softmax",
              output_embeddings=False,
              aux_data=None):
        """Evaluate on the validation set

        Args:
            data: The training data directory.
            spklist: The spklist is a file map speaker name to the index.
            batch_type: `softmax` or `end2end`. The batch is `softmax-like` or `end2end-like`.
                        If the batch is `softmax-like`, each sample are from different speakers;
                        if the batch is `end2end-like`, the samples are from N speakers with M segments per speaker.
            output_embeddings: Set True to output the corresponding embeddings and labels of the valid set.
                               If output_embeddings, an additional valid metric (e.g. EER) should be computed outside
                               the function.
            aux_data: The auxiliary data directory.

        :return: valid_loss, embeddings and labels (None if output_embeddings is False).
        """
        # Initialization will reset all the variables in the graph.
        # The local variables are also need to be initialized for metrics function.
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(tf.local_variables_initializer())
        assert batch_type == "softmax" or batch_type == "end2end", "The batch_type can only be softmax or end2end"

        curr_step = 0
        # Load the model. The valid function can only be called after training (of course...)
        if os.path.isfile(os.path.join(self.model, "checkpoint")):
            curr_step = self.load()
        else:
            tf.logging.info(
                "[Warning] Cannot find model in %s. Random initialization is used in validation."
                % self.model)

        embeddings_val = None
        labels_val = None
        num_batches = 0

        if output_embeddings:
            # If we want to output embeddings, the features should be loaded in order
            data_loader = KaldiDataSeqQueue(
                data,
                spklist,
                num_parallel=2,
                max_qsize=10,
                batch_size=self.params.num_speakers_per_batch *
                self.params.num_segments_per_speaker,
                min_len=self.params.min_segment_len,
                max_len=self.params.max_segment_len,
                shuffle=False)
            data_loader.start()

            tf.logging.info("Generate valid embeddings.")
            # In this mode, the embeddings and labels will be saved and output. It needs more memory and takes longer
            # to process these values.
            while True:
                try:
                    if num_batches % 100 == 0:
                        tf.logging.info("valid step: %d" % num_batches)
                    features, labels = data_loader.fetch()
                    # NOTE(review): global_step is fed explicitly — presumably
                    # some ops in the valid graph are step-dependent; confirm.
                    valid_emb_val, valid_labels_val = self.sess.run(
                        [self.embeddings, self.valid_labels],
                        feed_dict={
                            self.valid_features: features,
                            self.valid_labels: labels,
                            self.global_step: curr_step
                        })
                    # Save the embeddings and labels
                    if embeddings_val is None:
                        embeddings_val = valid_emb_val
                        labels_val = valid_labels_val
                    else:
                        embeddings_val = np.concatenate(
                            (embeddings_val, valid_emb_val), axis=0)
                        labels_val = np.concatenate(
                            (labels_val, valid_labels_val), axis=0)
                    num_batches += 1
                except DataOutOfRange:
                    # The sequential queue raises this when the data is exhausted.
                    break
            data_loader.stop()

        if batch_type == "softmax":
            # Softmax-like batches: samples loaded sequentially with shuffling.
            data_loader = KaldiDataSeqQueue(
                data,
                spklist,
                num_parallel=2,
                max_qsize=10,
                batch_size=self.params.num_speakers_per_batch *
                self.params.num_segments_per_speaker,
                min_len=self.params.min_segment_len,
                max_len=self.params.max_segment_len,
                shuffle=True)
        elif batch_type == "end2end":
            # The num_valid_speakers_per_batch and num_valid_segments_per_speaker are only required when
            # End2End loss is used. Since we switch the loss function to softmax generalized e2e loss
            # when the e2e loss is used.
            assert "num_valid_speakers_per_batch" in self.params.dict and "num_valid_segments_per_speaker" in self.params.dict, \
                "Valid parameters should be set if E2E loss is selected"
            data_loader = KaldiDataRandomQueue(
                data,
                spklist,
                num_parallel=2,
                max_qsize=10,
                num_speakers=self.params.num_valid_speakers_per_batch,
                num_segments=self.params.num_valid_segments_per_speaker,
                min_len=self.params.min_segment_len,
                max_len=self.params.max_segment_len,
                shuffle=True)
        else:
            # Unreachable: the assert above already restricts batch_type.
            raise ValueError

        data_loader.start()
        num_batches = 0
        # Cap the number of valid iterations; the random queue never exhausts.
        for _ in range(self.params.valid_max_iterations):
            try:
                if num_batches % 100 == 0:
                    tf.logging.info("valid step: %d" % num_batches)
                features, labels = data_loader.fetch()
                # valid_loss_op updates a streaming metric; the averaged value
                # is read out once below via valid_ops["valid_loss"].
                _ = self.sess.run(self.valid_ops["valid_loss_op"],
                                  feed_dict={
                                      self.valid_features: features,
                                      self.valid_labels: labels,
                                      self.global_step: curr_step
                                  })
                num_batches += 1
            except DataOutOfRange:
                break
        data_loader.stop()

        loss, summary = self.sess.run(
            [self.valid_ops["valid_loss"], self.valid_summary])
        # We only save the summary for the last batch.
        self.valid_summary_writer.add_summary(summary, curr_step)
        # The valid loss is averaged over all the batches.
        tf.logging.info("[Validation %d batches] valid loss: %f" %
                        (num_batches, loss))

        # The output embeddings and labels can be used to compute EER or other metrics
        return loss, embeddings_val, labels_val
예제 #5
0
    def train_tune_lr(self, data, spklist, tune_period=100, aux_data=None):
        """Tune the learning rate.

        According to: https://www.kdnuggets.com/2017/11/estimating-optimal-learning-rate-deep-neural-network.html

        Args:
            data: The training data directory.
            spklist: The spklist is a file map speaker name to the index.
            tune_period: How many steps per learning rate.
            aux_data: The auxiliary data directory.
        """
        # initialize all variables
        self.sess.run(tf.global_variables_initializer())

        # We need to load the model sometimes, since we may try to find the learning rate for fine-tuning.
        if os.path.isfile(os.path.join(self.model, "checkpoint")):
            self.load()

        data_loader = KaldiDataRandomQueue(
            data,
            spklist,
            num_parallel=self.params.num_parallel_datasets,
            max_qsize=self.params.max_queue_size,
            num_speakers=self.params.num_speakers_per_batch,
            num_segments=self.params.num_segments_per_speaker,
            min_len=self.params.min_segment_len,
            max_len=self.params.max_segment_len,
            shuffle=True)
        data_loader.start()

        # The learning rate normally varies from 1e-5 to 1
        # Some common values:
        # 1. factor = 1.15
        #    tune_period = 200
        #    tune_times = 100
        init_learning_rate = 1e-5
        factor = 1.15
        tune_times = 100

        # The `with` block closes the tuning log and the try/finally stops the
        # loader threads even if sess.run raises (the original leaked both on
        # any exception other than DataOutOfRange).
        try:
            with open(os.path.join(self.model, "learning_rate_tuning"), "w") as fp_lr:
                for step in range(tune_period * tune_times):
                    # The lr grows geometrically: one factor step every tune_period iterations.
                    lr = init_learning_rate * (factor ** (step // tune_period))
                    try:
                        if step % tune_period == 0:
                            # At the first step of each lr value, also fetch the
                            # loss values and the summary for logging.
                            train_ops = [
                                self.train_ops, self.train_op, self.train_summary
                            ]
                            start_time = time.time()
                            features, labels = data_loader.fetch()
                            train_val = self.sess.run(train_ops,
                                                      feed_dict={
                                                          self.train_features:
                                                          features,
                                                          self.train_labels: labels,
                                                          self.global_step: 0,
                                                          self.learning_rate: lr
                                                      })
                            end_time = time.time()
                            tf.logging.info(
                                "Epoch: step: %2d, time: %.4f s/step, lr: %f, raw loss: %f, total loss: %f" \
                                % (step, end_time - start_time, lr,
                                   train_val[0]["raw_loss"], train_val[0]["loss"]))
                            fp_lr.write("%d %f %f\n" %
                                        (step, lr, train_val[0]["loss"]))
                            self.summary_writer.add_summary(train_val[-1], step)
                        else:
                            features, labels = data_loader.fetch()
                            _ = self.sess.run(self.train_op,
                                              feed_dict={
                                                  self.train_features: features,
                                                  self.train_labels: labels,
                                                  self.global_step: 0,
                                                  self.learning_rate: lr
                                              })
                    except DataOutOfRange:
                        tf.logging.info("Finished reading features.")
                        break
        finally:
            data_loader.stop()
        return
예제 #6
0
    def train(self, data, spklist, learning_rate, aux_data=None):
        """Train the model.

        Args:
            data: The training data directory.
            spklist: The spklist is a file map speaker name to the index.
            learning_rate: The learning rate is passed by the main program. The main program can easily tune the
                           learning rate according to the validation accuracy or anything else.
            aux_data: The auxiliary data (maybe useful in child class.)
        """
        # initialize all variables
        self.sess.run(tf.global_variables_initializer())

        # curr_step is the real step the training at.
        curr_step = 0

        # Load the model if we have
        if os.path.isfile(os.path.join(self.model, "checkpoint")):
            curr_step = self.load()

        # The data loader
        data_loader = KaldiDataRandomQueue(
            data,
            spklist,
            num_parallel=self.params.num_parallel_datasets,
            max_qsize=self.params.max_queue_size,
            num_speakers=self.params.num_speakers_per_batch,
            num_segments=self.params.num_segments_per_speaker,
            min_len=self.params.min_segment_len,
            max_len=self.params.max_segment_len,
            shuffle=True)
        data_loader.start()

        # Resume mid-epoch: start at the step within the current epoch.
        epoch = int(curr_step / self.params.num_steps_per_epoch)
        for step in range(curr_step % self.params.num_steps_per_epoch,
                          self.params.num_steps_per_epoch):
            try:
                # Steps that log progress or save a summary run the extra ops;
                # all other steps only run the optimizer (cheaper sess.run).
                if step % self.params.save_summary_steps == 0 or step % self.params.show_training_progress == 0:
                    train_ops = [self.train_ops, self.train_op]
                    if step % self.params.save_summary_steps == 0:
                        train_ops.append(self.train_summary)
                    start_time = time.time()
                    features, labels = data_loader.fetch()
                    train_val = self.sess.run(train_ops,
                                              feed_dict={
                                                  self.train_features:
                                                  features,
                                                  self.train_labels: labels,
                                                  self.global_step: curr_step,
                                                  self.learning_rate:
                                                  learning_rate
                                              })
                    end_time = time.time()
                    tf.logging.info(
                        "Epoch: [%2d] step: [%2d/%2d] time: %.4f s/step, raw loss: %f, total loss: %f"
                        % (epoch, step, self.params.num_steps_per_epoch,
                           end_time - start_time, train_val[0]["raw_loss"],
                           train_val[0]["loss"]))
                    if step % self.params.save_summary_steps == 0:
                        # The summary was appended last, so it is train_val[-1].
                        self.summary_writer.add_summary(
                            train_val[-1], curr_step)
                else:
                    # Only compute optimizer.
                    features, labels = data_loader.fetch()
                    _ = self.sess.run(self.train_op,
                                      feed_dict={
                                          self.train_features: features,
                                          self.train_labels: labels,
                                          self.global_step: curr_step,
                                          self.learning_rate: learning_rate
                                      })

                # Periodic checkpoint; skipped at curr_step == 0 (nothing trained yet).
                if step % self.params.save_checkpoints_steps == 0 and curr_step != 0:
                    self.save(curr_step)
                curr_step += 1
            except DataOutOfRange:
                # The loader signals data exhaustion this way; end the epoch early.
                tf.logging.info("Finished reading features.")
                break

        data_loader.stop()
        # Always save a final checkpoint at the end of the epoch.
        self.save(curr_step)

        return
    def train(self, data, spklist, learning_rate, aux_data=None):
        """Train the model.

        Args:
            data: The training data directory.
            spklist: The spklist is a file map speaker name to the index.
            learning_rate: The learning rate is passed by the main program. The main program can easily tune the
                           learning rate according to the validation accuracy or anything else.
            aux_data: The auxiliary data (maybe useful in child class.)
        """
        # initialize all variables
        # (A commented-out semi-orthogonal constraint experiment that lived
        # here was removed as dead code.)
        self.sess.run(tf.global_variables_initializer())

        # curr_step is the real step the training at.
        curr_step = 0

        # Load the model if we have
        if os.path.isfile(os.path.join(self.model, "checkpoint")):
            curr_step = self.load()

        # The data loader
        data_loader = KaldiDataRandomQueue(data, spklist,
                                           num_parallel=self.params.num_parallel_datasets,
                                           max_qsize=self.params.max_queue_size,
                                           num_speakers=self.params.num_speakers_per_batch,
                                           num_segments=self.params.num_segments_per_speaker,
                                           min_len=self.params.min_segment_len,
                                           max_len=self.params.max_segment_len,
                                           shuffle=True)
        # Resume mid-epoch: start at the step within the current epoch.
        epoch = int(curr_step / self.params.num_steps_per_epoch)
        data_loader.start()
        # try/finally guarantees the loader threads stop even when sess.run
        # raises something other than DataOutOfRange.
        try:
            for step in range(curr_step % self.params.num_steps_per_epoch, self.params.num_steps_per_epoch):
                try:
                    # Steps that log progress or save a summary run extra ops;
                    # all other steps only run the optimizer.
                    if step % self.params.save_summary_steps == 0 or step % self.params.show_training_progress == 0:
                        train_ops = [self.train_ops, self.train_op]
                        if step % self.params.save_summary_steps == 0:
                            train_ops.append(self.train_summary)
                        start_time = time.time()
                        features, labels = data_loader.fetch()
                        train_val = self.sess.run(train_ops, feed_dict={self.train_features: features,
                                                                        self.train_labels: labels,
                                                                        self.global_step: curr_step,
                                                                        self.learning_rate: learning_rate})
                        end_time = time.time()
                        tf.logging.info(
                            "Epoch: [%2d] step: [%2d/%2d] time: %.4f s/step, raw loss: %f, total loss: %f"
                            % (epoch, step, self.params.num_steps_per_epoch, end_time - start_time,
                               train_val[0]["raw_loss"], train_val[0]["loss"]))
                        if step % self.params.save_summary_steps == 0:
                            # The summary was appended last, so it is train_val[-1].
                            self.summary_writer.add_summary(train_val[-1], curr_step)
                    else:
                        # Only compute optimizer.
                        features, labels = data_loader.fetch()
                        _ = self.sess.run(self.train_op, feed_dict={self.train_features: features,
                                                                    self.train_labels: labels,
                                                                    self.global_step: curr_step,
                                                                    self.learning_rate: learning_rate})

                    # Periodic checkpoint; skipped at curr_step == 0 (nothing trained yet).
                    if step % self.params.save_checkpoints_steps == 0 and curr_step != 0:
                        self.save(curr_step)
                    curr_step += 1
                except DataOutOfRange:
                    tf.logging.info("Finished reading features.")
                    break
        finally:
            data_loader.stop()
        # Always save a final checkpoint at the end of the epoch.
        self.save(curr_step)

        return
예제 #8
0
    # Disable GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

if __name__ == '__main__':
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    nnet_dir = os.path.join(args.model_dir, "nnet")
    config_json = os.path.join(args.model_dir, "nnet/config.json")
    if not os.path.isfile(config_json):
        sys.exit("Cannot find params.json in %s" % config_json)
    params = Params(config_json)

    # First, we need to extract the weights
    num_total_train_speakers = KaldiDataRandomQueue(
        os.path.dirname(args.spklist), args.spklist).num_total_speakers
    dim = FeatureReader(os.path.dirname(args.spklist)).get_dim()
    if "selected_dim" in params.dict:
        dim = params.selected_dim
    trainer = Trainer(params,
                      args.model_dir,
                      dim,
                      num_total_train_speakers,
                      single_cpu=True)
    trainer.build("valid")
    trainer.sess.run(tf.global_variables_initializer())
    trainer.sess.run(tf.local_variables_initializer())

    if not args.init:
        curr_step = trainer.load()
    else: