Пример #1
0
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # in_dir = ~/wav
    parser.add_argument("--in_dir",
                        type=str,
                        required=True,
                        help="input data(pickle) dir")
    parser.add_argument(
        "--ckpt_dir",
        type=str,
        required=True,
        help="checkpoint to save/ start with for train/inference")
    parser.add_argument("--mode",
                        default="train",
                        choices=["train", "test", "infer"],
                        help="setting mode for execution")

    # Saving Checkpoints, Data... etc
    parser.add_argument("--max_step",
                        type=int,
                        default=500000,
                        help="maximum steps in training")
    parser.add_argument("--checkpoint_freq",
                        type=int,
                        default=100,
                        help="how often save checkpoint")

    # Data
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")

    # Ininitialization
    parser.add_argument("--init_type",
                        type=str,
                        default="uniform",
                        help="type of initializer")
    parser.add_argument("--init_weight_range",
                        type=float,
                        default=0.1,
                        help="initial weight ranges from -0.1 to 0.1")

    # Optimization
    parser.add_argument("--loss_type",
                        default="softmax",
                        choices=["softmax", "contrast"],
                        help="loss type for optimization")
    parser.add_argument("--optimizer",
                        type=str,
                        default="sgd",
                        help="type of optimizer")
    parser.add_argument("--learning_rate",
                        type=float,
                        default=0.01,
                        help="learning rate")
    parser.add_argument("--l2_norm_clip",
                        type=float,
                        default=3.0,
                        help="L2-norm of gradient is clipped at")

    # Train
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=64,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--lstm_proj_clip",
                        type=float,
                        default=0.5,
                        help="Gradient scale for projection node in LSTM")
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")

    # Scaled Cosine similarity
    parser.add_argument(
        "--scale_clip",
        type=float,
        default=0.01,
        help="Gradient scale for scale values in scaled cosine similarity")

    # Collect hparams
    args = parser.parse_args()

    # Set up Queue
    global_queue = queue.Queue()
    # Set up Feeder
    libri_feeder = Feeder(args, "train", "libri")
    libri_feeder.set_up_feeder(global_queue)

    vox1_feeder = Feeder(args, "train", "vox1")
    vox1_feeder.set_up_feeder(global_queue)

    vox2_feeder = Feeder(args, "train", "vox2")
    vox2_feeder.set_up_feeder(global_queue)

    # Set up Model

    model = GE2E(args)
    graph = model.set_up_model("train")

    # Training
    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:

        train_writer = tf.summary.FileWriter(args.ckpt_dir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(args.ckpt_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Restoring Variables from {}'.format(
                ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            start_step = sess.run(model.global_step)

        else:
            print('start from 0')
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            start_step = 1

        for num_step in range(start_step, args.max_step + 1):

            print("current step: " + str(num_step) + "th step")

            batch = global_queue.get()

            summary, training_loss, _ = sess.run(
                [model.sim_mat_summary, model.total_loss, model.optimize],
                feed_dict={
                    model.input_batch: batch[0],
                    model.target_batch: batch[1]
                })
            train_writer.add_summary(summary, num_step)
            print("batch loss:" + str(training_loss))

            if num_step % args.checkpoint_freq == 0:
                save_path = saver.save(sess,
                                       args.ckpt_dir + "/model.ckpt",
                                       global_step=model.global_step)
                print("model saved in file: %s / %d th step" %
                      (save_path, sess.run(model.global_step)))
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # Path

    # wav name formatting: id_clip_uttnum.wav
    parser.add_argument("--test_dir",
                        type=str,
                        required=True,
                        help="input test dir")
    #/home/hdd2tb/ninas96211/dev_wav_set

    parser.add_argument("--ckpt_file",
                        type=str,
                        required=True,
                        help="checkpoint to start with for inference")

    # Data
    #parser.add_argument("--window_length", type=int, default=160, help="sliding window length(frames)")
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--overlap_ratio",
                        type=float,
                        default=0.5,
                        help="overlaping percentage")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")
    # Enrol
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=5,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")

    # Total Score
    # (Sum of True Scores) -  (Sum of False Scores)
    # if the model is perfect, the score will be num_of_true_pairs
    # if the model really sucks, the score will be - num_of_false_pairs

    # Collect hparams
    args = parser.parse_args()
    global_queue = queue.Queue()
    feeder = Feeder(args, "test")
    feeder.set_up_feeder(global_queue)

    model = GE2E(args)
    graph = model.set_up_model("test")

    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:
        # restore from checkpoints

        saver.restore(sess, args.ckpt_file)
        total_score = 0
        num_true_pairs = 0
        num_false_pairs = 0

        while len(feeder.wav_pairs) > 0:
            wav1_data, wav2_data, match = global_queue.get()
            wav1_out = sess.run(model.norm_out,
                                feed_dict={model.input_batch: wav1_data})
            wav2_out = sess.run(model.norm_out,
                                feed_dict={model.input_batch: wav2_data})
            wav1_dvector = np.mean(wav1_out, axis=0)
            wav2_dvector = np.mean(wav2_out, axis=0)
            final_score = np.dot(wav1_dvector, wav2_dvector) / (
                np.linalg.norm(wav1_dvector) * np.linalg.norm(wav2_dvector))
            print("final score:" + str(final_score))
            print("same? :" + str(match))
            if match == True:
                total_score += final_score
                num_true_pairs += 1
            if match == False:
                total_score -= final_score
                num_false_pairs += 1

    print("in total: " + str(total_score))
    print("num true pairs: " + str(num_true_pairs))
    print("num false pairs: " + str(num_false_pairs))
Пример #3
0
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # Path

    # wav name formatting: id_clip_uttnum.wav
    parser.add_argument("--in_dir", type=str, required=True, help="input dir")
    parser.add_argument("--out_dir", type=str, required=True, help="out dir")
    parser.add_argument("--batch_inference",
                        action="store_true",
                        help="set whether to use the batch inference")
    parser.add_argument("--dataset", type=str, default="libri", help="out dir")
    parser.add_argument("--in_wav1", type=str, help="input wav1 dir")
    parser.add_argument("--in_wav2",
                        default="temp.wav",
                        type=str,
                        help="input wav2 dir")
    #/home/hdd2tb/ninas96211/dev_wav_set
    parser.add_argument("--mode",
                        default="infer",
                        choices=["train", "test", "infer"],
                        help="setting mode for execution")

    parser.add_argument("--ckpt_file",
                        type=str,
                        default='./xckpt/model.ckpt-58100',
                        help="checkpoint to start with for inference")

    # Data
    #parser.add_argument("--window_length", type=int, default=160, help="sliding window length(frames)")
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--overlap_ratio",
                        type=float,
                        default=0.5,
                        help="overlaping percentage")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")
    # Enrol
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=5,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")
    parser.add_argument('--gpu', default='0', help='Path to model checkpoint')
    parser.add_argument('--gpu_num',
                        default=4,
                        help='Path to model checkpoint')

    # Collect hparams
    args = parser.parse_args()

    import os
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    feeder = Feeder(args)
    feeder.set_up_feeder()

    model = GE2E(args)
    graph = model.set_up_model()

    #Training
    with graph.as_default():
        saver = tf.train.Saver()

    #num_gpu=4
    #sess_arr=[]
    #for i in range(num_gpu):
    #    gpu_options = tf.GPUOptions(visible_device_list=str(i))
    #    sess_arr.append(tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)))
    #    saver.restore(sess_arr[i], args.ckpt_file)

    #    t = Thread(target=worker)
    #    t.daemon = True
    #    t.start()

    #    save_dvector_of_dir_parallel(sess_arr, feeder, model, args)

    with tf.Session(graph=graph) as sess:
        # restore from checkpoints

        saver.restore(sess, args.ckpt_file)

        #get_dvector_of_dir(sess, feeder, model, args)

        t = Thread(target=worker)
        t.daemon = True
        t.start()
        save_dvector_of_dir_parallel(sess, feeder, model, args)
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # Path

    # wav name formatting: id_clip_uttnum.wav
    parser.add_argument("--in_wav1",
                        type=str,
                        required=True,
                        help="input wav1 dir")
    parser.add_argument("--in_wav2",
                        type=str,
                        required=True,
                        help="input wav2 dir")
    #/home/hdd2tb/ninas96211/dev_wav_set
    parser.add_argument("--mode",
                        default="infer",
                        choices=["train", "test", "infer"],
                        help="setting mode for execution")

    parser.add_argument("--ckpt_file",
                        type=str,
                        required=True,
                        help="checkpoint to start with for inference")

    # Data
    #parser.add_argument("--window_length", type=int, default=160, help="sliding window length(frames)")
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--overlap_ratio",
                        type=float,
                        default=0.5,
                        help="overlaping percentage")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")
    # Enrol
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=5,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")

    # Collect hparams
    args = parser.parse_args()

    feeder = Feeder(args, "infer")
    feeder.set_up_feeder()

    model = GE2E(args)
    graph = model.set_up_model("infer")

    # Training

    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:
        # restore from checkpoints

        saver.restore(sess, args.ckpt_file)

        wav1_data, wav2_data, match = feeder.create_infer_batch()

        # score

        wav1_out = sess.run(model.norm_out,
                            feed_dict={model.input_batch: wav1_data})
        wav2_out = sess.run(model.norm_out,
                            feed_dict={model.input_batch: wav2_data})

        wav1_dvector = np.mean(wav1_out, axis=0)
        wav2_dvector = np.mean(wav2_out, axis=0)

        final_score = np.dot(wav1_dvector, wav2_dvector) / (
            np.linalg.norm(wav1_dvector) * np.linalg.norm(wav2_dvector))

        print("final score:" + str(final_score))
        print("same? :" + str(match))