Example #1
def test_in_out(self):
    dummy_input = T.rand(4, 20, 80)  # B x T x D
    dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
    model = SpeakerEncoder(input_dim=80,
                           proj_dim=256,
                           lstm_dim=768,
                           num_lstm_layers=3)
    # compute d-vectors
    output = model.forward(dummy_input)
    assert output.shape[0] == 4
    assert output.shape[1] == 256
    output = model.inference(dummy_input)
    assert output.shape[0] == 4
    assert output.shape[1] == 256
    # compute d-vectors by passing the LSTM hidden state
    # output = model.forward(dummy_input, dummy_hidden)
    # assert output.shape[0] == 4
    # assert output.shape[1] == 20
    # assert output.shape[2] == 256
    # check that the output is L2-normalized
    output_norm = T.nn.functional.normalize(output, dim=1, p=2)
    assert_diff = (output_norm - output).sum().item()
    assert output.type() == "torch.FloatTensor"
    assert (abs(assert_diff) <
            1e-4), f" [!] output_norm has wrong values - {assert_diff}"
    # compute the d-vector for a single utterance with sliding windows
    dummy_input = T.rand(1, 240, 80)  # B x T x D
    output = model.compute_embedding(dummy_input,
                                     num_frames=160,
                                     overlap=0.5)
    assert output.shape[0] == 1
    assert output.shape[1] == 256
    assert len(output.shape) == 2
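
A minimal usage sketch based on the test above. The constructor arguments and the compute_embedding() call mirror the test; the import path is an assumption and may differ between TTS versions.

import torch

from TTS.speaker_encoder.model import SpeakerEncoder  # assumed import path

model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
model.eval()

mel = torch.rand(1, 240, 80)  # B x T x D mel spectrogram
with torch.no_grad():
    # average d-vectors over sliding windows of 160 frames with 50% overlap,
    # exactly as the last part of the test does
    d_vector = model.compute_embedding(mel, num_frames=160, overlap=0.5)
print(d_vector.shape)  # expected: torch.Size([1, 256])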
Example #2
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(
        input_dim=c.model["input_dim"],
        proj_dim=c.model["proj_dim"],
        lstm_dim=c.model["lstm_dim"],
        num_lstm_layers=c.model["num_lstm_layers"],
    )
    optimizer = RAdam(model.parameters(), lr=c.lr)

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method="softmax")
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("%s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint["model"])
        except KeyError:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group["lr"] = c.lr
        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step)
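
The config object `c` above comes from a training config file. A hypothetical sketch of just the fields main() touches (field names taken from the code, values illustrative only):

from types import SimpleNamespace

c = SimpleNamespace(
    audio={},                # kwargs forwarded to AudioProcessor
    model={"input_dim": 80, "proj_dim": 256,
           "lstm_dim": 768, "num_lstm_layers": 3},
    lr=1e-4,
    loss="ge2e",             # or "angleproto"
    reinit_layers=[],        # truthy -> force partial re-initialization
    lr_decay=True,
    warmup_steps=4000,
    datasets=[],             # dataset configs consumed by load_meta_data()
)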
Example #3
def setup_model(c):
    model = SpeakerEncoder(
        c.model_params["input_dim"],
        c.model_params["proj_dim"],
        c.model_params["lstm_dim"],
        c.model_params["num_lstm_layers"],
    )
    return model
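
Hypothetical usage of setup_model(); load_config() is the helper seen in Example #4, and the config keys mirror those read above:

c = load_config("config.json")
model = setup_model(c)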
Example #4
def test_speaker_embedding():
    # load config
    config = load_config(encoder_config_path)
    config.audio.resample = True

    # create a dummy speaker encoder
    model = SpeakerEncoder(**config.model_params)
    save_checkpoint(model, None, None, get_tests_input_path(), 0)

    # load audio processor and speaker encoder
    ap = AudioProcessor(**config.audio)
    manager = SpeakerManager(encoder_model_path=encoder_model_path,
                             encoder_config_path=encoder_config_path)

    # load a sample audio and compute embedding
    waveform = ap.load_wav(sample_wav_path)
    mel = ap.melspectrogram(waveform)
    x_vector = manager.compute_x_vector(mel.T)
    assert x_vector.shape[1] == 256

    # compute x_vector directly from an input file (twice, to check determinism)
    x_vector = manager.compute_x_vector_from_clip(sample_wav_path)
    x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path)
    x_vector = torch.FloatTensor(x_vector)
    x_vector2 = torch.FloatTensor(x_vector2)
    assert x_vector.shape[0] == 256
    assert (x_vector - x_vector2).sum() == 0.0

    # compute x_vector from a list of wav files
    x_vector3 = manager.compute_x_vector_from_clip(
        [sample_wav_path, sample_wav_path2])
    x_vector3 = torch.FloatTensor(x_vector3)
    assert x_vector3.shape[0] == 256
    assert (x_vector - x_vector3).sum() != 0.0

    # remove the dummy model
    os.remove(encoder_model_path)
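
A small follow-up sketch, not part of the test: scoring speaker similarity between two x-vectors with cosine similarity, reusing the tensors from the test above (plain PyTorch):

import torch.nn.functional as F

score = F.cosine_similarity(x_vector.unsqueeze(0), x_vector3.unsqueeze(0), dim=1)
# identical clips give a score of 1.0; different speakers score noticeably lower
print(score.item())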
Example #5
            #print(f'wav_file: {wav_file}')
            if os.path.exists(wav_file):
                wav_files.append(wav_file)
    print(f'Count of wavs imported: {len(wav_files)}')
else:
    # Parse all wav files in data_path
    wav_path = data_path
    wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)

output_files = [
    wav_file.replace(wav_path, args.output_path).replace('.wav', '.npy')
    for wav_file in wav_files
]

for output_file in output_files:
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(args.model_path)['model'])
model.eval()
if args.use_cuda:
    model.cuda()

for idx, wav_file in enumerate(tqdm(wav_files)):
    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if args.use_cuda:
        mel_spec = mel_spec.cuda()
    embedd = model.compute_embedding(mel_spec)
    np.save(output_files[idx], embedd.detach().cpu().numpy())
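
The saved .npy files can be read back with plain NumPy, e.g. to verify one embedding:

import numpy as np

embedding = np.load(output_files[0])
print(embedding.shape)  # d-vector computed for the first wav file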
Example #6
def setup_model(c):
    model = SpeakerEncoder(c.model['input_dim'], c.model['proj_dim'],
                           c.model['lstm_dim'], c.model['num_lstm_layers'])
    return model