Example #1
def test_single(trainer, speaker2id_path, result_dir, enc_only, s_speaker,
                t_speaker):

    with open(speaker2id_path, 'r') as f_json:
        speaker2id = json.load(f_json)

    if s_speaker == 'S015':
        filename = './data/english/train/unit/S015_0361841101.wav'
    elif s_speaker == 'S119':
        filename = './data/english/train/unit/S119_1561145062.wav'
    else:
        raise NotImplementedError('Please modify path manually!')

    _, spec = get_spectrograms(filename)
    spec_expand = np.expand_dims(spec, axis=0)
    spec_tensor = torch.from_numpy(spec_expand).type(torch.FloatTensor)
    c = torch.from_numpy(np.array([speaker2id[t_speaker]])).cuda()
    result = trainer.test_step(spec_tensor, c, enc_only=enc_only)
    result = result.squeeze(axis=0).transpose((1, 0))
    wav_data = spectrogram2wav(result)
    write(os.path.join(result_dir, 'result.wav'), rate=16000, data=wav_data)
    print(
        'Testing on source speaker {} and target speaker {}, output shape: {}'.
        format(s_speaker, t_speaker, result.shape))
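A minimal, self-contained sketch of the tensor preparation step used above, with dummy data standing in for the get_spectrograms() output (the shapes and the speaker2id mapping are assumptions for illustration only):

import numpy as np
import torch

spec = np.random.rand(400, 80).astype(np.float32)    # dummy (frames, mel bins) spectrogram
speaker2id = {'V001': 0, 'V002': 1}                   # assumed speaker-to-id mapping

spec_tensor = torch.from_numpy(np.expand_dims(spec, axis=0))   # add batch dim -> (1, frames, bins)
c = torch.tensor([speaker2id['V001']])                          # target speaker id as a 1-D tensor
# on a GPU machine the original code moves the id tensor to CUDA: c = c.cuda()
print(spec_tensor.shape, c.shape)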
Example #2
def target_classify(trainer, seg_len, synthesis_list, result_dir, flag='test'):
    dir_path = os.path.join(result_dir, f'{flag}/')
    with open(synthesis_list, 'r') as f:
        file = f.readlines()
    acc = []
    for line in file:
        # get wav path
        line = line.split('\n')[0].split(' ')
        utt_id = line[0].split('/')[1].split('_')[1]
        tar_speaker = line[1]
        wav_path = os.path.join(dir_path, f'{tar_speaker}_{utt_id}.wav')

        # get spectrogram
        _, spec = get_spectrograms(wav_path)

        # padding spec
        if len(spec) < seg_len:
            padding = np.zeros((seg_len - spec.shape[0], spec.shape[1]))
            spec = np.concatenate((spec, padding), axis=0)

        # classification
        logits = []
        for idx in range(0, len(spec), seg_len):
            if idx + (seg_len * 2) > len(spec):
                spec_frag = spec[idx:]  # take all remaining frames
            else:
                spec_frag = spec[idx:idx + seg_len]

            if len(spec_frag) >= seg_len:
                x = torch.from_numpy(
                    np.expand_dims(spec_frag[:seg_len, :],
                                   axis=0)).type(torch.FloatTensor)
                logit = trainer.classify(x)
                logits.append(logit)
            elif idx == 0:
                raise RuntimeError('Please check if input is too short!')
        logits = np.concatenate(logits, axis=0)
        #logits = np.sum(logits, axis = 0)
        for logit in logits:
            am = logit.argmax()
            if am == 0:
                clf_speaker = 'V001'
            elif am == 1:
                clf_speaker = 'V002'
            else:
                clf_speaker = 'None'
            if clf_speaker == tar_speaker:
                acc.append(1)
                #print('[info]: {} is classified to {}'.format(wav_path, clf_speaker))
            else:
                acc.append(0)
                #print('[Error]: {} is classified to {}'.format(wav_path, clf_speaker))
    print('Classification Acc: {:.3f}'.format(np.sum(acc) / float(len(acc))))
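A self-contained sketch of the padding and seg_len windowing logic above, using a dummy spectrogram and printing the window shapes instead of calling a real classifier (all values are illustrative assumptions):

import numpy as np

seg_len = 128
spec = np.random.rand(300, 80)                         # dummy (frames, bins) spectrogram

# pad up to one full segment, as in target_classify()
if len(spec) < seg_len:
    padding = np.zeros((seg_len - spec.shape[0], spec.shape[1]))
    spec = np.concatenate((spec, padding), axis=0)

# slice the spectrogram into seg_len-sized windows
for idx in range(0, len(spec), seg_len):
    spec_frag = spec[idx:idx + seg_len]
    if len(spec_frag) >= seg_len:
        x = np.expand_dims(spec_frag[:seg_len, :], axis=0)    # (1, seg_len, bins), fed to the classifier
        print('window at frame', idx, 'has shape', x.shape)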
Example #3
def encode_for_tacotron(target, trainer, seg_len, multi2idx_path, wav_path,
                        result_path):
    wavs = sorted(glob.glob(os.path.join(wav_path, '*.wav')))
    print('[Converter] - Number of wav files to encode: ', len(wavs))

    names = []
    enc_outputs = []

    for wav_path in tqdm(wavs):
        name = wav_path.split('/')[-1].split('.')[0]
        s_id = name.split('_')[0]
        u_id = name.split('_')[1]
        if s_id != target:
            continue

        y, sr = librosa.load(wav_path)
        d = librosa.get_duration(y=y, sr=sr)
        if d > 25:
            continue  # filters out overly long utterances; keeps 3523/3533 for V001 and V002 combined in the English dataset

        _, spec = get_spectrograms(wav_path)
        encodings = encode(spec, trainer, seg_len, save=False)
        encodings = parse_encodings(encodings)
        enc_outputs.append(encodings)
        names.append((s_id, u_id))

    # build encodings to character mapping
    idx = 0
    multi2idx = {}
    print('[Converter] - Building encoding to symbol mapping...')
    for encodings in tqdm(enc_outputs):
        for encoding in encodings:
            if str(encoding) not in multi2idx:
                multi2idx[str(encoding)] = symbols[idx]
                idx += 1

    print('[Converter] - Number of unique discrete units: ', len(multi2idx))
    with open(multi2idx_path, 'w') as file:
        file.write(json.dumps(multi2idx))

    result_path = result_path.replace('target', target)
    print('[Converter] - Writing to meta file...')
    with open(result_path, 'w') as file:
        for i, encodings in enumerate(enc_outputs):
            file.write(str(names[i][0]) + '_' + str(names[i][1]) + '|')
            for encoding in encodings:
                file.write(multi2idx[str(encoding)])
            file.write('\n')
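A toy, self-contained sketch of the encoding-to-symbol mapping and the meta-file line format written above; the encodings, names, and symbol table below are stand-ins for the real parse_encodings() output and symbols list:

enc_outputs = [['[3, 7]', '[3, 7]', '[9, 1]'], ['[9, 1]', '[2, 2]']]   # toy encodings (assumed format)
names = [('V001', '0001'), ('V001', '0002')]
symbols = 'abcdefghijklmnopqrstuvwxyz'                                  # stand-in symbol table

multi2idx, idx = {}, 0
for encodings in enc_outputs:
    for encoding in encodings:
        if str(encoding) not in multi2idx:
            multi2idx[str(encoding)] = symbols[idx]
            idx += 1

for (s_id, u_id), encodings in zip(names, enc_outputs):
    line = f'{s_id}_{u_id}|' + ''.join(multi2idx[str(e)] for e in encodings)
    print(line)   # e.g. V001_0001|aab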
Example #4
def test_single(trainer, seg_len, speaker2id_path, result_dir, enc_only,
                s_speaker, t_speaker):

    with open(speaker2id_path, 'r') as f_json:
        speaker2id = json.load(f_json)

    if s_speaker == 'S015':
        filename = './data/english/train/unit/S015_0361841101.wav'
    elif s_speaker == 'S119':
        filename = './data/english/train/unit/S119_1561145062.wav'
    elif s_speaker == 'S130':
        filename = './data/english/test/S130_3516588097.wav'
    elif s_speaker == 'S089':
        filename = './data/english/test/S089_1810826781.wav'
    elif s_speaker == 'S378':
        filename = './data/surprise/test/S378_117437.wav'
    else:
        raise NotImplementedError('Please modify path manually!')

    _, spec = get_spectrograms(filename)
    wav_data, encodings = convert(trainer,
                                  seg_len,
                                  src_speaker_spec=spec,
                                  src_speaker=s_speaker,
                                  tar_speaker=t_speaker,
                                  utt_id='',
                                  speaker2id=speaker2id,
                                  result_dir=result_dir,
                                  enc_only=enc_only,
                                  save=[])

    sf.write(os.path.join(result_dir, 'result.wav'), wav_data, hp.sr, 'PCM_16')
    write_encodings(os.path.join(result_dir, 'result.txt'), encodings)

    err_result = compare_asr(filename, os.path.join(result_dir, 'result.wav'))

    print(
        'Testing on source speaker {} and target speaker {}, output shape: {}'.
        format(s_speaker, t_speaker, wav_data.shape))
    print('Comparing ASR result - WERR: {:.3f}  CERR: {:.3f}'.format(
        err_result[0], err_result[1]))
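compare_asr() presumably returns word- and character-level error rates between the source and converted utterances; below is a minimal, self-contained sketch of the word error rate metric itself (the real helper runs an ASR system, which is not reproduced here):

def edit_distance(ref, hyp):
    # standard dynamic-programming Levenshtein distance over token lists
    d = [[i + j if i * j == 0 else 0 for j in range(len(hyp) + 1)]
         for i in range(len(ref) + 1)]
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[-1][-1]

ref = 'the quick brown fox'.split()
hyp = 'the quick brown box'.split()
print('WER: {:.3f}'.format(edit_distance(ref, hyp) / len(ref)))   # 0.250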
Example #5
def main():

    #---initialize---#
    args = get_test_args()
    HPS = Hps(args.hps_path)
    hps = HPS.get_tuple()
    trainer = get_trainer(args.hps_path, args.encoder_path, hps.g_mode,
                          hps.enc_mode)

    if args.eval_t == 'None':
        print(
            '[Tacotron] - None is not a valid evaluation target! Please specify the target manually; it must be either V001 or V002.'
        )
        return

    # Tacotron implementation: https://github.com/andi611/TTS-Tacotron-Pytorch
    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     attention=config.attention,
                     use_mask=config.use_mask)

    #---handle path---#
    result_dir = os.path.join(args.result_dir, args.sub_result_dir)
    os.makedirs(result_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.ckpt_dir, args.model_name)
    if args.dataset == 'english' and not os.path.isdir(
            './ckpt_tacotron_english'):
        print(
            '[Tacotron] - Recommend using the following name for ckpt_dir: ./ckpt_tacotron_english/'
        )
    elif args.dataset == 'surprise' and not os.path.isdir(
            './ckpt_tacotron_surprise'):
        print(
            '[Tacotron] - Recommend using the following name for ckpt_dir: ./ckpt_tacotron_surprise/'
        )

    #---load and set model---#
    print('[Tacotron] - Testing on the {} set.'.format(args.dataset))
    print('[Tacotron] - Loading model: ', checkpoint_path)
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])

    #---load and set mappings---#
    print('[Tacotron] - Loading mapping files: ', args.speaker2id_path)
    valid_arguments(valid_target=args.dataset, arg=args.speaker2id_path)
    with open(args.speaker2id_path, 'r') as f_json:
        speaker2id = json.load(f_json)

    print('[Tacotron] - Loading mapping files: ', args.multi2idx_path)
    with open(args.multi2idx_path, 'r') as f_json:
        multi2idx = json.load(f_json)

    if not args.test_single:
        #---parse testing list---#
        print('[Tacotron] - Testing from list: ', args.synthesis_list)
        valid_arguments(valid_target=args.dataset, arg=args.synthesis_list)
        feeds = []
        with open(args.synthesis_list, 'r') as f:
            file = f.readlines()
            for line in file:
                line = line.split('\n')[0].split(' ')
                feeds.append({
                    's_id': line[0].split('/')[1].split('_')[0],
                    'utt_id': line[0].split('/')[1].split('_')[1],
                    't_id': line[1],
                })
        print('[Tester] - Number of files to be resynthesized: ', len(feeds))

        for feed in tqdm(feeds):
            if feed['t_id'] == args.eval_t:
                wav_path = os.path.join(
                    args.testing_dir,
                    feed['s_id'] + '_' + feed['utt_id'] + '.wav')
                _, spec = get_spectrograms(wav_path)
                encodings = encode(spec, trainer, hps.seg_len, save=False)
                encodings = parse_encodings(encodings)
                line = ''.join([multi2idx[encoding] for encoding in encodings])
                print(line)
                out_path = os.path.join(
                    result_dir, feed['t_id'] + '_' + feed['utt_id'] + '.wav')
                synthesis_speech(model, text=line, path=out_path)
    else:
        wav_path = './data/english/train/voice/V002_0674932509.wav'
        # wav_path = './data/english/train/voice/V002_2252538703.wav'
        # wav_path = './data/english/train/voice/V002_1665800749.wav'
        _, spec = get_spectrograms(wav_path)
        encodings = encode(spec, trainer, hps.seg_len, save=False)
        write_encodings(path='./result/result.txt', encodings=encodings)
        parsed_encodings = parse_encodings(encodings)
        line = ''.join([multi2idx[encoding] for encoding in parsed_encodings])
        print(line)
        synthesis_speech(model, text=line, path='./result/result.wav')

    # model.decoder.max_decoder_steps = config.max_decoder_steps # Set large max_decoder steps to handle long sentence outputs

    sys.exit(0)
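A small self-contained sketch of the synthesis_list line format that the parsing loop above appears to expect, i.e. "<subdir>/<speaker>_<utt_id> <target_speaker>" per line (this format is inferred from the split() calls, not documented in the original):

sample_line = 'unit/S015_0361841101 V001\n'            # assumed example line

line = sample_line.split('\n')[0].split(' ')
feed = {
    's_id': line[0].split('/')[1].split('_')[0],       # source speaker, e.g. S015
    'utt_id': line[0].split('/')[1].split('_')[1],     # utterance id, e.g. 0361841101
    't_id': line[1],                                   # target speaker, e.g. V001
}
print(feed)   # {'s_id': 'S015', 'utt_id': '0361841101', 't_id': 'V001'}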