Example #1
def cut_wav(wav_filename, textgrid_filename, output_folder):
    wav, sr = librosa.load(wav_filename)
    # print(np.shape(wav))
    # print(sr)

    cut_info_dict = parse_TextGrid.parse_textgrid(textgrid_filename)

    total_wav = np.array([])
    cnt = 0
    for ind, cut_info in enumerate(cut_info_dict):
        # print(type(cut_info))
        if cut_info != "None":
            filename = str(cnt) + "_" + cut_info + ".wav"
            fn = os.path.join(output_folder, filename)
            wav_part, _ = utils.cut_wav_save(wav, sr,
                                             cut_info_dict[cut_info][0],
                                             cut_info_dict[cut_info][1], fn)
            cnt = cnt + 1
            total_wav = np.concatenate((total_wav, wav_part))
    # print(np.shape(total_wav))
    # print(wav)
    # print(total_wav)
    total_filename = os.path.join(output_folder, "total.wav")
    # librosa.output.write_wav(total_filename, total_wav, sr)
    utils.save_wav(total_wav, sr, total_filename)
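A minimal usage sketch for cut_wav above; the file names and the output folder are hypothetical, and parse_TextGrid and utils are assumed to be the project's own helper modules.

# Hypothetical inputs: a recording, its TextGrid annotation, and an existing
# output directory. Each labelled interval is written as "<n>_<label>.wav",
# and "total.wav" is the concatenation of all non-"None" intervals.
cut_wav("recording.wav", "recording.TextGrid", "segments")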
Example #2
def synthesize(mel_sp, save_path, weight_path):
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)

    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)

    save_wav(outputs, save_path, hparams.sampling_rate)
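A sketch of how synthesize might be driven; the .npy path, the weight file, and the assumption that the mel spectrogram is stored as (frames, num_mels) are all hypothetical.

import numpy as np

# Hypothetical precomputed mel spectrogram with hparams.num_mels bins per frame.
mel_sp = np.load("mels/sample_0001.npy").astype(np.float32)
synthesize(mel_sp, "generated/sample_0001.wav", "weights/wavenet_final.h5")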
Example #3
def cut_total_wav(wav_filename, textgrid_filename, output_filename):
    wav, sr = librosa.load(wav_filename)

    cut_info_dict = parse_TextGrid.parse_textgrid(textgrid_filename)

    total_wav = np.array([])
    # cnt = 0
    for ind, cut_info in enumerate(cut_info_dict):
        # print(type(cut_info))
        if cut_info != "None":
            # filename = str(cnt) + "_" + cut_info + ".wav"
            # fn = os.path.join(output_folder, filename)
            wav_part = utils.cut_wav(wav, sr, cut_info_dict[cut_info][0],
                                     cut_info_dict[cut_info][1])
            # cnt = cnt + 1
            total_wav = np.concatenate((total_wav, wav_part))

    utils.save_wav(total_wav, sr, output_filename)
Example #4
def evaluation(model, step, device, args):
    # Evaluation
    model.eval()
    with torch.no_grad():
        # Preprocessing eval texts
        print('Start generating evaluation speeches...')
        n_eval = len(hps.eval_texts)
        for i in range(n_eval):
            sys.stdout.write('\rProgress: {}/{}'.format(i + 1, n_eval))
            sys.stdout.flush()
            text = hps.eval_texts[i]
            text = text_normalize(text)

            txt_id = sent2idx(text) + [hps.vocab.find('E')]
            txt_len = len(txt_id)
            GO_frame = torch.zeros(1, 1, hps.n_mels)

            # Shape: (1, seq_length)
            txt = torch.LongTensor([txt_id])
            txt_len = torch.LongTensor([txt_len])
            if args.cuda:
                GO_frame = GO_frame.cuda()
                txt = txt.cuda()
                txt_len = txt_len.cuda()
            _batch = model(text=txt, frames=GO_frame, text_length=txt_len)
            mel = _batch['mel'][0]
            mag = _batch['mag'][0]
            attn = _batch['attn'][0]
            if args.cuda:
                mel = mel.cpu()
                mag = mag.cpu()
                attn = attn.cpu()
            mel = mel.numpy()
            mag = mag.numpy()
            attn = attn.numpy()

            wav = mag2wav(mag)
            save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
            save_spectrogram(mag,
                             'eval/plots/spectrogram_[{}].png'.format(text))
            save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
        sys.stdout.write('\n')
Example #5
def pad2drums(read_from_fname, save_to_fname):
    """
    Reads .wav-file in folder "raw_audio" from a drum pad (with mic about 10 cm away)
    and converts it to an .wav-file with drum sounds in place of 
    the pad sounds. Created file is placed in folder "results".
    """

    load_path = 'raw_audio/'
    fs, raw_audio = load_wav(load_path + read_from_fname)

    # Detecting the pad hits from the raw_audio
    hit_indices, hit_strengths = detect_sound(raw_audio, stereo=True)

    dg = DrumGenerator(fs=fs)
    drum_audio = dg.generate_drum_audio(hit_indices, hit_strengths,
                                        raw_audio.size)

    # Save drum_audio under the file name supplied by the user
    save_path = 'results/' + save_to_fname
    save_wav(save_path, drum_audio, fs)
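A one-line usage sketch; the file names are hypothetical, and the folders "raw_audio/" and "results/" are expected to exist.

# Reads raw_audio/pad_take1.wav and writes results/drums_take1.wav.
pad2drums("pad_take1.wav", "drums_take1.wav")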
Example #6
def demo():
    mir1k_sr = 16000
    n_fft = 1024
    hop_length = n_fft // 4
    num_rnn_layer = 3
    num_hidden_units = 256
    checkpoint = torch.load("final_model.pth")

    mir1k_dir = 'data/MIR1K/MIR-1K'
    test_path = os.path.join(mir1k_dir, 'MIR-1K_test.json')

    with open(test_path, 'r') as text_file:
        content = json.load(text_file)
    wav_filenames = ["{}/{}".format("data/MIR1K/MIR-1K/Wavfile", f) for f in content]
    wav_filenames = ["../HW3/sample_music.wav"] # only get the first two for demo
    
    wavs_mono, wavs_src1, wavs_src2 = load_wavs(filenames = wav_filenames, sr = mir1k_sr)

    stfts_mono, stfts_src1, stfts_src2 = wavs_to_specs(
        wavs_mono = wavs_mono, wavs_src1 = wavs_src1, wavs_src2 = wavs_src2, n_fft = n_fft, hop_length = hop_length)

    stfts_mono_full, stfts_src1_full, stfts_src2_full = prepare_data_full(stfts_mono = stfts_mono, stfts_src1 = stfts_src1, stfts_src2 = stfts_src2)

    model = Model(n_fft // 2 + 1, num_hidden_units).to(device)
    model.load_state_dict(checkpoint["model_state_dict"])

    wavs_src1_pred = list()
    wavs_src2_pred = list()
    step = 1
    model.eval()
    with torch.no_grad():
        for wav_filename, wav_mono, stft_mono_full in zip(wav_filenames, wavs_mono, stfts_mono_full):

            stft_mono_magnitude, stft_mono_phase = sperate_magnitude_phase(data = stft_mono_full)
            stft_mono_magnitude = np.array([stft_mono_magnitude])

            stft_mono_magnitude = torch.Tensor(stft_mono_magnitude).to(device)
            y1_pred, y2_pred = model(stft_mono_magnitude)

            # ISTFT with the phase from mono
            y1_pred = y1_pred.cpu().numpy()
            y2_pred = y2_pred.cpu().numpy()

            y1_stft_hat = combine_magnitude_phase(y1_pred[0], stft_mono_phase)
            y2_stft_hat = combine_magnitude_phase(y2_pred[0], stft_mono_phase)

            y1_stft_hat = y1_stft_hat.transpose()
            y2_stft_hat = y2_stft_hat.transpose()

            y1_hat = librosa.istft(y1_stft_hat, hop_length = hop_length)
            y2_hat = librosa.istft(y2_stft_hat, hop_length = hop_length)

            filename = "demo/"+wav_filename.split("/")[-1]
            
            save_wav(filename+"_mono.wav", wav_mono)
            save_wav(filename+"_src1", y1_hat)
            save_wav(filename+"_src2", y2_hat)
    print("done")
Example #7
        bits=params["preprocessing"]["bits"],
        hop_length=params["preprocessing"]["hop_length"],
        nc=args.nc,
        device=device)
    model.to(device)

    print("Load checkpoint from: {}:".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["model"])
    model_step = checkpoint["step"]

    wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
    utterance_id = os.path.basename(args.wav_path).split(".")[0]
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)
    output = model.generate(mel)
    path = os.path.join(
        args.gen_dir,
        "gen_{}_model_steps_{}.wav".format(utterance_id, model_step))
    save_wav(path, output, params["preprocessing"]["sample_rate"])
Example #8
def train_fn(args, params):
    # Directory preparation
    exp_dir = makeExpDirs(args.results_dir, args.exp_name)

    # Automatic Mixed-Precision
    if args.optim != "no":
        import apex

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = Vocoder(mel_channels=params["preprocessing"]["num_mels"],
                    conditioning_channels=params["vocoder"]["conditioning_channels"],
                    embedding_dim=params["vocoder"]["embedding_dim"],
                    rnn_channels=params["vocoder"]["rnn_channels"],
                    fc_channels=params["vocoder"]["fc_channels"],
                    bits=params["preprocessing"]["bits"],
                    hop_length=params["preprocessing"]["hop_length"],
                    nc=args.nc,
                    device=device
                    )
    model.to(device)
    print(model)

    optimizer = optim.Adam(model.parameters(), lr=params["vocoder"]["learning_rate"])

    # Automatic Mixed-Precision
    if args.optim != "no":
        model, optimizer = apex.amp.initialize(model, optimizer, opt_level=args.optim)

    scheduler = optim.lr_scheduler.StepLR(optimizer, params["vocoder"]["schedule"]["step_size"], params["vocoder"]["schedule"]["gamma"])

    if args.resume is not None:
        print(f"Resume checkpoint from: {args.resume}:")
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    train_dataset = VocoderDataset(meta_file=os.path.join(args.data_dir, "train.txt"),
                                   sample_frames=params["vocoder"]["sample_frames"],
                                   audio_slice_frames=params["vocoder"]["audio_slice_frames"],
                                   hop_length=params["preprocessing"]["hop_length"],
                                   bits=params["preprocessing"]["bits"])

    train_dataloader = DataLoader(train_dataset, batch_size=params["vocoder"]["batch_size"],
                                  shuffle=True, num_workers=1,
                                  pin_memory=True)

    num_epochs = params["vocoder"]["num_steps"] // len(train_dataloader) + 1
    start_epoch = global_step // len(train_dataloader) + 1

    # Logger
    writer = SummaryWriter(exp_dir/"logs")

    # Add original utterance to TensorBoard
    if args.resume is None:
        with open(os.path.join(args.data_dir, "test.txt"), encoding="utf-8") as f:
            test_wavnpy_paths = [line.strip().split("|")[1] for line in f]
        for index, wavnpy_path in enumerate(test_wavnpy_paths):
            muraw_npy = np.load(wavnpy_path)
            wav_npy = mulaw_decode(muraw_npy, 2**params["preprocessing"]["bits"])
            writer.add_audio("orig", torch.from_numpy(wav_npy), global_step=global_step, sample_rate=params["preprocessing"]["sample_rate"])
            break


    for epoch in range(start_epoch, num_epochs + 1):
        running_loss = 0
        
        for i, (audio, mels) in enumerate(tqdm(train_dataloader, leave=False), 1):
            audio, mels = audio.to(device), mels.to(device)

            output = model(audio[:, :-1], mels)
            loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])
            optimizer.zero_grad()

            # Automatic Mixed-Precision
            if args.optim != "no":
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            scheduler.step()

            running_loss += loss.item()
            average_loss = running_loss / i

            global_step += 1

            if global_step % args.save_step == 0:
                save_checkpoint(model, optimizer, scheduler, global_step, exp_dir/"params", False)

            if global_step % params["vocoder"]["checkpoint_interval"] == 0:
                save_checkpoint(model, optimizer, scheduler, global_step, exp_dir/"params", True)

            if global_step % params["vocoder"]["generation_interval"] == 0:
                with open(os.path.join(args.data_dir, "test.txt"), encoding="utf-8") as f:
                    test_mel_paths = [line.strip().split("|")[2] for line in f]

                for index, mel_path in enumerate(test_mel_paths):
                    utterance_id = os.path.basename(mel_path).split(".")[0]
                    # unsqueeze: insert in a batch
                    mel = torch.FloatTensor(np.load(mel_path)).unsqueeze(0).to(device)
                    output = model.generate(mel)
                    path = exp_dir/"samples"/f"gen_{utterance_id}_model_steps_{global_step}.wav"
                    save_wav(str(path), output, params["preprocessing"]["sample_rate"])
                    if index == 0:
                        writer.add_audio("cnvt", torch.from_numpy(output), global_step=global_step, sample_rate=params["preprocessing"]["sample_rate"])
        # finish an epoch
        writer.add_scalar("NLL", average_loss, global_step)
Example #9
def run(args):
    # Check cuda device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Data
    if hps.bucket:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path, wav_dir=hps.wav_dir, batch_size=hps.batch_size, do_bucket=True, bucket_size=20)
        loader = DataLoader(
            dataset, 
            batch_size=1,
            shuffle=True,
            num_workers=4)
    else:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path, wav_dir=hps.wav_dir)
        loader = DataLoader(
            dataset,
            batch_size=hps.batch_size,
            shuffle=True,
            num_workers=4,
            drop_last=True,
            collate_fn=collate_fn)

    # Network
    model = Tacotron()
    criterion = nn.L1Loss()
    if args.cuda:
        model = nn.DataParallel(model.to(device))
        criterion = criterion.to(device)
    # The learning rate scheduling mechanism in "Attention is all you need" 
    lr_lambda = lambda step: hps.warmup_step ** 0.5 * min((step+1) * (hps.warmup_step ** -1.5), (step+1) ** -0.5)
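    # With this multiplier the effective rate is hps.lr * warmup_step**0.5 *
    # min(step * warmup_step**-1.5, step**-0.5): it grows roughly linearly over
    # the first warmup_step updates, peaks at hps.lr when step == warmup_step,
    # and then decays proportionally to step**-0.5.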
    optimizer = optim.Adam(model.parameters(), lr=hps.lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
        
    step = 1
    epoch = 1
    # Load model
    if args.ckpt:
        ckpt = load(args.ckpt)
        step = ckpt['step']
        epoch = ckpt['epoch']
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, 
            lr_lambda, 
            last_epoch=step)

    if args.eval:
        # Evaluation
        model.eval()
        with torch.no_grad():
            # Preprocessing eval texts
            print('Start generating evaluation speeches...')
            n_eval = len(hps.eval_texts)
            for i in range(n_eval):
                sys.stdout.write('\rProgress: {}/{}'.format(i+1, n_eval))
                sys.stdout.flush()
                text = hps.eval_texts[i]
                text = text_normalize(text)
                txt_id = sent2idx(text) + [hps.char_set.find('E')]
                GO_frame = torch.zeros(1, 1, hps.n_mels)

                # Shape: (1, seq_length)
                txt = torch.LongTensor(txt_id).unsqueeze(0)
                if args.cuda:
                    GO_frame = GO_frame.cuda()
                    txt = txt.cuda()
                _batch = model(text=txt, frames=GO_frame)
                mel = _batch['mel'][0]
                mag = _batch['mag'][0]
                attn = _batch['attn'][0]
                if args.cuda:
                    mel = mel.cpu()
                    mag = mag.cpu()
                    attn = attn.cpu()
                mel = mel.numpy()
                mag = mag.numpy()
                attn = attn.numpy()

                wav = mag2wav(mag)
                save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
                save_spectrogram(mag, 'eval/plots/spectrogram_[{}].png'.format(text))
                save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
            sys.stdout.write('\n')

    if args.train:
        before_load = time.time()
        # Start training
        model.train()
        while True:
            for batch in loader:
                # torch.LongTensor, (batch_size, seq_length)
                txt = batch['text']
                # torch.Tensor, (batch_size, max_time, hps.n_mels)
                mel = batch['mel']
                # torch.Tensor, (batch_size, max_time, hps.n_fft)
                mag = batch['mag']
                if hps.bucket:
                    # If bucketing, the shape will be (1, batch_size, ...)
                    txt = txt.squeeze(0)
                    mel = mel.squeeze(0)
                    mag = mag.squeeze(0)
                # GO frame
                GO_frame = torch.zeros(mel[:, :1, :].size())
                if args.cuda:
                    txt = txt.to(device)
                    mel = mel.to(device)
                    mag = mag.to(device)
                    GO_frame = GO_frame.to(device)

                # Model prediction
                decoder_input = torch.cat([GO_frame, mel[:, hps.reduction_factor::hps.reduction_factor, :]], dim=1)

                load_time = time.time() - before_load
                before_step = time.time()

                _batch = model(text=txt, frames=decoder_input)
                _mel = _batch['mel']
                _mag = _batch['mag']
                _attn = _batch['attn']

                # Optimization
                optimizer.zero_grad()
                loss_mel = criterion(_mel, mel)
                loss_mag = criterion(_mag, mag)
                loss = loss_mel + loss_mag
                loss.backward()
                # Gradient clipping
                total_norm = clip_grad_norm_(model.parameters(), max_norm=hps.clip_norm)
                # Apply gradient
                optimizer.step()
                # Adjust learning rate
                scheduler.step()
                process_time = time.time() - before_step 
                if step % hps.log_every_step == 0:
                    lr_curr = optimizer.param_groups[0]['lr']
                    log = '[{}-{}] loss: {:.3f}, grad: {:.3f}, lr: {:.3e}, time: {:.2f} + {:.2f} sec'.format(epoch, step, loss.item(), total_norm, lr_curr, load_time, process_time)
                    print(log)
                if step % hps.save_model_every_step == 0:
                    save(filepath='tmp/ckpt/ckpt_{}.pth.tar'.format(step),
                         model=model.state_dict(),
                         optimizer=optimizer.state_dict(),
                         step=step, 
                         epoch=epoch)

                if step % hps.save_result_every_step == 0:
                    sample_idx = random.randint(0, hps.batch_size-1)
                    attn_sample = _attn[sample_idx].detach().cpu().numpy()
                    mag_sample = _mag[sample_idx].detach().cpu().numpy()
                    wav_sample = mag2wav(mag_sample)
                    # Save results
                    save_alignment(attn_sample, step, 'tmp/plots/attn_{}.png'.format(step))
                    save_spectrogram(mag_sample, 'tmp/plots/spectrogram_{}.png'.format(step))
                    save_wav(wav_sample, 'tmp/results/wav_{}.wav'.format(step))
                before_load = time.time()
                step += 1
            epoch += 1
Example #10
def train(model, loader, optimizer, criterion, scheduler, step, epoch, device,
          args):
    before_load = time.time()
    # Start training
    model.train()
    while True:
        for batch in loader:
            # torch.LongTensor, (batch_size, seq_length)
            txt = batch['text']
            # torch.Tensor, (batch_size, max_time, hps.n_mels)
            mel = batch['mel']
            # torch.Tensor, (batch_size, max_time, hps.n_fft)
            mag = batch['mag']
            # torch.LongTensor, (batch_size, )
            txt_len = batch['text_length']
            frame_len = batch['frame_length']

            if hps.bucket:
                # If bucketing, the shape will be (1, batch_size, ...)
                txt = txt.squeeze(0)
                mel = mel.squeeze(0)
                mag = mag.squeeze(0)
                txt_len = txt_len.squeeze(0)
                frame_len = frame_len.squeeze(0)
            # GO frame
            GO_frame = torch.zeros(mel[:, :1, :].size())
            if args.cuda:
                txt = txt.to(device)
                mel = mel.to(device)
                mag = mag.to(device)
                GO_frame = GO_frame.to(device)

            # Model prediction
            decoder_input = torch.cat([
                GO_frame, mel[:, hps.reduction_factor::hps.reduction_factor, :]
            ],
                                      dim=1)

            load_time = time.time() - before_load
            before_step = time.time()

            _batch = model(text=txt,
                           frames=decoder_input,
                           text_length=txt_len,
                           frame_length=frame_len)
            _mel = _batch['mel']
            _mag = _batch['mag']
            _attn = _batch['attn']

            # Optimization
            optimizer.zero_grad()
            loss_mel = criterion(_mel, mel)
            loss_mag = criterion(_mag, mag)
            loss = loss_mel + loss_mag
            loss.backward()
            # Gradient clipping
            total_norm = clip_grad_norm_(model.parameters(),
                                         max_norm=hps.clip_norm)
            # Apply gradient
            optimizer.step()
            # Adjust learning rate
            scheduler.step()
            process_time = time.time() - before_step
            if step % hps.log_every_step == 0:
                lr_curr = optimizer.param_groups[0]['lr']
                log = '[{}-{}] total_loss: {:.3f}, mel_loss: {:.3f}, mag_loss: {:.3f}, grad: {:.3f}, lr: {:.3e}, time: {:.2f} + {:.2f} sec'.format(
                    epoch, step, loss.item(), loss_mel.item(), loss_mag.item(),
                    total_norm, lr_curr, load_time, process_time)
                print(log)
            if step % hps.save_model_every_step == 0:
                save(filepath='tmp/ckpt/ckpt_{}.pth.tar'.format(step),
                     model=model.state_dict(),
                     optimizer=optimizer.state_dict(),
                     step=step,
                     epoch=epoch)

            if step % hps.save_result_every_step == 0:
                sample_idx = random.randint(0, hps.batch_size - 1)
                attn_sample = _attn[sample_idx].detach().cpu().numpy()
                mag_sample = _mag[sample_idx].detach().cpu().numpy()
                wav_sample = mag2wav(mag_sample)
                # Save results
                save_alignment(attn_sample, step,
                               'tmp/plots/attn_{}.png'.format(step))
                save_spectrogram(mag_sample,
                                 'tmp/plots/spectrogram_{}.png'.format(step))
                save_wav(wav_sample, 'tmp/results/wav_{}.wav'.format(step))
            before_load = time.time()
            step += 1
        epoch += 1
Example #11
def main():
    hps = Hparams
    parser = argparse.ArgumentParser('VC inference')
    parser.add_argument('--src_wav', type=str, help='source wav file path')
    parser.add_argument('--ckpt', type=str, help='model ckpt path')
    parser.add_argument('--save_dir', type=str, help='synthesized wav save directory')
    args = parser.parse_args()
    # 0.
    src_wav_arr = load_wav(args.src_wav)
    pre_emphasized_wav = _preemphasize(src_wav_arr)
    # 1. extract ppgs
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(out_dims=hps.Audio.ppg_dim,
                                       n_cnn=ppg_extractor_hps.n_cnn,
                                       cnn_hidden=ppg_extractor_hps.cnn_hidden,
                                       cnn_kernel=ppg_extractor_hps.cnn_kernel,
                                       n_blstm=ppg_extractor_hps.n_blstm,
                                       lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']
    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    mfcc_feats = wav2unnormalized_mfcc(src_wav_arr)
    ppg = sess.run(predicted_ppgs,
                   feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
    sess.close()
    ppg = softmax(np.squeeze(ppg, axis=0))

    # 2. extract lf0, mel-spectrogram
    log_f0 = logf0(args.src_wav)
    log_f0 = lf0_normailze(log_f0)
    # mel-spectrogram is extracted for comparison
    mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T

    # 3. prepare inputs
    min_len = min(log_f0.shape[0], ppg.shape[0])
    vc_inputs = np.concatenate([ppg[:min_len, :], log_f0[:min_len, :]], axis=1)
    vc_inputs = np.expand_dims(vc_inputs, axis=1)  # [time, batch, dim]

    # 4. setup vc model and do the inference
    model = BLSTMConversionModel(in_channels=hps.Audio.ppg_dim + 2,
                                 out_channels=hps.Audio.num_mels,
                                 lstm_hidden=hps.BLSTMConversionModel.lstm_hidden)
    device = torch.device('cpu')
    model.load_state_dict(torch.load(args.ckpt, map_location=device))
    model.eval()
    predicted_mels = model(torch.tensor(vc_inputs))
    predicted_mels = np.squeeze(predicted_mels.detach().numpy(), axis=1)

    # 5. synthesize wav
    synthesized_wav = inv_preemphasize(inv_mel_spectrogram(predicted_mels.T))
    resynthesized_wav = inv_preemphasize(inv_mel_spectrogram(mel_spec.T))
    ckpt_name = args.ckpt.split('/')[-1].split('.')[0]
    wav_name = args.src_wav.split('/')[-1].split('.')[0]
    save_wav(synthesized_wav, os.path.join(args.save_dir, '{}-{}-converted.wav'.format(wav_name, ckpt_name)))
    save_wav(resynthesized_wav, os.path.join(args.save_dir, '{}-{}-src-resyn.wav'.format(wav_name, ckpt_name)))
    return
Example #12
def train_fn(args, params):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = Vocoder(
        mel_channels=params["preprocessing"]["num_mels"],
        conditioning_channels=params["vocoder"]["conditioning_channels"],
        embedding_dim=params["vocoder"]["embedding_dim"],
        rnn_channels=params["vocoder"]["rnn_channels"],
        fc_channels=params["vocoder"]["fc_channels"],
        bits=params["preprocessing"]["bits"],
        hop_length=params["preprocessing"]["hop_length"])
    model.to(device)
    print(model)

    optimizer = optim.Adam(model.parameters(),
                           lr=params["vocoder"]["learning_rate"])
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, params["vocoder"]["schedule"]["step_size"],
        params["vocoder"]["schedule"]["gamma"])

    if args.resume is not None:
        print("Resume checkpoint from: {}:".format(args.resume))
        checkpoint = torch.load(args.resume,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    train_dataset = VocoderDataset(
        meta_file=os.path.join(args.data_dir, "train.txt"),
        sample_frames=params["vocoder"]["sample_frames"],
        audio_slice_frames=params["vocoder"]["audio_slice_frames"],
        hop_length=params["preprocessing"]["hop_length"],
        bits=params["preprocessing"]["bits"])

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=params["vocoder"]["batch_size"],
                                  shuffle=True,
                                  num_workers=1,
                                  pin_memory=True)

    num_epochs = params["vocoder"]["num_steps"] // len(train_dataloader) + 1
    start_epoch = global_step // len(train_dataloader) + 1

    for epoch in range(start_epoch, num_epochs + 1):
        running_loss = 0

        for i, (audio, mels) in enumerate(tqdm(train_dataloader), 1):
            audio, mels = audio.to(device), mels.to(device)

            output = model(audio[:, :-1], mels)
            loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()
            average_loss = running_loss / i

            global_step += 1

            if global_step % params["vocoder"]["checkpoint_interval"] == 0:
                save_checkpoint(model, global_step, args.checkpoint_dir)

                with open(os.path.join(args.data_dir, "test.txt"),
                          encoding="utf-8") as f:
                    test_mel_paths = [line.strip().split("|")[2] for line in f]

                for mel_path in test_mel_paths:
                    utterance_id = os.path.basename(mel_path).split(".")[0]
                    mel = torch.FloatTensor(
                        np.load(mel_path)).unsqueeze(0).to(device)
                    output = model.generate(
                        mel, params["vocoder"]["generate"]["batched"],
                        params["vocoder"]["generate"]["target"],
                        params["vocoder"]["generate"]["overlap"])
                    path = os.path.join(
                        args.gen_dir, "gen_{}_model_steps_{}.wav".format(
                            utterance_id, global_step))
                    save_wav(path, output,
                             params["preprocessing"]["sample_rate"])

        print("epoch:{}, loss:{:.3f}".format(epoch, average_loss))
Example #13
def gen_from_mel(model, mel, output):
    assert mel.shape[1] == 80, 'Input mel shape is invalid.'
    assert output.endswith('.wav')
    mel = torch.FloatTensor(mel).unsqueeze(0).to(device)
    waveform = model.generate(mel)
    save_wav(output, waveform, params["preprocessing"]["sample_rate"])
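A usage sketch for gen_from_mel; the paths are hypothetical, model is assumed to be a trained vocoder like the ones loaded in the earlier examples, and the mel spectrogram is stored as a (frames, 80) NumPy array to satisfy the assert above.

import numpy as np

# Hypothetical .npy file holding a mel spectrogram of shape (frames, 80).
mel = np.load("mels/utt_0001.npy")
gen_from_mel(model, mel, "generated/utt_0001.wav")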