Example #1
def main():
    model = WaveNet()
    checkpoint = torch.load(
        'runs/Oct09_11-24-52_K-00030-LIN/checkpoint_9000.pth')
    model.load_state_dict(checkpoint['model'])
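    # export_weights() repacks the PyTorch parameters into the argument
    # layout the NVWaveNet CUDA inference class expects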
    weights = model.export_weights()
    wavenet = nv_wavenet.NVWaveNet(**weights)

    # TODO: for now, run inference with batch size 1
    # TODO: when batching multiple utterances, their lengths must be padded to match
    filename = 'data/arctic_a0001.wav'
    audio, sampling_rate = load_wav_to_torch(filename)
    mel = get_mel(audio)
    mel.unsqueeze_(0)
    print(mel.shape)

    # Reshape to match the input NVWaveNet expects:
    # (channels, batch=1, num_layers, samples)
    cond_input = get_cond_input(mel, model)

    # Generate the waveform
    # The output is mu-law encoded, so it must be decoded back
    audio_data = wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
    print(audio_data.shape)
    print(audio_data.min(), audio_data.max())

    # wavenet.A is the mu-law quantization level (mu_quantization)
    audio = mu_law_decode_numpy(audio_data[0].cpu().numpy(), wavenet.A)
    audio = MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    scipy.io.wavfile.write('gen.wav', 16000, wavdata)
Example #2
def main(mel_files, model_filename, output_dir, batch_size, implementation):
    mel_files = utils.files_to_list(mel_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(mel_files, batch_size):
        mels = []
        for file_path in files:
            print(file_path)
            mel = torch.load(file_path)
            mel = utils.to_gpu(mel)
            mels.append(torch.unsqueeze(mel, 0))
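        # Stack the mels along the batch dimension and derive the
        # per-layer conditioning input for the CUDA kernel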
        cond_input = model.get_cond_input(torch.cat(mels, 0))
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example #3
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))
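        # Encode the batched audio to a shared latent, then condition
        # the selected speaker's decoder on it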
        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example #4
def main(input_files,
         model_dir,
         output_dir,
         batch_size,
         implementation,
         data_config,
         audio_config,
         preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)

    audio_processor = AudioProcessor(audio_config)
    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
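            # Either load a precomputed mel from disk or extract one
            # from the raw audio (logging the reference audio as well)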
            if preload_mels:
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))
        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path[0]))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
            print("Range of {}.wav after deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
Example #5
def main(midi_files, model_filename, output_dir, batch_size, implementation):

    midi_files = utils.files_to_list(midi_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    
    for files in chunker(midi_files, batch_size):
        midi_batch = []
        for file_path in files:
            print(file_path)
            midi = torch.load(file_path)
            midi = utils.to_gpu(midi)
            midi_batch.append(torch.unsqueeze(midi, 0))

        # Get the conditioning input for the inference WaveNet
        cond_input = model.get_cond_input(torch.cat(midi_batch, 0))

        audio_data = wavenet.infer(cond_input, implementation)
        print(audio_data.size())
        print(audio_data.cpu().numpy().min(), audio_data.cpu().numpy().max())

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)

            # Raw float waveform in [-1, 1]
            write("{}/{}_infer_noMul.wav".format(output_dir, file_name),
                  16000, audio)

            # Scaled to the 16-bit integer range
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}_infer.wav".format(output_dir, file_name),
                  16000, wavdata)

            # NOTE: stops after the first file
            exit()
Example #6
import torch
from scipy.io.wavfile import write
import numpy as np
import nv_wavenet

MAX_WAV_VALUE = 32768.0


def mu_law_decode_numpy(x, mu_quantization=256):
    assert (np.max(x) <= mu_quantization)
    assert (np.min(x) >= 0)
    mu = mu_quantization - 1.
    # Map values back to [-1, 1].
    signal = 2 * (x / mu) - 1
    # Perform inverse of mu-law transformation.
    magnitude = (1 / mu) * ((1 + mu)**np.abs(signal) - 1)
    return np.sign(signal) * magnitude


if __name__ == '__main__':
    model = torch.load("model.pt")
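    # model.pt is expected to hold the exported weight dict
    # (the keyword arguments NVWaveNet takes), not an nn.Module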
    wavenet = nv_wavenet.NVWaveNet(**model)
    cond_input = torch.load("cond_input.pt")

    samples = wavenet.infer(cond_input, nv_wavenet.Impl.PERSISTENT)

    audio = mu_law_decode_numpy(samples.cpu().numpy(), 256)
    audio = MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    write('audio.wav', 16000, wavdata)
Example #7
"""
Tests that the NV-WaveNet class is producing audio
"""
import torch
from scipy.io.wavfile import write
import nv_wavenet
from wavenet import WaveNet
import utils
import json

if __name__ == '__main__':
    config = json.loads(open('config.json').read())
    wavenet_config = config["wavenet_config"]
    model = WaveNet(**wavenet_config).cuda()
    weights = model.export_weights()
    wavenet = nv_wavenet.NVWaveNet(**weights)
    num_samples = 10*1000
    batch_size = config['train_config']['batch_size']
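    # All-zero conditioning input of shape
    # (2 * residual_channels, batch, layers, samples), just to check
    # that inference runs end to end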
    cond_input = torch.zeros([2 * wavenet_config['n_residual_channels'],
                              batch_size,
                              wavenet_config['n_layers'],
                              num_samples]).cuda()

    samples = wavenet.infer(cond_input, nv_wavenet.Impl.PERSISTENT)[0]
   
    audio = utils.mu_law_decode_numpy(samples.cpu().numpy(), 256)
    audio = utils.MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    write('audio.wav', 16000, wavdata)
Example #8
def load_wav_model(checkpoint_path):

    model = torch.load(checkpoint_path)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    return model, wavenet
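
A minimal usage sketch for this helper; the checkpoint path and the mels tensor below are hypothetical placeholders, not taken from the repository:

model, wavenet = load_wav_model('checkpoints/wavenet_10000')  # hypothetical path
cond_input = model.get_cond_input(mels)  # mels: batched mel spectrograms
audio_data = wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
audio = utils.mu_law_decode_numpy(audio_data[0].cpu().numpy(), wavenet.A)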
Example #9
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed,
          checkpoint_path, log_dir, ema_decay=0.9999):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    ema = ExponentialMovingAverage(ema_decay)
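    # Register every trainable parameter so an exponential moving
    # average of the weights is tracked during training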
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(checkpoint_path, model,
                                                                      optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True, **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False, **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1, pin_memory=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
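            # loss.data[0] / reduce_tensor(...)[0] are pre-0.4 PyTorch
            # idioms; newer versions use loss.item()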
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus)[0]
            else:
                reduced_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)
            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate, iteration,
                                    checkpoint_path, ema, wavenet_config)
            if (iteration % iters_per_eval == 0 and iteration > 0 and not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    model_eval = nv_wavenet.NVWaveNet(**(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(cond_input, nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid/predicted_audio_{}".format(j),
                                         predicted_audio,
                                         iteration,
                                         22050)
                        audio = utils.mu_law_decode_numpy(audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio,
                                         iteration,
                                         22050)
                        if low_memory:
                            torch.cuda.empty_cache()
            iteration += 1
Example #10
    wavenet_path = 'checkpoints/shelby_retrain/wavenet_135000'
    tts_file = '/var/pylon/data/speech/pylon/tts/shelby/tts-train.txt'

    utterances = load_utterances(tts_file)

    tf.reset_default_graph()
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    graph = load_graph(sess, sushi_path)
    sushibot_inputs = graph.get_tensor_by_name("data/inputs:0")
    sushibot_lengths = graph.get_tensor_by_name("data/input_lengths:0")
    prediction = graph.get_tensor_by_name("sushibot/prediction:0")

    model = torch.load(wavenet_path)['model'].cuda(1)
    wavenet = nv_wavenet.NVWaveNet(**model.export_weights())

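    # For each utterance: map text to character indices, run the TF
    # text-to-mel graph, and save the predicted mels for vocoding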
    for i, utterance in enumerate(tqdm(utterances)):
        input_vector = [[
            SUSHIBOT_CHARSET.index(c) if c in SUSHIBOT_CHARSET else 0
            for c in utterance
        ] + [SUSHIBOT_CHARSET.index('~')]]

        feed_dict = {
            sushibot_inputs: input_vector,
            sushibot_lengths: [len(input_vector[0])]
        }

        mels = sess.run(prediction, feed_dict=feed_dict)
        mels = mels.reshape(-1, 80)
        np.save(os.path.join(outdir, 'mels/sushi-mel-{:05d}.npy'.format(i)),
                mels)
Example #11
def load_wavenet(self):
    self.wavenet = torch.load(self.wavenet_path)['model']
    self.nv_wavenet = nv_wavenet.NVWaveNet(
        **(self.wavenet.export_weights()))