Example no. 1
    def generate_performance_audio(self, spectrograms):
        batch_size = spectrograms.shape[0]  # number of voices
        spec_depth = spectrograms.shape[1]  # depth of each spectrogram; should be 80

        capacity = 80 * 80 * 9  # maximum number of conditioning elements the wavenet can handle at a time
        spec_hop = floor(capacity / (batch_size * spec_depth))
        spec_start = 0
        audio_start = 0
        audio_hop = spec_hop * self.sample_conversion
        batch = 1
        num_batches = ceil(self.spec_length / spec_hop)

        spectrograms = utils.to_gpu(spectrograms)
        audio = np.zeros((batch_size, self.audio_length))

        print('Generating audio with WaveNet Vocoder...')
        while spec_start + spec_hop < self.spec_length:
            print(' - batch %d of %d' % (batch, num_batches))
            #get clip
            clip = spectrograms[:, :, spec_start:spec_start + spec_hop]

            #get audio from network
            cond_input = self.wavenet.get_cond_input(clip)
            audio_data = self.nv_wavenet.infer(cond_input,
                                               nv_wavenet.Impl.AUTO)
            torch.cuda.empty_cache()
            # pdb.set_trace()
            for i in range(batch_size):
                audio[i, audio_start:audio_start +
                      audio_hop] = utils.mu_law_decode_numpy(
                          audio_data[i, :].cpu().numpy(), self.nv_wavenet.A)

            # advance the spectrogram and audio windows to the next clip
            spec_start += spec_hop
            audio_start += audio_hop
            batch += 1

            # need to update the wavenet embeddings so that the sound stream is continuous across clips
            #here there be demons
            # self.nv_wavenet.embedding_prev = self.nv_wavenet.embedding_curr

        #add the last section if it didn't fit
        print(' - batch %d of %d' % (batch, num_batches))
        # spec_remaining = self.spec_length - spec_start
        clip = spectrograms[:, :, spec_start:self.spec_length]

        #get audio from network
        cond_input = self.wavenet.get_cond_input(clip)
        audio_data = self.nv_wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
        torch.cuda.empty_cache()
        for i in range(batch_size):
            audio[i,
                  audio_start:self.audio_length] = utils.mu_law_decode_numpy(
                      audio_data[i, :].cpu().numpy(), self.nv_wavenet.A)

        return audio
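
All of these examples convert the network's 8-bit output back to a waveform with utils.mu_law_decode_numpy, whose definition is not part of the excerpts. A minimal numpy sketch of standard mu-law expansion, assuming the input is an array of integer indices in [0, mu_quantization):

import numpy as np

def mu_law_decode_numpy(signal, mu_quantization=256):
    # map integer indices [0, mu] back to [-1, 1], then undo the companding curve
    mu = mu_quantization - 1.0
    signal = 2.0 * (signal / mu) - 1.0
    magnitude = (1.0 / mu) * ((1.0 + mu) ** np.abs(signal) - 1.0)
    return np.sign(signal) * magnitude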
Example no. 2
def main(mel_files, model_filename, output_dir, batch_size, implementation):
    mel_files = utils.files_to_list(mel_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(mel_files, batch_size):
        mels = []
        for file_path in files:
            print(file_path)
            mel = torch.load(file_path)
            mel = utils.to_gpu(mel)
            mels.append(torch.unsqueeze(mel, 0))
        cond_input = model.get_cond_input(torch.cat(mels, 0))
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            # scale from [-1, 1] to the 16-bit integer range; MAX_WAV_VALUE is the only scaling needed
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example no. 3
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))
        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example no. 4
def main(audio_file_path, model_filename, output_path):
    model = torch.load(model_filename, map_location=torch.device('cpu'))['model']
    
    first_audio_data, _ = utils.load_wav_to_torch(audio_file_path)
    first_audio_data = first_audio_data[:10000]
    first_audio_data = utils.mu_law_encode(first_audio_data / utils.MAX_WAV_VALUE, 256)
    print("first_audio_data.shape", first_audio_data.shape)
    print("first_audio_data.shape", first_audio_data.dtype)
    audio_data = model.generate(first_samples = first_audio_data, num_samples=1000, receptive_field=6000)
    np.savetxt("audio_data.txt", audio_data.numpy().astype(int), fmt='%d')
    
    audio = utils.mu_law_decode_numpy(audio_data.cpu().numpy(), model.n_out_channels)
    audio = utils.MAX_WAV_VALUE * audio
    print("audio: ", audio)
    wavdata = audio.astype('int16')
    write(output_path, 16000, wavdata)
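
utils.load_wav_to_torch, used here to seed the generator with real samples, is another helper that is not shown. A minimal sketch, assuming it simply wraps scipy.io.wavfile.read and returns the samples as a float tensor together with the sampling rate:

import torch
from scipy.io.wavfile import read

def load_wav_to_torch(full_path):
    # read a 16-bit PCM wav file and return (FloatTensor of samples, sampling_rate)
    sampling_rate, data = read(full_path)
    return torch.from_numpy(data).float(), sampling_rate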
Example no. 5
def main(input_files,
         model_dir,
         output_dir,
         batch_size,
         implementation,
         data_config,
         audio_config,
         preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)

    audio_processor = AudioProcessor(audio_config)
    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))
        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path[0]))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
            print("Range of {}.wav after deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
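
The deemphasis call above undoes the preemphasis filter applied during mel extraction. The AudioProcessor class is not included in this excerpt; a minimal numpy sketch of the usual first-order inverse filter, assuming a preemphasis coefficient such as 0.97 (the class used here appears to work on tensors with a leading batch dimension):

from scipy.signal import lfilter

def deemphasis(audio, coef=0.97):
    # invert the preemphasis filter y[n] = x[n] - coef * x[n-1]
    return lfilter([1.0], [1.0, -coef], audio)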
Example no. 6
def SaveTestData(audioX,
                 midiX,
                 fileNum,
                 output_dir,
                 test_segment_length,
                 audio_hz,
                 midi_hz,
                 mu_law_encode=True):
    """
    Save torch tensors for inference.py
    A random segment in the piece will be chosen. The length is specified by test_segment_length

    This also plots a visualization of the midi roll, and the ground truth audio segment
    """

    fig, ax = plt.subplots()

    filename = output_dir + "/" + str(fileNum)

    # save midi tensor
    if midiX is not None:
        segment_samples = int(np.floor(midi_hz * test_segment_length))
        starting_pos = random.randint(0, midiX.shape[1] - segment_samples)
        midiX = midiX[:, starting_pos:(starting_pos + segment_samples)]
        midiX = midiX.todense()
        torch.save(torch.from_numpy(midiX), filename + ".midiX")

        # plot midi roll
        plt.cla()
        ax.spy(midiX[:89, :], markersize=3, aspect="auto", origin='lower')
        plt.savefig(filename + ".png")

    # save ground truth audio
    if audioX is not None:
        segment_samples = int(audio_hz * test_segment_length)
        audio_start_pos = int(starting_pos * (audio_hz / midi_hz))
        audioX = audioX[audio_start_pos:(audio_start_pos + segment_samples)]
        torch.save(torch.from_numpy(audioX), filename + ".audioX")

        # save ground truth audio
        if mu_law_encode:
            raw_audio = utils.mu_law_decode_numpy(audioX)
        else:
            raw_audio = audioX  # audioX is already a numpy array at this point
        raw_audio = utils.MAX_WAV_VALUE * raw_audio
        wavdata = raw_audio.astype('int16')
        write(filename + "_groundTruth.wav", 16000, wavdata)
Example no. 7
"""
Tests that the NV-WaveNet class is producing audio
"""
import torch
from scipy.io.wavfile import write
import nv_wavenet
import utils

if __name__ == '__main__':
    model = torch.load("model.pt")
    wavenet = nv_wavenet.NVWaveNet(**model)
    cond_input = torch.load("cond_input.pt")

    samples = wavenet.infer(cond_input, nv_wavenet.Impl.PERSISTENT)[0]

    audio = utils.mu_law_decode_numpy(samples.cpu().numpy(), 256)
    audio = utils.MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    write('audio.wav', 16000, wavdata)
Example no. 8
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed, checkpoint_path, log_dir, ema_decay=0.9999):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(checkpoint_path, model,
                                                                      optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True, **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False, **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1, pin_memory=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus)[0]
            else:
                reduced_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)
            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate, iteration,
                                    checkpoint_path, ema, wavenet_config)
            if (iteration % iters_per_eval == 0 and iteration > 0 and not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    model_eval = nv_wavenet.NVWaveNet(**(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(cond_input, nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid/predicted_audio_{}".format(j),
                                         predicted_audio,
                                         iteration,
                                         22050)
                        audio = utils.mu_law_decode_numpy(audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio,
                                         iteration,
                                         22050)
                        if low_memory:
                            torch.cuda.empty_cache()
            iteration += 1
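
The ExponentialMovingAverage object only needs register, update, and a shadow dict for the calls made in this loop. A minimal sketch consistent with that usage (not necessarily the class used in this repository):

class ExponentialMovingAverage:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}  # parameter name -> smoothed copy of the parameter

    def register(self, name, value):
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # new_average = decay * old_average + (1 - decay) * current value
        self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * value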
Example no. 9
                         pin_memory=False,
                         drop_last=True)

for batch in test_loader:
    # conditions, true_audio = testset[0]#batch

    x, y = batch
    true_audio = y.clone()

    y = torch.zeros_like(y)  #removing the waveform for pure inference
    x = utils.to_gpu(x).float()
    y = utils.to_gpu(y)
    x = (x, y)  # auto-regressive takes outputs as inputs
    y_pred = model(x)
    single = y_pred[0].detach().cpu()
    values, indices = single.max(0)
    indices = utils.mu_law_decode_numpy(indices.numpy(), 256)
    indices = utils.MAX_WAV_VALUE * indices
    indices = indices.astype('int16')

    true_audio = utils.mu_law_decode_numpy(true_audio[0].cpu().numpy(), 256)
    true_audio = utils.MAX_WAV_VALUE * true_audio
    true_audio = true_audio.astype('int16')

    play(indices, 16000)
    time.sleep(0.25)
    play(true_audio, 16000)
    time.sleep(1.0)

    del x, y, y_pred, single, values, indices, true_audio
    torch.cuda.empty_cache()
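
The play helper that auditions each predicted/true pair is not defined in this snippet. A minimal sketch, assuming the sounddevice package is available:

import sounddevice as sd

def play(wavdata, sample_rate):
    # blocking playback of an int16 numpy array
    sd.play(wavdata, samplerate=sample_rate)
    sd.wait()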
Example no. 10
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    print(f"receptive_field: {model.receptive_field()}")
    trainset = WavenetDataset(
        dataset_file='data/dataset.npz',
        item_length=model.receptive_field() + 1000 + model.output_length - 1,
        target_length=model.output_length,
        file_location='data/',
        test_stride=500,
    )
    print(trainset._length)
    print('the dataset has ' + str(len(trainset)) + ' items')
    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=False,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    start = time.time()
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            y, target = batch
            y = to_gpu(y).float()
            target = to_gpu(target)
            y_pred = model((None, y))
            loss = criterion(y_pred[:, :, -model.output_length:], target)
            loss.backward()
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, loss))
            print_etr(start,
                      total_iterations=(epochs - epoch_offset) *
                      len(train_loader),
                      current_iteration=epoch * len(train_loader) + i + 1)
            writer.add_scalar('Loss/train', loss, global_step=iteration)

            if (iteration % iters_per_checkpoint == 0):
                y_choice = y_pred[0].detach().cpu().transpose(0, 1)
                y_prob = F.softmax(y_choice, dim=1)
                y_prob_collapsed = torch.multinomial(y_prob,
                                                     num_samples=1).squeeze(1)
                y_pred_audio = mu_law_decode_numpy(y_prob_collapsed.numpy(),
                                                   model.n_out_channels)
                import torchaudio
                y_audio = mu_law_decode_numpy(y.cpu().numpy(), model.n_out_channels)
                torchaudio.save("test_in.wav", torch.tensor(y_audio), 16000)
                torchaudio.save("test_out.wav", torch.tensor(y_pred_audio),
                                16000)
                writer.add_audio('Audio',
                                 y_pred_audio,
                                 global_step=iteration,
                                 sample_rate=data_config['sampling_rate'])
                checkpoint_path = "{}/wavenet_{}".format(
                    output_directory, iteration)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)

            writer.flush()
            iteration += 1
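
print_etr, the estimated-time-remaining helper called in this loop, is not part of the excerpt. A minimal sketch matching the call signature used above:

import time

def print_etr(start, total_iterations, current_iteration):
    # estimate remaining wall-clock time from the average time per iteration so far
    elapsed = time.time() - start
    per_iteration = elapsed / max(current_iteration, 1)
    remaining = per_iteration * (total_iterations - current_iteration)
    print("Estimated time remaining: {:.1f} min".format(remaining / 60.0))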