Example #1
def assert_ready_for_upsampling(x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
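The assertion above encodes the invariant that the raw waveform contains exactly hop_size audio samples per local conditioning frame (Example #7 below checks the same relation when the conditioning features carry cin_pad frames of padding on each side). A minimal self-contained sketch of the check, assuming a hop size of 256 purely for illustration (the repository reads it from hparams via audio.get_hop_size()):

import numpy as np

HOP_SIZE = 256  # assumed for illustration; the repo uses audio.get_hop_size()

def check_ready_for_upsampling(x, c, hop_size=HOP_SIZE):
    # x: raw waveform of shape (T,), c: conditioning frames of shape (T // hop_size, D)
    assert len(x) % len(c) == 0 and len(x) // len(c) == hop_size

# 40 mel frames should correspond to 40 * 256 audio samples
c = np.zeros((40, 80), dtype=np.float32)
x = np.zeros(40 * HOP_SIZE, dtype=np.float32)
check_ready_for_upsampling(x, c)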
Example #2
def eval_model(global_step,
               writer,
               device,
               model,
               y,
               c,
               g,
               input_lengths,
               eval_dir,
               ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Intial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value,
            num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(initial_input,
                                          c=c,
                                          g=g,
                                          T=length,
                                          softmax=True,
                                          quantize=True,
                                          tqdm=tqdm,
                                          log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
Example #3
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          the length is determined by the feature size instead.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : Initial value for the WaveNet decoder.
        fast (bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding them to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))

    if initial_value is None:
        initial_value = P.mulaw_quantize(0)  # dummy silence
    assert initial_value >= 0 and initial_value < 256

    initial_input = np_utils.to_categorical(
        initial_value, num_classes=256).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(1, 1, 256)
    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)

    return y_hat
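A hedged usage sketch for wavegen: the feature path and model construction are illustrative assumptions, not taken from the original page; `model` is assumed to be a trained WaveNet instance restored from a checkpoint with the same hyperparameters used at training time.

import numpy as np
import librosa

# `model` is assumed to be a trained WaveNet instance (construction and checkpoint
# loading are elided here; the repository's training utilities provide them).
c = np.load("mel-feats.npy")                 # (T, D) conditional features; illustrative path
waveform = wavegen(model, c=c, fast=True)    # length is inferred from the feature size
librosa.output.write_wav("generated.wav", waveform, sr=hparams.sample_rate)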
Example #4
def eval_model(global_step,
               writer,
               device,
               student,
               teacher,
               y,
               c,
               g,
               input_lengths,
               eval_dir,
               ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        student = clone_as_averaged_model(device, student, ema)
        student.make_generation_fast_()

    student.eval()
    teacher.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # noise input
    dist = torch.distributions.normal.Normal(loc=0., scale=1.)
    z = dist.sample((1, 1, length)).to(device)

    # Run the model
    with torch.no_grad():
        student_hat, _, _, _ = student(x=z,
                                       c=c,
                                       g=g,
                                       log_scale_min=hparams.log_scale_min,
                                       device=device)
        teacher_output = teacher(student_hat, c=c, g=g, softmax=False)
        teacher_output = teacher_output.transpose(1, 2)
        teacher_hat = sample_from_gaussian(teacher_output,
                                           log_scale_min=hparams.log_scale_min)

    # if is_mulaw_quantize(hparams.input_type):
    #     y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    #     y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    #     y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    # elif is_mulaw(hparams.input_type):
    #     y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    #     y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    # else:
    #     y_hat = y_hat.view(-1).cpu().data.numpy()

    teacher_hat = teacher_hat.view(-1).cpu().data.numpy()
    student_hat = student_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_student.wav".format(global_step))
    librosa.output.write_wav(path, student_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_teacher.wav".format(global_step))
    librosa.output.write_wav(path, teacher_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, teacher_hat, y_target, student_hat)
Example #5
def collate_fn(batch):
    """Create batch

    Args:
        batch(tuple): List of tuples
            - x[0] (ndarray,int) : list of (T,)
            - x[1] (ndarray,int) : list of (T, D)
            - x[2] (ndarray,int) : list of (1,), speaker id
    Returns:
        tuple: Tuple of batch
            - x (FloatTensor) : Network inputs (B, C, T)
            - y (LongTensor)  : Network targets (B, T, 1)
    """

    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    # Time resolution adjustment
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps,
                                                 audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        max_time_frames = max_steps // audio.get_hop_size()
                        s = np.random.randint(0, len(c) - max_time_frames)
                        ts = s * audio.get_hop_size()
                        x = x[ts:ts + audio.get_hop_size() * max_time_frames]
                        c = c[s:s + max_time_frames, :]
                        assert_ready_for_upsampling(x, c)
            else:
                x, c = audio.adjust_time_resolution(x, c)
                if max_time_steps is not None and len(x) > max_time_steps:
                    s = np.random.randint(0, len(x) - max_time_steps)
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                assert len(x) == len(c)
            new_batch.append((x, c, g))
        batch = new_batch
    else:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                s = np.random.randint(0, len(x) - max_time_steps)
                if local_conditioning:
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                else:
                    x = x[s:s + max_time_steps]
            new_batch.append((x, c, g))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # pad for time-axis
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        x_batch = np.array([
            _pad_2d(
                np_utils.to_categorical(x[0],
                                        num_classes=hparams.quantize_channels),
                max_input_len, 0, padding_value) for x in batch
        ],
                           dtype=np.float32)
    else:
        x_batch = np.array(
            [_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch],
            dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        padding_value = P.mulaw_quantize(0, mu=hparams.quantize_channels)
        y_batch = np.array([
            _pad(x[0], max_input_len, constant_values=padding_value)
            for x in batch
        ],
                           dtype=np.int64)
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch],
                           dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    # Convert to channel-first, i.e. (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()
    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return x_batch, y_batch, c_batch, g_batch, input_lengths
Example #6
def get_data_loaders(dump_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0

    if hparams.max_time_steps is not None:
        max_steps = ensure_divisible(hparams.max_time_steps,
                                     audio.get_hop_size(), True)
    else:
        max_steps = None

    for phase in ["train_no_dev", "dev"]:
        train = phase == "train_no_dev"
        X = FileSourceDataset(
            RawAudioDataSource(join(dump_root, phase),
                               speaker_id=speaker_id,
                               max_steps=max_steps,
                               cin_pad=hparams.cin_pad,
                               hop_size=audio.get_hop_size()))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(join(dump_root, phase),
                                  speaker_id=speaker_id,
                                  max_steps=max_steps,
                                  cin_pad=hparams.cin_pad,
                                  hop_size=audio.get_hop_size()))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
            # make sure there are no sorting bugs (see https://github.com/r9y9/wavenet_vocoder/issues/130)
            sampler_idx = np.asarray(
                sorted(list(map(lambda s: int(s), sampler))))
            assert (sampler_idx == np.arange(len(sampler_idx),
                                             dtype=np.int64)).all()
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = data_utils.DataLoader(dataset,
                                            batch_size=hparams.batch_size,
                                            drop_last=True,
                                            num_workers=hparams.num_workers,
                                            sampler=sampler,
                                            shuffle=shuffle,
                                            collate_fn=collate_fn,
                                            pin_memory=hparams.pin_memory)

        speaker_ids = {}
        if X.file_data_source.multi_speaker:
            for idx, (x, c, g) in enumerate(dataset):
                if g is not None:
                    try:
                        speaker_ids[g] += 1
                    except KeyError:
                        speaker_ids[g] = 1
            if len(speaker_ids) > 0:
                print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
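A short usage sketch for get_data_loaders; the dump_root path is illustrative and assumes features were generated by the repository's preprocessing step.

data_loaders = get_data_loaders("./dump", speaker_id=None, test_shuffle=False)

for x, y, c, g, input_lengths in data_loaders["train_no_dev"]:
    # x: (B, C, T) inputs, y: (B, T, 1) targets,
    # c: (B, cin_channels, T') local features or None, g: (B,) speaker ids or None
    break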
Example #7
def assert_ready_for_upsampling(x, c, cin_pad):
    assert len(x) == (len(c) - 2 * cin_pad) * audio.get_hop_size()
Example #8
def _process_utterance(mel_dir,
                       linear_dir,
                       wav_dir,
                       index,
                       wav_path,
                       text,
                       hparams,
                       step_factor=1):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate * step_factor)
        if step_factor > 1: wav = wav[::step_factor]
        audio_time = len(wav) / hparams.sample_rate
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        #Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear-scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        # Pad audio signal on the right (to match librosa's framing and avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of the hop size so that transposed convolution can upsample it
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, audio_filename, mel_filename, linear_filename,
            time_steps, mel_frames, audio_time, text, len(text))
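The tuples returned above are typically collected over all utterances and written out one per line; a minimal sketch of such a writer, assuming a pipe-delimited train.txt (the file name and delimiter are assumptions, not taken from the original page):

import os

def write_metadata(metadata, out_dir, filename="train.txt"):
    # metadata: list of tuples returned by _process_utterance; None entries are skipped examples
    with open(os.path.join(out_dir, filename), "w", encoding="utf-8") as f:
        for m in metadata:
            if m is not None:
                f.write("|".join(str(field) for field in m) + "\n")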
Example #9
def eval_model(global_step,
               writer,
               device,
               model,
               y,
               c,
               g,
               input_lengths,
               eval_dir,
               ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx][:length].data.cpu().numpy()
    # print(y_target.size())

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # print(c.shape)
    # Dummy silence
    initial_value = 0.0
    print("Intial value:", initial_value)

    # (C,)
    initial_input = torch.zeros(1, 1, 80).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(initial_input,
                                          c=c,
                                          g=g,
                                          T=length,
                                          softmax=True,
                                          quantize=True,
                                          tqdm=tqdm,
                                          log_scale_min=hparams.log_scale_min)

    # save figure
    y_hat = y_hat.squeeze().cpu().data.numpy()
    y_target = np.squeeze(y_target)
    # print(y_target.size())

    path = join(eval_dir, "step{:09d}_waveplots".format(global_step))
    save_waveplot(path, c, y_hat, y_target, writer, global_step)
Example #10
def _process_utterance(out_dir, index, wav_path, text, mel_method):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav,
                                  top_db=60,
                                  frame_length=hparams.fft_size,
                                  hop_length=hparams.hop_size)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate,
                                   hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    wav = np.clip(wav, -1.0, 1.0)
    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = MelSpectrogramCreator.mel_spectrogram(wav, mel_method)

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in [
            "", "none"
    ]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("""Warning: abs max value exceeds 1.0: {}""".format(
            np.abs(wav).max()))
        # ignore this sample
        return ("dummy", "dummy", -1, "dummy")

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    mel_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    speaker_id = _get_speaker_from_path(audio_filename)
    return (audio_filename, mel_filename, N, text, speaker_id)
Example #11
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a LibriVox source, so the audio files are much longer than a
    # typical 'utterance'; split the wav into fixed-length chunks.

    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx +
                                                         1) * n_samples
        if chunk_idx == n_chunks - 1:  # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out,
                                                     hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (
            index,
            chunk_idx,
        )
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (
            index,
            chunk_idx,
        )
        text_idx = '%s - %05d' % (
            text,
            chunk_idx,
        )
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32),
                allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
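Because this variant returns a list of tuples (one per chunk) rather than a single tuple, the caller needs to flatten the results; a minimal sketch of an executor-based caller, where the input manifest format and worker count are assumptions:

from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path(items, out_dir, num_workers=4):
    # items: list of (audio_filepath, text) pairs (illustrative input format)
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = [executor.submit(partial(_process_utterance, out_dir, index, path, text))
               for index, (path, text) in enumerate(items, 1)]
    metadata = []
    for future in futures:
        metadata.extend(future.result())  # each result is a list of per-chunk tuples
    return metadata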
Example #12
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        if hparams.rescaling:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
Example #13
def _process_song(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav,
                                  top_db=60,
                                  frame_length=2048,
                                  hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate,
                                   hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    #### CLAIRE Work here
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    os.makedirs('./pwavs', exist_ok=True)
    pwav_path = './pwavs/{0}.wav'.format(wav_name)
    scipy.io.wavfile.write(pwav_path, 16000, wav)
    # make the chord directory if it does not exist
    chord_dir = "chord_dir"
    os.makedirs(chord_dir, exist_ok=True)

    # create xml file with notes and timestamps
    #subprocess.check_call(['./extract_chord_notes.sh', wav_path, chord_dir], shell=True)
    #os.system('./extract_chord_notes.sh {0} {1}'.format(pwav_path, chord_dir))
    os.system('./extract_chord_notes.sh {0} {1} > /dev/null 2>&1'.format(
        pwav_path, chord_dir))

    note_filename = '{0}/{1}.csv'.format(chord_dir, wav_name)

    #### Instead of computing the mel spectrogram, return a time series of one-hot encoded chords.
    # vector with 1 in row for each note played
    # 1000 samples per second
    note_samples = int((len(wav) / hparams.sample_rate) * 1000)
    # 12 notes per octave
    chords_time_series = np.zeros((12, note_samples))

    #print(np.shape(chords_time_series))

    with open(note_filename, newline='\n') as csvfile:
        #chordreader = csv.reader(csvfile, delimeter=',')
        chordreader = csvfile.readlines()
        #print(chordreader)
        for row in chordreader:
            row = row.split(",")
            start_time = float(row[0])
            end_time = float(row[1]) + start_time
            note = int(row[2]) % 12
            start_sample = min(note_samples - 1, int(start_time * 1000))
            end_sample = min(note_samples, int(end_time * 1000))
            try:
                chords_time_series[note][start_sample:end_sample] = 1
                # print('wav {0} start {1} end {2} note {3} num_notes {4}'.format(wav_name, start_sample, end_sample, note, note_samples))
            except Exception as e:
                print(np.shape(chords_time_series))
                # print('wav {0} start {1} end {2} note {3} num_notes {4}'.format(wav_name, start_sample, end_sample, note, note_samples))

    chords_time_series = chords_time_series.T

    # if hparams.global_gain_scale > 0:
    #     wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in [
            "", "none"
    ]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
    N = chords_time_series.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    chords_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, chords_filename),
            chords_time_series.astype(np.int16),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, chords_filename, N, text)
Example #14
    os.makedirs(dst_dir, exist_ok=True)
    dst_dir_name = basename(os.path.normpath(dst_dir))

    generated_utterances = {}
    cin_pad = hparams.cin_pad
    file_idx = 0
    for idx, (x, y, c, g, input_lengths) in enumerate(test_data_loader):
        if cin_pad > 0:
            c = F.pad(c, pad=(cin_pad, cin_pad), mode="replicate")

        # B x 1 x T
        if x[0] is not None:
            B, _, T = x.shape
        else:
            B, _, Tn = c.shape
            T = Tn * audio.get_hop_size()

        if g is None and num_utterances > 0 and B * idx >= num_utterances:
            break

        ref_files = []
        ref_feats = []
        for i in range(B):
            # Yes this is ugly...
            if hasattr(test_data_loader.dataset, "X"):
                ref_files.append(
                    test_data_loader.dataset.X.collected_files[file_idx][0])
            else:
                pass
            if hasattr(test_data_loader.dataset, "Mel"):
                ref_feats.append(
Example #15
def _process_utterance(out_dir, index, speaker_id, wav_path, mgc_path,
                       lab_path, binary_dict, continuous_dict, text):
    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    # determine sessionID and uttID
    wavbn = os.path.basename(wav_path)
    uttID = os.path.splitext(wavbn)[0]

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # time-aligned context
    if hparams.frame_shift_ms is None:
        frame_shift_in_micro_sec = (hparams.hop_size *
                                    10000000) // hparams.sample_rate
    else:
        frame_shift_in_micro_sec = hparams.frame_shift_ms * 10000
    labels = hts.HTSLabelFile(frame_shift_in_micro_sec)
    labels.load(lab_path)
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        frame_shift_in_micro_sec=frame_shift_in_micro_sec)

    Nwav = len(out) // audio.get_hop_size()
    out = out[:Nwav * audio.get_hop_size()]

    timesteps = len(out)

    with open(mgc_path, "rb") as fp:
        mgc = np.fromfile(fp, np.float32, -1) - np.log(32768)
    N = len(mgc) // hparams.num_mels
    mgc = np.reshape(mgc, (N, hparams.num_mels))
    c0 = audio._normalize(audio._amp_to_db(np.exp(mgc[0:Nwav, 0:1])))

    # combine linguistic + c0
    context = np.hstack((linguistic_features, c0))

    # Write the spectrograms to disk:
    audio_filename = 'audio-' + uttID + '.npy'
    context_filename = 'context-' + uttID + '.npy'
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, context_filename),
            context.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, context_filename, timesteps, text, speaker_id)
Example #16
    def thread_main(self, sess):
        stop = False
        while not stop:
            iterator = load_npy_data(self.metadata_filename, self.npy_dataroot, self.speaker_id)
            for wav, local_condition, global_condition in iterator:
                if self.coord.should_stop():
                    stop = True
                    break

                # force to align the audio and local_condition
                # if audio.shape[0] > local_condition.shape[0]:
                #     audio = audio[:local_condition.shape[0], :]
                # else:
                #     local_condition = local_condition[:audio.shape[0], :]

                # audio = np.pad(audio, [[self.receptive_field, 0], [0, 0]], mode='constant')
                # local_condition = np.pad(local_condition, [[self.receptive_field, 0], [0, 0]], mode='constant')
                # if self.sample_size:
                #     while len(audio) > self.receptive_field:
                #         audio_piece = audio[:(self.receptive_field + self.sample_size), :]
                #         audio = audio[self.sample_size:, :]
                #
                #         local_condition_piece = local_condition[:(self.receptive_field + self.sample_size), :]
                #         local_condition = local_condition[self.sample_size:, :]
                #
                #         if self.gc_enable:
                #             sess.run(self.enqueue, feed_dict=
                #             dict(zip(self._placeholders, (audio_piece, local_condition_piece, global_condition))))
                #         else:
                #             sess.run(self.enqueue, feed_dict=
                #             dict(zip(self._placeholders, (audio_piece, local_condition_piece))))
                # else:
                #     if self.gc_enable:
                #         sess.run(self.enqueue, feed_dict=dict(zip(
                #             self._placeholders, (audio, local_condition, global_condition))))
                #     else:
                #         sess.run(self.enqueue, feed_dict=dict(zip(self._placeholders, (audio, local_condition))))

                if hparams.upsample_conditional_features:
                    wav = wav.reshape(-1, 1)
                    assert_ready_for_upsampling(wav, local_condition)
                    if self.sample_size is not None:
                        sample_size = ensure_divisible(self.sample_size, audio.get_hop_size(), True)
                        if wav.shape[0] > sample_size:
                            max_frames = sample_size // audio.get_hop_size()
                            s = np.random.randint(0, len(local_condition) - max_frames)
                            ts = s * audio.get_hop_size()
                            wav = wav[ts:ts + audio.get_hop_size() * max_frames, :]
                            local_condition = local_condition[s:s + max_frames, :]
                            if self.gc_enable:
                                sess.run(self.enqueue, feed_dict=dict(zip(
                                    self._placeholders, (wav, local_condition, global_condition)
                                )))
                            else:
                                sess.run(self.enqueue, feed_dict=dict(zip(
                                    self._placeholders, (wav, local_condition)
                                )))
                else:
                    wav, local_condition = audio.adjust_time_resolution(wav, local_condition)
                    wav = wav.reshape(-1, 1)
                    if self.sample_size is not None:
                        while wav.shape[0] > self.sample_size:
                            wav_piece = wav[:(self.receptive_field + self.sample_size), :]
                            local_condition_piece = local_condition[:(self.receptive_field + self.sample_size), :]
                            # Advance past the consumed sample_size samples
                            # (mirrors the commented-out reference code above)
                            wav = wav[self.sample_size:, :]
                            local_condition = local_condition[self.sample_size:, :]
                            assert len(wav_piece) == len(local_condition_piece)

                            if self.gc_enable:
                                sess.run(self.enqueue, feed_dict=dict(zip(
                                            self._placeholders, (wav_piece, local_condition_piece, global_condition))))
                            else:
                                sess.run(self.enqueue, feed_dict=dict(zip(
                                    self._placeholders, (wav_piece, local_condition_piece))))