Example #1
def embed_utterance(wav,
                    using_partials=True,
                    return_partials=False,
                    model=None,
                    **kwargs):
    """
    Computes an embedding for a single utterance.
    
    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split into partial utterances of 
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their 
    normalized average. If False, the utterance embedding is instead computed by feeding the entire 
    spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the 
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape 
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 
    returned. If <using_partials> is simultaneously set to False, both these values will be None 
    instead.
    """
    # Process the entire utterance if not using partials
    if model is None:
        print("\n\n\n\n\nDidn't find model, will use preloaded.\n\n\n\n\n\n")
        model = _model
    if not using_partials:
        frames = audio.wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(frames[None, ...], model)[0]
        if return_partials:
            return embed, None, None
        return embed

    # Compute where to split the utterance into partials and pad if necessary
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    frames = audio.wav_to_mel_spectrogram(wav)
    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch, model)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed
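
A minimal usage sketch for the function above, assuming it sits in the encoder inference module of the CorentinJ/Real-Time-Voice-Cloning codebase (where load_model and audio.preprocess_wav live); the weight and audio paths are hypothetical.

import numpy as np
from encoder import audio, inference

# Hypothetical paths; load_model is assumed to populate the preloaded _model used above.
inference.load_model("encoder.pt")
wav = audio.preprocess_wav("speaker_001.wav")
embed = inference.embed_utterance(wav)        # shape: (model_embedding_size,)
print(embed.shape, np.linalg.norm(embed))     # unit-norm float32 vector
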
Example #2
def computeEmbedding(wav, **kwargs):
    '''
    This method computes the embedding vector for the wav parameter.

    PARAMS:
    wav: the preprocessed wav file for which the e-vector (embedding) will be calculated

    RETURNS:
    the embedding of the wav object 
    '''

    # If the last slice stops at or beyond the end of the wav,
    # we must zero-pad the wav
    wSlices, mSlices = computeSlices(len(wav), **kwargs)
    lastSliceStop = wSlices[-1].stop
    if lastSliceStop >= len(wav):
        wav = np.pad(wav, (0, lastSliceStop - len(wav)), "constant")
    # compute the mel spectrogram of the wav
    frames = audio.wav_to_mel_spectrogram(wav)
    # group the frames of each mel slice into a batch that will be fed to the network
    framesInBatches = np.array([frames[s] for s in mSlices])
    # each row of partialEmbeddings is the e-vector of one partial utterance
    partialEmbeddings = computeEmbeddingForBatch(framesInBatches)
    # The embedding of the complete utterance is the L2-normalized average of the partial embeddings
    averageEmbedding = np.mean(partialEmbeddings, axis=0)
    embed = averageEmbedding / np.linalg.norm(averageEmbedding, 2)
    return embed
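
To make the aggregation step concrete, here is a self-contained NumPy sketch of what the last two lines compute: the utterance embedding is the L2-normalized mean of the partial embeddings (the shapes are illustrative, not taken from the code above).

import numpy as np

# 10 partial utterances with 256-dimensional e-vectors (illustrative shapes)
partialEmbeddings = np.random.rand(10, 256).astype(np.float32)
averageEmbedding = partialEmbeddings.mean(axis=0)
embed = averageEmbedding / np.linalg.norm(averageEmbedding, 2)
print(embed.shape, float(np.linalg.norm(embed)))  # (256,) ~1.0
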
Example #3
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {
                        line.split(",")[0]
                        for line in sources_file
                    }
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            try:
                wav = audio.preprocess_wav(in_fpath)
            except ValueError as e:
                # When loading VoxCeleb2, this error gets raised:
                # ValueError("frames must be specified for non-seekable files")
                print(f"skipping loading of {in_fpath}, because: {str(e)}")
                continue
            if len(wav) == 0:
                continue

            print(f"Processing {in_fpath}...")
            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()
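
A short sketch of how the output of preprocess_speaker could be consumed: each saved .npy file holds a mel spectrogram of shape (n_frames, mel_n_channels), and _sources.txt maps it back to its source audio file. The directory name below is hypothetical.

import numpy as np
from pathlib import Path

# Hypothetical output directory produced by preprocess_speaker
speaker_out_dir = Path("SV2TTS/encoder/LibriSpeech_train-clean-100_19")
for npy_fpath in sorted(speaker_out_dir.glob("*.npy")):
    frames = np.load(npy_fpath)            # (n_frames, mel_n_channels)
    print(npy_fpath.name, frames.shape)
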
Пример #4
0
    def embed_utterance(self,
                        wav: np.ndarray,
                        return_partials=False,
                        rate=1.3,
                        min_coverage=0.75):
        """
        Computes an embedding for a single utterance. The utterance is divided into partial
        utterances and an embedding is computed for each. The complete utterance embedding is the
        L2-normed average embedding of the partial utterances.

        TODO: independent batched version of this function

        :param wav: a preprocessed utterance waveform as a numpy array of float32
        :param return_partials: if True, the partial embeddings will also be returned along with
        the wav slices corresponding to each partial utterance.
        :param rate: how many partial utterances should occur per second. Partial utterances must
        cover the span of the entire utterance, thus the rate should not be lower than the inverse
        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
        the minimum rate is thus 0.625.
        :param min_coverage: when reaching the last partial utterance, it may or may not have
        enough frames. If at least <min_coverage> of <partials_n_frames> are present,
        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
        it will be discarded. If there aren't enough frames for one partial utterance,
        this parameter is ignored so that the function always returns at least one slice.
        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
        <return_partials> is True, the partial utterances as a numpy array of float32 of shape
        (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
        returned.
        """
        # Compute where to split the utterance into partials and pad the waveform with zeros if
        # the partial utterances cover a larger range.
        wav_slices, mel_slices = self.compute_partial_slices(
            len(wav), rate, min_coverage)
        max_wave_length = wav_slices[-1].stop
        if max_wave_length >= len(wav):
            wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

        # Split the utterance into partials and forward them through the model
        mel = audio.wav_to_mel_spectrogram(wav)
        mels = np.array([mel[s] for s in mel_slices])
        with torch.no_grad():
            mels = torch.from_numpy(mels).to(self.device)
            partial_embeds = self(mels).cpu().numpy()

        # Compute the utterance embedding from the partial embeddings
        raw_embed = np.mean(partial_embeds, axis=0)
        embed = raw_embed / np.linalg.norm(raw_embed, 2)

        if return_partials:
            return embed, partial_embeds, wav_slices
        return embed
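
The signature above matches VoiceEncoder.embed_utterance from the resemblyzer package; assuming that is its origin, a usage sketch could look like this (the audio path is hypothetical).

from resemblyzer import VoiceEncoder, preprocess_wav

encoder = VoiceEncoder()                    # picks CUDA when available
wav = preprocess_wav("speaker_001.wav")     # hypothetical input file
embed, partial_embeds, wav_slices = encoder.embed_utterance(wav, return_partials=True)
print(embed.shape, partial_embeds.shape, len(wav_slices))
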
Example #5
def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path,
                        skip_existing: bool):
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier, check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = set()
    else:
        existing_fnames = set()

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    audio_durs = []
    for extension in _AUDIO_EXTENSIONS:
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
            audio_durs.append(len(wav) / sampling_rate)

    sources_file.close()

    return audio_durs
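
One hypothetical way to drive _preprocess_speaker over every speaker of a dataset; the dataset layout, thread pool, and worker count are assumptions, not taken from the code above.

from functools import partial
from multiprocessing.pool import ThreadPool
from pathlib import Path

datasets_root = Path("datasets")                  # hypothetical dataset root
out_dir = Path("SV2TTS/encoder")                  # hypothetical output directory
out_dir.mkdir(parents=True, exist_ok=True)
speaker_dirs = list(datasets_root.glob("LibriSpeech/train-clean-100/*"))

work_fn = partial(_preprocess_speaker, datasets_root=datasets_root,
                  out_dir=out_dir, skip_existing=True)
with ThreadPool(4) as pool:
    # Each call returns the list of utterance durations for one speaker
    all_durs = [d for durs in pool.imap(work_fn, speaker_dirs) for d in durs]
print("%d utterances preprocessed" % len(all_durs))
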
Example #6
def preprocess_speaker(args):
    # NOTE: speaker_dir, datasets_root, out_dir, skip_existing and logger are
    # expected to come from the enclosing scope.
    speaker, files = args
    # Give a name to the speaker that includes its dataset
    parts = list(speaker_dir.relative_to(datasets_root).parts)
    parts.append(speaker)
    speaker_name = "_".join(parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier, check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = set()
    else:
        existing_fnames = set()

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    for in_fpath, out_fname in files:
        # Check if the target output file already exists
        if skip_existing and out_fname in existing_fnames:
            continue

        # Load and preprocess the waveform
        try:
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue
        except Exception as e:
            print(f"Error preprocessing {in_fpath}: {e}")
            continue

        # Create the mel spectrogram, discard those that are too short
        frames = audio.wav_to_mel_spectrogram(wav)
        if len(frames) < partials_n_frames:
            continue

        out_fpath = speaker_out_dir.joinpath(out_fname)
        np.save(out_fpath, frames)
        logger.add_sample(duration=len(wav) / sampling_rate)
        sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    sources_file.close()
Example #7
def preprocess(in_fpath, out_fpath, parent_path):

    source_text = parent_path / "_sources.txt"
    sources_file = source_text.open("w")

    # Load and preprocess the waveform
    wav = audio.preprocess_wav(in_fpath)
    if len(wav) == 0:
        print("empty audio file")
    
    # Create the mel spectrogram, discard those that are too short
    frames = audio.wav_to_mel_spectrogram(wav)
    if len(frames) < partials_n_frames:
        print("{} < {}, number of frames is less than partials_n_frames".format(len(frames), partials_n_frames))

    np.save(out_fpath, frames)
    sources_file.write("%s,%s\n" % (out_fpath.name + '.npy', in_fpath.name))
    
    sources_file.close()

    return frames
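
A hypothetical call to the single-file preprocess above; all paths are assumptions. Note that np.save appends the .npy suffix when out_fpath lacks one, which is why the sources line adds it explicitly.

from pathlib import Path

parent_path = Path("SV2TTS/encoder/speaker_001")        # hypothetical output directory
parent_path.mkdir(parents=True, exist_ok=True)
frames = preprocess(Path("samples/utterance_01.wav"),   # hypothetical input file
                    parent_path / "utterance_01",
                    parent_path)
print(frames.shape)                                     # (n_frames, mel_n_channels)
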
Example #8
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split into partial utterances of
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
    normalized average. If False, the utterance embedding is instead computed by feeding the entire
    spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned. If <using_partials> is simultaneously set to False, both these values will be None
    instead.
    """

    # Extract raw audio patches for fCNN

    # Process the entire utterance if not using partials
    if not using_partials:
        if (mel_n_channels == 40):
            frames = audio.wav_to_mel_spectrogram(wav)
        else:
            win = np.hamming(int(sampling_rate * 0.02))
            inc = int(win.shape[0] / 2)
            frames = get_frame_from_file(wav,
                                         win=win,
                                         inc=inc,
                                         sr=sampling_rate,
                                         n_channels=1,
                                         duration=None)
            frames = np.transpose(frames)
        embed = embed_frames_batch(frames[None, ...], **kwargs)[0]
        if return_partials:
            return embed, None, None
        return embed

    # Compute where to split the utterance into partials and pad if necessary
    # min_pad_coverage can be raised to 1.0 to drop a final slice shorter than
    # partials_n_frames; the default 0.75 is kept below

    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    n_frames = int(np.ceil((len(wav) + 1) / samples_per_frame))

    if (n_frames < partials_n_frames):
        print('Audio too short! Skipping...')
        embed = None
        partial_embeds = None
        wave_slices = None
        if return_partials:
            return embed, partial_embeds, wave_slices
        return embed

    wave_slices, mel_slices = compute_partial_slices(len(wav),
                                                     min_pad_coverage=0.75)

    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    if (mel_n_channels == 40):
        frames = audio.wav_to_mel_spectrogram(wav)
    else:
        win = np.hamming(int(sampling_rate * 0.02))
        inc = int(win.shape[0] / 2)
        frames = get_frame_from_file(wav,
                                     win=win,
                                     inc=inc,
                                     sr=sampling_rate,
                                     n_channels=1,
                                     duration=None)
        frames = np.transpose(frames)
        if (
                frames.shape[0] < mel_slices[-1].stop
        ):  # This ensures that the number of frames in 'frames' corresponds to the expected melshapes
            pad_len = mel_slices[-1].stop - frames.shape[0]
            frames = np.concatenate((frames, frames[-1 * pad_len:]), axis=0)

    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch, **kwargs)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed
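
The early-exit length check above can be reproduced in isolation. The constants below are the usual defaults of this encoder (16 kHz audio, 10 ms mel hop, 160-frame partials) and are assumptions here, not values read from this code.

import numpy as np

sampling_rate = 16000        # Hz (assumed default)
mel_window_step = 10         # ms between successive mel frames (assumed default)
partials_n_frames = 160      # frames per partial utterance (assumed default)

samples_per_frame = int(sampling_rate * mel_window_step / 1000)   # 160 samples
wav_len = int(1.2 * sampling_rate)                                 # a 1.2 s clip
n_frames = int(np.ceil((wav_len + 1) / samples_per_frame))         # 121 frames
print(n_frames < partials_n_frames)   # True -> this clip would be skipped
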