Example #1
    def get_features(self, y, sample_rate):
        """Feature extraction

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features
        """
        # scale the audio signal between -1 and 1 before creating the
        # shennong audio object: when pyannote uses data augmentation it
        # normalizes the signal, but it does not when loading the data
        # without augmentation, so normalize here for consistency
        y = y / np.max((-np.min(y), np.max(y)))

        # create audio object for shennong
        audio = Audio(data=y, sample_rate=sample_rate)

        # create processor
        processor = BottleneckProcessor(weights=self.weights)

        # define parameters

        #processor.frame_length = self.duration
        #processor.frame_shift = self.step

        # extract features
        bottleneck = processor.process(audio)

        # Compute Pitch
        if self.with_pitch:
            # extract pitch
            pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

            ## concatenate bottleneck features with pitch - Kaldi sometimes
            ## adds one frame to the pitch, so allow 2 frames of tolerance
            #bottleneck = bottleneck.concatenate(pitch, 2)
            bottleneck = self.concatenate_with_pitch(bottleneck.data,
                                                     pitch.data)
            ## add 1 frame at the beginning and 1 at the end to ensure
            ## we have the same length as the MFCCs etc.
            bottleneck = np.insert(bottleneck,
                                   0,
                                   np.zeros((1, bottleneck.shape[1])),
                                   axis=0)
            bottleneck = np.insert(bottleneck,
                                   bottleneck.shape[0],
                                   np.zeros((1, bottleneck.shape[1])),
                                   axis=0)
        else:
            bottleneck = bottleneck.data

        return bottleneck
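
Outside its class, the core of this method is a short pipeline. A minimal standalone sketch, assuming y is a 1-D NumPy waveform sampled at 16 kHz and skipping the pitch branch:

import numpy as np
from shennong.audio import Audio
from shennong.features.processor.bottleneck import BottleneckProcessor

# scale the waveform to [-1, 1], as in the method above
y = y / np.max((-np.min(y), np.max(y)))

# wrap it for shennong and extract (n_frames, 80) bottleneck features
audio = Audio(data=y, sample_rate=16000)
bottleneck = BottleneckProcessor(weights='BabelMulti').process(audio).data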
Example #2
def test_weights(weights):
    # make sure all the pretrained weights are here and contain the
    # required entries
    proc = BottleneckProcessor(weights=weights)
    assert proc.weights == weights
    w = proc._get_weights()
    assert list(w.keys()) == [
        'bn_std', 'input_mean', 'b2', 'b5', 'input_std', 'W5', 'W7', 'W6',
        'b6', 'b7', 'W3', 'W2', 'context', 'b3', 'bn_mean', 'W1', 'b1'
    ]
Example #3
def test_bad_params():
    w = 'BadWeights'
    with pytest.raises(ValueError) as err:
        BottleneckProcessor(w)
    assert 'invalid weights' in str(err.value)

    b = BottleneckProcessor()
    with pytest.raises(ValueError) as err:
        b.set_params(**{'weights': w})
    assert 'invalid weights' in str(err.value)
Example #4
def transform_all_wavs(folder_wav, weights, folder_out):  # outputs arrays of shape [time x dim]
    processor = BottleneckProcessor(weights=weights)
    count = 0
    for file in os.listdir(folder_wav):
        if count % 500 == 0:
            print(count)
        count += 1
        if not file.endswith('.wav'):
            continue
        audio = Audio.load(os.path.join(folder_wav, file))

        features = processor.process(audio)
        #print(features.shape)
        #print(features)
        np.savetxt(fname=os.path.join(folder_out, file[:-4] + '.csv'),
                   X=features.data)
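
A hypothetical invocation (the folder names here are placeholders, not from the original script):

# write one CSV of bottleneck features per .wav file in the input folder
transform_all_wavs('stimuli_wav', 'BabelMulti', 'stimuli_feats')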
Example #5
def test_process(capsys, audio, mfcc, weights):
    get_logger(level='debug')

    proc = BottleneckProcessor(weights=weights)
    feat = proc.process(audio)
    assert feat.shape == (140, 80)
    assert feat.shape[1] == proc.ndims
    assert np.allclose(feat.times, mfcc.times)
    assert proc.frame_length == 0.025
    assert proc.frame_shift == 0.01
    assert proc.sample_rate == 8000

    # check the log messages
    captured = capsys.readouterr().err
    assert 'resampling audio from 16000Hz@16b to 8000Hz@16b' in captured
    assert '{} frames of speech detected (on 140 total frames)'.format(
        '118' if audio._sox_binary else '121') in captured
Example #6
def test_silence():
    silence = Audio(np.zeros((100, )), 16000)

    with pytest.raises(RuntimeError) as err:
        BottleneckProcessor().process(silence)
    assert 'no voice detected in signal' in str(err.value)

    # silence VAD all false
    vad = _compute_vad(silence.data, null_logger(), bugfix=True)
    assert not vad.any()
Example #7
def test_params(weights):
    p = {'weights': weights, 'dither': 0.1}
    assert BottleneckProcessor(**p).get_params() == p

    b = BottleneckProcessor()
    assert b.weights == 'BabelMulti'
    b.set_params(**p)
    assert b.get_params() == p
    assert b.weights == weights
Example #8
def get_features(sound_file, chosen_processor):
    """Compute the feature coefficients of a sound file.

    :param sound_file: path to a sound file in .wav format
    :param chosen_processor: name of the processor; one of 'filterbank',
        'plp', 'rastaplp' or 'bottleneck'
    :returns: feature coefficients per frame of 25ms, computed every 10ms
    :rtype: pandas.DataFrame
    """
    audio = Audio.load(sound_file)
    processors = {
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')
    }

    # look up the chosen processor by name and run it on the audio
    features = processors[chosen_processor].process(audio)
    features = pd.DataFrame(features.data)
    return features
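
For example (the filename is a placeholder):

# bottleneck features for one utterance, one row per 10 ms frame
feats = get_features('utterance.wav', 'bottleneck')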
Example #9
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    wav_file = parser.parse_args().wav
    audio = Audio.load(wav_file)

    # initialize features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=audio.sample_rate),
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'mfcc': MfccProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # compute the features for all processors
    features = {k: v.process(audio) for k, v in processors.items()}

    # plot the audio signal and the resulting features
    fig, axes = plt.subplots(
        nrows=len(processors)+1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})
    time = np.arange(0.0, audio.nsamples) / audio.sample_rate
    axes[0].plot(time, audio.astype(np.float32).data)
    axes[0].set_xlim(0.0, audio.duration)
    axes[0].text(
        0.02, 0.8, 'audio',
        bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
        transform=axes[0].transAxes)

    for n, (k, v) in enumerate(features.items(), start=1):
        axes[n].imshow(v.data.T, aspect='auto')
        axes[n].text(
            0.02, 0.8, k,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axes[n].transAxes)

    plt.show()
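
The entry-point guard is not shown in this snippet; a script like this would typically end with:

if __name__ == '__main__':
    main()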
Example #10
        l = shortest_path_position[1][0]

    # divide the shortest distance by the length of the path
    average_distance = (distance_matrix[vector_1.shape[0]-1][vector_2.shape[0]-1]) \
                        / path_length
    return average_distance


all_features = {}

# get bottleneck features of all .wav files (stimuli)
processor = BottleneckProcessor(weights='BabelMulti')
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            features = processor.process(audio)
            utterance = wav_file.split('.')[0]
            all_features[utterance] = features.data

for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')
    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = calculate_distances_dtw(bottle_oth, bottle_x)
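
calculate_distances_dtw is only partially visible in the fragment above. A minimal sketch consistent with the visible tail (total cost along the best alignment path divided by a path length) might look like this; the euclidean metric and the n + m normalization are assumptions:

import numpy as np
from scipy.spatial.distance import cdist

def calculate_distances_dtw(vector_1, vector_2):
    # frame-to-frame distance matrix between the two feature sequences
    dist = cdist(vector_1, vector_2)
    n, m = dist.shape

    # accumulated cost with the standard three-way DTW recursion
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost[i, j] = dist[i - 1, j - 1] + min(
                cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])

    # the original divides by the optimal path length; n + m is a
    # simple stand-in for that normalization
    return cost[n, m] / (n + m)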
Example #11
def test_compare_original(audio_8k, bottleneck_original):
    feat = BottleneckProcessor(weights='BabelMulti',
                               dither=0).process(audio_8k)
    assert bottleneck_original.shape == feat.shape
    assert bottleneck_original == pytest.approx(feat.data, abs=2e-2)
Example #12
def test_available_weights():
    weights = BottleneckProcessor.available_weights()
    assert len(weights) == 3
    for w in ('BabelMulti', 'FisherMono', 'FisherTri'):
        assert w in weights
        assert os.path.isfile(weights[w])
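
Since available_weights() maps each pretrained network name to its weight file on disk, the three models can be listed directly:

# print the name and weight file of each pretrained network
for name, path in BottleneckProcessor.available_weights().items():
    print(name, '->', path)  # BabelMulti, FisherMono, FisherTri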
Example #13
                    '/encoder_' + str(num_phones) + \
                    '_' + lang + '.pkl', 'wb'))
    pickle.dump(kmeans, open(str(num_phones) + '_' + lang + \
                    '/kmeans_' + str(num_phones) + \
                    '_' + lang + '.pkl', 'wb'))


all_features = {}
# get bottleneck features of all .wav files (stimuli)
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            all_features[wav_file] = audio

processor = BottleneckProcessor(weights='BabelMulti')
corpus_features = processor.process_all(all_features)

open_feats = []
for key in corpus_features:
    # access every features object
    feats = corpus_features[key].data
    # put them all together
    open_feats.append(feats)

# stack the per-utterance feature matrices into one (n_frames, n_dims) array
flattened_feats = np.concatenate(open_feats)
print(flattened_feats.shape)
Example #14
import argparse
import glob
import os
import pickle
import numpy as np
import pandas as pd
from pathlib import Path

from shennong.audio import Audio
from shennong.features.processor.mfcc import MfccProcessor
from shennong.features.postprocessor.delta import DeltaPostProcessor
from shennong.features.processor.bottleneck import BottleneckProcessor

mfcc_processor  = MfccProcessor(sample_rate=8000)
delta_processor = DeltaPostProcessor(order=2)
bnf_processor   = BottleneckProcessor(weights='BabelMulti')

parser = argparse.ArgumentParser(
    description='example: python wav_to_shennong-feats.py mfcc wrm-pd',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

parser.add_argument('features',
                    help='features to extract using the Shennong library '
                         '(mfcc or bnf), use _all_ for both')
parser.add_argument('dataset',
                    help='name of dataset, use _all_ to iterate over all')

parser.add_argument('--feats_dir', default='data/interim/features',
                    help='directory for features')
parser.add_argument('--datasets_dir', default='data/raw/datasets',
                    help='directory for raw datasets and labels files')

parser.add_argument('--queries_dir', default='queries',
                    help='directory with .wav files for queries')
parser.add_argument('--references_dir', default='references',
                    help='directory with .wav files for references')
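
The script is truncated at this point. Below is a hedged sketch of how the parsed arguments and the three processors defined above might be wired together; the directory layout, the output format and the mfcc/bnf dispatch are assumptions, not the original code:

args = parser.parse_args()

def extract(wav_path, feature_type):
    # assumed dispatch: 'mfcc' -> MFCC + deltas, 'bnf' -> bottleneck
    audio = Audio.load(wav_path)
    if feature_type == 'mfcc':
        return delta_processor.process(mfcc_processor.process(audio)).data
    return bnf_processor.process(audio).data

# assumed layout: <datasets_dir>/<dataset>/<queries_dir>/*.wav
wav_dir = os.path.join(args.datasets_dir, args.dataset, args.queries_dir)
for wav in glob.glob(os.path.join(wav_dir, '*.wav')):
    out = Path(args.feats_dir) / (Path(wav).stem + '.npy')
    np.save(out, extract(wav, args.features))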