def get_features(self, y, sample_rate):
    """Extract bottleneck features, optionally stacked with pitch.

    Parameters
    ----------
    y : (n_samples, 1) numpy array
        Waveform
    sample_rate : int
        Sample rate

    Returns
    -------
    data : (n_frames, n_dimensions) numpy array
        Features
    """
    # Scale the audio signal between -1 and 1 before creating the shennong
    # audio object: pyannote normalizes the signal when it uses data
    # augmentation but not otherwise, so normalize here in every case.
    peak = np.max((-np.min(y), np.max(y)))
    # An all-zero (silent) signal has a null peak; dividing by it would turn
    # every sample into NaN, so such a signal is left unscaled.
    y = y / (peak if peak > 0 else 1.0)

    audio = Audio(data=y, sample_rate=sample_rate)
    processor = BottleneckProcessor(weights=self.weights)
    bottleneck = processor.process(audio)

    if not self.with_pitch:
        return bottleneck.data

    pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)
    bottleneck = self.concatenate_with_pitch(bottleneck.data, pitch.data)

    # Add one zero frame at the beginning and one at the end so the output
    # has the same length as the other features (mfcc etc.).
    return np.pad(bottleneck, ((1, 1), (0, 0)), mode='constant')
def test_weights(weights):
    """Check the pretrained weights load and contain the required entries."""
    expected_entries = [
        'bn_std', 'input_mean', 'b2', 'b5', 'input_std', 'W5', 'W7',
        'W6', 'b6', 'b7', 'W3', 'W2', 'context', 'b3', 'bn_mean', 'W1', 'b1',
    ]

    processor = BottleneckProcessor(weights=weights)
    assert processor.weights == weights

    loaded = processor._get_weights()
    assert list(loaded.keys()) == expected_entries
def test_bad_params():
    """An unknown weights name raises ValueError at init and in set_params."""
    bad_name = 'BadWeights'

    # rejected by the constructor
    with pytest.raises(ValueError) as excinfo:
        BottleneckProcessor(bad_name)
    assert 'invalid weights' in str(excinfo.value)

    # rejected as well when set on an existing processor
    processor = BottleneckProcessor()
    with pytest.raises(ValueError) as excinfo:
        processor.set_params(weights=bad_name)
    assert 'invalid weights' in str(excinfo.value)
def transform_all_wavs(folder_wav, type, folder_out):
    """Compute bottleneck features for each .wav of a folder and save them.

    Parameters
    ----------
    folder_wav : str
        Folder listed for ``.wav`` files (other entries are ignored).
    type : str
        Value passed as ``weights`` to ``BottleneckProcessor``.
    folder_out : str
        Folder receiving one ``<name>.csv`` file per input, written with
        ``np.savetxt`` (presumably a time x dim matrix, as noted by the
        original author).
    """
    processor = BottleneckProcessor(weights=type)
    suffix = '.wav'

    # the progress counter covers every directory entry, wav or not
    for index, entry in enumerate(os.listdir(folder_wav)):
        if index % 500 == 0:
            print(index)

        if entry.endswith(suffix):
            audio = Audio.load(os.path.join(folder_wav, entry))
            features = processor.process(audio)
            target = os.path.join(folder_out, entry[:-len(suffix)] + '.csv')
            np.savetxt(fname=target, X=features._data)
def test_process(capsys, audio, mfcc, weights):
    """Check features shape, times, processor settings and debug log output."""
    get_logger(level='debug')

    processor = BottleneckProcessor(weights=weights)
    features = processor.process(audio)

    assert features.shape == (140, 80)
    assert features.shape[1] == processor.ndims
    assert np.allclose(features.times, mfcc.times)

    assert processor.frame_length == 0.025
    assert processor.frame_shift == 0.01
    assert processor.sample_rate == 8000

    # check the log messages: the expected count of speech frames depends on
    # whether the audio was resampled with the sox binary
    log = capsys.readouterr().err
    assert 'resampling audio from 16000Hz@16b to 8000Hz@16b' in log

    speech_frames = '118' if audio._sox_binary else '121'
    expected = '{} frames of speech detected (on 140 total frames)'.format(
        speech_frames)
    assert expected in log
def test_silence():
    """Silent audio is rejected by the processor and has an all-false VAD."""
    silent_audio = Audio(np.zeros(100), 16000)

    with pytest.raises(RuntimeError) as excinfo:
        BottleneckProcessor().process(silent_audio)
    assert 'no voice detected in signal' in str(excinfo.value)

    # the voice activity detection must not flag any frame on silence
    activity = _compute_vad(silent_audio.data, null_logger(), bugfix=True)
    assert not activity.any()
def test_params(weights):
    """Parameters round-trip through the constructor and through set_params."""
    p = {'weights': weights, 'dither': 0.1}

    # parameters given at construction are reported back unchanged
    assert BottleneckProcessor(**p).get_params() == p

    # a default processor uses the 'BabelMulti' weights
    b = BottleneckProcessor()
    assert b.weights == 'BabelMulti'

    # Check the instance actually updated by set_params: asserting on a
    # freshly constructed processor here would only repeat the first check
    # and leave set_params untested.
    b.set_params(**p)
    assert b.get_params() == p
    assert b.weights == weights
def get_features(sound_file, chosen_processor):
    """Compute the feature coefficients of a sound file.

    :param sound_file: path of the sound file, in .wav format
    :param chosen_processor: name of the features to extract, one of
        'filterbank', 'plp', 'rastaplp' or 'bottleneck'. An already built
        processor object (anything exposing ``process(audio)``) is accepted
        as well and used as is.
    :returns: the features wrapped in a pandas DataFrame (the original
        author documents one frame of 25ms every 10ms)
    :raises ValueError: if ``chosen_processor`` is an unknown name
    """
    audio = Audio.load(sound_file)

    if isinstance(chosen_processor, str):
        # Build only the requested processor: each entry is a factory so the
        # other processors (and the bottleneck weights) are not instantiated
        # for nothing.
        factories = {
            'filterbank': lambda: FilterbankProcessor(
                sample_rate=audio.sample_rate),
            'plp': lambda: PlpProcessor(sample_rate=audio.sample_rate),
            'rastaplp': lambda: RastaPlpProcessor(
                sample_rate=audio.sample_rate),
            'bottleneck': lambda: BottleneckProcessor(weights='BabelMulti'),
        }
        if chosen_processor not in factories:
            raise ValueError(
                'unknown processor {!r}, must be one of {}'.format(
                    chosen_processor, ', '.join(sorted(factories))))
        processor = factories[chosen_processor]()
    else:
        # backward compatibility: callers passing a processor instance
        processor = chosen_processor

    features = processor.process(audio)
    # NOTE(review): the features object is handed to pandas unchanged, as in
    # the original code; confirm whether `features.data` should be used.
    return pd.DataFrame(features)
def main():
    """Plot a wav file's signal above the features of each processor.

    The wav path is read from the command line; one subplot is drawn for the
    audio signal and one per features processor, then the figure is shown.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    audio = Audio.load(cli.parse_args().wav)
    rate = audio.sample_rate

    # initialize features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=rate),
        'filterbank': FilterbankProcessor(sample_rate=rate),
        'mfcc': MfccProcessor(sample_rate=rate),
        'plp': PlpProcessor(sample_rate=rate),
        'rastaplp': RastaPlpProcessor(sample_rate=rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti'),
    }

    # compute the features for all processors
    features = {}
    for name, processor in processors.items():
        features[name] = processor.process(audio)

    def add_label(axis, text):
        # write `text` in a translucent white box in the axis' top-left area
        axis.text(
            0.02, 0.8, text,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axis.transAxes)

    # one row for the audio signal, then one row per features type
    _, axes = plt.subplots(
        nrows=len(processors)+1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})

    signal_axis = axes[0]
    time = np.arange(0.0, audio.nsamples) / rate
    signal_axis.plot(time, audio.astype(np.float32).data)
    signal_axis.set_xlim(0.0, audio.duration)
    add_label(signal_axis, 'audio')

    for axis, (name, feats) in zip(axes[1:], features.items()):
        axis.imshow(feats.data.T, aspect='auto')
        add_label(axis, name)

    plt.show()
l = shortest_path_position[1][0] # divide the shortest distance by the length of the path average_distance = (distance_matrix[vector_1.shape[0]-1][vector_2.shape[0]-1]) \ / path_length return average_distance all_features = {} # get bottleneck features of all .wav files (stimuli) for root, dirs, files in os.walk(WAV_FOLDER): for wav_file in files: if wav_file.endswith(".wav"): audio = Audio.load(root + wav_file) processor = BottleneckProcessor(weights='BabelMulti') features = processor.process(audio) vectors = features.data utterance = wav_file.split('.')[0] all_features[utterance] = vectors for row in distance_list.itertuples(): row_index = getattr(row, 'Index') trip_id = getattr(row, 'tripletid') bottle_oth = all_features[trip_id + "_OTH"] bottle_tgt = all_features[trip_id + "_TGT"] bottle_x = all_features[trip_id + "_X"] eucl_oth_x = \ calculate_distances_dtw(bottle_oth,\ bottle_x)
def test_compare_original(audio_8k, bottleneck_original):
    """Features match the reference ones, in shape and within 2e-2."""
    # dither is disabled so the comparison does not depend on random noise
    processor = BottleneckProcessor(weights='BabelMulti', dither=0)
    features = processor.process(audio_8k)

    assert bottleneck_original.shape == features.shape
    assert bottleneck_original == pytest.approx(features.data, abs=2e-2)
def test_available_weights():
    """The three pretrained weights are listed and map to existing files."""
    available = BottleneckProcessor.available_weights()
    assert len(available) == 3

    for name in ('BabelMulti', 'FisherMono', 'FisherTri'):
        assert name in available
        assert os.path.isfile(available[name])
'/encoder_' + str(num_phones) + \ '_' + lang + '.pkl', 'wb')) pickle.dump(kmeans, open(str(num_phones) + '_' + lang + \ '/kmeans_' + str(num_phones) + \ '_' + lang + '.pkl', 'wb')) all_features = {} # get bottleneck features of all .wav files (stimuli) for root, dirs, files in os.walk(WAV_FOLDER): for wav_file in files: if wav_file.endswith(".wav"): audio = Audio.load(root + wav_file) all_features[wav_file] = audio processor = BottleneckProcessor(weights='BabelMulti') corpus_features = processor.process_all(all_features) open_feats = [] for key in corpus_features: # access every features object feats = corpus_features[key].data # put them all together open_feats.append(feats) unlisted_feats = np.asarray(open_feats) #flattened_feats = np.concatenate(all_corpus_features, axis=0) flattened_feats = np.concatenate(unlisted_feats) # idx = (3,6,9) # flattened_feats = np.insert(flattened_feats, idx, 0, axis=1) print flattened_feats.shape
import argparse
import glob
import os
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from shennong.audio import Audio
from shennong.features.processor.mfcc import MfccProcessor
from shennong.features.postprocessor.delta import DeltaPostProcessor
from shennong.features.processor.bottleneck import BottleneckProcessor

# Features processors, built once at module level: MFCCs at 8 kHz, a
# second-order delta post-processor and bottleneck features using the
# 'BabelMulti' weights.
mfcc_processor = MfccProcessor(sample_rate=8000)
delta_processor = DeltaPostProcessor(order=2)
bnf_processor = BottleneckProcessor(weights='BabelMulti')

# Command-line interface; defaults are displayed in the help message thanks
# to ArgumentDefaultsHelpFormatter.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description='example: python wav_to_shennong-feats.py mfcc wrm-pd',
)

# positional arguments, declared in the order they are expected
for _name, _help in (
        ('features', 'features to extract using the Shennong library (mfcc or bnf), use _all_ for both'),
        ('dataset', 'name of dataset, use _all_ to iterate over all'),
):
    parser.add_argument(_name, help=_help)

# optional directories, each with a default value
for _flag, _default, _help in (
        ('--feats_dir', 'data/interim/features', "directory for features"),
        ('--datasets_dir', 'data/raw/datasets', "directory for raw datasets and labels files"),
        ('--queries_dir', 'queries', "directory with .wav files for queries"),
        ('--references_dir', 'references', "directory with .wav files for references"),
):
    parser.add_argument(_flag, default=_default, help=_help)