def signal2features(signal, audio_parameters, features_param):
    if features_param['feature_type'] == 'mfcc':
        # When energy is disabled, compute one extra coefficient,
        # then drop the first (energy) column below.
        remove_energy = not features_param.get('energy', False)
        features = mfcc_spec(
            signal, audio_parameters['sample_rate'],
            (int(audio_parameters['window_t'] * audio_parameters['sample_rate']),
             int(audio_parameters['hop_t'] * audio_parameters['sample_rate'])),
            num_filt=features_param['n_filt'],
            fft_size=features_param['n_fft'],
            num_coeffs=features_param['n_coef'] + remove_energy)
        if remove_energy:
            features = features[:, 1:]
    elif features_param['feature_type'] == 'lmfe':
        # Log mel filterbank energies: index 2 of the return_parts tuple
        # (powers, filters, mels, mfccs) is the log mel spectrogram.
        features = mfcc_spec(
            signal, audio_parameters['sample_rate'],
            (int(audio_parameters['window_t'] * audio_parameters['sample_rate']),
             int(audio_parameters['hop_t'] * audio_parameters['sample_rate'])),
            num_filt=features_param['n_filt'],
            fft_size=features_param['n_fft'],
            num_coeffs=features_param['n_coef'],
            return_parts=True)[2][:, :features_param['n_coef']]
    # Insert new features here
    return features
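A hypothetical call sketch for signal2features; the dict contents below (16 kHz audio, 100 ms windows, 50 ms hops) are illustrative assumptions about the expected schema, not values taken from the original code.

import numpy as np

# Illustrative parameter dicts (assumed schema, not from the original source)
audio_parameters = {'sample_rate': 16000, 'window_t': 0.1, 'hop_t': 0.05}
features_param = {'feature_type': 'mfcc', 'n_filt': 20, 'n_fft': 512,
                  'n_coef': 13, 'energy': False}

signal = np.random.randn(16000).astype(np.float32)  # 1 s of dummy audio
features = signal2features(signal, audio_parameters, features_param)
# With energy=False, 14 coefficients are computed and the first (energy)
# column is dropped, so each of the (16000 - 1600) // 800 + 1 = 19 frames
# keeps n_coef = 13 features.
print(features.shape)  # (19, 13)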
def __get_data__(manifest, sr, max_frame_len, intencode, progress_bar=True):
    """Load each (audio, transcript) pair listed in the manifest.

    :param progress_bar: show a tqdm progress bar while loading
    :return: (inputs, targets) - padded MFCC arrays and integer-encoded labels
    """
    pg = tqdm if progress_bar else lambda x: x
    random.shuffle(manifest)
    inputs, targets = [], []
    for md in pg(manifest):
        audio_path = md[0]
        labels_path = md[1]
        _, audio = read(audio_path)
        # audio = load_audio(audio_path)
        labels = read_txt(labels_path)[0].replace(" ", "")
        mfccs = mfcc_spec(audio, sr, window_stride=(160, 80),
                          fft_size=512, num_filt=20, num_coeffs=13)
        mfccs = normalize(mfccs)
        diff = max_frame_len - mfccs.shape[0]
        if diff >= 0:
            mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")
        target = intencode.convert_to_ints(labels)
        if target is not None:
            inputs.append(mfccs)
            targets.append(target)
    return inputs, targets
def get_data(self, progress_bar=True):
    """Return the MFCC features of each audio clip and the corresponding labels."""
    pg = tqdm if progress_bar else lambda x: x
    inputs, targets, input_lengths = [], [], []
    meta_data = []
    for label in self.labels:
        path = os.listdir(os.path.join(self.data_path, label))
        for audio in path:
            audio_path = os.path.join(self.data_path, label, audio)
            meta_data.append((audio_path, label))
    random.shuffle(meta_data)  # shuffle the dataset
    for md in pg(meta_data):
        audio_path = md[0]
        label = md[1]
        _, audio = wavfile.read(audio_path)
        mfccs = mfcc_spec(
            audio, self.sr, window_stride=(160, 80),
            fft_size=512, num_filt=20, num_coeffs=13
        )
        mfccs = normalize(mfccs)
        diff = self.max_frame_len - mfccs.shape[0]
        input_lengths.append(mfccs.shape[0])
        mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")  # pad to max_frame_len
        inputs.append(mfccs)
        target = self.intencode.convert_to_ints(label)
        targets.append(target)
    return inputs, targets, input_lengths
def wav2feat(wavPath):
    """Read a 16 kHz WAV file and return its normalized, padded MFCC features."""
    _, audio = read(wavPath)
    mfccs = mfcc_spec(
        audio, 16000, window_stride=(160, 80),
        fft_size=512, num_filt=20, num_coeffs=13
    )
    mfccs = normalize(mfccs)
    diff = 225 - mfccs.shape[0]  # pad to a fixed length of 225 frames
    mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")
    sample = torch.Tensor(mfccs)
    sample = sample.transpose(0, 1)  # (frames, coeffs) -> (coeffs, frames)
    return sample
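A minimal shape check for wav2feat; the file name is hypothetical and the clip is assumed to be 16 kHz mono and no longer than 225 frames (the fixed pad length above).

sample = wav2feat('example.wav')  # hypothetical path
print(sample.shape)  # torch.Size([13, 225]): 13 coefficients x 225 padded frames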
def get_data(self, progress_bar=True):
    """Currently returns MFCCs and integer-encoded labels.

    Returns:
        (list, list): inputs shape (sample_size, frame_len, mfcc_features)
                      targets shape (sample_size, seq_len); seq_len is variable
    """
    pg = tqdm if progress_bar else lambda x: x
    inputs, targets = [], []
    meta_data = []
    for labels in self.labels:
        path = os.listdir(os.path.join(self.data_path, labels))
        for audio in path:
            audio_path = os.path.join(self.data_path, labels, audio)
            meta_data.append((audio_path, labels))
    random.shuffle(meta_data)
    errFileList = []
    for md in pg(meta_data):
        audio_path = md[0]
        labels = md[1]
        try:
            _, audio = read(audio_path)
        except ValueError:
            print(audio_path)
            errFileList.append(audio_path)
            continue
        mfccs = mfcc_spec(
            audio, self.sr, window_stride=(160, 80),
            fft_size=512, num_filt=20, num_coeffs=13
        )
        mfccs = normalize(mfccs)
        diff = self.max_frame_len - mfccs.shape[0]
        mfccs = np.pad(mfccs, ((0, diff), (0, 0)), "constant")
        inputs.append(mfccs)
        target = self.intencode.convert_to_ints(labels)
        targets.append(target)
    print(errFileList)
    return inputs, targets
def __init__(self, sample_rate, fft_size=400, window_stride=(400, 200),
             num_filt=40, num_coeffs=40):
    super(MFCC, self).__init__()
    self.sample_rate = sample_rate
    self.window_stride = window_stride
    self.fft_size = fft_size
    self.num_filt = num_filt
    self.num_coeffs = num_coeffs
    self.mfcc = lambda x: mfcc_spec(x, self.sample_rate, self.window_stride,
                                    self.fft_size, self.num_filt,
                                    self.num_coeffs)
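A usage sketch for this wrapper, assuming MFCC is the enclosing class name as the super() call suggests; the dummy audio is an assumption for illustration.

import numpy as np

mfcc_layer = MFCC(sample_rate=16000)
audio = np.random.randn(16000).astype(np.float32)  # 1 s of dummy audio
feats = mfcc_layer.mfcc(audio)
# window_stride=(400, 200) at 16 kHz = 25 ms windows with a 12.5 ms hop:
# (16000 - 400) // 200 + 1 = 79 frames of 40 coefficients each
print(feats.shape)  # (79, 40)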
def get_mfcc(self, path):
    audio, sr = librosa.load(path, sr=None)  # sr = native sampling rate of `audio`
    mfccs = mfcc_spec(audio, sr,
                      window_stride=self.windows_stride,
                      fft_size=self.fft_size,
                      num_filt=self.num_filt,
                      num_coeffs=self.num_coeffs)
    mfccs /= 16
    if self.positive_shift:
        mfccs += 1
        mfccs = np.where(mfccs < 0, 0, mfccs)
    if mfccs.shape[0] < self.max_len:
        pad = np.zeros((self.max_len - mfccs.shape[0], mfccs.shape[1]))
        mfccs = np.vstack((mfccs, pad))
    elif mfccs.shape[0] > self.max_len:
        mfccs = mfccs[:self.max_len, :]  # truncate along the time axis, matching the padding branch
    return torch.FloatTensor(np.expand_dims(mfccs, axis=0))
def sample_to_training_data(item):
    audio_fn = item["audio"]
    sr, y = wavfile.read(audio_fn)
    duration = y.shape[0] / sr
    mfcc = mfcc_spec(y, sr)
    # Binary voice-activity target: one value per MFCC frame
    activity = np.zeros((mfcc.shape[0]))
    N = mfcc.shape[0]
    for utt in item["json"]:
        if utt["word"] == "sil":
            continue
        start = utt["start"]
        end = utt["end"]
        # Map utterance times (seconds) to frame indices proportionally
        start_index = round(N * start / duration)
        end_index = round(N * end / duration)
        activity[start_index:end_index] = 1.
    return mfcc, activity
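A quick numeric check of the time-to-frame mapping above, with made-up numbers:

duration = 2.0          # seconds of audio (illustrative)
N = 400                 # MFCC frames computed for that audio (illustrative)
start, end = 0.5, 1.0   # a word spanning 0.5 s to 1.0 s
start_index = round(N * start / duration)  # 100
end_index = round(N * end / duration)      # 200
# activity[100:200] is set to 1.0, i.e. a quarter of the frames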
def extractSoundFeatures(sig):
    allFeature = []
    for numberData in range(len(sig)):
        feature = []
        for channel in range(sig[numberData].shape[0]):
            x = sig[numberData][channel, :]
            # 2048 is passed as the sample rate; num_coeffs=len(x) exceeds
            # num_filt, so all filterbank coefficients are kept.
            powers, filters, mels, mfccs = mfcc_spec(x, 2048,
                                                     return_parts=True,
                                                     num_coeffs=len(x))
            feature.append(np.std(mfccs, axis=0))
            feature.append(np.sum(mfccs, axis=0))
            feature.extend(mfccs)
            # feature.append(np.max(powers, axis=0))
            # feature.append(np.min(powers, axis=0))
        allFeature.append(np.asarray(feature))
    return np.asarray(allFeature)
def _process(self):
    self._processing = True
    features = mfcc_spec(
        self._buffer,
        sample_rate=self.mfccParams.sample_rate,
        window_stride=(self.mfccParams.window_l, self.mfccParams.stride_l),
        num_coeffs=self.mfccParams.n_coef + (not self.mfccParams.energy),
        num_filt=self.mfccParams.n_filt,
        fft_size=self.mfccParams.n_fft)
    if not self.mfccParams.energy:
        # Drop the first (energy) column; one extra coefficient was
        # requested above to compensate.
        features = features[:, 1:]
    # Keep only the unconsumed tail of the buffer
    self._buffer = self._buffer[len(features) * self.mfccParams.stride_l:]
    if self._consumer is not None:
        self._consumer.input(features)
    self._processing = False
    with self._condition:
        self._condition.notify()
def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Turns audio into feature vectors, without clipping for length"""
    if len(audio) == 0:
        raise InvalidAudio('Cannot vectorize empty audio!')
    sample_rate = 16000
    window_t = 0.1
    window_samples = int(sample_rate * window_t + 0.5)
    hop_t = 0.05
    hop_samples = int(sample_rate * hop_t + 0.5)
    n_filt = 20
    n_fft = 512
    n_mfcc = 13
    return mfcc_spec(audio, sample_rate, (window_samples, hop_samples),
                     num_filt=n_filt, fft_size=n_fft, num_coeffs=n_mfcc)
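An illustrative shape check for vectorize_raw; the dummy audio is an assumption.

import numpy as np

audio = np.random.randn(16000).astype(np.float32)  # 1 s at 16 kHz
feats = vectorize_raw(audio)
# window = 1600 samples (100 ms), hop = 800 samples (50 ms):
# (16000 - 1600) // 800 + 1 = 19 frames
print(feats.shape)  # (19, 13)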
def mfcc_from_file(filename):
    sr, y = wavfile.read(filename)
    mfcc = mfcc_spec(y, sr)
    duration = y.shape[0] / sr
    return mfcc, duration
import numpy as np
from sonopy import power_spec, mel_spec, mfcc_spec, filterbanks
from scipy.io import wavfile

sr, audio = wavfile.read('test.wav')
# powers = power_spec(audio, window_stride=(100, 50), fft_size=512)
# mels = mel_spec(audio, sr, window_stride=(1600, 800), fft_size=1024, num_filt=30)
mfccs = mfcc_spec(audio, sr, window_stride=(160, 80), fft_size=512,
                  num_filt=20, num_coeffs=13)
print(mfccs)
# filters = filterbanks(16000, 20, 257)  # Probably not ever useful
# powers, filters, mels, mfccs = mfcc_spec(audio, sr, return_parts=True)
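window_stride is given in samples, so the frame count of the result follows directly. Continuing the example above, and assuming test.wav is mono 16 kHz (the file itself is not specified in the original):

# 160-sample windows advanced 80 samples at a time; at 16 kHz that is a
# 10 ms window with a 5 ms hop.
n_frames = (len(audio) - 160) // 80 + 1
assert mfccs.shape == (n_frames, 13)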
def weighted_log_loss(yt, yp):
    """Binary crossentropy with a bias towards false negatives."""
    # The original snippet began mid-function; the pos_loss line is
    # reconstructed by symmetry with neg_loss.
    pos_loss = -yt * K.log(yp + K.epsilon())
    neg_loss = -(1 - yt) * K.log(1 - yp + K.epsilon())
    return LOSS_BIAS * K.mean(neg_loss) + (1. - LOSS_BIAS) * K.mean(pos_loss)


qqq = keras.models.load_model(
    "qqq.net", custom_objects={'weighted_log_loss': weighted_log_loss})

samples, sample_rate = librosa.load("testing/negative-00.wav",
                                    duration=2.0, sr=16000)
window_samples = int(sample_rate * 0.1 + 0.5)
hop_samples = int(sample_rate * 0.05 + 0.5)
mfccs = mfcc_spec(samples, sample_rate, (window_samples, hop_samples),
                  num_filt=20, fft_size=512, num_coeffs=13)


def my_mfcc_spec(audio, sample_rate, window_stride=(160, 80),
                 fft_size=512, num_filt=20, num_coeffs=13,
                 return_parts=False):
    """Calculates mel frequency cepstrum coefficient spectrogram"""
    powers = power_spec(audio, window_stride, fft_size)
    if powers.size == 0:
        return np.empty((0, min(num_filt, num_coeffs)))
    # The original snippet was cut off here; the remaining steps follow
    # sonopy's own mfcc_spec (safe_log from sonopy, dct from scipy.fftpack).
    filters = filterbanks(sample_rate, num_filt, powers.shape[1])
    mels = safe_log(np.dot(powers, filters.T))       # log mel energies
    mfccs = dct(mels, norm='ortho')[:, :num_coeffs]  # cepstral coefficients
    mfccs[:, 0] = safe_log(np.sum(powers, 1))        # first band = log frame energy
    if return_parts:
        return powers, filters, mels, mfccs
    return mfccs
from sonopy import mfcc_spec, mel_spec

inhibit_t = 0.4
inhibit_dist_t = 1.0
inhibit_hop_t = 0.1

# Each vectorizer maps raw audio to features using the global params object `pr`
vectorizers = {
    Vectorizer.mels: lambda x: mel_spec(
        x, pr.sample_rate, (pr.window_samples, pr.hop_samples),
        num_filt=pr.n_filt, fft_size=pr.n_fft),
    Vectorizer.mfccs: lambda x: mfcc_spec(
        x, pr.sample_rate, (pr.window_samples, pr.hop_samples),
        num_filt=pr.n_filt, fft_size=pr.n_fft, num_coeffs=pr.n_mfcc),
    Vectorizer.speechpy_mfccs: lambda x: __import__('speechpy').feature.mfcc(
        x, pr.sample_rate, pr.window_t, pr.hop_t, pr.n_mfcc, pr.n_filt, pr.n_fft)
}


def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Turns audio into feature vectors, without clipping for length"""
    if len(audio) == 0:
        raise InvalidAudio('Cannot vectorize empty audio!')
    return vectorizers[pr.vectorizer](audio)