def lr_preprocess(self, x):
    """Build standardized mel mean/std feature rows for the LR model.

    On the first call, caches the raw clip length and rescales the
    module-level hop/sample-rate settings relative to a reference
    length of 84000 samples.

    Args:
        x: list of 1-D audio sample arrays.

    Returns:
        2-D numpy array, one standardized feature row per clip.
    """
    global LR_HOP_DURATION
    global HOP_DURATION
    global AUDIO_SAMPLE_RATE
    if self.raw_max_length is None:
        self.raw_max_length = get_max_length(x)
        # NOTE(review): nesting reconstructed from a collapsed source line;
        # the settings are assumed to be scaled once, on first call — confirm.
        scale = self.raw_max_length / 84000
        LR_HOP_DURATION = max(BASE_LR_HOP_DURATION, BASE_LR_HOP_DURATION * scale)
        HOP_DURATION = max(BASE_HOP_DURATION, BASE_HOP_DURATION * scale / 2)
        AUDIO_SAMPLE_RATE = max(BASE_AUDIO_SAMPLE_RATE,
                                int(BASE_AUDIO_SAMPLE_RATE * scale / 3))
        print('LR_HOP_DURATION---%s' % LR_HOP_DURATION)
        print('HOP_DURATION---%s' % HOP_DURATION)
        print('AUDIO_SAMPLE_RATE---%s' % AUDIO_SAMPLE_RATE)
    # Truncate every clip to the configured maximum duration.
    x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
    x_mel = extract_melspectrogram_parallel(
        x, n_mels=30, use_power_db=True, lr=True)
    # Per clip: mean and std over time of each mel band, concatenated.
    x_feas = np.asarray([
        np.concatenate([np.mean(mel, axis=0).reshape(-1),
                        np.std(mel, axis=0).reshape(-1)], axis=-1)
        for mel in x_mel
    ])
    scaler = StandardScaler()
    return scaler.fit_transform(x_feas)
def nn_preprocess(self, x, n_mfcc=96, max_duration=5, is_mfcc=True):
    """Prepare padded MFCC or mel-spectrogram sequences for the NN models.

    Caches the raw clip length and the feature pad length on first use,
    and sets the `need_30s` / `crnn_first` model-selection flags.

    Args:
        x: list of 1-D audio sample arrays.
        n_mfcc: number of MFCC coefficients when `is_mfcc` is True.
        max_duration: cap (seconds) applied to the cached raw length.
        is_mfcc: extract MFCCs if True, otherwise mel-spectrograms.

    Returns:
        Feature sequences padded to a common frame length.
    """
    # NOTE(review): indentation reconstructed from a collapsed source line;
    # the flag nesting below is a best reading — confirm against the original.
    if self.raw_max_length is None:
        self.raw_max_length = get_max_length(x)
    if self.raw_max_length > (MIDDLE_DURATION * AUDIO_SAMPLE_RATE):
        self.need_30s = True
        # presumably: small datasets with long audio favor the CRNN — verify
        if len(self._train_y) < 1000 and self._num_classes < 30:
            self.crnn_first = True
    # Clamp the cached raw length into [MAX_AUDIO_DURATION*SR, max_duration*SR];
    # both operations are idempotent across calls.
    self.raw_max_length = min(max_duration * AUDIO_SAMPLE_RATE, self.raw_max_length)
    self.raw_max_length = max(MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE, self.raw_max_length)
    x = [sample[0:self.raw_max_length] for sample in x]
    if is_mfcc:
        # extract mfcc
        x = extract_mfcc_parallel(x, n_mfcc=n_mfcc)
    else:
        x = extract_melspectrogram_parallel(x, n_mels=128, use_power_db=True)
    if self.fea_max_length is None:
        self.fea_max_length = get_max_length(x)
        self.fea_max_length = min(MAX_FRAME_NUM, self.fea_max_length)
    x = pad_seq(x, pad_len=self.fea_max_length)
    return x
def preprocess_data(self, x):
    """Convert raw waveforms into padded mel-spectrogram tensors.

    Adds a trailing channel axis for 2-D CNN input and caches the pad
    length (capped at MAX_FRAME_NUM) on first use.
    """
    if IS_CUT_AUDIO:
        limit = MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE
        x = [clip[0:limit] for clip in x]
    features = extract_melspectrogram_parallel(
        x, n_mels=128, use_power_db=True)
    if self.max_length is None:
        self.max_length = min(MAX_FRAME_NUM, get_max_length(features))
    features = pad_seq(features, pad_len=self.max_length)
    return features[:, :, :, np.newaxis]
def preprocess_data(self, x):
    """Build frame-wise MFCC + mel-spectrogram tensors for CNN input.

    The two representations are concatenated along the feature axis and
    a trailing channel axis is added. The pad length is cached on first
    use, capped at MAX_FRAME_NUM.

    Args:
        x: list of 1-D audio sample arrays.

    Returns:
        4-D numpy array (clips, frames, features, 1).
    """
    if IS_CUT_AUDIO:
        x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
    # Two complementary time-frequency representations per clip.
    x_mfcc = extract_mfcc_parallel(x, n_mfcc=64)
    x_mel = extract_melspectrogram_parallel(x, n_mels=64, use_power_db=True)
    if self.max_length is None:
        self.max_length = min(MAX_FRAME_NUM, get_max_length(x_mfcc))
    x_mfcc = pad_seq(x_mfcc, self.max_length)
    x_mel = pad_seq(x_mel, self.max_length)
    x_feas = np.concatenate([x_mfcc, x_mel], axis=-1)
    return x_feas[:, :, :, np.newaxis]
def preprocess_data(self, x):
    """Build standardized mel mean/std feature rows for a linear model.

    Args:
        x: list of 1-D audio sample arrays.

    Returns:
        2-D numpy array, one standardized feature row per clip.
    """
    # Truncate every clip to the configured maximum duration.
    x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
    x_mel = extract_melspectrogram_parallel(x, n_mels=40, use_power_db=True)
    # Per clip: mean and std over time of each mel band, concatenated.
    x_feas = np.asarray([
        np.concatenate([np.mean(mel, axis=0).reshape(-1),
                        np.std(mel, axis=0).reshape(-1)], axis=-1)
        for mel in x_mel
    ])
    scaler = StandardScaler()
    return scaler.fit_transform(x_feas)
def preprocess_data(self, x):
    """Build a standardized hand-crafted feature row per clip.

    Nine spectral descriptors are extracted, averaged over time, and
    concatenated into one vector per clip before standard scaling.
    """
    if IS_CUT_AUDIO:
        cutoff = MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE
        x = [clip[0:cutoff] for clip in x]
    # One entry per descriptor; each holds per-clip frame sequences.
    descriptor_sets = [
        extract_mfcc_parallel(x, n_mfcc=20),
        extract_melspectrogram_parallel(x, n_mels=20, use_power_db=True),
        extract_chroma_stft_parallel(x, n_chroma=12),
        extract_spectral_contrast_parallel(x, n_bands=6),
        extract_spectral_flatness_parallel(x),
        extract_spectral_centroid_parallel(x),
        extract_bandwidth_parallel(x),
        extract_spectral_rolloff_parallel(x),
        extract_zero_crossing_rate_parallel(x),
    ]
    rows = []
    for per_clip in zip(*descriptor_sets):
        # Time-average each descriptor, then join into a single row.
        rows.append(np.concatenate(
            [np.mean(fea, axis=0).reshape(-1) for fea in per_clip],
            axis=-1))
    feature_matrix = np.asarray(rows)
    return StandardScaler().fit_transform(feature_matrix)
def lr_preprocess(self, x):
    """Build standardized mel mean/std feature rows for the LR model.

    Args:
        x: list of 1-D audio sample arrays.

    Returns:
        2-D numpy array, one standardized feature row per clip.
    """
    # Truncate every clip to the configured maximum duration.
    x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
    x_mel = extract_melspectrogram_parallel(x, n_mels=30, use_power_db=True)
    # Per clip: mean and std over time of each mel band, concatenated.
    x_feas = np.asarray([
        np.concatenate([np.mean(mel, axis=0).reshape(-1),
                        np.std(mel, axis=0).reshape(-1)], axis=-1)
        for mel in x_mel
    ])
    scaler = StandardScaler()
    return scaler.fit_transform(x_feas)
def nn_preprocess(self, x, n_mfcc=96, max_duration=5, is_mfcc=True):
    """Prepare padded MFCC or mel-spectrogram sequences for the NN models.

    On first call, caches the raw clip length, rescales the module-level
    hop/sample-rate settings relative to a reference length of 84000
    samples, and sets the `need_30s` / `crnn_first` selection flags.

    Args:
        x: list of 1-D audio sample arrays.
        n_mfcc: number of MFCC coefficients when `is_mfcc` is True.
        max_duration: cap (seconds) applied to the cached raw length.
        is_mfcc: extract MFCCs if True, otherwise mel-spectrograms.

    Returns:
        Feature sequences padded to a common frame length.
    """
    # NOTE(review): indentation reconstructed from a collapsed source line;
    # the nesting below follows the apparent one-time-initialization intent —
    # confirm against the original file.
    global LR_HOP_DURATION
    global HOP_DURATION
    global AUDIO_SAMPLE_RATE
    if self.raw_max_length is None:
        self.raw_max_length = get_max_length(x)
        # 84000 samples is the reference length the BASE_* settings assume.
        scale = self.raw_max_length/84000
        LR_HOP_DURATION = max(BASE_LR_HOP_DURATION,BASE_LR_HOP_DURATION*scale)
        HOP_DURATION = max(BASE_HOP_DURATION,BASE_HOP_DURATION*scale/2)
        AUDIO_SAMPLE_RATE = max(BASE_AUDIO_SAMPLE_RATE,int(BASE_AUDIO_SAMPLE_RATE*scale/3))
        print('LR_HOP_DURATION---%s'%LR_HOP_DURATION)
        print('HOP_DURATION---%s'%HOP_DURATION)
        print('AUDIO_SAMPLE_RATE---%s'%AUDIO_SAMPLE_RATE)
    if self.raw_max_length > (MIDDLE_DURATION * AUDIO_SAMPLE_RATE):
        self.need_30s = True
        # presumably: small datasets with long audio favor the CRNN — verify
        if len(self._train_y) < 1000 and self._num_classes < 30:
            self.crnn_first = True
    # Clamp the cached raw length; both operations are idempotent across calls.
    self.raw_max_length = min(
        max_duration * AUDIO_SAMPLE_RATE, self.raw_max_length)
    self.raw_max_length = max(
        MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE, self.raw_max_length)
    x = [sample[0:self.raw_max_length] for sample in x]
    if is_mfcc:
        # extract mfcc
        x = extract_mfcc_parallel(x, n_mfcc=n_mfcc)
    else:
        x = extract_melspectrogram_parallel(
            x, n_mels=128, use_power_db=True)
    if self.fea_max_length is None:
        self.fea_max_length = get_max_length(x)
        self.fea_max_length = min(MAX_FRAME_NUM, self.fea_max_length)
    x = pad_seq(x, pad_len=self.fea_max_length)
    return x