def get_audio_mfcc_features(txt_files, wav_files, n_input, n_context,
                            word_num_map, txt_labels=None,
                            specgram_type='mfcc',
                            mean_std_filepath='data/aishell/mean_std.npz'):
    """Extract audio features and text label vectors for a batch of files.

    MFCC features have dim 39 (13 mfcc + 13 delta1 + 13 delta2); linear
    specgram features have 161 bins over different frequency sections.

    :param txt_files: list of transcript file paths, or None when
        txt_labels carries the label strings directly
    :param wav_files: list of wav file paths, zipped with the transcripts
    :param n_input: feature dim per frame (39 for mfcc)
    :param n_context: number of past/future context frames (mfcc only)
    :param word_num_map: mapping from character to integer id
    :param txt_labels: transcript label strings (used when txt_files is None)
    :param specgram_type: 'mfcc' or 'linear'
    :param mean_std_filepath: path of the mean/std npz file used to
        normalize linear specgram features
    :return: tuple of numpy arrays (audio_features, audio_features_len,
        text_vector, text_vector_len)
    :raises ValueError: if specgram_type is not 'mfcc' or 'linear'
    """
    audio_features = []
    audio_features_len = []
    text_vector = []
    text_vector_len = []
    # When transcript files are given, iterate over the file paths;
    # otherwise iterate over the label strings passed in txt_labels.
    if txt_files is not None:
        txt_labels = txt_files
    get_feature = AudioFeaturizer(specgram_type)
    normalizer = FeatureNormalizer(mean_std_filepath)
    for txt_obj, wav_file in zip(txt_labels, wav_files):
        # Turn the wav input into a feature matrix.
        if specgram_type == 'mfcc':
            # mfcc feature with context window, shape (?, 741) for the
            # default n_input=39, n_context=9
            audio_data = audiofile_to_input_vector(
                wav_file, n_input, n_context)
        elif specgram_type == 'linear':
            speech_segment = SpeechSegment.from_file(wav_file, "")
            specgram = get_feature.featurize(speech_segment)
            audio_data = normalizer.apply(specgram)
            # transpose to time-major: (?, 161)
            audio_data = np.transpose(audio_data)
        else:
            # Fail fast: the original code would hit a NameError on an
            # undefined audio_data for any other specgram_type.
            raise ValueError(
                "Unsupported specgram_type: %s" % specgram_type)
        audio_data = audio_data.astype('float32')
        audio_features.append(audio_data)
        audio_features_len.append(np.int32(len(audio_data)))
        if txt_files is not None:
            # txt_obj is a transcript file path
            target = trans_text_ch_to_vector(txt_obj, word_num_map)
        else:
            # txt_obj is a label string
            target = trans_text_ch_to_vector(None, word_num_map, txt_obj)
        text_vector.append(target)
        text_vector_len.append(len(target))
    audio_features = np.asarray(audio_features)
    audio_features_len = np.asarray(audio_features_len)
    text_vector = np.asarray(text_vector)
    text_vector_len = np.asarray(text_vector_len)
    return audio_features, audio_features_len, text_vector, text_vector_len
def audiofile_to_input_vector(audio_filename, n_input, n_context):
    """Compute MFCC features with symmetric context windows.

    Each output frame concatenates n_context past frames, the current
    frame, and n_context future frames; missing frames at the sequence
    boundaries are zero-padded. The result is normalized to zero mean
    and unit std over the whole utterance.

    :param audio_filename: path to the wav file
    :param n_input: feature dim per frame (39 for mfcc)
    :param n_context: number of context frames on each side
    :return: ndarray of shape (time, n_input * (2 * n_context + 1))
    """
    # Extract mfcc features (dim n_input) and transpose to time-major.
    # NOTE: the original also called wav.read() here but never used the
    # result — dead code removed.
    get_feature = AudioFeaturizer("mfcc")
    speech_segment = SpeechSegment.from_file(audio_filename, "")
    orig_inputs = get_feature.featurize(speech_segment)  # (n_input, ?)
    orig_inputs = np.transpose(orig_inputs)  # time-major (?, n_input)

    train_inputs = np.zeros(
        (orig_inputs.shape[0], n_input + 2 * n_input * n_context))
    empty_mfcc = np.zeros(n_input)

    # Output row layout: past n_context frames + current frame + future
    # n_context frames.
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context
    for time_slice in time_slices:
        # Zero-pad missing past frames near the start of the sequence.
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(
            empty_mfcc for _ in range(need_empty_past))
        data_source_past = orig_inputs[
            max(0, time_slice - n_context):time_slice]

        # Zero-pad missing future frames near the end of the sequence.
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(
            empty_mfcc for _ in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:
                                         time_slice + n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past
        if need_empty_future:
            future = np.concatenate(
                (data_source_future, empty_source_future))
        else:
            future = data_source_future

        # Use n_input consistently here: the original hard-coded 39 for
        # the past part while using n_input for the future part, which
        # broke any n_input != 39.
        past = np.reshape(past, n_context * n_input)
        now = orig_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)
        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Normalize to zero mean and unit variance over the utterance.
    train_inputs = (
        train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    # shape: (time, n_context * 2 * n_input + n_input)
    return train_inputs
class FeatureNormalizer(object):
    """Audio feature normalizer.

    If mean_std_filepath is not None, the normalizer is initialized
    directly from that file. Otherwise manifest_path is used to compute
    the feature mean and stddev from sampled audio.

    :param mean_std_filepath: path of the file holding the mean and std
    :type mean_std_filepath: None|str
    :param manifest_path: manifest of data used to compute mean and std,
        usually the training manifest
    :type manifest_path: None|str
    :param num_samples: number of audio samples used to compute mean/std
    :type num_samples: int
    :param random_seed: random seed for sampling the manifest
    :type random_seed: int
    :raises ValueError: if both mean_std_filepath and manifest_path
        (or mean_std_filepath and featurize_func) are None
    """

    def __init__(self, mean_std_filepath, manifest_path=None, num_samples=500, random_seed=0):
        if not mean_std_filepath:
            if not manifest_path:
                raise ValueError(
                    "如果mean_std_filepath是None,那么meanifest_path和featurize_func不应该是None"
                )
            self._rng = random.Random(random_seed)
            self.audio_featurizer = AudioFeaturizer()
            self._compute_mean_std(manifest_path, num_samples)
        else:
            self._read_mean_std_from_file(mean_std_filepath)

    def apply(self, features, eps=1e-14):
        """Normalize audio features with the stored mean and std.

        :param features: audio features to normalize
        :type features: ndarray
        :param eps: added to std for numerical stability
        :type eps: float
        :return: normalized features
        :rtype: ndarray
        """
        return (features - self._mean) / (self._std + eps)

    def write_to_file(self, filepath):
        """Write the computed mean and std to a file.

        :param filepath: path of the file to write mean and std to
        :type filepath: str
        """
        np.savez(filepath, mean=self._mean, std=self._std)

    def _read_mean_std_from_file(self, filepath):
        """Load the mean and std values from a file."""
        npzfile = np.load(filepath)
        self._mean = npzfile["mean"]
        self._std = npzfile["std"]

    def _compute_mean_std(self, manifest_path, num_samples):
        """Compute the mean and std from randomly sampled instances."""
        manifest = read_manifest(manifest_path)
        sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in tqdm(sampled_manifest):
            audio = self.audio_featurizer.load_audio_file(
                instance["audio_path"])
            feature = self.audio_featurizer.featurize(audio)
            features.append(feature)
        # Stack along the time axis, then reduce per feature row.
        features = np.hstack(features)
        self._mean = np.mean(features, axis=1).reshape([-1, 1])
        self._std = np.std(features, axis=1).reshape([-1, 1])