Example #1
def get_audio_mfcc_features(txt_files,
                            wav_files,
                            n_input,
                            n_context,
                            word_num_map,
                            txt_labels=None,
                            specgram_type='mfcc',
                            mean_std_filepath='data/aishell/mean_std.npz'):
    """ Get MFCC/linear specgram  features. The dim of MFCC is 39, contains 13 mfcc + 13 delta1 + 13 delta2.
        Linear specgram contains 161 features in different frequency section.
    
    :param txt_files:
    :param wav_files:
    :param n_input:
    :param n_context:
    :param word_num_map:
    :param txt_labels:
    :return:
    """
    audio_features = []
    audio_features_len = []
    text_vector = []
    text_vector_len = []
    if txt_files is not None:
        txt_labels = txt_files
    get_feature = AudioFeaturizer(specgram_type)
    normalizer = FeatureNormalizer(mean_std_filepath)
    for txt_obj, wav_file in zip(txt_labels, wav_files):
        # Turn inputs into features
        if specgram_type == 'mfcc':
            audio_data = audiofile_to_input_vector(
                wav_file, n_input, n_context)  # MFCC features, shape (time_steps, n_input * (2 * n_context + 1))
        elif specgram_type == 'linear':
            speech_segment = SpeechSegment.from_file(wav_file, "")
            specgram = get_feature.featurize(speech_segment)
            audio_data = normalizer.apply(specgram)
            audio_data = np.transpose(
                audio_data)  # linear spectrogram features, shape (time_steps, 161)
        audio_data = audio_data.astype('float32')

        audio_features.append(audio_data)
        audio_features_len.append(np.int32(len(audio_data)))

        target = []
        if txt_files is not None:  # txt_obj is a transcript file path
            target = trans_text_ch_to_vector(txt_obj, word_num_map)
        else:
            target = trans_text_ch_to_vector(None, word_num_map,
                                             txt_obj)  # txt_obj is a label string
        text_vector.append(target)
        text_vector_len.append(len(target))

    audio_features = np.asarray(audio_features)
    audio_features_len = np.asarray(audio_features_len)
    text_vector = np.asarray(text_vector)
    text_vector_len = np.asarray(text_vector_len)
    return audio_features, audio_features_len, text_vector, text_vector_len
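
A minimal usage sketch of the function above. The file lists and the character-to-id map are hypothetical; with n_input=39 and n_context=9, each MFCC row has 39 * (2 * 9 + 1) = 741 dimensions:

wav_files = ['dataset/train/S0002W0122.wav']  # hypothetical path
txt_files = ['dataset/train/S0002W0122.txt']  # hypothetical path
word_num_map = {'你': 0, '好': 1}  # hypothetical character-to-id map

features, features_len, texts, texts_len = get_audio_mfcc_features(
    txt_files, wav_files, n_input=39, n_context=9,
    word_num_map=word_num_map, specgram_type='mfcc')
# features[i]: (time_steps_i, 741); texts[i]: vector of integer label ids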
Example #2
def audiofile_to_input_vector(audio_filename, n_input, n_context):
    """ Compute MFCC features with n_context

    :param audio_filename:
    :param n_input:
    :param n_context:
    :return:
    """

    # get mfcc features with dim 39
    get_feature = AudioFeaturizer("mfcc")
    speech_segment = SpeechSegment.from_file(audio_filename, "")
    orig_inputs = get_feature.featurize(speech_segment)  # shape (39, time_steps)
    orig_inputs = np.transpose(orig_inputs)  # to time-major: (time_steps, 39)


    train_inputs = np.zeros((orig_inputs.shape[0], n_input + 2 * n_input * n_context))  # (time_steps, n_input * (2 * n_context + 1))
    empty_mfcc = np.zeros(n_input)

    # Each output row stacks three parts:
    # past n_context frames + the current frame + future n_context frames.
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context
    for time_slice in time_slices:
        # Zero-pad the past context for the first n_context frames
        need_empty_past = max(0, context_past_min - time_slice)
        empty_source_past = [empty_mfcc] * need_empty_past
        data_source_past = orig_inputs[max(0, time_slice - n_context):time_slice]

        # Zero-pad the future context for the last n_context frames
        need_empty_future = max(0, time_slice - context_future_max)
        empty_source_future = [empty_mfcc] * need_empty_future
        data_source_future = orig_inputs[time_slice + 1:time_slice + n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = orig_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)
        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Normalize: subtract the global mean and divide by the standard deviation
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

    # shape of train_inputs: (time_steps, n_input * (2 * n_context + 1))
    return train_inputs
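
For intuition, here is a self-contained NumPy sketch (dummy data, not the featurizer above) that reproduces the same past/current/future stacking and checks the resulting row width:

import numpy as np

n_input, n_context = 39, 9
orig = np.random.rand(20, n_input)  # stand-in for the time-major (time_steps, 39) MFCC matrix
rows = []
for t in range(orig.shape[0]):
    past = orig[max(0, t - n_context):t]
    future = orig[t + 1:t + 1 + n_context]
    # zero-pad at the sequence boundaries, as in the loop above
    past = np.vstack([np.zeros((n_context - len(past), n_input)), past])
    future = np.vstack([future, np.zeros((n_context - len(future), n_input))])
    rows.append(np.concatenate([past.ravel(), orig[t], future.ravel()]))
train = np.asarray(rows)
assert train.shape == (20, n_input * (2 * n_context + 1))  # (20, 741)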
Example #3
def __init__(self,
             mean_std_filepath,
             manifest_path=None,
             num_samples=500,
             random_seed=0):
    if not mean_std_filepath:
        if not manifest_path:
            raise ValueError(
                "If mean_std_filepath is None, then manifest_path and "
                "featurize_func must not be None")
        self._rng = random.Random(random_seed)
        self.audio_featurizer = AudioFeaturizer()
        self._compute_mean_std(manifest_path, num_samples)
    else:
        self._read_mean_std_from_file(mean_std_filepath)
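
The two initialization paths in a short sketch (the .npz path is the default from Example #1; the manifest path is hypothetical):

# Path 1: load precomputed statistics from a file
normalizer = FeatureNormalizer('data/aishell/mean_std.npz')

# Path 2: compute statistics from a manifest of training samples (hypothetical path)
normalizer = FeatureNormalizer(None, manifest_path='dataset/manifest.train',
                               num_samples=500)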
Example #4
def main():
    print_arguments(args)

    audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)

    def augment_and_featurize(audio_segment):
        return audio_featurizer.featurize(audio_segment)

    normalizer = FeatureNormalizer(
        mean_std_filepath=None,
        manifest_path=args.manifest_path,
        featurize_func=augment_and_featurize,
        num_samples=args.num_samples)
    normalizer.write_to_file(args.output_path)
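
The file written by write_to_file is a plain .npz archive, so the computed statistics can be inspected directly (the path below is the default from Example #1):

import numpy as np

stats = np.load('data/aishell/mean_std.npz')
print(stats['mean'].shape, stats['std'].shape)  # per-feature column vectors, e.g. (161, 1) for linear spectrograms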
Example #5
        str,
        'ctc_beam_search',
        'Decoding method for the results',
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('lang_model_path', str, 'lm/zh_giga.no_cna_cmn.prune01244.klm',
        "语言模型文件路径")
args = parser.parse_args()

print_arguments(args)
# Load the vocabulary
with open(args.dataset_vocab, 'r', encoding='utf-8') as f:
    labels = eval(f.read())  # note: ast.literal_eval would be a safer choice here
vocabulary = [labels[i] for i in range(len(labels))]

# Set up the audio featurizer and feature normalizer
audio_featurizer = AudioFeaturizer()
normalizer = FeatureNormalizer(mean_std_filepath=args.mean_std_path)

# Create the model
model = DeepSpeech2Model(feat_size=audio_featurizer.feature_dim(),
                         dict_size=len(vocabulary),
                         num_conv_layers=args.num_conv_layers,
                         num_rnn_layers=args.num_rnn_layers,
                         rnn_size=args.rnn_layer_size)
model.set_state_dict(
    paddle.load(os.path.join(args.model_path, 'model.pdparams')))
model.eval()

# Handling for the beam search decoder
if args.decoder == "ctc_beam_search":
    try:
Example #6
class FeatureNormalizer(object):
    """Audio feature normalizer.

    If mean_std_filepath is not None, the normalizer is initialized directly
    from that file. Otherwise, manifest_path is used to compute the feature
    mean and stddev.

    :param mean_std_filepath: file path of the mean and stddev values
    :type mean_std_filepath: None|str
    :param manifest_path: manifest used to compute the mean and stddev,
                          usually the training manifest
    :type manifest_path: None|str
    :param num_samples: number of audio samples used to compute the mean and stddev
    :type num_samples: int
    :param random_seed: random seed
    :type random_seed: int
    :raises ValueError: if both mean_std_filepath and manifest_path
                        (or mean_std_filepath and featurize_func) are None
    """
    def __init__(self,
                 mean_std_filepath,
                 manifest_path=None,
                 num_samples=500,
                 random_seed=0):
        if not mean_std_filepath:
            if not manifest_path:
                raise ValueError(
                    "If mean_std_filepath is None, then manifest_path and "
                    "featurize_func must not be None")
            self._rng = random.Random(random_seed)
            self.audio_featurizer = AudioFeaturizer()
            self._compute_mean_std(manifest_path, num_samples)
        else:
            self._read_mean_std_from_file(mean_std_filepath)

    def apply(self, features, eps=1e-14):
        """Normalize audio features using the mean and stddev.

        :param features: audio features to normalize
        :type features: ndarray
        :param eps: small value added to the stddev for numerical stability
        :type eps: float
        :return: normalized features
        :rtype: ndarray
        """
        return (features - self._mean) / (self._std + eps)

    def write_to_file(self, filepath):
        """Write the computed mean and stddev to a file.

        :param filepath: file path to write the mean and stddev to
        :type filepath: str
        """
        np.savez(filepath, mean=self._mean, std=self._std)

    def _read_mean_std_from_file(self, filepath):
        """Load the mean and stddev from a file."""
        npzfile = np.load(filepath)
        self._mean = npzfile["mean"]
        self._std = npzfile["std"]

    def _compute_mean_std(self, manifest_path, num_samples):
        """Compute the mean and stddev from randomly sampled instances."""
        manifest = read_manifest(manifest_path)
        sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in tqdm(sampled_manifest):
            audio = self.audio_featurizer.load_audio_file(
                instance["audio_path"])
            feature = self.audio_featurizer.featurize(audio)
            features.append(feature)
        features = np.hstack(features)
        self._mean = np.mean(features, axis=1).reshape([-1, 1])
        self._std = np.std(features, axis=1).reshape([-1, 1])
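
To make the axis handling in _compute_mean_std concrete, here is a small sketch with dummy (dim, time) feature matrices: np.hstack joins all frames along the time axis, and the statistics are taken per feature dimension:

import numpy as np

feats = [np.random.rand(161, t) for t in (50, 80)]  # two dummy (dim, time) spectrograms
stacked = np.hstack(feats)  # (161, 130): all frames side by side
mean = np.mean(stacked, axis=1).reshape([-1, 1])  # (161, 1) mean per feature dimension
std = np.std(stacked, axis=1).reshape([-1, 1])  # (161, 1) stddev per feature dimension
normalized = (feats[0] - mean) / (std + 1e-14)  # the same operation as apply()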