Example #1
    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        # basestring exists only in Python 2; fall back to str on Python 3.
        try:
            is_str = isinstance(audio_file, basestring)
        except NameError:
            is_str = isinstance(audio_file, str)
        if is_str and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part
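A minimal usage sketch; `generator` stands for a hypothetical instance of the class that defines process_utterance (its construction is omitted), and the file path and transcript are placeholders:

# `generator` is a hypothetical, already-constructed preprocessor instance.
specgram, transcript_part = generator.process_utterance(
    'data/demo.wav', u'今天天气很好')
print(specgram.shape)  # 2-D feature array after normalization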
Example #2
import numpy as np  # AudioFeaturizer, FeatureNormalizer, etc. come from the surrounding project
def get_audio_mfcc_features(txt_files,
                            wav_files,
                            n_input,
                            n_context,
                            word_num_map,
                            txt_labels=None,
                            specgram_type='mfcc',
                            mean_std_filepath='data/aishell/mean_std.npz'):
    """Get MFCC or linear spectrogram features. An MFCC frame has 39
    dimensions: 13 MFCCs + 13 delta1 + 13 delta2. A linear spectrogram
    frame has 161 features, one per frequency bin.

    :param txt_files: List of transcript file paths; if given, transcripts
                      are read from these files.
    :param wav_files: List of audio file paths.
    :param n_input: Feature dimension per frame (39 for MFCC).
    :param n_context: Number of context frames on each side of the current
                      frame.
    :param word_num_map: Mapping from characters to token ids.
    :param txt_labels: Transcript strings, used when txt_files is None.
    :param specgram_type: Feature type, 'mfcc' or 'linear'.
    :param mean_std_filepath: Path to the mean/std file for feature
                              normalization.
    :return: Tuple of (audio_features, audio_features_len, text_vector,
             text_vector_len) as numpy arrays.
    """
    audio_features = []
    audio_features_len = []
    text_vector = []
    text_vector_len = []
    if txt_files is not None:
        txt_labels = txt_files
    get_feature = AudioFeaturizer(specgram_type)
    normalizer = FeatureNormalizer(mean_std_filepath)
    for txt_obj, wav_file in zip(txt_labels, wav_files):
        # Turn inputs into features
        if specgram_type == 'mfcc':
            audio_data = audiofile_to_input_vector(
                wav_file, n_input, n_context)  # MFCC rows of width 741 when n_context=9
        elif specgram_type == 'linear':
            speech_segment = SpeechSegment.from_file(wav_file, "")
            specgram = get_feature.featurize(speech_segment)
            audio_data = normalizer.apply(specgram)
            audio_data = np.transpose(
                audio_data)  # get linear specgram feature, (?, 161)
        audio_data = audio_data.astype('float32')

        audio_features.append(audio_data)
        audio_features_len.append(np.int32(len(audio_data)))

        if txt_files is not None:  # txt_obj is a transcript file path
            target = trans_text_ch_to_vector(txt_obj, word_num_map)
        else:
            target = trans_text_ch_to_vector(None, word_num_map,
                                             txt_obj)  # txt_obj is a label string
        text_vector.append(target)
        text_vector_len.append(len(target))

    audio_features = np.asarray(audio_features)
    audio_features_len = np.asarray(audio_features_len)
    text_vector = np.asarray(text_vector)
    text_vector_len = np.asarray(text_vector_len)
    return audio_features, audio_features_len, text_vector, text_vector_len
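A hedged usage sketch; the file list, transcript, and vocabulary map below are placeholders, and n_context=9 matches the 741-dimensional MFCC rows noted above:

wav_files = ['data/aishell/S0001.wav']  # placeholder path
txt_labels = [u'今天天气很好']           # transcripts passed directly, not via files
word_num_map = {u'今': 0, u'天': 1}      # hypothetical char-to-id vocabulary
feats, feat_lens, texts, text_lens = get_audio_mfcc_features(
    None, wav_files, n_input=39, n_context=9,
    word_num_map=word_num_map, txt_labels=txt_labels)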
Example #3
import numpy as np  # AudioFeaturizer and SpeechSegment come from the surrounding project
def audiofile_to_input_vector(audio_filename, n_input, n_context):
    """Compute MFCC features with n_context frames of context on each side.

    :param audio_filename: Path to the audio file.
    :param n_input: Feature dimension per frame (39 for MFCC).
    :param n_context: Number of past and future context frames.
    :return: 2darray of shape (num_frames, n_input + 2 * n_input * n_context).
    """

    # Get MFCC features with 39 dimensions per frame.
    get_feature = AudioFeaturizer("mfcc")
    speech_segment = SpeechSegment.from_file(audio_filename, "")
    orig_inputs = get_feature.featurize(speech_segment)  # (39, num_frames)
    orig_inputs = np.transpose(orig_inputs)  # to time-major: (num_frames, 39)


    # Each row holds the current frame plus n_context past and n_context future
    # frames: n_input + 2 * n_input * n_context columns (195 when n_input=39, n_context=2).
    train_inputs = np.zeros((orig_inputs.shape[0], n_input + 2 * n_input * n_context))
    empty_mfcc = np.zeros(n_input)

    # Each assembled row consists of three parts:
    # past context (n_context * n_input) + current frame (n_input)
    # + future context (n_context * n_input).
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context
    for time_slice in time_slices:
        # Zero-pad the past context for the first n_context frames.
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = [empty_mfcc for _ in range(need_empty_past)]
        data_source_past = orig_inputs[max(0, time_slice - n_context):time_slice]

        # Zero-pad the future context for the last n_context frames.
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = [empty_mfcc for _ in range(need_empty_future)]
        data_source_future = orig_inputs[time_slice + 1:time_slice + n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = orig_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)
        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Standardize: subtract the global mean and divide by the standard deviation.
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

    # Shape of train_inputs: (num_frames, n_input + 2 * n_input * n_context).
    return train_inputs
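To make the context windowing concrete, here is a small sketch of the row built for the first frame in a toy case (n_input=2, n_context=1); the numbers are illustrative only:

import numpy as np

frames = np.array([[1., 1.], [2., 2.], [3., 3.]])  # 3 frames, n_input = 2
# Row for time_slice = 0 with n_context = 1: the past context is zero-padded,
# the future context is the next frame.
row0 = np.concatenate((np.zeros(2), frames[0], frames[1]))
print(row0)  # [0. 0. 1. 1. 2. 2.] -> n_input + 2 * n_input * n_context = 6 values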
Example #4
    def process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param filename: Audio filepath
        :type filename: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and list of token ids for
                 transcription.
        :rtype: tuple of (2darray, list)
        """
        if filename.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(filename), transcript)
        else:
            speech_segment = SpeechSegment.from_file(filename, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids
Example #5
    def process_utterance(self, audio_file):
        """Load and preprocess speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :return: Preprocessed audio data.
        :rtype: 2darray
        """
        speech_segment = SpeechSegment.from_file(audio_file, "")
        specgram, _ = self._speech_featurizer.featurize(speech_segment, False)
        specgram = self._normalizer.apply(specgram)
        return specgram
Example #6
    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :param transcript: Transcription text for the audio.
        :type transcript: str
        :return: Normalized, preprocessed audio data and the token ids of the
                 corresponding transcript.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part
Example #7
    def process_utterance_from_bytes(self, audio_bytes, transcript,
                                     **soundfile_options):
        """Load, augment, featurize and normalize for speech data.

        :param audio_bytes: Bytes read from the audio file.
        :type audio_bytes: bytes
        :param transcript: Transcription text.
        :type transcript: basestring
        :param soundfile_options: Options for opening with the soundfile
                                  library.
        :type soundfile_options: **kwargs
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_bytes(audio_bytes, transcript,
                                                  **soundfile_options)

        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)

        return specgram, transcript_part
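A hedged usage sketch for the bytes-based variant; `preprocessor` stands for a hypothetical instance of the class defining this method, and the file path is a placeholder:

# `preprocessor` is a hypothetical, already-constructed instance.
with open('data/demo.wav', 'rb') as f:
    audio_bytes = f.read()
specgram, transcript_part = preprocessor.process_utterance_from_bytes(
    audio_bytes, u'今天天气很好')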