Example #1 (score: 0)
File: parser.py — Project: kldami/KoSpeech
    def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
        """
        Extract a feature vector from an audio file, optionally applying
        noise injection and/or SpecAugment.

        Args:
             audio_path (str): path of audio file
             augment_method (int): flag indicating which augmentation method to use.

        Returns: feature_vector
            - **feature_vector** (torch.FloatTensor): feature from audio file.
        """
        signal = load_audio(audio_path, self.del_silence)

        # Noise injection operates on the raw waveform, before feature extraction.
        if augment_method in (SpectrogramParser.NOISE_INJECTION, SpectrogramParser.HYBRID_AUGMENT):
            signal = self.noise_injector(signal)

        feature_vector = self.transforms(signal)

        # Utterance-level zero-mean / unit-variance normalization.
        if self.normalize:
            feature_vector = (feature_vector - feature_vector.mean()) / np.std(feature_vector)

        # Refer to "Sequence to Sequence Learning with Neural Network" paper.
        # The reversed slice has negative strides, so a contiguous copy is
        # required before handing it to torch.
        if self.input_reverse:
            reversed_vector = feature_vector[:, ::-1]
            feature_vector = FloatTensor(np.ascontiguousarray(np.swapaxes(reversed_vector, 0, 1)))
        else:
            feature_vector = FloatTensor(feature_vector).transpose(0, 1)

        # SpecAugment operates on the extracted feature, after tensor conversion.
        if augment_method in (SpectrogramParser.SPEC_AUGMENT, SpectrogramParser.HYBRID_AUGMENT):
            feature_vector = self.spec_augment(feature_vector)

        return feature_vector
Example #2 (score: 0)
def parse_audio(audio_path: str, del_silence: bool = True) -> Tensor:
    """
    Extract an 80-band Kaldi-style log-mel filterbank feature from an audio file.

    Args:
        audio_path (str): path of the audio file
        del_silence (bool): whether to remove silent sections from the signal

    Returns:
        Tensor: mean-normalized fbank feature, shape (num_frames, num_mel_bins)
    """
    signal = load_audio(audio_path, del_silence)

    # Kaldi-compatible fbank: 20 ms Hamming window, 10 ms shift, 80 mel bins.
    feature_vector = torchaudio.compliance.kaldi.fbank(
        Tensor(signal).unsqueeze(0),
        num_mel_bins=80,
        frame_length=20,
        frame_shift=10,
        window_type='hamming')

    # The original code detoured through numpy with two .transpose(0, 1)
    # calls that cancel each other out; since the subtracted mean is a
    # scalar, normalizing the tensor directly is equivalent and avoids the
    # redundant tensor -> numpy -> tensor roundtrip.
    feature_vector -= feature_vector.mean()

    return feature_vector
def parse_audio(audio_path: str, del_silence: bool = True) -> Tensor:
    """
    Extract a time-reversed 40-dimensional MFCC feature from an audio file.

    Args:
        audio_path (str): path of the audio file
        del_silence (bool): whether to remove silent sections from the signal

    Returns:
        torch.FloatTensor: mean-normalized, time-reversed MFCC feature,
        shape (time, n_mfcc)
    """
    signal = load_audio(audio_path, del_silence)
    # Assumes 16 kHz audio: n_fft=320 is a 20 ms window, hop_length=160 a 10 ms shift.
    mfcc = librosa.feature.mfcc(y=signal,
                                sr=16000,
                                n_mfcc=40,
                                n_fft=320,
                                hop_length=160)

    mfcc -= mfcc.mean()

    # BUG FIX: the original converted to a torch Tensor first and then sliced
    # with [:, ::-1], which raises ValueError — torch tensors do not support
    # negative strides — and would have reversed the wrong axis anyway.
    # Reverse the time axis while still in numpy (mfcc is (n_mfcc, time)
    # here), then make a contiguous copy and swap axes to (time, n_mfcc).
    mfcc = mfcc[:, ::-1]

    return torch.FloatTensor(np.ascontiguousarray(np.swapaxes(mfcc, 0, 1)))
Example #4 (score: 0)
def parse_audio(audio_path: str,
                del_silence: bool = False,
                audio_extension: str = 'pcm') -> Tensor:
    """
    Load an audio file and extract a normalized 80-band log-mel filterbank feature.

    Args:
        audio_path (str): path of the audio file
        del_silence (bool): whether to remove silent sections from the signal
        audio_extension (str): extension/format of the audio file

    Returns:
        torch.FloatTensor: normalized fbank feature, shape (time, num_mel_bins)
    """
    signal = load_audio(audio_path, del_silence, extension=audio_extension)

    # Kaldi-style fbank: 20 ms Hamming window, 10 ms shift, 80 mel bins.
    waveform = Tensor(signal).unsqueeze(0)
    fbank = torchaudio.compliance.kaldi.fbank(waveform=waveform,
                                              num_mel_bins=80,
                                              frame_length=20,
                                              frame_shift=10,
                                              window_type='hamming')
    feature = fbank.transpose(0, 1).numpy()

    # Utterance-level zero-mean / unit-variance normalization.
    feature = (feature - feature.mean()) / np.std(feature)

    return torch.FloatTensor(feature).transpose(0, 1)
Example #5 (score: 0)
File: parser.py — Project: hwiorn/KoSpeech
    def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
        """
        Parses audio into a feature tensor.

        Args:
             audio_path (str): path of audio file
             augment_method (int): flag indicating which augmentation method to use.

        Returns: feature
            - **feature** (torch.FloatTensor): feature from audio file, or
              ``None`` when the audio could not be loaded.
        """
        signal = load_audio(audio_path,
                            self.del_silence,
                            extension=self.audio_extension)

        # load_audio signals failure by returning None; log and propagate it
        # so the caller can skip the sample instead of crashing here.
        if signal is None:
            logger.info("Audio is None : {0}".format(audio_path))
            return None

        feature = self.transforms(signal)

        # Utterance-level zero-mean / unit-variance normalization.
        # Presumably `feature` is a numpy array at this point (np.std and the
        # ::-1 slice below both require it) — TODO confirm self.transforms.
        if self.normalize:
            feature -= feature.mean()
            feature /= np.std(feature)

        # Refer to "Sequence to Sequence Learning with Neural Network" paper
        # The reversed slice has negative strides, which torch tensors cannot
        # hold — hence the np.ascontiguousarray copy before conversion.
        if self.input_reverse:
            feature = feature[:, ::-1]
            feature = FloatTensor(
                np.ascontiguousarray(np.swapaxes(feature, 0, 1)))
        else:
            feature = FloatTensor(feature).transpose(0, 1)

        # Only SpecAugment is applied here, after conversion to a tensor.
        if augment_method == SpectrogramParser.SPEC_AUGMENT:
            feature = self.spec_augment(feature)

        return feature