Example #1
0
    def parse_audio(self, audio_path):
        """Load an audio file and return its log-magnitude spectrogram.

        Pipeline: (optional) speed/volume-perturbed load -> (optional)
        noise injection -> STFT -> magnitude -> log1p -> FloatTensor ->
        (optional) mean/std normalization -> (optional) SpecAugment.

        Args:
            audio_path: Path to the audio file to load.

        Returns:
            torch.FloatTensor holding log1p of the STFT magnitude
            (frequency bins x time frames), optionally normalized and
            SpecAugment-ed.
        """
        if self.aug_conf and self.aug_conf.speed_volume_perturb:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        # Fix: noise_prob lives on aug_conf, and the check above shows
        # aug_conf may be falsy -- guard it before dereferencing to avoid
        # an AttributeError when a noise injector is set without aug_conf.
        if self.noise_injector and self.aug_conf:
            # Inject noise with probability aug_conf.noise_prob.
            add_noise = np.random.binomial(1, self.aug_conf.noise_prob)
            if add_noise:
                y = self.noise_injector.inject_noise(y)
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # Short-time Fourier transform.
        D = librosa.stft(y,
                         n_fft=n_fft,
                         hop_length=hop_length,
                         win_length=win_length,
                         window=self.window)
        spect, phase = librosa.magphase(D)
        # S = log(S + 1): compress the dynamic range of the magnitudes.
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            # Robustness fix: a constant (e.g. silent) input has std == 0;
            # dividing by it would fill the tensor with NaNs downstream.
            if std > 0:
                spect.div_(std)

        if self.aug_conf and self.aug_conf.spec_augment:
            spect = spec_augment(spect)

        return spect
Example #2
0
    def parse_audio(self, audio_path):
        """Load an audio file and return its log-magnitude spectrogram.

        Pipeline: (optional) speed/volume-perturbed load -> (optional)
        noise injection -> STFT -> magnitude -> log1p -> FloatTensor ->
        (optional) mean/std normalization -> (optional) SpecAugment.

        Args:
            audio_path: Path to the audio file to load.

        Returns:
            torch.FloatTensor holding log1p of the STFT magnitude
            (frequency bins x time frames), optionally normalized and
            SpecAugment-ed.
        """
        if self.aug_conf and self.aug_conf.speed_volume_perturb:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        # Fix: noise_prob lives on aug_conf, and the check above shows
        # aug_conf may be falsy -- guard it before dereferencing.
        if self.noise_injector and self.aug_conf:
            # Inject noise with probability aug_conf.noise_prob.
            add_noise = np.random.binomial(1, self.aug_conf.noise_prob)
            if add_noise:
                y = self.noise_injector.inject_noise(y)

        # Per the original inline comments these evaluate to 320/320/160
        # (presumably 16 kHz audio with a 20 ms window and 10 ms stride).
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # Short-time Fourier transform (complex-valued).
        D = librosa.stft(
            y,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.window
        )
        # Split the complex STFT into magnitude and phase; only the
        # magnitude is used from here on.
        spect, phase = librosa.magphase(D)

        # S = log(S + 1): compress the dynamic range of the magnitudes.
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)

        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            # Fix: the original detected std == 0 (printing "nan nan") but
            # divided anyway, filling the tensor with NaNs. Skip the divide
            # for degenerate (constant/silent) inputs instead.
            if std > 0:
                spect.div_(std)
            else:
                print("Warning: zero std for", audio_path,
                      "- skipping std normalization")

        if self.aug_conf and self.aug_conf.spec_augment:
            spect = spec_augment(spect)

        return spect