Example #1
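# Context (assumed, not part of the original snippet): this function follows
# the DisVoice articulation module and relies on module-level imports/helpers
# such as os, numpy as np, pysptk, read (scipy.io.wavfile), praat_functions,
# extractTrans, V_UV, and plot_art.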
def articulation_continuous(audio_filename,
                            flag_plots,
                            sizeframe=0.04,
                            step=0.02,
                            nB=22,
                            nMFCC=12,
                            minf0=60,
                            maxf0=350,
                            voice_bias=-0.5,
                            len_thr_miliseconds=270.0,
                            pitch_method='praat'):

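    # Read the waveform, remove the DC offset, peak-normalize to [-1, 1],
    # and convert frame/step sizes from seconds to samples.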
    fs, data_audio = read(audio_filename)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = sizeframe * float(fs)
    size_stepS = step * float(fs)
    overlap = size_stepS / size_frameS

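    # F0 contour and voiced/unvoiced segmentation, using either Praat (via
    # temporary F0/VUV text files) or RAPT from pysptk (which expects a
    # 16-bit-range float32 signal).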
    if pitch_method == 'praat':
        name_audio = audio_filename.split('/')
        temp_uuid = 'artic' + name_audio[-1][0:-4]
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio_filename,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=step,
                                  minf0=minf0,
                                  maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), step)
        segmentsFull, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
            temp_filename_vuv, data_audio, fs, sizeframe)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=minf0,
                              max=maxf0,
                              voice_bias=voice_bias,
                              otype='f0')
        segmentsOn = V_UV(F0, data_audio, fs, 'onset')
        segmentsOff = V_UV(F0, data_audio, fs, 'offset')

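    # Bark-band energies (BBE) and MFCCs computed over the voiced/unvoiced
    # transition segments (onsets and offsets).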
    BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS, nB,
                                 nMFCC)
    BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS, size_stepS,
                                   nB, nMFCC)

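    # First- and second-order deltas of each MFCC coefficient along time.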
    DMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
    DDMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T

    DMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
    DDMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

    # TODO: Make parameters configurable. (If worth it)
    name_audio = audio_filename.split('/')
    temp_uuid = 'artic' + name_audio[-1][0:-4]
    temp_filename = '../tempfiles/tempFormants' + temp_uuid + '.txt'
    praat_functions.praat_formants(audio_filename, temp_filename, sizeframe,
                                   step)
    [F1, F2] = praat_functions.decodeFormants(temp_filename)
    os.remove(temp_filename)

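    # Align the F0 and formant tracks to a common length by zero-padding the
    # shorter one.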
    if len(F0) < len(F1):
        F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
    else:
        F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
        F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

    pos0 = np.where(F0 == 0)[0]  # frame indices where F0 == 0 (unvoiced/silence)
    dpos0 = np.hstack(([1], np.diff(pos0)))
    f0u = np.split(pos0, np.where(dpos0 > 1)[0])  # runs of consecutive unvoiced frames

    thr_sil = int(len_thr_miliseconds / (1000. * step))  # ms -> frames (step is in seconds)

    sil_seg = []
    for l in range(len(f0u)):
        if len(f0u[l]) >= thr_sil:
            # Unvoiced run longer than the threshold: treat it as silence and
            # zero out the formant tracks there.
            F1[f0u[l]] = 0
            F2[f0u[l]] = 0
            sil_seg.append(f0u[l])

    if sil_seg:
        sil_seg = np.hstack(sil_seg)

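    # Keep only nonzero (voiced, non-silence) formant values, plus first- and
    # second-order formant derivatives.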
    F1nz = F1[F1 != 0]
    F2nz = F2[F2 != 0]
    DF1 = np.diff(F1, n=1)
    DF2 = np.diff(F2, n=1)
    DDF1 = np.diff(F1, n=2)
    DDF2 = np.diff(F2, n=2)

    if flag_plots:
        plot_art(data_audio, fs, F0, F1, F2, segmentsOn, segmentsOff)

    return BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff, DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2
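
# Minimal usage sketch (not from the original source). It assumes a mono WAV
# file at the path below (the same file used in the doctest of Example #2)
# and that the helpers listed at the top are importable.
if __name__ == "__main__":
    wav = "../audios/001_ddk1_PCGITA.wav"  # hypothetical example path
    feats = articulation_continuous(wav, flag_plots=False)
    (BBEon, MFCCon, DMFCCon, DDMFCCon,
     BBEoff, MFCCoff, DMFCCoff, DDMFCCoff,
     F1nz, DF1, DDF1, F2nz, DF2, DDF2) = feats
    print("onset MFCCs:", MFCCon.shape)        # (n_onset_frames, nMFCC)
    print("voiced F1 samples:", F1nz.shape)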
Example #2
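    # Context (assumed, not part of the original snippet): this method belongs
    # to the DisVoice Articulation class; besides the helpers from Example #1
    # it uses pandas as pd, torch, save_dict_kaldimat, dynamic2statict_artic,
    # and instance attributes set in __init__ (self.PATH, self.head,
    # self.head_dyn, and the frame/pitch parameters).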
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the articulation features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistical functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the waveform together with the F0 contour, the formants, and the voiced/unvoiced segments
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> articulation=Articulation()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features1=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=articulation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> articulation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        
        >>> path_audio="../audios/"
        >>> features1=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
        >>> features2=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
        >>> features3=articulation.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
        >>> articulation.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")

        """
        fs, data_audio = read(audio)
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        size_frameS = self.sizeframe * float(fs)
        size_stepS = self.step * float(fs)

        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'articulation' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            praat_functions.praat_vuv(audio,
                                      temp_filename_f0,
                                      temp_filename_vuv,
                                      time_stepF0=self.step,
                                      minf0=self.minf0,
                                      maxf0=self.maxf0)
            F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                             len(data_audio) / float(fs),
                                             self.step)
            segmentsFull, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
                temp_filename_vuv, data_audio, fs, self.sizeframe)
            os.remove(temp_filename_vuv)
            os.remove(temp_filename_f0)
        elif self.pitch_method == 'rapt':
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

            segmentsOn = V_UV(F0, data_audio, fs, 'onset')
            segmentsOff = V_UV(F0, data_audio, fs, 'offset')

        BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS,
                                     self.nB, self.nMFCC)
        BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS,
                                       size_stepS, self.nB, self.nMFCC)

        DMFCCon = np.asarray(
            [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
        DDMFCCon = np.asarray(
            [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T

        DMFCCoff = np.asarray(
            [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
        DDMFCCoff = np.asarray(
            [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

        name_audio = audio.split('/')
        temp_uuid = 'artic' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename = self.PATH + '/../tempfiles/tempFormants' + temp_uuid + '.txt'
        praat_functions.praat_formants(audio, temp_filename, self.sizeframe,
                                       self.step)
        [F1, F2] = praat_functions.decodeFormants(temp_filename)
        os.remove(temp_filename)

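        # Degenerate case: if the F0 track is shorter than the formant track,
        # pad F0 and return empty formant-based features; otherwise pad the
        # formant tracks and remove long unvoiced/silence runs as in Example #1.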
        if len(F0) < len(F1):
            F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
            F1nz = np.zeros((0, 1))
            F2nz = np.zeros((0, 1))
            DF1 = np.zeros((0, 1))
            DDF1 = np.zeros((0, 1))
            DF2 = np.zeros((0, 1))
            DDF2 = np.zeros((0, 1))
        else:
            F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
            F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

            pos0 = np.where(F0 == 0)[0]  # frame indices where F0 == 0 (unvoiced/silence)
            dpos0 = np.hstack(([1], np.diff(pos0)))
            f0u = np.split(pos0, np.where(dpos0 > 1)[0])  # runs of consecutive unvoiced frames

            thr_sil = int(self.len_thr_miliseconds / (1000. * self.step))  # ms -> frames (step is in seconds)

            sil_seg = []
            for l in range(len(f0u)):
                if len(f0u[l]) >= thr_sil:
                    # Unvoiced run longer than the threshold: treat it as
                    # silence and zero out the formant tracks there.
                    F1[f0u[l]] = 0
                    F2[f0u[l]] = 0
                    sil_seg.append(f0u[l])

            if sil_seg:
                sil_seg = np.hstack(sil_seg)

            F1nz = F1[F1 != 0]
            F2nz = F2[F2 != 0]
            DF1 = np.diff(F1, n=1)
            DF2 = np.diff(F2, n=1)
            DDF1 = np.diff(F1, n=2)
            DDF2 = np.diff(F2, n=2)

            if plots:
                self.plot_art(data_audio, fs, F0, F1, F2, segmentsOn,
                              segmentsOff)

            if len(F1nz) == 0:
                F1nz = np.zeros((0, 1))
            if len(F2nz) == 0:
                F2nz = np.zeros((0, 1))
            if len(DF1) == 0:
                DF1 = np.zeros((0, 1))
            if len(DDF1) == 0:
                DDF1 = np.zeros((0, 1))
            if len(DF2) == 0:
                DF2 = np.zeros((0, 1))
            if len(DDF2) == 0:
                DDF2 = np.zeros((0, 1))

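        # Static features: functionals (avg/std/skewness/kurtosis, cf. head_st
        # below) over each descriptor matrix. Dynamic features: frame-level
        # onset matrix; leading rows are trimmed so the MFCCs and their
        # first/second deltas have matching lengths.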
        feat_v = dynamic2statict_artic([
            BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff,
            DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2
        ])
        feat_mat = np.hstack(
            (BBEon[2:, :], MFCCon[2:, :], DMFCCon[1:, :], DDMFCCon))

        if fmt in ("npy", "txt"):
            if static:
                return feat_v
            return feat_mat
        if fmt in ("dataframe", "csv"):
            if static:
                head_st = []
                df = {}
                for k in ["avg", "std", "skewness", "kurtosis"]:
                    for h in self.head:
                        head_st.append(k + " " + h)
                for e, k in enumerate(head_st):
                    df[k] = [feat_v[e]]

                return pd.DataFrame(df)
            else:
                df = {}
                for e, k in enumerate(self.head_dyn):
                    df[k] = feat_mat[:, e]
                return pd.DataFrame(df)
        if fmt == "torch":
            if static:
                feat_t = torch.from_numpy(feat_v)
                return feat_t
            return torch.from_numpy(feat_mat)

        if fmt == "kaldi":
            if static:
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            name_all = audio.split('/')
            dictX = {name_all[-1]: feat_mat}
            save_dict_kaldimat(dictX, kaldi_file)
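
# Minimal usage sketch (not from the original source). It mirrors the doctest
# above and assumes Articulation is the DisVoice class this method belongs to.
if __name__ == "__main__":
    articulation = Articulation()
    file_audio = "../audios/001_ddk1_PCGITA.wav"  # hypothetical example path
    # Static: one row of avg/std/skewness/kurtosis functionals per file.
    df_static = articulation.extract_features_file(
        file_audio, static=True, plots=False, fmt="dataframe")
    # Dynamic: one row per frame of the onset BBE/MFCC (+deltas) matrix.
    mat_dyn = articulation.extract_features_file(
        file_audio, static=False, plots=False, fmt="npy")
    print(df_static.shape, mat_dyn.shape)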