Example No. 1
    def __test(hz, resolution, bins_per_octave, tuning):

        est_tuning = librosa.pitch_tuning(hz,
                                          resolution=resolution,
                                          bins_per_octave=bins_per_octave)

        assert np.abs(tuning - est_tuning) <= resolution
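For reference, librosa.pitch_tuning estimates the tuning deviation (in fractions of a bin) of a set of frequencies, so the assertion above checks that a known detuning is recovered to within the requested resolution. A self-contained sketch of the same check, using synthetic detuned frequencies (the values are illustrative):

import numpy as np
import librosa

tuning = 0.23                 # known detuning, in fractions of a bin
resolution = 0.01
bins_per_octave = 12

# Equal-tempered scale frequencies, shifted by `tuning` bins
hz = librosa.midi_to_hz(np.arange(60, 72) + tuning)

est_tuning = librosa.pitch_tuning(hz,
                                  resolution=resolution,
                                  bins_per_octave=bins_per_octave)
assert np.abs(tuning - est_tuning) <= resolution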
Example No. 3
def lpc_emotion_upload():
    entry = dict()
    wav_files = []
    SAMPLE_RATE = 44100
    b, _ = librosa.core.load('pickles/catalyst.wav', sr=SAMPLE_RATE)
    y, sr = librosa.load('pickles/catalyst.wav')
    lpc = librosa.lpc(y, order=5)
    for no in range(0, len(lpc)):
        entry['LIB_LPC{0}'.format(no)] = lpc[no]
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # Select out pitches with high energy
    pitches = pitches[magnitudes > np.median(magnitudes)]
    pit = librosa.pitch_tuning(pitches)

    entry['pitch'] = pit

    wav_files.append(entry)
    wav_df = pd.DataFrame(wav_files)
    lpc_clf = joblib.load('pickles/lpc_model.sav')

    bar = pd.DataFrame(lpc_clf.predict_proba(wav_df))
    bar.columns = lpc_clf.classes_
    bar_t = bar.T
    bar_t.columns = ['values']
    print('HERE')

    fig = go.Figure(data=[
        go.Pie(labels=lpc_clf.classes_, values=bar_t['values'], hole=.3),
    ])
    return lpc_clf.predict(wav_df), fig
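A caller would presumably consume the returned (prediction, figure) pair along these lines (a sketch, assuming the pickled model and 'pickles/catalyst.wav' are in place):

labels, fig = lpc_emotion_upload()
print(labels)  # predicted emotion class for the clip
fig.show()     # render the class-probability pie chart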
Example No. 4
    def get_wav_df(self):
        wav_files = []
        for wav in os.listdir(self.wav_dir):
            if wav.endswith('.wav'):
                entry = dict()
                entry['Session'] = wav

                fs, signal = swav.read(self.wav_dir + '/' + wav)
                y, sr = librosa.load(self.wav_dir + '/' + wav)
                lpc = librosa.lpc(y, order=5)
                for no in range(0, len(lpc)):
                    entry['LIB_LPC{0}'.format(no)] = lpc[no]
                pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
                # Select out pitches with high energy
                pitches = pitches[magnitudes > np.median(magnitudes)]
                pit = librosa.pitch_tuning(pitches)

                entry['pitch'] = pit

                wav_files.append(entry)

        # wav_files = []
        # entry = dict()
        # iemocap_wav_list = self._load()
        # print(iemocap_wav_list.getframerate())
        # print(iemocap_wav_list)
        # entry['Session'] = glob.glob("*.wav", iemocap_wav_list)
        # if bool(entry):
        #     wav_files.append(entry)
        wav_df = pd.DataFrame(wav_files)
        return wav_df
Example No. 5
def mfcc_emotion_upload():
    entry = dict()
    wav_files = []
    SAMPLE_RATE = 44100

    b, _ = librosa.core.load('pickles/catalyst.wav', sr=SAMPLE_RATE)
    y, sr = librosa.load('pickles/catalyst.wav')
    entry['Mean_RMS'] = np.mean(librosa.feature.rms(y=y))
    entry['STD_RMS'] = np.std(librosa.feature.rms(y=y))
    assert _ == SAMPLE_RATE
    mfcc_feature = librosa.feature.mfcc(y=b, sr=SAMPLE_RATE, n_mfcc=20)
    delta_mfcc = librosa.feature.delta(mfcc_feature)
    d_delta_mfcc = librosa.feature.delta(mfcc_feature, order=2)
    mean_mfcc = np.mean(mfcc_feature, axis=1)
    std_mfcc = np.std(mfcc_feature, axis=1)
    mean_ddmfcc = np.mean(d_delta_mfcc, axis=1)
    std_ddmfcc = np.std(d_delta_mfcc, axis=1)
    mean_dmfcc = np.mean(delta_mfcc, axis=1)
    std_dmfcc = np.std(delta_mfcc, axis=1)
    for no in range(len(mean_mfcc)):
        entry['Mean_MFCC{0}'.format(no)] = mean_mfcc[no]
        entry['STD_MFCC{0}'.format(no)] = std_mfcc[no]
        entry['Mean_DDMFCC{0}'.format(no)] = mean_ddmfcc[no]
        entry['STD_DDMFCC{0}'.format(no)] = std_ddmfcc[no]
        entry['Mean_Delta_MFCC{0}'.format(no)] = mean_dmfcc[no]
        entry['STD_Delta_MFCC{0}'.format(no)] = std_dmfcc[no]
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # Select out pitches with high energy
    pitches = pitches[magnitudes > np.median(magnitudes)]
    pit = librosa.pitch_tuning(pitches)
    entry['pitch'] = pit

    wav_files.append(entry)
    wav_df = pd.DataFrame(wav_files)
    mfcc_clf = joblib.load('pickles/mfcc_model.sav')
    bar = pd.DataFrame(mfcc_clf.predict_proba(wav_df))
    bar.columns = mfcc_clf.classes_
    bar_t = bar.T
    bar_t.columns = ['values']
    fig = go.Figure(data=[
        go.Pie(labels=mfcc_clf.classes_, values=bar_t['values'], hole=.3),
    ])
    return mfcc_clf.predict(wav_df), fig
Example No. 6
    def get_wav_df(self):
        wav_files = []
        for wav in os.listdir(self.wav_dir):
            if wav.endswith('.wav'):
                entry = dict()
                entry['Session'] = wav
                SAMPLE_RATE = 44100

                b, _ = librosa.core.load(self.wav_dir + '/' + wav, sr=SAMPLE_RATE)
                y, sr = librosa.load(self.wav_dir + '/' + wav)
                entry['Mean_RMS'] = np.mean(librosa.feature.rms(y=y))
                entry['STD_RMS'] = np.std(librosa.feature.rms(y=y))
                assert _ == SAMPLE_RATE
                mfcc_feature = librosa.feature.mfcc(y=b, sr=SAMPLE_RATE, n_mfcc=20)
                delta_mfcc = librosa.feature.delta(mfcc_feature)
                d_delta_mfcc = librosa.feature.delta(mfcc_feature, order=2)
                mean_mfcc = np.mean(mfcc_feature, axis=1)
                std_mfcc = np.std(mfcc_feature, axis=1)
                mean_ddmfcc = np.mean(d_delta_mfcc, axis=1)
                std_ddmfcc = np.std(d_delta_mfcc, axis=1)
                mean_dmfcc = np.mean(delta_mfcc, axis=1)
                std_dmfcc = np.std(delta_mfcc, axis=1)
                for no in range(len(mean_mfcc)):
                    entry['Mean_MFCC{0}'.format(no)] = mean_mfcc[no]
                    entry['STD_MFCC{0}'.format(no)] = std_mfcc[no]
                    entry['Mean_DDMFCC{0}'.format(no)] = mean_ddmfcc[no]
                    entry['STD_DDMFCC{0}'.format(no)] = std_ddmfcc[no]
                    entry['Mean_Delta_MFCC{0}'.format(no)] = mean_dmfcc[no]
                    entry['STD_Delta_MFCC{0}'.format(no)] = std_dmfcc[no]
                pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
                # Select out pitches with high energy
                pitches = pitches[magnitudes > np.median(magnitudes)]
                pit = librosa.pitch_tuning(pitches)
                entry['pitch'] = pit

                wav_files.append(entry)

        wav_df = pd.DataFrame(wav_files)
        return wav_df
Example No. 7
def estimate_a4(pitches, sr):
    # `min_pitch_frame`, `hop_len` and `max_freq` are module-level constants
    # defined elsewhere in the file this excerpt was taken from
    pitches_sel = []
    # Pick out pitches that last longer than `min_pitch_frame`
    for row in range(0, pitches.shape[0]):
        line_frames = []
        line_freq = []
        for col in range(0, pitches.shape[1]):
            if (pitches[row, col] != 0):
                line_frames.append(col)
                line_freq.append(pitches[row, col])
            else:
                if (len(line_frames) > 0):
                    if (len(line_frames) >= min_pitch_frame):
                        line_time = librosa.frames_to_time(line_frames,
                                                           sr=sr,
                                                           hop_length=hop_len)
                        if (line_freq[0] < max_freq):
                            pitches_sel.extend(line_freq.copy())
                    line_frames.clear()
                    line_freq.clear()

    offset_to_a4 = librosa.pitch_tuning(pitches_sel)
    return 440 * (2**(offset_to_a4 / 12))
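`estimate_a4` depends on three module-level settings that this excerpt does not show. A hedged sketch of how it might be wired up; the constant values and the hop length are assumptions, not taken from the original source:

import librosa

min_pitch_frame = 10   # assumed: minimum length of a pitch line, in frames
hop_len = 512          # assumed: hop length used when tracking pitches
max_freq = 2000.0      # assumed: ignore pitch lines starting above this (Hz)

y, sr = librosa.load('example.wav')  # hypothetical input file
pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=hop_len)
print('Estimated A4: %.1f Hz' % estimate_a4(pitches, sr))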
Example No. 8
    def run(self):
        logging.info("Starting Pitch detector")
        # This loop condition has to be checked frequently, so the code inside must not block
        while not self.terminated:
            new_frame = self.audio_frames.get()  # Get a new frame (blocking)
            if self.counter == 0:
                self.frames = new_frame
                self.counter += 1
            elif self.counter >= BUFFER_SIZE:
                self.frames = np.append(self.frames, new_frame)
                pitches, magnitudes = librosa.piptrack(y=self.frames,
                                                       sr=SAMPLE_RATE)
                # Select out pitches with high energy
                pitches = pitches[magnitudes > np.median(magnitudes)]
                new_tuning = int(50 + 100 * librosa.pitch_tuning(pitches))
                if np.abs(self.last_pitch -
                          new_tuning) > PITCH_CHANGE_THRESHOLD:
                    self.last_pitch = new_tuning
                    self.manager.new_tuning(new_tuning)
                self.counter = 0
            else:
                self.frames = np.append(self.frames, new_frame)
                self.counter += 1
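The thread depends on constants and collaborators that the excerpt omits. One plausible setup, stated purely as an assumption to make the protocol concrete:

import queue

BUFFER_SIZE = 32              # assumed: frames accumulated before each analysis
SAMPLE_RATE = 22050           # assumed: capture sample rate in Hz
PITCH_CHANGE_THRESHOLD = 3    # assumed: minimum change on the 0-100 tuning scale to report

audio_frames = queue.Queue()  # a producer thread would put NumPy audio frames here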
Example No. 9
def features(X, sample_rate):

    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the minimum and maximum fundamental frequency of human speech
    pitches, magnitudes = librosa.piptrack(y=X, sr=sample_rate, S=stft, fmin=70, fmax=400)
    pitch = []
    for i in range(magnitudes.shape[1]):
        index = magnitudes[:, i].argmax()
        pitch.append(pitches[index, i])

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)
    pitchmin = np.min(pitch)

    # Spectral centroid
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC features with 50 coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)
    mfccsstd = np.std(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)
    mfccmax = np.max(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)

    # Chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # Root-mean-square energy
    rmse = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate((ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features
Example No. 10
    filename, rhythm_code, pitch_code = 'F:/项目/花城音乐项目/样式数据/9.08MP3/旋律/xx3.wav', '[2000;250,250,250,250,1000;2000;500,500,1000]', '[6,5,6,3,5,6,3,2,1,6-]'
    # y, sr = librosa.load(filename, offset=0.75, duration=0.2)
    y, sr = librosa.load(filename, offset=1.1, duration=0.2)  # -0.029
    # y, sr = librosa.load(filename, offset=1.3, duration=0.2)    # 0.160
    # y, sr = librosa.load(filename, offset=2.6, duration=0.2)    # -0.48
    # y, sr = librosa.load(filename, offset=2.8, duration=0.2)    #-0.169
    # y, sr = librosa.load(filename, offset=3, duration=0.2)
    # y, sr = librosa.load(filename, offset=3.3, duration=0.2)
    # y, sr = librosa.load(filename, offset=3.55, duration=0.2)

    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    np.set_printoptions(threshold=sys.maxsize)  # np.nan is rejected by newer NumPy; requires `import sys`
    print(pitches[np.nonzero(pitches)])

    pitches = pitches[magnitudes > np.median(magnitudes)]
    p = librosa.pitch_tuning(pitches)
    print(p)

    tun = librosa.estimate_tuning(y=y, sr=sr)
    print(tun)

    onset_frames_time = [
        0.7662585, 1.27709751, 2.80961451, 3.0185941, 3.29723356, 3.57587302,
        3.80807256, 4.80653061, 7.2678458, 7.70902494
    ]
    onset_frames_time_diff = np.diff(onset_frames_time)
    onset_frames_time_diff = list(onset_frames_time_diff)
    onset_frames_time_diff.append(0.2)
    for i, o in enumerate(onset_frames_time):
        offset = round(o, 2)
        duration = round(onset_frames_time_diff[i], 2)
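        # (The excerpt is truncated here. Judging from the commented-out loads
        # above, each onset segment was presumably re-loaded and its tuning
        # measured; the following completion is an assumption.)
        y, sr = librosa.load(filename, offset=offset, duration=duration)
        pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
        pitches = pitches[magnitudes > np.median(magnitudes)]
        print(offset, librosa.pitch_tuning(pitches))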
Example No. 11
def probabilities(y, note_min, note_max, sr, frame_length, window_length,
                  hop_length, pitch_acc, voiced_acc, onset_acc, spread):
    """
    Estimate prior (observed) probabilities from audio signal

    Parameters
    ----------
    y : 1-D numpy array
        Array containing audio samples

    note_min : string, 'A#4' format
        Lowest note supported by this estimator
    note_max : string, 'A#4' format
        Highest note supported by this estimator
    sr : int
        Sample rate.
    frame_length : int
    window_length : int
    hop_length : int
        Parameters for FFT estimation
    pitch_acc : float, between 0 and 1
        Probability (estimated) that the pitch estimator is correct.
    voiced_acc : float, between 0 and 1
        Estimated accuracy of the "voiced" parameter.
    onset_acc : float, between 0 and 1
        Estimated accuracy of the onset detector.
    spread : float, between 0 and 1
        Probability that the singer/musician had a one-semitone deviation
        due to vibrato or glissando.
    Returns
    -------
    P : 2D numpy array.
        P[j,t] is the prior probability of being in state j at time t.
    """

    fmin = librosa.note_to_hz(note_min)
    fmax = librosa.note_to_hz(note_max)
    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1

    # F0 and voicing
    f0, voiced_flag, voiced_prob = librosa.pyin(y,
                                                fmin=fmin * 0.9,
                                                fmax=fmax * 1.1,
                                                sr=sr,
                                                frame_length=frame_length,
                                                win_length=window_length,
                                                hop_length=hop_length)
    tuning = librosa.pitch_tuning(f0)
    # pitch_tuning returns an offset in fractions of a semitone, so correct
    # in MIDI (log-frequency) space rather than in Hz
    f0_ = np.round(librosa.hz_to_midi(f0) - tuning).astype(int)
    onsets = librosa.onset.onset_detect(y=y,
                                        sr=sr,
                                        hop_length=hop_length,
                                        backtrack=True)

    P = np.ones((n_notes * 2 + 1, len(f0)))

    for t in range(len(f0)):
        # Probability of silence or onset = 1 - voiced_prob
        # Probability of the estimated note = voiced_prob * pitch_acc
        # Probability of any other note = voiced_prob * (1 - pitch_acc)
        if not voiced_flag[t]:
            P[0, t] = voiced_acc
        else:
            P[0, t] = 1 - voiced_acc

        for j in range(n_notes):
            if t in onsets:
                P[(j * 2) + 1, t] = onset_acc
            else:
                P[(j * 2) + 1, t] = 1 - onset_acc

            if j + midi_min == f0_[t]:
                P[(j * 2) + 2, t] = pitch_acc

            elif np.abs(j + midi_min - f0_[t]) == 1:
                P[(j * 2) + 2, t] = pitch_acc * spread

            else:
                P[(j * 2) + 2, t] = 1 - pitch_acc

    return P
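A sketch of how `probabilities` might be called; the parameter values are illustrative assumptions, not taken from the original repository:

import librosa

y, sr = librosa.load('melody.wav', sr=22050)  # hypothetical input file
P = probabilities(y, note_min='A2', note_max='E6', sr=sr,
                  frame_length=2048, window_length=1024, hop_length=256,
                  pitch_acc=0.9, voiced_acc=0.9, onset_acc=0.9, spread=0.2)
print(P.shape)  # (2 * n_notes + 1, number of frames)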
Example No. 12
def main():
    # Enable colored output
    colorama.init()

    parser = argparse.ArgumentParser(
        description=
        'This tool can analyze audio files and estimate the frequency of A4.')
    parser.add_argument("filename")
    parser.add_argument('-s',
                        '--silent',
                        action="store_true",
                        dest='silent',
                        help='process the given audio file silently')
    parser.add_argument('-o',
                        '--offset',
                        dest='offset',
                        type=float,
                        default=0,
                        help='the offset of the audio to process, default 0')
    parser.add_argument(
        '-d',
        '--duration',
        dest='duration',
        type=float,
        help=
        'the duration of the audio to process; processes to the end if omitted'
    )
    args = parser.parse_args()

    if (args.silent):
        auto_process(args.filename, args.offset, args.duration)
    else:
        print(Fore.YELLOW + Back.RED + Style.BRIGHT +
              'Welcome to RainEggplant\'s concert pitch analyzer!\n' +
              Style.RESET_ALL)
        print(Fore.YELLOW + Style.BRIGHT +
              'Please follow the instructions to get the result:' +
              Style.RESET_ALL)
        print(
            Fore.YELLOW + Style.BRIGHT + '[1] ' + Style.RESET_ALL +
            'We are going to generate a spectrogram with pitch lines.\n' +
            'After the window pops up, please come back and follow the instructions.'
        )
        print(Fore.CYAN + Style.BRIGHT + "Press Enter to continue... " +
              Style.RESET_ALL,
              end='')
        input()
        print('This may take several seconds, please wait.\n')
        tunes = show_spectrogram(args.filename, args.offset, args.duration)

        print(
            Fore.YELLOW + Style.BRIGHT + '[2] ' + Style.RESET_ALL +
            'Now you have seen the spectrogram.\n' +
            '- The green lines mark the peak frequency at each location.\n' +
            '- The white vertical lines divide the spectrogram into several fragments, according to pitch and volume changes.\n'
            +
            '  They are labeled with an index. If the labels overlap, you can zoom in to separate them.\n'
            '- You can also use the tools in the toolbar to zoom, drag, save, etc.\n'
            +
            '- The time, frequency and note (relative to A4=440Hz) that you are pointing at will be shown in the status bar.\n'
        )

        print(
            Fore.YELLOW + Style.BRIGHT + '[3] ' + Style.RESET_ALL +
            'After you inspect the spectrogram, you need to decide whether the data is suitable for analysis.\n'
            +
            'If not, re-run the program with a different offset, duration or filename.'
        )
        print(Fore.CYAN + Style.BRIGHT + 'Process current data? (y/n) ' +
              Style.RESET_ALL,
              end='')
        cont = input()
        while cont.lower() not in ('y', 'n'):
            print(Fore.CYAN + Style.BRIGHT + 'Process current data? (y/n) ' +
                  Style.RESET_ALL,
                  end='')
            cont = input()

        if cont == 'n':
            return

        print(
            '\n' + Fore.YELLOW + Style.BRIGHT + '[4] ' + Style.RESET_ALL +
            'Now you need to select the range of the audio file to analyze. There are two modes:\n'
            +
            '\t1. Give start and end time, and let the program analyze automatically (similar to `silent mode`).\n'
            + '\t2. [Pro] Give notes and their sustain times.\n' +
            '\t   This mode will give you a more accurate and specific result.'
        )
        print(Fore.CYAN + Style.BRIGHT + 'Select mode: (1/2) ' +
              Style.RESET_ALL,
              end='')
        mode = input()
        while mode.lower() not in ('1', '2'):
            print(Fore.CYAN + 'Select mode: (1/2) ' + Style.RESET_ALL, end='')
            mode = input()

        if mode == '1':
            print(
                '\n' + Fore.YELLOW + Style.BRIGHT + '[5] ' + Style.RESET_ALL +
                'Now enter the start and end time according to the spectrogram:'
            )
            # TODO: Add validation.
            while True:
                start_time = float(input('start time: '))
                end_time = float(input('end time: '))
                print(Fore.YELLOW + Style.BRIGHT, end='')
                auto_process(args.filename, args.offset + start_time,
                             end_time - start_time)
                print()

                # Re-estimate using another range
                print(Style.RESET_ALL + Fore.CYAN + Style.BRIGHT +
                      'Re-estimate using another range? (y/n) ' +
                      Style.RESET_ALL,
                      end='')
                again = input()
                while again.lower() not in ('y', 'n'):
                    print(Fore.CYAN + Style.BRIGHT +
                          'Re-estimate using another range? (y/n) ' +
                          Style.RESET_ALL,
                          end='')
                    again = input()

                if again.lower() == 'n':
                    break

        else:
            print(
                '\n' + Fore.YELLOW + Style.BRIGHT + '[5] ' + Style.RESET_ALL +
                'Now add the note, its start time and end time according to the pitch lines.\n'
                +
                '  Format: NOTENAME STARTTIME ENDTIME    (e.g. "A4 1.2 2")\n' +
                'Enter `q` to stop adding.')

            while True:
                notes = []
                # TODO: Add validation.
                while True:
                    input_msg = input('+ ')
                    if input_msg.lower() == 'q':
                        break
                    notes.append(input_msg.split())


                # Normalize note names (like Bb to A#)
                for i in range(len(notes)):
                    # notes[i][0] = librosa.hz_to_note(
                    #     librosa.note_to_hz(notes[i][0]))
                    notes[i][1] = float(notes[i][1])
                    notes[i][2] = float(notes[i][2])

                note_names = [row[0] for row in notes]
                note_names = librosa.hz_to_note(librosa.note_to_hz(note_names))

                # Filter pitches
                tunes_match = {}
                for note_name in note_names:
                    tunes_match[note_name] = []

                for (time_seq, freq_seq) in tunes:
                    note_name = librosa.hz_to_note(np.mean(freq_seq))
                    if note_name in note_names:
                        tunes_match[note_name].append((time_seq, freq_seq))

                # Calculate A4 frequencies from the notes
                a4s = []
                for i in range(len(notes)):
                    n_frame = 0
                    freq_sum = 0
                    for (time_seq, freq_seq) in tunes_match[note_names[i]]:
                        for t in range(len(time_seq)):
                            if (time_seq[t] >= notes[i][1]
                                    and time_seq[t] <= notes[i][2]):
                                n_frame += 1
                                freq_sum += freq_seq[t]

                    if n_frame == 0:
                        print(Fore.RED + 'Warning: ' + Style.RESET_ALL +
                              'note `%s` not found, skipping...' % notes[i][0])
                        continue

                    freq_avg = freq_sum / n_frame
                    offset_to_a4 = librosa.pitch_tuning(freq_avg)
                    a4 = 440 * (2**(offset_to_a4 / 12))
                    a4s.append(a4)

                print(
                    Fore.YELLOW + Style.BRIGHT +
                    'The estimated frequencies of A4 from each note are:\n\t',
                    end='')
                print(['{:.1f}'.format(i) for i in a4s])
                print('Average estimated frequency: {:.1f} Hz, '.format(
                    np.mean(a4s)) +
                      'median frequency: {:.1f} Hz, '.format(np.median(a4s)) +
                      'standard deviation: {:.1f} Hz.\n'.format(np.std(a4s)) +
                      Style.RESET_ALL)

                # Re-estimate
                print(Style.RESET_ALL + Fore.CYAN + Style.BRIGHT +
                      'Re-estimate using different notes? (y/n) ' +
                      Style.RESET_ALL,
                      end='')
                again = input()
                while again.lower() not in ('y', 'n'):
                    print(Fore.CYAN + Style.BRIGHT +
                          'Re-estimate using different notes? (y/n) ' +
                          Style.RESET_ALL,
                          end='')
                    again = input()

                if again.lower() == 'n':
                    break

    print()
Example No. 13
def features(X, sample_rate: float) -> np.ndarray:
    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the minimum and maximum fundamental frequency of human speech
    pitches, magnitudes = librosa.piptrack(y=X,
                                           sr=sample_rate,
                                           S=stft,
                                           fmin=70,
                                           fmax=400)
    pitch = []
    for i in range(magnitudes.shape[1]):
        index = magnitudes[:, i].argmax()
        pitch.append(pitches[index, i])

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)
    pitchmin = np.min(pitch)

    # Spectral centroids
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC features with 50 coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                    axis=0)
    mfccsstd = np.std(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                      axis=0)
    mfccmax = np.max(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                     axis=0)

    # Chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,
                     axis=0)

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft,
                                                         sr=sample_rate).T,
                       axis=0)

    # zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # RMS energy
    rmse = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness,
        zerocr,
        meanMagnitude,
        maxMagnitude,
        meancent,
        stdcent,
        maxcent,
        stdMagnitude,
        pitchmean,
        pitchmax,
        pitchstd,
        pitch_tuning_offset,
        meanrms,
        maxrms,
        stdrms,
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features
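For scale, a sketch of how this extractor might be invoked (the file name is illustrative; the output length follows from librosa's defaults):

import librosa

X, sample_rate = librosa.load('speech.wav', sr=None)  # hypothetical input file
feature_vector = features(X, sample_rate)
print(feature_vector.shape)  # (312,): 15 scalars + 150 MFCC stats + 12 chroma + 128 mel + 7 contrast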
Example No. 14
def rasta_emotion_upload():
    wav_files = []
    entry = dict()
    SAMPLE_RATE = 44100
    b, _ = librosa.core.load('pickles/catalyst.wav', sr=SAMPLE_RATE)
    y, sr = librosa.load('pickles/catalyst.wav')
    entry['Mean_RMS'] = np.mean(librosa.feature.rms(y=y))
    entry['STD_RMS'] = np.std(librosa.feature.rms(y=y))
    assert _ == SAMPLE_RATE

    spf = wave.open('pickles/catalyst.wav')

    signal = spf.readframes(-1)
    input_sig = np.frombuffer(signal, dtype=np.int16)
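    # `plp` is presumably sidekit.frontend.features.plp (RASTA-PLP analysis);
    # the import is not shown in this excerpt.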

    matrix = plp(input_sig,
                 nwin=0.025,
                 fs=sr,
                 plp_order=13,
                 shift=0.01,
                 get_spec=False,
                 get_mspec=False,
                 prefac=0.97,
                 rasta=True)

    rasta_f_df = pd.DataFrame(matrix[0])
    mean_rastaplp = np.asarray((np.mean(rasta_f_df, axis=0)).tolist())
    std_rastaplp = np.asarray((np.std(rasta_f_df, axis=0)).tolist())
    delta_rastaplp = librosa.feature.delta(rasta_f_df)
    d_delta_rastaplp = librosa.feature.delta(rasta_f_df, order=2)

    mean_ddrastaplp = np.mean(d_delta_rastaplp, axis=0)
    std_ddrastaplp = np.std(d_delta_rastaplp, axis=0)
    mean_drastaplp = np.mean(delta_rastaplp, axis=0)
    std_drastaplp = np.std(delta_rastaplp, axis=0)

    for no in range(0, 13):
        entry['Mean_RASTAPLP{0}'.format(no)] = mean_rastaplp[no]
        entry['STD_RASTAPLP{0}'.format(no)] = std_rastaplp[no]
        entry['Mean_DDRastaPLP{0}'.format(no)] = mean_ddrastaplp[no]
        entry['STD_DDRastaPLP{0}'.format(no)] = std_ddrastaplp[no]
        entry['Mean_Delta_RastaPLP{0}'.format(no)] = mean_drastaplp[no]
        entry['STD_Delta_RastaPLP{0}'.format(no)] = std_drastaplp[no]
    y, sr = librosa.load('pickles/catalyst.wav')
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # Select out pitches with high energy
    pitches = pitches[magnitudes > np.median(magnitudes)]
    pit = librosa.pitch_tuning(pitches)
    entry['pitch'] = pit

    wav_files.append(entry)
    wav_df = pd.DataFrame(wav_files)
    rasta_clf = joblib.load('pickles/rastaplp_model.sav')

    bar = pd.DataFrame(rasta_clf.predict_proba(wav_df))
    bar.columns = rasta_clf.classes_
    bar_t = bar.T
    bar_t.columns = ['values']
    print('HERE')

    fig = go.Figure(data=[
        go.Pie(labels=rasta_clf.classes_, values=bar_t['values'], hole=.3),
    ])
    return rasta_clf.predict(wav_df), fig
Example No. 15
    def get_wav_df(self):
        wav_files = []
        for wav in os.listdir(self.wav_dir):
            if wav.endswith('.wav'):
                entry = dict()
                entry['Session'] = wav
                SAMPLE_RATE = 44100
                b, _ = librosa.core.load(self.wav_dir + '/' + wav,
                                         sr=SAMPLE_RATE)
                y, sr = librosa.load(self.wav_dir + '/' + wav)
                entry['Mean_RMS'] = np.mean(librosa.feature.rms(y=y))
                entry['STD_RMS'] = np.std(librosa.feature.rms(y=y))
                assert _ == SAMPLE_RATE

                spf = wave.open(self.wav_dir + '/' + wav, 'r')

                signal = spf.readframes(-1)
                input_sig = np.frombuffer(signal, dtype=np.int16)

                matrix = plp(input_sig,
                             nwin=0.025,
                             fs=sr,
                             plp_order=13,
                             shift=0.01,
                             get_spec=False,
                             get_mspec=False,
                             prefac=0.97,
                             rasta=True)

                rasta_f_df = pd.DataFrame(matrix[0])
                mean_rastaplp = np.asarray((np.mean(rasta_f_df,
                                                    axis=0)).tolist())
                std_rastaplp = np.asarray((np.std(rasta_f_df,
                                                  axis=0)).tolist())
                delta_rastaplp = librosa.feature.delta(rasta_f_df)
                d_delta_rastaplp = librosa.feature.delta(rasta_f_df, order=2)

                mean_ddrastaplp = np.mean(d_delta_rastaplp, axis=0)
                std_ddrastaplp = np.std(d_delta_rastaplp, axis=0)
                mean_drastaplp = np.mean(delta_rastaplp, axis=0)
                std_drastaplp = np.std(delta_rastaplp, axis=0)

                for no in range(0, 13):
                    entry['Mean_RASTAPLP{0}'.format(no)] = mean_rastaplp[no]
                    entry['STD_RASTAPLP{0}'.format(no)] = std_rastaplp[no]
                    entry['Mean_DDRastaPLP{0}'.format(
                        no)] = mean_ddrastaplp[no]
                    entry['STD_DDRastaPLP{0}'.format(no)] = std_ddrastaplp[no]
                    entry['Mean_Delta_RastaPLP{0}'.format(
                        no)] = mean_drastaplp[no]
                    entry['STD_Delta_RastaPLP{0}'.format(
                        no)] = std_drastaplp[no]
                pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
                # Select out pitches with high energy
                pitches = pitches[magnitudes > np.median(magnitudes)]
                pit = librosa.pitch_tuning(pitches)
                entry['pitch'] = pit

                wav_files.append(entry)
        wav_df = pd.DataFrame(wav_files)
        return wav_df
Example No. 16
def features(X, sample_rate):
    stft = np.abs(librosa.stft(X))

    pitches, magnitudes = librosa.piptrack(y=X,
                                           sr=sample_rate,
                                           S=stft,
                                           fmin=70,
                                           fmax=400)
    pitch = []
    for i in range(magnitudes.shape[1]):
        index = magnitudes[:, i].argmax()
        pitch.append(pitches[index, i])

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)
    pitchmin = np.min(pitch)

    # Spectral centroid
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                    axis=0)
    mfccsstd = np.std(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                      axis=0)
    mfccmax = np.max(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T,
                     axis=0)

    # Chroma
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,
                     axis=0)

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft,
                                                         sr=sample_rate).T,
                       axis=0)

    # Zero crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # Root-mean-square energy
    rmse = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features