Example #1
0
def calculate_acoustic_features(args, waveform):
    """Extract acoustic features from a 1-D waveform.

    Supported ``args.feature_type`` values: 'mfe' (log mel filterbank),
    'mfcc', and 'lyon' (Lyon passive ear model). Framing is derived from
    ``args.window`` / ``args.step`` in milliseconds and the module-level
    SAMPLE_RATE. When ``args.energy`` is set, a per-frame energy column is
    appended; when ``args.deltas`` is set, delta and delta-delta features
    are stacked, tripling the feature dimension.

    NOTE: the 'lyon' branch normalizes ``waveform`` in place (``/=``), so
    the caller's array is mutated.

    Returns:
        np.ndarray of shape (num_frames, feature_dim).

    Raises:
        ValueError: if ``args.feature_type`` is not one of the above.
    """
    n_fft = int(args.window*SAMPLE_RATE/1000.0)
    hop_length = int(args.step * SAMPLE_RATE / 1000.0)
    if 'mfe' == args.feature_type:
        if args.backend=='speechpy':
            log_cut = 1e-8  # floor added before log to avoid log(0)
            spec, energy = mfe(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                frame_stride=args.step*1e-3, num_filters=args.n_mels, fft_length=n_fft)
            if args.energy:
                acoustic_features = np.hstack((spec, energy[:, np.newaxis]))
            else:
                # BUG FIX: previously 'acoustic_features' was left undefined
                # on this path, so np.log(...) below raised NameError
                # whenever args.energy was falsy.
                acoustic_features = spec
            acoustic_features = np.log(acoustic_features + log_cut)
        else:
            spec = librosa.feature.melspectrogram(y=waveform, sr=SAMPLE_RATE, n_fft=n_fft,
                hop_length=hop_length, n_mels=args.n_mels)
            # librosa returns (n_mels, frames); transpose to (frames, n_mels).
            acoustic_features = librosa.core.amplitude_to_db(spec).transpose()
            if args.energy:
                # NOTE(review): librosa renamed feature.rmse -> feature.rms in
                # 0.7; 'rmse' only works on older librosa — confirm pinned version.
                energy = librosa.feature.rmse(y=waveform, frame_length=n_fft, hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'mfcc' == args.feature_type:
        if args.backend=='speechpy':
            acoustic_features = mfcc(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                frame_stride=args.step*1e-3, num_filters=args.n_mels, fft_length=n_fft,
                num_cepstral=args.n_mfcc)
        else:
            acoustic_features = librosa.feature.mfcc(y=waveform, sr=SAMPLE_RATE, n_mfcc=args.n_mfcc,
                n_fft=n_fft, hop_length=hop_length, n_mels=args.n_mels).transpose()
            if args.energy:
                energy = librosa.feature.rmse(y=waveform, frame_length=n_fft, hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'lyon' == args.feature_type:
        # Lyon model expects a normalized double-precision column vector.
        waveform /= np.abs(waveform).max()
        acoustic_features = lyon_calc.lyon_passive_ear(waveform[:, np.newaxis].astype(np.double),
                                                       SAMPLE_RATE, hop_length)
        max_val = acoustic_features.max()
        if max_val > 0:
            acoustic_features /= max_val
        acoustic_features = acoustic_features.astype(np.float32)
        if args.energy:
            energy = librosa.feature.rmse(y=waveform, frame_length=hop_length, hop_length=hop_length).transpose()
            # BUG FIX: guard against division by zero on silent input, matching
            # the max_val guard applied to acoustic_features above.
            energy_max = energy.max()
            if energy_max > 0:
                energy /= energy_max
            # Lyon and librosa framings can disagree in length: pad the energy
            # track (edge values) or truncate it to match.
            len_delta = acoustic_features.shape[0] - energy.shape[0]
            if len_delta > 0:
                energy = np.pad(energy, [(0, len_delta), (0, 0)], 'edge')
            else:
                energy = energy[:acoustic_features.shape[0], :]
            acoustic_features = np.hstack((acoustic_features, energy))
    else:
        raise ValueError('Unexpected features type.')
    if args.deltas:
        orig_shape = acoustic_features.shape
        if args.backend=='speechpy':
            # Returns (frames, feats, 3): static / delta / delta-delta.
            acoustic_features = extract_derivative_feature(acoustic_features)
        else:
            delta = librosa.feature.delta(acoustic_features, axis=0)
            ddelta = librosa.feature.delta(acoustic_features, order=2, axis=0)
            # (frames, feats, 1, 3); row-major reshape below flattens it the
            # same way as the speechpy (frames, feats, 3) cube.
            acoustic_features = np.stack((acoustic_features[:, :, np.newaxis],
                delta[:, :, np.newaxis], ddelta[:, :, np.newaxis]), axis=-1)
        acoustic_features = np.reshape(acoustic_features, (-1, orig_shape[-1] * 3))
    return acoustic_features
def compute_fbank(file, debug=True):
    """Compute log mel-filterbank features for a wav file.

    Pipeline: pre-emphasis -> (diagnostic power spectrum) -> log mel
    filterbank energies -> optional CMVN -> static/delta/delta-delta
    stacking via feature.extract_derivative_feature.

    Framing parameters come from the module-level data_config.

    Returns:
        The stacked feature array produced by extract_derivative_feature.
    """
    sr, signal = wav.read(file)
    if debug:
        print('signal shape: ', signal.shape)

    # Pre-emphasis filter.
    emphasized = processing.preemphasis(signal, cof=data_config.preemphasis)

    # Frame the pre-emphasized signal and derive its power spectrum.
    # The spectrum itself is only used for the debug printout.
    stacked = processing.stack_frames(emphasized,
                                      sampling_frequency=sr,
                                      frame_length=data_config.window_size,
                                      frame_stride=data_config.hop_size,
                                      zero_padding=True)
    pspec = processing.power_spectrum(stacked, fft_points=512)  # num_frames x fft_length
    if debug:
        print('power spectrum shape=', pspec.shape)

    # Log mel-filterbank energies: num_frames x num_filters.
    fbank = feature.lmfe(emphasized,
                         sampling_frequency=sr,
                         frame_length=data_config.window_size,
                         frame_stride=data_config.hop_size,
                         num_filters=data_config.num_mels,
                         fft_length=512,
                         low_frequency=0,
                         high_frequency=None)

    if data_config.apply_cmvn:
        # Cepstral mean + variance normalization.
        normalized = processing.cmvn(fbank, variance_normalization=True)
        if debug:
            print('fbank(mean + variance normalized) feature shape=',
                  normalized.shape)
        fbank = normalized  # num_frames x num_filters

    # Stack static + delta + delta-delta features.
    fbank = feature.extract_derivative_feature(fbank)

    if debug:
        print('concat & subsample shape=', fbank.shape)

    return fbank
Example #3
0
def generate_mfec_features(audio_file_name):
    """Compute MFE-based features for the first 15 frames of a wav file.

    Reads the audio, extracts 40-filter log-mel energies (20 ms frames,
    20 ms stride, 512-point FFT), takes the first 15 frames, stacks
    derivative features, and reshapes each per-filter triple into a
    (1, 3) array.

    Returns:
        Nested list: frames -> filters -> np.ndarray of shape (1, 3)
        holding (static, delta, delta-delta).
    """
    fs, signal = wav.read(audio_file_name)
    # speechpy.feature.mfe returns (features, frame_energies); the energies
    # are unused here. Unpacking also avoids shadowing any outer 'mfe' name.
    features, _ = speechpy.feature.mfe(signal,
                                       sampling_frequency=fs,
                                       frame_length=0.02,
                                       frame_stride=0.02,
                                       num_filters=40,
                                       fft_length=512,
                                       low_frequency=0,
                                       high_frequency=None)
    derivative_cube = extract_derivative_feature(features[:15])
    # Each row of a frame holds 3 values (static/delta/ddelta); the manual
    # append loops are replaced by an equivalent nested comprehension.
    return [[np.array(row).reshape(1, 3) for row in frame]
            for frame in derivative_cube]
Example #4
0
############# Extract MFCC features #############
# 40-filter MFCCs over 20 ms windows with a 10 ms stride (512-point FFT).
# 'signal' and 'fs' are expected to be defined earlier in the script.
mfcc = feature.mfcc(signal,
                    sampling_frequency=fs,
                    frame_length=0.020,
                    frame_stride=0.01,
                    num_filters=40,
                    fft_length=512,
                    low_frequency=0,
                    high_frequency=None)

# Cepstral mean variance normalization.
mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)

# Extracting derivative features
# NOTE(review): derivatives are computed from the raw (non-CMVN) mfcc,
# not mfcc_cmvn — confirm that skipping the normalized features is intended.
mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract logenergy features #############
# Log mel-filterbank energies with the same framing parameters as above.
logenergy = feature.lmfe(signal,
                         sampling_frequency=fs,
                         frame_length=0.020,
                         frame_stride=0.01,
                         num_filters=40,
                         fft_length=512,
                         low_frequency=0,
                         high_frequency=None)
logenergy_feature_cube = feature.extract_derivative_feature(logenergy)
print('logenergy features=', logenergy.shape)
Example #5
0
            # NOTE(review): the enclosing loop header is outside this view;
            # 'count', 'start_time', 'subdir', 'files', 'filename', and
            # 'filename2' are defined/updated by surrounding code.
            if count != 0:
                elapsed_time = time.time() - start_time
            # Identifier extracted from a fixed slice of the directory path —
            # presumably a speaker/utterance id; confirm offsets 29:38 match
            # the dataset's directory layout.
            curr_id = subdir[29:38]
            # print(curr_id)
            count = count + 1
            start_time = time.time()

    for file in files:

        # Force mono and round-trip through a temporary wav so that
        # wavfile.read can consume it.
        sound = AudioSegment.from_wav(subdir + "/" + file)
        sound = sound.set_channels(1)
        sound.export("modified.wav", format="wav")
        sample_rate, samples = wavfile.read("modified.wav")

        # 25 ms window / 10 ms stride, 40 filters (per lmfe's positional
        # arguments), then static+delta+delta-delta stacking.
        features = lmfe(samples, sample_rate, 0.025, 0.01, 40)
        features = extract_derivative_feature(features)

        # Split the feature cube into fixed-length 100-frame chunks;
        # any trailing remainder shorter than 100 frames is dropped.
        timevar = 100
        if features.shape[0] >= timevar:
            no_cuts = int(features.shape[0] / timevar)
            for i in range(no_cuts):
                cut = features[i * timevar:(i * timevar) + timevar:, :, :]
                # print("cut: ", cut.shape)
                # One id line per chunk in filename2; the chunk's frames are
                # appended to filename as CSV rows.
                with open(filename2, "a") as myfile:
                    myfile.write(curr_id + "\n")
                with open(filename, "a") as myfile:
                    for data_slice in cut:
                        np.savetxt(myfile,
                                   data_slice,
                                   delimiter=',',
                                   newline="\n")