Exemplo n.º 1
0
def get_ffts(audio_file):
    """Compute the FFT of each frame of a WAVE file.

    The sample data is cut into consecutive, non-overlapping frames of
    ``COMP_FRAME_SIZE * sample`` samples each, where ``sample`` is the first
    value returned by ``utils.read_wave_from_file`` (presumably the sample
    rate -- confirm against that helper). A full complex FFT is computed for
    every frame. Trailing samples that do not fill a whole frame are ignored.

    Args:
        audio_file: The WAVE file, passed unchanged to
            ``utils.read_wave_from_file``.

    Returns:
        A ``numpy.complex128`` array of shape ``(total_frames, frame_len)``
        whose row ``i`` is the FFT of frame ``i``.

    Raises:
        ValueError: If the configured frame length is shorter than one
            sample.
    """
    # Read the file and work out the frame geometry.
    sample, data = utils.read_wave_from_file(audio_file)

    # The frame length is used as an array dimension and as a slice bound, so
    # it must be a real integer; COMP_FRAME_SIZE may well be a float.
    frame_len = int(round(sample * COMP_FRAME_SIZE))
    if frame_len < 1:
        raise ValueError(
            "frame length must be at least one sample, got %r" % (frame_len,)
        )

    # Only complete frames are transformed: a short tail frame would not fit
    # into a fixed-width row of the output array.
    total_frames = data.size // frame_len

    # Allocate space for the FFT decompositions of each frame of sound data.
    fft_out = numpy.empty((total_frames, frame_len), dtype=numpy.complex128)

    for frame_index in range(total_frames):
        start = frame_index * frame_len
        fft_out[frame_index] = numpy.fft.fft(data[start:start + frame_len])

    return fft_out
Exemplo n.º 2
0
def get_mfcc(path):
    """Compute the MFCCs of the frames of a WAVE file.

    The sample data is cut into frames of ``COMP_FRAME_SIZE * sample``
    samples, where ``sample`` is the first value returned by
    ``utils.read_wave_from_file`` (presumably the sample rate -- confirm
    against that helper). Successive frames start
    ``FRAME_OVERLAP_FACTOR`` frame lengths apart. Each frame is Hamming
    windowed, transformed with a 256-point real FFT, passed through the mel
    triangular filterbank, log-scaled and finally decorrelated with a DCT.

    Args:
        path: The path to a WAVE file, passed unchanged to
            ``utils.read_wave_from_file``.

    Returns:
        A numpy array with one entry per frame. Each entry holds the leading
        ``SIGNIFICANT_MFCC`` fraction of that frame's cepstral coefficients.
        The array is empty when the file is shorter than one frame.

    Raises:
        ValueError: If the configured frame length is shorter than one
            sample, or if ``FRAME_OVERLAP_FACTOR`` is not positive (the
            frame position would never advance).
    """
    # Read the file and work out the frame geometry.
    sample, data = utils.read_wave_from_file(path)

    # The frame length is used as a window size and as a slice bound, so it
    # must be a real integer; COMP_FRAME_SIZE may well be a float.
    step = int(round(COMP_FRAME_SIZE * sample))
    if step < 1:
        raise ValueError(
            "frame length must be at least one sample, got %r" % (step,)
        )
    if FRAME_OVERLAP_FACTOR <= 0:
        raise ValueError(
            "FRAME_OVERLAP_FACTOR must be positive, got %r"
            % (FRAME_OVERLAP_FACTOR,)
        )

    window = hamming(step)
    # Distance in samples between the starts of two successive frames.
    hop = FRAME_OVERLAP_FACTOR * step

    # The FFT length is fixed below, so the filterbank is the same for every
    # frame. It is built once, on the first frame.
    filterbank = None
    mfcc_out = []

    frame_number = 0
    while True:
        # Slice bounds must be integers; the hop may be fractional.
        start = int(frame_number * hop)
        frame = data[start:start + step]
        if len(frame) < step:
            # Ran off the end of the data: an incomplete frame cannot be
            # multiplied by the full-length window.
            break

        # FFT of the frame windowed by the Hamming window. n=256 makes numpy
        # crop or zero-pad the windowed frame to 256 samples first.
        frame_fft = numpy.fft.rfft(frame * window, n=256)
        # Replace exact zeros so the logarithm below never sees a zero.
        frame_fft[frame_fft == 0] = 0.000003

        if filterbank is None:
            # Compute the mel triangular filterbank.
            filterbank = triangular_filters(sample, len(frame_fft)).T
            filterbank[filterbank == 0] = 0.00003

        # Magnitude spectrum of the frame ...
        magnitude_spectrum = numpy.abs(frame_fft)
        # ... filtered by the mel filterbank and log-scaled ...
        mel_spectrum = numpy.log10(numpy.dot(magnitude_spectrum, filterbank))
        # ... then the discrete cosine transform yields the cepstrum.
        cepstrum = dct(mel_spectrum, type=2, norm="ortho", axis=-1)

        # Keep only the leading fraction of the coefficients.
        mfcc_out.append(cepstrum[: int(len(cepstrum) * SIGNIFICANT_MFCC)])
        frame_number += 1

    return numpy.array(mfcc_out)