Пример #1
def praat_raw_formants(wav_fn, praat_path, frame_shift=1, window_size=25, num_formants=4, max_formant_freq=6000):
    """Return raw estimated formants and corresponding time points using Praat

    Args:
        See praat_formants() documentation.
        praat_raw_formants() doesn't have the data_len and frame_precision
        arguments.

    Returns:
        estimates_raw - raw formant and bandwidth estimates plus corresponding
                        time points [dictionary of NumPy vectors]

    Raises:
        OSError - if Praat exits with a non-zero return code, or if the
                  expected .pfmt output file cannot be found

    The estimates_raw dictionary uses keys like
    'pF1', 'pF2', 'pF3', 'pF4', 'pB1', 'pB2', 'pB3', 'pB4'
    ('pF1' is the first formant, 'pB2' is the second bandwidth vector, etc.)
    and each entry is a NumPy vector with one value per row of the Praat
    output.  The number of keys corresponds with the number of formants
    specified.  There is always a key 'ptFormants' which corresponds to the
    vector of time points matching the estimated formant and bandwidth
    vectors.
    """
    # Setup command to call Praat formant script
    praat_cmd = [praat_path, '--run']
    praat_cmd.append(os.path.join(praat_script_dir, 'praatformants.praat'))
    praat_cmd.extend([os.path.abspath(wav_fn), os.path.splitext(wav_fn)[1]])
    # Praat receives times in seconds; our arguments are in milliseconds
    praat_cmd.extend([str(frame_shift / 1000), str(window_size / 1000)])
    praat_cmd.extend([str(num_formants), str(max_formant_freq)])

    # Run Praat formant script
    return_code = call(praat_cmd)

    if return_code != 0: # pragma: no cover
        raise OSError('Praat error')

    # Path for Praat output file corresponding to wav_fn.  Only the final
    # extension is stripped (the same extension handed to the script above),
    # so directory names or file stems containing '.' are left intact.
    fmt_fn = os.path.splitext(wav_fn)[0] + '.pfmt'

    if not os.path.isfile(fmt_fn): # pragma: no cover
        raise OSError('Praat error -- unable to locate .pfmt file')

    # Praat allows half integer values for num_formants
    # So we round up to get total number of formant columns
    n_formants = round_half_away_from_zero(num_formants)
    num_cols = 2 + n_formants * 2
    # Define dictionary that uses undef for all columns
    undef_dict = {i: undef for i in range(num_cols)}

    # Load results from Praat file
    try:
        # ndmin=2 keeps a single-row result two-dimensional so the column
        # slicing below still works
        data_raw = np.loadtxt(fmt_fn, dtype=float, skiprows=1,
                              converters=undef_dict, ndmin=2)
    finally:
        # Cleanup and remove Praat file, even if parsing failed
        os.remove(fmt_fn)

    # Put results into dictionary: column 0 holds the time points, and
    # formant i / bandwidth i are read from columns 2*i and 2*i+1
    # (column 1 is not used here)
    estimates_raw = {}
    estimates_raw['ptFormants'] = data_raw[:, 0]
    for i in range(1, n_formants + 1):
        estimates_raw['pF' + str(i)] = data_raw[:, 2*i]
        estimates_raw['pB' + str(i)] = data_raw[:, 2*i+1]

    return estimates_raw
Пример #2
def praat_pitch(wav_fn, data_len, praat_path, frame_shift=1, method='cc',
                frame_precision=1, min_pitch=40, max_pitch=500,
                silence_threshold=0.03, voice_threshold=0.45, octave_cost=0.01,
                octave_jumpcost=0.35, voiced_unvoiced_cost=0.14,
                kill_octave_jumps=False, interpolate=False, smooth=False,
                smooth_bandwidth=5):
    """Estimate F0 using Praat

    Args:
                      wav_fn - WAV file to be processed [string]
                    data_len - Length of measurement vector [integer]
                  praat_path - Path to Praat executable [string]
                 frame_shift - Length of each frame in ms [integer]
                               (default = 1)
                      method - Method for calculating Praat pitch, either
                               'ac' (autocorrelation) or 'cc'
                               (cross-correlation) [string]
                               (default = 'cc')
             frame_precision - Accuracy of F0 values in multiples of frame
                               length [integer]
                               (default = 1)
                   min_pitch - Minimum F0 considered in Hz [integer]
                               (default = 40)
                   max_pitch - Maximum F0 considered in Hz [integer]
                               (default = 500)
           silence_threshold - Relative amplitude below which a frame is
                               considered to be silent [float]
                               (default = 0.03)
             voice_threshold - Strength of the unvoiced candidate, relative to
                               the maximum possible autocorrelation [float]
                               (default = 0.45)
                 octave_cost - Degree of favouring of high-frequency
                               candidates, relative to the maximum possible
                               autocorrelation [float]
                               (default = 0.01)
             octave_jumpcost - Degree of disfavouring of pitch changes,
                               relative to the maximum possible autocorrelation
                               [float]
                               (default = 0.35)
        voiced_unvoiced_cost - Degree of disfavouring of voiced/unvoiced
                               transitions, relative to the maximum possible
                               autocorrelation [float]
                               (default = 0.14)
           kill_octave_jumps - Whether to try removing pitch halving and
                               doubling [Boolean]
                               (default = False)
                 interpolate - Whether to interpolate missing pitch values in
                               post-processing [Boolean]
                               (default = False)
                      smooth - Whether to smooth pitch in post-processing,
                               using bandwidth specified by smooth_bandwidth
                               [Boolean]
                               (default = False)
            smooth_bandwidth - Bandwidth in Hz to use for smoothing if
                               smooth is set to True [integer]
                               (default = 5)

    Returns:
        F0 - F0 estimates (NumPy vector)

    Raw Praat estimates are at time points that don't completely match the time
    points in our measurement vectors, so we need to interpolate.  We use a
    crude interpolation method, that has precision set by frame_precision.

    For more information, see the Praat manual pages
    "Sound: To Pitch (ac)..." and "Sound: To Pitch (cc)...".
    """
    # Compute raw Praat F0 estimates
    t_raw, F0_raw = praat_raw_pitch(wav_fn, praat_path, frame_shift, method,
                                    min_pitch, max_pitch, silence_threshold,
                                    voice_threshold, octave_cost,
                                    octave_jumpcost, voiced_unvoiced_cost,
                                    kill_octave_jumps, interpolate, smooth,
                                    smooth_bandwidth)

    # Initialize F0 measurement vector with NaN
    F0 = np.full(data_len, np.nan)
    # Convert time from seconds to nearest whole millisecond
    t_raw_ms = np.int_(round_half_away_from_zero(t_raw * 1000))

    # Praat produced no estimates at all: nothing to align, leave F0 as NaN
    if np.size(t_raw_ms) == 0:
        return F0

    # Raw Praat estimates are at time points that don't completely match
    # the time points in our measurement vectors, so we need to interpolate.
    # We use a crude interpolation method, that has precision set by
    # frame_precision.

    # Determine start and stop times.  range() excludes its end point, so
    # when the last raw time lies exactly on the frame grid the stop is
    # pushed one frame further to keep that frame in the iteration.
    start = 0
    if t_raw_ms[-1] % frame_shift == 0:
        stop = t_raw_ms[-1] + frame_shift
    else:
        stop = t_raw_ms[-1]

    # Largest time mismatch (in ms) tolerated between a frame and its
    # nearest Praat estimate
    max_offset = frame_precision * frame_shift

    # Iterate through timepoints corresponding to each frame in time range
    for idx_f, t_f in enumerate(range(start, stop, frame_shift)):
        # Frames past the end of the measurement vector are never stored,
        # so there is no point in searching for them
        if idx_f >= data_len:
            break

        # Find closest time point among calculated Praat values
        min_idx = np.argmin(np.abs(t_raw_ms - t_f))

        # If closest time point is too far away, skip
        if np.abs(t_raw_ms[min_idx] - t_f) > max_offset:
            continue

        F0[idx_f] = F0_raw[min_idx]

    return F0
Пример #3
 def test_round_half_away_from_zero(self):
     """Check round_half_away_from_zero on positive and negative inputs."""
     # (input, expected result) pairs, asserted in order
     expectations = (
         (3.5, 4),
         (3.2, 3),
         (-2.7, -3),
         (-4.3, -4),
     )
     for value, expected in expectations:
         self.assertEqual(round_half_away_from_zero(value), expected)
Пример #4
def praat_formants(wav_fn, data_len, praat_path, frame_shift=1, window_size=25,
                   frame_precision=1, num_formants=4, max_formant_freq=6000):
    """Estimate formants and bandwidths using Praat

    Args:
                      wav_fn - WAV file to be processed [string]
                    data_len - Length of measurement vector [integer]
                  praat_path - Path to Praat executable [string]
                 frame_shift - Length of each frame in ms [integer]
                               (default = 1)
                window_size  - Length of analysis window in ms [integer]
                               (default = 25)
             frame_precision - Largest allowed distance, in multiples of
                               frame length, between a frame and the Praat
                               estimate assigned to it [integer]
                               (default = 1)
                num_formants - Number of formants to extract,
                               usually an integer but half-integer values are
                               allowed [float]
                               (default = 4)
            max_formant_freq - Maximum allowed frequency for formant search
                               range in Hz [integer]
                               (default = 6000)

    Returns:
        estimates - Formant and bandwidth vectors [dictionary of NumPy vectors]

    The estimates dictionary uses keys like
    'pF1', 'pF2', 'pF3', 'pF4', 'pB1', 'pB2', 'pB3', 'pB4'
    ('pF1' is the first formant, 'pB2' is the second bandwidth vector, etc.)
    and each entry is a NumPy vector of length data_len.  The number of keys
    corresponds with the number of formants specified.

    Raw Praat estimates are at time points that don't completely match the time
    points in our measurement vectors, so we need to interpolate.  We use a
    crude interpolation method, that has precision set by frame_precision.

    For more information, see the Praat manual pages on formant analysis.
    """
    # Compute raw Praat formant estimates
    estimates_raw = praat_raw_formants(wav_fn, praat_path, frame_shift,
                                       window_size, num_formants,
                                       max_formant_freq)

    # Every key except the time vector is a measurement to be aligned
    measure_keys = [k for k in estimates_raw if k != 'ptFormants']

    # Initialize measurement vectors with NaN
    estimates = {k: np.full(data_len, np.nan) for k in measure_keys}

    # Raw Praat estimates are at time points that don't completely match
    # the time points in our measurement vectors, so we need to interpolate.
    # We use a crude interpolation method, that has precision set by
    # frame_precision.

    # Convert time from seconds to nearest whole millisecond
    t_raw_ms = np.int_(round_half_away_from_zero(estimates_raw['ptFormants'] * 1000))

    # Praat produced no estimates at all: nothing to align, leave NaN vectors
    if np.size(t_raw_ms) == 0:
        return estimates

    # Determine start and stop times.  range() excludes its end point, so
    # when the last raw time lies exactly on the frame grid the stop is
    # pushed one frame further to keep that frame in the iteration.
    start = 0
    if t_raw_ms[-1] % frame_shift == 0:
        stop = t_raw_ms[-1] + frame_shift
    else:
        stop = t_raw_ms[-1]

    # Largest time mismatch (in ms) tolerated between a frame and its
    # nearest Praat estimate
    max_offset = frame_precision * frame_shift

    # Iterate through timepoints corresponding to each frame in time range
    for idx_f, t_f in enumerate(range(start, stop, frame_shift)):
        # Frames past the end of the measurement vectors are never stored,
        # so there is no point in searching for them
        if idx_f >= data_len:
            break

        # Find closest time point among calculated Praat values
        min_idx = np.argmin(np.abs(t_raw_ms - t_f))

        # If closest time point is too far away, skip
        if np.abs(t_raw_ms[min_idx] - t_f) > max_offset:
            continue

        for k in measure_keys:
            estimates[k][idx_f] = estimates_raw[k][min_idx]

    return estimates
Пример #5
 def test_round_half_away_from_zero(self):
     """Check round_half_away_from_zero on positive and negative inputs."""
     # (input, expected result) pairs, asserted in order
     expectations = (
         (3.5, 4),
         (3.2, 3),
         (-2.7, -3),
         (-4.3, -4),
     )
     for value, expected in expectations:
         self.assertEqual(round_half_away_from_zero(value), expected)
Пример #6
# NOTE(review): the original signature was truncated after `wav_data,`.  The
# parameter names and order below follow the docstring and the names used in
# the body; the default values are assumptions -- confirm against callers.
def shr_pitch(wav_data, fps, window_length=None, frame_shift=1,
              min_pitch=None, max_pitch=None, shr_threshold=None,
              frame_precision=1, datalen=None):
    """Return Subharmonic ratios and F0 values computed from wav_data.

    wav_data        a vector of data read from a wav file
    fps             frames rate of the wav file
    window_length   width of analysis window
    frame_shift     distance to move window for each analysis iteration
    min_pitch       minimum pitch in Hz used in SHR estimation
    max_pitch       maximum pitch in Hz used in SHR estimation
    shr_threshold   subharmonic-to-harmonic ratio threshold in the range of
                        [0,1].  If the estimated SHR is greater than the
                        threshold, the subharmonic is regarded as F0 candidate.
                        Otherwise, the harmonic is favored.
    frame_precision maximum number of frames the time alignment can be off
                        by when selecting values for output
    datalen         the number of values in the output vector; leftover
                        input data is dropped, and the vector is padded
                        with NaNs when no input data corresponds to
                        the output frame time.

    Returns the tuple (SHR, F0) of NumPy vectors, each of length datalen + 1.

    Raises ValueError if exactly one of min_pitch and max_pitch is given,
    and TypeError if datalen is not given.
    """
    if datalen is None:
        raise TypeError('datalen must be specified')

    # XXX the octave code produces 201 output points given a datalen
    # of 200.  Presumably a bug in the matlab code.  But we'll emulate it.
    datalen += 1

    # Only forward the options that were actually supplied, so that shrp
    # falls back on its own defaults for the rest
    kw = {}
    if bool(min_pitch) != bool(max_pitch):
        raise ValueError(
            'none or both of min_pitch, max_pitch must be specified')
    if min_pitch:
        kw['F0MinMax'] = (min_pitch, max_pitch)
    if window_length is not None:
        kw['frame_length'] = window_length
    if frame_shift is not None:
        kw['timestep'] = frame_shift
    if shr_threshold is not None:
        kw['SHR_Threshold'] = shr_threshold
    f0_time, f0_value, shr_value, f0_candidates = shrp(wav_data, fps, **kw)

    # "Postprocess subharmonic-harmonic ratios and f0 tracks"

    # "Initialize F0 and subharmonic-harmonic ratio values"
    F0 = np.full(datalen, np.nan)
    SHR = np.full(datalen, np.nan)

    # "time locations rounded to nearest ms"
    # VoiceSauce uses Matlab, and Matlab's round function uses the
    # round-half-away-from-zero method.  However, NumPy uses the
    # round-half-to-even method.  So we use our own round-half-away-from-zero
    # method here.
    t = round_half_away_from_zero(f0_time)

    # "Like timecoures from Praat, we might have missing values so pad with NaNs at
    # beginning and end if necessary."
    # RDM XXX note that it looks to me like after the leading NaN padding this
    # actually ends up padding with frame_precision copies of the first frame
    # that comes within the precision window, and then offsets all of the
    # others by frame_precision*frame_shift.  This algorithm could use a lot of
    # improvement I think.  But for now, we are emulating the voicesauce code.
    start = 0
    finish = t[-1]
    increment = frame_shift
    for k in np.arange(start, finish, increment):
        # "try to find the closest value"
        dabs = np.abs(t - k)
        inx = dabs.argmin()
        if dabs[inx] > frame_precision * frame_shift:
            # "no valid value found"
            continue
        # Output slot for this frame time (shifted by one, see XXX above)
        n = int(round(k / frame_shift)) + 1
        if n < 0 or n >= datalen:
            # Frame falls outside the output vector; drop it
            continue
        F0[n] = f0_value[inx]
        SHR[n] = shr_value[inx]
        # "I eventually would like to get candidates as well"
    return SHR, F0
Пример #7
def praat_pitch(wav_fn,
                data_len,
                praat_path,
                frame_shift=1,
                method='cc',
                frame_precision=1,
                min_pitch=40,
                max_pitch=500,
                silence_threshold=0.03,
                voice_threshold=0.45,
                octave_cost=0.01,
                octave_jumpcost=0.35,
                voiced_unvoiced_cost=0.14,
                kill_octave_jumps=False,
                interpolate=False,
                smooth=False,
                smooth_bandwidth=5):
    """Estimate F0 using Praat

    Args:
                      wav_fn - WAV file to be processed [string]
                    data_len - Length of measurement vector [integer]
                  praat_path - Path to Praat executable [string]
                 frame_shift - Length of each frame in ms [integer]
                               (default = 1)
                      method - Method for calculating Praat pitch, either
                               'ac' (autocorrelation) or 'cc'
                               (cross-correlation) [string]
                               (default = 'cc')
             frame_precision - Accuracy of F0 values in multiples of frame
                               length [integer]
                               (default = 1)
                   min_pitch - Minimum F0 considered in Hz [integer]
                               (default = 40)
                   max_pitch - Maximum F0 considered in Hz [integer]
                               (default = 500)
           silence_threshold - Relative amplitude below which a frame is
                               considered to be silent [float]
                               (default = 0.03)
             voice_threshold - Strength of the unvoiced candidate, relative to
                               the maximum possible autocorrelation [float]
                               (default = 0.45)
                 octave_cost - Degree of favouring of high-frequency
                               candidates, relative to the maximum possible
                               autocorrelation [float]
                               (default = 0.01)
             octave_jumpcost - Degree of disfavouring of pitch changes,
                               relative to the maximum possible autocorrelation
                               [float]
                               (default = 0.35)
        voiced_unvoiced_cost - Degree of disfavouring of voiced/unvoiced
                               transitions, relative to the maximum possible
                               autocorrelation [float]
                               (default = 0.14)
           kill_octave_jumps - Whether to try removing pitch halving and
                               doubling [Boolean]
                               (default = False)
                 interpolate - Whether to interpolate missing pitch values in
                               post-processing [Boolean]
                               (default = False)
                      smooth - Whether to smooth pitch in post-processing,
                               using bandwidth specified by smooth_bandwidth
                               [Boolean]
                               (default = False)
            smooth_bandwidth - Bandwidth in Hz to use for smoothing if
                               smooth is set to True [integer]
                               (default = 5)

    Returns:
        F0 - F0 estimates (NumPy vector)

    Raw Praat estimates are at time points that don't completely match the time
    points in our measurement vectors, so we need to interpolate.  We use a
    crude interpolation method, that has precision set by frame_precision.

    For more information, see the Praat manual pages
    "Sound: To Pitch (ac)..." and "Sound: To Pitch (cc)...".
    """
    # Compute raw Praat F0 estimates
    t_raw, F0_raw = praat_raw_pitch(wav_fn, praat_path, frame_shift, method,
                                    min_pitch, max_pitch, silence_threshold,
                                    voice_threshold, octave_cost,
                                    octave_jumpcost, voiced_unvoiced_cost,
                                    kill_octave_jumps, interpolate, smooth,
                                    smooth_bandwidth)

    # Initialize F0 measurement vector with NaN
    F0 = np.full(data_len, np.nan)
    # Convert time from seconds to nearest whole millisecond
    t_raw_ms = np.int_(round_half_away_from_zero(t_raw * 1000))

    # Praat produced no estimates at all: nothing to align, leave F0 as NaN
    if np.size(t_raw_ms) == 0:
        return F0

    # Raw Praat estimates are at time points that don't completely match
    # the time points in our measurement vectors, so we need to interpolate.
    # We use a crude interpolation method, that has precision set by
    # frame_precision.

    # Determine start and stop times.  range() excludes its end point, so
    # when the last raw time lies exactly on the frame grid the stop is
    # pushed one frame further to keep that frame in the iteration.
    start = 0
    if t_raw_ms[-1] % frame_shift == 0:
        stop = t_raw_ms[-1] + frame_shift
    else:
        stop = t_raw_ms[-1]

    # Largest time mismatch (in ms) tolerated between a frame and its
    # nearest Praat estimate
    max_offset = frame_precision * frame_shift

    # Iterate through timepoints corresponding to each frame in time range
    for idx_f, t_f in enumerate(range(start, stop, frame_shift)):
        # Frames past the end of the measurement vector are never stored,
        # so there is no point in searching for them
        if idx_f >= data_len:
            break

        # Find closest time point among calculated Praat values
        min_idx = np.argmin(np.abs(t_raw_ms - t_f))

        # If closest time point is too far away, skip
        if np.abs(t_raw_ms[min_idx] - t_f) > max_offset:
            continue

        F0[idx_f] = F0_raw[min_idx]

    return F0
Пример #8
def praat_raw_formants(wav_fn,
                       praat_path,
                       frame_shift=1,
                       window_size=25,
                       num_formants=4,
                       max_formant_freq=6000):
    """Return raw estimated formants and corresponding time points using Praat

    Args:
        See praat_formants() documentation.
        praat_raw_formants() doesn't have the data_len and frame_precision
        arguments.

    Returns:
        estimates_raw - raw formant and bandwidth estimates plus corresponding
                        time points [dictionary of NumPy vectors]

    Raises:
        OSError - if Praat exits with a non-zero return code, or if the
                  expected .pfmt output file cannot be found

    The estimates_raw dictionary uses keys like
    'pF1', 'pF2', 'pF3', 'pF4', 'pB1', 'pB2', 'pB3', 'pB4'
    ('pF1' is the first formant, 'pB2' is the second bandwidth vector, etc.)
    and each entry is a NumPy vector with one value per row of the Praat
    output.  The number of keys corresponds with the number of formants
    specified.  There is always a key 'ptFormants' which corresponds to the
    vector of time points matching the estimated formant and bandwidth
    vectors.
    """
    # Setup command to call Praat formant script
    praat_cmd = [praat_path, '--run']
    praat_cmd.append(os.path.join(praat_script_dir, 'praatformants.praat'))
    praat_cmd.extend([os.path.abspath(wav_fn), os.path.splitext(wav_fn)[1]])
    # Praat receives times in seconds; our arguments are in milliseconds
    praat_cmd.extend([str(frame_shift / 1000), str(window_size / 1000)])
    praat_cmd.extend([str(num_formants), str(max_formant_freq)])

    # Run Praat formant script
    return_code = call(praat_cmd)

    if return_code != 0:  # pragma: no cover
        raise OSError('Praat error')

    # Path for Praat output file corresponding to wav_fn.  Only the final
    # extension is stripped (the same extension handed to the script above),
    # so directory names or file stems containing '.' are left intact.
    fmt_fn = os.path.splitext(wav_fn)[0] + '.pfmt'

    if not os.path.isfile(fmt_fn):  # pragma: no cover
        raise OSError('Praat error -- unable to locate .pfmt file')

    # Praat allows half integer values for num_formants
    # So we round up to get total number of formant columns
    n_formants = round_half_away_from_zero(num_formants)
    num_cols = 2 + n_formants * 2
    # Define dictionary that uses undef for all columns
    undef_dict = {i: undef for i in range(num_cols)}

    # Load results from Praat file
    try:
        # ndmin=2 keeps a single-row result two-dimensional so the column
        # slicing below still works
        data_raw = np.loadtxt(fmt_fn,
                              dtype=float,
                              skiprows=1,
                              converters=undef_dict,
                              ndmin=2)
    finally:
        # Cleanup and remove Praat file, even if parsing failed
        os.remove(fmt_fn)

    # Put results into dictionary: column 0 holds the time points, and
    # formant i / bandwidth i are read from columns 2*i and 2*i+1
    # (column 1 is not used here)
    estimates_raw = {}
    estimates_raw['ptFormants'] = data_raw[:, 0]
    for i in range(1, n_formants + 1):
        estimates_raw['pF' + str(i)] = data_raw[:, 2 * i]
        estimates_raw['pB' + str(i)] = data_raw[:, 2 * i + 1]

    return estimates_raw
Пример #9
def praat_formants(wav_fn,
                   data_len,
                   praat_path,
                   frame_shift=1,
                   window_size=25,
                   frame_precision=1,
                   num_formants=4,
                   max_formant_freq=6000):
    """Estimate formants and bandwidths using Praat

    Args:
                      wav_fn - WAV file to be processed [string]
                    data_len - Length of measurement vector [integer]
                  praat_path - Path to Praat executable [string]
                 frame_shift - Length of each frame in ms [integer]
                               (default = 1)
                window_size  - Length of analysis window in ms [integer]
                               (default = 25)
             frame_precision - Largest allowed distance, in multiples of
                               frame length, between a frame and the Praat
                               estimate assigned to it [integer]
                               (default = 1)
                num_formants - Number of formants to extract,
                               usually an integer but half-integer values are
                               allowed [float]
                               (default = 4)
            max_formant_freq - Maximum allowed frequency for formant search
                               range in Hz [integer]
                               (default = 6000)

    Returns:
        estimates - Formant and bandwidth vectors [dictionary of NumPy vectors]

    The estimates dictionary uses keys like
    'pF1', 'pF2', 'pF3', 'pF4', 'pB1', 'pB2', 'pB3', 'pB4'
    ('pF1' is the first formant, 'pB2' is the second bandwidth vector, etc.)
    and each entry is a NumPy vector of length data_len.  The number of keys
    corresponds with the number of formants specified.

    Raw Praat estimates are at time points that don't completely match the time
    points in our measurement vectors, so we need to interpolate.  We use a
    crude interpolation method, that has precision set by frame_precision.

    For more information, see the Praat manual pages on formant analysis.
    """
    # Compute raw Praat formant estimates
    estimates_raw = praat_raw_formants(wav_fn, praat_path, frame_shift,
                                       window_size, num_formants,
                                       max_formant_freq)

    # Every key except the time vector is a measurement to be aligned
    measure_keys = [k for k in estimates_raw if k != 'ptFormants']

    # Initialize measurement vectors with NaN
    estimates = {k: np.full(data_len, np.nan) for k in measure_keys}

    # Raw Praat estimates are at time points that don't completely match
    # the time points in our measurement vectors, so we need to interpolate.
    # We use a crude interpolation method, that has precision set by
    # frame_precision.

    # Convert time from seconds to nearest whole millisecond
    t_raw_ms = np.int_(
        round_half_away_from_zero(estimates_raw['ptFormants'] * 1000))

    # Praat produced no estimates at all: nothing to align, leave NaN vectors
    if np.size(t_raw_ms) == 0:
        return estimates

    # Determine start and stop times.  range() excludes its end point, so
    # when the last raw time lies exactly on the frame grid the stop is
    # pushed one frame further to keep that frame in the iteration.
    start = 0
    if t_raw_ms[-1] % frame_shift == 0:
        stop = t_raw_ms[-1] + frame_shift
    else:
        stop = t_raw_ms[-1]

    # Largest time mismatch (in ms) tolerated between a frame and its
    # nearest Praat estimate
    max_offset = frame_precision * frame_shift

    # Iterate through timepoints corresponding to each frame in time range
    for idx_f, t_f in enumerate(range(start, stop, frame_shift)):
        # Frames past the end of the measurement vectors are never stored,
        # so there is no point in searching for them
        if idx_f >= data_len:
            break

        # Find closest time point among calculated Praat values
        min_idx = np.argmin(np.abs(t_raw_ms - t_f))

        # If closest time point is too far away, skip
        if np.abs(t_raw_ms[min_idx] - t_f) > max_offset:
            continue

        for k in measure_keys:
            estimates[k][idx_f] = estimates_raw[k][min_idx]

    return estimates