Exemplo n.º 1
0
def extractSnrFeatures ( signal, sampleRate=16000, windowSize=30,windowStep=10 ):
	"""Extract SNR features for each audio window.

	Builds a voice signal-to-noise ratio (WADA) curve sampled at 2hz, floors the
	non-speech zones (as detected by the VAD mask) to the minimum SNR value, then
	reads the curve through consecutive aggregative windows. Each window yields the
	11 statistical metrics defined in speechTools.features.computeLocalFeatures,
	producing one row per window in the returned matrix.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		windowSize (float): the size of the aggregative window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray: the SNR features matrix (one time-ordered row per window)

	"""

	snrCurve = st.getSnrs( signal, sampleRate)
	floorValue = np.min(snrCurve)
	# silence the SNR curve wherever the VAD says the 1s zone is mostly non-speech
	speechMask = st.getVadMask(signal, sampleRate)
	maskReader = st.getSignalReader( speechMask, 100, 1, 0.5)
	for zoneIndex, maskWindow in enumerate(maskReader):
		if np.mean(maskWindow) <= 0.5:
			snrCurve[zoneIndex] = floorValue
	# aggregate the 2hz SNR curve into one feature row per window
	curveReader = st.getSignalReader( snrCurve, 2, windowSize, windowStep)
	featureRows = [computeLocalFeatures( window, 2, floorValue) for window in curveReader]
	return np.array(featureRows)
Exemplo n.º 2
0
def _aggregateRhythmSignal(values, windowSize, windowStep):
    """Read a 1hz rhythm signal through consecutive windows and stack the local features.

    Args:
        values (numpy.array): a 1hz rhythm signal (one value per second)
        windowSize (float): the size of the aggregative window in seconds
        windowStep (float): the duration between 2 consecutive windows

    Returns:
        numpy.ndarray: one row of computeLocalFeatures metrics per window

    """
    reader = st.getSignalReader(values, 1, windowSize, windowStep)
    return np.array([computeLocalFeatures(window) for window in reader])


def extractRhythmFeatures(signal=None,
                          sampleRate=None,
                          sylFile=None,
                          windowSize=30,
                          windowStep=10):
    """Extract rhythm features for each audio window.

	This function constructs 3 rhythm signals that are read through consecutive windows. For each window and each signal, 11 representative metrics are computed. Those metrics are defined in the speechTools.features.computeLocalFeatures function. The final result is 3 feature matrixes where each time ordered row refers to a window, and each column represents a single metric. The 3 signals represent respectively: syllabic rate (number of syllables per second), syllabic durations (mean syllable duration for each second), vowel durations (mean vowel duration for each second). If the param sylFile is specified, the syllabic detection is not performed but the syllables are loaded from a precomputed .syl file. In this case the params signal and sampleRate are ignored.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		sylFile (str): the path to the precomputed syllables .syl file
		windowSize (float): the size of the aggregative window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray, numpy.ndarray, numpy.ndarray: the syllabic rate features matrix, the syllabic duration features matrix, the vowel duration features matrix

	"""

    # if already extracted syllables are available load them, otherwise detect them
    if sylFile:
        with open(sylFile, "rb") as sylData:
            # NOTE(review): pickle.load on an external file is unsafe on untrusted input
            syllables = pickle.load(sylData)
    else:
        syllables = st.getSyllables2(signal, sampleRate)
    # the three rhythm signals all follow the same 1hz aggregation scheme
    syllabicRateFeatures = _aggregateRhythmSignal(
        st.getSyllabicRates(syllables), windowSize, windowStep)
    syllabicDurationFeatures = _aggregateRhythmSignal(
        st.getSyllabicDurations(syllables), windowSize, windowStep)
    vowelDurationFeatures = _aggregateRhythmSignal(
        st.getVowelDurations(syllables), windowSize, windowStep)
    return syllabicRateFeatures, syllabicDurationFeatures, vowelDurationFeatures
Exemplo n.º 3
0
def getVadMask(signal,
               sampleRate=16000,
               vad=None,
               aggressiveness=3,
               windowWidth=0.03,
               windowStep=0.01):
    """Construct a boolean mask identifying speech zones in an audio signal.

	This function returns a boolean mask for detecting speech zones in an audio signal. The audio signal is read through short-term windows, and for each window a boolean value is raised (1 if the window contains speech, 0 otherwise). The result is a mask, i.e. a vector containing one decision value per window. This way, the mask can be multiplied by a higher level signal (with the same step spacing) for speech filtering. Another usage is to compute a speech quantity ratio of an audio segment through the mean or median value of the boolean mask, for detection thresholding tasks.

	Args:
		signal (numpy.array): mono int16 audio signal
		sampleRate (int): audio signal sample rate (should be equal to 16000 to avoid trouble)
		vad (webrtcvad.Vad): if specified, an existing configured WebRTC VAD; otherwise a new instance is created using the aggressiveness param
		aggressiveness (int): the aggressiveness of the Vad instance, can only be [1,2,3] with 3 the most aggressive value
		windowWidth (float): the size of the audio window in seconds, can only be [0.01,0.02,0.03]
		windowStep (float): duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.array: the mask, a boolean vector using the specified window step as value spacing

	"""

    # build a default VAD only when the caller did not supply one
    if not vad: vad = webrtcvad.Vad(aggressiveness)
    decisions = []
    reader = st.getSignalReader(signal,
                                sampleRate,
                                windowWidth=windowWidth,
                                step=windowStep)
    for frame in reader:
        # webrtcvad consumes raw PCM bytes, not numpy arrays
        decisions.append(vad.is_speech(frame.tobytes(), sampleRate))
    return np.array(decisions)
Exemplo n.º 4
0
def extractEnergyFeatures ( signal, sampleRate=16000, windowSize=30,windowStep=10):
	"""Extract energy features for each audio window.

	Builds a 100hz short-term energy curve, zeroes it outside speech zones using
	the VAD mask, then reads the curve through consecutive aggregative windows.
	Each window yields the 11 statistical metrics defined in
	speechTools.features.computeLocalFeatures, producing one row per window.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		windowSize (float): the size of the aggregative window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray: the energy features matrix (one time-ordered row per window)

	"""

	energyCurve = st.collectEnergies ( signal )
	# keep only energy measured inside detected speech zones
	energyCurve *= st.getVadMask(signal, sampleRate)
	curveReader = st.getSignalReader( energyCurve, 100, windowSize, windowStep)
	featureRows = [computeLocalFeatures( window ) for window in curveReader]
	return np.array(featureRows)
Exemplo n.º 5
0
def extractSpectralFeatures ( signal, sampleRate, windowSize=30,windowStep=10):
	"""Extract spectral features for each audio window.

	Builds 2 spectral curves — spectral centroid and spectral flatness — then reads
	each curve through consecutive aggregative windows. For each window and each
	curve, the 11 metrics defined in speechTools.features.computeLocalFeatures are
	computed, producing one feature matrix per curve where each time-ordered row
	refers to a window. The centroid curve is VAD-filtered, quantified on a
	semitone scale and converted to semitone indexes before aggregation.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		windowSize (float): the size of the aggregative window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray, numpy.ndarray: the spectral centroid features matrix, the spectral flatness features matrix

	"""

	floatSignal = np.float32(signal)
	# 100hz spectral centroid curve, restricted to speech zones
	centroids = librosa.feature.spectral_centroid( floatSignal, sampleRate, n_fft=512, hop_length=160, center=False).flatten()
	vadMask = st.getVadMask( signal, sampleRate)
	centroids, vadMask = st.equalizeShapes( centroids, vadMask)
	centroids *= vadMask
	# quantify voiced centroids on the semitone scale, then map them to semitone indexes
	voicedIndexes = np.nonzero(centroids)[0]
	scale = st.getSemitoneScale()
	centroids[voicedIndexes] = st.quantifyValues( centroids[voicedIndexes], scale)
	semitones = np.zeros( centroids.size)
	semitones[voicedIndexes] = st.pitches2semitones( centroids[voicedIndexes], scale)
	semitoneReader = st.getSignalReader( semitones, 100, windowSize, windowStep)
	spectralCentroidFeatures = np.array([computeLocalFeatures( window ) for window in semitoneReader])
	# 100hz spectral flatness curve, same VAD filtering and aggregation
	flatnesses = librosa.feature.spectral_flatness ( floatSignal, n_fft=512, hop_length=160, center=False).flatten()
	flatnesses, vadMask = st.equalizeShapes( flatnesses, vadMask)
	flatnesses *= vadMask
	flatnessReader = st.getSignalReader( flatnesses, 100, windowSize, windowStep)
	spectralFlatnessFeatures = np.array([computeLocalFeatures( window ) for window in flatnessReader])
	return spectralCentroidFeatures, spectralFlatnessFeatures
Exemplo n.º 6
0
def getRecCurve(signal,
                sampleRate=16000,
                firstLFFrequency=300,
                firstHFFrequency=1000,
                lastFrequency=5600,
                windowDuration=0.032,
                stepDuration=0.01):
    """Return the REC (reduced energy cumulating) curve of an audio signal and its threshold.

	This function reads an audio signal through consecutive windows. For each window a REC value is computed using the speechTools.energy.computeRec function, which needs a frequency setting distinguishing low and high frequencies.
	The REC curve is a good tool for detecting vowel positions by finding large positive peaks in the curve. To select those peaks the function also computes a REC threshold set to the curve median value.

	Args:
		signal (numpy.array): the mono audio signal
		sampleRate (int): the signal sample rate
		firstLFFrequency (int): the first low frequency in hz
		firstHFFrequency (int): the first high frequency in hz
		lastFrequency (int): the last frequency in hz
		windowDuration (float): the window duration in seconds
		stepDuration (float): the duration between 2 consecutive windows

	Returns:
		numpy.array, float: the REC curve time-ordered values, the REC selection threshold (curve median)

	"""

    windowSize = int(windowDuration * sampleRate)
    windowStep = stepDuration * sampleRate
    # translate the frequency settings into spectral bin indexes once, before the loop
    spectralBase = st.getSpectralBase(windowSize, sampleRate)
    firstLFBean = getFrequencyBean(firstLFFrequency, spectralBase)
    firstHFBean = getFrequencyBean(firstHFFrequency, spectralBase)
    lastBean = getFrequencyBean(lastFrequency, spectralBase)
    nbRecs = int((signal.size - windowSize) // windowStep) + 1
    recCurve = np.zeros(nbRecs)
    reader = st.getSignalReader(signal,
                                sampleRate,
                                windowDuration,
                                stepDuration,
                                withWindowIndex=True)
    for window, windowIndex in reader:
        # all-zero windows keep the initial 0 REC value
        if np.any(window):
            recCurve[windowIndex] = st.computeRec(window,
                                                  sampleRate,
                                                  firstLFBean=firstLFBean,
                                                  firstHFBean=firstHFBean,
                                                  lastBean=lastBean)
    recThreshold = np.median(recCurve)
    return recCurve, recThreshold
Exemplo n.º 7
0
def extractPitchFeatures(signal,
                         sampleRate=16000,
                         pitchesFile=None,
                         windowSize=30,
                         windowStep=10):
    """Extract pitch features for each audio window.

	This function constructs a pitch signal that is read through consecutive windows. For each window, 11 representative metrics are computed. Those metrics are defined in the speechTools.features.computeLocalFeatures function. The final result is a features matrix where each time ordered row refers to a window, and each column represents a single metric. If the param pitchesFile is specified, the pitch signal is not constructed from the given audio signal but directly loaded from a .f0 file.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		pitchesFile (str): the path to the precomputed pitch .f0 file
		windowSize (float): the size of the aggregative window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray: the pitch features matrix

	"""

    # if precomputed pitches are available load them, otherwise detect them
    if pitchesFile:
        with open(pitchesFile, "rb") as pitchData:
            # NOTE(review): pickle.load on an external file is unsafe on untrusted input
            pitches = pickle.load(pitchData)
    else:
        pitches = st.getPitches(signal, sampleRate)
    # positive pitches intersection with VAD
    nonZeros = np.nonzero(pitches)[0]
    vadMask = st.getVadMask(signal, sampleRate)
    pitches, vadMask = st.equalizeShapes(pitches, vadMask)
    pitches[nonZeros] *= vadMask[nonZeros]
    # pitch conversion to semitones for macro melody quantification and spectral linearity
    nonZeros = np.nonzero(pitches)[0]
    semitoneScale = st.getSemitoneScale()
    pitches[nonZeros] = st.quantifyValues(pitches[nonZeros], semitoneScale)
    semitones = np.zeros(pitches.size)
    semitones[nonZeros] = st.pitches2semitones(pitches[nonZeros],
                                               semitoneScale)
    # for each pitch window compute the statistic features
    pitchFeatures = []
    semitoneReader = st.getSignalReader(semitones, 100, windowSize, windowStep)
    for window in semitoneReader:
        localFeatures = computeLocalFeatures(window)
        pitchFeatures.append(localFeatures)
    # bug fix: was "pitcheFeatures = np.array(...)" (typo), so the raw Python
    # list was returned instead of the documented numpy.ndarray
    pitchFeatures = np.array(pitchFeatures)
    return pitchFeatures
Exemplo n.º 8
0
def getSnrs(signal, sampleRate):
    """Return the voice signal-to-noise ratio measures of an audio signal.

	This function reads an audio signal and computes the voice signal-to-noise ratio using the waveform-based WADA algorithm, implemented in the external WADA module. This type of SNR measures the level of a single voice signal against the other background sounds considered as noise. Here, the used WADA algorithm is not short-term designed. It cannot be used on short signal windows (e.g. 30ms) but on longer zones (e.g. 1s). The measures are 50% overlapped, so that we obtain a measure each 500ms and a sample rate of 2.

	Args:
		signal (numpy.array): mono int16 audio signal
		sampleRate (int): audio signal sample rate (should be equal to 16000 to avoid trouble)

	Returns:
		numpy.array: the time-ordered SNR measures

	"""

    # WADA works on normalized float samples, not raw PCM16
    floatSignal = st.WADA.pcm2float(signal, "float32")
    # 1s windows with a 0.5s step -> one measure each 500ms
    reader = st.getSignalReader(floatSignal, sampleRate, 1, 0.5)
    return np.array([st.WADA.compute_snr_on_signal(window) for window in reader])
Exemplo n.º 9
0
def collectEnergies(signal, sampleRate=16000, windowWidth=0.03, step=0.01):
    """Return the short-term energy values of each window of an audio signal.

	This function reads an audio signal through consecutive windows. For each window the short-term energy is computed as the mean of the squared window amplitudes.

	Args:
		signal (numpy.array): the mono audio signal
		sampleRate (int): the signal sample rate
		windowWidth (float): the duration of the window in seconds
		step (float): the duration between 2 consecutive windows

	Returns:
		numpy.array: the resulting energy values vector

	"""

    reader = st.getSignalReader(signal, sampleRate, windowWidth, step)
    # cast to float64 before squaring so int16 samples cannot overflow
    return np.array([np.mean(np.float64(window) ** 2) for window in reader])