Exemplo n.º 1
0
def extractEnergyFeatures ( signal, sampleRate=16000, windowSize=30,windowStep=10):
	"""Extract energy features for each audio window 

	This function construct a energy signal that is readed through consecutive windows. For each window, 11 representative metrics are computed. Those metrics are defined in the speechTools.features.computeLocalFeatures function. The final result is a features matrix where each time ordered row refers to a window, and each column represents a single metric.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): The audio signal sample rate(values different then 16000 may cose troubles)
		windowSize (float): The size of the aggregative windo in second
		windowStep (float): the duration between 2 consecutive windows(overlapping is aloud)

	Returns:
		numpy.ndarray: The energy features matrix

	"""


	energies = st.collectEnergies ( signal )
	vadMask = st.getVadMask(signal, sampleRate)
	energies *= vadMask
	energyFeatures = []
	energyReader = st.getSignalReader( energies, 100, windowSize, windowStep)
	for window in energyReader:
		localFeatures = computeLocalFeatures( window )
		energyFeatures.append( localFeatures)
	energyFeatures = np.array(energyFeatures)
	return energyFeatures
Exemplo n.º 2
0
def extractSnrFeatures ( signal, sampleRate=16000, windowSize=30,windowStep=10 ):
	"""Extract SNR features for each audio window

	This function construct a SNR signal that is readed through consecutive windows. For each window, 11 representative metrics are computed. Those metrics are defined in the speechTools.features.computeLocalFeatures function. The final result is a features matrix where each time ordered row refers to a window, and each column represents a single metric. In this case the SNR refers to a speech signal to noise ratio. It distinguish the main voice signal from other background audio signal with the WADA algorithm definition.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): The audio signal sample rate(values different then 16000 may cose troubles)
		windowSize (float): The size of the aggregative windo in second
		windowStep (float): the duration between 2 consecutive windows(overlapping is aloud)

	Returns:
		numpy.ndarray: The SNR features matrix

	"""


	snrs = st.getSnrs ( signal, sampleRate)
	snrIndex = 0
	minSnr = np.min(snrs)
	vadMask = st.getVadMask(signal, sampleRate)
	vadMaskReader = st.getSignalReader( vadMask, 100, 1, 0.5)
	for window in vadMaskReader:
		if np.mean(window) <=  0.5: snrs[snrIndex] = minSnr
		snrIndex +=1
	snrFeatures = []
	snrReader = st.getSignalReader( snrs, 2, windowSize, windowStep)
	for window in snrReader:
		localFeatures = computeLocalFeatures( window , 2, minSnr)
		snrFeatures.append( localFeatures)
	snrFeatures = np.array(snrFeatures)
	return snrFeatures
Exemplo n.º 3
0
def extractPitchFeatures(signal,
                         sampleRate=16000,
                         pitchesFile=None,
                         windowSize=30,
                         windowStep=10):
    """Extract pitch features for each audio window 

	This function construct a pitch signal that is readed through consecutive windows. For each window, 11 representative metrics are computed. Those metrics are defined in the speechTools.features.computeLocalFeatures function. The final result is a features matrix where each time ordered row refers to a window, and each column represents a single metric. if The param pitchFile is specified, the pitch signal is not constructed  from the given audio signal but directly loaded from a .f0 file.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): The audio signal sample rate(values different than 16000 may cose troubles)
		pitchesFile (str): The path to the precomputed pitch .f0 file
		windowSize (float): The size of the aggregative windo in second
		windowStep (float): the duration between 2 consecutive windows(overlapping is aloud)

	Returns:
		numpy.ndarray: The pitch features matrix

	"""

    #if precomputed pitches are available
    if pitchesFile:
        pitches = None
        with open(pitchesFile, "rb") as pitchData:
            pitches = pickle.load(pitchData)
    else:
        pitches = st.getPitches(signal, sampleRate)
    # positive pitches intersection with VAD
    nonZeros = np.nonzero(pitches)[0]
    vadMask = st.getVadMask(signal, sampleRate)
    pitches, vadMask = st.equalizeShapes(pitches, vadMask)
    pitches[nonZeros] *= vadMask[nonZeros]
    #pitches conversion on semitone for macro melody quantification and geting spectral linearity
    nonZeros = np.nonzero(pitches)[0]
    semitoneScale = st.getSemitoneScale()
    pitches[nonZeros] = st.quantifyValues(pitches[nonZeros], semitoneScale)
    semitones = np.zeros(pitches.size)
    semitones[nonZeros] = st.pitches2semitones(pitches[nonZeros],
                                               semitoneScale)
    #for each pitches window compute the statistic features
    pitchFeatures = []
    semitoneReader = st.getSignalReader(semitones, 100, windowSize, windowStep)
    for window in semitoneReader:
        localFeatures = computeLocalFeatures(window)
        pitchFeatures.append(localFeatures)
    pitcheFeatures = np.array(pitchFeatures)
    return pitchFeatures
Exemplo n.º 4
0
def extractSpectralFeatures ( signal, sampleRate, windowSize=30,windowStep=10):
	"""Extract spectral features for each audio window

	This function constructs 2 spectral signals that are readed through consecutive windows. For each window and each signal, 11 representative metrics are computed. Those metrics are defined in the speechTools.features.computeLocalFeatures function. The final result is 2 features matrixes where each time ordered row refers to a window, and each column represents a single metric. The 2 signals represent respectively: spectral centroid, spectral flatness.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): The audio signal sample rate(values different then 16000 may cose troubles)
		windowSize (float): The size of the aggregative windo in second
		windowStep (float): the duration between 2 consecutive windows(overlapping is aloud)

	Returns:
		numpy.ndarray, numpy.ndarray: The spectral centroid features matrix, the spectral flatness features matrix

	"""


	floatSignal = np.float32(signal)
	spectralCentroids = librosa.feature.spectral_centroid( floatSignal, sampleRate, n_fft=512, hop_length=160, center=False)
	spectralCentroids = spectralCentroids.flatten()
	vadMask = st.getVadMask( signal, sampleRate)
	spectralCentroids, vadMask = st.equalizeShapes( spectralCentroids, vadMask)
	spectralCentroids *= vadMask
	nonZeros = np.nonzero(spectralCentroids)[0]
	semitoneScale = st.getSemitoneScale()
	spectralCentroids[nonZeros] = st.quantifyValues( spectralCentroids[nonZeros], semitoneScale)
	semitones = np.zeros( spectralCentroids.size)
	semitones[nonZeros]= st.pitches2semitones( spectralCentroids[nonZeros], semitoneScale)
	semitoneFeatures = []
	semitoneReader = st.getSignalReader( semitones, 100, windowSize, windowStep)
	for window in semitoneReader:
		localFeatures = computeLocalFeatures( window )
		semitoneFeatures.append( localFeatures)
	semitoneFeatures = np.array(semitoneFeatures)
	spectralCentroidFeatures = semitoneFeatures
	spectralFlatnesses = librosa.feature.spectral_flatness ( floatSignal, n_fft=512, hop_length=160, center=False)
	spectralFlatnesses = spectralFlatnesses.flatten()
	spectralFlatnesses, vadMask = st.equalizeShapes( spectralFlatnesses, vadMask)
	spectralFlatnesses *= vadMask
	spectralFlatnessFeatures = []
	spectralFlatnessReader = st.getSignalReader( spectralFlatnesses, 100, windowSize, windowStep)
	for window in spectralFlatnessReader:
		localFeatures = computeLocalFeatures( window )
		spectralFlatnessFeatures.append( localFeatures)
	spectralFlatnessFeatures = np.array(spectralFlatnessFeatures)
	return spectralCentroidFeatures, spectralFlatnessFeatures
Exemplo n.º 5
0
def detectVocalActivity(signal,
                        sampleRate,
                        segments,
                        aggressiveness=3,
                        aloudError=0.25,
                        reductionFactor=0.8):
    """Find respectively speech, plosives and silence segments in a sequence of audio stable segments

	This function reads a sequence of stable audio segments and returns separately  and respectively the segments corresponding to speech, plosives and silence. Each segment is a list with the forme : [start time in second, end time in second].
	The affectations use an web RTC VAD object with an given aggressiveness.
	The stable segments are phonologicaly supra segmantal units, smaller than phonems and not  differentiated as those. The stable segments are provided by the speechTools.speech.getFBSegments function, using the forward-backward algorithm.
	The result is 3 lists representing  the segments containing speech, those containing plosives ( less than 150ms silences like in the sounds "P, T, C"), and long silence segments containing any speech.

	Args:
		signal (numpy.array): mono int16 audio signal
		sampleRate (int): audio signal sample rate(should be equal to 16000 for avoiding troubles)
		segments (numpy.ndarray): the time ordered stable audio segments
		aggressiveness (int): The aggressiveness of the Vad instance, could only be [1,2,3] with 3 the most aggressive value
		aloudError (float): The aloud error  between 0 and 1 for quantifying speech quantity in each segment
		reductionFactor (float): The symmetrical length reduction applied to each segment for avoiding segment transition perturbations

	Returns:
		numpy.array, numpy.array, numpy.array: List of speech segments, list of plosive segments, list of silence segments

	"""

    speechSegments = []
    plosiveSegments = []
    silenceSegments = []
    vad = webrtcvad.Vad(aggressiveness)
    for segmentStart, segmentEnd in segments:
        #find the segment corresponding signal window
        windowStart = int(segmentStart * sampleRate)
        windowEnd = int(segmentEnd * sampleRate)
        windowSize = windowEnd - windowStart
        #find the reduced window which is a troncated window for excluding   the  parts of the window that could be impacted by the neighbour windows
        reducedWindowSize = int(windowSize * reductionFactor)
        reducedWindowStart = int(windowStart + (windowSize *
                                                (1 - reductionFactor) * 0.5))
        reducedWindowEnd = reducedWindowStart + reducedWindowSize
        reducedWindow = signal[reducedWindowStart:reducedWindowEnd]
        #get the VAD mask for the reduced window
        vadMask = st.getVadMask(reducedWindow,
                                sampleRate,
                                vad=vad,
                                windowWidth=0.01,
                                windowStep=0.005)
        #compute the speech quantity i.e. the proportion of [1] in the mask
        speechQuantity = 1
        if vadMask.size > 0:
            speechQuantity = vadMask[vadMask == 1].size / vadMask.size
        # if there is enough speech: add the segment to the speech segments
        #else if it is a very short silence  add it to plosives
        #else add it to silences
        if speechQuantity > (1 - aloudError):
            speechSegments.append([segmentStart, segmentEnd])
        elif (segmentEnd - segmentStart) < 0.15:
            plosiveSegments.append([segmentStart, segmentEnd])
        else:
            silenceSegments.append([segmentStart, segmentEnd])
    speechSegments = np.array(speechSegments)
    plosiveSegments = np.array(plosiveSegments)
    silenceSegments = np.array(silenceSegments)
    return speechSegments, plosiveSegments, silenceSegments