Example #1
def preprocess_audio(voice_file):
    if voice_file:
        # load at 16 kHz mono, trim silence by RMS energy, extract MFCCs
        voice, sample_rate = librosa.load(voice_file, sr=16000, mono=True)
        voice = svt.rms_silence_filter(voice)
        voice = svt.extract_mfcc(voice)

        return voice
    else:
        print("Error: No voice file received for preprocessing.")
        return None
Example #2
def voice_model():
    results = None
    basepath = "./Registrazioni/"
    i = 0
    for entry in os.listdir(basepath):
        if os.path.isfile(os.path.join(basepath, entry)):
            if entry.endswith(".wav"):
                i += 1  # count only the .wav recordings
                if i == 1:
                    # drop the extension plus one trailing character
                    # (e.g. a take number) to recover the speaker name
                    file_name = str(entry[:-5])
                audio_file = os.path.join(basepath, entry)
                # librosa.load converts the audio into a floating-point vector:
                # data is the actual float32 vector,
                # sr is a number > 0 giving the sampling rate
                data, sr = librosa.load(audio_file, sr=16000, mono=True)

                nyq = 0.5 * sr
                cutoff = 250
                normal_cutoff = cutoff / nyq

                # first-order Butterworth low-pass filter (250 Hz cutoff)
                numerator, denominator = sg.butter(1, normal_cutoff, 'low')
                data = sg.filtfilt(numerator, denominator, data)
                data = svt.rms_silence_filter(data)

                mfcc = svt.extract_mfcc(data, sr, winlen=0.025, winstep=0.01)

                # standardize the features to zero mean and unit variance
                mfcc = preprocessing.scale(mfcc)

                delta = librosa.feature.delta(mfcc)
                combined = numpy.hstack((mfcc, delta))

                mfcc = combined

                if i == 1:
                    results = mfcc
                else:
                    results = numpy.vstack((results, mfcc))

    # estimate the parameters of a Gaussian mixture model,
    # with one component per recording
    model = sklearn.mixture.GaussianMixture(n_components=i,
                                            covariance_type='full',
                                            n_init=1)

    # fit the model parameters with the EM algorithm
    model.fit(results)

    filename = './Trainer/model' + file_name + ".gmm"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    remove_wav_files()
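
For reference, a hedged sketch of loading the pickled model back and scoring fresh features (the model file name and the 26-column feature width are assumptions; Example #6 below shows the real recognition flow):

import pickle
import numpy as np

# hypothetical file produced by voice_model() above
with open('./Trainer/modelMario.gmm', 'rb') as f:
    gmm = pickle.load(f)

# dummy stand-in for an MFCC+delta matrix; 13 MFCCs stacked with their
# deltas would give 26 columns, but the exact width is an assumption
features = np.random.randn(100, 26)

# GaussianMixture.score returns the average per-sample log-likelihood
print(gmm.score(features))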
Example #3
def obtainMFCCfromWav(index, nomefile):

    AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)),
                           "wavFiles", nomefile + str(index) + ".wav")

    data, sr = librosa.load(AUDIO_FILE, sr=16000, mono=True)
    data = svt.rms_silence_filter(data)
    #chroma = librosa.feature.chroma_cqt(y=data, sr=sr)
    #chroma_med = librosa.decompose.nn_filter(chroma, aggregate = numpy.median,metric = 'cosine')
    #data = librosa.decompose.nn_filter(data)
    mfcc = svt.extract_mfcc(data, sr, winlen=0.025, winstep=0.01)
    mfcc = preprocessing.scale(mfcc)  # standardize the dataset along one axis
    delta = librosa.feature.delta(mfcc)
    combined = numpy.hstack((mfcc, delta))
    numpy.savetxt("./mfcc/mfcc" + nomefile + str(index) + ".txt", combined)
    return combined
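
Since the features are persisted with numpy.savetxt, they can be read back with numpy.loadtxt; a minimal round-trip sketch (the file name is assumed to match the pattern above):

import numpy

# read back the matrix written by obtainMFCCfromWav (assumed file name)
combined = numpy.loadtxt("./mfcc/mfccsample0.txt")
print(combined.shape)  # (n_frames, 2 * n_mfcc): MFCCs stacked with deltas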
Example #4
def segment_by_voice(segments,
                     samplerate=16000,
                     segment_length=None,
                     threshold=200):
    '''
    Cut silent parts out of the audio signal data. This does not work well
    with signals affected by environmental noise: consider applying a noise
    filter first, or make sure the environmental noise is quiet enough to be
    treated as silence.

    :param segments: the audio signal data, as an iterable of 1-D arrays
    :param samplerate: if no segment_length is given, segment_length will be
        samplerate/100 (around 0.01 s per segment).
    :param segment_length: the number of frames per segment, i.e. for a sample
        rate SR, a segment length of SR/100 represents a chunk containing
        0.01 seconds of audio.
    :param threshold: the threshold value; slices whose MFCC distance stays
        below it are treated as the same voice. The default value was defined
        at [1] (see the references).
    :returns: the audio data without the silence parts.
    '''
    if segment_length is None:
        segment_length = int(samplerate / 100)

    voice_segments = []
    last_slice_mfcc = None
    for data in segments:
        accumulate_data = np.array([])
        for index in range(0, len(data), segment_length):
            data_slice = data[index:index + segment_length]
            mfcc = svt.extract_mfcc(data_slice)
            if last_slice_mfcc is None:
                distance = -1
            else:
                distance = svt.compute_distance(mfcc, last_slice_mfcc)
            last_slice_mfcc = mfcc

            if 0 < distance < threshold:
                # same voice: keep accumulating this slice
                accumulate_data = np.append(accumulate_data, data_slice)
            else:
                # different voice (or first slice): start a new segment
                if accumulate_data.shape[0] > 0:
                    voice_segments.append(accumulate_data)
                accumulate_data = data_slice

        if accumulate_data.shape[0] > 0:
            voice_segments.append(accumulate_data)

    return voice_segments
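
A minimal usage sketch, assuming the input is a list of 1-D float arrays sampled at 16 kHz (the noise signal below is only a stand-in for real speech):

import numpy as np

sr = 16000
fake_signal = np.random.randn(sr).astype(np.float32)  # one second of noise

# each ~0.01 s slice is compared to its predecessor and grouped while the
# MFCC distance stays below the threshold
voices = segment_by_voice([fake_signal], samplerate=sr, threshold=200)
print(len(voices), [v.shape for v in voices])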
Example #5
    def test_extract_mfcc(self):
        mfcc_test = extract_mfcc(self.file1, self.sr1)
        # assertEqual on .all() only compares two booleans; compare the
        # arrays element-wise instead
        numpy.testing.assert_array_equal(mfcc_test, self.mfcc1)
Example #6
def read_all_gmms():
    # Voice-recognition variables
    models = []
    speakers = []
    find = False
    frequency_sample = 44100
    seconds = 3

    print("Avvio riconoscimento vocale: parla\n")
    myrecording = sd.rec(int(seconds * frequency_sample), samplerate=frequency_sample, channels=2)
    sd.wait()
    write('Registrazioni/input' + str(1000) + '.wav', frequency_sample, myrecording)

    data, sr = librosa.load('Registrazioni/input' + str(1000) + '.wav', sr=16000, mono=True)

    # the data was resampled to 16 kHz by librosa.load, so the filter must be
    # designed against that rate, not the 44.1 kHz recording rate
    nyq = 0.5 * sr
    cutoff = 250
    normal_cutoff = cutoff / nyq
    numerator, denominator = sg.butter(1, normal_cutoff, 'low')
    data = sg.filtfilt(numerator, denominator, data)
    data = svt.rms_silence_filter(data)

    mfcc = svt.extract_mfcc(data, sr, winlen=0.025, winstep=0.01)
    mfcc = preprocessing.scale(mfcc)  # standardize the dataset (standard scaling)
    delta = librosa.feature.delta(mfcc)
    combined = np.hstack((mfcc, delta))

    user_name = ""

    basepath = "./Trainer"
    for entry in os.listdir(basepath):
        if os.path.isfile(os.path.join(basepath, entry)):
            if entry.endswith(".gmm"):
                gmm_readed = pickle.load(open(basepath + "/" + entry, 'rb'))
                speakers.append(entry)
                models.append(gmm_readed)

    log_likelihood = np.zeros(len(models))

    for i in range(len(models)):
        # GaussianMixture.score returns the average log-likelihood per sample
        # of the mixture given the feature matrix "combined"
        gmm = models[i]
        scores = np.array(gmm.score(combined))

        # collapse to a single likelihood value for this speaker
        log_likelihood[i] = scores.sum()

    # print(f"Log likelihood senza normalizzazione: {log_likelihood}")
    winner = np.argmax(log_likelihood)
    """
    print("i valori con la normalizzazione minmax")
    print(minmax_scale(log_likelihood))
    print("i valori con la normalizzazione scalescale")
    print(scale(log_likelihood))
    """
    trovato = log_likelihood
    if round(trovato[winner]) >= 59:
        print("Found\n")
        print(scale(log_likelihood))
        print(speakers[winner])
        # model files are named "model<speaker>.gmm": strip the prefix
        # and the extension to recover the speaker name
        user_name = str(speakers[winner][5:-4])
        find = True
    else:
        print("Not found\n")
        find = False

    if os.path.exists("./Registrazioni/input1000.wav"):
        os.remove("./Registrazioni/input1000.wav")
    log_likelihood = []
    print(f"Nome audio: {user_name}")
    return find, user_name
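
Note that sklearn's GaussianMixture.score already returns a single scalar (the average per-sample log-likelihood), so the np.array(...).sum() in the loop above is effectively a no-op; a quick self-contained check:

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.randn(200, 4)
gmm = GaussianMixture(n_components=2).fit(X)

s = gmm.score(X)                  # scalar: mean log-likelihood per sample
print(s, np.array(s).sum() == s)  # summing the 0-d array returns the scalar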