def preprocess_audio(voice_file):
    """Load a voice recording and convert it to MFCC features.

    :param voice_file: path to an audio file; falsy values are rejected.
    :returns: the MFCC feature array produced by ``svt.extract_mfcc``,
        or ``None`` (after printing an error) when no file is given.
    """
    # Guard clause: nothing to do without an input file.
    if not voice_file:
        print("Error: No voice file received for preprocessing.")
        return None

    # Resample to 16 kHz mono, trim silence, then extract MFCCs.
    signal, _rate = librosa.load(voice_file, sr=16000, mono=True)
    signal = svt.rms_silence_filter(signal)
    return svt.extract_mfcc(signal)
def voice_model():
    """Train a Gaussian Mixture Model from the .wav files in ./Registrazioni
    and pickle it into ./Trainer, then delete the source recordings."""
    results = numpy.asmatrix(())
    basepath = "./Registrazioni/"
    i = 0
    # NOTE(review): `i` counts EVERY directory entry, not only .wav files;
    # it is later reused as the GMM component count — confirm that is intended.
    for entry in os.listdir(basepath):
        i += 1
        if os.path.isfile(os.path.join(basepath, entry)):
            if entry.endswith(".wav"):
                if i == 1:
                    # Base name used for the saved model file.
                    # NOTE(review): [:-5] strips ".wav" plus one extra
                    # character — presumably a trailing index digit; confirm
                    # against the recording file-naming scheme.
                    file_name = str(entry[:-5])
                audio_file = basepath + "/" + entry
                # Decode the audio into a floating-point vector:
                # `data` is the float32 sample vector,
                # `sr` (> 0) is the sampling rate.
                data, sr = librosa.load(audio_file, sr=16000, mono=True)
                # Low-pass Butterworth filter: 250 Hz cutoff normalized by
                # the Nyquist frequency (sr / 2).
                nyq = 0.5 * sr
                cutoff = 250
                normal_cutoff = cutoff / nyq
                numerator, denominator = sg.butter(1, normal_cutoff, 'low')
                # filtfilt applies the filter forward and backward
                # (zero phase distortion).
                data = sg.filtfilt(numerator, denominator, data)
                data = svt.rms_silence_filter(data)
                mfcc = svt.extract_mfcc(data, sr, winlen=0.025, winstep=0.01)
                # Standardize the dataset with the standard scaler.
                mfcc = preprocessing.scale(mfcc)
                # Append first-order deltas to the MFCC features.
                delta = librosa.feature.delta(mfcc)
                combined = numpy.hstack((mfcc, delta))
                mfcc = combined
                # Stack each file's features into one training matrix.
                if i == 1:
                    results = mfcc
                else:
                    results = numpy.vstack((results, mfcc))
    # Class that estimates the parameters of a Gaussian mixture model.
    model = sklearn.mixture.GaussianMixture(n_components=i, covariance_type='full', n_init=1)
    # Estimate the model parameters with the EM algorithm.
    model.fit(results)
    filename = './Trainer/model' + file_name + ".gmm"
    pickle.dump(model, open(filename, 'wb'))
    remove_wav_files()
def obtainMFCCfromWav(index, nomefile):
    """Extract scaled MFCC + delta features from a wav file and save them.

    Loads ``./wavFiles/<nomefile><index>.wav`` (relative to this module),
    trims silence, computes standardized MFCCs with their first-order
    deltas, writes the combined matrix to
    ``./mfcc/mfcc<nomefile><index>.txt`` and returns it.

    :param index: numeric suffix of the wav file.
    :param nomefile: base name of the wav file.
    :returns: the hstacked (MFCC, delta) feature matrix.
    """
    # Build the path from components instead of pre-concatenated strings;
    # dead chroma/nn_filter experiments removed.
    AUDIO_FILE = path.join(
        path.dirname(path.realpath(__file__)),
        "wavFiles",
        nomefile + str(index) + ".wav",
    )
    data, sr = librosa.load(AUDIO_FILE, sr=16000, mono=True)
    data = svt.rms_silence_filter(data)
    mfcc = svt.extract_mfcc(data, sr, winlen=0.025, winstep=0.01)
    # Standardize the features along each axis.
    mfcc = preprocessing.scale(mfcc)
    delta = librosa.feature.delta(mfcc)
    combined = numpy.hstack((mfcc, delta))
    numpy.savetxt("./mfcc/mfcc" + nomefile + str(index) + ".txt", combined)
    return combined
def segment_by_voice(segments, samplerate=16000, segment_length=None, threshold=200):
    '''
    Regroup audio data into segments that appear to belong to the same voice.

    Each input segment is sliced into short chunks; an MFCC vector is
    computed per chunk and compared (via ``svt.compute_distance``) with the
    MFCC of the previous chunk. Consecutive chunks whose distance is below
    ``threshold`` are accumulated into one voice segment; when the distance
    jumps (or on the very first chunk, whose distance is set to -1), the
    accumulated data is flushed and a new segment is started.

    :param segments: iterable of 1-D audio signal arrays.
    :param samplerate: used only to derive the default chunk size:
        ``samplerate / 100`` frames (about 0.01 s per chunk).
    :param segment_length: number of frames per chunk; overrides the
        samplerate-based default when given.
    :param threshold: MFCC-distance value above which two adjacent chunks
        are treated as different voices.
    :returns: list of numpy arrays, one per detected same-voice run.
    '''
    if segment_length is None:
        segment_length = int(samplerate / 100)

    voice_segments = []
    last_slice_mfcc = None
    for data in segments:  # loop index was unused; iterate values directly
        accumulate_data = np.array([])
        for index in range(0, len(data), segment_length):
            data_slice = data[index:index + segment_length]
            mfcc = svt.extract_mfcc(data_slice)
            if last_slice_mfcc is None:
                # Sentinel for the very first chunk: forces the else-branch
                # below so accumulation starts from this chunk.
                distance = -1
            else:
                distance = svt.compute_distance(mfcc, last_slice_mfcc)
            last_slice_mfcc = mfcc
            if 0 < distance < threshold:
                # Same voice: keep accumulating.
                accumulate_data = np.append(accumulate_data, data_slice)
            else:
                # Different voice (or first chunk): flush and restart.
                if accumulate_data.shape[0] > 0:
                    voice_segments.append(accumulate_data)
                accumulate_data = data_slice
        # Flush whatever remained of the last run.
        if accumulate_data.shape[0] > 0:
            voice_segments.append(accumulate_data)
    return voice_segments
def test_extract_mfcc(self):
    """extract_mfcc on the fixture audio must reproduce the stored MFCCs."""
    mfcc_test = extract_mfcc(self.file1, self.sr1)
    # BUG FIX: the old assertion compared `a.all()` with `b.all()` — two
    # scalar booleans — so it passed for almost any pair of arrays.
    # Compare the arrays element-wise instead.
    self.assertTrue((mfcc_test == self.mfcc1).all())
def read_all_gmms():
    """Record a short utterance, score it against every trained GMM and
    report whether a known speaker was recognized.

    Records 3 seconds from the microphone, writes a temporary wav, extracts
    the same scaled MFCC + delta features used at training time, then loads
    every ``.gmm`` model from ./Trainer and picks the one with the highest
    summed log-likelihood. The temporary recording is removed afterwards.

    :returns: tuple ``(find, user_name)`` — ``find`` is True when the best
        score passes the acceptance threshold, ``user_name`` is the speaker
        name parsed from the winning model's filename ("model<name>.gmm").
    """
    # Voice-recognition state.
    models = []
    speakers = []
    find = False
    frequency_sample = 44100
    seconds = 3

    print("Avvio riconoscimento vocale: parla\n")
    myrecording = sd.rec(int(seconds * frequency_sample),
                         samplerate=frequency_sample, channels=2)
    sd.wait()
    write('Registrazioni/input' + str(1000) + '.wav', frequency_sample, myrecording)

    # librosa resamples to 16 kHz mono regardless of the recording rate.
    data, sr = librosa.load('Registrazioni/input' + str(1000) + '.wav',
                            sr=16000, mono=True)
    # BUG FIX: normalize the cutoff by the Nyquist of the *loaded* signal
    # (sr/2 = 8 kHz), matching the training pipeline in voice_model().
    # Previously 44100 was used even though the data had been resampled
    # to 16 kHz, producing a different filter at recognition time.
    nyq = 0.5 * sr
    cutoff = 250
    normal_cutoff = cutoff / nyq
    numerator, denominator = sg.butter(1, normal_cutoff, 'low')
    # Zero-phase low-pass filtering, then silence removal.
    data = sg.filtfilt(numerator, denominator, data)
    data = svt.rms_silence_filter(data)

    mfcc = svt.extract_mfcc(data, sr, winlen=0.025, winstep=0.01)
    # Standardize the dataset with the standard scaler.
    mfcc = preprocessing.scale(mfcc)
    delta = librosa.feature.delta(mfcc)
    combined = np.hstack((mfcc, delta))

    user_name = ""
    basepath = "./Trainer"
    for entry in os.listdir(basepath):
        if os.path.isfile(os.path.join(basepath, entry)) and entry.endswith(".gmm"):
            # NOTE: pickle is only safe on trusted, locally produced models.
            # BUG FIX: close the file handle (was a leaked open()).
            with open(os.path.join(basepath, entry), 'rb') as gmm_file:
                gmm_readed = pickle.load(gmm_file)
            speakers.append(entry)
            models.append(gmm_readed)

    # Per-model log-likelihood of the recorded features:
    # GaussianMixture.score returns the average per-sample log-likelihood.
    log_likelihood = np.zeros(len(models))
    for i in range(len(models)):
        gmm = models[i]
        scores = np.array(gmm.score(combined))
        # Sum the likelihood values for each sample.
        log_likelihood[i] = scores.sum()

    # BUG FIX: guard against an empty ./Trainer directory, where
    # np.argmax on an empty array would raise ValueError.
    if len(models) > 0 and round(log_likelihood[(winner := np.argmax(log_likelihood))]) >= 59:
        # Acceptance threshold (59) determined empirically — TODO confirm.
        print("Trovato\n")
        print(scale(log_likelihood))
        print(speakers[winner])
        # Filename shape is "model<name>.gmm": strip the 5-char prefix
        # and the 4-char extension.
        user_name = str(speakers[winner][5:-4])
        find = True
    else:
        print("Non trovato\n")
        find = False

    # Clean up the temporary recording.
    if os.path.exists("./Registrazioni/input1000.wav"):
        os.remove("./Registrazioni/input1000.wav")

    print(f"Nome audio: {user_name}")
    return find, user_name