def process_batch(path, csv_path, checkpoint_tag='last', output_tag=None):
    # Process a batch of reconstructed utterances.
    batch_output_root = os.path.join(RECONSTRUCTION_ROOT, path)
    reconstructed_paths = glob.glob(
        os.path.join(batch_output_root, '**', 'checkpoint-last.pkl'))
    batch_ids_list = [p.split('/')[-2] for p in reconstructed_paths]
    batch_ids_list.sort()
    for batch_id in tqdm(batch_ids_list, desc="Eval batch"):
        fn = 'speaker_id_all' if args.all_speakers else 'speaker_id_100'
        if output_tag is not None:
            fn += '_%s' % output_tag
        if checkpoint_tag == 'last':
            npy_path = os.path.join(batch_output_root, batch_id, '%s.npy' % fn)
        else:
            npy_path = os.path.join(batch_output_root, batch_id,
                                    '%s-%s.npy' % (fn, checkpoint_tag))
        print("Outputting to %s" % npy_path)
        if os.path.exists(npy_path) and not args.recompute:
            continue

        # Read the utterance list for this batch from its CSV.
        input_path = os.path.join(csv_path, batch_id + '.csv')
        utts = open(input_path).read().strip().split('\n')[1:]
        utt_ids = [os.path.basename(u.split(',')[0])[:-4] for u in utts]
        speaker_ids = [u.split('-')[0] for u in utt_ids]
        utt_ids = [u.split('-') for u in utt_ids]
        utt_ids = ["%s_%s-%s" % tuple(u) for u in utt_ids]

        y_pred = np.zeros(shape=(len(speaker_ids), num_samples))
        org_y_pred = np.zeros(shape=(len(speaker_ids), num_samples))
        for i in tqdm(range(len(speaker_ids))):
            pkl_fn = os.path.join(batch_output_root, batch_id, 'checkpoint-last.pkl')
            # original_utt = read_mfcc_from_pkl(os.path.join(batch_output_root, batch_id, '%s_samples.pkl' % batch_id), i, idx=1)
            original_utt = read_mfcc_from_pkl(
                os.path.join(batch_output_root, batch_id, 'samples.pkl'), i, idx=1)
            input_data = get_all_speakers_batch(
                speaker_ids[i], utt_ids[i], read_mfcc_from_pkl(pkl_fn, i),
                original_utt) if args.all_speakers else get_batch(speaker_ids[i], pkl_fn)
            predictions = model.m.predict(input_data, batch_size=100)
            reconstructed_embedding = predictions[0]
            anchor_embedding = predictions[1]
            for j, other_than_anchor_embedding in enumerate(
                    predictions[2:]):  # positive + negatives
                y_pred[i][j] = batch_cosine_similarity(
                    [reconstructed_embedding], [other_than_anchor_embedding])[0]
                org_y_pred[i][j] = batch_cosine_similarity(
                    [anchor_embedding], [other_than_anchor_embedding])[0]
            tqdm.write(str(np.argsort(y_pred[i])[-5:]) + "\t" +
                       str(np.argsort(org_y_pred[i])[-5:]))
        np.save(npy_path, [y_pred, org_y_pred])
def get_batch(self, batch_size, is_test=False, predict=None):
    if predict is None:
        predict = self.model.m.predict
    from test import batch_cosine_similarity
    num_triplets = batch_size // 3
    inputs = []
    k = 2  # do not change this.
    for speaker in self.speakers_list:
        inputs.append(self.select_speaker_data(speaker, n=k, is_test=is_test))
    inputs = np.array(inputs)  # num_speakers * [k, num_frames, num_fbanks, 1].
    embeddings = predict(np.vstack(inputs))
    assert embeddings.shape[-1] == 512
    # (speaker, utterance, 512)
    embeddings = np.reshape(embeddings, (len(self.speakers_list), k, 512))
    cs = batch_cosine_similarity(embeddings[:, 0], embeddings[:, 1])
    arg_sort = np.argsort(cs)
    assert len(arg_sort) > num_triplets
    anchor_speakers = arg_sort[0:num_triplets]
    anchor_embeddings = embeddings[anchor_speakers, 0]
    negative_speakers = sorted(set(self.speakers_list) - set(anchor_speakers))
    negative_embeddings = embeddings[negative_speakers, 0]
    selected_negative_speakers = []
    for anchor_embedding in anchor_embeddings:
        cs_negative = [batch_cosine_similarity([anchor_embedding], neg)
                       for neg in negative_embeddings]
        selected_negative_speakers.append(negative_speakers[int(np.argmax(cs_negative))])

    # anchor with frame 0.
    # positive with frame 1.
    # negative with frame 0.
    assert len(set(selected_negative_speakers).intersection(anchor_speakers)) == 0
    negative = inputs[selected_negative_speakers, 0]
    positive = inputs[anchor_speakers, 1]
    anchor = inputs[anchor_speakers, 0]
    batch_x = np.vstack([anchor, positive, negative])
    batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
    return batch_x, batch_y
def main():
    model = DeepSpeakerModel()
    model.m.load_weights(
        '/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/checkpoints-triplets/ResCNN_triplet_training_checkpoint_265.h5',
        by_name=True)
    # mfcc_001 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5.wav', SAMPLE_RATE), NUM_FRAMES)
    # mfcc_002 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5-2.wav', SAMPLE_RATE), NUM_FRAMES)
    # predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
    # predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))
    # mfcc_003 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/6-M-45/6.wav', SAMPLE_RATE), NUM_FRAMES)
    # predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))
    # print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
    # print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))

    features = []
    labels = []
    for x in range(10):
        mfcc1, mfcc2, label = load_data()
        feature1 = model.m.predict(np.expand_dims(mfcc1, axis=0))
        feature2 = model.m.predict(np.expand_dims(mfcc2, axis=0))
        cost = batch_cosine_similarity(feature1, feature2)
        # print(cost)
        features.append(cost[0])
        labels.append(label)
        # print(cost.shape)

    # Load 2 random files + a label, predict, then feed the scores into an SVM
    # (uses the triplet-trained model).
    # features = feature1 + feature2
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    features = np.array(features).reshape(-1, 1)  # SVC expects a 2D feature matrix.
    labels = np.array(labels)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(features, labels)
    with open('svm.pkl', 'wb') as svm_pickle:
        pickle.dump(clf, svm_pickle)
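# A minimal sketch of reusing the SVM that main() pickles, assuming the same
# single-feature layout (one cosine-similarity score per utterance pair):
import pickle
import numpy as np

with open('svm.pkl', 'rb') as f:
    clf = pickle.load(f)
score = 0.7  # hypothetical cosine similarity between two embeddings
print(clf.predict(np.array([[score]])))  # predicted same/different-speaker label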
def find_statistics(pred_tensor, basepath):
    samples = find_files(basepath, 'npy')
    base = dict()  # {speaker: [cosine sum, number of utterances, max cosine]}
    for sample in samples:
        ensure_dir_for_filename(sample)
        sp = sample.split('/')[-2]
        # print(sp)
        if sp not in base:
            base[sp] = [0, 0, -1]
        samp_tensor = load_npy(sample)
        cos = batch_cosine_similarity(pred_tensor, samp_tensor)
        base[sp][0] += cos
        base[sp][1] += 1
        if cos > base[sp][2]:
            base[sp][2] = cos
    res = dict()  # {speaker: [average cosine, max cosine]}
    for key in base:
        average = base[key][0] / base[key][1]
        res[key] = [average, base[key][2]]
    return res
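# A minimal usage sketch for find_statistics, assuming `pred_tensor` is the
# (1, 512) embedding of a probe utterance and `basepath` contains one folder
# of precomputed .npy embeddings per speaker (the path below is hypothetical):
stats = find_statistics(pred_tensor, 'embeddings/')
# Pick the speaker whose stored utterances score the highest average cosine.
best_speaker = max(stats, key=lambda sp: stats[sp][0])
print(best_speaker, stats[best_speaker])  # [average cosine, max cosine]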
def play():
    text = None
    out_file = r"D:/Projects/Internship/samtest/file_out.wav"
    rootdir = os.path.join(os.getcwd(), 'samples')
    attendance_file_path = os.path.join(os.getcwd(), 'Attendance_data\out.csv')

    def print_data(info):
        with open(r'\Attendance_data\out.csv', 'r') as handle:
            unserialized_data = csv.reader(handle)
            print(info, unserialized_data)

    # If the attendance data doesn't exist yet, create it.
    if not os.path.exists(attendance_file_path) and not os.path.isfile(attendance_file_path):
        if not os.path.exists('Attendance_data'):
            os.makedirs('Attendance_data')
        d = {
            'Date': [],
            'EmpName': [],
            'EmpID': [],
            'In': [],
            'Out': [],
            'Duration': [],
            'Attendance': []
        }
        df = pd.DataFrame(data=d)
        print('\nCreating New Attendance DataFrame : ')
        print(df)
        df.to_csv(r'Attendance_data\out.csv', index=False)
        # print_data('Data is created : \n')
        # compression_opts = dict(method='zip', archive_name='out.csv')
        # df.to_csv('out.zip', index=False, compression=compression_opts)

    names = []
    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            names.append(dir_name)

    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKCYAN = '\033[96m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    def pyttsx3(text):
        # Obtain the voice property; voice id 1 is female, 0 is male.
        voices = engine.getProperty('voices')
        engine.setProperty('voice', voices[1].id)
        # Convert to audio and play.
        engine.say(text)
        engine.runAndWait()

    print(bcolors.OKGREEN +
          "\n\nWelcome to Attendance System based on Speaker Recognition."
          "\n\nRules are simple, say your name and roll num and the attendance will be updated.\n")
    pyttsx3("Welcome to Attendance System based on Speaker Recognition. "
            "Rules are simple, say your name and roll num and the attendance will be updated. "
            "Warning: Don't try to give proxy")
    print(bcolors.WARNING + "Warning: Don't try to give proxy" + bcolors.ENDC + "\n")

    audio = pyaudio.PyAudio()
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 12

    # Start recording.
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                        input=True, frames_per_buffer=CHUNK)
    r = sr.Recognizer()
    print("Speak something...\n")
    pyttsx3("The recording has started, please say Hello ewarn, along with your name "
            "and employee ID and if you are signing in or out")
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    pyttsx3("The recording has completed, and now your information will be updated. "
            "Please be patient, and if you feel there is an error kindly contact the administrator")
    print("Recording saved\n")

    # Stop recording.
    stream.stop_stream()
    stream.close()
    audio.terminate()

    waveFile = wave.open(out_file, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
    waveFile.close()

    with sr.AudioFile(out_file) as source:
        # print("Say something!")
        audio = r.record(source)  # read the entire audio file
    try:
        # For testing purposes, we're just using the default API key.
        # To use another API key, use
        # `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`.
        # print("Did you say? " + r.recognize_google(audio))
        text = r.recognize_google(audio)
    except sr.UnknownValueError:
        print("eWarn could not understand audio")

    if text is None or "hello" not in text:
        print("Trigger word missing, Please try again")
        pyttsx3("Trigger word missing, Please try again")
        exit(0)

    # Reproducible results.
    np.random.seed(123)
    random.seed(123)

    # Define the model here.
    model = DeepSpeakerModel()
    # Load the checkpoint.
    model.m.load_weights('Model.h5', by_name=True)

    mfcc_005 = sample_from_mfcc(read_mfcc(out_file, SAMPLE_RATE), NUM_FRAMES)
    # Call the model to get the embedding of shape (1, 512) for the recording.
    predict_005 = model.m.predict(np.expand_dims(mfcc_005, axis=0))

    # names = []
    select = dict()
    from statistics import mean
    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            # names.append(dir_name)
            # print('person dir : ', dir_name)
            # print('person dir files : \n', os.listdir(os.path.join(rootdir, dir_name)))
            select_list = list()
            for file_name in os.listdir(os.path.join(rootdir, dir_name)):
                # print('person dir files seperate : \n', os.path.join(rootdir, dir_name, file_name))
                mfcc_001 = sample_from_mfcc(
                    read_mfcc(os.path.join(rootdir, dir_name, file_name), SAMPLE_RATE),
                    NUM_FRAMES)
                predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
                select_list.append(batch_cosine_similarity(predict_005, predict_001)[0])
            # print(select_list)
            select[dir_name] = mean(select_list)
            select_list.clear()

    # print('Names : ', names)
    print('\nPredictions :', select)
    Keymax = max(select, key=select.get)
    if select[Keymax] >= 0.5:
        print('The Speaker is: ', Keymax.split('+')[0])
        pyttsx3('The Speaker is ' + str(Keymax.split('+')[0]))
        time_in = None
        time_out = None
        # Columns: 'Date', 'EmpName', 'EmpID', 'In', 'Out', 'Duration', 'Attendance'.
        if text.lower().split().count('in') == 1:
            # print('text has in', text)
            time_in = datetime.datetime.now()
            print("Current time for in:-", time_in)
            df_in = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            temp_in = {'Date': datetime.datetime.date(time_in),
                       'EmpName': Keymax.split('+')[0],
                       'EmpID': Keymax.split('+')[1],
                       'In': time_in, 'Out': 'zero',
                       'Duration': 'zero', 'Attendance': 'zero'}
            temp_df = pd.DataFrame(temp_in, index=[0])
            # print("temp_in", temp_in)
            # print("temp_df", temp_df)
            if not df_in.empty:
                print('DataFrame is not empty!')
                print('\n\nIN Before Update\n', df_in)
                df3 = pd.concat([df_in, temp_df], ignore_index=True)
                df3.reset_index()
                df3.to_csv(r'Attendance_data\out.csv', index=False)
                print('\n\ndf3\n', df3.tail(5))
            if df_in.empty:
                print('DataFrame is empty!')
                temp_df.to_csv(r'Attendance_data\out.csv', index=False)
                print('After IN Update', temp_df)
            exit(0)
        if text.lower().split().count('out') == 1:
            # print('Text has out')
            df_out = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            time_out = datetime.datetime.now()
            print("Current time for out:-", time_out)
            today = pd.to_datetime(datetime.datetime.date(datetime.datetime.now()))
            row_mask = ((df_out['Date'] == today)
                        & (df_out['EmpName'] == Keymax.split('+')[0])
                        & (df_out['EmpID'] == int(Keymax.split('+')[1])))
            in1 = df_out.loc[row_mask, 'In']
            # print(in1)
            df_out.loc[row_mask, 'Out'] = time_out
            out1 = df_out.loc[row_mask, 'Out']
            # print(out1)
            delta = pd.to_datetime(out1) - pd.to_datetime(in1)
            # print(delta)
            df_out.loc[row_mask, 'Duration'] = delta
            yesterday = pd.to_datetime(
                datetime.datetime.date(datetime.datetime.now() - datetime.timedelta(days=1)))
            day1 = df_out['Attendance'].loc[(df_out['Date'] == yesterday)
                                            & (df_out['EmpName'] == Keymax.split('+')[0])
                                            & (df_out['EmpID'] == int(Keymax.split('+')[1]))]
            # print(day1.empty)
            if day1.empty:
                df_out.loc[row_mask, 'Attendance'] = 1
            else:
                df_out.loc[row_mask, 'Attendance'] = int(day1.iloc[0]) + 1
            df_out.to_csv(r'Attendance_data\out.csv', index=False)
            print(df_out.tail(5))
            exit(0)
    else:
        print("Don't try to give proxy")
        pyttsx3("Don't try to give proxy")
        exit(0)
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

np.random.seed(123)
random.seed(123)

model = DeepSpeakerModel()
model.m.load_weights(
    '/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5',
    by_name=True)

mfcc_001 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES)
mfcc_002 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE), NUM_FRAMES)

predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

mfcc_003 = sample_from_mfcc(
    read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
def distinguish(file_path, voice_root_path):
    try:
        np.random.seed(123)
        random.seed(123)
        interpreter = tf.lite.Interpreter(
            model_path=join(dirname(__file__), 'model.tflite'))
        interpreter.allocate_tensors()
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        if os.path.exists(file_path):
            # Embed the test utterance with the TFLite interpreter.
            test_audio = sample_from_mfcc(read_mfcc(file_path, SAMPLE_RATE), NUM_FRAMES)
            test_audio = test_audio.astype(np.float32)
            interpreter.set_tensor(input_details[0]['index'],
                                   np.expand_dims(test_audio, axis=0))
            interpreter.invoke()
            test_predict = interpreter.get_tensor(output_details[0]['index'])

            # Prefer precomputed embeddings saved as .npy files.
            all_audio = []
            for root, dirs, files in os.walk(join(voice_root_path, 'wave_numpy')):
                root = root.replace('\\', '/')
                for file in files:
                    if file.endswith('npy'):
                        all_audio.append((root + '/' + file, np.load(root + '/' + file)))
            if len(all_audio) > 0:
                print('use exist numpy')
                result = []
                for audio in all_audio:
                    result.append((audio[0],
                                   batch_cosine_similarity(test_predict, audio[1])))
            else:
                print('use original corpus')
                all_addr = []
                for root, dirs, files in os.walk(join(voice_root_path, 'wave_original')):
                    root = root.replace('\\', '/')
                    for file in files:
                        if file.endswith('flac') or file.endswith('wav'):
                            all_addr.append(root + '/' + file)
                audio_all = []
                for addr in all_addr:
                    audio = sample_from_mfcc(read_mfcc(addr, SAMPLE_RATE), NUM_FRAMES)
                    audio = audio.astype(np.float32)
                    interpreter.set_tensor(input_details[0]['index'],
                                           np.expand_dims(audio, axis=0))
                    interpreter.invoke()
                    predict_one = interpreter.get_tensor(output_details[0]['index'])
                    audio_all.append((addr, predict_one))
                result = []
                for audio in audio_all:
                    result.append((audio[0],
                                   batch_cosine_similarity(test_predict, audio[1])))

            # Track the best (similarity, path) pair.
            cos_max = (result[0][1], result[0][0])
            for i, print_out in enumerate(result):
                if print_out[1] > cos_max[0][0]:
                    cos_max = (print_out[1], print_out[0])
            if cos_max[0] > 0.60:
                return out_format(cos_max[1], cos_max[0].item())
            else:
                return 'dont exist', 0
        else:
            return 'no wave input', 0
    except Exception:
        return 'error', 0
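# distinguish() assumes a pretrained 'model.tflite' already exists on disk. A
# minimal conversion sketch, assuming the Keras DeepSpeakerModel and a
# checkpoint file as in the other snippets (the paths are illustrative):
import tensorflow as tf
from conv_models import DeepSpeakerModel

model = DeepSpeakerModel()
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)
converter = tf.lite.TFLiteConverter.from_keras_model(model.m)
tflite_model = converter.convert()
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)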
by_name=True)

# Sample some inputs for WAV/FLAC files for the same speaker.
# To have reproducible results every time you call this function, set the seed every time before calling it.
# np.random.seed(123)
# random.seed(123)

s1 = 'samples/LibriSpeechSamples/27/124992/27-124992-0022.wav'
s2 = 'samples/LibriSpeechSamples/27/124992/27-124992-0000.wav'
s3 = 'samples/LibriSpeechSamples/26/496/26-496-0026.wav'

mfcc_001 = sample_from_mfcc(read_mfcc(s1, SAMPLE_RATE), NUM_FRAMES)
mfcc_002 = sample_from_mfcc(read_mfcc(s2, SAMPLE_RATE), NUM_FRAMES)

# Call the model to get the embeddings of shape (1, 512) for each file.
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

# Do it again with a different speaker.
mfcc_003 = sample_from_mfcc(read_mfcc(s3, SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

# Compute the cosine similarity and check that it is higher for the same speaker.
print(s1, ' & ', s2)
print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))  # SAME SPEAKER [0.81564593]
print(s1, ' & ', s3)
print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))  # DIFF SPEAKER [0.1419204]
# Load the checkpoint.
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

# Sample some inputs for WAV/FLAC files for the same speaker.
# To have reproducible results every time you call this function, set the seed every time before calling it.
# np.random.seed(123)
# random.seed(123)

mfcc_001 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES)
mfcc_002 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE), NUM_FRAMES)

# Call the model to get the embeddings of shape (1, 512) for each file.
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

# Do it again with a different speaker.
mfcc_003 = sample_from_mfcc(
    read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

# Compute the cosine similarity and check that it is higher for the same speaker.
same_speaker_similarity = batch_cosine_similarity(predict_001, predict_002)
diff_speaker_similarity = batch_cosine_similarity(predict_001, predict_003)
print('SAME SPEAKER', same_speaker_similarity)  # SAME SPEAKER [0.81564593]
print('DIFF SPEAKER', diff_speaker_similarity)  # DIFF SPEAKER [0.1419204]
assert same_speaker_similarity > diff_speaker_similarity
        temp_speaker.append(f[j])
    # First utterance of each speaker is enrolled; the rest are test data.
    for k in range(len(temp_speaker)):
        if k == 0:
            enroll_speaker.append(temp_speaker[k])
            speakerID_enroll.append(i)
        else:
            test_speaker.append(temp_speaker[k])
            speakerID_test.append(i)

count = 0
for i in range(len(test_speaker)):
    mfcc_test = sample_from_mfcc(read_mfcc(test_speaker[i], SAMPLE_RATE), NUM_FRAMES)
    predict_002 = model.predict(np.expand_dims(mfcc_test, axis=0))
    print(predict_002.shape)
    max_score = -10 ** 8
    pred_speaker = None
    true_speaker = speakerID_test[i]
    for j in range(len(enroll_speaker)):
        mfcc_enroll = sample_from_mfcc(read_mfcc(enroll_speaker[j], SAMPLE_RATE), NUM_FRAMES)
        predict_001 = model.predict(np.expand_dims(mfcc_enroll, axis=0))
        score = batch_cosine_similarity(predict_001, predict_002)
        if score > max_score:
            max_score = score
            pred_speaker = speakerID_enroll[j]
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n"
          % (true_speaker, pred_speaker, true_speaker == pred_speaker))
    if pred_speaker == true_speaker:
        count += 1

print('accuracy: ', count / len(test_speaker))
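# The loop above recomputes every enrollment embedding for each test
# utterance. A minimal caching sketch, assuming the same `model`,
# `enroll_speaker` list, and deep-speaker helpers used above:
import numpy as np
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES

enroll_embeddings = []
for path in enroll_speaker:
    mfcc = sample_from_mfcc(read_mfcc(path, SAMPLE_RATE), NUM_FRAMES)
    enroll_embeddings.append(model.predict(np.expand_dims(mfcc, axis=0)))
# Each test utterance is then scored against the cached embeddings only.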
def get_batch_train(self, batch_size):
    from test import batch_cosine_similarity
    s1 = time()
    self.batch_count += 1
    if self.batch_count % self.history_every == 0:
        self.update_triplets_history()
    all_indexes = range(len(self.history_embeddings_train))
    anchor_indexes = np.random.choice(a=all_indexes, size=batch_size // 3, replace=False)
    s2 = time()

    similar_negative_indexes = []
    dissimilar_positive_indexes = []
    # could be made parallel.
    for anchor_index in anchor_indexes:
        s21 = time()
        anchor_embedding = self.history_embeddings[anchor_index]
        anchor_speaker = extract_speaker(self.history_utterances[anchor_index])

        # why self.nb_speakers // 2? just random. because it is fast. otherwise it's too much.
        negative_indexes = [j for (j, a) in enumerate(self.history_utterances)
                            if extract_speaker(a) != anchor_speaker]
        negative_indexes = np.random.choice(negative_indexes, size=self.nb_speakers // 2)
        s22 = time()

        anchor_embedding_tile = [anchor_embedding] * len(negative_indexes)
        anchor_cos = batch_cosine_similarity(anchor_embedding_tile,
                                             self.history_embeddings[negative_indexes])
        s23 = time()

        similar_negative_index = negative_indexes[np.argsort(anchor_cos)[-1]]  # [-1:]
        similar_negative_indexes.append(similar_negative_index)
        s24 = time()

        positive_indexes = [j for (j, a) in enumerate(self.history_utterances)
                            if extract_speaker(a) == anchor_speaker and j != anchor_index]
        s25 = time()
        anchor_embedding_tile = [anchor_embedding] * len(positive_indexes)
        s26 = time()
        anchor_cos = batch_cosine_similarity(anchor_embedding_tile,
                                             self.history_embeddings[positive_indexes])
        dissimilar_positive_index = positive_indexes[np.argsort(anchor_cos)[0]]  # [:1]
        dissimilar_positive_indexes.append(dissimilar_positive_index)
        s27 = time()

    s3 = time()
    batch_x = np.vstack([
        self.history_model_inputs[anchor_indexes],
        self.history_model_inputs[dissimilar_positive_indexes],
        self.history_model_inputs[similar_negative_indexes]
    ])
    s4 = time()

    # for anchor, positive, negative in zip(history_utterances[anchor_indexes],
    #                                       history_utterances[dissimilar_positive_indexes],
    #                                       history_utterances[similar_negative_indexes]):
    #     print('anchor', os.path.basename(anchor),
    #           'positive', os.path.basename(positive),
    #           'negative', os.path.basename(negative))
    # print('_' * 80)

    # assert utterances as well positive != anchor.
    anchor_speakers = [extract_speaker(a) for a in self.history_utterances[anchor_indexes]]
    positive_speakers = [extract_speaker(a) for a in self.history_utterances[dissimilar_positive_indexes]]
    negative_speakers = [extract_speaker(a) for a in self.history_utterances[similar_negative_indexes]]

    assert len(anchor_indexes) == len(dissimilar_positive_indexes)
    assert len(similar_negative_indexes) == len(dissimilar_positive_indexes)
    assert list(self.history_utterances[dissimilar_positive_indexes]) != list(
        self.history_utterances[anchor_indexes])
    assert anchor_speakers == positive_speakers
    assert negative_speakers != anchor_speakers

    batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.

    for a in anchor_speakers:
        self.metadata_train_speakers[a] += 1
    for a in positive_speakers:
        self.metadata_train_speakers[a] += 1
    for a in negative_speakers:
        self.metadata_train_speakers[a] += 1
    s5 = time()

    # print('1-2', s2 - s1)
    # print('2-3', s3 - s2)
    # print('3-4', s4 - s3)
    # print('4-5', s5 - s4)
    # print('21-22', (s22 - s21) * (batch_size // 3))
    # print('22-23', (s23 - s22) * (batch_size // 3))
    # print('23-24', (s24 - s23) * (batch_size // 3))
    # print('24-25', (s25 - s24) * (batch_size // 3))
    # print('25-26', (s26 - s25) * (batch_size // 3))
    # print('26-27', (s27 - s26) * (batch_size // 3))
    return batch_x, batch_y
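# A toy illustration of the hard-example selection above: among candidate
# negatives, pick the one *most* similar to the anchor; among positives, the
# one *least* similar. Plain numpy, no project code assumed.
import numpy as np

anchor = np.random.rand(512)
anchor /= np.linalg.norm(anchor)
candidates = np.random.rand(10, 512)
candidates /= np.linalg.norm(candidates, axis=1, keepdims=True)
cos = candidates @ anchor  # cosine similarities; embeddings are unit-norm
hardest_negative = int(np.argmax(cos))  # most anchor-like candidate
hardest_positive = int(np.argmin(cos))  # least anchor-like candidate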
# Define the model here.
# model = DeepSpeakerModel()
# Load the checkpoint.
# model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

df = pd.read_csv('../dataset_3_consolidado/todos.csv')
resultados = []
for i, linha in df.iterrows():
    for j in range(i, df.last_valid_index()):
        linha2 = df.loc[j]
        embeddings_1 = np.load('../' + linha['path'].replace('.wav', '.npy'))
        embeddings_2 = np.load('../' + linha2['path'].replace('.wav', '.npy'))
        score = batch_cosine_similarity(embeddings_1, embeddings_2)
        resultados.append([linha['voz'], linha['classe'],
                           linha2['voz'], linha2['classe'], score])

df_resultado = pd.DataFrame(resultados,
                            columns=['voz-1', 'classe-1', 'voz-2', 'classe-2', 'score'])
df_resultado.to_csv('resultado.csv', index=False)

# mfcc_001 = sample_from_mfcc(read_mfcc('../data/NORMAL/GM_NORMAL/GM6_NORMAL.wav', SAMPLE_RATE), NUM_FRAMES)
# mfcc_002 = sample_from_mfcc(read_mfcc('../data/DISFARCE/GM_DISFARCE/GM6_DISFARCE.wav', SAMPLE_RATE), NUM_FRAMES)
# # Call the model to get the embeddings of shape (1, 512) for each file.
# predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
# predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))
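# A minimal sketch of how the .npy embeddings loaded above could have been
# precomputed with deep-speaker (checkpoint name and helpers assumed from the
# other snippets; `df` is the todos.csv frame defined above):
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel

model = DeepSpeakerModel()
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)
for _, linha in df.iterrows():
    wav_path = '../' + linha['path']
    mfcc = sample_from_mfcc(read_mfcc(wav_path, SAMPLE_RATE), NUM_FRAMES)
    embedding = model.m.predict(np.expand_dims(mfcc, axis=0))
    np.save(wav_path.replace('.wav', '.npy'), embedding)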
def get_score(embeds):
    return batch_cosine_similarity(embeds[0], embeds[1])
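# All of the snippets here score pairs with deep-speaker's
# batch_cosine_similarity. Assuming the model outputs L2-normalized
# embeddings, cosine similarity reduces to a row-wise dot product; a minimal
# equivalent sketch:
import numpy as np

def batch_cosine_similarity_sketch(x1, x2):
    # Row-wise dot product of two (batch, 512) arrays -> (batch,) scores.
    return np.sum(np.multiply(x1, x2), axis=1)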
def process_single_utts(path, checkpoint_tag='last', output_tag=None):
    # Process single utterances.
    root = os.path.join(RECONSTRUCTION_ROOT, path) if path else RECONSTRUCTION_ROOT
    reconstructed_paths = glob.glob(
        os.path.join(root, '**', 'checkpoint-%s.pkl' % checkpoint_tag))
    ids_list = [p.split('/')[-2] for p in reconstructed_paths]
    ids_list.sort()
    for i, utt_id in tqdm(list(enumerate(ids_list)), desc='Eval'):
        fn = 'speaker_id_all' if args.all_speakers else 'speaker_id_100'
        if output_tag is not None:
            fn += '_%s' % output_tag
        if checkpoint_tag == 'last':
            npy_path = os.path.join(root, utt_id, '%s.npy' % fn)
        else:
            npy_path = os.path.join(root, utt_id, '%s-%s.npy' % (fn, checkpoint_tag))
        print("Outputting to %s" % npy_path)
        if os.path.exists(npy_path) and not args.recompute:
            continue

        ids = re.split('-|_', utt_id)
        y_pred = np.zeros(shape=num_samples)
        org_y_pred = np.zeros(shape=num_samples)
        # fn = sorted(list(glob.glob('/home/trungvd/repos/speech-reconstruction/outputs/librispeech/' + '-'.join(ids) + '/checkpoint-*.pkl')))[-1]
        fn = os.path.join(root, utt_id, 'checkpoint-%s.pkl' % checkpoint_tag)
        reconstructed_utt = read_mfcc_from_pkl(fn)
        original_utt = read_mfcc_from_pkl(
            os.path.join(RECONSTRUCTION_ROOT, "outputs", "librispeech", utt_id,
                         '%s_samples.pkl' % utt_id), 0, idx=1)
        # original_utt = read_mfcc_from_pkl(os.path.join(RECONSTRUCTION_ROOT, "outputs", "librispeech", utt_id, 'samples.pkl'), 0, idx=1)
        input_data = get_all_speakers_batch(
            ids[0], utt_id, reconstructed_utt,
            original_utt) if args.all_speakers else get_batch(ids[0], fn)
        predictions = model.m.predict(input_data, batch_size=100)
        reconstructed_embeddings = predictions[:1]
        anchor_embedding = predictions[1]
        for j, other_than_anchor_embedding in enumerate(
                predictions[2:]):  # positive + negatives
            y_pred[j] = np.max([
                batch_cosine_similarity([reconstructed_embedding],
                                        [other_than_anchor_embedding])[0]
                for reconstructed_embedding in reconstructed_embeddings
            ], 0)
            org_y_pred[j] = batch_cosine_similarity(
                [anchor_embedding], [other_than_anchor_embedding])[0]

        normalize = lambda x: (x - np.mean(x)) / np.var(x)
        tqdm.write('\t'.join([
            utt_id,
            "pred: " + str(np.argsort(y_pred)[-5:]),
            str(y_pred[np.argsort(y_pred)[-5:]]),
            "org: " + str(np.argsort(org_y_pred)[-5:]),
            str(org_y_pred[np.argsort(org_y_pred)[-5:]]),
            # "mae: " + str(np.average(np.abs(normalize(reconstructed_utt) - normalize(original_utt))))
        ]))
        np.save(npy_path, [y_pred, org_y_pred])