def iden(testfile, fa_data_dir, iden_model, max_sec, step_sec, frame_step):
    """Run closed-set speaker identification over a test list and report accuracy.

    Args:
        testfile: CSV file; each row is ``filename,label`` (label is an int id).
        fa_data_dir: directory prepended to each filename from the list.
        iden_model: path to a saved Keras model, loaded via ``load_model``.
        max_sec, step_sec, frame_step: bucket parameters forwarded to
            ``build_buckets`` for spectrum extraction.

    Returns:
        float: fraction of utterances whose argmax prediction equals the label.
    """
    # Read test file paths and their integer labels.
    print("Use {} for test".format(testfile))
    iden_list = np.loadtxt(testfile, str, delimiter=",")
    labels = np.array([int(i[1]) for i in iden_list])
    voice_list = np.array([os.path.join(fa_data_dir, i[0]) for i in iden_list])

    # Load the trained identification model.  (Typo "form" -> "from" fixed.)
    print("Load model from {}".format(iden_model))
    model = load_model(iden_model)

    print("Start identifying...")
    total_length = len(voice_list)
    p_labels = []
    buckets = build_buckets(max_sec, step_sec, frame_step)
    # Loop variable renamed from `c` to avoid shadowing the config module `c`
    # used elsewhere in this file.
    for count, ID in enumerate(voice_list):
        if count % 1000 == 0:
            print('Finish identifying for {}/{}th wav.'.format(count, total_length))
        specs = get_fft_spectrum(ID, buckets)
        # Model expects a (1, freq, time, 1) batch; squeeze back to a score vector.
        v = np.squeeze(model.predict(specs.reshape(1, *specs.shape, 1)))
        p_labels.append(np.argmax(v))

    # Accuracy as mean of elementwise correctness (was: sum(compare==True)/N,
    # with an unused `res` list alongside).
    acc = float(np.mean(labels == np.array(p_labels)))
    print(acc)
    return acc
def get_fft_features_from_list_file(list_file, max_sec):
    """Load FFT spectra and integer speaker labels from a CSV list file.

    Args:
        list_file: CSV with at least ``filename`` and ``speaker`` columns.
        max_sec: maximum clip length (seconds) used to build the FFT buckets.

    Returns:
        (dataX, dataY): ``dataX`` is a stacked array of shape
        (N, freq, time, 1); ``dataY`` an int array of shape (N,).  Only
        utterances whose time axis equals ``c.MAX_SEC * 100`` are kept.
    """
    buckets = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)
    result = pd.read_csv(list_file, delimiter=",")
    result['features'] = result['filename'].apply(
        lambda x: get_fft_spectrum(x, buckets))

    dataX = []
    dataY = []
    for index, row in result.iterrows():
        y = row['speaker']
        if isinstance(y, str):
            # Speaker ids may carry a textual prefix (e.g. "id00042");
            # strip it before parsing the decimal id.
            y = int(y.replace(c.SPEAKER_PREFIX, ''), 10)
        else:
            y = int(y)
        x = row['features']
        values = x.reshape(*x.shape, 1)  # add trailing channel axis
        # BUG FIX: the label must be appended together with the feature.
        # The original appended dataY unconditionally but dataX only when the
        # shape filter passed, desynchronizing X and Y whenever a clip was
        # dropped.
        if values.shape[1] == c.MAX_SEC * 100:  # keep fixed-length clips only
            dataX.append(values)
            dataY.append(y)
    dataX = np.stack(dataX, axis=0)
    dataY = np.asarray(dataY)
    print("X.shape: {}, Y.shape{}".format(dataX.shape, dataY.shape))
    return dataX, dataY
def score(testfile, fa_data_dir, test_model_path, max_sec, step_sec,
          frame_step, metric):
    """Score a speaker-verification trial list and print the EER.

    Args:
        testfile: whitespace-separated trial list; each row is
            ``label path1 path2`` with label 1 (same speaker) or 0.
        fa_data_dir: directory prepended to both wav paths of each trial.
        test_model_path: path to the saved Keras model.
        max_sec, step_sec, frame_step: bucket parameters for spectrum extraction.
        metric: unused in this body — presumably selects the similarity
            metric; verify against callers.
    """
    print("Use {} for test".format(testfile))
    verify_list = np.loadtxt(testfile, str)
    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(fa_data_dir, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(fa_data_dir, i[2]) for i in verify_list])
    # Deduplicate so every wav file is embedded exactly once.
    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # Load model
    print("Load model form {}".format(test_model_path))
    model = load_model(test_model_path)
    # Take the output of layer 34 (fc7, a 1024-dim feature) as the embedding.
    model = Model(inputs=model.layers[0].input, outputs=model.layers[34].output)

    print("Start testing...")
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    buckets = build_buckets(max_sec, step_sec, frame_step)
    # NOTE(review): loop variable `c` shadows the config module `c` used
    # elsewhere in this file; harmless here but confusing.
    for c, ID in enumerate(unique_list):
        if c % 50 == 0:
            print('Finish extracting features for {}/{}th wav.'.format(
                c, total_length))
        specs = get_fft_spectrum(ID, buckets)
        # predict on a (1, freq, time, 1) batch; raw output shape is kept.
        v = model.predict(specs.reshape(1, *specs.shape, 1))
        feats += [v]
    feats = np.array(feats)

    # Compute a similarity score for each trial pair.
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]
        # feats stacks (1, 1, ...) predictions; [i, 0, 0] strips the two
        # singleton axes to reach the feature vector.
        v1 = feats[ind1, 0, 0]
        v2 = feats[ind2, 0, 0]
        # Inner product as similarity — NOTE(review): this equals cosine
        # similarity only if the fc7 features are L2-normalized; confirm
        # against the model definition.
        scores += [np.sum(v1 * v2)]
        labels += [verify_lb[c]]
    scores = np.array(scores)
    labels = np.array(labels)
    eer = calculate_eer(labels, scores)
    print("EER: {}".format(eer))
def get_embeddings_from_list_file(model, list_file, max_sec):
    """Compute a speaker embedding for each file named in a CSV list.

    Args:
        model: Keras model mapping a (1, freq, time, 1) batch to an embedding.
        list_file: CSV with ``filename`` and ``speaker`` columns.
        max_sec: maximum clip length in seconds for bucket construction.

    Returns:
        DataFrame with columns ``filename``, ``speaker``, ``embedding``.
    """
    buckets = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)
    frame = pd.read_csv(list_file, delimiter=",")

    def _spectrum(path):
        return get_fft_spectrum(path, buckets)

    def _embed(spec):
        # Add batch/channel axes, run the model, then squeeze back to 1-D.
        batch = spec.reshape(1, *spec.shape, 1)
        return np.squeeze(model.predict(batch))

    frame['features'] = frame['filename'].apply(_spectrum)
    frame['embedding'] = frame['features'].apply(_embed)
    return frame[['filename', 'speaker', 'embedding']]
def get_ids_random_feature(self, Id):
    """Pick one random utterance of speaker *Id* and return its FFT spectrum.

    Walks two directory levels under ``self.directory/Id`` (session, then
    utterance), choosing uniformly at random at each level.
    """
    speaker_dir = os.path.join(self.directory, Id)
    session = random.choice(os.listdir(speaker_dir))
    session_dir = os.path.join(speaker_dir, session)
    utterance = random.choice(os.listdir(session_dir))
    utterance_path = os.path.join(session_dir, utterance)
    return get_fft_spectrum(utterance_path, self.buckets)
def get_train_list(path):
    """Read the training CSV and attach spectra plus one-hot labels.

    Adds a ``voice`` column (FFT spectrum per file, paths relative to
    ``c.FA_DIR``) and a ``lable`` column (one-hot label; column name kept
    as-is since downstream code reads it).
    """
    buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)
    train_frame = pd.read_csv(path, delimiter=",")
    print("Preprocessing voice data...")
    train_frame["voice"] = train_frame["filename"].apply(
        lambda fname: get_fft_spectrum(c.FA_DIR + fname, buckets))
    # Speaker ids are 1-based in the CSV; shift to 0-based before encoding.
    train_frame["lable"] = train_frame["speaker"].apply(
        lambda spk: to_one_hot(spk - 1))
    return train_frame
def get_embeddings_from_file(model, file_path, max_sec):
    """Embed one wav file; return a one-row DataFrame (filename, embedding)."""
    buckets = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)
    spectrum = get_fft_spectrum(file_path, buckets)
    # Model consumes a (1, freq, time, 1) tensor; squeeze the output to 1-D.
    embedding = np.squeeze(model.predict(spectrum.reshape(1, *spectrum.shape, 1)))
    return pd.DataFrame({'filename': [file_path], 'embedding': [embedding]})
def get_embeddings_from_list_file(model, list_file, max_sec):
    """Embed every utterance listed in *list_file*.

    Returns a DataFrame with columns ``filename``, ``speaker`` and
    ``embedding`` (one 1-D vector per utterance).
    """
    buckets = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)
    rows = pd.read_csv(list_file, delimiter=",")
    # One FFT spectrum per file, bucketed to at most max_sec of audio.
    rows['features'] = rows['filename'].apply(
        lambda fname: get_fft_spectrum(fname, buckets))
    # Wrap each spectrum into a (1, freq, time, 1) batch, predict, then drop
    # every singleton axis so the stored embedding is a plain vector.
    rows['embedding'] = rows['features'].apply(
        lambda spec: np.squeeze(model.predict(spec.reshape(1, *spec.shape, 1))))
    return rows[['filename', 'speaker', 'embedding']]
def get_ids_features(self, Id, count=np.iinfo(np.int32).max):
    """Collect FFT spectra for speaker *Id*, stopping after *count* files.

    Iterates ``self.__file_iterator__(Id)`` and extracts one spectrum per
    path; at most *count* spectra are returned (default: effectively all).
    """
    collected = []
    for utterance_path in self.__file_iterator__(Id):
        if len(collected) == count:
            break
        collected.append(get_fft_spectrum(utterance_path, self.buckets))
    return collected
def get_embeddings_from_list_file(model, list_file, max_sec):
    """Embed every utterance in *list_file*; paths are relative to ``c.FA_DIR``.

    Returns a DataFrame with ``filename``, ``speaker`` and ``embedding``.
    """
    buckets = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)
    listing = pd.read_csv(list_file, delimiter=",")
    # One spectrum per utterance; note the FA_DIR prefix, unlike the sibling
    # loaders in this file that take absolute paths.
    listing['features'] = listing['filename'].apply(
        lambda rel_path: get_fft_spectrum(c.FA_DIR + rel_path, buckets))

    def to_embedding(spec):
        # Model input is (1, freq, time, 1) per voice; squeeze output to 1-D.
        return np.squeeze(model.predict(spec.reshape(1, *spec.shape, 1)))

    listing['embedding'] = listing['features'].apply(to_embedding)
    return listing[['filename', 'speaker', 'embedding']]
def load_wave(wave_file):
    """Load a wav file as a fixed-width (freq, 300, 1) spectrum.

    Spectra wider than 300 frames are cropped at a random time offset.
    NOTE(review): assumes get_fft_spectrum yields at least 300 frames here;
    a narrower spectrum would make randint fail — confirm upstream guarantee.
    """
    spectrum = get_fft_spectrum(wave_file, build_buckets(10, 1, 0.01))
    width = spectrum.shape[1]
    if width != 300:
        offset = np.random.randint(0, width - 300)
        spectrum = spectrum[:, offset:offset + 300]
    return np.expand_dims(spectrum, -1)
def iden(testfile, fa_data_dir, iden_model, max_sec, step_sec, frame_step,
         dim, batch_size, n_classes, epoch):
    """Evaluate a PyTorch identification model on a test list, one clip at a
    time, logging mean loss/accuracy to a (module-level) ``viz`` dashboard.

    Args:
        testfile: CSV; each row is ``filename,label``.
        fa_data_dir: directory prepended to each filename.
        iden_model: path passed to ``torch.load`` (whole-model checkpoint).
        max_sec, step_sec, frame_step: bucket parameters for the spectra.
        dim, batch_size: unused in this body — presumably kept for signature
            parity with the training loop; verify against callers.
        n_classes: forwarded to the model's forward call.
        epoch: x-coordinate for the visdom-style line plots.
    """
    # Read test data paths and labels.
    print("Use {} for test".format(testfile))
    iden_list = np.loadtxt(testfile, str, delimiter=",")
    voice_list = np.array([os.path.join(fa_data_dir, i[0]) for i in iden_list])
    total_length = len(voice_list)
    # NOTE(review): hard-coded CUDA device — fails on CPU-only hosts.
    device = torch.device('cuda')
    labels = torch.tensor([int(i[1]) for i in iden_list]).to(device)
    criterion = nn.CrossEntropyLoss()

    # Load model
    print("Load model form {}".format(iden_model))
    model = torch.load(iden_model).to(device)
    print("Start identifying...")
    acc = 0
    loss = 0
    model.eval()
    for num, ID in enumerate(voice_list):
        if num % 100 == 0:
            print('Finish identifying for {}/{}th wav.'.format(
                num, total_length))
        # Spectrum -> tensor; .tolist() forces a CPU copy before re-wrapping.
        b_data = torch.tensor(
            get_fft_spectrum(ID, build_buckets(max_sec, step_sec, frame_step),
                             mode="test").tolist()).to(device)
        b_data = b_data.unsqueeze(0).to(device)
        with torch.no_grad():
            # Model forward returns (logits, accuracy) for this signature —
            # NOTE(review): confirm against the model class definition.
            eval_predict, tmp_eval_accuracy = model(b_data.unsqueeze(0),
                                                    labels[num].unsqueeze(0), 1,
                                                    n_classes)
            tmp_eval_loss = criterion(eval_predict,
                                      labels[num].unsqueeze(0).long()).to(device)
            loss += tmp_eval_loss.item()
            acc += tmp_eval_accuracy
    # Average per-utterance loss/accuracy over the whole list.
    loss /= total_length
    acc /= total_length
    viz.line(torch.tensor([loss]), [epoch], update='append', win='test_loss',
             opts={'title': 'test_loss'})
    viz.line(torch.tensor([acc]), [epoch + 1], update='append', win='test_acc',
             opts={'title': 'test_acc'})
    print("eval_loss:", loss)
    print("eval_acc:", acc)
def _gene_Data(self, list_IDs_temp, indexes):
    """Build one training batch: spectrum tensors plus one-hot class labels.

    ``list_IDs_temp`` holds the wav paths for this batch; ``indexes`` maps
    each slot back into ``self.labels`` (labels assumed 0..n_classes-1).
    Returns (batch_size, *dim) data and (batch_size, 1, 1, n_classes) labels.
    """
    batch_x = np.empty((self.batch_size,) + self.dim)
    batch_y = np.empty((self.batch_size,), dtype=int)
    for slot, ID in enumerate(list_IDs_temp):
        batch_x[slot, :, :, 0] = get_fft_spectrum(ID, self.buckets)
        batch_y[slot] = self.labels[indexes[slot]]
    one_hot = keras.utils.to_categorical(batch_y, num_classes=self.n_classes)
    # Reshape to the (1, 1, n_classes) per-sample layout the model expects.
    return batch_x, one_hot.reshape(self.batch_size, 1, 1, self.n_classes)
def get_np_list(file_list, buckets):
    """Load and concatenate the FFT spectra of every file in *file_list*.

    The original implementation was broken: ``np.empty()`` was called with no
    shape (TypeError), the loop passed the integer index ``pt`` to
    ``get_fft_spectrum`` instead of ``file_list[pt]``, and each
    ``np.concatenate`` result was discarded, so ``voice`` was never built.

    Args:
        file_list: iterable of filenames relative to ``c.FA_DIR``.
        buckets: FFT bucket table from ``build_buckets``.

    Returns:
        np.ndarray: all spectra concatenated along axis 0 (empty 1-D array
        when ``file_list`` is empty).
    """
    spectra = [get_fft_spectrum(c.FA_DIR + fname, buckets)
               for fname in file_list]
    voice = np.concatenate(spectra) if spectra else np.empty((0,))
    print(voice.shape)
    return voice
def get_embedding(model, wav_file, max_sec):
    """Return the 1-D embedding the model produces for one wav file."""
    buckets = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)
    spectrum = get_fft_spectrum(wav_file, buckets)
    # (freq, time) -> (1, freq, time, 1) batch, predict, squeeze back to 1-D.
    batch = spectrum.reshape(1, *spectrum.shape, 1)
    return np.squeeze(model.predict(batch))
def give_vggvox_input(input_file_path):
    """Compute an FFT spectrum for every filename listed in a CSV file.

    Returns a pandas Series of spectra, one per row of the input listing.
    """
    buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)
    listing = pd.read_csv(input_file_path, delimiter=",")
    return listing['filename'].apply(
        lambda path: get_fft_spectrum(path, buckets))
def give_vggvox_input_simple(input_file_path):
    """Compute the FFT spectrum of a single file with the default buckets."""
    default_buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)
    return get_fft_spectrum(input_file_path, default_buckets)