def init_model():
    """Build the eval-mode VggVox ResNet and load the checkpoint from args.resume.

    Reads module-level globals ``toolkits``, ``args``, ``os``.
    Raises IOError when no checkpoint path is given or the file is missing.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # Network hyper-parameters for the eval-mode graph.
    cfg = {'dim': (257, None, 1),
           'nfft': 512,
           'min_slice': 720,
           'win_length': 400,
           'hop_length': 160,
           'n_classes': 5994,
           'sampling_rate': 16000,
           'normalize': True,
           }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=cfg['dim'],
                                                num_class=cfg['n_classes'],
                                                mode='eval', args=args)

    # Fail fast when no usable checkpoint was supplied.
    if not args.resume:
        raise IOError('==> please type in the model to load')
    if not os.path.isfile(args.resume):
        raise IOError("==> no checkpoint found at '{}'".format(args.resume))

    network_eval.load_weights(os.path.join(args.resume), by_name=True)
    return network_eval
def initialize_model():
    """Build the eval VGGFace2 ResNet-50 and optionally load args.resume weights."""
    # Set basic environments: initialize GPUs.
    toolkits.initialize_GPU(args)

    # ==> loading the pre-trained model.
    import model

    # Only the avg-aggregation + softmax-loss configuration is supported.
    if args.aggregation != 'avg':
        raise IOError('==> unknown aggregation mode.')
    if args.loss != 'softmax':
        raise IOError('==> unknown loss type.')
    model_eval = model.Vggface2_ResNet50(mode=args.mode)

    print('test: {}_{}_{} on {} benchmark.'.format(args.net, args.aggregation,
                                                   args.loss, args.benchmark))

    # Weights are loaded only when a checkpoint path was supplied; a missing
    # file is an error, while no --resume at all returns the fresh network.
    if args.resume:
        if os.path.isfile(args.resume):
            model_eval.load_weights(args.resume, by_name=True)
            print('==> successfully loaded the model {}'.format(args.resume))
        else:
            raise IOError('==> can not find the model to load {}'.format(args.resume))
    return model_eval
def extract_features(paths, args):
    """Return a (len(paths), PARAMS['feat_dim']) array of speaker embeddings.

    One embedding per audio path; the network and its weights come from the
    module-level PARAMS/model/ut globals and args.resume.
    """
    # GPU configuration
    toolkits.initialize_GPU(args)

    net = model.vggvox_resnet2d_icassp(
        input_dim=PARAMS["dim"], num_class=PARAMS["n_classes"], mode="eval", args=args
    )
    net.load_weights(os.path.join(args.resume), by_name=True)

    embeddings = np.zeros((len(paths), PARAMS["feat_dim"]))
    for idx, wav_path in enumerate(tqdm(paths)):
        spectrogram = ut.load_data(
            wav_path,
            win_length=PARAMS["win_length"],
            sr=PARAMS["sampling_rate"],
            hop_length=PARAMS["hop_length"],
            n_fft=PARAMS["nfft"],
            spec_len=PARAMS["spec_len"],
            mode="eval",
        )
        # Add batch and channel axes: (freq, time) -> (1, freq, time, 1).
        embeddings[idx] = net.predict(spectrogram[np.newaxis, ..., np.newaxis])
    return embeddings
def main(args): # gpu configuration toolkits.initialize_GPU(args) import model # ================================== # Get Train/Val. # ================================== total_list = [os.path.join(args.data_path, file) for file in os.listdir(args.data_path)] unique_list = np.unique(total_list) # ================================== # Get Model # ================================== # construct the data generator. params = {'dim': (257, None, 1), 'nfft': 512, 'min_slice': 720, 'win_length': 400, 'hop_length': 160, 'n_classes': 5994, 'sampling_rate': 16000, 'normalize': True, } network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args) # ==> load pre-trained model ??? if args.resume: # ==> get real_model from arguments input, # load the model if the imag_model == real_model. if os.path.isfile(args.resume): network_eval.load_weights(os.path.join(args.resume), by_name=True) print('==> successfully loading model {}.'.format(args.resume)) else: raise IOError("==> no checkpoint found at '{}'".format(args.resume)) else: raise IOError('==> please type in the model to load') print('==> start testing.') # The feature extraction process has to be done sample-by-sample, # because each sample is of different lengths. feats = [] for ID in unique_list: specs = preprocess.load_data(ID, split=False, win_length=params['win_length'], sr=params['sampling_rate'], hop_length=params['hop_length'], n_fft=params['nfft'], min_slice=params['min_slice']) specs = np.expand_dims(np.expand_dims(specs[0], 0), -1) v = network_eval.predict(specs) feats += [v] feats = np.array(feats)[:,0,:] preprocess.similar(feats)
def diarize(segments, sr=16000, win_len=400, hop_len=160, embedding_per_sec=1.0, overlap_rate=0.1): logger.debug("[Speaker diarization] Initializing models") # Initialize ghostvlad toolkits.initialize_GPU(Expando({"gpu": ""})) ghostvlad_model = model.vggvox_resnet2d_icassp(input_dim=(257, None, 1), num_class=5994, mode="eval", args=Expando({"net": "resnet34s", "loss": "softmax", "vlad_cluster": 8, "ghost_cluster": 2, "bottleneck_dim": 512, "aggregation_mode": "gvlad"})) ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5", by_name=True) # Initialize uisrnn sys.argv = sys.argv[:1] model_args, _, inference_args = uisrnn.parse_arguments() model_args.observation_dim = 512 uisrnn_model = uisrnn.UISRNN(model_args) uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark") logger.debug("[Speaker diarization] Calculating utterance features") utterances_spec = prepare_ghostvlad_data(segments, sr, win_len, hop_len, embedding_per_sec, overlap_rate) feats = [] for spec in utterances_spec: spec = np.expand_dims(np.expand_dims(spec, 0), -1) v = ghostvlad_model.predict(spec) feats += [v] feats = np.array(feats)[:, 0, :].astype(float) logger.debug("[Speaker diarization] Clustering utterance features") labels = uisrnn_model.predict(feats, inference_args) logger.debug("[Speaker diarization] Tagging segments speakers") embedding_duration = (1/embedding_per_sec) * (1.0 - overlap_rate) labels_count = len(labels) current = 0 for segment in segments: begin_index = math.floor(current/embedding_duration) current += segment.end-segment.begin end_index = math.ceil(current/embedding_duration) segment_labels = [labels[index] for index in range(begin_index, min(end_index, labels_count))] if len(segment_labels) > 0: segment.speaker = max(segment_labels, key=segment_labels.count) else: segment.speaker = 999 return segments
def get_prediction():
    """Evaluate the per-person stutter/filler model on its validation list.

    Builds the stutter model, loads the latest checkpoint for args.person,
    predicts batch-by-batch over the validation generator and prints the
    accuracy. Reads module-level globals: args, toolkits, np, os.
    """
    toolkits.initialize_GPU(args)
    import model
    import generator

    params = {'dim': (257, args.spec_len, 1),
              'mp_pooler': toolkits.set_mp(processes=12),
              'nfft': 512,
              'spec_len': args.spec_len,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 2,
              'sampling_rate': 16000,
              'batch_size': args.batch_size,
              'shuffle': False,
              'normalize': True,
              }
    network = model.stutter_model(input_dim=params['dim'],
                                  num_class=params['n_classes'],
                                  mode='train', args=args)

    personal_folder = '/home/amh/Projects/Stutter/model/Filler/person_{}/'.format(
        args.person)
    # os.listdir order is filesystem-dependent; sort so "[-1]" deterministically
    # selects the lexicographically last (i.e. latest-named) checkpoint.
    personal_files = sorted(os.listdir(personal_folder))[-1]
    best_model = os.path.join(personal_folder, personal_files)
    print(best_model)
    network.load_weights(best_model)

    vallist, vallb = toolkits.get_datalist(
        args,
        path='../meta/Stutter_Leave_One_Fillers/validation_labels_{}.txt'.
        format(args.person))
    vld_gen = generator.DataGenerator(vallist.flatten(), vallb.flatten(), **params)

    step = 0
    preds = []
    # NOTE(review): the "+ 1" extra batch can make len(preds) differ from
    # len(vallb) depending on the generator's wrap-around — confirm against
    # DataGenerator.__getitem__ before trusting the accuracy below.
    for i in range((len(vallist) // args.batch_size) + 1):
        x_data, y_data = vld_gen.__getitem__(index=step + i)
        preds.extend(np.argmax(network.predict(x_data), axis=1))

    from sklearn.metrics import accuracy_score
    print(accuracy_score(preds, vallb))
def initialize_model(): from model import Vggface2_ResNet50 # Set basic environments. # Initialize GPUs toolkits.initialize_GPU() # ==> loading the pre-trained model. input1 = Input(shape=(224, 224, 3)) input2 = Input(shape=(224, 224, 3)) # x1 = resnet.resnet50_backend(input1) # x2 = resnet.resnet50_backend(input2) base_model = Vggface2_ResNet50(include_top=False) base_model.load_weights(weight_file, by_name=True) print("successfully load model ", weight_file) for x in base_model.layers: x.trainable = True x1 = base_model(input1) x2 = base_model(input2) x1 = Concatenate(axis=-1)([GlobalMaxPool2D()(x1), GlobalAvgPool2D()(x1)]) x2 = Concatenate(axis=-1)([GlobalMaxPool2D()(x2), GlobalAvgPool2D()(x2)]) x3 = Subtract()([x1, x2]) x3 = Multiply()([x3, x3]) x1_ = Multiply()([x1, x1]) x2_ = Multiply()([x2, x2]) x4 = Subtract()([x1_, x2_]) x = Concatenate(axis=-1)([x4, x3]) x = Dense(100, activation="relu")(x) x = Dropout(0.3)(x) x = Dense(25, activation="relu")(x) x = Dropout(0.3)(x) out = Dense(1, activation="sigmoid")(x) model = Model([input1, input2], out) # for x in model.layers[-21:]: # x.trainable = True model.compile(loss="binary_crossentropy", metrics=['acc'], optimizer=Adam(0.00005)) model.summary() return model
def main():
    """Convert every wav named in the test list into a GhostVLAD speaker
    embedding and save it as a .npy file mirroring the source tree.

    Reads module-level globals: datapath, target, ut, np, tqdm, toolkits.
    """
    # gpu configuration
    toolkits.initialize_GPU()
    import model

    convertlist = np.loadtxt('../meta_data/Test_list.txt', dtype=str, usecols=[1])
    convertlist = np.array([os.path.join(datapath, i) for i in convertlist])

    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.ghostvlad_model_resnet(input_dim=params['dim'], mode='eval')
    network_eval.load_weights(os.path.join('../../saved_models/ghostvlad_weights.h5'),
                              by_name=True)

    print('==> start converting.')
    for ID in tqdm(convertlist):
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        # (freq, time) -> (1, freq, time, 1) single-item batch.
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)

        target_path = ID.replace(datapath, target)
        # exist_ok=True folds the isdir pre-check and the FileExistsError
        # race-handling of the original into one atomic-enough call.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        np.save(target_path, v[0])
def load_model():
    """Build the eval-mode VggVox network into the module-global
    ``network_eval`` and load the checkpoint named by ``args.resume``.

    Raises IOError when no checkpoint path is given or the file is missing.
    """
    toolkits.initialize_GPU(args)

    global network_eval
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # Fail fast when no usable checkpoint was supplied.
    if not args.resume:
        raise IOError('==> please type in the model to load')
    if not os.path.isfile(args.resume):
        raise IOError("==> no checkpoint found at '{}'".format(args.resume))

    network_eval.load_weights(os.path.join(args.resume), by_name=True)
    print('==> successfully loading model {}.'.format(args.resume))
def main():
    """Run VoxCeleb1 speaker-verification: embed every unique wav in the
    chosen trial list, score each trial pair by dot product, save scores and
    labels, and print the EER.

    Reads module-level globals: args, toolkits, ut, np, os, set_result_path.
    Raises IOError on an unknown test type or a missing checkpoint.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #     Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))
    if args.test_type == 'normal':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test.txt', str)
    elif args.test_type == 'hard':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test_hard.txt', str)
    elif args.test_type == 'extend':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test_extended.txt', str)
    else:
        raise IOError('==> unknown test type.')

    # Trial file columns: label, path1, path2.
    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(args.data_path, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(args.data_path, i[2]) for i in verify_list])

    # Embed each wav once even when it appears in many trial pairs.
    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, 100, 1),
              'nfft': 512,
              'spec_len': 100,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = set_result_path(args)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(unique_list):
        if c % 50 == 0:
            print('Finish extracting features for {}/{}th wav.'.format(c, total_length))
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        # (freq, time) -> (1, freq, time, 1) single-item batch.
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)

    # ==> compute the pair-wise similarity.
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        # Look up each path's row in the unique embedding matrix.
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]

        v1 = feats[ind1, 0]
        v2 = feats[ind2, 0]

        # Dot-product similarity of the two embeddings.
        scores += [np.sum(v1 * v2)]
        labels += [verify_lb[c]]
        print('scores : {}, gt : {}'.format(scores[-1], verify_lb[c]))

    scores = np.array(scores)
    labels = np.array(labels)
    np.save(os.path.join(result_path, 'prediction_scores.npy'), scores)
    np.save(os.path.join(result_path, 'groundtruth_labels.npy'), labels)

    eer, thresh = toolkits.calculate_eer(labels, scores)
    print('==> model : {}, EER: {}'.format(args.resume, eer))
def main():
    """Score the AI-project trial pairs with the VggVox network and pickle the
    raw similarity scores to result1.pickle.

    Reads module-level globals: args, toolkits, ut, np, os, pickle.
    Raises IOError on an unknown test type or a missing weights file.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #     Get Train/Val.
    # ==================================
    print('Calculating test data lists...')

    # AI project list file
    if args.test_type == 'ai':
        verify_list = np.loadtxt('model/meta/sets.txt', str)
    else:
        raise IOError('Unknown test type.')

    # Trial file columns: label, path1, path2.
    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(args.data_path, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(args.data_path, i[2]) for i in verify_list])

    # Embed each wav once even when it appears in many trial pairs.
    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model
    if args.resume:
        # Load pretrained weight
        # NOTE(review): the weights path is hard-coded while the messages
        # reference args.resume — confirm this divergence is intentional.
        if os.path.isfile('model/src/weights.h5'):
            network_eval.load_weights('model/src/weights.h5', by_name=True)
            print('Successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("No checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('Please type in the model to load')

    print('\nStart testing...')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(unique_list):
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        # (freq, time) -> (1, freq, time, 1) single-item batch.
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)

    allscores = []
    match = []
    nomatch = []
    # ==> compute the pair-wise similarity.
    print("Model 1 scores")
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]

        v1 = feats[ind1, 0]
        v2 = feats[ind2, 0]

        # Dot-product similarity of the two embeddings.
        scores += [np.sum(v1*v2)]
        labels += [verify_lb[c]]
        # NOTE(review): the c != 0 guard drops the first pair from `match`
        # but not from `nomatch` — looks like an off-by-one; verify intent.
        if c != 0 and verify_lb[c] == 1:
            match.append(scores[-1])
        elif verify_lb[c] == 0:
            nomatch.append(scores[-1])
        allscores.append(scores[-1])
        print('Score : {}'.format(scores[-1]))

    # For evaluation
    # match = [str(x) for x in match]
    # nomatch = [str(x) for x in nomatch]
    # with open("./eval/result.txt", "a") as w:
    #     matches = ','.join(match)
    #     nomatches = ','.join(nomatch)
    #     w.write(matches+'\n')
    #     w.write(nomatches+'\n')

    with open("result1.pickle", "wb") as w:
        pickle.dump(scores, w)
def extract_embeddings(input_path=time_100_emp_train, mode='train'):
    """Extract VggVox embeddings either for a whole training tree or a single file.

    In 'train' mode, walks input_path for *.wav files, embeds every augmented
    spectrogram slice, groups embeddings by the parent-directory name (used as
    the class label) and pickles the dict. In any other mode, embeds the
    single file at input_path and returns its embedding vector.

    Returns an error string (not an exception) when weights cannot be loaded.
    Reads module-level globals: args, base_path, toolkits, ut, np, Path,
    logging, pickle.
    """
    toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # Weight loading failures are reported as strings rather than raised.
    if args.resume:
        weight_path = os.path.join(base_path, args.resume)
        if os.path.isfile(weight_path):
            print('loading graph')
            network_eval.load_weights(weight_path, by_name=True)
        else:
            return 'Issue with loading graph'
    else:
        return 'Pre-trained graph is required'

    if mode == 'train':
        audio_files = [filename for filename in Path(input_path).rglob('*.wav')]
        # NOTE(review): "* 10" presumably assumes load_data_aug yields 10
        # augmented slices per file — confirm against ut.load_data_aug.
        total_files = len(audio_files) * 10
        working_file = 0
        emb_store = {}
        for audio in audio_files:
            print(f'processing {os.path.basename(os.path.dirname(audio))} ')
            specs = ut.load_data_aug(audio, win_length=params['win_length'],
                                     sr=params['sampling_rate'],
                                     hop_length=params['hop_length'],
                                     n_fft=params['nfft'],
                                     spec_len=params['spec_len'],
                                     mode='eval')
            count_file = 0
            for sample in specs:
                print(f'Augmentation count is {count_file}')
                print(f'Processing file {working_file} of {total_files}')
                # (freq, time) -> (1, freq, time, 1) single-item batch.
                sample_spec = np.expand_dims(np.expand_dims(sample, 0), -1)
                # Class label = the wav's parent directory name.
                class_label = os.path.basename(os.path.dirname(audio))
                v = network_eval.predict(sample_spec)

                # Append this embedding to the label's running list.
                old_data = []
                if class_label in emb_store.keys():
                    pre_data = emb_store.get(class_label)
                    pre_data.append(v[0])
                    old_data = pre_data
                else:
                    old_data.append(v[0])
                emb_store[class_label] = old_data
                count_file += 1
                working_file += 1
            logging.info(f'For {audio} label stored is {class_label}')

        with open('../data/training_features_augmented.pickle', 'wb') as handle:
            pickle.dump(emb_store, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        specs = ut.load_data(input_path, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        # (freq, time) -> (1, freq, time, 1) single-item batch.
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        vector_embedding = network_eval.predict(specs)[0]
        return vector_embedding
def main():
    """Embed every audio file of every speaker found under dataPath and pickle
    the resulting speaker -> [embeddings] map to speaker_data.pickle.

    Reads module-level globals: args, dataPath, toolkits, preprocess, np,
    pickle, extractSpeakerId, getListOfFiles, model.
    Raises IOError when args.resume is unset or not an existing file.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    # get speaker id from folder name
    totalList = [os.path.join(dataPath, file) for file in os.listdir(dataPath)]
    uniqueList = np.unique(totalList)
    speakerList = [extractSpeakerId(u) for u in uniqueList]

    # get audio file for each speaker
    speakerAudioDict = {}
    for speaker in speakerList:
        rootPath = os.path.join(dataPath, speaker)
        fileList = getListOfFiles(rootPath)
        speakerAudioDict[speaker] = fileList

    # The network, its hyper-parameters and the checkpoint check are invariant
    # across speakers, so build/load once up front instead of once per speaker
    # (the original rebuilt the whole model inside the speaker loop).
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'min_slice': 720,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    # get embedding for each audio of speaker
    speakerToFeatureDict = {}
    for speaker in speakerList:
        feats = []
        for ID in speakerAudioDict[speaker]:
            specs = preprocess.load_data(ID, split=False,
                                         win_length=params['win_length'],
                                         sr=params['sampling_rate'],
                                         hop_length=params['hop_length'],
                                         n_fft=params['nfft'],
                                         min_slice=params['min_slice'])
            # First slice only; add batch and channel axes -> (1, freq, time, 1).
            specs = np.expand_dims(np.expand_dims(specs[0], 0), -1)
            v = network_eval.predict(specs)
            feats += [v]
        speakerToFeatureDict[speaker] = feats

    # save to file
    with open('speaker_data.pickle', 'wb') as handle:
        pickle.dump(speakerToFeatureDict, handle, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Train the VggVox ResNet speaker classifier on Kaldi-style data dirs.

    Loads train/val lists, optional CMVN offsets, builds generators and the
    network, optionally resumes from a checkpoint (single- or multi-GPU), and
    runs fit_generator with checkpointing, LR scheduling and TensorBoard —
    optionally with online hard example mining (OHEM).

    Reads module-level globals: args, toolkits, kaldiio, keras, np, os,
    set_path, step_decay.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    import generator

    # ==================================
    #     Get Train/Val.
    # ==================================
    trnlist, trnlb, l2i = toolkits.load_from_kaldi_dir(args, "train", min_len=300)
    # Reuse the training label2idx map so val labels share the same indices.
    vallist, vallb, _ = toolkits.load_from_kaldi_dir(args, "val", min_len=300,
                                                     label2idx=l2i)

    # Optional input CMVN: mean offset derived from Kaldi cmvn stats
    # (row 0 = sums, last column = frame count).
    if args.cmvn:
        cmvn_stats = kaldiio.load_mat(args.cmvn)
        mean_stats = cmvn_stats[0, :-1]
        count = cmvn_stats[0, -1]
        offset = np.expand_dims(mean_stats, 0) / count
        print("offset", offset)
        CMVN = offset
    else:
        CMVN = None

    # Optional post-network CMVN, computed the same way.
    if args.post_cmvn:
        cmvn_stats = kaldiio.load_mat(args.post_cmvn)
        mean_stats = cmvn_stats[0, :-1]
        count = cmvn_stats[0, -1]
        offset = np.expand_dims(mean_stats, 0) / count
        print("offset", offset)
        POSTCMVN = offset
    else:
        POSTCMVN = None

    # construct the data generator.
    params = {'dim': (args.dim, 300, 1),
              'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
              'nfft': 512,
              'spec_len': 300,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 8,
              'sampling_rate': 16000,
              'tandem': args.tandem,
              'batch_size': args.batch_size,
              'shuffle': True,
              'normalize': False,
              'cmvn': CMVN,
              'postcmvn': POSTCMVN
              }

    # Datasets
    partition = {'train': trnlist, 'val': vallist}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'], **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=args)

    # ==> load pre-trained model ???
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        print("Attempting to load", args.resume)
    if args.resume:
        if os.path.isfile(args.resume):
            if mgpu == 1:
                # by_name=True, skip_mismatch=True
                # https://github.com/WeidiXie/VGG-Speaker-Recognition/issues/46
                network.load_weights(os.path.join(args.resume),
                                     by_name=True, skip_mismatch=True)
            else:
                # Multi-GPU wrapper: the real model sits inside the wrapper's
                # layer list, offset by the number of GPUs.
                network.layers[mgpu + 1].load_weights(os.path.join(args.resume))
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            # A missing checkpoint is non-fatal here: training starts fresh.
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              args.loss, args.aggregation_mode, args.ohem_level))

    model_path, log_path = set_path(args)
    # Persist the label -> index mapping next to the checkpoints.
    with open(os.path.join(model_path, 'label2idx'), 'w') as f:
        for key in l2i.keys():
            f.write(key + ' ' + str(l2i[key]) + '\n')

    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path,
                                              histogram_freq=0,
                                              write_graph=True,
                                              write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [
        keras.callbacks.ModelCheckpoint(os.path.join(
            model_path, 'weights-{epoch:02d}-{val_loss:.3f}.h5'),
            monitor='val_loss', mode='min', save_best_only=True),
        normal_lr, tbcallbacks
    ]

    if args.ohem_level > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // args.batch_size)
        iters_per_epoch = int(
            len(partition['train']) // (args.ohem_level * args.batch_size))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes'])

        A = ohem_generator.next()  # for some reason, I need to warm up the generator

        network.fit_generator(generator.OHEM_generator(
                                  network, trn_gen, iters_per_epoch,
                                  args.ohem_level, args.batch_size,
                                  params['dim'], params['n_classes']),
                              steps_per_epoch=iters_per_epoch,
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
    else:
        network.fit_generator(trn_gen,
                              validation_data=val_gen,
                              steps_per_epoch=int(
                                  len(partition['train']) // args.batch_size),
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=True,
                              workers=12,
                              verbose=1)
# NOTE(review): this fragment starts mid-scope — `parser` is defined before
# this view, and `global args` suggests an enclosing function; the trailing
# main() also looks truncated. Confirm boundaries against the full file.
parser.add_argument('--vlad_cluster', default=8, type=int)
parser.add_argument('--bottleneck_dim', default=16, type=int)
parser.add_argument('--aggregation_mode', default='gvlad',
                    choices=['avg', 'vlad', 'gvlad'], type=str)
# set up learning rate, training loss and optimizer.
parser.add_argument('--loss', default='regression',
                    choices=['softmax', 'amsoftmax', 'regression'], type=str)

global args
args = parser.parse_args()
toolkits.initialize_GPU(args)
import model


def main():
    # Spectrogram / model hyper-parameters; binary classification head.
    params = {'dim': (257, None, 1),
              'n_fft': 512,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 2,
              'sampling_rate': 16000,
              'normalize': True,
              }
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5, exportFile=None, expectedSpeakers=2):
    """Diarize wav_path and export one wav per detected speaker.

    Pipeline: VggVox embeddings per sliding window -> uisrnn clustering ->
    map cluster time ranges back to original-audio timestamps (the analysis
    skips silence, hence the mapTable) -> collect per-speaker audio via
    diarization_try into the module-global speaker_final -> keep the
    expectedSpeakers longest speakers and write <exportFile>_speakerN.wav.

    Reads module-level globals: args, toolkits, spkModel, uisrnn,
    SAVED_MODEL_NAME, load_data, genMap, arrangeResult, diarization_try,
    fmtTime, pdb (pydub-like: provides empty()), np.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512  # must match the embedding size
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        # (freq, time) -> (1, freq, time, 1) single-item batch.
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # time map to orgin wav(contains mute)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            # Walk the silence-skipping key points; keys[i-1] is the last
            # analysis timestamp before the slice boundary.
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    n_speakers = len(speakerSlice)
    print('N-SPeakers:', n_speakers)
    global speaker_final
    speaker_final = [pdb.empty()] * n_speakers

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            # Accumulate this slice's audio into speaker_final[spk].
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the Top n Speakers (by total duration of collected audio).
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds, reverse=True)
    speaker_final = speaker_final[0:expectedSpeakers]

    # Export the Files
    iso_wav_path = wav_path.split(".")[0]  # NOTE(review): appears unused here
    itr = 0
    while itr < len(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker_final[itr].export(write_path, format="wav")
        itr += 1
    del speaker_final
def main():
    """Generate uisrnn training data: build random multi-speaker utterance
    sequences, embed each slice with the VggVox network, and save the
    sequences plus their speaker ids to training_data.npz.

    Reads module-level globals: args, toolkits, np, os, random, prepare_data,
    load_data.
    Raises IOError when args.resume is unset or not an existing file.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #     Get Train/Val.
    # ==================================
    total_list = [os.path.join(args.data_path, file) for file in os.listdir(args.data_path)]
    unique_list = np.unique(total_list)

    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'min_slice': 720,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    SRC_PATH = r'/data/dataset/SpkWav120'
    SRC_PATH = r'./ghostvlad/SRC_PATH'  # bencq path
    print(SRC_PATH)

    path_spk_tuples = prepare_data(SRC_PATH)
    train_sequence = []
    train_cluster_id = []
    CNT = 7000  # number of synthetic merged utterances to generate

    for epoch in range(CNT):
        # Random choice utterances from whole wavfiles
        # A merged utterance contains [10,20] utterances
        splits_count = np.random.randint(10, 20, 1)  # (low, high, [size])
        path_spks = random.sample(path_spk_tuples, splits_count[0])
        utterance_specs, utterance_speakers = load_data(path_spks,
                                                        min_win_time=500,
                                                        max_win_time=1600)
        feats = []
        for spec in utterance_specs:
            # (freq, time) -> (1, freq, time, 1) single-item batch.
            spec = np.expand_dims(np.expand_dims(spec, 0), -1)
            v = network_eval.predict(spec)
            feats += [v]

        feats = np.array(feats)[:, 0, :]  # [splits, embedding dim]
        train_sequence.append(feats)
        train_cluster_id.append(utterance_speakers)
        print("epoch:{}, utterance length: {}, speakers: {}".format(
            epoch, len(utterance_speakers), len(path_spks)))

    np.savez('training_data', train_sequence=train_sequence,
             train_cluster_id=train_cluster_id)
def main():
    """Embed every unique wav referenced by the Vove public-test pairs and
    save the stacked embeddings to /content/feats.npy.

    Reads module-level globals: args, toolkits, pd, np, os, ut, addPath, pbar.
    Raises IOError when args.resume is unset or not an existing file.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #     Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))

    publicTest = pd.read_csv("/content/VoveDataset/public-test.csv")
    list1 = addPath(np.array(publicTest["audio_1"]))
    list2 = addPath(np.array(publicTest["audio_2"]))

    # Embed each wav once even when it appears in many pairs.
    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = "/content/VGG-Speaker-Recognition/result"
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(pbar(unique_list)):
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        # (freq, time) -> (1, freq, time, 1) single-item batch.
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)
    np.save("/content/feats.npy", feats)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path` with VGGVox embeddings + uisrnn, then run speech
    recognition (Google, falling back to Sphinx) on each speaker segment.

    Returns the speakerSlice dict: {speaker_id: [{'start', 'stop', 'content'}, ...]},
    with times in ms of the original wav and 'content' the recognized text.
    Uses module-level `args`, `spkModel`, `uisrnn`, `SAVED_MODEL_NAME`,
    `load_data`, `genMap`, `arrangeResult`, `fmtTime`, `speech_reg`, `AudioSegment`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512  # must match the speaker-embedding size
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    # specs: one spectrogram per sliding window; intervals: voiced regions of the wav.
    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    # mapTable/keys translate "voiced-only" time back to original wav time (incl. silence).
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)  # NOTE(review): unused
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items(
    ):  # time map to orgin wav(contains mute)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # for spk,timeDicts in speakerSlice.items():
    #     print('========= ' + str(spk) + ' =========')
    #     for timeDict in timeDicts:
    #         s = timeDict['start']
    #         e = timeDict['stop']
    #         s = fmtTime(s)  # change point moves to the center of the slice
    #         e = fmtTime(e)
    #         print(s+' ==> '+e)
    # p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    # p.draw()
    # p.plot.show()

    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            # Slice the original audio by the mapped ms boundaries.
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = item_dict['start']
            e = item_dict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
            item_dict.update({'content': audio_seg})
            filename = 'speaker' + str(spk) + '-' + str(
                item_dict['start'] / 1000) + '-' + str(
                    item_dict['stop'] / 1000) + '.wav'
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            # words=speech_reg.AudioData(audio_seg,sample_rate=fs,sample_width=2)
            with audio as source:
                words = speech_r.record(source)
            try:
                res = speech_r.recognize_google(words)
            except speech_reg.UnknownValueError:
                # Fall back to the offline Sphinx recognizer; give up with ''.
                try:
                    res = speech_r.recognize_sphinx(words)
                except speech_reg.UnknownValueError:
                    res = ''
            item_dict.update({'content': res})  # replaces the AudioSegment stored above
            print(res)
    return speakerSlice
def main():
    """Evaluate the trained model on the hike test set and dump per-sample
    predictions to a timestamped CSV in `args.save_dir`.

    For 'mse' loss: rescales predictions/labels by *10+5 and reports MSE.
    Otherwise: treats output as 2-class probabilities and reports accuracy
    plus a confusion matrix.
    Uses module-level `args`, `model_config`, `score_rule`, `ut`, `toolkits`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    # ==================================
    #     Get Train/Val.
    # ==================================
    vallist, vallb = toolkits.get_hike_datalist(
        meta_paths=args.test_meta_data_path,
        data_paths=args.test_data_path,
        mode=model_config['loss'])
    # Second pass only to fetch the raw regression scores for the CSV.
    _, valscore = toolkits.get_hike_datalist(
        meta_paths=args.test_meta_data_path,
        data_paths=args.test_data_path,
        mode='mse')

    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    num_class = len(score_rule)
    input_length = int(args.audio_length * 25)  # 25 frames per second of audio
    params = {
        'dim': (513, None, 1),
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 1024,
        'spec_len': input_length,
        'win_length': 1024,
        'hop_length': 640,
        'n_classes': num_class,
        'sampling_rate': 16000,
        'normalize': True,
    }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=model_config)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True,
                                      skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')
    v = []
    for ID in vallist:
        val_data = ut.load_data(ID, params['win_length'],
                                params['sampling_rate'], params['hop_length'],
                                params['nfft'], params['spec_len'], 'test',
                                args.data_format)
        info = network_eval.predict(np.expand_dims(val_data, (0, -1)))
        v += info.tolist()
    v = np.array(v)
    print('val data shape {}'.format(v.shape))
    if model_config['loss'] == 'mse':
        # Undo the (x-5)/10 label normalization used at training time.
        v = v.T[0] * 10 + 5
        vallb = vallb * 10 + 5
        metric = np.square(np.subtract(v, vallb)).mean()
        print('mse: ', metric)
        v_test = np.vstack([v, vallb]).astype('float').T
        df = np.hstack([vallist.reshape(-1, 1), v_test])
        df = pd.DataFrame(data=df,
                          columns=['content', 'score_predict', 'score_true'])
    else:
        valscore = valscore * 10 + 5
        # NOTE(review): predicts class 1 when prob_0 < 0.5 — double-check the
        # threshold direction against the label encoding.
        v_predict = ((v < 0.5) * 1)[:, 0]
        metric = sum(v_predict == vallb) / len(vallb)
        print('confusion matrix: ', confusion_matrix(vallb, v_predict))
        print('accuracy ', metric)
        v_test = np.hstack([v, vallb.reshape(-1, 1),
                            valscore.reshape(-1, 1)]).astype('float')
        df = np.hstack([vallist.reshape(-1, 1), v_test])
        df = pd.DataFrame(data=df,
                          columns=[
                              'content', 'prob_0', 'prob_1', 'true_label',
                              'score_true'
                          ])
    date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    df.to_csv(
        os.path.join(args.save_dir,
                     '{}_{}_{}.csv'.format(date, model_config['loss'], metric)))
def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path`; if `check` names a second wav, append its embeddings
    and re-run uisrnn to test whether `check` contains a speaker already present
    in `wav_path` (speaker count unchanged => same speaker).

    Prints the per-speaker time slices of `wav_path` mapped back to original
    wav time. Sets the module-level `no_speakers` global.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512  # must match embedding size
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    if check != '':
        # Enrollment/verification wav, windowed with its own rates.
        specs1, interval1 = load_data(check, embedding_per_second=1.2, overlap_rate=0.4)
        mapTable1, keys1 = genMap(interval1)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    featss = np.array(feats)[:, 0, :].astype(float)
    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))
    global no_speakers
    print("predicted_label: %s" % predicted_label)
    no_speakers = len(set(predicted_label))
    print('total no of speakers', no_speakers)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms

    if check != '':
        # Append the check-wav embeddings and cluster the combined sequence.
        for spec1 in specs1:
            spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
            v = network_eval.predict(spec1)
            feats += [v]
        featss = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
        print("=====================")
        print(feats)
        print(featss)
        print("=====================")
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        # Same speaker iff clustering the combined embeddings found no new speaker.
        print('same Speaker' if total_speaker == check_speaker else 'not the same speaker')
        print('speaker detected as ' + str(predicted_label2[-1]) if total_speaker == check_speaker else '')
        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        print("=============speakerSlice2===============")
        # NOTE(review): this maps with `mapTable`/`keys` of wav_path, not
        # `mapTable1`/`keys1` of the check wav — confirm this is intended.
        for spk, timeDicts in speakerSlice2.items(
        ):  # time map to orgin wav(contains mute)
            for tid, timeDict in enumerate(timeDicts):
                s = 0
                e = 0
                for i, key in enumerate(keys):
                    if (s != 0 and e != 0):
                        break
                    if (s == 0 and key > timeDict['start']):
                        offset = timeDict['start'] - keys[i - 1]
                        s = mapTable[keys[i - 1]] + offset
                    if (e == 0 and key > timeDict['stop']):
                        offset = timeDict['stop'] - keys[i - 1]
                        e = mapTable[keys[i - 1]] + offset
                speakerSlice2[spk][tid]['start'] = s
                speakerSlice2[spk][tid]['stop'] = e
        for spk, timeDicts in speakerSlice2.items():
            print('========= ' + str(spk) + ' =========')
            for timeDict in timeDicts:
                s = timeDict['start']
                e = timeDict['stop']
                s = fmtTime(s)  # change point moves to the center of the slice
                e = fmtTime(e)
                print(s + ' ==> ' + e)
        print("=============speakerSlice2===============")

    #print(predicted_label,'**************************')
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)  # NOTE(review): unused
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    for spk, timeDicts in speakerSlice.items(
    ):  # time map to orgin wav(contains mute)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
def main():
    """Score a single input (wav/m4a/mp3 audio or a pre-computed .npy linear
    spectrogram) with the 2-class VGGVox model and print the prediction.

    For 'regression' loss the raw output is rescaled by *10+5 before printing.
    Uses module-level `args`, `ut`, `librosa`, `toolkits`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    params = {
        'dim': (513, None, 1),
        'n_fft': 1024,
        'win_length': 1024,
        'hop_length': 640,
        'n_classes': 2,
        'sampling_rate': 16000,
        'normalize': True,
    }
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)
    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True,
                                      skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')
    # Accept exactly one of the supported audio extensions, else a .npy spectrogram.
    if sum([args.data_path.endswith(i) for i in ['.wav', '.m4a', 'mp3']]) == 1:
        # offset=5 skips the first 5 seconds of audio.
        wav, sr_ret = librosa.load(args.data_path,
                                   sr=params['sampling_rate'],
                                   offset=5)
        linear_spect = ut.lin_spectogram_from_wav(wav, params['hop_length'],
                                                  params['win_length'],
                                                  params['n_fft'])
        print('sample_rate is ', sr_ret)
    elif args.data_path.endswith('.npy'):
        linear_spect = np.load(args.data_path)
    else:
        raise IOError('wrong input format')
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    spec_mag = mag_T
    # Per-frequency mean/std normalization; the 10**5 scale guards the std
    # computation against underflow on tiny magnitudes.
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag * (10**5), 0, keepdims=True) / (10**5)
    spec_mag = (spec_mag - mu) / (std + 1e-3)
    spec_mag = np.expand_dims(spec_mag, (0, -1))  # add batch and channel axes
    print(spec_mag.shape)
    if args.loss == 'regression':
        # Undo the (x-5)/10 label normalization used at training time.
        v = network_eval.predict(spec_mag) * 10 + 5
        print('the predicted score is: {}'.format(v))
    else:
        v = network_eval.predict(spec_mag)
        print(v)
def main():
    """Train the VGGVox model on the hike dataset with wandb logging, with an
    optional OHEM (online hard example mining) path.

    Uses module-level `args`, `model_config`, `score_rule`, `wandb`,
    `WandbCallback`, `keras`, `set_path`, `step_decay`, `toolkits`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    import generator

    # ==================================
    #     Get Train/Val.
    # ==================================
    trnlist, trnlb = toolkits.get_hike_datalist(
        meta_paths=args.train_meta_data_path,
        data_paths=args.train_data_path,
        mode=model_config['loss'])
    vallist, vallb = toolkits.get_hike_datalist(
        meta_paths=args.val_meta_data_path,
        data_paths=args.val_data_path,
        mode=model_config['loss'])

    input_length = int(args.audio_length * 25)  # 25 frames per second of audio
    num_class = len(score_rule)
    # construct the data generator.
    params = {
        'dim': (513, input_length, 1),
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 1024,
        'spec_len': input_length,
        'win_length': 1024,
        'hop_length': 640,
        'n_classes': num_class,
        'sampling_rate': 16000,
        'batch_size': model_config['batch_size'],
        'shuffle': True,
        'normalize': True,
        'loss': model_config['loss'],
        'data_format': args.data_format
    }

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    wandb.init(project='vgg_speaker')
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'], **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=model_config)

    # # val data
    # val_data = [params['mp_pooler'].apply_async(ut.load_data,
    #             args=(ID, params['win_length'], params['sampling_rate'], params['hop_length'],
    #                   params['nfft'], params['spec_len'], 'train', args.data_format)) for ID in partition['val']]
    # val_data = np.expand_dims(np.array([p.get() for p in val_data]), -1)

    # ==> load pre-trained model ???
    print(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        print("Attempting to load", args.resume)
    # NOTE(review): duplicated `if args.resume` — the two could be merged.
    if args.resume:
        if os.path.isfile(args.resume):
            network.load_weights(os.path.join(args.resume),
                                 by_name=True,
                                 skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise ValueError("==> no checkpoint found at '{}'".format(
                args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              model_config['loss'], model_config['aggregation_mode'],
              model_config['ohem_level']))

    model_path, log_path = set_path(args, model_config)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    # tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=0, write_graph=True, write_images=False,
    #                                           update_freq=model_config['batch_size'] * 16)
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            os.path.join(model_path, 'weights-{epoch:02d}-{loss:.3f}.h5'),
            monitor='loss',
            mode='min',
            save_best_only=True,
            period=20,  # only checkpoint every 20 epochs
        ),
        normal_lr,
        WandbCallback()
    ]

    if model_config[
            'ohem_level'] > 1:  # online hard negative mining will be used
        candidate_steps = int(
            len(partition['train']) // model_config['batch_size'])
        iters_per_epoch = int(
            len(partition['train']) //
            (model_config['ohem_level'] * model_config['batch_size']))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, model_config['ohem_level'],
            model_config['batch_size'], params['dim'], params['n_classes'])

        A = ohem_generator.next(
        )  # for some reason, I need to warm up the generator

        network.fit_generator(generator.OHEM_generator(
            network, trn_gen, iters_per_epoch, model_config['ohem_level'],
            model_config['batch_size'], params['dim'], params['n_classes']),
                              steps_per_epoch=iters_per_epoch,
                              epochs=model_config['epochs'],
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
    else:
        # NOTE(review): both branches below are identical; the mse/non-mse
        # split is currently a no-op and could be collapsed.
        if model_config['loss'] != 'mse':
            network.fit_generator(trn_gen,
                                  steps_per_epoch=int(
                                      len(partition['train']) //
                                      model_config['batch_size']),
                                  epochs=model_config['epochs'],
                                  max_queue_size=10,
                                  validation_data=val_gen,
                                  validation_freq=1,
                                  callbacks=callbacks,
                                  use_multiprocessing=False,
                                  workers=1,
                                  verbose=1)
        else:
            network.fit_generator(trn_gen,
                                  steps_per_epoch=int(
                                      len(partition['train']) //
                                      model_config['batch_size']),
                                  epochs=model_config['epochs'],
                                  max_queue_size=10,
                                  validation_data=val_gen,
                                  validation_freq=1,
                                  callbacks=callbacks,
                                  use_multiprocessing=False,
                                  workers=1,
                                  verbose=1)
def main():
    """Extract x-vectors for Kaldi-style data dirs (feats.scp + vad.scp) and
    write one text-format vector file per utterance into `args.emb_out_dirs`.

    MFCC features are CMVN-normalized and filtered by the VAD mask before
    being fed to the network. Uses module-level `args`, `ut`, `kaldi_io`, `toolkits`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    # ==================================
    #     Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (23, None, 1),  # 23-dim MFCC input, variable length
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    utt2ark, utt2idx, all_list, utt2data = {}, {}, [], {}
    for idx, kaldi_data_dir in enumerate(args.kaldi_data_dirs):
        if not os.path.exists(args.emb_out_dirs[idx]):
            os.makedirs(args.emb_out_dirs[idx])
        feats_path = os.path.join(kaldi_data_dir, 'feats.scp')
        vad_path = os.path.join(kaldi_data_dir, 'vad.scp')
        assert os.path.exists(feats_path), 'Path `{}` does not exists.'.format(feats_path)
        # scp lines have the form "<utt-key> <ark-file>:<byte-offset>".
        with open(feats_path) as f:
            for line in f:
                key, ark = line.split()
                ark, position = ark.split(':')
                input_tuple = (key, ark, int(position))
                utt2data[key] = ut.load_data(input_tuple, mode='eval')
                utt2idx[key] = idx
        with open(vad_path) as f:
            for line in f:
                key, ark = line.split()
                ark, position = ark.split(':')
                # Scan the ark for the matching key; last match wins.
                vad_array = None
                for ark_key, vec in kaldi_io.read_vec_flt_ark(ark):
                    if key == ark_key:
                        vad_array = np.array(vec, dtype=bool)
                assert vad_array is not None
                assert vad_array.size == utt2data[key].shape[1], 'Shapes does not fit: vad {}, mfcc {}'.format(
                    vad_array.size, utt2data[key].shape[1])
                # Sliding-window CMVN, then keep only voiced frames.
                utt2data[key] = ut.apply_cmvn_sliding(utt2data[key]).T[vad_array]

    # ==> load pre-trained model ???
    if os.path.isfile(args.resume):
        network_eval.load_weights(os.path.join(args.resume), by_name=True)
        print('==> successfully loaded model {}.'.format(args.resume))
    else:
        raise IOError("==> no checkpoint found at '{}'".format(args.resume))

    print('==> start testing.')
    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    for idx, utt in enumerate(utt2data):
        # (frames, feat) -> (1, feat, frames, 1) for the CNN, then squeeze the embedding.
        embedding = network_eval.predict(utt2data[utt].T[np.newaxis, :, :, np.newaxis]).squeeze()
        ut.write_txt_vectors(
            os.path.join(args.emb_out_dirs[utt2idx[utt]], 'xvector.{}.txt'.format(idx)),
            {utt: embedding})
def main():
    """Train the VGGVox ResNet on VoxCeleb2 (5994 speakers), with an optional
    OHEM path, TensorBoard logging and periodic checkpointing.

    Uses module-level `args`, `set_path`, `step_decay`, `toolkits`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    import generator
    import keras

    # ==================================
    #     Get Train/Val.
    # ==================================
    trnlist, trnlb = toolkits.get_voxceleb2_datalist(
        args, path='../meta/vox2_train_wav.txt')
    vallist, vallb = toolkits.get_voxceleb2_datalist(
        args, path='../meta/vox2_val_wav.txt')

    # construct the data generator.
    params = {
        'dim': (257, 250, 1),  # fixed 250-frame spectrogram crops for training
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'batch_size': args.batch_size,
        'shuffle': True,
        'normalize': True,
    }

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=args)

    # ==> load pre-trained model ???
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        if os.path.isfile(args.resume):
            if mgpu == 1:
                network.load_weights(os.path.join(args.resume))
            else:
                # On multi-GPU the real model is wrapped; load into the inner layer.
                network.layers[mgpu + 1].load_weights(os.path.join(
                    args.resume))
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              args.loss, args.aggregation_mode, args.ohem_level))

    model_path, log_path = set_path(args)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path,
                                              histogram_freq=0,
                                              write_graph=True,
                                              write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [
        # NOTE(review): filename records `acc` but monitor is `loss` — confirm
        # both metrics are available and this pairing is intended.
        keras.callbacks.ModelCheckpoint(os.path.join(
            model_path, 'weights-{epoch:02d}-{acc:.3f}.h5'),
                                        monitor='loss',
                                        mode='min',
                                        save_best_only=True),
        normal_lr, tbcallbacks
    ]

    if args.ohem_level > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // args.batch_size)
        iters_per_epoch = int(
            len(partition['train']) // (args.ohem_level * args.batch_size))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes'])

        A = ohem_generator.next(
        )  # for some reason, I need to warm up the generator

        network.fit_generator(generator.OHEM_generator(
            network, trn_gen, iters_per_epoch, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes']),
                              steps_per_epoch=iters_per_epoch,
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
    else:
        network.fit_generator(trn_gen,
                              steps_per_epoch=int(
                                  len(partition['train']) // args.batch_size),
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
def main(wav_path, embedding_per_second=1.0, n_classes=5994, overlap_rate=0.5, plot_results=True):
    """Diarize `wav_path` with VGGVox embeddings + uisrnn, print verbose debug
    output, optionally plot the result, and return the intermediates.

    Returns (feats, predicted_label, intervals, speaker_assingments,
    time_spec_rate) where speaker_assingments is a list of
    (start_str, end_str, speaker, wav_path) tuples.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        # NOTE(review): hard-coded 5994 — the `n_classes` parameter is ignored.
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # model_args, _, inference_args = uisrnn.parse_arguments()
    # NOTE(review): the parse_arguments call above is commented out, so
    # `model_args` and `inference_args` must exist at module level — verify.
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))
    # print('mapTable, keys', mapTable, keys)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        # print('v',v.shape)
        # print('feats', feats.shape)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)  # NOTE(review): unused
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)
    for spk, timeDicts in speakerSlice.items(
    ):  # time map to orgin wav(contains mute)
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
                    print('i,s,e')
                    print(i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assingments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(
                start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assingments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assingments, time_spec_rate
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path` with VGGVox embeddings + a UISRNN model (loaded via
    the local `arguments` module) and show an interactive diarization plot.

    Prints each speaker's time slices mapped back to original wav time.
    Uses module-level `args`, `spkModel`, `arguments`, `UISRNN`,
    `SAVED_MODEL_NAME`, `load_data`, `genMap`, `arrangeResult`, `fmtTime`, `PlotDiar`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = arguments.parse_arguments()
    model_args.observation_dim = 512  # must match embedding size
    diarization_Model = UISRNN(model_args)
    diarization_Model.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = diarization_Model.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)  # NOTE(review): unused
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    for spk, timeDicts in speakerSlice.items(
    ):  # time map to orgin wav(contains mute)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, pick=True, size=(25, 6))
    p.draw()
    p.plot.show()
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path` with VGGVox embeddings clustered by spectral
    clustering (instead of uisrnn), print per-speaker slices and plot them.

    Sets the module-level `no_speakers` global to the number of clusters found.
    Uses module-level `args`, `spkModel`, `uisrnn`, `SAVED_MODEL_NAME`,
    `load_data`, `genMap`, `arrangeResult`, `fmtTime`, `SpectralClusterer`, `PlotDiar`.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'], num_class=params['n_classes'], mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # NOTE(review): the uisrnn model is loaded but its prediction below is
    # commented out — clustering is done with SpectralClusterer instead.
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    #specs1,interval1 = load_data(r'wavs/REC20190716140159.wav', embedding_per_second=1.2, overlap_rate=0.4)
    #mapTable1,keys1 =genMap(interval1)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    # =============================================================================
    # for spec1 in specs1:
    #     spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
    #     v = network_eval.predict(spec1)
    #     feats += [v]
    # =============================================================================
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    #print(len(feats),'00000000')
    #predicted_label = uisrnnModel.predict(feats, inference_args)

    # (Large commented-out experiment removed for readability: repeated KMeans
    # over k in [2,5) choosing k by silhouette score, plus a KMeans+TSNE
    # visualization block — superseded by the SpectralClusterer call below.)

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    global no_speakers
    no_speakers = len(set(predicted_label))
    #print(predicted_label,'**************************')

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)  # NOTE(review): unused
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    for spk, timeDicts in speakerSlice.items(
    ):  # time map to orgin wav(contains mute)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
def main(src_path=r'/data/dataset/SpkWav120', output_file='training_data'):
    """Extract per-utterance speaker embeddings and save them for training.

    Walks ``src_path`` (one sub-directory per speaker, each holding an
    ``audio`` folder of wav files), computes a VGGVox embedding for every
    utterance slice, and stores the results with ``np.savez``.

    Args:
        src_path: root directory of the speaker dataset, one directory per
            speaker (default keeps the original hard-coded path).
        output_file: basename handed to ``np.savez`` (``.npz`` is appended).

    Raises:
        IOError: if ``args.resume`` is empty or does not point to a file.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    import model

    # NOTE(review): removed the unused total_list/unique_list scan of
    # args.data_path — it was dead code with no effect on the output.

    # Spectrogram / network hyper-parameters; these must match the
    # pre-trained checkpoint being loaded below.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'min_slice': 720,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load the pre-trained model; a checkpoint is mandatory in eval mode.
    if not args.resume:
        raise IOError('==> please type in the model to load')
    if not os.path.isfile(args.resume):
        raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    # os.path.join with a single argument was a no-op; pass the path directly.
    network_eval.load_weights(args.resume, by_name=True)
    print('==> successfully loading model {}.'.format(args.resume))

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    train_cluster_id = []
    train_sequence = []
    for i, spk_dir in enumerate(sorted(os.listdir(src_path))):
        # Each speaker's directory; the directory name is the speaker label.
        spk = spk_dir
        wav_path = os.path.join(src_path, spk_dir, 'audio')
        print('Processing speaker({}) : {}'.format(i, spk))
        for wav in os.listdir(wav_path):  # wavfile
            utter_path = os.path.join(wav_path, wav)
            specs = load_data(utter_path,
                              split=True,
                              win_length=params['win_length'],
                              sr=params['sampling_rate'],
                              hop_length=params['hop_length'],
                              n_fft=params['nfft'],
                              min_slice=params['min_slice'])
            if len(specs) < 1:
                # Utterance too short to yield a single slice — skip it.
                continue
            feats = []
            for spec in specs:
                # Add batch and channel axes expected by the network.
                spec = np.expand_dims(np.expand_dims(spec, 0), -1)
                feats.append(network_eval.predict(spec))
            feats = np.array(feats)[:, 0, :]  # [splits, embedding dim]
            train_cluster_id.append([spk] * feats.shape[0])
            train_sequence.append(feats)

    np.savez(output_file,
             train_sequence=train_sequence,
             train_cluster_id=train_cluster_id)
def __init__(self):
    """Build the inference engine: configure the GPU and load the
    pre-trained VGGVox speaker-embedding model.

    NOTE(review): the command-line style arguments below are hard-coded,
    so any flags given on the real command line are ignored by this
    class — confirm that is intended.

    Raises:
        IOError: if the hard-coded checkpoint path is missing or invalid.
    """
    # Cache mapping wav filename -> embedding, filled elsewhere.
    self.filename2embedding = {}
    arguments = "--net resnet34s --gpu 0 --ghost_cluster 2 --vlad_cluster 8 --loss softmax " \
                "--resume " \
                "/media/ben/datadrive/Software/VGG-Speaker-Recognition/model/gvlad_softmax" \
                "/2020-11-15_resnet34s_bs16_adam_lr0.001_vlad8_ghost2_bdim512_ohemlevel0" \
                "/weights-42-0.931.h5 --data_path " \
                "/media/ben/datadrive/Zalo/voice-verification/Train-Test-Data/dataset/".split()

    parser = argparse.ArgumentParser()
    # set up training configuration.
    parser.add_argument("--gpu", default="", type=str)
    parser.add_argument("--resume", default="", type=str)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--data_path",
                        default="/media/weidi/2TB-2/datasets/voxceleb1/wav",
                        type=str)
    # set up network configuration.
    parser.add_argument("--net", default="resnet34s",
                        choices=["resnet34s", "resnet34l"], type=str)
    parser.add_argument("--ghost_cluster", default=2, type=int)
    parser.add_argument("--vlad_cluster", default=8, type=int)
    parser.add_argument("--bottleneck_dim", default=512, type=int)
    parser.add_argument("--aggregation_mode", default="gvlad",
                        choices=["avg", "vlad", "gvlad"], type=str)
    # set up learning rate, training loss and optimizer.
    parser.add_argument("--loss", default="softmax",
                        choices=["softmax", "amsoftmax"], type=str)
    parser.add_argument("--test_type", default="normal",
                        choices=["normal", "hard", "extend"], type=str)

    # The parsed namespace is published as a module-level global because the
    # model-construction helpers read it from there.
    global args
    args = parser.parse_args(arguments)

    # gpu configuration
    toolkits.initialize_GPU(args)

    # Fixed: the original called .format(args.test_type) on a string with no
    # placeholder, silently dropping the value from the message.
    print("==> Initialising inference engine ({})...".format(args.test_type))

    # Spectrogram / network hyper-parameters; must match the checkpoint.
    self.params = {
        "dim": (257, None, 1),
        "nfft": 512,
        "spec_len": 250,
        "win_length": 400,
        "hop_length": 160,
        "n_classes": 5994,
        "sampling_rate": 16000,
        "normalize": True,
    }
    self.network_eval = model.vggvox_resnet2d_icassp(
        input_dim=self.params["dim"],
        num_class=self.params["n_classes"],
        mode="eval",
        args=args)

    # ==> load the pre-trained model; a checkpoint is mandatory in eval mode.
    if not args.resume:
        raise IOError("==> please type in the model to load")
    if not os.path.isfile(args.resume):
        raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    # os.path.join with a single argument was a no-op; pass the path directly.
    self.network_eval.load_weights(args.resume, by_name=True)
    print("==> successfully loading model {}.".format(args.resume))

    print("==> start testing.")