Пример #1
0
def get_id_result():
    """Identify the speaker of each test sample and write the scores to CSV.

    Loads the weights in ``c.WEIGHTS_FILE`` into a VGGVox model, computes
    embeddings for the entries of ``c.ENROLL_LIST_FILE`` and
    ``c.TEST_LIST_FILE``, and builds a distance table (``c.COST_METRIC``)
    with one row per test embedding and one column per enrolled entry.
    Each test row is labelled with the column at minimum distance and
    compared with its ``test_speaker`` value. The table is written to
    ``c.RESULT_FILE``; nothing is returned.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples....")
    enroll_result = get_embeddings_from_list_file(model, c.ENROLL_LIST_FILE, c.MAX_SEC)
    enroll_embs = np.array([emb.tolist() for emb in enroll_result['embedding']])
    speakers = enroll_result['speaker']

    print("Processing test samples....")
    test_result = get_embeddings_from_list_file(model, c.TEST_LIST_FILE, c.MAX_SEC)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    distances = pd.DataFrame(cdist(test_embs, enroll_embs, metric=c.COST_METRIC), columns=speakers)

    scores = pd.read_csv(c.TEST_LIST_FILE, delimiter=",", header=0, names=['test_file', 'test_speaker'])
    scores = pd.concat([scores, distances], axis=1)
    scores['result'] = scores[speakers].idxmin(axis=1)
    scores['correct'] = (scores['result'] == scores['test_speaker']) * 1.  # bool to float

    print("Writing outputs to [{}]....".format(c.RESULT_FILE))
    result_dir = os.path.dirname(c.RESULT_FILE)
    # dirname is '' for a bare file name, and os.makedirs('') raises; exist_ok
    # also removes the race between the existence check and the creation.
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    # newline='' stops the text layer from translating the line endings
    # that the CSV writer already emits.
    with open(c.RESULT_FILE, 'w', newline='') as f:
        scores.to_csv(f, index=False)
Пример #2
0
def RT_CNN():
    """Record audio, embed it with VGGVox and print the closest enrolled speaker.

    Enrolled embeddings are read from ``data/model/RTSP_CNN.out`` (a mapping
    of speaker -> list of embeddings). Three chunks are recorded through
    ``AudioBuffer``; the samples obtained after the last chunk are turned
    into a normalised FFT spectrum, truncated to the largest fitting bucket
    and passed through the model. The speaker whose enrolled embedding has
    the smallest Euclidean distance to the result is printed, followed by
    the elapsed time.

    Raises:
        ValueError: if the enrollment file contains no embeddings.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Loading embeddings from enroll")
    enrolled = load("data/model/RTSP_CNN.out")
    enroll_embs = []
    speakers = []
    for spk, embs in enrolled.items():
        for e in embs:
            enroll_embs.append(e)
            speakers.append(spk)
        print(spk)
    if not enroll_embs:
        # Fail before recording instead of inside np.argmin afterwards.
        raise ValueError("No enrolled embeddings found in data/model/RTSP_CNN.out")

    record_chunks = 3
    recorder = AudioBuffer()

    start_time = time.time()
    for _ in range(record_chunks):
        recorder.record(chunk_size=c.SAMPLE_RATE)
        data = recorder.get_data()
    # frombuffer yields an int16 view that may be read-only; a float copy
    # lets the in-place scaling below neither overflow nor fail.
    data = np.frombuffer(data, 'int16').astype(np.float64)
    buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)

    # NOTE(review): the samples are already in int16 range here; confirm
    # whether this extra scaling is intended.
    data *= 2**15

    # Zero-pad in one step until the signal spans at least 101 frame steps.
    samples_per_step = c.FRAME_STEP * c.SAMPLE_RATE
    shortfall = int(np.ceil(101 * samples_per_step)) - len(data)
    if shortfall > 0:
        data = np.concatenate([data, np.zeros(shortfall, dtype=data.dtype)])

    # get FFT spectrum
    data = remove_dc_and_dither(data, c.SAMPLE_RATE)
    data = sigproc.preemphasis(data, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(data,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes, keeping the centre of the spectrum
    rsize = max(k for k in buckets if k <= len(fft_norm.T))
    rstart = int((len(fft_norm.T) - rsize) / 2)
    x = fft_norm[:, rstart:rstart + rsize]

    test_embs = np.squeeze(model.predict(x.reshape(1, *x.shape, 1)))
    distances = [euclidean(test_embs, emb) for emb in enroll_embs]

    print(len(speakers))

    idx = np.argmin(distances)

    print(speakers[idx])
    # The subtraction presumably discounts the recording time, assuming one
    # second per recorded chunk -- TODO confirm.
    print("Ok, ", time.time() - start_time - record_chunks, " seconds")
Пример #3
0
def train_vggvox_model(train_list_file):
    """Train a new VGGVox model on the samples listed in ``train_list_file``.

    The model is compiled with RMSprop and categorical cross-entropy, fitted
    for 100 epochs of ``c.TRAIN_NUM`` steps on batches produced by ``gene``,
    and its weights are saved to ``c.PERSONAL_WEIGHT``. The lowest loss
    recorded in the training history is printed at the end.
    """
    model = vggvox_model()
    samples = get_train_list(train_list_file)

    # Compile the model: categorical cross-entropy as the loss function and
    # accuracy as the reported metric.
    rmsprop = optimizers.RMSprop(lr=0.1)
    model.compile(optimizer=rmsprop,
                  loss="categorical_crossentropy",
                  metrics=['acc'])

    # Input format (most important): the original author checked it with a
    # random (1, 512, 30, 1) input and a one-hot label of length 1251
    # reshaped to (1, 1, 1, 1251).
    def _as_network_input(spectrum):
        # Add a leading and a trailing axis of size one around the sample.
        return spectrum.reshape(1, *spectrum.shape, 1)

    def _as_network_target(one_hot):
        return one_hot.reshape((1, 1, 1, 1251))

    samples["voice"] = samples["voice"].apply(_as_network_input)
    samples["lable"] = samples["lable"].apply(_as_network_target)

    print("Start training...")
    batches = gene(samples)
    history = model.fit_generator(batches,
                                  epochs=100,
                                  steps_per_epoch=c.TRAIN_NUM)
    model.save_weights(filepath=c.PERSONAL_WEIGHT)

    lowest_loss = min(history.history["loss"])
    print("loss: ", lowest_loss)
    print("Done!")
Пример #4
0
def batch_offline_test():
    """Run offline identification and record top-1% / 5% / 10% correctness.

    Embeds the enroll and test lists with the VGGVox model loaded from
    ``c.WEIGHTS_FILE``, computes the test-vs-enroll distance table
    (``c.COST_METRIC``) and ranks the enrolled columns per test row by
    ascending distance. The best ``result_k`` candidates are stored, and
    the cumulative columns ``correct_top_1%``, ``correct_top_5%`` and
    ``correct_top_10%`` say whether the true speaker is among the first
    1%, 5% and 10% of candidates (at least one candidate each). The table
    is written to ``c.OFFLINE_RESULT_FILE``; nothing is returned.
    """
    print("Loading model for batch offline test from [{}]....".format(
        c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples in [{}]....".format(c.ENROLL_WAV_DIR))
    enroll_result = forward_offline(model, c.ENROLL_WAV_DIR,
                                    c.ENROLL_LIST_FILE, c.MAX_SEC_ENROLL)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])
    speakers = enroll_result['speaker']

    print("Processing test samples in [{}]....".format(c.TEST_WAV_DIR))
    test_result = forward_offline(model, c.TEST_WAV_DIR, c.TEST_LIST_FILE,
                                  c.MAX_SEC_TEST)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    distances = pd.DataFrame(cdist(test_embs,
                                   enroll_embs,
                                   metric=c.COST_METRIC),
                             columns=speakers)

    # Number of ranked candidates in the top 1% / 5% / 10% (never below one).
    num_speakers_top_1 = max(int(len(speakers) / 100), 1)
    num_speakers_top_5 = max(int(len(speakers) * 5 / 100), 1)
    num_speakers_top_10 = max(int(len(speakers) * 10 / 100), 1)

    # Rank the columns per row by ascending distance. The labels are looked
    # up through a plain ndarray because indexing a pandas Index with a 2-D
    # array is not supported by recent pandas versions.
    ranking = distances.values.argsort(1)[:, :num_speakers_top_10]
    results = pd.DataFrame(np.asarray(distances.columns)[ranking],
                           index=distances.index)
    results = results.rename(columns=lambda x: 'result_{}'.format(x + 1))

    scores = pd.read_csv(c.TEST_LIST_FILE,
                         delimiter=",",
                         header=0,
                         names=['test_file', 'test_speaker'])
    scores = pd.concat([scores, distances, results], axis=1)
    scores['correct'] = (scores['result_1']
                         == scores['test_speaker']) * 1.  # bool to float

    correct = scores['correct']
    for i in range(1, num_speakers_top_10 + 1):
        hit = scores['result_{}'.format(i)] == scores['test_speaker']
        correct = np.logical_or(correct, hit) * 1.
        # Independent checks: with few speakers the cut-offs coincide, and
        # an elif chain would then leave the later columns unwritten.
        if i == num_speakers_top_1:
            scores['correct_top_1%'] = correct
        if i == num_speakers_top_5:
            scores['correct_top_5%'] = correct
        if i == num_speakers_top_10:
            scores['correct_top_10%'] = correct

    # output
    print("Writing outputs to [{}]....".format(c.OFFLINE_RESULT_FILE))
    with open(c.OFFLINE_RESULT_FILE, c.OFFLINE_RESULT_WRITE_OPTION) as f:
        scores.to_csv(f, index=False)
def train_vggvox_model(model_load_path, model_save_path, continue_training,
                       save_model):
    """Train (or resume training of) the VGGVox model.

    Args:
        model_load_path: path of a saved model, loaded when
            ``continue_training == 1``.
        model_save_path: destination of the full model, written when
            ``save_model == 1``.
        continue_training: ``1`` to resume from ``model_load_path``; any
            other value builds and compiles a fresh model.
        save_model: ``1`` to also save the full model after training.

    The weights are always saved to ``c.PERSONAL_WEIGHT``, and the loss and
    accuracy histories are passed to ``tools.draw_loss_img`` /
    ``tools.draw_acc_img`` with ``c.LOSS_PNG`` / ``c.ACC_PNG``.
    """
    audiolist, labellist = tools.get_voxceleb1_datalist(
        c.FA_DIR, c.VERI_TRAIN_LIST_FILE)
    train_gene = tools.DataGenerator(audiolist, labellist, c.DIM, c.MAX_SEC,
                                     c.BUCKET_STEP, c.FRAME_STEP, c.BATCH_SIZE,
                                     c.N_CLASS)

    if continue_training != 1:
        model = vggvox_model()
        # Compile the model: categorical cross-entropy as the loss function
        # and accuracy as the reported metric.
        adam = optimizers.Adam(lr=c.LR,
                               beta_1=0.9,
                               beta_2=0.999,
                               epsilon=1e-08,
                               decay=0.0)
        model.compile(optimizer=adam,
                      loss="categorical_crossentropy",
                      metrics=['acc'])
    else:
        print("load model from {}...".format(model_load_path))
        model = load_model(model_load_path)

    tensorboard = keras.callbacks.TensorBoard(
        log_dir=c.TENSORBOARD_LOG_PATH,
        histogram_freq=0,
        write_graph=True,
        write_images=False,
        update_freq=c.BATCH_SIZE * 10000)
    # Checkpoint the full model every 5 epochs, only when the loss improves.
    checkpoint_path = os.path.join(
        c.VERI_MODEL_FA_PATH,
        'veri_model_128_{epoch:02d}_{loss:.3f}_{acc:.3f}.h5')
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='loss',
                                                 mode='min',
                                                 save_best_only=True,
                                                 save_weights_only=False,
                                                 period=5)

    print("Start training...")
    steps = int(len(labellist) // c.BATCH_SIZE)
    history = model.fit_generator(train_gene,
                                  epochs=c.EPOCHS,
                                  steps_per_epoch=steps,
                                  callbacks=[checkpoint, tensorboard])

    print("save weights to {}...".format(c.PERSONAL_WEIGHT))
    model.save_weights(filepath=c.PERSONAL_WEIGHT, overwrite=True)
    if save_model == 1:
        print("save model to {}...".format(model_save_path))
        model.save(model_save_path, overwrite=True)

    training_log = history.history
    tools.draw_loss_img(training_log, c.LOSS_PNG)
    tools.draw_acc_img(training_log, c.ACC_PNG)
    print("Done!")
Пример #6
0
def test():
    """Print the Euclidean distance between the embeddings of two WAV files.

    Both files are processed with ``read_and_process_audio`` using the same
    buckets and embedded with the model weights stored in
    ``data/model_weights/model_0.h5``.
    """
    wav_paths = ("data/wav/file1.wav", "data/wav/file2.wav")
    model = vggvox_model()
    model.load_weights("data/model_weights/model_0.h5")
    buckets = build_buckets(c.MAX_SEC_TEST, c.BUCKET_STEP_SEC)

    embeddings = []
    for wav_path in wav_paths:
        spec = read_and_process_audio(wav_path, buckets)
        # Add a leading and a trailing axis of size one before predicting.
        embeddings.append(model.predict(spec.reshape(1, *spec.shape, 1)))

    first, second = embeddings
    print(np.linalg.norm(first - second))
Пример #7
0
def get_id_result():
    """Identify test speakers, with and without an "Unknown" threshold.

    Loads the VGGVox weights from ``c.WEIGHTS_FILE``, embeds the enroll
    list, and dumps the enrolled embeddings grouped by speaker to
    ``data/model/RTSP_CNN.out``. The test list is then embedded and every
    test row is labelled with the enrolled column at minimum distance
    (``c.COST_METRIC``). A second label, ``result_threshold``, becomes
    ``"Unknown"`` when that minimum distance exceeds 0.16.

    Returns:
        The scores DataFrame: the test list columns, one distance column
        per enrolled entry, plus ``result``, ``result_threshold``,
        ``correct`` and ``correct_threshold``.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()  # Creates a VGGVox model
    model.load_weights(c.WEIGHTS_FILE)  # Load the trained weights
    model.summary()  # Print a summary of the loaded model

    print("Processing enroll samples....")
    enroll_result = get_embeddings_from_list_file(model, c.ENROLL_LIST_FILE,
                                                  c.MAX_SEC)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])
    speakers = enroll_result['speaker']

    # Group the enrolled embeddings by speaker and persist them.
    toSave = defaultdict(list)
    for spk, emb in zip(speakers, enroll_embs):
        toSave[spk].append(emb)
    dump(toSave, "data/model/RTSP_CNN.out")

    start_time = time.time()
    print("Processing test samples....")
    test_result = get_embeddings_from_list_file(model, c.TEST_LIST_FILE,
                                                c.MAX_SEC)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    # Distance between each test embedding (rows) and enrolled one (columns).
    distances = pd.DataFrame(
        cdist(test_embs, enroll_embs, metric=c.COST_METRIC), columns=speakers)

    scores = pd.read_csv(c.TEST_LIST_FILE,
                         delimiter=",",
                         header=0,
                         names=['test_file', 'test_speaker'])
    scores = pd.concat([scores, distances], axis=1)

    # Select the distance columns once instead of once per row.
    per_speaker = scores[speakers]
    scores['result'] = per_speaker.idxmin(axis=1)

    print(time.time() - start_time, " seconds")

    # Reject matches whose best distance is above the threshold.
    unknown_threshold = 0.16
    result = scores['result'].copy()
    result[per_speaker.min(axis=1) > unknown_threshold] = "Unknown"
    scores['result_threshold'] = result

    scores['correct'] = (scores['result']
                         == scores['test_speaker']) * 1.  # bool to float
    scores['correct_threshold'] = (scores['result_threshold'] ==
                                   scores['test_speaker']) * 1.  # bool to float
    # Returned so the computed table is not discarded; callers that ignore
    # the return value are unaffected.
    return scores
Пример #8
0
def retrain(x_train, y_train):
    """Build the modified network on the pre-trained VGGVox model and train it.

    The baseline weights come from ``c.WEIGHTS_FILE``; the wrapped model is
    handed to ``train_for_classification`` together with the training data.

    Args:
        x_train: training inputs, forwarded to ``train_for_classification``.
        y_train: training targets, forwarded to ``train_for_classification``.

    Returns:
        The model created by ``vggvox_mod_model`` after the training call.
    """
    weights_path = c.WEIGHTS_FILE
    print("Loading model weights from [{}]....".format(weights_path))

    pretrained = vggvox_model()
    pretrained.load_weights(weights_path)

    print("Creating base network ...")
    classifier = vggvox_mod_model(pretrained)
    classifier.summary()

    train_for_classification(classifier, x_train, y_train)
    return classifier
Пример #9
0
def retrain(x_train, y_train):
    """Train the modified VGGVox network on ``x_train`` / ``y_train``.

    A baseline VGGVox model is created and loaded with the weights in
    ``c.WEIGHTS_FILE``, wrapped by ``vggvox_mod_model`` and passed to
    ``train_for_classification``. The wrapped model is returned.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    backbone = vggvox_model()
    backbone.load_weights(c.WEIGHTS_FILE)

    print("Creating base network ...")
    network = vggvox_mod_model(backbone)
    network.summary()

    train_for_classification(network, x_train, y_train)

    # NOTE: a siamese stage (siamese_network / train_siamese) was sketched
    # after this step by the original author but is disabled.
    return network
Пример #10
0
def verify(opt):
    """Compare one test recording with one enrolled recording.

    Args:
        opt: options object providing ``input`` (enroll wav path), ``test``
            (test wav path), ``metric`` (metric passed to ``cdist``) and
            ``threshold`` (a distance below it counts as a match).

    Embeds both files with the VGGVox model loaded from ``c.WEIGHTS_FILE``,
    computes their distance, prints a one-row table holding the file names,
    metric, distance, threshold and boolean ``result``, and writes that
    table to ``c.RESULT_FILE``. Nothing is returned.
    """
    input_wav_path = opt.input
    test_wav_path = opt.test
    metric_fn = opt.metric
    threshold = opt.threshold

    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples....")
    enroll_result = get_embeddings_from_file(model, input_wav_path, c.MAX_SEC)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])

    print("Processing test samples....")
    test_result = get_embeddings_from_file(model, test_wav_path, c.MAX_SEC)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    # The single 'distance' column requires exactly one enrolled embedding.
    distances = pd.DataFrame(cdist(test_embs, enroll_embs, metric=metric_fn),
                             columns=['distance'])

    scores = pd.DataFrame({
        # basename handles every separator the platform accepts, unlike a
        # hard-coded split on '/'.
        'input': [os.path.basename(input_wav_path)],
        'test': [os.path.basename(test_wav_path)]
    })
    scores['metric'] = metric_fn
    scores = pd.concat([scores, distances], axis=1)
    scores['threshold'] = threshold
    scores['result'] = scores['distance'] < threshold
    print(scores)

    print("Writing outputs to [{}]....".format(c.RESULT_FILE))
    result_dir = os.path.dirname(c.RESULT_FILE)
    # dirname is '' for a bare file name, and os.makedirs('') raises; exist_ok
    # also removes the race between the existence check and the creation.
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    # newline='' stops the text layer from translating the line endings
    # that the CSV writer already emits.
    with open(c.RESULT_FILE, 'w', newline='') as f:
        scores.to_csv(f, index=False)
Пример #11
0
def get_id_result():
    """Evaluate the modified VGGVox classifier on the test list.

    FFT features and labels are read from ``c.ENROLL_LIST_FILE`` (training)
    and ``c.TEST_LIST_FILE`` (test), and the labels are one-hot encoded
    with ``c.NUM_CLASSES`` classes. Depending on ``c.RETRAIN`` the model is
    either retrained on the training data or rebuilt, loaded from
    ``c.VGGM_WEIGHTS_FILE`` and compiled. The first three values returned
    by ``model.evaluate`` are printed.
    """
    # Training and testing pairs.
    train_pair = get_fft_features_from_list_file(c.ENROLL_LIST_FILE, c.MAX_SEC)
    x_train, y_train = train_pair
    test_pair = get_fft_features_from_list_file(c.TEST_LIST_FILE, c.MAX_SEC)
    x_test, y_test = test_pair

    y_train = to_categorical(y_train, num_classes=c.NUM_CLASSES)
    y_test = to_categorical(y_test, num_classes=c.NUM_CLASSES)

    print("Y_train.shape: ", y_train.shape)

    if not c.RETRAIN:
        # Rebuild the network and restore previously saved weights.
        baseline_model = vggvox_model()
        model = vggvox_mod_model(baseline_model)
        model.load_weights(c.VGGM_WEIGHTS_FILE)
        model = compile_model(model)
    else:
        model = retrain(x_train, y_train)

    score = model.evaluate(x_test, y_test, verbose=1)
    loss, top_1, top_5 = score[0], score[1], score[2]
    summary = "loss: {}, top-1 accuracy (/%): {}, top-5 accuracy (/%): {}"
    print(summary.format(loss, top_1, top_5))
Пример #12
0
def offline_test():
    """Run offline speaker identification and write the scores to CSV.

    Embeds the enroll and test lists with the VGGVox model loaded from
    ``c.WEIGHTS_FILE``, labels each test row with the enrolled column at
    minimum ``c.COST_METRIC`` distance, marks whether that label equals
    ``test_speaker`` and writes the table to ``c.OFFLINE_RESULT_FILE``
    opened with mode ``c.OFFLINE_RESULT_WRITE_OPTION``.
    """
    weights_path = c.WEIGHTS_FILE
    print("Loading model for offline test from [{}]....".format(weights_path))
    net = vggvox_model()
    net.load_weights(weights_path)
    net.summary()

    print("Processing enroll samples in [{}]....".format(c.ENROLL_WAV_DIR))
    enrolled = forward_offline(net, c.ENROLL_WAV_DIR, c.ENROLL_LIST_FILE,
                               c.MAX_SEC_ENROLL)
    enroll_matrix = np.array([vec.tolist() for vec in enrolled['embedding']])
    speakers = enrolled['speaker']

    print("Processing test samples in [{}]....".format(c.TEST_WAV_DIR))
    tested = forward_offline(net, c.TEST_WAV_DIR, c.TEST_LIST_FILE,
                             c.MAX_SEC_TEST)
    test_matrix = np.array([vec.tolist() for vec in tested['embedding']])

    print("Comparing test samples against enroll samples....")
    pairwise = cdist(test_matrix, enroll_matrix, metric=c.COST_METRIC)
    distances = pd.DataFrame(pairwise, columns=speakers)

    test_list = pd.read_csv(c.TEST_LIST_FILE,
                            delimiter=",",
                            header=0,
                            names=['test_file', 'test_speaker'])
    scores = pd.concat([test_list, distances], axis=1)
    predicted = scores[speakers].idxmin(axis=1)
    scores['result'] = predicted
    scores['correct'] = (predicted == scores['test_speaker']) * 1.  # bool to float

    print("Writing outputs to [{}]....".format(c.OFFLINE_RESULT_FILE))
    with open(c.OFFLINE_RESULT_FILE, c.OFFLINE_RESULT_WRITE_OPTION) as out:
        scores.to_csv(out, index=False)
Пример #13
0
def online_test():
    """Repeatedly record from the microphone and identify the speaker.

    Enrolled embeddings are computed once. Then, in an endless loop, a clip
    of ``c.ONLINE_RECORD_SEC`` seconds is recorded, saved to
    ``c.ONLINE_WAV_FILE``, embedded and compared with every enrolled
    embedding using ``c.COST_METRIC``. Each round prints the distances and
    the closest speaker and appends one CSV row to ``c.ONLINE_RESULT_FILE``.
    The loop only ends through an exception (e.g. ``KeyboardInterrupt``);
    the PyAudio instance is terminated in that case.

    Raises:
        ValueError: if ``c.COST_METRIC`` is neither ``"euclidean"`` nor
            ``"cosine"``, or if no speaker is enrolled.
    """
    # Resolve the metric up front: an invalid value used to be reported by a
    # print only, after which the comparison hit an undefined distance.
    metric_fns = {"euclidean": euclidean, "cosine": cosine}
    if c.COST_METRIC not in metric_fns:
        raise ValueError("Invalid cost metric [{}]".format(c.COST_METRIC))
    distance = metric_fns[c.COST_METRIC]

    print("Loading model for online test from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples in [{}]....".format(c.ENROLL_WAV_DIR))
    enroll_result = forward_offline(model, c.ENROLL_WAV_DIR,
                                    c.ENROLL_LIST_FILE, c.MAX_SEC_ENROLL)
    speakers = enroll_result['speaker']
    if len(speakers) == 0:
        raise ValueError("No enrolled speakers to compare against")

    with open(c.ONLINE_RESULT_FILE, c.ONLINE_RESULT_WRITE_OPTION) as f:
        f.write("condition,test_speaker,{},result,correct\n".format(
            ','.join(speakers)))

    num_reads = int(c.SAMPLE_RATE / c.CHUNK * c.ONLINE_RECORD_SEC)

    p = pyaudio.PyAudio()
    try:
        while True:
            # Record
            stream = p.open(format=c.FORMAT,
                            channels=c.NUM_CHANNEL,
                            rate=c.SAMPLE_RATE,
                            input=True,
                            frames_per_buffer=c.CHUNK)
            try:
                print("\nStart speaking")
                frames = [stream.read(c.CHUNK) for _ in range(num_reads)]
                print("Done recording")
            finally:
                # Release the input stream even if a read fails.
                stream.stop_stream()
                stream.close()

            # Save audio
            with wave.open(c.ONLINE_WAV_FILE, 'wb') as wf:
                wf.setnchannels(c.NUM_CHANNEL)
                wf.setsampwidth(p.get_sample_size(c.FORMAT))
                wf.setframerate(c.SAMPLE_RATE)
                wf.writeframes(b''.join(frames))

            # Test against enrolled samples
            print("Comparing test sample against enroll samples....")
            emb = forward_online(model, c.ONLINE_WAV_FILE, c.MAX_SEC_TEST)
            row = [c.ONLINE_CONDITION, c.ONLINE_SPEAKER]
            # Start from infinity: distances are not bounded by 1, so a
            # finite start value could leave no speaker selected.
            min_dist, min_spk = float('inf'), None
            for spk, enrolled_emb in zip(enroll_result['speaker'],
                                         enroll_result['embedding']):
                dist = distance(emb, enrolled_emb)
                if dist < min_dist:
                    min_dist, min_spk = dist, spk
                row.append(str(dist))
                print("Distance with speaker [{}]:\t{}".format(spk, dist))
            print("-----> {}".format(min_spk))

            correct = int(min_spk == c.ONLINE_SPEAKER)
            row.extend([str(min_spk), str(correct)])
            with open(c.ONLINE_RESULT_FILE, 'a') as f:
                f.write(",".join(row) + "\n")
    finally:
        # Reached only when the loop is left through an exception.
        p.terminate()
Пример #14
0
 def load_model(self):
     """Create a VGGVox model, load ``data/model/weights.h5`` and return it."""
     net = vggvox_model()
     net.load_weights("data/model/weights.h5")
     return net