# Assumed setup for this example (deep-speaker repo modules, as in its README):
import glob
import os

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

model = DeepSpeakerModel()
model.m.load_weights('weights/ResCNN_triplet_training_checkpoint_265.h5',
                     by_name=True)


def get_centroid(embeddings, num_utterances):
    # Assumed helper: average the enrollment embeddings into one centroid.
    return np.sum(np.vstack(embeddings), axis=0,
                  keepdims=True) / num_utterances


speakers = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq']
enroll_wav_path = 'media'
dict_spkid_embeddings = {}
enroll_nums = 5
total_wavs = glob.glob(os.path.join(enroll_wav_path, '*.wav'))
print('total wavs: ', len(total_wavs))
for speaker in speakers:
    #print(speaker)
    speaker_wavs = glob.glob(os.path.join(enroll_wav_path,
                                          speaker + '_*.wav'))[:enroll_nums]
    #print(speaker, speaker_wavs)
    speaker_embeddings = []
    for wav in speaker_wavs:
        mfcc = sample_from_mfcc(read_mfcc(wav, SAMPLE_RATE), NUM_FRAMES)
        predict_feat = model.m.predict(np.expand_dims(mfcc, axis=0))
        speaker_embeddings.append(predict_feat)
    num_utterances = len(speaker_wavs)
    enroll_centroid_embeddings = get_centroid(speaker_embeddings,
                                              num_utterances)
    dict_spkid_embeddings[speaker] = enroll_centroid_embeddings
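
# Sanity check (optional): each enrolled centroid is a single embedding row.
for spk, emb in dict_spkid_embeddings.items():
    print(spk, np.asarray(emb).shape)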

a = 0      # correctly identified test utterances
total = 0  # total test utterances
score = 0
name = 'Who?'
test_path = 'media'

for speaker in speakers:
    # Hold out utterances from index 6 onward for testing; the first
    # enroll_nums were used for enrollment above.
    test_wavs = glob.glob(os.path.join(test_path, speaker + '_*.wav'))[6:]
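    # The original example is cut off here. A minimal sketch of the likely
    # continuation, assuming batch_cosine_similarity as used in deep-speaker's
    # README: score each held-out wav against every enrolled centroid and
    # count how often the best-scoring speaker matches the true label.
    for wav in test_wavs:
        mfcc = sample_from_mfcc(read_mfcc(wav, SAMPLE_RATE), NUM_FRAMES)
        test_feat = model.m.predict(np.expand_dims(mfcc, axis=0))
        score = 0
        name = 'Who?'
        for spk, centroid in dict_spkid_embeddings.items():
            s = batch_cosine_similarity(centroid, test_feat)[0]
            if s > score:
                score = s
                name = spk
        total += 1
        if name == speaker:
            a += 1

print('accuracy: {}/{} = {:.4f}'.format(a, total, a / total))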
Example #2
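
# Assumed module-level context for this GUI example (not shown in the excerpt):
# the imports below, an enroll_wav_path, and the same get_centroid helper as in
# Example #1 (mean of the enrollment embeddings). The record() method below is
# assumed to live in a Qt widget class that owns record_pushButton,
# speaker_label, and score_label.
import glob
import os
import wave
from time import strftime, localtime, time

import numpy as np
from pyaudio import PyAudio, paInt16

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

enroll_wav_path = 'media'  # assumed value; the excerpt does not show it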
    def record(self):
        #### compute the enrollment centroid embedding for each speaker ####
        dict_spkid_embeddings = {}

        model = DeepSpeakerModel()
        #model.m.load_weights('weights/ResCNN_triplet_training_checkpoint_265.h5', by_name=True)
        model.m.load_weights(
            'weights/ResCNN_softmax_pre_training_checkpoint_102.h5',
            by_name=True)
        # speakers to enroll
        speakers = ['lms', 'zq', 'wry', 'lzh']

        for speaker in speakers:
            #print(speaker)
            speaker_wavs = glob.glob(
                os.path.join(enroll_wav_path, speaker + '_*.wav'))[:6]
            #print(speaker_wavs)
            speaker_embeddings = []
            for wav in speaker_wavs:
                mfcc_feat = sample_from_mfcc(read_mfcc(wav, SAMPLE_RATE),
                                             NUM_FRAMES)
                # Keep the MFCC features and the model output separate.
                embedding = model.m.predict(np.expand_dims(mfcc_feat, axis=0))
                speaker_embeddings.append(embedding)
            num_utterances = len(speaker_wavs)
            enroll_centroid_embeddings = get_centroid(speaker_embeddings,
                                                      num_utterances)
            dict_spkid_embeddings[speaker] = enroll_centroid_embeddings

        # Minimum cosine similarity required to accept a match.
        thres = 0.10

        self.pause_flag = False
        #self.record_pushButton.setText('Stop')
        # Create the PyAudio object.
        self.pa = PyAudio()
        # Open the sound card for input: 16-bit samples, mono, 16 kHz,
        # 3200 frames per buffer (0.2 s of audio).
        stream = self.pa.open(format=paInt16,
                              channels=1,
                              rate=16000,
                              input=True,
                              frames_per_buffer=3200)
        # Buffer for the recorded audio chunks.
        record_buf = []
        while not self.pause_flag:
            audio_data = stream.read(3200)  # read a chunk from the sound card
            record_buf.append(audio_data)   # append it to record_buf
        my_path = 'test/' + strftime("%Y%m%d%H%M%S", localtime(
            time())) + '.wav'
        wf = wave.open(my_path, 'wb')  # create the output audio file
        wf.setnchannels(1)      # mono
        wf.setsampwidth(2)      # 16-bit samples (2 bytes)
        wf.setframerate(16000)  # 16 kHz sample rate
        # Write the recorded data, then close the file.
        wf.writeframes(b''.join(record_buf))
        wf.close()
        # Stop and close the input stream, then release PyAudio.
        stream.stop_stream()
        stream.close()
        self.pa.terminate()
        self.pa = None
        #self.record_pushButton.setText('Record')
        self.record_pushButton.setEnabled(True)

        test_wav = my_path
        mfcc_feat = sample_from_mfcc(read_mfcc(test_wav, SAMPLE_RATE),
                                     NUM_FRAMES)
        output_feat = model.m.predict(np.expand_dims(mfcc_feat, axis=0))

        score = 0
        name = 'Who?'
        for speaker_name, enroll_embedding in dict_spkid_embeddings.items():
            # batch_cosine_similarity returns one score per pair in the batch;
            # with a single utterance, take the first (only) entry.
            score_speaker = batch_cosine_similarity(enroll_embedding,
                                                    output_feat)[0]
            print('speaker: ', speaker_name, 'score: ', score_speaker)
            if score_speaker > score:
                score = score_speaker
                name = speaker_name
        # Reject the best match as unknown if it falls below the threshold.
        if score < thres:
            name = 'Who?'
        print('speaker: ', name)
        print('score: ', score)

        self.speaker_label.setText('Speaker: %s' % name)
        self.score_label.setText('Score: %.4f' % score)
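
    def stop(self):
        # Assumed companion slot (not in the original excerpt): the UI's Stop
        # button sets pause_flag, which ends the recording loop in record().
        self.pause_flag = True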
Example #3

# Assumed imports (deep-speaker repo modules, as in its README example):
import glob
import os
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel

random.seed(123)

# Define the model here.
model = DeepSpeakerModel()

# Load the checkpoint.
model.m.load_weights('weights/ResCNN_triplet_training_checkpoint_265.h5',
                     by_name=True)

data = 'data_eng'

#labels = ['cl', 'fuli', 'gongwenhua', 'liuyuguang', 'lms', 'lsq', 'lxx', 'lzh', 'shanke', 'wry', 'zhangshuai163', 'zhuting', 'zlb', 'zq']
labels = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq']
for speaker in labels:
    wavs = glob.glob(os.path.join(data, speaker, '*.wav'))
    print('{}: {} wavs'.format(speaker, len(wavs)))

if __name__ == '__main__':
    print('Model loaded successfully.')

    inputs_wavs = glob.glob(os.path.join(data, '*', '*.wav'))
    print(len(inputs_wavs))

    for wav in inputs_wavs:
        mfcc = sample_from_mfcc(read_mfcc(wav, SAMPLE_RATE), NUM_FRAMES)
        embedding = model.m.predict(np.expand_dims(mfcc, axis=0))
        # Cache each embedding next to its wav file.
        output_path = wav[:-4] + '.npy'
        np.save(output_path, embedding)

def get_centroid(embeddings, num_utterances):
    # Assumed reconstruction of a truncated helper (only its return statement
    # survived): average the enrollment embeddings into a single centroid.
    centroid = np.sum(np.vstack(embeddings), axis=0,
                      keepdims=True) / num_utterances
    return centroid


#speakers = ['cl', 'fuli', 'gongwenhua', 'liuyuguang', 'lms', 'lsq', 'lxx', 'lzh', 'shanke', 'wry', 'zhangshuai163', 'zhuting', 'zlb', 'zq', 'yuyaqi']
speakers = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq']
enroll_wav_path = 'data_eng'
dict_spkid_embeddings = {}
enroll_nums = 5
total_wavs = glob.glob(os.path.join(data, '*', '*.wav'))
print('total wavs: ', len(total_wavs))
for speaker in speakers:
    #print(speaker)
    speaker_wavs = glob.glob(os.path.join(enroll_wav_path, speaker, '*.wav'))
    length = len(speaker_wavs)
    speaker_embeddings = np.zeros((length, 512), dtype=float)
    print(speaker, length)
    #speaker_embeddings = []
    for i in range(length):
        #for wav in speaker_wavs:
        mfcc = sample_from_mfcc(read_mfcc(speaker_wavs[i], SAMPLE_RATE),
                                NUM_FRAMES)
        predict_feat = model.m.predict(np.expand_dims(mfcc, axis=0))
        #speaker_embeddings.append(predict_feat)
        speaker_embeddings[i] = predict_feat
    dict_spkid_embeddings[speaker] = speaker_embeddings
    # num_utterances = len(speaker_wavs)
    # enroll_centroid_embeddings = get_centroid(speaker_embeddings, num_utterances)
    # dict_spkid_embeddings[speaker] = enroll_centroid_embeddings
    # print(speaker, enroll_centroid_embeddings.shape)

np.save('enroll_9.npy', dict_spkid_embeddings)
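
# Quick check that the enrollment file loads back: np.save pickles the dict,
# so np.load needs allow_pickle=True, and .item() unwraps the 0-d object array.
loaded = np.load('enroll_9.npy', allow_pickle=True).item()
for spk, emb in loaded.items():
    print(spk, emb.shape)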