Example #1
0
def start_training(working_dir, pre_training_phase=True):
    """Train the speaker model in one of two phases.

    With ``pre_training_phase=True``, pre-train with a softmax
    classification head over the training speakers, resuming from the
    best softmax checkpoint if one exists.  With ``False``, train with
    the triplet loss, warm-starting from the best triplet checkpoint or,
    failing that, from the softmax pre-training checkpoint (matched by
    layer name, since the softmax head is absent in the triplet model).

    Args:
        working_dir: directory holding the pre-processed Keras inputs
            (``kx_train``/``ky_train``/... via ``KerasFormatConverter``).
        pre_training_phase: selects softmax pre-training vs. triplet loss.
    """
    import os  # local import: only needed for portable checkpoint-name parsing

    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape,
                               include_softmax=True,
                               num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            # Checkpoint files end in '_<epoch>.<ext>'; recover the epoch so
            # training resumes where it stopped.  os.path.basename handles both
            # '/' and '\\' separators (the previous split('/') broke on Windows).
            stem = os.path.basename(pre_training_checkpoint).split('.')[0]
            initial_epoch = int(stem.split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(
                f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm,
                          kc.kx_train,
                          kc.ky_train,
                          kc.kx_test,
                          kc.ky_test,
                          initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(
                f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
def test(working_dir, checkpoint_file=None):
    """Evaluate the model on the test set and log the metrics.

    Args:
        working_dir: directory holding the evaluation data.
        checkpoint_file: explicit weights file to load; when ``None``,
            the best checkpoint in ``CHECKPOINTS_TRIPLET_DIR`` is used.

    Exits the process with status 1 if no checkpoint can be found.
    """
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape)
    if checkpoint_file is None:
        checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    if checkpoint_file is not None:
        logger.info(
            f'Found checkpoint [{checkpoint_file}]. Loading weights...')
        dsm.m.load_weights(checkpoint_file, by_name=True)
    else:
        # Bug fix: checkpoint_file is None on this branch, so the old message
        # printed 'in None' — report the directory that was searched instead.
        logger.info(f'Could not find any checkpoint in {CHECKPOINTS_TRIPLET_DIR}.')
        exit(1)

    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
                                   size=1)
        return self.kx_test[indices]

    def get_speaker_verification_data(self, positive_speaker,
                                      num_different_speakers):
        """Assemble a verification batch for one speaker.

        Stacks one anchor utterance and one positive utterance from
        ``positive_speaker``, followed by one utterance from each of
        ``num_different_speakers`` distinct other speakers.
        """
        candidates = list(set(self.speakers_list) - {positive_speaker})
        # Removing exactly the positive speaker leaves all the others.
        assert len(self.speakers_list) - 1 == len(candidates)
        chosen_negatives = np.random.choice(candidates,
                                            size=num_different_speakers,
                                            replace=False)
        assert positive_speaker not in chosen_negatives
        samples = [
            self._select_speaker_data(positive_speaker),  # anchor
            self._select_speaker_data(positive_speaker),  # positive
        ]
        samples.extend(self._select_speaker_data(neg) for neg in chosen_negatives)
        return np.vstack(samples)


if __name__ == '__main__':
    # Fixed seed so the sampled batches are reproducible across runs.
    np.random.seed(123)
    batcher = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker/',
                                 max_length=NUM_FRAMES,
                                 model=DeepSpeakerModel())
    # Time 1000 training-batch draws, printing step index and elapsed seconds.
    for step in range(1000):
        print(step)
        t0 = time()
        batcher.get_batch_train(batch_size=9)
        print(time() - t0)
        # batcher.get_batch(batch_size=96)
Example #4
0
    def record(self):
        """Enroll known speakers, record a microphone utterance, and identify
        the closest enrolled speaker by cosine similarity.

        Side effects: records audio until ``self.pause_flag`` becomes True,
        writes the recording to a timestamped wav under ``test/``, re-enables
        the record button, and updates the speaker/score UI labels.
        """
        # --- Compute one centroid embedding per enrolled speaker. ---
        dict_spkid_embeddings = {}

        model = DeepSpeakerModel()
        # Alternative weights: 'weights/ResCNN_triplet_training_checkpoint_265.h5'
        model.m.load_weights(
            'weights/ResCNN_softmax_pre_training_checkpoint_102.h5',
            by_name=True)
        # Enrolled speaker ids; enrollment wavs are named '<speaker>_*.wav'
        # under enroll_wav_path.
        speakers = ['lms', 'zq', 'wry', 'lzh']

        for speaker in speakers:
            # Use at most 6 enrollment utterances per speaker.
            speaker_wavs = glob.glob(
                os.path.join(enroll_wav_path, speaker + '_*.wav'))[:6]
            speaker_embeddings = []
            for wav in speaker_wavs:
                mfcc_feat = sample_from_mfcc(read_mfcc(wav, SAMPLE_RATE),
                                             NUM_FRAMES)
                speaker_embeddings.append(
                    model.m.predict(np.expand_dims(mfcc_feat, axis=0)))
            dict_spkid_embeddings[speaker] = get_centroid(
                speaker_embeddings, len(speaker_wavs))

        # --- Record from the sound card until the user stops. ---
        self.pause_flag = False
        self.pa = PyAudio()
        # 16-bit samples, mono, 16 kHz, 3200-frame buffer.
        stream = self.pa.open(format=paInt16,
                              channels=1,
                              rate=16000,
                              input=True,
                              frames_per_buffer=3200)
        record_buf = []  # raw byte chunks read from the sound card
        while True:
            if self.pause_flag:  # set elsewhere (UI) to stop recording
                break
            record_buf.append(stream.read(3200))
        my_path = 'test/' + strftime("%Y%m%d%H%M%S", localtime(
            time())) + '.wav'
        wf = wave.open(my_path, 'wb')
        wf.setnchannels(1)  # mono (matches the capture stream)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(16000)
        # b''.join replaces the previous ''.encode().join — same bytes, clearer.
        wf.writeframes(b''.join(record_buf))
        wf.close()
        stream.stop_stream()
        stream.close()
        self.pa.terminate()
        self.pa = None
        self.record_pushButton.setEnabled(True)

        # --- Embed the recording and score it against each centroid. ---
        mfcc_feat = sample_from_mfcc(read_mfcc(my_path, SAMPLE_RATE),
                                     NUM_FRAMES)
        output_feat = model.m.predict(np.expand_dims(mfcc_feat, axis=0))

        score = 0
        name = 'Who?'  # fallback when no speaker scores above 0
        for speaker_name, centroid in dict_spkid_embeddings.items():
            score_speaker = batch_cosine_similarity(centroid, output_feat)
            print('speaker: ', speaker_name, 'score: ', score_speaker)
            if score_speaker > score:
                score = score_speaker
                name = speaker_name
        print('speaker: ', name)
        print('score: ', score)

        self.speaker_label.setText('Speaker: %s' % name)
        self.score_label.setText('Score: %.4f' % score)