def start_training(working_dir):
    """Train a two-headed (speaker + gender) model on top of the Deep Speaker network.

    The base network is built without its softmax head; a shared dense layer
    feeds a speaker-classification head (softmax over the known speakers) and
    a gender head (single sigmoid unit). If ``load_best_checkpoint`` finds a
    checkpoint in ``CHECKPOINTS_MTL_DIR``, its weights are loaded and training
    resumes from the epoch encoded in the file name.

    Args:
        working_dir: Directory handed to ``KerasFormatConverter`` to obtain
            the training/test inputs and labels.
    """
    # Local import: only needed to extract the checkpoint file name portably.
    import os

    ensures_dir(CHECKPOINTS_MTL_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    logger.info('Started training.')

    kc = KerasFormatConverter(working_dir)
    num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
    logger.info(f'categorical_speakers: {kc.categorical_speakers.speaker_ids}')

    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False,
                           num_speakers_softmax=num_speakers_softmax)
    base_model = dsm.m

    # Shared representation used by both task heads.
    x = base_model.output
    x = Dense(1024, name='shared')(x)

    # Speaker head: one extra task-specific layer before the softmax.
    y = Dense(1024, name='speaker_task')(x)
    speaker_out = Dense(num_speakers_softmax, activation='softmax', name='speaker_pred')(y)

    # Gender head: attached directly to the shared layer.
    gender_out = Dense(1, activation='sigmoid', name='gender_pred')(x)

    model = Model(inputs=base_model.input, outputs=[speaker_out, gender_out])
    # Losses are listed in the same order as the model outputs above.
    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
                  metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'})

    training_checkpoint = load_best_checkpoint(CHECKPOINTS_MTL_DIR)
    if training_checkpoint is not None:
        # The epoch is the last '_'-separated token of the file name, before
        # the first '.'. basename() copes with either path separator.
        checkpoint_name = os.path.basename(training_checkpoint)
        initial_epoch = int(checkpoint_name.split('.')[0].split('_')[-1])
        logger.info(f'Initial epoch is {initial_epoch}.')
        logger.info(f'Loading MTL checkpoint: {training_checkpoint}.')
        model.load_weights(training_checkpoint)  # latest one.
    else:
        initial_epoch = 0

    # kg_* are presumably the gender labels matching gender_pred -- TODO confirm.
    fit_model_mtl(model, kc.kx_train, kc.ky_train, kc.kg_train,
                  kc.kx_test, kc.ky_test, kc.kg_test,
                  initial_epoch=initial_epoch)
def start_training(working_dir, pre_training_phase=True):
    """Run either the softmax pre-training phase or the triplet-loss phase.

    Args:
        working_dir: Directory handed to ``KerasFormatConverter`` (softmax
            phase) or to ``fit_model`` (triplet phase).
        pre_training_phase: When true, train the model with its softmax head,
            resuming from the best checkpoint in ``CHECKPOINTS_SOFTMAX_DIR``
            if one exists. When false, train with ``deep_speaker_loss``,
            starting from the best triplet checkpoint if present, otherwise
            from the best softmax checkpoint.
    """
    # Local import: only needed to extract the checkpoint file name portably.
    import os

    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]

    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True,
                               num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            # The epoch is the last '_'-separated token of the file name,
            # before the first '.'. basename() copes with either separator.
            checkpoint_name = os.path.basename(pre_training_checkpoint)
            initial_epoch = int(checkpoint_name.split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test,
                          initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            # A triplet checkpoint takes precedence over the softmax one.
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
def build_keras_inputs(working_dir, counts_per_speaker):
    """Generate the Keras training/test inputs and persist them to disk.

    Args:
        working_dir: Directory handed to ``KerasFormatConverter``.
        counts_per_speaker: Comma-separated integers, e.g. ``'600,100'``.
            With ``--counts_per_speaker 600,100`` each speaker gets 600
            training samples and 100 test samples. One sample is 160 frames
            by default (roughly 1.6 seconds).

    Raises:
        ValueError: If any comma-separated item is not an integer.
    """
    # Parse first so a malformed argument fails before any work is done.
    parsed_counts = list(map(int, counts_per_speaker.split(',')))

    converter = KerasFormatConverter(working_dir)
    converter.generate(max_length=NUM_FRAMES, counts_per_speaker=parsed_counts)
    converter.persist_to_disk()
def main():
    """Continuously evaluate the triplet loss on training batches and print its running mean.

    Passing any command-line argument selects
    ``TripletBatcherSelectHardNegatives``; with no argument the plain
    ``TripletBatcher`` is used. Weights come from the best triplet checkpoint
    if one exists, otherwise from the best softmax checkpoint (loaded by
    layer name). The evaluation loop never terminates on its own.
    """
    # Explicit length check instead of probing sys.argv[1] under a broad except.
    select = len(sys.argv) > 1
    print('select', select)

    working_dir = '/media/philippe/8TB/deep-speaker'

    # By construction, these losses should be much higher than the normal
    # losses: batches are selected that way.
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    print('Testing with the triplet losses.')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
    triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
    if triplet_checkpoint is not None:
        print(f'Loading triplet checkpoint: {triplet_checkpoint}.')
        dsm.m.load_weights(triplet_checkpoint)
    elif pre_training_checkpoint is not None:
        print(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
        # If `by_name` is True, weights are loaded into layers only if they share the
        # same name. This is useful for fine-tuning or transfer-learning models where
        # some of the layers have changed.
        dsm.m.load_weights(pre_training_checkpoint, by_name=True)
    dsm.m.compile(optimizer='adam', loss=deep_speaker_loss)

    kc = KerasFormatConverter(working_dir)
    if select:
        print('TripletBatcherSelectHardNegatives()')
        batcher = TripletBatcherSelectHardNegatives(kc.kx_train, kc.ky_train,
                                                    kc.kx_test, kc.ky_test, dsm)
    else:
        print('TripletBatcher()')
        batcher = TripletBatcher(kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test)

    batch_size = BATCH_SIZE
    # NOTE: this list grows for as long as the loop runs; it is kept so the
    # printed mean is computed exactly as before.
    losses = []
    while True:
        _bx, _by = batcher.get_batch(batch_size, is_test=False)
        losses.append(dsm.m.evaluate(_bx, _by, verbose=0, batch_size=batch_size))
        print(np.mean(losses))
def main(args):
    """Run the pipeline stages that are switched on in ``args``.

    Stages run in this order: audio preprocessing, Keras input generation,
    embedding training (optional softmax pre-training followed by triplet
    training), and classifier training.

    Note that when ``args.preprocess`` is set but ``args.audio_dir`` is
    ``None`` the function returns immediately, so no later stage runs.

    Args:
        args: Object exposing the attributes read below (``working_dir``,
            ``preprocess``, ``audio_dir``, ``sample_rate``,
            ``build_keras_inputs``, ``counts_per_speaker``,
            ``train_embedding``, ``pre_training_phase``, ``train_classifier``
            and the ``epochs_*`` settings).
    """
    work_dir = args.working_dir
    ensures_dir(work_dir)

    # Stage 1: build the audio cache.
    if args.preprocess:
        source_dir = args.audio_dir
        if source_dir is None:
            return
        Audio(cache_dir=work_dir, audio_dir=source_dir, sample_rate=args.sample_rate)

    # Stage 2: generate the Keras inputs and persist them.
    if args.build_keras_inputs:
        sample_counts = list(map(int, args.counts_per_speaker.split(',')))
        converter = KerasFormatConverter(work_dir)
        converter.generate(max_length=NUM_FRAMES, counts_per_speaker=sample_counts)
        converter.persist_to_disk()

    # Stage 3: embedding training; the triplet phase always follows the
    # optional softmax pre-training.
    if args.train_embedding:
        pretrain_flag = args.pre_training_phase
        if pretrain_flag:
            start_training(work_dir, pre_training_phase=pretrain_flag,
                           epochs=args.epochs_pretrain)
        start_training(work_dir, pre_training_phase=False,
                       epochs=args.epochs_triplet)

    # Stage 4: classifier training.
    if args.train_classifier:
        start_training(work_dir, pre_training_phase=False, classify=True,
                       epochs=args.epochs_classifier)
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
from batcher import KerasFormatConverter

# The converter is rooted at the current directory; its speaker list sizes
# the classifier output.
kc = KerasFormatConverter('./')
num_speakers = len(kc.categorical_speakers.speaker_ids)

# Define the model here.
model = DeepSpeakerModel(include_softmax=False,
                         include_classifier=True,
                         num_speakers_softmax=num_speakers)

# Load the checkpoint.
checkpoint_path = 'checkpoints-classify/ResCNN_checkpoint_1.h5'
model.m.load_weights(checkpoint_path)

# Read one recording and cut a NUM_FRAMES-long sample from it.
sample_path = 'samples/train/0/0/0-0-Recording (12).m4a'
full_mfcc = read_mfcc(sample_path, SAMPLE_RATE)
mfcc_001 = sample_from_mfcc(full_mfcc, NUM_FRAMES)

# predict() is given a batch, so add a leading axis of size one.
single_batch = np.expand_dims(mfcc_001, axis=0)
predict_001 = model.m.predict(single_batch)

# Index of the highest-scoring entry for the single input.
print(np.argmax(predict_001[0]))
def build_keras_inputs(working_dir, counts_per_speaker):
    """Generate the Keras inputs for ``working_dir`` and write them to disk.

    Args:
        working_dir: Directory handed to ``KerasFormatConverter``.
        counts_per_speaker: Comma-separated integers (for example
            ``'600,100'``); the parsed list is forwarded to
            ``KerasFormatConverter.generate``.

    Raises:
        ValueError: If any comma-separated item is not an integer.
    """
    # Split and convert up front so bad input fails before the converter is built.
    raw_items = counts_per_speaker.split(',')
    counts = [int(item) for item in raw_items]

    converter = KerasFormatConverter(working_dir)
    converter.generate(max_length=NUM_FRAMES, counts_per_speaker=counts)
    converter.persist_to_disk()