def start_training(working_dir):
    """Train a two-headed (speaker + gender) model on top of DeepSpeakerModel.

    The DeepSpeakerModel output feeds a shared Dense layer, which branches
    into a speaker softmax head ('speaker_pred') and a gender sigmoid head
    ('gender_pred'). If ``load_best_checkpoint`` finds a checkpoint in
    CHECKPOINTS_MTL_DIR, its weights are loaded and training resumes from the
    epoch encoded in the file name; otherwise training starts at epoch 0.

    Args:
        working_dir: Directory handed to ``KerasFormatConverter``; it supplies
            the train/test arrays and the list of speaker ids.
    """
    import os  # local import: only used to parse the checkpoint file name.

    ensures_dir(CHECKPOINTS_MTL_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    logger.info('Started training.')
    kc = KerasFormatConverter(working_dir)

    num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
    logger.info(f'categorical_speakers: {kc.categorical_speakers.speaker_ids}')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False, num_speakers_softmax=num_speakers_softmax)
    base_model = dsm.m

    # Shared layer on top of the base model; both task heads branch off it.
    x = base_model.output
    x = Dense(1024, name='shared')(x)
    # The speaker head gets one extra task-specific layer; the gender head
    # reads the shared layer directly.
    y = Dense(1024, name='speaker_task')(x)
    speaker_out = Dense(num_speakers_softmax, activation='softmax', name='speaker_pred')(y)
    gender_out = Dense(1, activation='sigmoid', name='gender_pred')(x)
    model = Model(inputs=base_model.input, outputs=[speaker_out, gender_out])

    # The loss list is positional: it follows the order of `outputs` above
    # (speaker first, gender second).
    model.compile(
        optimizer='adam',
        loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
        metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'},
    )

    training_checkpoint = load_best_checkpoint(CHECKPOINTS_MTL_DIR)
    if training_checkpoint is not None:
        # The file name is expected to look like '<prefix>_<epoch>.<ext>'.
        # basename() isolates it regardless of the path separator, so dots
        # or underscores in parent directories cannot corrupt the parse.
        checkpoint_name = os.path.basename(training_checkpoint)
        initial_epoch = int(checkpoint_name.split('.')[0].split('_')[-1])
        logger.info(f'Initial epoch is {initial_epoch}.')
        logger.info(f'Loading softmax checkpoint: {training_checkpoint}.')
        model.load_weights(training_checkpoint)  # latest one.
    else:
        initial_epoch = 0
    fit_model_mtl(model, kc.kx_train, kc.ky_train, kc.kg_train, kc.kx_test, kc.ky_test, kc.kg_test,
                  initial_epoch=initial_epoch)
示例#2
0
def start_training(working_dir, pre_training_phase=True):
    """Run one training phase of the speaker model.

    Args:
        working_dir: Directory holding the prepared training data.
        pre_training_phase: When True, train the softmax model, resuming from
            the best checkpoint in CHECKPOINTS_SOFTMAX_DIR if one exists.
            When False, train with the triplet loss, initialising from the
            best triplet checkpoint or, failing that, from the best softmax
            checkpoint (matched by layer name).
    """
    import os  # local import: only used to parse the checkpoint file name.

    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            # The file name is expected to look like '<prefix>_<epoch>.<ext>'.
            # basename() isolates it regardless of the path separator, so dots
            # or underscores in parent directories cannot corrupt the parse.
            checkpoint_name = os.path.basename(pre_training_checkpoint)
            initial_epoch = int(checkpoint_name.split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        # A triplet checkpoint takes precedence over a softmax one; with
        # neither, the model trains from its freshly constructed weights.
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
def build_keras_inputs(working_dir, counts_per_speaker):
    """Generate the Keras-format inputs for ``working_dir`` and save them to disk.

    ``counts_per_speaker`` is a comma-separated string of integers. Passing
    ``--counts_per_speaker 600,100`` means that, for each speaker, 600 samples
    are generated for training and 100 for testing. One sample is 160 frames
    by default (roughly 1.6 seconds).
    """
    sample_counts = list(map(int, counts_per_speaker.split(',')))
    converter = KerasFormatConverter(working_dir)
    converter.generate(max_length=NUM_FRAMES, counts_per_speaker=sample_counts)
    converter.persist_to_disk()
示例#4
0
def main():
    """Continuously evaluate the triplet loss on training batches.

    Passing any command-line argument selects
    ``TripletBatcherSelectHardNegatives``; with no argument the plain
    ``TripletBatcher`` is used. The model is initialised from the best
    triplet checkpoint, or else from the best softmax checkpoint (matched by
    layer name). The function then loops forever, printing the running mean
    of the per-batch loss; stop it manually.
    """
    # Presence of a first CLI argument is the switch. A plain length check
    # replaces indexing sys.argv inside a broad `except Exception`.
    select = len(sys.argv) > 1
    print('select', select)

    working_dir = '/media/philippe/8TB/deep-speaker'
    # by construction this  losses should be much higher than the normal losses.
    # we select batches this way.
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    print('Testing with the triplet losses.')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
    triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
    if triplet_checkpoint is not None:
        print(f'Loading triplet checkpoint: {triplet_checkpoint}.')
        dsm.m.load_weights(triplet_checkpoint)
    elif pre_training_checkpoint is not None:
        print(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
        # If `by_name` is True, weights are loaded into layers only if they share the
        # same name. This is useful for fine-tuning or transfer-learning models where
        # some of the layers have changed.
        dsm.m.load_weights(pre_training_checkpoint, by_name=True)
    dsm.m.compile(optimizer='adam', loss=deep_speaker_loss)
    kc = KerasFormatConverter(working_dir)
    if select:
        print('TripletBatcherSelectHardNegatives()')
        batcher = TripletBatcherSelectHardNegatives(kc.kx_train, kc.ky_train,
                                                    kc.kx_test, kc.ky_test,
                                                    dsm)
    else:
        print('TripletBatcher()')
        batcher = TripletBatcher(kc.kx_train, kc.ky_train, kc.kx_test,
                                 kc.ky_test)
    batch_size = BATCH_SIZE
    # NOTE: `losses` keeps every batch loss for the lifetime of the loop, so
    # it grows without bound; the printed value is the mean over all batches
    # evaluated so far.
    losses = []
    while True:
        _bx, _by = batcher.get_batch(batch_size, is_test=False)
        losses.append(
            dsm.m.evaluate(_bx, _by, verbose=0, batch_size=BATCH_SIZE))
        print(np.mean(losses))
示例#5
0
def main(args):
    """Run the pipeline stages enabled by the flags on ``args``.

    Stages run in this order, each guarded by its own flag:

    * ``args.preprocess``: construct ``Audio`` with ``args.working_dir`` as
      cache directory, reading from ``args.audio_dir``. When ``audio_dir`` is
      None there is nothing to preprocess and the function returns early,
      skipping the remaining stages.
    * ``args.build_keras_inputs``: generate and persist the Keras inputs;
      ``args.counts_per_speaker`` is a comma-separated string of integers.
    * ``args.train_embedding``: optional pre-training (when
      ``args.pre_training_phase`` is set), always followed by a
      ``pre_training_phase=False`` run.
    * ``args.train_classifier``: train with ``classify=True``.

    Args:
        args: Parsed command-line namespace carrying the attributes above.
    """
    ensures_dir(args.working_dir)

    if args.preprocess:
        if args.audio_dir is None:
            # No source audio directory: abort instead of preprocessing.
            return
        # Previously Audio was only constructed when audio_dir was None, so
        # a supplied audio directory was silently ignored. Audio is built
        # for its side effects only; its result is not used.
        Audio(cache_dir=args.working_dir, audio_dir=args.audio_dir, sample_rate=args.sample_rate)
    if args.build_keras_inputs:
        counts_per_speaker = [int(b) for b in args.counts_per_speaker.split(',')]
        kc = KerasFormatConverter(args.working_dir)
        kc.generate(max_length=NUM_FRAMES, counts_per_speaker=counts_per_speaker)
        kc.persist_to_disk()

    if args.train_embedding:
        if args.pre_training_phase:
            start_training(args.working_dir, pre_training_phase=args.pre_training_phase, epochs=args.epochs_pretrain)
        # This second run always happens, whether or not pre-training ran.
        start_training(args.working_dir, pre_training_phase=False, epochs=args.epochs_triplet)
    if args.train_classifier:
        start_training(args.working_dir, pre_training_phase=False, classify=True, epochs=args.epochs_classifier)
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
from batcher import KerasFormatConverter

# The speaker list read from the current directory sizes the model output.
kc = KerasFormatConverter('./')
num_speakers = len(kc.categorical_speakers.speaker_ids)

# Define the model here.
model = DeepSpeakerModel(
    include_softmax=False,
    include_classifier=True,
    num_speakers_softmax=num_speakers,
)

# Load the checkpoint.
checkpoint_path = 'checkpoints-classify/ResCNN_checkpoint_1.h5'
model.m.load_weights(checkpoint_path)

# Read one recording, cut it to NUM_FRAMES, and predict on it.
sample_path = 'samples/train/0/0/0-0-Recording (12).m4a'
mfcc_001 = sample_from_mfcc(read_mfcc(sample_path, SAMPLE_RATE), NUM_FRAMES)
# expand_dims adds a leading axis so the single sample is passed as a batch.
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
# Print the index of the largest value in the first prediction row.
print(np.argmax(predict_001[0]))
示例#7
0
def build_keras_inputs(working_dir, counts_per_speaker):
    """Build the Keras inputs for ``working_dir`` and persist them to disk.

    Args:
        working_dir: Directory passed to ``KerasFormatConverter``.
        counts_per_speaker: Comma-separated string of integers (for example
            ``'600,100'``); it is parsed into a list of ints and forwarded
            to ``KerasFormatConverter.generate``.
    """
    parsed_counts = list(map(int, counts_per_speaker.split(',')))
    converter = KerasFormatConverter(working_dir)
    converter.generate(max_length=NUM_FRAMES, counts_per_speaker=parsed_counts)
    converter.persist_to_disk()