Example #1
def get_predictions(index, partition, input_to_softmax, model_path, spectrogram_features=True):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights

    return the predicted probability matrix (in a 2D matrix) and the ground truth
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=spectrogram_features)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    # output_length would be needed for CTC decoding; this variant leaves decoding to the caller
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    return (prediction[0], transcr, audio_path)
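Example #1 stops short of decoding: it returns the raw softmax matrix (and computes, but does not return, the corresponding CTC output length). For reference, a minimal greedy-decoding sketch is shown below; it reuses the helpers the other snippets on this page assume (int_sequence_to_text from the project's utils, the Keras backend K), so treat it as an illustration rather than part of the original snippet.

from keras import backend as K
import numpy as np
from utils import int_sequence_to_text

def decode_prediction(prediction, output_length):
    """ Greedy CTC decode of a single (time_steps, num_characters) probability matrix """
    decoded = K.ctc_decode(np.expand_dims(prediction, axis=0), [output_length])[0][0]
    # +1 compensates for the index offset int_sequence_to_text expects in these examples
    pred_ints = (K.eval(decoded) + 1).flatten().tolist()
    return ''.join(int_sequence_to_text(pred_ints))

# hypothetical usage with the values returned by get_predictions above:
# probs, transcr, audio_path = get_predictions(0, 'validation', model, 'results/model.h5')
# print('Predicted:', decode_prediction(probs, probs.shape[0]))
# print('True:     ', transcr)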
Example #2
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):

    # create a generator for batches of training and validation data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, spectrogram=spectrogram, mfcc_dim=mfcc_dim,
        max_duration=max_duration, sort_by_duration=sort_by_duration)
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch and validation_steps
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the network and compile with a dummy loss
    # (the real CTC loss is computed inside the model returned by add_ctc_loss)
    model = add_ctc_loss(input_to_softmax)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # checkpoint the model after every epoch, then train
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path, verbose=0)
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[checkpointer], verbose=verbose)

    # save the training history (loss curves)
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
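Every train_model variant in this collection calls an add_ctc_loss helper that is not reproduced on this page. The sketch below shows what such a helper typically looks like in this kind of Keras/CTC setup; the layer names and the ctc_lambda_func wrapper are assumptions modelled on the common wiring these snippets follow, not code taken from this page. It also explains why the models compile with the dummy loss {'ctc': lambda y_true, y_pred: y_pred}: the real CTC loss is already produced inside the model by the Lambda layer named 'ctc'.

from keras import backend as K
from keras.layers import Input, Lambda
from keras.models import Model

def ctc_lambda_func(args):
    # wraps Keras' built-in batched CTC loss
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

def add_ctc_loss(input_to_softmax):
    # extra inputs carrying the labels and the sequence lengths
    the_labels = Input(name='the_labels', shape=(None,), dtype='float32')
    input_lengths = Input(name='input_length', shape=(1,), dtype='int64')
    label_lengths = Input(name='label_length', shape=(1,), dtype='int64')
    # map input lengths to output lengths with the model's output_length attribute
    output_lengths = Lambda(input_to_softmax.output_length)(input_lengths)
    # the CTC loss is computed by this Lambda layer, hence the dummy loss at compile time
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
        [input_to_softmax.output, the_labels, output_lengths, label_lengths])
    return Model(
        inputs=[input_to_softmax.input, the_labels, input_lengths, label_lengths],
        outputs=loss_out)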
Example #3
def get_predictions(index, partition, input_to_softmax, model_path):
    '''
    Get the model's decoded predictions to calculate metrics
    '''
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'valid':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "valid"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    label = transcr
    predicted = ''.join(int_sequence_to_text(pred_ints))

    return label, predicted
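The label/prediction pair returned above is meant to be scored with a wer helper (Examples #15, #16 and #21 call one, and Example #18 imports it), which is not shown in this collection. A self-contained word-error-rate sketch based on the standard word-level edit distance could look like this; it is an illustration, not the project's own implementation.

def wer(reference, hypothesis):
    """ Word error rate: word-level Levenshtein distance / number of reference words """
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

# e.g. wer('hello how are you', 'hello how you') == 0.25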
Example #4
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02,
                              decay=1e-6,
                              momentum=0.9,
                              nesterov=True,
                              clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                  optimizer=optimizer)
    # make results directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath=os.path.join(
        'results', save_model_path),
                                   verbose=0)
    # train the model; a negative verbose value selects the TQDM notebook progress bar
    callbacks = [TQDMNotebookCallback(), checkpointer] if verbose < 0 else [checkpointer]
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=callbacks,
                               verbose=verbose)
    # save model loss
    with open(os.path.join('results', pickle_path), 'wb') as f:
        pickle.dump(hist.history, f)
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                # another option for the optimizer
                # optimizer=RMSprop(clipvalue=0.5),
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=1.0, clipvalue=0.5),
                # clipnorm was originally set to 5
                # there are many exploding gradients and clipnorm/clipvalue can help
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=20.0):

    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
                               sort_by_duration=sort_by_duration)

    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)

    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size

    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    keras.backend.get_session().run(tf.global_variables_initializer())

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
                               epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
                               callbacks=[checkpointer], verbose=verbose)

    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example #6
def train_my_model(model, pickle_path, save_model_path,
                   train_json='train_corpus.json',
                   valid_json='valid_corpus.json',
                   minibatch_size=20,
                   spectrogram=True,
                   epochs=20,
                   verbose=1,
                   sort_by_duration=False,
                   max_duration=40.0):
    
    """
        Gabriel Freire: Train my own model
        sample_models.py > own_model(input_dim=161, output_dim=29)
    """    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    print("Num of training examples: {}".format(num_train_examples))
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    print("Num of validation examples: {}".format(num_valid_samples))
    
    # add ctc loss
    model = add_ctc_loss(model)
    # Compile; `ctc` is assumed to be a dummy loss defined elsewhere
    # (equivalent to the lambda y_true, y_pred: y_pred used in the other examples)
    optimizer = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
    # optimizer = Adam(lr=0.02, clipnorm=5, decay=1e-6)
    model.compile(loss=ctc, optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # make tensorboard/ directory, if necessary
    if not os.path.exists('tensorboard'):
        os.makedirs('tensorboard')
    
    # add checkpointer and tensorboard callbacks
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path, verbose=0)
    tensorboard = TensorBoard(log_dir='tensorboard/{}/'.format('cnn_rnn_own_model_events'), write_graph=False, write_images=True)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[checkpointer, tensorboard], verbose=verbose)

    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
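The TensorBoard callback in Example #6 writes event files under tensorboard/cnn_rnn_own_model_events/; while training runs they can be inspected by pointing the TensorBoard CLI at the parent directory, e.g. tensorboard --logdir tensorboard.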
Example #7
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=30,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    
    # make results/ directory, if necessary
    #if not os.path.exists('results'):
    #   os.makedirs('results')
        
    #if os.path.exists('/gdrive/My Drive/results/model_end.h5'):
    #    model.load_weights('/gdrive/My Drive/results/model_end.h5')
    
    resume_weights = '/gdrive/My Drive/results/rnn_model.hdf5'
    
    if os.path.isfile(resume_weights):
        print("Resumed model's weights from {}".format(resume_weights))
        model.load_weights(resume_weights)

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='/gdrive/My Drive/results/' + save_model_path,
                                   monitor='val_loss', verbose=0, save_best_only=False,
                                   save_weights_only=False, mode='auto', period=1)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
                               epochs=epochs, validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer], verbose=verbose)
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=1,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[checkpointer], verbose=verbose)

    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example #9
def get_predictions(index, partition, trained_model, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        trained_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features from Dataset
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    print("Trained model output length:\n" +
          str(trained_model.output_length(data_point.shape[0])))
    # obtain and decode the acoustic model's predictions
    trained_model.load_weights(model_path)
    prediction = trained_model.predict(np.expand_dims(data_point, axis=0))
    output_length = [trained_model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    transcription = ''.join(int_sequence_to_text(pred_ints))
    # Correction using KenLM language model toolkit
    corrected_transcription = correction(transcription)

    print('-' * 80)
    print(repr(audio_path).replace(r"\\", r"/"))
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Raw prediction:\n' + str(prediction[0]))
    print('CTC Decoded predicted Ints before conversion to text:\n' +
          str(pred_ints))
    print('Predicted transcription:\n' + '\n' + transcription)
    print('Predicted transcription with correction:\n' +
          corrected_transcription)
    print('-' * 80)
Example #10
def validation_sentences():
    """ storage the validation sentences
    Params:
        None
    """
    # load the test data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_validation_data()
    # append validation transcriptions (indices 491-500) to a corpus text file
    num = 500
    f = open('C:/Users/mribles/Desktop/corpus.txt', 'a')
    while num > 490:
        transcr = data_gen.valid_texts[num]
        f.write(transcr + '\n')
        num = num - 1
    f.close()
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    print(len(data_gen.valid_texts))
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    if not data_gen.valid_texts[index]:
        return

    with open(r'D:\DIPLOMSKA\results\predictions_cnn_rnn_12.txt',
              'a+',
              encoding='utf8') as fp:
        fp.write('True transcription:\n' + '\n' + transcr + '\n')
        #print(transcr)
        fp.write('-' * 30 + '\n')
        fp.write('Predicted transcription:\n' + '\n' +
                 ''.join(int_sequence_to_text(pred_ints)) + '\n')
        #print(int_sequence_to_text(pred_ints))
        fp.write('-' * 30 + '\n')
Example #12
def get_predictions_rec(input_to_softmax, a_path, model_path):
    """ Decode a single audio file at a_path with the trained acoustic model """
    data_gen = AudioGenerator(spectrogram=False)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    audio_path = a_path
    data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    return 'Predicted transcription:\n' + '\n' + ''.join(
        int_sequence_to_text(pred_ints))
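Example #12 transcribes an arbitrary recording rather than a dataset entry, so it only needs an audio path and a weights file. Because the generator is created with spectrogram=False, it presumably uses MFCC features (mfcc_dim defaults to 13 elsewhere on this page), so the acoustic model passed in is expected to take 13-dimensional input. A hypothetical call is sketched below; the model builder and both paths are placeholders, not names from the source.

# hypothetical usage of get_predictions_rec (placeholder model builder and paths)
# model = my_mfcc_model(input_dim=13, output_dim=29)
# print(get_predictions_rec(input_to_softmax=model,
#                           a_path='recordings/sample.wav',
#                           model_path='results/model_end.h5'))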
Example #13
def get_predictions(index,
                    partition,
                    input_to_softmax,
                    model_path,
                    spectogram=True,
                    mfcc_dim=13):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=spectogram, mfcc_dim=mfcc_dim)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights('results/' + model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Predicted transcription:\n' + '\n' +
          ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
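Nearly every decoding snippet above finishes with ''.join(int_sequence_to_text(pred_ints)), and Example #23 builds its alphabet from index_map.values(); neither helper is reproduced here. The sketch below shows the kind of integer-to-character mapping they imply, a 28-character alphabet plus the CTC blank (consistent with output_dim=29 in Example #6). The exact indices are assumptions; the project's own utils.py is authoritative.

# illustrative only: the real mapping lives in the project's utils.py
index_map = {1: "'", 2: ' '}
index_map.update({i: chr(ord('a') + i - 3) for i in range(3, 29)})  # 3..28 -> 'a'..'z'

def int_sequence_to_text(int_sequence):
    """ Convert a list of integer labels back into a list of characters """
    return [index_map[i] for i in int_sequence]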
Example #14
def _predict(name="200_32_3.wav"):
    '''
    Get the predicted results of a single sample
    :param name: file name of the audio clip to look up in the validation or training set
    '''

    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    audio_path_valid = data_gen.valid_audio_paths
    audio_path_train = data_gen.train_audio_paths
    idx = -1
    partition = "valid"
    for i in range(len(audio_path_valid)):
        rets = audio_path_valid[i].split('/')
        if rets[-1] == name:
            idx = i
            break
    if idx == -1:
        for i in range(len(audio_path_train)):
            rets = audio_path_train[i].split('/')
            if rets[-1] == name:
                idx = i
                partition = "train"
                break

    start = datetime.now()
    label, predicted = get_predictions(index=idx,
                                       partition=partition,
                                       input_to_softmax=mmodel1(
                                           input_dim=13,
                                           filters=512,
                                           kernel_size=5,
                                           conv_stride=1,
                                           conv_border_mode='same',
                                           units=1024,
                                           output_dim=95),
                                       model_path='results/mmodel1.h5')
    time = datetime.now() - start
    return label, predicted, str(time)
Example #15
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions ·
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    #print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
       prediction, output_length, greedy=True)[0][0]) + 1).flatten().tolist()
    Audio(audio_path)
    print('-' * 80)
    b = "".join(int_sequence_to_text(pred_ints))
    a = transcr
    print("Greedy_predictions:\n" + '\n' + ''.join(int_sequence_to_text(pred_ints)))
    print('1. Word Error Rate for ASR ==', wer(a, b)*100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',(endtime - starttime)/data_gen.valid_durations[index],'\n')
    print('-' * 80)
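The real-time factor reported here is the standard ratio of processing time to audio duration (the time spent decoding divided by data_gen.valid_durations[index]); a value below 1 means the system transcribes faster than real time.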
Example #16
def lexcion_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions ·
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    #print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    b = testline(prediction[0])
    a = transcr
    print("TokenPassing_predictions:\n")
    print(b + '\n')
    print('1. Word Error Rate for ASR ==', wer(a, b)*100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',(endtime - starttime)/data_gen.valid_durations[index])
Example #17
def train_model_history(
                input_to_softmax, 
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[], verbose=verbose)

    return hist
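Unlike the other training variants, train_model_history returns the Keras History object instead of pickling it, which makes it easy to inspect the loss curves directly. A small plotting sketch (matplotlib assumed available) follows.

import matplotlib.pyplot as plt

def plot_ctc_history(hist):
    """ Plot train/validation CTC loss from the History returned by train_model_history """
    plt.plot(hist.history['loss'], label='train')
    plt.plot(hist.history['val_loss'], label='validation')
    plt.xlabel('epoch')
    plt.ylabel('CTC loss')
    plt.legend()
    plt.show()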
Example #18
from wer import wer
from data_generator import AudioGenerator
from keras import backend as K
from utils import int_sequence_to_text
import numpy as np

from keras.callbacks import ModelCheckpoint, Callback

valid_cache = []
data_gen = AudioGenerator(spectrogram=True)
data_gen.load_train_data()
data_gen.load_validation_data()
for index in range(len(data_gen.valid_texts)):
    transcr = data_gen.valid_texts[index]
    audio_path = data_gen.valid_audio_paths[index]
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    valid_cache.append(data_point)


def calculate_wer2(input_to_softmax, model_path, words=False):
    # data_gen = AudioGenerator()
    # data_gen.load_train_data()
    # data_gen.load_validation_data()
    wers = []
    input_to_softmax.load_weights(model_path)

    l = len(data_gen.valid_texts)
    l = 100  # limit the evaluation to the first 100 validation examples
    for index in range(l):
        transcr = data_gen.valid_texts[index]
        # audio_path = data_gen.valid_audio_paths[index]
Example #19
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02,
                              decay=1e-6,
                              momentum=0.9,
                              nesterov=True,
                              clipnorm=5),
                epochs=30,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):

    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size

    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                  optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # Stop training once the model's performance stops improving on the held-out validation set.
    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               patience=3,
                               verbose=1,
                               mode='auto')
    # Reduce the learning rate if the result is not improving.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  patience=2,
                                  factor=0.1,
                                  min_lr=1e-6,
                                  mode='auto',
                                  verbose=1)
    # add checkpointer
    #checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    # Save the model after every epoch.
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path,
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True,
                                   mode='min',
                                   save_weights_only=True)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer, early_stop, reduce_lr],
                               verbose=verbose)

    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example #20
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02,
                              decay=1e-6,
                              momentum=0.9,
                              nesterov=True,
                              clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):

    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size

    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(
        loss={
            'ctc': lambda y_true, y_pred: y_pred
        },
        optimizer=optimizer,
    )

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    # checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    # callbacks
    # Notes:
    # added ReduceLROnPlateau to decrease the learning rate when the model doesn't improve for 2 epochs
    # added EarlyStopping to stop the training when the model clearly overfits
    callbacks = [
        ModelCheckpoint(filepath='results/' + save_model_path, verbose=0),
        ReduceLROnPlateau(monitor="val_loss",
                          factor=0.9,
                          patience=2,
                          verbose=verbose,
                          min_lr=0.001),
        EarlyStopping(patience=4)
    ]

    # train the model
    hist = model.fit_generator(
        generator=audio_gen.next_train(),
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        validation_data=audio_gen.next_valid(),
        validation_steps=validation_steps,
        callbacks=callbacks,
        verbose=verbose)  # originally `callbacks=[checkpointer]`

    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
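Most variants on this page persist hist.history with pickle under results/, so comparing runs afterwards only requires reading the files back. A small helper is sketched below; the file name in the usage comment is a placeholder.

import pickle

def load_history(path):
    """ Load the dict that train_model saved with pickle.dump(hist.history, f) """
    with open(path, 'rb') as f:
        return pickle.load(f)

# hypothetical usage:
# history = load_history('results/model_2.pickle')
# print('best val_loss:', min(history['val_loss']))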
Example #21
def get_group_predictions(input_to_softmax, model_path, partition):
    starttime = time.time()
    wer_sum = 0
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    input_to_softmax.load_weights(model_path)
    # obtain the true transcription and the audio features
    if partition == 'validation':
        num = 99
        while num >= 0:
            index = random.randint(1, 2500)
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
            # obtain and decode the acoustic model's predictions
            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False, beam_width=100, top_paths=1)[0][
                                    0]) + 1).flatten().tolist()
            # print('True transcription:\n' + '\n' + transcr)
            b = "".join(int_sequence_to_text(pred_ints))
            a = transcr
            # print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
            # print('-' * 80)
            # print('1.Editable Distance for ASR ==', edit(a, b), '\n')
            if wer(a, b) <= 1:
                print('index_%d' % index, ':')
                wer_sum = wer_sum + wer(a, b)
                print(wer(a, b))
                print("Transcription: ", a)
                print("Prediction:    ", b)
                print('-' * 80)
            else:
                # skip examples with WER > 1 and draw another random index
                num = num + 1
            num = num - 1
    elif partition == 'train':
        num = 999
        while num >= 0:
            index = random.randint(1, 10000)
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))

            # obtain and decode the acoustic model's predictions

            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False, beam_width=100, top_paths=1)[0][
                                    0]) + 1).flatten().tolist()
            # print('True transcription:\n' + '\n' + transcr)
            b = "".join(int_sequence_to_text(pred_ints))
            a = transcr
            # print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
            # print('-' * 80)
            # print('1.Editable Distance for ASR ==', edit(a, b), '\n')

            if wer(a, b) <= 1:
                print('index_%d' % index, ':')
                wer_sum = wer_sum + wer(a, b)
                print(wer(a, b))
                print("Transcription: ", a)
                print("Prediction:    ", b)
                print('-' * 80)
            else:
                # skip examples with WER > 1 and draw another random index
                num = num + 1
            num = num - 1
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    endtime = time.time()
    #print('1. Average Word Error Rate for ASR ==', wer_sum/100 , '%')
    # note: the divisor assumes 100 decoded samples, as in the validation branch
    print('1. Average Word Error Rate for ASR ==', wer_sum / 100)
    print('2. Average Real Time Factor for ASR ==', (endtime - starttime) / 100, '\n')
Example #22
def train_model(
        input_to_softmax,
        pickle_path,
        #save_model_path,
        train_json='train_corpus.json',
        valid_json='valid_corpus.json',
        minibatch_size=20,
        spectrogram=True,
        mfcc_dim=13,
        #optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
        optimizer='adam',  # note: compile() below also passes 'adam' directly
        epochs=20,
        verbose=1,
        sort_by_duration=True,
        max_duration=16.7):
    # create a class instance for obtaining batches of data
    audio_gen_train = AudioGenerator(minibatch_size=minibatch_size,
                                     spectrogram=spectrogram,
                                     mfcc_dim=mfcc_dim,
                                     max_duration=max_duration,
                                     sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen_train.load_train_data(train_json)
    audio_gen_test = AudioGenerator(minibatch_size=minibatch_size,
                                    spectrogram=spectrogram,
                                    mfcc_dim=mfcc_dim,
                                    max_duration=30.0,
                                    sort_by_duration=False)
    audio_gen_test.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen_train.train_audio_paths)
    print('number of train examples ==', num_train_examples)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen_test.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    # model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer, metrics=['accuracy'])
    model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                  optimizer='adam')
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    #checkpointer = ModelCheckpoint(filepath='results/' + save_model_path, verbose=0, period=1)
    filepath = "results/test-{epoch:02d}.hdf5"
    checkpointer = ModelCheckpoint(filepath, verbose=0, period=5)
    # train the model
    hist = model.fit_generator(generator=audio_gen_train.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen_test.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)

    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
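Example #22 writes periodic checkpoints named results/test-NN.hdf5 every five epochs instead of a single file. To resume training from the newest one, something like the following helper could be used; it is a sketch, not part of the source.

import glob
import os

def latest_checkpoint(pattern='results/test-*.hdf5'):
    """ Return the most recently written checkpoint matching the pattern, or None """
    files = glob.glob(pattern)
    return max(files, key=os.path.getmtime) if files else None

# hypothetical usage before calling fit_generator again:
# resume_path = latest_checkpoint()
# if resume_path:
#     model.load_weights(resume_path)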
Example #23
def get_predictions(indexes, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # loading language model
    alphabet = ''.join(index_map.values())
    language_model = LanguageModel('data/word/corpus.txt', alphabet)

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)

    for index in indexes:
        # obtain the true transcription and the audio features
        if partition == 'validation':
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
        elif partition == 'train':
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
        elif partition == 'test':
            transcr = 'hello how are you'
            audio_path = '../datasets/AgentExpress/hello.wav.wav'
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
            #print(data_point)
        else:
            raise Exception(
                'Invalid partition!  Must be "train", "validation" or "test"')

        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))

        output_length = [input_to_softmax.output_length(data_point.shape[0])]

        #print(K.eval(K.ctc_decode(prediction, output_length, False, 30, 1)[0][0]))
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                     1).flatten().tolist()

        # play the audio file, and display the true and predicted transcriptions
        print('-' * 80)
        Audio(audio_path)
        print('ground_truth:' + ' ' * 4 + transcr)
        print('best_path:' + ' ' * 7 +
              ''.join(int_sequence_to_text(pred_ints)))
        pred_beam = ctcBeamSearch(prediction[0], alphabet, None)
        print('beam_search:' + ' ' * 5 + pred_beam)
        pred_beam_lm = ctcBeamSearch(prediction[0], alphabet, language_model)
        print('beam_search_lm:' + ' ' * 2 + pred_beam_lm)
        pred_token = ctcTokenPassing(prediction[0], alphabet,
                                     language_model.getWordList())
        print('token_passing:' + ' ' * 3 + pred_token)
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='data-test-clean.json',
                valid_json='data-train-clean.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.005, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                #optimizer=Adagrad(lr=0.005),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    print("# create a class instance for obtaining batches of data")
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    print("# add the training data to the generator")
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    print("# calculate steps_per_epoch")
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    #print("num_train_examples : ", num_train_examples)
    steps_per_epoch = num_train_examples//minibatch_size
    #print("steps_per_epoch : ", steps_per_epoch)
    print("# calculate validation_steps")
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    print("# add CTC loss to the NN specified in input_to_softmax")
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    
    print("# CTC loss is implemented elsewhere, so use a dummy lambda function for the loss")
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    print("# make results/ directory, if necessary")
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    print("# add checkpointer")    
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)
    #print(checkpointer)
    
    
    print("# train the model")
    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[checkpointer], verbose=verbose)

    print("# save model loss")
    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)