Example No. 1
def get_predictions(index, partition, input_to_softmax, model_path, spectrogram_features=True):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights

    return the predicted probability matrix (in a 2D matrix) and the ground truth
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=spectrogram_features)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    return (prediction[0], transcr, audio_path)
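Example No. 1 returns the raw per-frame probability matrix rather than decoded text. As a hedged sketch of what a caller could do with it, here is a greedy best-path decode; the blank index of 28 and the +1 offset mirror the assumptions the other examples make when they call K.ctc_decode and int_sequence_to_text, and may differ in the real project:

import numpy as np

def greedy_decode(probs, blank=28):
    """Best-path decode: argmax per frame, collapse repeats, drop the CTC blank."""
    best_path = np.argmax(probs, axis=-1)
    labels, prev = [], None
    for p in best_path:
        if p != prev and p != blank:
            # +1 mirrors the shift applied after K.ctc_decode in the other examples
            labels.append(int(p) + 1)
        prev = p
    return ''.join(int_sequence_to_text(labels))

# probs, true_text, path = get_predictions(0, 'validation', model, 'results/model.h5')  # placeholders
# print(greedy_decode(probs))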
Example No. 2
def get_predictions(index, partition, input_to_softmax, model_path):
    '''
    Get the model's decoded predictions to calculate metrics
    '''
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'valid':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "valid"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    label = transcr
    predicted = ''.join(int_sequence_to_text(pred_ints))

    return label, predicted
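Both examples above convert integer label sequences back to text with int_sequence_to_text, which comes from the project's character-map module and is not shown here. A minimal sketch of what that helper (and its inverse) could look like, assuming a 1-indexed alphabet of space, a-z and the apostrophe; the real map may be ordered differently:

# assumed alphabet; index 0 is left unused so the labels stay 1-based
char_list = [' '] + [chr(c) for c in range(ord('a'), ord('z') + 1)] + ["'"]
index_map = {i + 1: ch for i, ch in enumerate(char_list)}
char_map = {ch: i for i, ch in index_map.items()}

def int_sequence_to_text(int_sequence):
    """Convert a list of integer labels into a list of characters."""
    return [index_map[i] for i in int_sequence]

def text_to_int_sequence(text):
    """Convert a transcription string into a list of integer labels."""
    return [char_map[ch] for ch in text]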
Example No. 3
def compare_predictions(index,
                        partition,
                        inputs_to_softmax=[],
                        model_paths=[],
                        phn=False):
    """ Print a model's decoded predictions
	Params:
		index (int): The example you would like to visualize
		partition (str): One of 'train' or 'validation'
		input_to_softmax (Model): The acoustic model
		model_path (str): Path to saved acoustic model's weights
	"""
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                    1).flatten().tolist()
        pred_ints.append(pred_int)

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    for i, pred_in in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i,
              ':\n' + '\n' + ''.join(int_sequence_to_text(pred_in, phn)))
        print('-' * 80)
Example No. 4
def predict_test(input_to_softmax, model_path, audio_range=100000):
    '''
    Predict every example in the test set, up to audio_range examples
    (the default of 100000 effectively covers the whole set).
    '''
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    transcr = data_gen.test_texts
    audio_path = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)
    predictions = []
    num_examples = min(audio_range, len(audio_path))
    for i in range(num_examples):
        data_point = data_gen.normalize(data_gen.featurize(audio_path[i]))

        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(
                prediction, output_length)[0][0]) + 1).flatten().tolist()
        pred = ''.join(int_sequence_to_text(pred_ints))
        predictions.append(pred)

    # write the predictions and the matching ground-truth transcriptions to disk
    with open("predictions/predictions.txt", "w") as output:
        output.write(''.join(predictions))
    with open("predictions/truescr.txt", "w") as output:
        output.write(''.join(transcr[:num_examples]))
Example No. 5
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json = 'train_corpus.json',
                valid_json = 'valid_corpus.json',
                minibatch_size = 20,
                spectrogram = True,
                mfcc_dim = 13,
                optimizer = SGD(lr = 0.02, decay = 1e-6, momentum = 0.9, nesterov = True, clipnorm = 5),
                epochs = 20,
                verbose = 1,
                sort_by_duration = False,
                max_duration = 10.0):
    
    audio_gen = AudioGenerator(minibatch_size = minibatch_size, spectrogram = spectrogram, mfcc_dim = mfcc_dim, 
        max_duration = max_duration, sort_by_duration = sort_by_duration)
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    model = add_ctc_loss(input_to_softmax)
    model.compile(loss = {'ctc': lambda y_true, y_pred: y_pred}, optimizer = optimizer)
    if not os.path.exists('results'):
        os.makedirs('results')

    checkpointer = ModelCheckpoint(filepath = 'results/' + save_model_path, verbose = 0)
    hist = model.fit_generator(generator = audio_gen.next_train(), steps_per_epoch = steps_per_epoch,
        epochs = epochs, validation_data = audio_gen.next_valid(), validation_steps = validation_steps,
        callbacks = [checkpointer], verbose = verbose)

    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
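Every train_model variant in these examples calls add_ctc_loss and then compiles with the dummy `lambda y_true, y_pred: y_pred` loss, because the CTC loss is computed inside the model itself. A sketch of what add_ctc_loss might look like, built on keras.backend.ctc_batch_cost; the layer names and input shapes below are assumptions:

from keras import backend as K
from keras.layers import Input, Lambda
from keras.models import Model

def ctc_lambda_func(args):
    # K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

def add_ctc_loss(input_to_softmax):
    the_labels = Input(name='the_labels', shape=(None,), dtype='float32')
    input_lengths = Input(name='input_length', shape=(1,), dtype='int64')
    label_lengths = Input(name='label_length', shape=(1,), dtype='int64')
    # map the number of input frames to the number of CTC output steps
    output_lengths = Lambda(input_to_softmax.output_length)(input_lengths)
    # the layer is named 'ctc', which is why train_model compiles with
    # loss={'ctc': lambda y_true, y_pred: y_pred}
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
        [input_to_softmax.output, the_labels, output_lengths, label_lengths])
    return Model(inputs=[input_to_softmax.input, the_labels, input_lengths, label_lengths],
                 outputs=loss_out)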
Example No. 6
def get_predictions(index, partition, trained_model, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        trained_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features from Dataset
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    print("Trained model output length:\n" +
          str(trained_model.output_length(data_point.shape[0])))
    # obtain and decode the acoustic model's predictions
    trained_model.load_weights(model_path)
    prediction = trained_model.predict(np.expand_dims(data_point, axis=0))
    output_length = [trained_model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    transcription = ''.join(int_sequence_to_text(pred_ints))
    # Correction using KenLM language model toolkit
    corrected_transcription = correction(transcription)

    print('-' * 80)
    print(audio_path.replace('\\', '/'))
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Raw prediction:\n' + str(prediction[0]))
    print('CTC Decoded predicted Ints before conversion to text:\n' +
          str(pred_ints))
    print('Predicted transcription:\n' + '\n' + transcription)
    print('Predicted transcription with correction:\n' +
          corrected_transcription)
    print('-' * 80)
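Several snippets call input_to_softmax.output_length(...) to translate the number of input frames into the number of CTC output steps. For an acoustic model whose front end is a single 1-D convolution, that mapping is typically the standard convolution output-length formula; a sketch under that assumption (parameter names and defaults are illustrative):

def cnn_output_length(input_length, filter_size, border_mode, stride, dilation=1):
    """Length of the output sequence after a 1-D convolution.

    border_mode: 'same' or 'valid', matching the conv layer's padding.
    """
    if input_length is None:
        return None
    dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
    if border_mode == 'same':
        output_length = input_length
    elif border_mode == 'valid':
        output_length = input_length - dilated_filter_size + 1
    return (output_length + stride - 1) // stride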
Example No. 7
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02,
                              decay=1e-6,
                              momentum=0.9,
                              nesterov=True,
                              clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)
    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                  optimizer=optimizer)
    # make results directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath=os.path.join(
        'results', save_model_path),
                                   verbose=0)
    # train the model (a negative verbose value switches to the TQDM notebook
    # progress bar instead of Keras' built-in logging)
    callbacks = [TQDMNotebookCallback(), checkpointer
                 ] if verbose < 0 else [checkpointer]
    hist = model.fit_generator(generator=audio_gen.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=callbacks,
                               verbose=verbose)
    # save model loss
    with open(os.path.join('results', pickle_path), 'wb') as f:
        pickle.dump(hist.history, f)
Example No. 8
def train_model(input_to_softmax, 
				phn,
				pickle_path,
				save_model_path,
				train_json='JSON\\train_corpus',
				valid_json='JSON\\test_corpus',
				minibatch_size=10,
				mfcc_dim=13,
				optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
				epochs=20,
				verbose=1,
				sort_by_duration=False,
				max_duration=20.0):
	

	# create a class instance for obtaining batches of data
	audio_gen = AudioGenerator(minibatch_size=minibatch_size,  mfcc_dim=mfcc_dim, max_duration=max_duration,
		sort_by_duration=sort_by_duration)
	# add the training data to the generator
	audio_gen.load_train_data(train_json)
	audio_gen.load_test_data(valid_json)
	# calculate steps_per_epoch
	if phn:
		num_train_examples=len(audio_gen.train_phn_audio_paths)
		steps_per_epoch = num_train_examples//minibatch_size
	elif not phn:
		num_train_examples=len(audio_gen.train_wrd_audio_paths)
		steps_per_epoch = num_train_examples//minibatch_size
	# calculate validation_steps
	if phn:
		num_valid_samples = len(audio_gen.test_phn_audio_paths) 
		validation_steps = num_valid_samples//minibatch_size
	elif not phn:
		num_valid_samples = len(audio_gen.test_wrd_audio_paths) 
		validation_steps = num_valid_samples//minibatch_size
	
	# add CTC loss to the NN specified in input_to_softmax
	model = add_ctc_loss(input_to_softmax)

	# CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
	model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

	# make models/ directory, if necessary
	if not os.path.exists('models'):
		os.makedirs('models')

	# add checkpointer
	checkpointer = ModelCheckpoint(filepath='models/'+save_model_path, verbose=0)

	# train the model
	generator=audio_gen.next_train(phn)
	validation_data=audio_gen.next_test(phn)
	hist = model.fit_generator(generator=generator, steps_per_epoch=steps_per_epoch,
		epochs=epochs, validation_data=validation_data, validation_steps=validation_steps,
		callbacks=[checkpointer], verbose=verbose)

	# save model loss
	with open('models/'+pickle_path, 'wb') as f:
		pickle.dump(hist.history, f)
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                # another option for the optimizer
                # optimizer=RMSprop(clipvalue=0.5),
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=1.0, clipvalue=0.5),
                # clipnorm was originally set to 5
                # there are many exploding gradients and clipnorm/clipvalue can help
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=20.0):

    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
                               sort_by_duration=sort_by_duration)

    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)

    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size

    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    keras.backend.get_session().run(tf.global_variables_initializer())

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
                               epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
                               callbacks=[checkpointer], verbose=verbose)

    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example No. 10
def validation_sentences():
    """ storage the validation sentences
    Params:
        None
    """
    # load the test data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    num = 500
    with open('C:/Users/mribles/Desktop/corpus.txt', 'a') as f:
        while num > 490:
            transcr = data_gen.valid_texts[num]
            f.write(transcr + '\n')
            num = num - 1
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    print(len(data_gen.valid_texts))
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # skip examples with an empty ground-truth transcription
    if not transcr:
        return

    with open(r'D:\DIPLOMSKA\results\predictions_cnn_rnn_12.txt',
              'a+',
              encoding='utf8') as fp:
        fp.write('True transcription:\n' + '\n' + transcr + '\n')
        #print(transcr)
        fp.write('-' * 30 + '\n')
        fp.write('Predicted transcription:\n' + '\n' +
                 ''.join(int_sequence_to_text(pred_ints)) + '\n')
        #print(int_sequence_to_text(pred_ints))
        fp.write('-' * 30 + '\n')
Example No. 12
def get_predictions(index,
                    partition,
                    input_to_softmax,
                    model_path,
                    spectrogram=True,
                    mfcc_dim=13):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
        spectrogram (bool): Use spectrogram features if True, MFCC otherwise
        mfcc_dim (int): Number of MFCC features
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=spectrogram, mfcc_dim=mfcc_dim)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights('results/' + model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Predicted transcription:\n' + '\n' +
          ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
Example No. 13
def train_my_model(model,
                   pickle_path,
                   save_model_path,
                   train_json='train_corpus.json',
                   valid_json='valid_corpus.json',
                   minibatch_size=20,
                   spectrogram=True,
                   epochs=20,
                   verbose=1,
                   sort_by_duration=False,
                   max_duration=40.0):
    """
        Gabriel Freire: Train my own model
        sample_models.py > own_model(input_dim=161, output_dim=29)
    """
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    print("Num of training examples: {}".format(num_train_examples))
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    print("Num of validation examples: {}".format(num_valid_samples))
    
    # add ctc loss
    model = add_ctc_loss(model)
    # Compile
    optimizer = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
    # optimizer = Adam(lr=0.02, clipnorm=5, decay=1e-6)
    model.compile(loss=ctc, optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # make tensorboard/ directory, if necessary
    if not os.path.exists('tensorboard'):
        os.makedirs('tensorboard')
    
    # add checkpointer and tensorboard callbacks
    checkpointer = ModelCheckpoint(filepath='results/' + save_model_path, verbose=0)
    tensorboard = TensorBoard(log_dir='tensorboard/{}/'.format('cnn_rnn_own_model_events'), write_graph=False, write_images=True)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[checkpointer, tensorboard], verbose=verbose)

    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
def get_predictions(data_gen: AudioGenerator,
                    model,
                    partition, index, omit_true=False, print_line=True):
    """ Print a model's decoded predictions
    Params:
        data_gen (AudioGenerator): Generator with the train/validation data already loaded
        model (Model): The acoustic model, with its weights already loaded
        partition (str): One of 'train' or 'validation'
        index (int): The example you would like to visualize
        omit_true (bool): Skip printing the true transcription if True
        print_line (bool): Print a separator line after the prediction if True
    """
    # load the train and test data
    # data_gen = AudioGenerator()
    # data_gen.load_train_data()
    # data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if data_gen is None:
        raise ValueError("Data Generator is None!")
    if partition == 'validation':
        transcription = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcription = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length)[0][0]) + 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    # Audio(audio_path)
    input_type = "SPEC" if data_gen.spectrogram else "MFCC"
    if not omit_true:
        print('TRUE:      ' + transcription)
    print('PRED ' + input_type + ': ' + ''.join(int_sequence_to_text(pred_ints)))
    if print_line:
        print('-' * 82)
    return audio_path
Example No. 15
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions ·
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    #print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
       prediction, output_length, greedy=True)[0][0]) + 1).flatten().tolist()
    Audio(audio_path)
    print('-' * 80)
    b = "".join(int_sequence_to_text(pred_ints))
    a = transcr
    print("Greedy_predictions:\n" + '\n' + ''.join(int_sequence_to_text(pred_ints)))
    print('1. Word Error Rate for ASR ==', wer(a, b)*100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',(endtime - starttime)/data_gen.valid_durations[index],'\n')
    print('-' * 80)
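Examples 15, 16 and 30 report a word error rate through a wer(a, b) helper that is not shown in these snippets. A minimal sketch of such a function, assuming the usual definition (word-level Levenshtein distance divided by the reference length, returned as a fraction):

def wer(reference, hypothesis):
    """Word error rate: edit distance over word tokens / number of reference words."""
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref)][len(hyp)] / max(len(ref), 1)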
Example No. 16
def lexcion_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions ·
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    #print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    b = testline(prediction[0])
    a = transcr
    print("TokenPassing_predictions:\n")
    print(b + '\n')
    print('1. Word Error Rate for ASR ==', wer(a, b)*100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',(endtime - starttime)/data_gen.valid_durations[index])
Example No. 17
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=30,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    
    # make results/ directory, if necessary
    #if not os.path.exists('results'):
    #   os.makedirs('results')
        
    #if os.path.exists('/gdrive/My Drive/results/model_end.h5'):
    #    model.load_weights('/gdrive/My Drive/results/model_end.h5')
    
    resume_weights = '/gdrive/My Drive/results/rnn_model.hdf5'
    
    if os.path.isfile(resume_weights):
        print ("Resumed model's weights from {}".format(resume_weights))
        model.load_weights(resume_weights)
    
    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='/gdrive/My Drive/results/'+save_model_path, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
                               epochs=epochs, validation_data=audio_gen.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer], verbose=verbose)
Example No. 18
def get_predictions_rec(input_to_softmax, a_path, model_path):
    data_gen = AudioGenerator(spectrogram=False)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    audio_path = a_path
    data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    return 'Predicted transcription:\n' + '\n' + ''.join(
        int_sequence_to_text(pred_ints))
Example No. 19
def train_model(input_to_softmax, 
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=1,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[checkpointer], verbose=verbose)

    # save model loss
    with open('results/'+pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example No. 20
def train_model_history(
                input_to_softmax, 
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):
    
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, 
        spectrogram=spectrogram, mfcc_dim=mfcc_dim, max_duration=max_duration,
        sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples=len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples//minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths) 
    validation_steps = num_valid_samples//minibatch_size
    
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # train the model
    hist = model.fit_generator(generator=audio_gen.next_train(), steps_per_epoch=steps_per_epoch,
        epochs=epochs, validation_data=audio_gen.next_valid(), validation_steps=validation_steps,
        callbacks=[], verbose=verbose)

    return hist
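Unlike the other train_model variants, train_model_history returns the Keras History object instead of pickling it, so the loss curves can be inspected directly. A hedged usage sketch; model_0 stands in for an acoustic model defined elsewhere:

import matplotlib.pyplot as plt

hist = train_model_history(input_to_softmax=model_0)  # model_0 is a placeholder

plt.plot(hist.history['loss'], label='training loss')
plt.plot(hist.history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.ylabel('CTC loss')
plt.legend()
plt.show()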
Example No. 21
def _predict(name="200_32_3.wav"):
    '''
    Get the predicted results of a single sample
    :param name:
    '''

    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    audio_path_valid = data_gen.valid_audio_paths
    audio_path_train = data_gen.train_audio_paths
    idx = -1
    partition = "valid"
    for i in range(len(audio_path_valid)):
        rets = audio_path_valid[i].split('/')
        if rets[-1] == name:
            idx = i
            break
    if idx == -1:
        for i in range(len(audio_path_train)):
            rets = audio_path_train[i].split('/')
            if rets[-1] == name:
                idx = i
                partition = "train"
                break
    if idx == -1:
        raise ValueError('Audio file {} not found in the train or validation data'.format(name))

    start = datetime.now()
    label, predicted = get_predictions(index=idx,
                                       partition=partition,
                                       input_to_softmax=mmodel1(
                                           input_dim=13,
                                           filters=512,
                                           kernel_size=5,
                                           conv_stride=1,
                                           conv_border_mode='same',
                                           units=1024,
                                           output_dim=95),
                                       model_path='results/mmodel1.h5')
    time = datetime.now() - start
    return label, predicted, str(time)
Example No. 22
def get_predictions(audio_path, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """

    # print("OK");
    # return;

    # load the train and test data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)

    # read and get features
    # audio_path = "./samples/16/19/16-19-0159.wav"

    # print("audio_path:{}".format(audio_path))

    # data not normalized yet
    data_point = data_gen.featurize(audio_path)
    # print("shape:{}".format(data_gen.featurize(audio_path).shape))

    # print("feats_mean: {}".format(data_gen.feats_mean))
    # print("feats_std: {}".format(data_gen.feats_std))
    # print("feats_mean: {}".format(data_gen.feats_mean.shape))
    # print("feats_std: {}".format(data_gen.feats_std.shape))

    feats_mean = np.array([
        14.81652005, -0.1802923, -1.22285122, 0.87062853, -16.05643781,
        -14.03943633, -5.7298706, -15.52425927, -3.39637537, -3.85226744,
        -5.17435844, -2.13766871, -11.39111645
    ])
    feats_std = np.array([
        7.16816358, 14.58747728, 11.99928947, 15.69431836, 14.45918537,
        16.79930368, 13.98395715, 12.60133111, 11.61310503, 11.34526655,
        12.01205471, 13.41467652, 10.89021869
    ])

    # print("feats_mean: {}".format(feats_mean))
    # print("feats_std: {}".format(feats_std))
    # print("feats_mean: {}".format(feats_mean.shape))
    # print("feats_std: {}".format(feats_std.shape))

    # print(data_gen.featurize(audio_path).shape)
    # normalize data
    eps = 1e-14
    data_point = (data_point - feats_mean) / (feats_std + eps)

    # data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # print("data_point,shape:{}".format(data_point.shape))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    recognized_text = "".join(int_sequence_to_text(pred_ints))
    print(recognized_text)
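Example No. 22 hard-codes the MFCC mean and standard deviation because no corpus is loaded. A sketch of how the same statistics could be computed from the training set instead (the sample size of 100 is an arbitrary choice):

import numpy as np

def compute_feature_stats(k_samples=100):
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    # stack the MFCC frames of the first k_samples training utterances
    paths = data_gen.train_audio_paths[:k_samples]
    feats = np.vstack([data_gen.featurize(p) for p in paths])
    return np.mean(feats, axis=0), np.std(feats, axis=0)

# feats_mean, feats_std = compute_feature_stats()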
Example No. 23
def train_model(
        input_to_softmax,
        pickle_path,
        #save_model_path,
        train_json='train_corpus.json',
        valid_json='valid_corpus.json',
        minibatch_size=20,
        spectrogram=True,
        mfcc_dim=13,
        #optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
        optimizer='adam',
        epochs=20,
        verbose=1,
        sort_by_duration=True,
        max_duration=16.7):
    # create a class instance for obtaining batches of data
    audio_gen_train = AudioGenerator(minibatch_size=minibatch_size,
                                     spectrogram=spectrogram,
                                     mfcc_dim=mfcc_dim,
                                     max_duration=max_duration,
                                     sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen_train.load_train_data(train_json)
    audio_gen_test = AudioGenerator(minibatch_size=minibatch_size,
                                    spectrogram=spectrogram,
                                    mfcc_dim=mfcc_dim,
                                    max_duration=30.0,
                                    sort_by_duration=False)
    audio_gen_test.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen_train.train_audio_paths)
    print('number of train examples ==', num_train_examples)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen_test.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size
    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    # model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer, metrics=['accuracy'])
    model.compile(loss={
        'ctc': lambda y_true, y_pred: y_pred
    },
                  optimizer=optimizer)
    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    #checkpointer = ModelCheckpoint(filepath='results/' + save_model_path, verbose=0, period=1)
    filepath = "results/test-{epoch:02d}.hdf5"
    checkpointer = ModelCheckpoint(filepath, verbose=0, period=5)
    # train the model
    hist = model.fit_generator(generator=audio_gen_train.next_train(),
                               steps_per_epoch=steps_per_epoch,
                               epochs=epochs,
                               validation_data=audio_gen_test.next_valid(),
                               validation_steps=validation_steps,
                               callbacks=[checkpointer],
                               verbose=verbose)

    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example No. 24
def generate_corpus(desc_file):
    data_sentences = AudioGenerator()
    #data_gen.load_train_data(desc_file=desc_file)
    data_sentences.load_train_data(desc_file=desc_file)
    sentences = data_sentences.train_texts
    return sentences
Example No. 25
'''
and predicts character sequence

neural network will process each frame of the spectrogram

length of x != length of y
use CTC = connectionist temporal classification
'''
if __name__ == "__main__":
    # datasets
    partition = {
        'train': 'LibriSpeech/dev-clean/84/121123/84-121123.trans.txt',
        'validation': 'LibriSpeech/dev-clean/84/121550/84-121550.trans.txt'
    }

    # Generators
    training_generator = AudioGenerator(descr_file=partition['train'],
                                        batch_size=20)
    validation_generator = AudioGenerator(descr_file=partition['validation'],
                                          batch_size=20)

    # get this model working first and then use lstm
    model = cnn_rnn_model(input_dim=161,
                          filters=200,
                          kernel_size=11,
                          conv_stride=2,
                          conv_border_mode='valid',
                          units=200)

    train_model(input_to_softmax=model,
                pickle_path='model_0.pickle',
                train_generator=training_generator,
                validation_generator=validation_generator,
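The docstring at the top of this example notes that the number of input frames differs from the number of target characters, which is why every example above decodes with CTC. A small self-contained illustration of that collapsing behaviour, assuming the same Keras backend the other snippets use:

import numpy as np
from keras import backend as K

# one utterance, 8 frames, 5 classes (4 symbols + CTC blank as the last class)
# argmax path: 0 0 blank 1 1 blank 2 2  ->  collapses to [0, 1, 2]
path = [0, 0, 4, 1, 1, 4, 2, 2]
y_pred = np.full((1, 8, 5), 0.01, dtype='float32')
for t, c in enumerate(path):
    y_pred[0, t, c] = 0.96

decoded, _ = K.ctc_decode(y_pred, input_length=[8], greedy=True)
print(K.eval(decoded[0]))  # [[0 1 2]] -- three labels from eight input frames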
Example No. 26
lr_scheduler = LearningRateScheduler(scheduler)
lr_tracker = SGDLearningRateTracker()
lr_plateau = ReduceLROnPlateau(
    monitor='val_acc', mode='max', patience=4, factor=np.sqrt(0.1),
    verbose=1, min_lr=1e-6)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', type=str, default='0', help='which fold')
    FLAGS, _ = parser.parse_known_args()
    print('conduct train and test in fold {0}'.format(FLAGS.fold))
    train_generator = AudioGenerator(
        root_dir= '../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=TRAIN_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='train',
        augmentation_prob=30,
    )
    # train_generator.steps_per_epoch = train_generator.steps_per_epoch * 2
    valid_generator = AudioGenerator(
        root_dir= '../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=VALID_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='valid',
    )
    preds = np.zeros((len(fname_test), n_classes))
    for run in range(RUNS_IN_FOLD):
        print('fold {0} runs {1}'.format(FLAGS.fold, run))
        # use model check point callbacks
Example No. 27
def get_predictions(indexes, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # loading language model
    alphabet = ''.join(index_map.values())
    language_model = LanguageModel('data/word/corpus.txt', alphabet)

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)

    for index in indexes:
        # obtain the true transcription and the audio features
        if partition == 'validation':
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
        elif partition == 'train':
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
        elif partition == 'test':
            transcr = 'hello how are you'
            audio_path = '../datasets/AgentExpress/hello.wav.wav'
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
            #print(data_point)
        else:
            raise Exception(
                'Invalid partition!  Must be "train", "validation" or "test"')

        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))

        output_length = [input_to_softmax.output_length(data_point.shape[0])]

        #print(K.eval(K.ctc_decode(prediction, output_length, False, 30, 1)[0][0]))
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                     1).flatten().tolist()

        # play the audio file, and display the true and predicted transcriptions
        print('-' * 80)
        Audio(audio_path)
        print('ground_truth:' + ' ' * 4 + transcr)
        print('best_path:' + ' ' * 7 +
              ''.join(int_sequence_to_text(pred_ints)))
        pred_beam = ctcBeamSearch(prediction[0], alphabet, None)
        print('beam_search:' + ' ' * 5 + pred_beam)
        pred_beam_lm = ctcBeamSearch(prediction[0], alphabet, language_model)
        print('beam_search_lm:' + ' ' * 2 + pred_beam_lm)
        pred_token = ctcTokenPassing(prediction[0], alphabet,
                                     language_model.getWordList())
        print('token_passing:' + ' ' * 3 + pred_token)
Example No. 28
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()
enc = OneHotEncoder(sparse=False)
y_train_int = label_enc.fit_transform(y_train)
y_train_int = y_train_int.reshape(len(y_train_int), 1)
y_train_one_hot = enc.fit_transform(y_train_int)

y_test_int = label_enc.transform(y_test)
y_test_int = y_test_int.reshape(len(y_test_int), 1)
y_test_one_hot = enc.transform(y_test_int)

# Create audio generator
audio_gen = AudioGenerator(batch_size=batch_size,
                           fns=X_train_fn,
                           labels=y_train_one_hot,
                           mode=mode)
valid_gen = AudioGenerator(batch_size=batch_size,
                           fns=X_test_fn,
                           labels=y_test_one_hot,
                           mode=mode)
l, Sxx = audio_gen.rnd_one_sample()

num_train = audio_gen.get_train_test_num()
num_test = valid_gen.get_train_test_num()
print(num_train, num_test)
step_per_epoch = num_train // batch_size
validation_step = num_test // batch_size
image_shape = Sxx.shape
print(image_shape)
Example No. 29
def generate_corpus(desc_file):
    #outputs a list of sentences
    data_sentences = AudioGenerator()
    data_sentences.load_train_data(desc_file=desc_file)
    sentences = data_sentences.train_texts
    return sentences
Example No. 30
def get_group_predictions(input_to_softmax, model_path, partition):
    starttime = time.time()
    wer_sum = 0
    num_scored = 0  # number of utterances actually scored
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    input_to_softmax.load_weights(model_path)
    # obtain the true transcription and the audio features
    if partition == 'validation':
        num = 99
        while num >= 0:
            index = random.randint(1, 2500)
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
            # obtain and decode the acoustic model's predictions
            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False, beam_width=100, top_paths=1)[0][
                                    0]) + 1).flatten().tolist()
            # print('True transcription:\n' + '\n' + transcr)
            b = "".join(int_sequence_to_text(pred_ints))
            a = transcr
            # print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
            # print('-' * 80)
            # print('1.Editable Distance for ASR ==', edit(a, b), '\n')
            if wer(a, b) <= 1:
                print('index_%d' % index, ':')
                wer_sum = wer_sum + wer(a, b)
                num_scored = num_scored + 1
                print(wer(a, b))
                print("Transcription: ", a)
                print("Prediction:    ", b)
                print('-' * 80)
            else:
                # this utterance was skipped, so draw another one in its place
                num = num + 1
            num = num - 1
    elif partition == 'train':
        num = 999
        while num >= 0:
            index = random.randint(1, 10000)
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))

            # obtain and decode the acoustic model's predictions

            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False, beam_width=100, top_paths=1)[0][
                                    0]) + 1).flatten().tolist()
            # print('True transcription:\n' + '\n' + transcr)
            b = "".join(int_sequence_to_text(pred_ints))
            a = transcr
            # print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
            # print('-' * 80)
            # print('1.Editable Distance for ASR ==', edit(a, b), '\n')

            if wer(a, b) <= 1:
                print('index_%d' % index, ':')
                wer_sum = wer_sum + wer(a, b)
                num_scored = num_scored + 1
                print(wer(a, b))
                print("Transcription: ", a)
                print("Prediction:    ", b)
                print('-' * 80)
            else:
                # this utterance was skipped, so draw another one in its place
                num = num + 1
            num = num - 1
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    endtime = time.time()
    print('1. Average Word Error Rate for ASR ==', wer_sum / max(num_scored, 1))
    print('2. Average Real Time Factor for ASR ==', (endtime - starttime) / max(num_scored, 1), '\n')
Example No. 31
    ##################################################
    print('prepare test data')
    d_test = pickle.load(open(test_dir + 'test_{0}.pkl'.format(FE_TYPE), 'rb'))
    fname_test, X_test = d_test['fname'], d_test['data']
    X_test = X_test.reshape(tuple(list(X_test.shape) + [1])).astype('float32')
    del d_test
    gc.collect()

    ##################################################
    # make data generator
    ##################################################
    print('prepare train data in fold {0}'.format(FLAGS.fold))
    train_generator = AudioGenerator(
        root_dir='../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=TRAIN_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='train',
        augmentation_prob=0,
    )
    # train_generator.steps_per_epoch = train_generator.steps_per_epoch * 2
    print('prepare valid data in fold {0}'.format(FLAGS.fold))
    valid_generator = AudioGenerator(
        root_dir='../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=VALID_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='valid',
    )
    # prepare valid data
    fname_valid = valid_generator.in_fold_data['fname']
    truth_valid = valid_generator.in_fold_data['truth']
Example No. 32
def train_model(input_to_softmax,
                pickle_path,
                save_model_path,
                train_json='train_corpus.json',
                valid_json='valid_corpus.json',
                minibatch_size=20,
                spectrogram=True,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02,
                              decay=1e-6,
                              momentum=0.9,
                              nesterov=True,
                              clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=10.0):

    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size,
                               spectrogram=spectrogram,
                               mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_validation_data(valid_json)
    # calculate steps_per_epoch
    num_train_examples = len(audio_gen.train_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    # calculate validation_steps
    num_valid_samples = len(audio_gen.valid_audio_paths)
    validation_steps = num_valid_samples // minibatch_size

    # add CTC loss to the NN specified in input_to_softmax
    model = add_ctc_loss(input_to_softmax)

    # CTC loss is implemented elsewhere, so use a dummy lambda function for the loss
    model.compile(
        loss={
            'ctc': lambda y_true, y_pred: y_pred
        },
        optimizer=optimizer,
    )

    # make results/ directory, if necessary
    if not os.path.exists('results'):
        os.makedirs('results')

    # add checkpointer
    # checkpointer = ModelCheckpoint(filepath='results/'+save_model_path, verbose=0)

    # callbacks
    # Notes:
    # added ReduceLROnPlateau to decrease the learning rate when the model doesn't improve for 2 epochs
    # added EarlyStopping to stop the training when the model clearly overfits
    callbacks = [
        ModelCheckpoint(filepath='results/' + save_model_path, verbose=0),
        ReduceLROnPlateau(monitor="val_loss",
                          factor=0.9,
                          patience=2,
                          verbose=verbose,
                          min_lr=0.001),
        EarlyStopping(patience=4)
    ]

    # train the model
    hist = model.fit_generator(
        generator=audio_gen.next_train(),
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        validation_data=audio_gen.next_valid(),
        validation_steps=validation_steps,
        callbacks=callbacks,
        verbose=verbose)  # originally `callbacks=[checkpointer]

    # save model loss
    with open('results/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
Example No. 33
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
	Params:
		index (int): The example you would like to visualize
		partition (str): One of 'train' or 'validation'
		input_to_softmax (Model): The acoustic model
		model_path (str): Path to saved acoustic model's weights
	"""
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n' +
              ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
        print("\033[1;32m" + split_pred[0] + " ", end='')
        for i in range(1, len(split_true) - 1):
            if split_true[i - 1] == split_pred[i] or split_true[
                    i] == split_pred[i] or split_true[i + 1] == split_pred[i]:
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        print(split_pred[len(split_true) - 1] + " ", end='')
    split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
    split_true = transcr.split(" ")
    displayAccuracy(split_true, split_pred, phn)
Example No. 34
        print('\nLR: {:.6f}\n'.format(lr))

lr_scheduler = LearningRateScheduler(scheduler)
lr_tracker = SGDLearningRateTracker()



if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', type=str, default='0', help='which fold')
    FLAGS, _ = parser.parse_known_args()
    print('conduct train and test in fold {0}'.format(FLAGS.fold))
    train_generator = AudioGenerator(
        root_dir= '../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=TRAIN_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='train',
        augmentation_prob=50,
    )
    train_generator.steps_per_epoch = train_generator.steps_per_epoch * 2
    valid_generator = AudioGenerator(
        root_dir= '../data/input/train/audio/',
        k=FLAGS.fold,
        file_temp=VALID_SPLIT_FILE_TEMP,
        ori_batch_size=batch_size,
        train_or_valid='valid',
    )
    preds = np.zeros((len(fname_test), n_classes))
    for run in range(RUNS_IN_FOLD):
        print('fold {0} runs {1}'.format(FLAGS.fold, run))
        model = get_model()