Example #1
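All four snippets assume the same surrounding imports; a plausible header (the project-local module names `data_generator` and `utils` are assumptions) is:

import os
import pickle
import numpy as np
from keras import backend as K
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
from IPython.display import Audio, display
from data_generator import AudioGenerator    # project-local (assumed module name)
from utils import int_sequence_to_text       # project-local (assumed module name)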
def predict_test(input_to_softmax, model_path, audio_range=100000):
    '''
    Predict transcriptions for the testing set and save them to disk.
    `audio_range` caps the number of examples to decode; the default of
    100000 is clipped to the size of the test set.
    '''
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    transcr = data_gen.test_texts
    audio_path = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)
    predictions = []
    for i in range(min(audio_range, len(audio_path))):
        data_point = data_gen.normalize(data_gen.featurize(audio_path[i]))

        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        # ctc_decode returns 0-indexed labels; shift by 1 to line up with
        # the alphabet expected by int_sequence_to_text
        pred_ints = (K.eval(K.ctc_decode(
                prediction, output_length)[0][0])+1).flatten().tolist()
        pred = ''.join(int_sequence_to_text(pred_ints))
        predictions.append(pred)

    # save the predicted and true transcriptions
    with open("predictions/predictions.txt", "w") as output:
        output.write(''.join(predictions))
    with open("predictions/truescr.txt", "w") as output:
        # only the first ten reference transcriptions are kept
        output.write(''.join(transcr[:10]))
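A minimal usage sketch, assuming a hypothetical `final_model` builder and weights file (neither is part of the snippet above), and noting that the predictions/ directory must exist before the files are written:

os.makedirs('predictions', exist_ok=True)                      # output directory
model = final_model(input_dim=13, units=200, output_dim=29)    # hypothetical builder
predict_test(model, 'models/model_end.h5', audio_range=50)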
Example #2
def compare_predictions(index,
                        partition,
                        inputs_to_softmax=(),
                        model_paths=(),
                        phn=False):
    """ Print the decoded predictions of several models for one example
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        inputs_to_softmax (list of Model): The acoustic models to compare
        model_paths (list of str): Paths to the saved acoustic models' weights
        phn (bool): Use phoneme transcriptions instead of word transcriptions
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                    1).flatten().tolist()
        pred_ints.append(pred_int)

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    display(Audio(audio_path))  # Audio(...) alone is discarded inside a function
    print('True transcription:\n\n' + transcr)
    print('-' * 80)
    for i, pred_in in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i,
              ':\n\n' + ''.join(int_sequence_to_text(pred_in, phn)))
        print('-' * 80)
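A minimal usage sketch comparing two acoustic models; the builders `rnn_model` and `cnn_rnn_model` and the weight files are assumptions, not part of the snippet above:

model_a = rnn_model(input_dim=13, units=200, output_dim=29)
model_b = cnn_rnn_model(input_dim=13, filters=200, kernel_size=11,
                        conv_stride=2, conv_border_mode='valid',
                        units=200, output_dim=29)
compare_predictions(index=0,
                    partition='test',
                    inputs_to_softmax=[model_a, model_b],
                    model_paths=['models/rnn.h5', 'models/cnn_rnn.h5'])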
Example #3
def train_model(input_to_softmax,
                phn,
                pickle_path,
                save_model_path,
                train_json='JSON\\train_corpus',
                valid_json='JSON\\test_corpus',
                minibatch_size=10,
                mfcc_dim=13,
                optimizer=SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
                epochs=20,
                verbose=1,
                sort_by_duration=False,
                max_duration=20.0):
    """ Train the acoustic model with CTC loss; save the checkpointed
    weights and the loss history under models/.
    """
    # create a class instance for obtaining batches of data
    audio_gen = AudioGenerator(minibatch_size=minibatch_size, mfcc_dim=mfcc_dim,
                               max_duration=max_duration,
                               sort_by_duration=sort_by_duration)
    # add the training and validation data to the generator
    audio_gen.load_train_data(train_json)
    audio_gen.load_test_data(valid_json)

    # calculate steps_per_epoch and validation_steps
    if phn:
        num_train_examples = len(audio_gen.train_phn_audio_paths)
        num_valid_samples = len(audio_gen.test_phn_audio_paths)
    else:
        num_train_examples = len(audio_gen.train_wrd_audio_paths)
        num_valid_samples = len(audio_gen.test_wrd_audio_paths)
    steps_per_epoch = num_train_examples // minibatch_size
    validation_steps = num_valid_samples // minibatch_size

    # add CTC loss to the NN specified in input_to_softmax
    # (see the add_ctc_loss sketch after this function)
    model = add_ctc_loss(input_to_softmax)

    # the CTC loss is already computed inside the network, so compile
    # with a dummy lambda that just passes it through
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

    # make the models/ directory, if necessary
    if not os.path.exists('models'):
        os.makedirs('models')

    # add checkpointer
    checkpointer = ModelCheckpoint(filepath='models/' + save_model_path, verbose=0)

    # train the model
    generator = audio_gen.next_train(phn)
    validation_data = audio_gen.next_test(phn)
    hist = model.fit_generator(generator=generator, steps_per_epoch=steps_per_epoch,
                               epochs=epochs, validation_data=validation_data,
                               validation_steps=validation_steps,
                               callbacks=[checkpointer], verbose=verbose)

    # save the model loss history
    with open('models/' + pickle_path, 'wb') as f:
        pickle.dump(hist.history, f)
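`add_ctc_loss` is defined elsewhere in the project. As a sketch of the usual pattern (assuming the classic Keras functional API; none of these layer names come from the snippet above), it wraps `K.ctc_batch_cost` in a Lambda layer whose name 'ctc' matches the dummy loss dict used in `train_model`:

from keras.layers import Input, Lambda
from keras.models import Model

def ctc_lambda_func(args):
    # unpack the tensors and compute the batched CTC loss
    y_pred, output_lengths, labels, label_lengths = args
    return K.ctc_batch_cost(labels, y_pred, output_lengths, label_lengths)

def add_ctc_loss(input_to_softmax):
    the_labels = Input(name='the_labels', shape=(None,), dtype='float32')
    input_lengths = Input(name='input_length', shape=(1,), dtype='int64')
    label_lengths = Input(name='label_length', shape=(1,), dtype='int64')
    # map input lengths to output lengths (e.g. after strided convolutions)
    output_lengths = Lambda(input_to_softmax.output_length)(input_lengths)
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
        [input_to_softmax.output, output_lengths, the_labels, label_lengths])
    return Model(inputs=[input_to_softmax.input, the_labels,
                         input_lengths, label_lengths],
                 outputs=loss_out)

With that in place, a hypothetical training call looks like:

train_model(input_to_softmax=model, phn=False,
            pickle_path='model_loss.pickle',
            save_model_path='model.h5')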
Example #4
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'test'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to the saved acoustic model's weights
        phn (bool): Use phoneme transcriptions instead of word transcriptions
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
    split_true = transcr.split(" ")
    print('-' * 80)
    display(Audio(audio_path))  # Audio(...) alone is discarded inside a function
    print('True transcription:\n\n' + transcr)
    print('-' * 80)
    if not phn:
        print('Predicted transcription:\n\n' +
              ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('Predicted transcription:\n')
        # colour each predicted phoneme green when it matches the true
        # phoneme at, or adjacent to, the same position, red otherwise;
        # slicing with max() guards both ends against IndexError
        for i, word in enumerate(split_pred):
            neighbors = split_true[max(i - 1, 0):i + 2]
            color = "\033[1;32m" if word in neighbors else "\033[1;31m"
            print(color + word + " ", end='')
        print("\033[0m")  # reset the terminal colour
    displayAccuracy(split_true, split_pred, phn)
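A minimal usage sketch, assuming a hypothetical `final_model` builder and weights path (`displayAccuracy` is a project-local helper expected alongside these snippets):

model = final_model(input_dim=13, units=200, output_dim=29)    # hypothetical builder
get_predictions(index=0, partition='test', input_to_softmax=model,
                model_path='models/model_end.h5', phn=True)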