Example #1
def get_predictions(index, partition, input_to_softmax, model_path, spectrogram_features=True):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights

    return the predicted probability matrix (in a 2D matrix) and the ground truth
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=spectrogram_features)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    return (prediction[0], transcr, audio_path)
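This variant returns the raw softmax output instead of printing, so the caller does the decoding. A minimal sketch of that step, assuming a compiled acoustic model (model) and a weights path, both placeholders here, and using the same greedy K.ctc_decode pattern the later examples rely on:

import numpy as np
from keras import backend as K
from utils import int_sequence_to_text

# model and the weights path are placeholders for this sketch
probs, transcr, audio_path = get_predictions(
    0, 'validation', model, 'results/model.h5')
# probs has shape (time_steps, num_characters); ctc_decode expects a batch axis
pred_ints = (K.eval(K.ctc_decode(probs[np.newaxis, :, :],
                                 [probs.shape[0]])[0][0]) + 1).flatten().tolist()
print('True:', transcr)
print('Pred:', ''.join(int_sequence_to_text(pred_ints)))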
Example #2
def get_predictions(index, partition, input_to_softmax, model_path):
    '''
    Get the model's decoded predictions to calculate metrics
    '''
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'valid':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "valid"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    label = transcr
    predicted = ''.join(int_sequence_to_text(pred_ints))

    return label, predicted
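A hedged usage sketch: the returned (label, predicted) pair feeds directly into a metric such as word error rate. The model object, the weights path, and the wer helper (imported in Example #15) are assumptions here:

# model and the weights path are placeholders for this sketch
label, predicted = get_predictions(0, 'valid', model, 'results/model.h5')
print('WER: {:.1f}%'.format(wer(label, predicted) * 100))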
Example #3
def compare_predictions(index,
                        partition,
                        inputs_to_softmax=[],
                        model_paths=[],
                        phn=False):
    """ Print a model's decoded predictions
	Params:
		index (int): The example you would like to visualize
		partition (str): One of 'train' or 'validation'
		input_to_softmax (Model): The acoustic model
		model_path (str): Path to saved acoustic model's weights
	"""
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    pred_ints = []
    for model_path, input_to_softmax in zip(model_paths, inputs_to_softmax):
        input_to_softmax.load_weights(model_path)
        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_int = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                    1).flatten().tolist()
        pred_ints.append(pred_int)

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    for i, pred_in in enumerate(pred_ints, start=1):
        print('Predicted transcription number', i,
              ':\n' + '\n' + ''.join(int_sequence_to_text(pred_in, phn)))
        print('-' * 80)
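Hypothetical usage, comparing two trained models on one test example; the model objects and weights paths are placeholders:

compare_predictions(0, 'test',
                    inputs_to_softmax=[model_a, model_b],
                    model_paths=['results/model_a.h5', 'results/model_b.h5'],
                    phn=False)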
Example #4
def get_predictions(index, partition, trained_model, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        trained_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features from Dataset
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    print("Trained model output length:\n" +
          str(trained_model.output_length(data_point.shape[0])))
    # obtain and decode the acoustic model's predictions
    trained_model.load_weights(model_path)
    prediction = trained_model.predict(np.expand_dims(data_point, axis=0))
    output_length = [trained_model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    transcription = ''.join(int_sequence_to_text(pred_ints))
    # Correction using KenLM language model toolkit
    corrected_transcription = correction(transcription)

    print('-' * 80)
    print(repr(audio_path).replace(r"\\", r"/"))
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Raw prediction:\n' + str(prediction[0]))
    print('CTC Decoded predicted Ints before conversion to text:\n' +
          str(pred_ints))
    print('Predicted transcription:\n' + '\n' + transcription)
    print('Predicted transcription with correction:\n' +
          corrected_transcription)
    print('-' * 80)
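The correction helper is not shown in this snippet; a minimal sketch of the idea, assuming the kenlm Python bindings, a trained language-model file, and a hypothetical candidates function that proposes respellings for a word:

import kenlm  # assumes the kenlm Python bindings are installed

lm = kenlm.Model('lm/librispeech.binary')  # hypothetical LM file

def correction_sketch(sentence, candidates):
    # re-rank each word's candidate respellings by language-model score;
    # the real correction helper may instead rescore whole sentences
    corrected = [max(candidates(word), key=lm.score)
                 for word in sentence.split()]
    return ' '.join(corrected)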
Example #5
def predict_test(input_to_softmax, model_path, audio_range=100000):
    '''
    Predict transcriptions for the test set. audio_range caps how many
    examples are decoded; the default of 100000 effectively means the
    whole set.
    '''
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    transcr = data_gen.test_texts
    audio_path = data_gen.test_audio_paths
    input_to_softmax.load_weights(model_path)
    predictions = []
    try:
        for i in range(min(audio_range, len(audio_path))):
            data_point = data_gen.normalize(data_gen.featurize(audio_path[i]))

            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(
                    prediction, output_length)[0][0]) + 1).flatten().tolist()
            pred = ''.join(int_sequence_to_text(pred_ints))
            predictions.append(pred)
    except IndexError:
        pass  # ran past the end of the data; keep what was decoded so far

    # write out whatever was decoded, with the matching reference texts
    transcr = ''.join(transcr[:len(predictions)])
    predictions = ''.join(predictions)
    with open("predictions/predictions.txt", "w") as output:
        output.write(str(predictions))
    with open("predictions/truescr.txt", "w") as output:
        output.write(str(transcr))
Example #6
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()

    print(len(data_gen.valid_texts))
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    if not transcr:
        return

    with open(r'D:\DIPLOMSKA\results\predictions_cnn_rnn_12.txt',
              'a+',
              encoding='utf8') as fp:
        fp.write('True transcription:\n' + '\n' + transcr + '\n')
        #print(transcr)
        fp.write('-' * 30 + '\n')
        fp.write('Predicted transcription:\n' + '\n' +
                 ''.join(int_sequence_to_text(pred_ints)) + '\n')
        #print(int_sequence_to_text(pred_ints))
        fp.write('-' * 30 + '\n')
Example #7
def get_predictions(index,
                    partition,
                    input_to_softmax,
                    model_path,
                    spectrogram=True,
                    mfcc_dim=13):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=spectrogram, mfcc_dim=mfcc_dim)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights('results/' + model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    print('Predicted transcription:\n' + '\n' +
          ''.join(int_sequence_to_text(pred_ints)))
    print('-' * 80)
Example #8
def get_predictions(data_gen: AudioGenerator,
                    model,
                    partition, index, omit_true=False, print_line=True):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    # data_gen = AudioGenerator()
    # data_gen.load_train_data()
    # data_gen.load_validation_data()

    # obtain the true transcription and the audio features
    if data_gen is None:
        raise ValueError("Data Generator is None!")
    if partition == 'validation':
        transcription = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcription = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    prediction = model.predict(np.expand_dims(data_point, axis=0))
    output_length = [model.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
        prediction, output_length)[0][0]) + 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    # Audio(audio_path)
    input_type = "SPEC" if data_gen.spectrogram else "MFCC"
    if not omit_true:
        print('TRUE:      ' + transcription)
    print('PRED ' + input_type + ': ' + ''.join(int_sequence_to_text(pred_ints)))
    if print_line:
        print('-' * 82)
    return audio_path
Example #9
def get_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions ·
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    #print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(
       prediction, output_length, greedy=True)[0][0]) + 1).flatten().tolist()
    Audio(audio_path)
    print('-' * 80)
    b = "".join(int_sequence_to_text(pred_ints))
    a = transcr
    print("Greedy_predictions:\n" + '\n' + ''.join(int_sequence_to_text(pred_ints)))
    print('1. Word Error Rate for ASR ==', wer(a, b)*100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',(endtime - starttime)/data_gen.valid_durations[index],'\n')
    print('-' * 80)
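The wer helper used above comes from outside this snippet; a minimal self-contained sketch of the usual definition (word-level Levenshtein distance normalized by the reference length):

def wer_sketch(reference, hypothesis):
    ref, hyp = reference.split(), hypothesis.split()
    # d[i][j] = word-level edit distance between ref[:i] and hyp[:j]
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(ref)][len(hyp)] / float(max(len(ref), 1))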
Example #10
def lexcion_predictions(index, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions ·
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    starttime = time.time()
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    # obtain the true transcription and the audio features
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    #print(input_to_softmax.summary())
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    print('-' * 80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-' * 80)
    b = testline(prediction[0])
    a = transcr
    print("TokenPassing_predictions:\n")
    print(b + '\n')
    print('1. Word Error Rate for ASR ==', wer(a, b)*100, '%')
    endtime = time.time()
    print('2. Real Time Factor for ASR ==',(endtime - starttime)/data_gen.valid_durations[index])
Example #11
def get_predictions_rec(input_to_softmax, a_path, model_path):
    """ Return the decoded transcription for an arbitrary audio file
    Params:
        input_to_softmax (Model): The acoustic model
        a_path (str): Path to the audio file to transcribe
        model_path (str): Path to saved acoustic model's weights
    """
    data_gen = AudioGenerator(spectrogram=False)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    audio_path = a_path
    data_point = data_gen.normalize(data_gen.featurize(audio_path))

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    return 'Predicted transcription:\n' + '\n' + ''.join(
        int_sequence_to_text(pred_ints))
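Hypothetical usage, transcribing a clip that is not part of the corpus; the model object and both paths are placeholders:

print(get_predictions_rec(model, 'recordings/my_clip.wav', 'results/model.h5'))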
Example #12
def get_group_predictions(input_to_softmax, model_path, partition):
    """ Print per-example word error rates for a random sample of examples
    (100 from 'validation', 1000 from 'train') and report the averages
    """
    starttime = time.time()
    wer_sum = 0
    data_gen = AudioGenerator(spectrogram=True)
    data_gen.load_train_data()
    data_gen.load_validation_data()
    input_to_softmax.load_weights(model_path)
    # obtain the true transcription and the audio features
    if partition == 'validation':
        num = 99
        while num >= 0:
            index = random.randint(1, 2500)
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
            # obtain and decode the acoustic model's predictions
            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False, beam_width=100, top_paths=1)[0][
                                    0]) + 1).flatten().tolist()
            # print('True transcription:\n' + '\n' + transcr)
            b = "".join(int_sequence_to_text(pred_ints))
            a = transcr
            # print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
            # print('-' * 80)
            # print('1.Editable Distance for ASR ==', edit(a, b), '\n')
            if wer(a, b) <= 1:
                print('index_%d' % index, ':')
                wer_sum = wer_sum + wer(a, b)
                print(wer(a, b))
                print("Transcription: ", a)
                print("Prediction:    ", b)
                print('-' * 80)
                # only count samples with a usable WER toward the average
                num = num - 1
    elif partition == 'train':
        num = 999
        while num >= 0:
            index = random.randint(1, 10000)
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))

            # obtain and decode the acoustic model's predictions

            prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
            output_length = [input_to_softmax.output_length(data_point.shape[0])]
            pred_ints = (K.eval(K.ctc_decode(prediction, output_length, greedy=False, beam_width=100, top_paths=1)[0][
                                    0]) + 1).flatten().tolist()
            # print('True transcription:\n' + '\n' + transcr)
            b = "".join(int_sequence_to_text(pred_ints))
            a = transcr
            # print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
            # print('-' * 80)
            # print('1.Editable Distance for ASR ==', edit(a, b), '\n')

            if wer(a, b) <= 1:
                print('index_%d' % index, ':')
                wer_sum = wer_sum + wer(a, b)
                print(wer(a, b))
                print("Transcription: ", a)
                print("Prediction:    ", b)
                print('-' * 80)
                # only count samples with a usable WER toward the average
                num = num - 1
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')

    endtime = time.time()
    n_samples = 100 if partition == 'validation' else 1000
    print('1. Average Word Error Rate for ASR ==', wer_sum / n_samples)
    print('2. Average Real Time Factor for ASR ==', (endtime - starttime) / n_samples, '\n')
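Hypothetical usage; the model object and weights path are placeholders:

# average WER over 100 random validation examples
get_group_predictions(model, 'results/model.h5', 'validation')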
Example #13
def get_predictions(indexes, partition, input_to_softmax, model_path):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and validation data
    data_gen = AudioGenerator(spectrogram=False, mfcc_dim=13)
    data_gen.load_train_data()
    data_gen.load_validation_data()

    # loading language model
    alphabet = ''.join(index_map.values())
    language_model = LanguageModel('data/word/corpus.txt', alphabet)

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)

    for index in indexes:
        # obtain the true transcription and the audio features
        if partition == 'validation':
            transcr = data_gen.valid_texts[index]
            audio_path = data_gen.valid_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
        elif partition == 'train':
            transcr = data_gen.train_texts[index]
            audio_path = data_gen.train_audio_paths[index]
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
        elif partition == 'test':
            transcr = 'hello how are you'
            audio_path = '../datasets/AgentExpress/hello.wav.wav'
            data_point = data_gen.normalize(data_gen.featurize(audio_path))
            #print(data_point)
        else:
            raise Exception(
                'Invalid partition!  Must be "train", "validation" or "test"')

        prediction = input_to_softmax.predict(
            np.expand_dims(data_point, axis=0))

        output_length = [input_to_softmax.output_length(data_point.shape[0])]

        #print(K.eval(K.ctc_decode(prediction, output_length, False, 30, 1)[0][0]))
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                     1).flatten().tolist()

        # play the audio file, and display the true and predicted transcriptions
        print('-' * 80)
        Audio(audio_path)
        print('ground_truth:' + ' ' * 4 + transcr)
        print('best_path:' + ' ' * 7 +
              ''.join(int_sequence_to_text(pred_ints)))
        pred_beam = ctcBeamSearch(prediction[0], alphabet, None)
        print('beam_search:' + ' ' * 5 + pred_beam)
        pred_beam_lm = ctcBeamSearch(prediction[0], alphabet, language_model)
        print('beam_search_lm:' + ' ' * 2 + pred_beam_lm)
        pred_token = ctcTokenPassing(prediction[0], alphabet,
                                     language_model.getWordList())
        print('token_passing:' + ' ' * 3 + pred_token)
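ctcBeamSearch, ctcTokenPassing and LanguageModel come from an external CTC decoding library, so only a hedged usage sketch is possible here; the model object and paths are placeholders:

# compare best-path, beam-search, LM-rescored and token-passing decodings
get_predictions([0, 1, 2], 'validation', model, 'results/model.h5')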
Example #14
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
	Params:
		index (int): The example you would like to visualize
		partition (str): One of 'train' or 'validation'
		input_to_softmax (Model): The acoustic model
		model_path (str): Path to saved acoustic model's weights
	"""
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        else:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        else:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n' +
              ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
        print("\033[1;32m" + split_pred[0] + " ", end='')
        for i in range(1, len(split_true) - 1):
            if split_true[i - 1] == split_pred[i] or split_true[
                    i] == split_pred[i] or split_true[i + 1] == split_pred[i]:
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        print(split_pred[len(split_true) - 1] + " ", end='')
    split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
    split_true = transcr.split(" ")
    displayAccuracy(split_true, split_pred, phn)
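displayAccuracy is defined elsewhere in that project; a minimal sketch of a position-wise match rate over the same inputs (the real helper may align the sequences more carefully):

def display_accuracy_sketch(split_true, split_pred, phn=False):
    # fraction of aligned positions where the prediction matches the
    # reference; extra or missing tokens simply count as misses
    hits = sum(t == p for t, p in zip(split_true, split_pred))
    unit = 'phonemes' if phn else 'words'
    print('{}/{} {} matched ({:.1f}%)'.format(
        hits, len(split_true), unit,
        100.0 * hits / max(len(split_true), 1)))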
Example #15
from wer import wer
from data_generator import AudioGenerator
from keras import backend as K
from utils import int_sequence_to_text
import numpy as np

from keras.callbacks import ModelCheckpoint, Callback

valid_cache = []
data_gen = AudioGenerator(spectrogram=True)
data_gen.load_train_data()
data_gen.load_validation_data()
for index in range(len(data_gen.valid_texts)):
    transcr = data_gen.valid_texts[index]
    audio_path = data_gen.valid_audio_paths[index]
    data_point = data_gen.normalize(data_gen.featurize(audio_path))
    valid_cache.append(data_point)


def calculate_wer2(input_to_softmax, model_path, words=False):
    # data_gen = AudioGenerator()
    # data_gen.load_train_data()
    # data_gen.load_validation_data()
    wers = []
    input_to_softmax.load_weights(model_path)

    l = len(data_gen.valid_texts)
    l = 100  # cap the evaluation at the first 100 examples
    for index in range(l):
        transcr = data_gen.valid_texts[index]
        # the features were pre-computed into valid_cache above
        data_point = valid_cache[index]
        prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
        output_length = [input_to_softmax.output_length(data_point.shape[0])]
        pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                     1).flatten().tolist()
        wers.append(wer(transcr, ''.join(int_sequence_to_text(pred_ints))))
    return np.mean(wers)