Exemplo n.º 1
0
def index():
    """Render the inference page, filling in results after a valid submission.

    When the form validates, the view fetches the ground-truth and predicted
    transcriptions for the selected partition/instance, renders the raw audio
    and spectrogram plots as base64-encoded PNG text, and computes the cosine
    similarity and word error rate between the two transcriptions. Otherwise
    every result is handed to the template as ``None``.

    Returns:
        The rendered ``index.html`` template.
    """
    form = InferenceForm()

    truth_transcription = None
    prediction_transcription = None
    raw_plot = None
    raw_shape = None
    spectrogram_plot = None
    spectrogram_shape = None
    error_rate = None
    similarity = None

    def figure_to_png_base64(fig):
        # Serialise the figure to an in-memory PNG and return it as base64 text.
        png_buffer = BytesIO()
        fig.savefig(png_buffer, format='png')
        return base64.b64encode(png_buffer.getvalue()).decode('utf8')

    def plot_raw_audio(vis_raw_audio):
        # Plot the raw audio signal
        fig = plt.figure(figsize=(7, 3))
        try:
            ax = fig.add_subplot(111)
            steps = len(vis_raw_audio)
            ax.plot(np.linspace(1, steps, steps), vis_raw_audio)
            ax.set_title('Raw Audio Signal')
            ax.set_xlabel('Time')
            ax.set_ylabel('Amplitude')
            return figure_to_png_base64(fig)
        finally:
            # pyplot keeps a reference to every figure until it is closed, so
            # without this each request would leave a figure behind.
            plt.close(fig)

    def plot_spectrogram_feature(vis_spectrogram_feature):
        # Plot a normalized spectrogram
        fig = plt.figure(figsize=(7, 3))
        try:
            ax = fig.add_subplot(111)
            im = ax.imshow(vis_spectrogram_feature, cmap=plt.cm.jet, aspect='auto')
            ax.set_title('Spectrogram')
            ax.set_ylabel('Time')
            ax.set_xlabel('Frequency')
            divider = make_axes_locatable(ax)
            cax = divider.append_axes("right", size="5%", pad=0.05)
            fig.colorbar(im, cax=cax)
            return figure_to_png_base64(fig)
        finally:
            # See plot_raw_audio: release the figure even if plotting fails.
            plt.close(fig)

    def wer_calc(ref, pred):
        # Calculate the word error rate (in percent) as the edit distance
        # between the two token sequences divided by the reference length.
        # Strings are split into words first; comparing them directly would
        # measure character errors rather than word errors.
        if isinstance(ref, str):
            ref = ref.split()
        if isinstance(pred, str):
            pred = pred.split()
        rows, cols = len(ref) + 1, len(pred) + 1
        # A wide integer type avoids wrap-around on long sequences.
        d = np.zeros((rows, cols), dtype=np.int64)
        d[0, :] = np.arange(cols)
        d[:, 0] = np.arange(rows)
        for i in range(1, rows):
            for j in range(1, cols):
                if ref[i - 1] == pred[j - 1]:
                    d[i, j] = d[i - 1, j - 1]
                else:
                    # Cheapest of substitution, insertion and deletion.
                    d[i, j] = 1 + min(d[i - 1, j - 1], d[i, j - 1], d[i - 1, j])
        # An empty reference is scored against a length of one so the result
        # stays finite instead of raising ZeroDivisionError.
        return float(d[-1, -1]) / max(len(ref), 1) * 100

    if form.validate_on_submit():
        partition = form.partition.data
        instance_number = form.instance_number.data

        truth_transcription = make_predictions.get_ground_truth(index=instance_number, partition=partition, input_to_softmax=make_predictions.final_keras, model_path='./results/final_keras.h5')
        prediction_transcription = make_predictions.get_prediction(index=instance_number, partition=partition, input_to_softmax=make_predictions.final_keras, model_path='./results/final_keras.h5')

        # Only the waveform and the spectrogram are used by this view.
        _vis_text, vis_raw_audio, vis_spectrogram_feature, _vis_audio_path = make_predictions.vis_audio_features(index=instance_number, partition=partition)

        raw_plot = plot_raw_audio(vis_raw_audio)
        raw_shape = 'The shape of the waveform of the chosen audio file: ' + str(vis_raw_audio.shape)

        spectrogram_plot = plot_spectrogram_feature(vis_spectrogram_feature)
        spectrogram_shape = 'The shape of the spectrogram of the chosen audio file: ' + str(vis_spectrogram_feature.shape)

        # The vocabulary is fitted on the ground truth only, so words that
        # appear solely in the prediction do not contribute to the similarity.
        cv = CountVectorizer()
        ground_truth_vec = cv.fit_transform([truth_transcription])
        pred_transcription_vec = cv.transform([prediction_transcription])
        similarity = cosine_similarity(ground_truth_vec, pred_transcription_vec)

        error_rate = wer_calc(truth_transcription, prediction_transcription)

    return render_template('index.html', title='Hey, Jetson!', form=form, truth_transcription=truth_transcription, prediction_transcription=prediction_transcription, raw_plot=raw_plot, raw_shape=raw_shape,
    spectrogram_plot=spectrogram_plot, spectrogram_shape=spectrogram_shape, error_rate=error_rate, similarity=similarity)
Exemplo n.º 2
0
def sentiment():
    """Render the sentiment page, filling in results after a valid submission.

    When the form validates, the view:

    1. fetches the ground-truth and predicted transcriptions from the selected
       model (``model_10`` if chosen, otherwise ``model_8``);
    2. uploads the audio file to the Microsoft Speech endpoint and reads the
       top ``NBest`` entry of its JSON reply;
    3. sends the three transcriptions to ``sentiment_api_url`` and collects
       one score per document.

    Otherwise every result is handed to the template as ``None``.

    Returns:
        The rendered ``sentiment.html`` template.
    """
    # Initializing form for user input
    sentiment_form = SentimentForm()
    # Initializing variables passed to HTML files
    truth_transcription = None
    prediction_transcription = None
    cortana_transcription = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    truth_score = None
    prediction_score = None
    cortana_score = None
    play_audio = None

    # Form for sentiment engine
    if sentiment_form.validate_on_submit():
        s_model_number = sentiment_form.sent_model_number.data
        s_partition = sentiment_form.sent_partition.data
        s_instance_number = sentiment_form.sent_instance_number.data
        # Pick the network and its weights once; both lookups below share them
        if s_model_number == 'model_10':
            input_to_softmax = make_predictions.model_10
            model_path = './results/model_10.h5'
        else:
            input_to_softmax = make_predictions.model_8
            model_path = './results/model_8.h5'
        # Get ground truth and predicted transcriptions
        truth_transcription = make_predictions.get_ground_truth(
            index=s_instance_number,
            partition=s_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        prediction_transcription = make_predictions.get_prediction(
            index=s_instance_number,
            partition=s_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        # Connecting to Microsoft Speech API for Cortana's predicted transcription
        filepath = make_predictions.azure_inference(index=s_instance_number,
                                                    partition=s_partition)
        # The context manager closes the audio file once the upload finishes,
        # including when the request raises.
        with open(filepath, 'rb') as audiofile:
            response = requests.post(
                'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
                headers=headers,
                params=params,
                data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        val = json.loads(response.text)
        # NOTE(review): presumably "NBest" is only present when recognition
        # succeeded; a failed recognition would raise here - confirm.
        best = val["NBest"][0]
        confidence = best["Confidence"]
        lexical = best["Lexical"]
        itn = best["ITN"]
        maskeditn = best["MaskedITN"]
        display = best["Display"]
        # Connecting to Microsoft Text Analytics API for sentiment analysis
        truth_id = 'Ground Truth Transcription'
        prediction_id = 'Predicted Transcription'
        cortana_id = 'Cortana Transcription'
        text_documents = {
            'documents': [{
                'id': truth_id,
                'language': 'en',
                'text': truth_transcription
            }, {
                'id': prediction_id,
                'language': 'en',
                'text': prediction_transcription
            }, {
                'id': cortana_id,
                'language': 'en',
                'text': lexical
            }]
        }
        sentiment_response = requests.post(sentiment_api_url,
                                           headers=text_headers,
                                           json=text_documents)
        sentiments = sentiment_response.json()
        # Match scores to transcriptions by document id rather than by list
        # position: if the service rejects one document the remaining entries
        # would shift and be attributed to the wrong transcription. A missing
        # document leaves its score as None.
        scores = {doc["id"]: doc["score"] for doc in sentiments["documents"]}
        truth_score = scores.get(truth_id)
        prediction_score = scores.get(prediction_id)
        cortana_score = scores.get(cortana_id)
        # Serve the audio file for the audio player: drop the hard-coded
        # application root so the path handed to the template is relative.
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")

    # Render the html page
    return render_template('sentiment.html',
                           sentiment_form=sentiment_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           cortana_transcription=cortana_transcription,
                           confidence=confidence,
                           lexical=lexical,
                           itn=itn,
                           maskeditn=maskeditn,
                           display=display,
                           truth_score=truth_score,
                           prediction_score=prediction_score,
                           cortana_score=cortana_score,
                           play_audio=play_audio)
Exemplo n.º 3
0
def visualization():
    """Render the visualization page, filling in results after a valid submission.

    When the form validates, the view fetches the ground-truth and predicted
    transcriptions from the selected model (``model_10`` if chosen, otherwise
    ``model_8``), builds the waveform, spectrogram, normalized log-spectrogram
    and 3D surface plots, and uploads the audio file to the Microsoft Speech
    endpoint to obtain its transcription. Otherwise every result is handed to
    the template as ``None``.

    Returns:
        The rendered ``visualization.html`` template.
    """
    # Initializing form for user input
    visualization_form = VisualizationForm()
    # Initializing variables passed to HTML files
    truth_transcription = None
    prediction_transcription = None
    raw_plot = None
    spectrogram_plot = None
    spectrogram_shape = None
    log_spectrogram_plot = None
    spectrogram_3d = None
    cortana_transcription = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    play_audio = None

    # Form for visualization engine
    if visualization_form.validate_on_submit():
        v_model_number = visualization_form.viz_model_number.data
        v_partition = visualization_form.viz_partition.data
        v_instance_number = visualization_form.viz_instance_number.data
        # Pick the network and its weights once; both lookups below share them
        if v_model_number == 'model_10':
            input_to_softmax = make_predictions.model_10
            model_path = './results/model_10.h5'
        else:
            input_to_softmax = make_predictions.model_8
            model_path = './results/model_8.h5'
        # Get ground truth and predicted transcriptions
        truth_transcription = make_predictions.get_ground_truth(
            index=v_instance_number,
            partition=v_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        prediction_transcription = make_predictions.get_prediction(
            index=v_instance_number,
            partition=v_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        # Get features for visualizations (the text and audio path are unused here)
        _vis_text, vis_spectrogram_feature, _vis_audio_path, sample_rate, samples = make_predictions.vis_audio_features(
            index=v_instance_number, partition=v_partition)
        # Plot the audio waveform
        raw_plot = make_predictions.plot_raw_audio(sample_rate, samples)
        # Plot the spectrogram of the audio file
        spectrogram_plot = make_predictions.plot_spectrogram_feature(
            vis_spectrogram_feature)
        spectrogram_shape = 'The shape of the spectrogram of the chosen audio file: ' + str(
            vis_spectrogram_feature.shape)
        # 2nd way to plot the spectrogram of the audio file
        freqs, times, log_spectrogram = make_predictions.log_spectrogram_feature(
            samples, sample_rate)
        # Standardize along axis 0 (zero mean, unit standard deviation)
        mean = np.mean(log_spectrogram, axis=0)
        std = np.std(log_spectrogram, axis=0)
        log_spectrogram = (log_spectrogram - mean) / std
        log_spectrogram_plot = make_predictions.plot_log_spectrogram_feature(
            freqs, times, log_spectrogram)

        # 3d surface plot of the transposed, normalized log-spectrogram
        surface = [go.Surface(z=log_spectrogram.T, colorscale='Viridis')]
        layout = go.Layout(title='3D Spectrogram',
                           autosize=True,
                           width=700,
                           height=700,
                           margin=dict(l=50, r=50, b=50, t=50))
        fig = go.Figure(data=surface, layout=layout)
        # include_plotlyjs=False: the generated <div> does not bundle plotly.js
        div_output = plot(fig, output_type='div', include_plotlyjs=False)
        # Markup marks the generated HTML as safe so the template does not escape it
        spectrogram_3d = Markup(div_output)
        # Connecting to Microsoft Speech API for Cortana's predicted transcription
        filepath = make_predictions.azure_inference(index=v_instance_number,
                                                    partition=v_partition)
        # The context manager closes the audio file once the upload finishes,
        # including when the request raises.
        with open(filepath, 'rb') as audiofile:
            response = requests.post(
                'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
                headers=headers,
                params=params,
                data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        val = json.loads(response.text)
        # NOTE(review): presumably "NBest" is only present when recognition
        # succeeded; a failed recognition would raise here - confirm.
        best = val["NBest"][0]
        confidence = best["Confidence"]
        lexical = best["Lexical"]
        itn = best["ITN"]
        maskeditn = best["MaskedITN"]
        display = best["Display"]
        # Serve the audio file for the audio player: drop the hard-coded
        # application root so the path handed to the template is relative.
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")

    # Render the html page.
    return render_template('visualization.html',
                           visualization_form=visualization_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           raw_plot=raw_plot,
                           spectrogram_plot=spectrogram_plot,
                           log_spectrogram_plot=log_spectrogram_plot,
                           spectrogram_shape=spectrogram_shape,
                           spectrogram_3d=spectrogram_3d,
                           cortana_transcription=cortana_transcription,
                           confidence=confidence,
                           lexical=lexical,
                           itn=itn,
                           maskeditn=maskeditn,
                           display=display,
                           play_audio=play_audio)
Exemplo n.º 4
0
def performance():
    """Render the performance page, filling in results after a valid submission.

    When the form validates, the view:

    1. fetches the ground-truth transcription and times the prediction of the
       selected model (``model_10`` if chosen, otherwise ``model_8``);
    2. scores the prediction against the ground truth with count-vector and
       TF-IDF cosine similarity and with ``make_predictions.wer_calc``;
    3. times the lookup and upload of the audio file to the Microsoft Speech
       endpoint and scores the returned ``Lexical`` transcript the same way.

    Otherwise every result is handed to the template as ``None``.

    Returns:
        The rendered ``performance.html`` template.
    """
    # Initializing form for user input
    performance_form = PerformanceForm()
    # Initializing variables passed to HTML files
    truth_transcription = None
    prediction_transcription = None
    word_error_rate = None
    cv_similarity = None
    tfidf_similarity = None
    jetson_time_to_predict = None
    cortana_time_to_predict = None
    cortana_transcription = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    cortana_cv = None
    cortana_tfidf = None
    cortana_wer = None
    play_audio = None

    # Form for performance engine
    if performance_form.validate_on_submit():
        p_model_number = performance_form.perf_model_number.data
        p_partition = performance_form.perf_partition.data
        p_instance_number = performance_form.perf_instance_number.data
        # Pick the network and its weights once; both lookups below share them
        if p_model_number == 'model_10':
            input_to_softmax = make_predictions.model_10
            model_path = './results/model_10.h5'
        else:
            input_to_softmax = make_predictions.model_8
            model_path = './results/model_8.h5'
        # Get ground truth and predicted transcriptions
        truth_transcription = make_predictions.get_ground_truth(
            index=p_instance_number,
            partition=p_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        prediction_transcription = make_predictions.get_prediction(
            index=p_instance_number,
            partition=p_partition,
            input_to_softmax=input_to_softmax,
            model_path=model_path)
        jetson_time_to_predict = time.perf_counter() - start
        # Calculate cosine similarity of individual transcriptions using Count Vectorizer
        # (the vocabulary is fitted on the ground truth only)
        cv = CountVectorizer()
        cv_ground_truth_vec = cv.fit_transform([truth_transcription])
        cv_pred_transcription_vec = cv.transform([prediction_transcription])
        cv_similarity = cosine_similarity(cv_ground_truth_vec,
                                          cv_pred_transcription_vec)
        # Calculate cosine similarity of individual transcriptions using Tfidf Vectorizer
        tfidf = TfidfVectorizer()
        tfidf_ground_truth_vec = tfidf.fit_transform([truth_transcription])
        tfidf_pred_transcription_vec = tfidf.transform(
            [prediction_transcription])
        tfidf_similarity = cosine_similarity(tfidf_ground_truth_vec,
                                             tfidf_pred_transcription_vec)
        # Calculate word error rate of individual transcription
        word_error_rate = make_predictions.wer_calc(truth_transcription,
                                                    prediction_transcription)
        # Connecting to Microsoft Speech API for Cortana's predicted transcription.
        # The timed span covers locating the audio file and the full upload.
        c_start = time.perf_counter()
        filepath = make_predictions.azure_inference(index=p_instance_number,
                                                    partition=p_partition)
        # The context manager closes the audio file once the upload finishes,
        # including when the request raises.
        with open(filepath, 'rb') as audiofile:
            response = requests.post(
                'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
                headers=headers,
                params=params,
                data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        cortana_time_to_predict = time.perf_counter() - c_start
        val = json.loads(response.text)
        # NOTE(review): presumably "NBest" is only present when recognition
        # succeeded; a failed recognition would raise here - confirm.
        best = val["NBest"][0]
        confidence = best["Confidence"]
        lexical = best["Lexical"]
        itn = best["ITN"]
        maskeditn = best["MaskedITN"]
        display = best["Display"]
        # Calculate performance measures on AZURE transcript, reusing the
        # vectorizers fitted on the ground truth above
        cv_cortana_vec = cv.transform([lexical])
        cortana_cv = cosine_similarity(cv_ground_truth_vec, cv_cortana_vec)
        tfidf_cortana_vec = tfidf.transform([lexical])
        cortana_tfidf = cosine_similarity(tfidf_ground_truth_vec,
                                          tfidf_cortana_vec)
        cortana_wer = make_predictions.wer_calc(truth_transcription, lexical)
        # Serve the audio file for the audio player: drop the hard-coded
        # application root so the path handed to the template is relative.
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")

    # Render the html page
    return render_template('performance.html',
                           performance_form=performance_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           word_error_rate=word_error_rate,
                           cv_similarity=cv_similarity,
                           tfidf_similarity=tfidf_similarity,
                           jetson_time_to_predict=jetson_time_to_predict,
                           cortana_transcription=cortana_transcription,
                           cortana_time_to_predict=cortana_time_to_predict,
                           confidence=confidence,
                           lexical=lexical,
                           itn=itn,
                           maskeditn=maskeditn,
                           display=display,
                           cortana_cv=cortana_cv,
                           cortana_tfidf=cortana_tfidf,
                           cortana_wer=cortana_wer,
                           play_audio=play_audio)
Exemplo n.º 5
0
def index():
    """Render the inference page, filling in results after a valid submission.

    When the form validates, the view fetches the ground-truth and ``model_8``
    predicted transcriptions, builds the waveform, spectrogram and normalized
    log-spectrogram plots, and scores the prediction with count-vector and
    TF-IDF cosine similarity plus ``wer_calc``. Otherwise every result is
    handed to the template as ``None``.

    Returns:
        The rendered ``index.html`` template.
    """
    form = InferenceForm()
    # Template context; each entry stays None until a valid submission
    context = {
        'truth_transcription': None,
        'prediction_transcription': None,
        'raw_plot': None,
        'spectrogram_plot': None,
        'log_spectrogram_plot': None,
        'spectrogram_shape': None,
        'error_rate': None,
        'cv_similarity': None,
        'tfidf_similarity': None,
    }

    if form.validate_on_submit():
        # Arguments shared by the ground-truth and prediction lookups
        lookup_kwargs = dict(index=form.instance_number.data,
                             partition=form.partition.data)
        model_kwargs = dict(lookup_kwargs,
                            input_to_softmax=make_predictions.model_8,
                            model_path='./results/model_8.h5')
        reference_text = make_predictions.get_ground_truth(**model_kwargs)
        hypothesis_text = make_predictions.get_prediction(**model_kwargs)
        context['truth_transcription'] = reference_text
        context['prediction_transcription'] = hypothesis_text

        # Features for the plots (the text and audio path are not needed here)
        _, spectrogram, _, rate, waveform = make_predictions.vis_audio_features(
            **lookup_kwargs)
        context['raw_plot'] = plot_raw_audio(rate, waveform)
        context['spectrogram_plot'] = plot_spectrogram_feature(spectrogram)
        context['spectrogram_shape'] = (
            'The shape of the spectrogram of the chosen audio file: '
            f'{spectrogram.shape!s}')

        # Second spectrogram view: log-spectrogram standardized along axis 0
        freqs, times, log_spec = log_spectrogram_feature(waveform, rate)
        normalized = (log_spec - np.mean(log_spec, axis=0)) / np.std(log_spec, axis=0)
        context['log_spectrogram_plot'] = plot_log_spectrogram_feature(
            freqs, times, normalized)

        # Cosine similarity under each vectorizer; the vocabulary is fitted on
        # the ground truth and then applied to the prediction
        for key, vectorizer_cls in (('cv_similarity', CountVectorizer),
                                    ('tfidf_similarity', TfidfVectorizer)):
            vectorizer = vectorizer_cls()
            reference_vec = vectorizer.fit_transform([reference_text])
            hypothesis_vec = vectorizer.transform([hypothesis_text])
            context[key] = cosine_similarity(reference_vec, hypothesis_vec)

        # Word error rate of the prediction against the ground truth
        context['error_rate'] = wer_calc(reference_text, hypothesis_text)

    return render_template('index.html',
                           title='Hey, Jetson!',
                           form=form,
                           **context)