def sentiment():
    # Initializing form for user input
    sentiment_form = SentimentForm()
    # Initializing variables passed to HTML files
    truth_transcription = None
    prediction_transcription = None
    cortana_transcription = None
    recognitionstatus = None
    offset = None
    duration = None
    nbest = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    sentiments = None
    documents = None
    errors = None
    truth_score = None
    truth_id = None
    prediction_score = None
    prediction_id = None
    cortana_score = None
    cortana_id = None
    play_audio = None
    # Form for sentiment engine
    if sentiment_form.validate_on_submit():
        s_model_number = sentiment_form.sent_model_number.data
        s_partition = sentiment_form.sent_partition.data
        s_instance_number = sentiment_form.sent_instance_number.data
        # Get ground truth and predicted transcriptions
        if s_model_number == 'model_10':
            truth_transcription = make_predictions.get_ground_truth(
                index=s_instance_number, partition=s_partition,
                input_to_softmax=make_predictions.model_10,
                model_path='./results/model_10.h5')
            prediction_transcription = make_predictions.get_prediction(
                index=s_instance_number, partition=s_partition,
                input_to_softmax=make_predictions.model_10,
                model_path='./results/model_10.h5')
        else:
            truth_transcription = make_predictions.get_ground_truth(
                index=s_instance_number, partition=s_partition,
                input_to_softmax=make_predictions.model_8,
                model_path='./results/model_8.h5')
            prediction_transcription = make_predictions.get_prediction(
                index=s_instance_number, partition=s_partition,
                input_to_softmax=make_predictions.model_8,
                model_path='./results/model_8.h5')
        # Connecting to Microsoft Speech API for Cortana's predicted transcription
        filepath = make_predictions.azure_inference(index=s_instance_number,
                                                    partition=s_partition)
        audiofile = open(filepath, 'rb')
        response = requests.post(
            'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
            headers=headers, params=params,
            data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        val = json.loads(response.text)
        recognitionstatus = val["RecognitionStatus"]
        offset = val["Offset"]
        duration = val["Duration"]
        nbest = val["NBest"]
        confidence = val["NBest"][0]["Confidence"]
        lexical = val["NBest"][0]["Lexical"]
        itn = val["NBest"][0]["ITN"]
        maskeditn = val["NBest"][0]["MaskedITN"]
        display = val["NBest"][0]["Display"]
        # Connecting to Microsoft Text Analytics API for sentiment analysis
        text_documents = {
            'documents': [{
                'id': 'Ground Truth Transcription',
                'language': 'en',
                'text': truth_transcription
            }, {
                'id': 'Predicted Transcription',
                'language': 'en',
                'text': prediction_transcription
            }, {
                'id': 'Cortana Transcription',
                'language': 'en',
                'text': lexical
            }]
        }
        sentiment_response = requests.post(sentiment_api_url,
                                           headers=text_headers,
                                           json=text_documents)
        sentiments = sentiment_response.json()
        documents = sentiments["documents"]
        errors = sentiments["errors"]
        truth_score = sentiments["documents"][0]["score"]
        truth_id = sentiments["documents"][0]["id"]
        prediction_score = sentiments["documents"][1]["score"]
        prediction_id = sentiments["documents"][1]["id"]
        cortana_score = sentiments["documents"][2]["score"]
        cortana_id = sentiments["documents"][2]["id"]
        # Serve the audio file for the audio player
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")
    # Render the html page
    return render_template('sentiment.html',
                           sentiment_form=sentiment_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           cortana_transcription=cortana_transcription,
                           confidence=confidence, lexical=lexical, itn=itn,
                           maskeditn=maskeditn, display=display,
                           truth_score=truth_score,
                           prediction_score=prediction_score,
                           cortana_score=cortana_score,
                           play_audio=play_audio)
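# The request configuration used by the routes in this file (`headers`,
# `params`, `text_headers`, `sentiment_api_url`) is defined elsewhere and is
# not shown in this excerpt. As a rough, illustrative sketch only -- the keys,
# region, and API version below are placeholders/assumptions, not the
# project's actual values -- the shape this code relies on looks roughly like:
example_speech_key = 'YOUR-SPEECH-SUBSCRIPTION-KEY'   # hypothetical placeholder
example_text_key = 'YOUR-TEXT-ANALYTICS-KEY'          # hypothetical placeholder
example_headers = {
    'Ocp-Apim-Subscription-Key': example_speech_key,
    'Content-Type': 'audio/wav; codecs=audio/pcm; samplerate=16000',
    'Accept': 'application/json'
}
# 'detailed' output is what exposes the NBest list parsed in the routes above
example_params = {'language': 'en-US', 'format': 'detailed'}
example_text_headers = {
    'Ocp-Apim-Subscription-Key': example_text_key,
    'Content-Type': 'application/json'
}
# A v2-style Text Analytics endpoint, consistent with the "score" field read above
example_sentiment_api_url = ('https://westus.api.cognitive.microsoft.com/'
                             'text/analytics/v2.0/sentiment')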
def visualization():
    # Initializing form for user input
    visualization_form = VisualizationForm()
    # Initializing variables passed to HTML files
    truth_transcription = None
    prediction_transcription = None
    raw_plot = None
    spectrogram_plot = None
    spectrogram_shape = None
    log_spectrogram_plot = None
    spectrogram_3d = None
    cortana_transcription = None
    recognitionstatus = None
    offset = None
    duration = None
    nbest = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    play_audio = None
    # Form for visualization engine
    if visualization_form.validate_on_submit():
        v_model_number = visualization_form.viz_model_number.data
        v_partition = visualization_form.viz_partition.data
        v_instance_number = visualization_form.viz_instance_number.data
        # Get ground truth and predicted transcriptions
        if v_model_number == 'model_10':
            truth_transcription = make_predictions.get_ground_truth(
                index=v_instance_number, partition=v_partition,
                input_to_softmax=make_predictions.model_10,
                model_path='./results/model_10.h5')
            prediction_transcription = make_predictions.get_prediction(
                index=v_instance_number, partition=v_partition,
                input_to_softmax=make_predictions.model_10,
                model_path='./results/model_10.h5')
        else:
            truth_transcription = make_predictions.get_ground_truth(
                index=v_instance_number, partition=v_partition,
                input_to_softmax=make_predictions.model_8,
                model_path='./results/model_8.h5')
            prediction_transcription = make_predictions.get_prediction(
                index=v_instance_number, partition=v_partition,
                input_to_softmax=make_predictions.model_8,
                model_path='./results/model_8.h5')
        # Get features for visualizations
        vis_text, vis_spectrogram_feature, vis_audio_path, sample_rate, samples = make_predictions.vis_audio_features(
            index=v_instance_number, partition=v_partition)
        # Plot the audio waveform
        raw_plot = make_predictions.plot_raw_audio(sample_rate, samples)
        # Plot the spectrogram of the audio file
        spectrogram_plot = make_predictions.plot_spectrogram_feature(
            vis_spectrogram_feature)
        spectrogram_shape = 'The shape of the spectrogram of the chosen audio file: ' + str(
            vis_spectrogram_feature.shape)
        # Second spectrogram plot: a normalized log spectrogram of the audio file
        freqs, times, log_spectrogram = make_predictions.log_spectrogram_feature(
            samples, sample_rate)
        mean = np.mean(log_spectrogram, axis=0)
        std = np.std(log_spectrogram, axis=0)
        log_spectrogram = (log_spectrogram - mean) / std
        log_spectrogram_plot = make_predictions.plot_log_spectrogram_feature(
            freqs, times, log_spectrogram)

        # 3D plot of the spectrogram of the chosen audio file,
        # plotting amplitude over frequency over time.
        def plot_3d_spectrogram(log_spectrogram):
            data = [go.Surface(z=log_spectrogram.T, colorscale='Viridis')]
            layout = go.Layout(title='3D Spectrogram', autosize=True,
                               width=700, height=700,
                               margin=dict(l=50, r=50, b=50, t=50))
            fig = go.Figure(data=data, layout=layout)
            div_output = plot(fig, output_type='div', include_plotlyjs=False)
            return div_output

        # 3D spectrogram plot
        spectrogram_3d = plot_3d_spectrogram(log_spectrogram)
        spectrogram_3d = Markup(spectrogram_3d)
        # Connecting to Microsoft Speech API for Cortana's predicted transcription
        filepath = make_predictions.azure_inference(index=v_instance_number,
                                                    partition=v_partition)
        audiofile = open(filepath, 'rb')
        response = requests.post(
            'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
            headers=headers, params=params,
            data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        val = json.loads(response.text)
        recognitionstatus = val["RecognitionStatus"]
        offset = val["Offset"]
        duration = val["Duration"]
        nbest = val["NBest"]
        confidence = val["NBest"][0]["Confidence"]
        lexical = val["NBest"][0]["Lexical"]
        itn = val["NBest"][0]["ITN"]
        maskeditn = val["NBest"][0]["MaskedITN"]
        display = val["NBest"][0]["Display"]
        # Serve the audio file for the audio player
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")
    # Render the html page
    return render_template('visualization.html',
                           visualization_form=visualization_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           raw_plot=raw_plot,
                           spectrogram_plot=spectrogram_plot,
                           log_spectrogram_plot=log_spectrogram_plot,
                           spectrogram_shape=spectrogram_shape,
                           spectrogram_3d=spectrogram_3d,
                           cortana_transcription=cortana_transcription,
                           confidence=confidence, lexical=lexical, itn=itn,
                           maskeditn=maskeditn, display=display,
                           play_audio=play_audio)
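# make_predictions.read_in_chunks() is used above to stream the WAV file as
# the body of the Speech API POST. Its real definition lives in
# make_predictions.py; a minimal sketch of the chunked-reader pattern this
# call appears to rely on (the 1024-byte chunk size is an assumption, not the
# project's actual value):
def example_read_in_chunks(file_object, chunk_size=1024):
    """Lazily yield successive chunks from an open file object."""
    while True:
        chunk = file_object.read(chunk_size)
        if not chunk:
            break
        yield chunk
# requests.post() accepts such a generator as `data=`, sending the upload with
# chunked transfer encoding instead of loading the whole file into memory.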
def performance():
    # Initializing form for user input
    performance_form = PerformanceForm()
    # Initializing variables passed to HTML files
    truth_transcription = None
    prediction_transcription = None
    word_error_rate = None
    cv_similarity = None
    tfidf_similarity = None
    jetson_time_to_predict = None
    cortana_time_to_predict = None
    cortana_transcription = None
    recognitionstatus = None
    offset = None
    duration = None
    nbest = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    cortana_cv = None
    cortana_tfidf = None
    cortana_wer = None
    play_audio = None
    # Form for performance engine
    if performance_form.validate_on_submit():
        p_model_number = performance_form.perf_model_number.data
        p_partition = performance_form.perf_partition.data
        p_instance_number = performance_form.perf_instance_number.data
        # Get ground truth and predicted transcriptions, timing the local inference
        if p_model_number == 'model_10':
            truth_transcription = make_predictions.get_ground_truth(
                index=p_instance_number, partition=p_partition,
                input_to_softmax=make_predictions.model_10,
                model_path='./results/model_10.h5')
            start = time.time()
            prediction_transcription = make_predictions.get_prediction(
                index=p_instance_number, partition=p_partition,
                input_to_softmax=make_predictions.model_10,
                model_path='./results/model_10.h5')
            end = time.time()
            jetson_time_to_predict = end - start
        else:
            truth_transcription = make_predictions.get_ground_truth(
                index=p_instance_number, partition=p_partition,
                input_to_softmax=make_predictions.model_8,
                model_path='./results/model_8.h5')
            start = time.time()
            prediction_transcription = make_predictions.get_prediction(
                index=p_instance_number, partition=p_partition,
                input_to_softmax=make_predictions.model_8,
                model_path='./results/model_8.h5')
            end = time.time()
            jetson_time_to_predict = end - start
        # Calculate cosine similarity of individual transcriptions using CountVectorizer
        cv = CountVectorizer()
        cv_ground_truth_vec = cv.fit_transform([truth_transcription])
        cv_pred_transcription_vec = cv.transform([prediction_transcription])
        cv_similarity = cosine_similarity(cv_ground_truth_vec,
                                          cv_pred_transcription_vec)
        # Calculate cosine similarity of individual transcriptions using TfidfVectorizer
        tfidf = TfidfVectorizer()
        tfidf_ground_truth_vec = tfidf.fit_transform([truth_transcription])
        tfidf_pred_transcription_vec = tfidf.transform(
            [prediction_transcription])
        tfidf_similarity = cosine_similarity(tfidf_ground_truth_vec,
                                             tfidf_pred_transcription_vec)
        # Calculate word error rate of the individual transcription
        word_error_rate = make_predictions.wer_calc(truth_transcription,
                                                    prediction_transcription)
        # Connecting to Microsoft Speech API for Cortana's predicted transcription, timing the request
        c_start = time.time()
        filepath = make_predictions.azure_inference(index=p_instance_number,
                                                    partition=p_partition)
        audiofile = open(filepath, 'rb')
        response = requests.post(
            'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
            headers=headers, params=params,
            data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        c_end = time.time()
        cortana_time_to_predict = c_end - c_start
        val = json.loads(response.text)
        recognitionstatus = val["RecognitionStatus"]
        offset = val["Offset"]
        duration = val["Duration"]
        nbest = val["NBest"]
        confidence = val["NBest"][0]["Confidence"]
        lexical = val["NBest"][0]["Lexical"]
        itn = val["NBest"][0]["ITN"]
        maskeditn = val["NBest"][0]["MaskedITN"]
        display = val["NBest"][0]["Display"]
        # Calculate the same performance measures on the Azure transcript
        cv_cortana_vec = cv.transform([lexical])
        cortana_cv = cosine_similarity(cv_ground_truth_vec, cv_cortana_vec)
        tfidf_cortana_vec = tfidf.transform([lexical])
        cortana_tfidf = cosine_similarity(tfidf_ground_truth_vec,
                                          tfidf_cortana_vec)
        cortana_wer = make_predictions.wer_calc(truth_transcription, lexical)
        # Serve the audio file for the audio player
        play_audio = filepath.replace("/home/brice/Hey-Jetson/app/", "")
    # Render the html page
    return render_template('performance.html',
                           performance_form=performance_form,
                           truth_transcription=truth_transcription,
                           prediction_transcription=prediction_transcription,
                           word_error_rate=word_error_rate,
                           cv_similarity=cv_similarity,
                           tfidf_similarity=tfidf_similarity,
                           jetson_time_to_predict=jetson_time_to_predict,
                           cortana_transcription=cortana_transcription,
                           cortana_time_to_predict=cortana_time_to_predict,
                           confidence=confidence, lexical=lexical, itn=itn,
                           maskeditn=maskeditn, display=display,
                           cortana_cv=cortana_cv,
                           cortana_tfidf=cortana_tfidf,
                           cortana_wer=cortana_wer,
                           play_audio=play_audio)
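# make_predictions.wer_calc() is used above to score both transcriptions
# against the ground truth. Its actual implementation is in
# make_predictions.py; for reference, a minimal sketch using the conventional
# word-level Levenshtein definition of word error rate (an assumption about
# the metric, not a copy of the project's code):
def example_wer(reference, hypothesis):
    """Word-level edit distance divided by the number of reference words."""
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    # Dynamic-programming edit distance table
    d = [[0] * (len(hyp_words) + 1) for _ in range(len(ref_words) + 1)]
    for i in range(len(ref_words) + 1):
        d[i][0] = i
    for j in range(len(hyp_words) + 1):
        d[0][j] = j
    for i in range(1, len(ref_words) + 1):
        for j in range(1, len(hyp_words) + 1):
            cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)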
def index():
    # Initializing form for user input
    audio_form = AudioForm(CombinedMultiDict((request.files, request.form)))
    # Initializing variables passed to HTML files
    filename = None
    prediction_transcription = None
    raw_plot = None
    spectrogram_plot = None
    spectrogram_shape = None
    log_spectrogram_plot = None
    spectrogram_3d = None
    word_error_rate = None
    cv_similarity = None
    jetson_time_to_predict = None
    cortana_time_to_predict = None
    cortana_transcription = None
    recognitionstatus = None
    offset = None
    duration = None
    nbest = None
    confidence = None
    lexical = None
    itn = None
    maskeditn = None
    display = None
    sentiments = None
    documents = None
    errors = None
    prediction_score = None
    prediction_id = None
    cortana_score = None
    cortana_id = None
    # Form for inference engine
    if audio_form.validate_on_submit():
        f = audio_form.audio_file.data
        filename = os.path.join('app/static/audio/', 'tmp.wav')
        f.save(filename)
        # Connecting to Microsoft Speech API for Cortana's predicted transcription, timing the request
        c_start = time.time()
        audiofile = open(filename, 'rb')
        response = requests.post(
            'https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',
            headers=headers, params=params,
            data=make_predictions.read_in_chunks(audiofile))
        cortana_transcription = response.content
        c_end = time.time()
        cortana_time_to_predict = c_end - c_start
        val = json.loads(response.text)
        recognitionstatus = val["RecognitionStatus"]
        offset = val["Offset"]
        duration = val["Duration"]
        nbest = val["NBest"]
        confidence = val["NBest"][0]["Confidence"]
        lexical = val["NBest"][0]["Lexical"]
        itn = val["NBest"][0]["ITN"]
        maskeditn = val["NBest"][0]["MaskedITN"]
        display = val["NBest"][0]["Display"]
        # Producing Hey, Jetson!'s predicted transcription, timing the local inference
        s_start = time.time()
        prediction_transcription = make_predictions.run_inference(
            audio_path=filename,
            input_to_softmax=make_predictions.model_10,
            model_path='./results/model_10.h5')
        s_end = time.time()
        jetson_time_to_predict = s_end - s_start
        # Get features for visualizations of the uploaded audio
        vis_spectrogram_feature, sample_rate, samples = make_predictions.inference_vis_audio_features(
            index=filename)
        # Plot the audio waveform
        raw_plot = make_predictions.plot_raw_audio(sample_rate, samples)
        # Plot the spectrogram of the audio file
        spectrogram_plot = make_predictions.plot_spectrogram_feature(
            vis_spectrogram_feature)
        spectrogram_shape = 'The shape of the spectrogram of the uploaded audio file: ' + str(
            vis_spectrogram_feature.shape)
        # Second spectrogram plot: a normalized log spectrogram of the audio file
        freqs, times, log_spectrogram = make_predictions.log_spectrogram_feature(
            samples, sample_rate)
        mean = np.mean(log_spectrogram, axis=0)
        std = np.std(log_spectrogram, axis=0)
        log_spectrogram = (log_spectrogram - mean) / std
        log_spectrogram_plot = make_predictions.plot_log_spectrogram_feature(
            freqs, times, log_spectrogram)

        # 3D plot of the spectrogram of the uploaded audio file,
        # plotting amplitude over frequency over time.
        def plot_3d_spectrogram(log_spectrogram):
            data = [go.Surface(z=log_spectrogram.T, colorscale='Viridis')]
            layout = go.Layout(title='3D Spectrogram', autosize=True,
                               width=700, height=700,
                               margin=dict(l=50, r=50, b=50, t=50))
            fig = go.Figure(data=data, layout=layout)
            div_output = plot(fig, output_type='div', include_plotlyjs=False)
            return div_output

        # 3D spectrogram plot
        spectrogram_3d = plot_3d_spectrogram(log_spectrogram)
        spectrogram_3d = Markup(spectrogram_3d)
        # Connecting to Microsoft Text Analytics API for sentiment analysis
        text_documents = {
            'documents': [{
                'id': 'Predicted Transcription',
                'language': 'en',
                'text': prediction_transcription
            }, {
                'id': 'Cortana Transcription',
                'language': 'en',
                'text': lexical
            }]
        }
        sentiment_response = requests.post(sentiment_api_url,
                                           headers=text_headers,
                                           json=text_documents)
        sentiments = sentiment_response.json()
        documents = sentiments["documents"]
        errors = sentiments["errors"]
        prediction_score = sentiments["documents"][0]["score"]
        prediction_id = sentiments["documents"][0]["id"]
        cortana_score = sentiments["documents"][1]["score"]
        cortana_id = sentiments["documents"][1]["id"]
    # Render the html page
    return render_template('index.html',
                           audio_form=audio_form,
                           filename=filename,
                           prediction_transcription=prediction_transcription,
                           raw_plot=raw_plot,
                           spectrogram_plot=spectrogram_plot,
                           log_spectrogram_plot=log_spectrogram_plot,
                           spectrogram_shape=spectrogram_shape,
                           spectrogram_3d=spectrogram_3d,
                           jetson_time_to_predict=jetson_time_to_predict,
                           cortana_time_to_predict=cortana_time_to_predict,
                           confidence=confidence, lexical=lexical, itn=itn,
                           maskeditn=maskeditn, display=display,
                           prediction_score=prediction_score,
                           cortana_score=cortana_score)
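# The form classes used above (AudioForm, SentimentForm, VisualizationForm,
# PerformanceForm) are defined elsewhere in the repo and are not shown in this
# excerpt. As a sketch of the shape index() assumes for the upload form --
# the field names and validators below are assumptions, not the project's
# actual definitions:
from flask_wtf import FlaskForm
from flask_wtf.file import FileField, FileAllowed, FileRequired
from wtforms import SubmitField

class ExampleAudioForm(FlaskForm):
    # A .wav upload, matching audio_form.audio_file.data used in index()
    audio_file = FileField('Audio file',
                           validators=[FileRequired(),
                                       FileAllowed(['wav'], 'WAV audio only')])
    submit = SubmitField('Transcribe')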