def topword_download():
    """Send the top words CSV file to the browser as an attachment."""
    session_manager.cache_analysis_option()
    session_manager.cache_top_word_options()
    file_path = TopwordModel().get_download_path()
    return send_file(
        file_path,
        as_attachment=True,
        attachment_filename=constants.TOPWORD_CSV_FILE_NAME)

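# Note on the send_file() calls in this module: they assume a pre-2.0 Flask
# release. In Flask 2.0 the `attachment_filename` keyword was deprecated in
# favour of `download_name` (and removed in 2.2), so on newer Flask the same
# call would look roughly like the sketch below (not part of the original
# code):
#
#     return send_file(file_path, as_attachment=True,
#                      download_name=constants.TOPWORD_CSV_FILE_NAME)
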
def documents() -> str:
    """Get the statistics of the individual documents.

    :return: The statistics of the individual documents.
    """
    session_manager.cache_analysis_option()
    return jsonify(StatsModel().get_document_statistics())

def box_plot() -> str:
    """Get a Plotly box plot of the document sizes.

    :return: The Plotly box plot of the document sizes.
    """
    session_manager.cache_analysis_option()
    return StatsModel().get_box_plot()

def topword_html():
    """Handle the top words page.

    A 'POST' request occurs when the html form is submitted.
    """
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()

    # Get the class division map and number of existing classes
    class_division_map = FileManagerModel().load_file_manager().\
        get_class_division_map()
    num_class = class_division_map.shape[0]

    if 'get-topword' in request.form:
        # Download top words
        path = TopwordModel().get_topword_csv_path(
            class_division_map=class_division_map)
        session_manager.cache_analysis_option()
        session_manager.cache_top_word_options()
        return send_file(
            path,
            attachment_filename=constants.TOPWORD_CSV_FILE_NAME,
            as_attachment=True)
    else:
        session_manager.cache_analysis_option()
        session_manager.cache_top_word_options()
        topword_result = TopwordModel().get_readable_result(
            class_division_map=class_division_map)
        return render_template('topword.html',
                               result=topword_result.results,
                               labels=labels,
                               header=topword_result.header,
                               numclass=num_class,
                               topwordsgenerated='True',
                               classmap=[],
                               itm='topwords',
                               numActiveDocs=num_active_docs)

def corpus() -> str:
    """Gets the corpus statistics.

    :return: The corpus statistics.
    """
    # Cache the options
    session_manager.cache_analysis_option()

    # Return the statistics
    file_result = StatsModel().get_corpus_stats()

    return json.dumps({
        "unit": file_result.unit,
        "average": file_result.mean,
        "standard_deviation": file_result.std_deviation,
        "interquartile_range": file_result.inter_quartile_range,
        "standard_error_small": file_result.anomaly_se.small_items,
        "standard_error_large": file_result.anomaly_se.large_items,
        "interquartile_range_small": file_result.anomaly_iqr.small_items,
        "interquartile_range_large": file_result.anomaly_iqr.large_items
    })

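# The views in this module are shown without their routing decorators. Below
# is a minimal sketch of how a JSON view such as corpus() would normally be
# registered with Flask; the blueprint name and URL rule are assumptions for
# illustration only, not taken from the original source.
from flask import Blueprint, Flask

statistics_blueprint = Blueprint("statistics", __name__)


@statistics_blueprint.route("/statistics/corpus", methods=["POST"])
def corpus_route() -> str:
    # Delegate to the view above; requires an active request context.
    return corpus()


def create_app() -> Flask:
    # Hypothetical application factory, for illustration only.
    app = Flask(__name__)
    app.register_blueprint(statistics_blueprint)
    return app
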
def dendrogram_div():
    """Send the dendrogram div generated by Plotly to the frontend.

    :return: An html string of the dendrogram div.
    """
    session_manager.cache_analysis_option()
    session_manager.cache_hierarchy_option()
    return DendrogramModel().get_dendrogram_div()

def get_table():
    """Gets the requested table data.

    :return: The requested table data.
    """
    # Cache the options.
    session_manager.cache_analysis_option()

    # Return the generated document term matrix.
    return jsonify(TokenizerModel().get_table())

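# TokenizerModel().get_table() above returns a document-term matrix (DTM):
# one row per document, one column per term, cells holding counts or
# normalised frequencies. A minimal, self-contained sketch of such a matrix
# built with scikit-learn; Lexos assembles its own DTM internally, so this is
# for illustration only and the function name below is hypothetical.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


def build_example_dtm(texts, labels):
    """Return a small DataFrame: one row per document, one column per term."""
    vectorizer = CountVectorizer(analyzer="word")
    counts = vectorizer.fit_transform(texts)
    return pd.DataFrame(counts.toarray(),
                        index=labels,
                        columns=vectorizer.get_feature_names_out())


# Example: build_example_dtm(["a b b", "b c"], ["doc1", "doc2"])
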
def results():
    """Gets the k-means results.

    :return: The k-means results.
    """
    # Cache options
    session_manager.cache_analysis_option()
    session_manager.cache_k_mean_option()

    # Get the k-means results
    return KMeansModel().get_results()

def results() -> str:
    """Gets the top words results.

    :return: The top words results.
    """
    # Cache the options
    session_manager.cache_analysis_option()
    session_manager.cache_top_word_options()

    # Return the top words results
    return TopwordModel().get_results()

def get_table() -> str:
    """Gets the similarity query results.

    :return: The similarity query results.
    """
    # Cache the options
    session_manager.cache_analysis_option()
    session_manager.cache_sim_options()

    # Return the table data
    return jsonify(SimilarityModel().get_results())

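# SimilarityModel() above ranks documents by similarity to a selected
# comparison document. A minimal sketch of the usual cosine-similarity
# calculation over DTM rows; the actual SimilarityModel implementation is not
# shown here and may differ.
import numpy as np


def cosine_similarity_scores(dtm_rows: np.ndarray,
                             comparison_row: np.ndarray) -> np.ndarray:
    """Return one cosine-similarity score per DTM row vs. the comparison row."""
    norms = np.linalg.norm(dtm_rows, axis=1) * np.linalg.norm(comparison_row)
    return dtm_rows @ comparison_row / norms
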
def graph() -> str:
    """Gets the consensus tree graph.

    :return: The consensus tree graph.
    """
    # Cache the options
    session_manager.cache_bct_option()
    session_manager.cache_analysis_option()

    # Return the bootstrap consensus tree
    return BCTModel().get_bootstrap_consensus_tree_plot_decoded()

def corpus_stats_report():
    """Return the corpus statistics report as JSON."""
    session_manager.cache_analysis_option()
    file_result = StatsModel().get_corpus_stats()
    return jsonify(unit=file_result.unit,
                   mean=file_result.mean,
                   std_deviation=file_result.std_deviation,
                   anomaly_se_small=file_result.anomaly_se.small_items,
                   anomaly_se_large=file_result.anomaly_se.large_items,
                   anomaly_iqr_small=file_result.anomaly_iqr.small_items,
                   anomaly_iqr_large=file_result.anomaly_iqr.large_items,
                   inter_quartile_range=file_result.inter_quartile_range)

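# The anomaly_se / anomaly_iqr fields above flag unusually small or large
# documents. A sketch of the conventional cutoffs (standard-error rule and
# the 1.5 * IQR rule); the exact thresholds used by StatsModel may differ,
# and the helper name below is hypothetical.
import numpy as np


def anomaly_cutoffs(sizes: np.ndarray) -> dict:
    """Return (lower, upper) cutoffs for the SE-based and IQR-based rules."""
    mean = sizes.mean()
    std_err = sizes.std(ddof=1) / np.sqrt(len(sizes))
    q1, q3 = np.percentile(sizes, [25, 75])
    iqr = q3 - q1
    return {
        "se": (mean - 2 * std_err, mean + 2 * std_err),
        "iqr": (q1 - 1.5 * iqr, q3 + 1.5 * iqr),
    }
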
def dendrogram_div():
    """Gets the Plotly dendrogram.

    :return: The Plotly dendrogram.
    """
    # Cache options
    session_manager.cache_analysis_option()
    session_manager.cache_hierarchy_option()

    # Send the dendrogram
    return DendrogramModel().get_dendrogram_div()

def get_bct_result():
    """Send the BCT result to the frontend.

    :return: Send file from directory to the ajax call.
    """
    # Import the model module.
    from lexos.models.bct_model import BCTModel

    # Cache all the options.
    session_manager.cache_bct_option()
    session_manager.cache_analysis_option()

    # Get the bootstrap consensus tree result.
    return BCTModel().get_bootstrap_consensus_tree_plot_decoded()

def get_tokenizer_csv():
    """Called when the CSV button in Tokenizer is clicked.

    :return: a response object (often a render_template call) to flask and
        eventually to the browser.
    """
    file_manager = utility.load_file_manager()
    session_manager.cache_analysis_option()
    session_manager.cache_csv_options()
    save_path, file_extension = utility.generate_csv(file_manager)
    utility.save_file_manager(file_manager)

    return send_file(
        save_path,
        attachment_filename="frequency_matrix" + file_extension,
        as_attachment=True)

def sim_html():
    """Generate the similarity query results as HTML."""
    session_manager.cache_analysis_option()
    session_manager.cache_sim_options()
    return SimilarityModel().generate_sims_html()

def k_means():
    """Handles the functionality on the kmeans page.

    It analyzes the various texts and displays the class label of the files.

    :return: a response object (often a render_template call) to flask and
        eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()

    file_manager = utility.load_file_manager()
    labels = file_manager.get_active_labels_with_id()
    for key in labels:
        labels[key] = labels[key]
    default_k = int(len(labels) / 2)

    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'kmeanoption' not in session:
            session['kmeanoption'] = constants.DEFAULT_KMEAN_OPTIONS

        return render_template('kmeans.html',
                               labels=labels,
                               silhouettescore='',
                               kmeansIndex=[],
                               fileNameStr='',
                               fileNumber=len(labels),
                               KValue=0,
                               defaultK=default_k,
                               colorChartStr='',
                               kmeansdatagenerated=False,
                               itm="kmeans",
                               numActiveDocs=num_active_docs)

    if request.method == "POST":
        # 'POST' request occurs when the html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        session_manager.cache_analysis_option()
        session_manager.cache_k_mean_option()
        utility.save_file_manager(file_manager)

        if request.form['viz'] == 'PCA':
            kmeans_index, silhouette_score, file_name_str, k_value, \
                color_chart_str = utility.generate_k_means_pca(file_manager)

            return render_template('kmeans.html',
                                   labels=labels,
                                   silhouettescore=silhouette_score,
                                   kmeansIndex=kmeans_index,
                                   fileNameStr=file_name_str,
                                   fileNumber=len(labels),
                                   KValue=k_value,
                                   defaultK=default_k,
                                   colorChartStr=color_chart_str,
                                   kmeansdatagenerated=True,
                                   itm="kmeans",
                                   numActiveDocs=num_active_docs)

        elif request.form['viz'] == 'Voronoi':
            kmeans_index, silhouette_score, file_name_str, k_value, \
                color_chart_str, final_points_list, final_centroids_list, \
                text_data, max_x = \
                utility.generate_k_means_voronoi(file_manager)

            return render_template('kmeans.html',
                                   labels=labels,
                                   silhouettescore=silhouette_score,
                                   kmeansIndex=kmeans_index,
                                   fileNameStr=file_name_str,
                                   fileNumber=len(labels),
                                   KValue=k_value,
                                   defaultK=default_k,
                                   colorChartStr=color_chart_str,
                                   finalPointsList=final_points_list,
                                   finalCentroidsList=final_centroids_list,
                                   textData=text_data,
                                   maxX=max_x,
                                   kmeansdatagenerated=True,
                                   itm="kmeans",
                                   numActiveDocs=num_active_docs)

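# generate_k_means_pca() / generate_k_means_voronoi() above return, among
# other values, a silhouette score for the clustering. A minimal sketch of
# how k-means and a silhouette score are typically computed on a DTM with
# scikit-learn; the Lexos utility functions themselves are not shown here,
# and the helper name below is hypothetical.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def cluster_dtm(dtm: np.ndarray, k: int):
    """Cluster DTM rows into k groups and report the silhouette score."""
    model = KMeans(n_clusters=k, n_init=10, random_state=0)
    cluster_labels = model.fit_predict(dtm)
    return cluster_labels, silhouette_score(dtm, cluster_labels)
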
def corpus_box_plot():
    """Get the Plotly box plot of the document sizes."""
    session_manager.cache_analysis_option()
    return StatsModel().get_box_plot()

def file_stats_table():
    """Get the statistics table for the individual files."""
    session_manager.cache_analysis_option()
    return StatsModel().get_file_stats()

def k_means_result():
    """Return the k-means table and plot as JSON."""
    session_manager.cache_analysis_option()
    session_manager.cache_k_mean_option()
    result = KMeansModel().get_result()
    return jsonify(table=result.table, plot=result.plot)

def topword_result():
    """Return the displayable top words result."""
    session_manager.cache_analysis_option()
    session_manager.cache_top_word_options()
    return TopwordModel().get_displayable_result()

def tokenizer():
    """Handles the functionality on the tokenizer page.

    :return: a response object (often a render_template call) to flask and
        eventually to the browser.
    """
    # Use timeit to test performance
    from timeit import default_timer as timer
    start_t = timer()
    print("Initialising GET request.")

    import pandas as pd
    from operator import itemgetter

    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()

    if request.method == "GET":
        # Get the active labels and sort them
        labels = file_manager.get_active_labels_with_id()
        header_labels = []
        for fileID in labels:
            header_labels.append(file_manager.files[int(fileID)].label)
        header_labels = natsorted(header_labels)

        # Get the starting options from the session
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'csvoptions' not in session:
            session['csvoptions'] = constants.DEFAULT_CSV_OPTIONS
        csv_orientation = session['csvoptions']['csvorientation']
        csv_delimiter = session['csvoptions']['csvdelimiter']
        cull_number = session['analyoption']['cullnumber']
        token_type = session['analyoption']['tokenType']
        normalize_type = session['analyoption']['normalizeType']
        token_size = session['analyoption']['tokenSize']
        norm = session['analyoption']['norm']
        data = {
            'cullnumber': cull_number,
            'tokenType': token_type,
            'normalizeType': normalize_type,
            'csvdelimiter': csv_delimiter,
            'mfwnumber': '1',
            'csvorientation': csv_orientation,
            'tokenSize': token_size,
            'norm': norm
        }

        # If there are active documents, generate a DTM matrix
        if num_active_docs > 0:
            end_t = timer()
            elapsed = end_t - start_t
            print("before generateCSVMatrixFromAjax")
            print(elapsed)

            # Get the DTM with the session options and convert it to a list
            # of lists
            dtm = utility.generate_csv_matrix_from_ajax(data,
                                                        file_manager,
                                                        round_decimal=True)
            end_t = timer()
            elapsed = end_t - start_t
            print("after generateCSVMatrixFromAjax")
            print(elapsed)

            # Print the first five rows for testing
            # print dtm[0:5]
            # dtm[0] += (0,0,)
            # for i, row in enumerate(dtm[1:]):
            #     dtm[i+1] += (0,0,)
            # print dtm[0:5]

            # Create a pandas dataframe with the correct orientation and
            # convert it to a list of lists (matrix)
            if csv_orientation == "filerow":
                df = pd.DataFrame(dtm)
                # Create the matrix
                matrix = df.values.tolist()
            else:
                df = pd.DataFrame(dtm)
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame created.")
                print(elapsed)

                # Calculate the sums and averages
                length = len(df.index)
                sums = [0] * (length - 1)
                sums.insert(0, "Total")
                averages = [0] * (length - 1)
                averages.insert(0, "Average")
                end_t = timer()
                elapsed = end_t - start_t
                print("Sum and averages calculated.")
                print(elapsed)

                # Concatenate the total and average columns to the dataframe
                df = pd.concat([df, pd.DataFrame(sums, columns=['Total'])],
                               axis=1)
                df = pd.concat(
                    [df, pd.DataFrame(averages, columns=['Average'])], axis=1)
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame modified.")
                print(elapsed)

                # Create the matrix
                matrix = df.values.tolist()
                matrix[0][0] = "Terms"
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame converted to matrix.")
                print(elapsed)

            # Prevent Unicode errors in column headers
            for i, v in enumerate(matrix[0]):
                matrix[0][i] = v

            # Save the column headers and remove them from the matrix
            # columns = natsorted(matrix[0])
            columns = matrix[0]
            if csv_orientation == "filecolumn":
                columns[0] = "Terms"
            else:
                columns[0] = "Documents"
            del matrix[0]

            # Prevent Unicode errors in the row headers
            for i, v in enumerate(matrix):
                matrix[i][0] = v[0]

            # Calculate the number of rows in the matrix
            records_total = len(matrix)

            # Sort the matrix by column 0
            matrix = natsorted(matrix, key=itemgetter(0), reverse=False)

            # Set the table length -- maximum 10 records for initial load
            if records_total <= 10:
                end_index = records_total - 1
                matrix = matrix[0:end_index]
            else:
                matrix = matrix[0:9]

            # Escape all the html characters in the matrix
            matrix = [[general_functions.html_escape(row[0])] + row[1:]
                      for row in matrix]

            # Escape all the html characters in the columns
            columns = [general_functions.html_escape(item)
                       for item in columns]

            # The first 10 rows are sent to the template as an HTML string.
            # After the template renders, an ajax request fetches new data
            # to re-render the table with the correct number of rows.

            # Create the columns string
            cols = "<tr>"
            for s in columns:
                cols += "<th>" + str(s) + "</th>"
            cols += "</tr>"

            # Create the rows string
            rows = ""
            for l in matrix:
                row = "<tr>"
                for s in l:
                    row += "<td>" + str(s) + "</td>"
                row += "</tr>"
                rows += row

        # Catch instances where there is no active document (triggers the
        # error modal)
        else:
            cols = "<tr><th>Terms</th></tr>"
            rows = "<tr><td></td></tr>"
            records_total = 0

        # Render the template
        end_t = timer()
        elapsed = end_t - start_t
        print("Matrix generated. Rendering template.")
        print(elapsed)

        return render_template('tokenizer.html',
                               draw=1,
                               itm="tokenize",
                               labels=labels,
                               headers=header_labels,
                               columns=cols,
                               rows=rows,
                               numRows=records_total,
                               orientation=csv_orientation,
                               numActiveDocs=num_active_docs)

    if request.method == "POST":
        end_t = timer()
        elapsed = end_t - start_t
        print("POST received.")
        print(elapsed)
        session_manager.cache_analysis_option()
        session_manager.cache_csv_options()

        if 'get-csv' in request.form:
            # The 'Download Matrix' button is clicked on tokenizer.html.
            save_path, file_extension = utility.generate_csv(file_manager)
            utility.save_file_manager(file_manager)

            return send_file(
                save_path,
                attachment_filename="frequency_matrix" + file_extension,
                as_attachment=True)

        else:
            # Get the active labels and sort them
            labels = file_manager.get_active_labels_with_id()
            header_labels = []
            for fileID in labels:
                header_labels.append(file_manager.files[int(fileID)].label)

            # Get the Tokenizer options from the request json object
            length = int(request.json["length"])
            # Increment for the ajax response
            draw = int(request.json["draw"]) + 1
            search = request.json["search"]
            order = str(request.json["order"][1])
            sort_column = int(request.json["order"][0])
            csv_orientation = request.json["csvorientation"]

            # Set the sorting order
            if order == "desc":
                reverse = True
            else:
                reverse = False

            # Get the DTM with the requested options and convert it to a
            # list of lists
            dtm = utility.generate_csv_matrix_from_ajax(request.json,
                                                        file_manager,
                                                        round_decimal=True)
            end_t = timer()
            elapsed = end_t - start_t
            print("DTM received.")
            print(elapsed)

            if csv_orientation == "filerow":
                dtm[0][0] = "Documents"
                df = pd.DataFrame(dtm)
                footer_stats = df.drop(df.index[[0]], axis=0)
                footer_stats = footer_stats.drop(df.index[[0]], axis=1)
                footer_totals = footer_stats.sum().tolist()
                footer_totals = [round(total, 4) for total in footer_totals]
                footer_averages = footer_stats.mean().tolist()
                footer_averages = [round(ave, 4) for ave in footer_averages]
                sums = ["Total"]
                averages = ["Average"]
                # Discrepancy--this is used for tokenize/POST
                length = len(df.index)
                for i in range(0, length):
                    if i > 0:
                        rounded_sum = round(df.iloc[i][1:].sum(), 4)
                        sums.append(rounded_sum)
                        rounded_ave = round(df.iloc[i][1:].mean(), 4)
                        averages.append(rounded_ave)
                df = pd.concat([df, pd.DataFrame(sums, columns=['Total'])],
                               axis=1)
                df = pd.concat(
                    [df, pd.DataFrame(averages, columns=['Average'])], axis=1)

                # Populate the sum of sums and average of averages cells
                sum_of_sums = df['Total'].tolist()
                num_rows = len(df['Total'].tolist())
                num_rows = num_rows - 1
                sum_of_sums = sum(sum_of_sums[1:])
                sum_of_ave = df['Average'].tolist()
                sum_of_ave = sum(sum_of_ave[1:])
                footer_totals.append(round(sum_of_sums, 4))
                footer_totals.append(round(sum_of_ave, 4))
                ave_of_sums = sum_of_sums / num_rows
                ave_of_aves = ave_of_sums / num_rows
                footer_averages.append(round(ave_of_sums, 4))
                footer_averages.append(round(ave_of_aves, 4))

                # Change the DataFrame to a list
                matrix = df.values.tolist()

                # Prevent Unicode errors in column headers
                for i, v in enumerate(matrix[0]):
                    matrix[0][i] = v

                # Save the column headers and remove them from the matrix
                columns = natsorted(matrix[0][1:-2])
                columns.insert(0, "Documents")
                columns.append("Total")
                columns.append("Average")
                del matrix[0]

            else:
                df = pd.DataFrame(dtm)
                # print(df[0:3])
                end_t = timer()
                elapsed = end_t - start_t
                print("DTM created. Calculating footer stats")
                print(elapsed)
                footer_stats = df.drop(df.index[[0]], axis=0)
                # print(footer_stats[0:3])
                footer_stats = footer_stats.drop(df.index[[0]], axis=1)
                footer_totals = footer_stats.sum().tolist()
                footer_totals = [round(total, 4) for total in footer_totals]
                footer_averages = footer_stats.mean().tolist()
                footer_averages = [round(ave, 4) for ave in footer_averages]
                end_t = timer()
                elapsed = end_t - start_t
                print("Footer stats calculated. "
                      "Calculating totals and averages...")
                print(elapsed)

                # try it with nested for loops
                sums = []
                averages = []
                n_rows = len(df.index)
                # all rows are the same, so picking any row
                n_cols = len(df.iloc[1])
                for i in range(1, n_rows):
                    row_total = 0
                    for j in range(1, n_cols):
                        row_total += df.iloc[i][j]
                    sums.append(round(row_total, 4))
                    averages.append(round((row_total / (n_cols - 1)), 4))
                sums.insert(0, "Total")
                averages.insert(0, "Average")
                end_t = timer()
                elapsed = end_t - start_t
                print("Totals and averages calculated. Appending columns...")
                print(elapsed)

                # This seems to be the bottleneck
                df['Total'] = sums
                df['Average'] = averages
                end_t = timer()
                elapsed = end_t - start_t
                print("Populating columns with rounded values.")
                print(elapsed)

                # Populate the sum of sums and average of averages cells
                sum_of_sums = df['Total'].tolist()
                num_rows = len(df['Total'].tolist())
                num_rows = num_rows - 1
                sum_of_sums = sum(sum_of_sums[1:])
                sum_of_ave = df['Average'].tolist()
                sum_of_ave = sum(sum_of_ave[1:])
                footer_totals.append(round(sum_of_sums, 4))
                footer_totals.append(round(sum_of_ave, 4))
                ave_of_sums = sum_of_sums / num_rows
                ave_of_aves = ave_of_sums / num_rows
                footer_averages.append(round(ave_of_sums, 4))
                footer_averages.append(round(ave_of_aves, 4))
                end_t = timer()
                elapsed = end_t - start_t
                print("Rounded values added.")
                print(elapsed)

                matrix = df.values.tolist()
                matrix[0][0] = "Terms"

                # Prevent Unicode errors in column headers
                for i, v in enumerate(matrix[0]):
                    matrix[0][i] = v

                # Save the column headers and remove them from the matrix
                columns = natsorted(matrix[0])
                if csv_orientation == "filecolumn":
                    columns[0] = "Terms"
                else:
                    columns[0] = "Documents"
                del matrix[0]

            # Code for both orientations #
            end_t = timer()
            elapsed = end_t - start_t
            print("Starting common code.")
            print(elapsed)

            # Prevent Unicode errors in the row headers
            for i, v in enumerate(matrix):
                matrix[i][0] = v[0]

            # Calculate the number of rows in the matrix
            records_total = len(matrix)

            # Sort and Filter the cached DTM by column
            if len(search) != 0:
                matrix = [x for x in matrix if x[0].startswith(search)]
                matrix = natsorted(matrix,
                                   key=itemgetter(sort_column),
                                   reverse=reverse)
            else:
                matrix = natsorted(matrix,
                                   key=itemgetter(sort_column),
                                   reverse=reverse)

            # Get the number of filtered rows
            records_filtered = len(matrix)

            # Set the table length
            if length == -1:
                matrix = matrix[0:]
            else:
                start_index = int(request.json["start"])
                end_index = int(request.json["end"])
                matrix = matrix[start_index:end_index]

            # Correct the footer rows
            footer_totals = [float(Decimal("%.4f" % e))
                             for e in footer_totals]
            footer_averages = [float(Decimal("%.4f" % e))
                               for e in footer_averages]
            footer_totals.insert(0, "Total")
            footer_averages.insert(0, "Average")
            footer_totals.append("")
            footer_averages.append("")

            response = {
                "draw": draw,
                "records_total": records_total,
                "records_filtered": records_filtered,
                "length": int(length),
                "columns": columns,
                "data": matrix,
                "totals": footer_totals,
                "averages": footer_averages
            }

            end_t = timer()
            elapsed = end_t - start_t
            print("Returning table data to the browser.")
            print(elapsed)

            return json.dumps(response)
