def k_means(): """Handles the functionality on the K Means page. It analyzes the various texts and displays the class label of the files. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() # Set default number of clusters to be half of the number of documents. default_k = int(num_active_docs / 2) # Get file labels. labels = FileManagerModel().load_file_manager().get_active_labels_with_id() # Fill the default options. if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS if 'kmeanoption' not in session: session['kmeanoption'] = constants.DEFAULT_KMEAN_OPTIONS # Always update the nclusters. session['kmeanoption']['nclusters'] = default_k return render_template( 'kmeans.html', itm='kmeans', labels=labels, numActiveDocs=num_active_docs)
def scrub(): # Are you looking for scrubber.py? """Handles the functionality of the scrub page. It scrubs the files depending on the specifications chosen by the user, with an option to download the scrubbed files. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() # "GET" request occurs when the page is first loaded. if 'scrubbingoptions' not in session: session['scrubbingoptions'] = constants.DEFAULT_SCRUB_OPTIONS if 'xmlhandlingoptions' not in session: session['xmlhandlingoptions'] = { "myselect": { "action": '', "attribute": "" } } utility.xml_handling_options() previews = file_manager.get_previews_of_active() tags_present, doe_present, gutenberg_present = \ file_manager.check_actives_tags() return render_template('scrub.html', previews=previews, itm="scrubber", haveTags=tags_present, haveDOE=doe_present, haveGutenberg=gutenberg_present, numActiveDocs=num_active_docs)
def bct_analysis(): """Display the web page when first got to bootstrap consensus analysis. :return: The rendered template. """ # Detect the number of active documents. num_active_docs = detect_active_docs() # Get labels with their ids. id_label_map = \ FileManagerModel().load_file_manager().get_active_labels_with_id() # Fill in default options. if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS if 'bctoption' not in session: session['bctoption'] = constants.DEFAULT_BCT_OPTIONS try: from lexos.models.bct_model import BCTModel # Use a black hole variable to hold the model to get rid of warning. _ = BCTModel() # Render the HTML template. return render_template( 'bct_analysis.html', itm="bct-analysis", labels=id_label_map, numActiveDocs=num_active_docs ) except ImportError: return render_template( 'bct_analysis_import_error.html', itm="bct-analysis" )
def bct_analysis(): """Display the web page when first got to bootstrap consensus analysis. :return: The rendered template. """ # Detect the number of active documents. num_active_docs = detect_active_docs() # Get labels with their ids. id_label_map = \ FileManagerModel().load_file_manager().get_active_labels_with_id() # Fill in default options. if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS if 'bctoption' not in session: session['bctoption'] = constants.DEFAULT_BCT_OPTIONS try: from lexos.models.bct_model import BCTModel # Use a black hole variable to hold the model to get rid of warning. _ = BCTModel() # Render the HTML template. return render_template('bct_analysis.html', itm="bct-analysis", labels=id_label_map, numActiveDocs=num_active_docs) except ImportError: return render_template('bct_analysis_import_error.html', itm="bct-analysis")
def scrub(): # Are you looking for scrubber.py? """Handles the functionality of the scrub page. It scrubs the files depending on the specifications chosen by the user, with an option to download the scrubbed files. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() # "GET" request occurs when the page is first loaded. if 'scrubbingoptions' not in session: session['scrubbingoptions'] = constants.DEFAULT_SCRUB_OPTIONS if 'xmlhandlingoptions' not in session: session['xmlhandlingoptions'] = { "myselect": {"action": '', "attribute": ""}} utility.xml_handling_options() previews = file_manager.get_previews_of_active() tags_present, doe_present, gutenberg_present = \ file_manager.check_actives_tags() return render_template( 'scrub.html', previews=previews, itm="scrubber", haveTags=tags_present, haveDOE=doe_present, haveGutenberg=gutenberg_present, numActiveDocs=num_active_docs)
def similarity(): """Handles the similarity query page functionality. Returns ranked list of files and their cosine similarities to a comparison document. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() id_label_map = \ FileManagerModel().load_file_manager().get_active_labels_with_id() # 'GET' request occurs when the page is first loaded if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS if 'similarities' not in session: session['similarities'] = constants.DEFAULT_SIM_OPTIONS return render_template( 'similarity.html', itm="similarity-query", labels=id_label_map, numActiveDocs=num_active_docs, )
def topword_html(): # 'POST' request occurs when html form is submitted num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() labels = file_manager.get_active_labels_with_id() # get the class division map and number of existing classes class_division_map = FileManagerModel().load_file_manager().\ get_class_division_map() num_class = class_division_map.shape[0] if 'get-topword' in request.form: # download topword path = TopwordModel().get_topword_csv_path( class_division_map=class_division_map) session_manager.cache_analysis_option() session_manager.cache_top_word_options() return send_file(path, attachment_filename=constants.TOPWORD_CSV_FILE_NAME, as_attachment=True) else: session_manager.cache_analysis_option() session_manager.cache_top_word_options() topword_result = TopwordModel().get_readable_result( class_division_map=class_division_map) return render_template('topword.html', result=topword_result.results, labels=labels, header=topword_result.header, numclass=num_class, topwordsgenerated='True', classmap=[], itm='topwords', numActiveDocs=num_active_docs)
def top_words(): """Handles the topword page functionality. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() labels = file_manager.get_active_labels_with_id() # 'GET' request occurs when the page is first loaded if 'topwordoption' not in session: session['topwordoption'] = constants.DEFAULT_TOPWORD_OPTIONS if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS # get the class division map and number of existing classes class_division_map = FileManagerModel().load_file_manager().\ get_class_division_map() num_class = class_division_map.shape[0] return render_template('topword.html', labels=labels, classmap=class_division_map, numclass=num_class, topwordsgenerated='class_div', itm='topwords', numActiveDocs=num_active_docs)
def upload_dictionaries(): """Uploads dictionaries to the content analysis object. :return: a json object. """ path = os.path.join(constants.TMP_FOLDER, constants.UPLOAD_FOLDER, session['id'], 'content_analysis/') if not os.path.isdir(path): os.makedirs(path) data = {'dictionary_labels': [], 'active_dictionaries': [], 'formula': "", 'toggle_all_value': True, 'error': False} if detect_active_docs() == 0: data['error'] = True for upload_file in request.files.getlist('lemfileselect[]'): file_name = upload_file.filename content = upload_file.read().decode("utf-8").replace('\n', '') file = open(path + file_name, 'w') file.write(content) file.close() dictionary_names = [name for name in os.listdir(path)] data['dictionary_labels'] = [os.path.splitext(dict_name)[0] for dict_name in dictionary_names] data['active_dictionaries'] = [True] * len(dictionary_names) session['dictionary_labels'] = data['dictionary_labels'] session['active_dictionaries'] = data['active_dictionaries'] session['toggle_all_value'] = data['toggle_all_value'] return json.dumps(data)
def multi_cloud(): """Handles the functionality on the multicloud pages. :return: a response object (often a render_template call) to Flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() labels = file_manager.get_active_labels_with_id() labels = OrderedDict( natsorted(list(labels.items()), key=lambda x: x[1])) if request.method == 'GET': # 'GET' request occurs when the page is first loaded. if 'cloudoption' not in session: session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS if 'multicloudoptions' not in session: session['multicloudoptions'] = \ constants.DEFAULT_MULTICLOUD_OPTIONS return render_template( 'multicloud.html', itm="multicloud", jsonStr="", labels=labels, numActiveDocs=num_active_docs) if request.method == "POST": # This is legacy code. # The form is now submitted by Ajax do_multicloud() # 'POST' request occur when html form is submitted # (i.e. 'Get Graphs', 'Download...') file_manager = utility.load_file_manager() json_obj = utility.generate_mc_json_obj(file_manager) # Replaces client-side array generator word_counts_array = [] for doc in json_obj: name = doc["name"] children = doc["children"] word_counts = {} for item in children: word_counts[item["text"]] = item["size"] word_counts_array.append( {"name": name, "word_counts": word_counts, "words": children}) # Temporary fix because the front end needs a string json_obj = json.dumps(json_obj) session_manager.cache_cloud_option() session_manager.cache_multi_cloud_options() return render_template( 'multicloud.html', itm="multicloud", JSONObj=json_obj, labels=labels, numActiveDocs=num_active_docs)
def dendrogram(): # Detect the number of active documents. num_active_docs = detect_active_docs() if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS if 'hierarchyoption' not in session: session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS labels = FileManagerModel().load_file_manager().get_active_labels_with_id() return render_template('dendrogram.html', labels=labels, numActiveDocs=num_active_docs, itm="hierarchical")
def multi_cloud(): """Handles the functionality on the multicloud pages. :return: a response object (often a render_template call) to Flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() labels = file_manager.get_active_labels_with_id() labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1])) if request.method == 'GET': # 'GET' request occurs when the page is first loaded. if 'cloudoption' not in session: session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS if 'multicloudoptions' not in session: session['multicloudoptions'] = \ constants.DEFAULT_MULTICLOUD_OPTIONS return render_template('multicloud.html', itm="multicloud", jsonStr="", labels=labels, numActiveDocs=num_active_docs) if request.method == "POST": # This is legacy code. # The form is now submitted by Ajax do_multicloud() # 'POST' request occur when html form is submitted # (i.e. 'Get Graphs', 'Download...') file_manager = utility.load_file_manager() json_obj = utility.generate_mc_json_obj(file_manager) # Replaces client-side array generator word_counts_array = [] for doc in json_obj: name = doc["name"] children = doc["children"] word_counts = {} for item in children: word_counts[item["text"]] = item["size"] word_counts_array.append({ "name": name, "word_counts": word_counts, "words": children }) # Temporary fix because the front end needs a string json_obj = json.dumps(json_obj) session_manager.cache_cloud_option() session_manager.cache_multi_cloud_options() return render_template('multicloud.html', itm="multicloud", JSONObj=json_obj, labels=labels, numActiveDocs=num_active_docs)
def upload(): """Handles the functionality of the upload page. It uploads files to be used in the current session. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() if request.method == "GET": print("About to fix session in case of browser caching") # fix the session in case the browser is caching the old session session_manager.fix() print("Session fixed. Rendering template.") if 'generalsettings' not in session: session['generalsettings'] = \ constants.DEFAULT_GENERALSETTINGS_OPTIONS return render_template( 'upload.html', MAX_FILE_SIZE=constants.MAX_FILE_SIZE, MAX_FILE_SIZE_INT=constants.MAX_FILE_SIZE_INT, MAX_FILE_SIZE_UNITS=constants.MAX_FILE_SIZE_UNITS, itm="upload-tool", numActiveDocs=num_active_docs) # X-FILENAME is the flag to signify a file upload if 'X-FILENAME' in request.headers: # File upload through javascript file_manager = utility.load_file_manager() # --- check file name --- # Grab the filename, which will be UTF-8 percent-encoded (e.g. '%E7' # instead of python's '\xe7') file_name = request.headers['X-FILENAME'] # Unquote using urllib's percent-encoding decoder (turns '%E7' into # '\xe7') file_name = unquote(file_name) # --- end check file name --- if file_name.endswith('.lexos'): file_manager.handle_upload_workspace() # update filemanager file_manager = utility.load_file_manager() file_manager.update_workspace() else: file_manager.add_upload_file(request.data, file_name) utility.save_file_manager(file_manager) return 'success'
def cut(): """ Handles the functionality of the cut page. It cuts the files into various segments depending on the specifications chosen by the user, and sends the text segments. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() active = file_manager.get_active_files() if len(active) > 0: num_char = [x.num_letters() for x in active] num_word = [x.num_words() for x in active] num_line = [x.num_lines() for x in active] max_char = max(num_char) max_word = max(num_word) max_line = max(num_line) active_file_ids = [lfile.id for lfile in active] else: num_char = [] num_word = [] num_line = [] max_char = 0 max_word = 0 max_line = 0 active_file_ids = [] if request.method == "GET": # "GET" request occurs when the page is first loaded. if 'cuttingoptions' not in session: session['cuttingoptions'] = constants.DEFAULT_CUT_OPTIONS previews = file_manager.get_previews_of_active() return render_template( 'cut.html', previews=previews, num_active_files=len(previews), numChar=num_char, numWord=num_word, numLine=num_line, maxChar=max_char, maxWord=max_word, maxLine=max_line, activeFileIDs=active_file_ids, itm="cut", numActiveDocs=num_active_docs)
def cut(): """ Handles the functionality of the cut page. It cuts the files into various segments depending on the specifications chosen by the user, and sends the text segments. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() active = file_manager.get_active_files() if len(active) > 0: num_char = [x.num_letters() for x in active] num_word = [x.num_words() for x in active] num_line = [x.num_lines() for x in active] max_char = max(num_char) max_word = max(num_word) max_line = max(num_line) active_file_ids = [lfile.id for lfile in active] else: num_char = [] num_word = [] num_line = [] max_char = 0 max_word = 0 max_line = 0 active_file_ids = [] if request.method == "GET": # "GET" request occurs when the page is first loaded. if 'cuttingoptions' not in session: session['cuttingoptions'] = constants.DEFAULT_CUT_OPTIONS previews = file_manager.get_previews_of_active() return render_template('cut.html', previews=previews, num_active_files=len(previews), numChar=num_char, numWord=num_word, numLine=num_line, maxChar=max_char, maxWord=max_word, maxLine=max_line, activeFileIDs=active_file_ids, itm="cut", numActiveDocs=num_active_docs)
def statistics(): """Handles the functionality on the Statistics page. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() # Get labels with their ids. id_label_map = \ FileManagerModel().load_file_manager().get_active_labels_with_id() # "GET" request occurs when the page is first loaded. if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS return render_template('statistics.html', itm="statistics", labels=id_label_map, numActiveDocs=num_active_docs)
def statistics(): """Handles the functionality on the Statistics page. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() # Get labels with their ids. id_label_map = \ FileManagerModel().load_file_manager().get_active_labels_with_id() # "GET" request occurs when the page is first loaded. if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS return render_template( 'statistics.html', itm="statistics", labels=id_label_map, numActiveDocs=num_active_docs)
def rolling_window(): """Handles the functionality on the rollingwindow page. It analyzes the various texts using a rolling window of analysis. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = FileManagerModel().load_file_manager() # Get active labels with id and sort all labels. labels = file_manager.get_active_labels_with_id() labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1])) # Fill in the default options if the option was not already there. if 'rwoption' not in session: session['rwoption'] = constants.DEFAULT_ROLLINGWINDOW_OPTIONS # Return the rendered template. return render_template('rwanalysis.html', itm="rolling-windows", labels=labels, numActiveDocs=num_active_docs)
def scrape(): """scraps the urls an generates text file from each url. :return: json object with a string that indicates that is has succeeded """ # Detect the number of active documents. num_active_docs = detect_active_docs() if request.method == "GET": return render_template('scrape.html', numActiveDocs=num_active_docs) if request.method == "POST": import requests urls = request.json["urls"] urls = urls.strip() urls = urls.replace(",", "\n") # Replace commas with line breaks urls = re.sub(r"\s+", "\n", urls) # Get rid of extra white space urls = urls.split("\n") file_manager = utility.load_file_manager() for i, url in enumerate(urls): r = requests.get(url) file_manager.add_upload_file(r.text, "url" + str(i) + ".txt") utility.save_file_manager(file_manager) response = "success" return json.dumps(response)
def scrape(): """scraps the urls an generates text file from each url. :return: json object with a string that indicates that is has succeeded """ # Detect the number of active documents. num_active_docs = detect_active_docs() if request.method == "GET": return render_template('scrape.html', numActiveDocs=num_active_docs) if request.method == "POST": import requests urls = request.json["urls"] urls = urls.strip() urls = urls.replace(",", "\n") # Replace commas with line breaks urls = re.sub("\s+", "\n", urls) # Get rid of extra white space urls = urls.split("\n") file_manager = utility.load_file_manager() for i, url in enumerate(urls): r = requests.get(url) file_manager.add_upload_file(r.text, "url" + str(i) + ".txt") utility.save_file_manager(file_manager) response = "success" return json.dumps(response)
def viz(): """Handles the functionality on the alternate bubbleViz page. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() labels = file_manager.get_active_labels_with_id() from collections import OrderedDict from natsort import natsorted labels = OrderedDict(natsorted(labels.items(), key=lambda x: x[1])) if request.method == "GET": # "GET" request occurs when the page is first loaded. if 'cloudoption' not in session: session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS if 'bubblevisoption' not in session: session['bubblevisoption'] = constants.DEFAULT_BUBBLEVIZ_OPTIONS return render_template( 'viz.html', JSONObj="", labels=labels, itm="bubbleviz", numActiveDocs=num_active_docs) if request.method == "POST": # "POST" request occur when html form is submitted # (i.e. 'Get Dendrogram', 'Download...') # Legacy function # json_obj = utility.generateJSONForD3(file_manager, mergedSet=True) # Get the file manager, sorted labels, and tokenization options file_manager = utility.load_file_manager() if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS token_type = session['analyoption']['tokenType'] token_size = int(session['analyoption']['tokenSize']) # Limit docs to those selected or to active docs chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')] active_docs = [] if chosen_doc_ids: for file_id in chosen_doc_ids: active_docs.append(file_id) else: for l_file in file_manager.files.values(): if l_file.active: active_docs.append(l_file.id) # Get the contents of all selected/active docs all_contents = [] for file_id in active_docs: if file_manager.files[file_id].active: content = file_manager.files[file_id].load_contents() all_contents.append(content) # Generate a DTM dtm, vocab = utility.simple_vectorizer( all_contents, token_type, token_size) 
# Convert the DTM to a pandas dataframe with the terms as column # headers import pandas as pd df = pd.DataFrame(dtm, columns=vocab) # Get the Minimum Token Length and Maximum Term Settings minimum_length = int( request.form['minlength']) if 'minlength' in request.form else 0 if 'maxwords' in request.form: # Make sure there is a number in the input form check_for_value = request.form['maxwords'] if check_for_value == "": max_num_words = 100 else: max_num_words = int(request.form['maxwords']) # Filter words that don't meet the minimum length from the dataframe for term in vocab: if len(term) < minimum_length: del df[term] # Extract a dictionary of term count sums sums_dict = df.sum(axis=0).to_dict() # Create a new dataframe of sums and sort it by counts, then terms # Warning!!! This is not natsort. Multiple terms at the edge of # the maximum number of words limit may be cut off in abitrary # order. We need to implement natsort for dataframes. f = pd.DataFrame(list(sums_dict.items()), columns=['term', 'count']) f.sort_values(by=['count', 'term'], axis=0, ascending=[False, True], inplace=True) # Convert the dataframe head to a dict for use below f = f.head(n=max_num_words).to_dict() # Build the JSON object for d3.js termslist = [] countslist = [] children = [] for item in f['term'].items(): termslist.append(item[1]) for item in f['count'].items(): countslist.append(item[1]) for k, v in enumerate(termslist): children.append({"name": v, "size": str(countslist[k])}) json_obj = {"name": "tokens", "children": children} # Turn the JSON object into a JSON string for the front end json_str = json.dumps(json_obj) session_manager.cache_cloud_option() session_manager.cache_bubble_viz_option() return render_template( 'viz.html', JSONObj=json_str, labels=labels, itm="bubbleviz", numActiveDocs=num_active_docs)
def content_analysis(): """Handles the functionality on the contentanalysis page. :return: a response object (often a render_template call) to flask and eventually to the browser. """ analysis = ContentAnalysisModel() path = os.path.join(constants.TMP_FOLDER, constants.UPLOAD_FOLDER, session['id'], 'content_analysis/') if os.path.isdir(path): dictionary_names = [name for name in os.listdir(path)] else: dictionary_names = [] if request.method == 'GET': if 'dictionary_labels' in session: dict_labels = session['dictionary_labels'] else: dict_labels = [] if 'active_dictionaries' in session: active_dicts = session['active_dictionaries'] else: active_dicts = [True] * len(dict_labels) if 'toggle_all_value' in session: toggle_all_value = session['toggle_all_value'] else: toggle_all_value = True if 'formula' in session: formula = session['formula'] else: formula = "" return render_template('contentanalysis.html', dictionary_labels=dict_labels, active_dictionaries=active_dicts, toggle_all_value=toggle_all_value, itm="content-analysis", formula=formula) else: num_active_docs = detect_active_docs() active_dicts = ContentAnalysisReceiver().options_from_front_end( ).active_dicts dict_labels = ContentAnalysisReceiver().options_from_front_end( ).dict_labels session['formula'] = ContentAnalysisReceiver().options_from_front_end( ).formula if len(dict_labels) == 0: dict_labels = [os.path.splitext(dict_name)[0] for dict_name in dictionary_names] active_dicts = [True] * len(dict_labels) num_active_dicts = active_dicts.count(True) if num_active_docs == 0 and num_active_dicts == 0: return error("At least 1 active document and 1 active " "dictionary are required to perform a " "content analysis.") elif num_active_docs == 0: return error("At least 1 active document is required to perform " "a content analysis.") elif num_active_dicts == 0: return error("At least 1 active dictionary is required to perform" " a content analysis.") file_manager = load_file_manager() active_files = 
file_manager.get_active_files() for file in active_files: analysis.add_file(file_name=file.name, label=file.label, content=file.load_contents()) for dict_name, dict_label, active in zip(dictionary_names, dict_labels, active_dicts): if active: f = open(os.path.join(path, dict_name), "r") content = f.read() analysis.add_dictionary(file_name=dict_name, label=dict_label, content=content) result_table, corpus_raw_counts_table, files_raw_counts_tables,\ formula_errors = analysis.analyze() if len(formula_errors) != 0 or result_table is None: return error(formula_errors) data = {"result_table": result_table, "dictionary_labels": dict_labels, "active_dictionaries": active_dicts, "corpus_raw_counts_table": corpus_raw_counts_table, "files_raw_counts_tables": files_raw_counts_tables, "error": False} return json.dumps(data)
def manage(): """Handles the functionality of the select page. Its primary role is to activate/deactivate specific files depending on the user's input. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() # Usual loading of the FileManager file_manager = utility.load_file_manager() if request.method == "GET": rows = file_manager.get_previews_of_all() for row in rows: if row["state"]: row["state"] = "selected" else: row["state"] = "" return render_template( 'manage.html', rows=rows, itm="manage", numActiveDocs=num_active_docs) if 'previewTest' in request.headers: file_id = int(request.data) file_label = file_manager.files[file_id].label file_preview = file_manager.files[file_id].get_preview() preview_vals = { "id": file_id, "label": file_label, "previewText": file_preview} return json.dumps(preview_vals) if 'toggleFile' in request.headers: # Catch-all for any POST request. # On the select page, POSTs come from JavaScript AJAX XHRequests. 
file_id = int(request.data) # Toggle the file from active to inactive or vice versa file_manager.toggle_file(file_id) elif 'toggliFy' in request.headers: file_ids = request.data file_ids = file_ids.split(",") file_manager.disable_all() # Toggle the file from active to inactive or vice versa file_manager.enable_files(file_ids) elif 'setLabel' in request.headers: new_name = (request.headers['setLabel']) file_id = int(request.data) file_manager.files[file_id].set_name(new_name) file_manager.files[file_id].label = new_name elif 'setClass' in request.headers: new_class_label = (request.headers['setClass']) file_id = int(request.data) file_manager.files[file_id].set_class_label(new_class_label) elif 'disableAll' in request.headers: file_manager.disable_all() elif 'selectAll' in request.headers: file_manager.enable_all() elif 'applyClassLabel' in request.headers: file_manager.classify_active_files() elif 'deleteActive' in request.headers: file_manager.delete_active_files() elif 'deleteRow' in request.headers: # delete the file in request.form file_manager.delete_files(list(request.form.keys())) utility.save_file_manager(file_manager) return '' # Return an empty string because you have to return something
def word_cloud(): """Handles the functionality on the visualisation page. a prototype for displaying single word cloud graphs. :return: a response object (often a render_template call) to flask and eventually to the browser. """ # Detect the number of active documents. num_active_docs = detect_active_docs() file_manager = utility.load_file_manager() labels = file_manager.get_active_labels_with_id() from collections import OrderedDict labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1])) if request.method == "GET": # "GET" request occurs when the page is first loaded. if 'cloudoption' not in session: session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS # there is no wordcloud option so we don't initialize that return render_template( 'wordcloud.html', itm="word-cloud", labels=labels, numActiveDocs=num_active_docs) if request.method == "POST": # "POST" request occur when html form is submitted # (i.e. 'Get Dendrogram', 'Download...') # Get the file manager, sorted labels, and tokenization options file_manager = utility.load_file_manager() if 'analyoption' not in session: session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS token_type = session['analyoption']['tokenType'] token_size = int(session['analyoption']['tokenSize']) # Limit docs to those selected or to active docs chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')] active_docs = [] if chosen_doc_ids: for file_id in chosen_doc_ids: active_docs.append(file_id) else: for l_file in file_manager.files.values(): if l_file.active: active_docs.append(l_file.id) # Get the contents of all selected/active docs all_contents = [] for file_id in active_docs: if file_manager.files[file_id].active: content = file_manager.files[file_id].load_contents() all_contents.append(content) # Generate a DTM dtm, vocab = utility.simple_vectorizer( all_contents, token_type, token_size) # Convert the DTM to a pandas dataframe and save the sums import pandas as pd df = pd.DataFrame(dtm) df = 
df.sum(axis=0) # Build the JSON object for d3.js json_obj = {"name": "tokens", "children": []} for k, v in enumerate(vocab): json_obj["children"].append({"name": v, "size": str(df[k])}) # Create a list of column values for the word count table from operator import itemgetter terms = natsorted( json_obj["children"], key=itemgetter('size'), reverse=True) column_values = [] for term in terms: # rows = [term["name"].encode('utf-8'), term["size"]] rows = [term["name"], term["size"]] column_values.append(rows) # Turn the JSON object into a JSON string for the front end json_obj = json.dumps(json_obj) session_manager.cache_cloud_option() return render_template( 'wordcloud.html', labels=labels, JSONObj=json_obj, columnValues=column_values, itm="word-cloud", numActiveDocs=num_active_docs)
def k_means():
    """Handles the functionality on the kmeans page.

    It analyzes the various texts and displays the class label of the files.

    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    # Map of file id -> label for the active documents.
    # (The original re-assigned every label to itself in a loop; that was a
    # no-op and has been removed.)
    labels = file_manager.get_active_labels_with_id()
    # Default K is half the number of active documents (floor division is
    # identical to the original int(len/2) for non-negative counts).
    default_k = len(labels) // 2
    if request.method == 'GET':
        # 'GET' request occurs when the page is first loaded: seed any
        # missing session defaults and render an empty results page.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'kmeanoption' not in session:
            session['kmeanoption'] = constants.DEFAULT_KMEAN_OPTIONS
        return render_template(
            'kmeans.html',
            labels=labels,
            silhouettescore='',
            kmeansIndex=[],
            fileNameStr='',
            fileNumber=len(labels),
            KValue=0,
            defaultK=default_k,
            colorChartStr='',
            kmeansdatagenerated=False,
            itm="kmeans",
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # 'POST' request occur when html form is submitted
        # (i.e. 'Get Graphs', 'Download...')
        session_manager.cache_analysis_option()
        session_manager.cache_k_mean_option()
        utility.save_file_manager(file_manager)
        if request.form['viz'] == 'PCA':
            kmeans_index, silhouette_score, file_name_str, k_value, \
                color_chart_str = utility.generate_k_means_pca(file_manager)
            return render_template(
                'kmeans.html',
                labels=labels,
                silhouettescore=silhouette_score,
                kmeansIndex=kmeans_index,
                fileNameStr=file_name_str,
                fileNumber=len(labels),
                KValue=k_value,
                defaultK=default_k,
                colorChartStr=color_chart_str,
                kmeansdatagenerated=True,
                itm="kmeans",
                numActiveDocs=num_active_docs)
        elif request.form['viz'] == 'Voronoi':
            kmeans_index, silhouette_score, file_name_str, k_value, \
                color_chart_str, final_points_list, final_centroids_list, \
                text_data, max_x = \
                utility.generate_k_means_voronoi(file_manager)
            return render_template(
                'kmeans.html',
                labels=labels,
                silhouettescore=silhouette_score,
                kmeansIndex=kmeans_index,
                fileNameStr=file_name_str,
                fileNumber=len(labels),
                KValue=k_value,
                defaultK=default_k,
                colorChartStr=color_chart_str,
                finalPointsList=final_points_list,
                finalCentroidsList=final_centroids_list,
                textData=text_data,
                maxX=max_x,
                kmeansdatagenerated=True,
                itm="kmeans",
                numActiveDocs=num_active_docs)
def word_cloud():
    """Handles the functionality on the visualisation page.

    a prototype for displaying single word cloud graphs.
    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # NOTE(review): a function with this same name is also defined earlier
    # in this module; the later definition rebinds the name at import time —
    # confirm which copy is intended to serve the route.
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    # Map of file id -> label for the active documents, natsorted by label.
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    labels = OrderedDict(natsorted(list(labels.items()), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        # there is no wordcloud option so we don't initialize that
        return render_template(
            'wordcloud.html',
            labels=labels,
            itm="word-cloud",
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" request occur when html form is submitted
        # (i.e. 'Get Dendrogram', 'Download...')
        # Get the file manager, sorted labels, and tokenization options
        file_manager = utility.load_file_manager()
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        active_docs = []
        if chosen_doc_ids:
            for file_id in chosen_doc_ids:
                active_docs.append(file_id)
        else:
            for l_file in file_manager.files.values():
                if l_file.active:
                    active_docs.append(l_file.id)
        # Get the contents of all selected/active docs
        all_contents = []
        for file_id in active_docs:
            if file_manager.files[file_id].active:
                content = file_manager.files[file_id].load_contents()
                all_contents.append(content)
        # Generate a DTM (document-term matrix) and its vocabulary.
        dtm, vocab = utility.simple_vectorizer(
            all_contents, token_type, token_size)
        # Convert the DTM to a pandas dataframe and save the sums
        import pandas as pd
        df = pd.DataFrame(dtm)
        df = df.sum(axis=0)
        # Build the JSON object for d3.js
        json_obj = {"name": "tokens", "children": []}
        for k, v in enumerate(vocab):
            json_obj["children"].append({"name": v, "size": str(df[k])})
        # Create a list of column values for the word count table
        from operator import itemgetter
        terms = natsorted(
            json_obj["children"], key=itemgetter('size'), reverse=True)
        column_values = []
        for term in terms:
            # BUGFIX: the original called term["name"].encode('utf-8'),
            # which under Python 3 produces a bytes object that renders
            # as b'...' in the table; pass the str through unchanged.
            rows = [term["name"], term["size"]]
            column_values.append(rows)
        # Turn the JSON object into a JSON string for the front end
        json_obj = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        return render_template(
            'wordcloud.html',
            labels=labels,
            JSONObj=json_obj,
            columnValues=column_values,
            itm="word-cloud",
            numActiveDocs=num_active_docs)
def rolling_window():
    """Handles the functionality on the rollingwindow page.

    It analyzes the various texts using a rolling window of analysis.

    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Count the active documents and naturally sort their id -> label map.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    labels = OrderedDict(
        natsorted(
            list(file_manager.get_active_labels_with_id().items()),
            key=lambda pair: pair[1]))
    if request.method == "GET":
        # First page load: seed the session defaults and render an empty
        # analysis page with a blank legend.
        if 'rwoption' not in session:
            session['rwoption'] = constants.DEFAULT_ROLLINGWINDOW_OPTIONS
        return render_template(
            'rwanalysis.html',
            labels=labels,
            legendLabels=[""],
            rwadatagenerated=False,
            itm="rolling-windows",
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # Submit ("Get Graph") pressed: run the rolling-window analysis.
        data_points, data_list, graph_title, x_axis_label, y_axis_label, \
            legend_labels = utility.generate_rwa(file_manager)
        if 'get-RW-plot' in request.form:
            # The 'Graph Data' button is clicked on rollingwindow.html.
            save_path, file_extension = utility.generate_rw_matrix_plot(
                data_points, legend_labels)
            return send_file(
                save_path,
                attachment_filename="rollingwindow_matrix" + file_extension,
                as_attachment=True)
        if 'get-RW-data' in request.form:
            # The 'CSV Matrix' button is clicked on rollingwindow.html.
            save_path, file_extension = utility.generate_rw_matrix(data_list)
            return send_file(
                save_path,
                attachment_filename="rollingwindow_matrix" + file_extension,
                as_attachment=True)
        session_manager.cache_rw_analysis_option()
        # The graph counts as generated only when a window size was given;
        # every other template argument is identical either way, so the two
        # original render calls collapse into one.
        generated = session['rwoption']['rollingwindowsize'] != ''
        return render_template(
            'rwanalysis.html',
            labels=labels,
            data=data_points,
            graphTitle=graph_title,
            xAxisLabel=x_axis_label,
            yAxisLabel=y_axis_label,
            legendLabels=legend_labels,
            rwadatagenerated=generated,
            itm="rolling-windows",
            numActiveDocs=num_active_docs)
def manage():
    """Handles the functionality of the select page.

    Its primary role is to activate/deactivate specific files depending
    on the user's input.

    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    # Usual loading of the FileManager
    file_manager = utility.load_file_manager()
    if request.method == "GET":
        rows = file_manager.get_previews_of_all()
        # Translate each boolean state into the string the template expects.
        for row in rows:
            if row["state"]:
                row["state"] = "selected"
            else:
                row["state"] = ""
        return render_template(
            'manage.html',
            rows=rows,
            itm="manage",
            numActiveDocs=num_active_docs)
    if 'previewTest' in request.headers:
        # Return a JSON preview of a single file.
        file_id = int(request.data)
        file_label = file_manager.files[file_id].label
        file_preview = file_manager.files[file_id].get_preview()
        preview_vals = {
            "id": file_id,
            "label": file_label,
            "previewText": file_preview
        }
        return json.dumps(preview_vals)
    if 'toggleFile' in request.headers:
        # Catch-all for any POST request.
        # On the select page, POSTs come from JavaScript AJAX XHRequests.
        file_id = int(request.data)
        # Toggle the file from active to inactive or vice versa
        file_manager.toggle_file(file_id)
    elif 'toggliFy' in request.headers:
        # BUGFIX: request.data is bytes under Flask/Python 3; the original
        # called bytes.split(",") with a str separator, which raises
        # TypeError. Decode before splitting.
        file_ids = request.data.decode("utf-8")
        file_ids = file_ids.split(",")
        file_manager.disable_all()
        # Toggle the file from active to inactive or vice versa
        file_manager.enable_files(file_ids)
    elif 'setLabel' in request.headers:
        new_name = (request.headers['setLabel'])
        file_id = int(request.data)
        file_manager.files[file_id].set_name(new_name)
        file_manager.files[file_id].label = new_name
    elif 'setClass' in request.headers:
        new_class_label = (request.headers['setClass'])
        file_id = int(request.data)
        file_manager.files[file_id].set_class_label(new_class_label)
    elif 'disableAll' in request.headers:
        file_manager.disable_all()
    elif 'selectAll' in request.headers:
        file_manager.enable_all()
    elif 'applyClassLabel' in request.headers:
        file_manager.classify_active_files()
    elif 'deleteActive' in request.headers:
        file_manager.delete_active_files()
    elif 'deleteRow' in request.headers:
        # delete the file in request.form
        file_manager.delete_files(list(request.form.keys()))
    utility.save_file_manager(file_manager)
    return ''  # Return an empty string because you have to return something
def viz():
    """Handles the functionality on the alternate bubbleViz page.

    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    # Map of file id -> label for the active documents, natsorted by label.
    labels = file_manager.get_active_labels_with_id()
    from collections import OrderedDict
    from natsort import natsorted
    labels = OrderedDict(natsorted(labels.items(), key=lambda x: x[1]))
    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'cloudoption' not in session:
            session['cloudoption'] = constants.DEFAULT_CLOUD_OPTIONS
        if 'bubblevisoption' not in session:
            session['bubblevisoption'] = constants.DEFAULT_BUBBLEVIZ_OPTIONS
        return render_template(
            'viz.html',
            JSONObj="",
            labels=labels,
            itm="bubbleviz",
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        # "POST" request occur when html form is submitted
        # (i.e. 'Get Dendrogram', 'Download...')
        # Get the file manager, sorted labels, and tokenization options
        file_manager = utility.load_file_manager()
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        token_type = session['analyoption']['tokenType']
        token_size = int(session['analyoption']['tokenSize'])
        # Limit docs to those selected or to active docs
        chosen_doc_ids = [int(x) for x in request.form.getlist('segmentlist')]
        if chosen_doc_ids:
            active_docs = list(chosen_doc_ids)
        else:
            active_docs = [l_file.id
                           for l_file in file_manager.files.values()
                           if l_file.active]
        # Get the contents of all selected/active docs
        all_contents = []
        for file_id in active_docs:
            if file_manager.files[file_id].active:
                all_contents.append(
                    file_manager.files[file_id].load_contents())
        # Generate a DTM (document-term matrix) and its vocabulary.
        dtm, vocab = utility.simple_vectorizer(
            all_contents, token_type, token_size)
        # Convert the DTM to a pandas dataframe with the terms as column
        # headers
        import pandas as pd
        df = pd.DataFrame(dtm, columns=vocab)
        # Get the Minimum Token Length and Maximum Term Settings
        minimum_length = int(
            request.form['minlength']) if 'minlength' in request.form else 0
        # BUGFIX: the original assigned max_num_words only inside an
        # `if 'maxwords' in request.form` block, so a request without that
        # field raised NameError at f.head(n=max_num_words) below.
        # Default to 100, matching the empty-string fallback.
        max_num_words = 100
        if 'maxwords' in request.form and request.form['maxwords'] != "":
            max_num_words = int(request.form['maxwords'])
        # Filter words that don't meet the minimum length from the dataframe
        for term in vocab:
            if len(term) < minimum_length:
                del df[term]
        # Extract a dictionary of term count sums
        sums_dict = df.sum(axis=0).to_dict()
        # Create a new dataframe of sums and sort it by counts, then terms
        # Warning!!! This is not natsort. Multiple terms at the edge of
        # the maximum number of words limit may be cut off in abitrary
        # order. We need to implement natsort for dataframes.
        f = pd.DataFrame(list(sums_dict.items()), columns=['term', 'count'])
        f.sort_values(by=['count', 'term'], axis=0,
                      ascending=[False, True], inplace=True)
        # Convert the dataframe head to a dict for use below
        f = f.head(n=max_num_words).to_dict()
        # Build the JSON object for d3.js
        termslist = []
        countslist = []
        children = []
        for item in f['term'].items():
            termslist.append(item[1])
        for item in f['count'].items():
            countslist.append(item[1])
        for k, v in enumerate(termslist):
            children.append({"name": v, "size": str(countslist[k])})
        json_obj = {"name": "tokens", "children": children}
        # Turn the JSON object into a JSON string for the front end
        json_str = json.dumps(json_obj)
        session_manager.cache_cloud_option()
        session_manager.cache_bubble_viz_option()
        return render_template(
            'viz.html',
            JSONObj=json_str,
            labels=labels,
            itm="bubbleviz",
            numActiveDocs=num_active_docs)
def tokenizer():
    """Handles the functionality on the tokenizer page.

    GET renders the first page of the document-term matrix (DTM) as an HTML
    table string; POST either streams the matrix as a CSV download or
    answers the DataTables-style ajax request with a JSON page of the DTM.

    :return: a response object (often a render_template call) to flask and
     eventually to the browser.
    """
    # Use timeit to test peformance
    from timeit import default_timer as timer
    start_t = timer()
    print("Initialising GET request.")
    import pandas as pd
    from operator import itemgetter
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    if request.method == "GET":
        # Get the active labels and sort them
        labels = file_manager.get_active_labels_with_id()
        header_labels = []
        for file_id in labels:
            header_labels.append(file_manager.files[int(file_id)].label)
        header_labels = natsorted(header_labels)
        # Get the starting options from the session
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'csvoptions' not in session:
            session['csvoptions'] = constants.DEFAULT_CSV_OPTIONS
        csv_orientation = session['csvoptions']['csvorientation']
        csv_delimiter = session['csvoptions']['csvdelimiter']
        cull_number = session['analyoption']['cullnumber']
        token_type = session['analyoption']['tokenType']
        normalize_type = session['analyoption']['normalizeType']
        token_size = session['analyoption']['tokenSize']
        norm = session['analyoption']['norm']
        data = {
            'cullnumber': cull_number,
            'tokenType': token_type,
            'normalizeType': normalize_type,
            'csvdelimiter': csv_delimiter,
            'mfwnumber': '1',
            'csvorientation': csv_orientation,
            'tokenSize': token_size,
            'norm': norm
        }
        # If there are active documents, generate a DTM matrix
        if num_active_docs > 0:
            end_t = timer()
            elapsed = end_t - start_t
            print("before generateCSVMatrixFromAjax")
            print(elapsed)
            # Get the DTM with the session options and convert it to a list
            # of lists
            dtm = utility.generate_csv_matrix_from_ajax(
                data, file_manager, round_decimal=True)
            end_t = timer()
            elapsed = end_t - start_t
            print("after generateCSVMatrixFromAjax")
            print(elapsed)
            # Create a pandas dataframe with the correct orientation.
            # Convert it to a list of lists (matrix)
            if csv_orientation == "filerow":
                df = pd.DataFrame(dtm)
                # Create the matrix
                matrix = df.values.tolist()
            else:
                df = pd.DataFrame(dtm)
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame created.")
                print(elapsed)
                # Calculate the sums and averages.
                # NOTE(review): these are zero placeholders; the real values
                # are computed by the ajax POST request that re-renders the
                # table — confirm this is intentional.
                length = len(df.index)
                sums = [0] * (length - 1)
                sums.insert(0, "Total")
                averages = [0] * (length - 1)
                averages.insert(0, "Average")
                end_t = timer()
                elapsed = end_t - start_t
                print("Sum and averages calculated.")
                print(elapsed)
                # Concatenate the total and average columns to the dataframe
                df = pd.concat(
                    [df, pd.DataFrame(sums, columns=['Total'])], axis=1)
                df = pd.concat(
                    [df, pd.DataFrame(averages, columns=['Average'])],
                    axis=1)
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame modified.")
                print(elapsed)
                # Create the matrix
                matrix = df.values.tolist()
                matrix[0][0] = "Terms"
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame converted to matrix.")
                print(elapsed)
            # Save the column headers and remove them from the matrix.
            # (The original's "Prevent Unicode errors" loops re-assigned
            # each cell to itself; they were no-ops and have been removed.)
            columns = matrix[0]
            if csv_orientation == "filecolumn":
                columns[0] = "Terms"
            else:
                columns[0] = "Documents"
            del matrix[0]
            # Calculate the number of rows in the matrix
            records_total = len(matrix)
            # Sort the matrix by column 0
            matrix = natsorted(matrix, key=itemgetter(0), reverse=False)
            # Set the table length -- maximum 10 records for initial load.
            # BUGFIX: the original sliced to records_total - 1 (dropping the
            # last record when there were <= 10) or to 9 rows; both were off
            # by one relative to the documented 10-row limit.
            matrix = matrix[0:10]
            # escape all the html character in matrix
            matrix = [[general_functions.html_escape(row[0])] + row[1:]
                      for row in matrix]
            # escape all the html character in columns
            columns = [general_functions.html_escape(item)
                       for item in columns]
            # The first 10 rows are sent to the template as an HTML string.
            # After the template renders, an ajax request fetches new data
            # to re-render the table with the correct number of rows.
            # Create the columns string
            cols = "<tr>"
            for s in columns:
                cols += "<th>" + str(s) + "</th>"
            cols += "</tr>"
            # Create the rows string
            rows = ""
            for l in matrix:
                row = "<tr>"
                for s in l:
                    row += "<td>" + str(s) + "</td>"
                row += "</tr>"
                rows += row
        # Catch instances where there is no active document (triggers the
        # error modal)
        else:
            cols = "<tr><th>Terms</th></tr>"
            rows = "<tr><td></td></tr>"
            records_total = 0
        # Render the template
        end_t = timer()
        elapsed = end_t - start_t
        print("Matrix generated. Rendering template.")
        print(elapsed)
        return render_template(
            'tokenizer.html',
            draw=1,
            itm="tokenize",
            labels=labels,
            headers=header_labels,
            columns=cols,
            rows=rows,
            numRows=records_total,
            orientation=csv_orientation,
            numActiveDocs=num_active_docs)
    if request.method == "POST":
        end_t = timer()
        elapsed = end_t - start_t
        print("POST received.")
        print(elapsed)
        session_manager.cache_analysis_option()
        session_manager.cache_csv_options()
        if 'get-csv' in request.form:
            # The 'Download Matrix' button is clicked on tokenizer.html.
            save_path, file_extension = utility.generate_csv(file_manager)
            utility.save_file_manager(file_manager)
            return send_file(
                save_path,
                attachment_filename="frequency_matrix" + file_extension,
                as_attachment=True)
        else:
            # Get the active labels and sort them
            labels = file_manager.get_active_labels_with_id()
            header_labels = []
            for file_id in labels:
                header_labels.append(file_manager.files[int(file_id)].label)
            # Get the Tokenizer options from the request json object
            length = int(request.json["length"])
            # Increment for the ajax response
            draw = int(request.json["draw"]) + 1
            search = request.json["search"]
            order = str(request.json["order"][1])
            sort_column = int(request.json["order"][0])
            csv_orientation = request.json["csvorientation"]
            # Set the sorting order
            reverse = order == "desc"
            # Get the DTM with the requested options and convert it to a
            # list of lists
            dtm = utility.generate_csv_matrix_from_ajax(
                request.json, file_manager, round_decimal=True)
            end_t = timer()
            elapsed = end_t - start_t
            print("DTM received.")
            print(elapsed)
            if csv_orientation == "filerow":
                dtm[0][0] = "Documents"
                df = pd.DataFrame(dtm)
                footer_stats = df.drop(df.index[[0]], axis=0)
                footer_stats = footer_stats.drop(df.index[[0]], axis=1)
                footer_totals = footer_stats.sum().tolist()
                footer_totals = [round(total, 4) for total in footer_totals]
                footer_averages = footer_stats.mean().tolist()
                footer_averages = [round(ave, 4) for ave in footer_averages]
                sums = ["Total"]
                averages = ["Average"]
                # BUGFIX: the original reused `length` for the row count,
                # clobbering the requested page length from the ajax request
                # and breaking the `length == -1` check and the echoed
                # "length" field below. Use a separate local instead.
                num_index_rows = len(df.index)
                for i in range(0, num_index_rows):
                    if i > 0:
                        rounded_sum = round(df.iloc[i][1:].sum(), 4)
                        sums.append(rounded_sum)
                        rounded_ave = round(df.iloc[i][1:].mean(), 4)
                        averages.append(rounded_ave)
                df = pd.concat(
                    [df, pd.DataFrame(sums, columns=['Total'])], axis=1)
                df = pd.concat(
                    [df, pd.DataFrame(averages, columns=['Average'])],
                    axis=1)
                # Populate the sum of sums and average of averages cells
                sum_of_sums = df['Total'].tolist()
                num_rows = len(df['Total'].tolist())
                num_rows = num_rows - 1
                sum_of_sums = sum(sum_of_sums[1:])
                sum_of_ave = df['Average'].tolist()
                sum_of_ave = sum(sum_of_ave[1:])
                footer_totals.append(round(sum_of_sums, 4))
                footer_totals.append(round(sum_of_ave, 4))
                ave_of_sums = sum_of_sums / num_rows
                ave_of_aves = ave_of_sums / num_rows
                footer_averages.append(round(ave_of_sums, 4))
                footer_averages.append(round(ave_of_aves, 4))
                # Change the DataFrame to a list
                matrix = df.values.tolist()
                # Save the column headers and remove them from the matrix
                columns = natsorted(matrix[0][1:-2])
                columns.insert(0, "Documents")
                columns.append("Total")
                columns.append("Average")
                del matrix[0]
            else:
                df = pd.DataFrame(dtm)
                end_t = timer()
                elapsed = end_t - start_t
                print("DTM created. Calculating footer stats")
                print(elapsed)
                footer_stats = df.drop(df.index[[0]], axis=0)
                footer_stats = footer_stats.drop(df.index[[0]], axis=1)
                footer_totals = footer_stats.sum().tolist()
                footer_totals = [round(total, 4) for total in footer_totals]
                footer_averages = footer_stats.mean().tolist()
                footer_averages = [round(ave, 4) for ave in footer_averages]
                end_t = timer()
                elapsed = end_t - start_t
                print("Footer stats calculated. "
                      "Calculating totals and averages...")
                print(elapsed)
                # Per-row totals and averages via nested loops
                sums = []
                averages = []
                n_rows = len(df.index)
                # all rows are the same, so picking any row
                n_cols = len(df.iloc[1])
                for i in range(1, n_rows):
                    row_total = 0
                    for j in range(1, n_cols):
                        row_total += df.iloc[i][j]
                    sums.append(round(row_total, 4))
                    averages.append(round((row_total / (n_cols - 1)), 4))
                sums.insert(0, "Total")
                averages.insert(0, "Average")
                end_t = timer()
                elapsed = end_t - start_t
                print("Totals and averages calculated. Appending columns...")
                print(elapsed)
                # This seems to be the bottleneck
                df['Total'] = sums
                df['Average'] = averages
                end_t = timer()
                elapsed = end_t - start_t
                print("Populating columns with rounded values.")
                print(elapsed)
                # Populate the sum of sums and average of averages cells
                sum_of_sums = df['Total'].tolist()
                num_rows = len(df['Total'].tolist())
                num_rows = num_rows - 1
                sum_of_sums = sum(sum_of_sums[1:])
                sum_of_ave = df['Average'].tolist()
                sum_of_ave = sum(sum_of_ave[1:])
                footer_totals.append(round(sum_of_sums, 4))
                footer_totals.append(round(sum_of_ave, 4))
                ave_of_sums = sum_of_sums / num_rows
                ave_of_aves = ave_of_sums / num_rows
                footer_averages.append(round(ave_of_sums, 4))
                footer_averages.append(round(ave_of_aves, 4))
                end_t = timer()
                elapsed = end_t - start_t
                print("Rounded values added.")
                print(elapsed)
                matrix = df.values.tolist()
                matrix[0][0] = "Terms"
                # Save the column headers and remove them from the matrix
                columns = natsorted(matrix[0])
                if csv_orientation == "filecolumn":
                    columns[0] = "Terms"
                else:
                    columns[0] = "Documents"
                del matrix[0]
            # Code for both orientations
            end_t = timer()
            elapsed = end_t - start_t
            print("Starting common code.")
            print(elapsed)
            # Calculate the number of rows in the matrix
            records_total = len(matrix)
            # Sort and Filter the cached DTM by column
            if len(search) != 0:
                matrix = [x for x in matrix if x[0].startswith(search)]
            matrix = natsorted(
                matrix, key=itemgetter(sort_column), reverse=reverse)
            # Get the number of filtered rows
            records_filtered = len(matrix)
            # Set the table length
            if length == -1:
                matrix = matrix[0:]
            else:
                start_index = int(request.json["start"])
                end_index = int(request.json["end"])
                matrix = matrix[start_index:end_index]
            # Correct the footer rows
            footer_totals = [float(Decimal("%.4f" % e))
                             for e in footer_totals]
            footer_averages = [float(Decimal("%.4f" % e))
                               for e in footer_averages]
            footer_totals.insert(0, "Total")
            footer_averages.insert(0, "Average")
            footer_totals.append("")
            footer_averages.append("")
            response = {
                "draw": draw,
                "records_total": records_total,
                "records_filtered": records_filtered,
                "length": int(length),
                "columns": columns,
                "data": matrix,
                "totals": footer_totals,
                "averages": footer_averages
            }
            end_t = timer()
            elapsed = end_t - start_t
            print("Returning table data to the browser.")
            print(elapsed)
            return json.dumps(response)