Example #1
 def test_all(self):
     assert html_escape('&"\'><') == '&amp;&quot;&apos;&gt;&lt;'
     assert html_escape("<html lang='en'></html>") == '&lt;html lang=&apo' \
                                                      's;en&apos;&gt;&lt;' \
                                                      '/html&gt;'
     assert html_escape('<html lang="en"></html>') == '&lt;html lang=&quo' \
                                                      't;en&quot;&gt;&lt;' \
                                                      '/html&gt;'
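The assertions in this example pin down html_escape's behaviour for the five special characters. For reference, here is a minimal sketch of an implementation that satisfies these tests; it is an illustration only, not necessarily the project's actual general_functions.html_escape. The key detail is that '&' must be replaced first so the ampersands introduced by the later entities are not escaped twice.

def html_escape(text):
    # Replace '&' first so the '&' characters introduced by the other
    # entities below are not escaped a second time.
    return (text.replace('&', '&amp;')
                .replace('"', '&quot;')
                .replace("'", '&apos;')
                .replace('>', '&gt;')
                .replace('<', '&lt;'))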
Example #2
 def test_all(self):
     assert html_escape('&"\'><') == '&amp;&quot;&apos;&gt;&lt;'
     assert html_escape("<html lang='en'></html>") == '&lt;html lang=&apo' \
                                                      's;en&apos;&gt;&lt;' \
                                                      '/html&gt;'
     assert html_escape('<html lang="en"></html>') == '&lt;html lang=&quo' \
                                                      't;en&quot;&gt;&lt;' \
                                                      '/html&gt;'
Example #3
def do_scrubbing():
    """:return: a json object with a scrubbed preview
    """
    file_manager = utility.load_file_manager()
    # The 'Preview Scrubbing' or 'Apply Scrubbing' button is clicked on
    # scrub.html.
    session_manager.cache_alteration_files()
    session_manager.cache_scrub_options()
    # Save changes only if the 'Apply Scrubbing' button is clicked.
    saving_changes = request.form["formAction"] == "apply"
    # preview_info is a tuple of (id, file_name(label), class_label, preview)
    previews = file_manager.scrub_files(saving_changes=saving_changes)
    # Escape the HTML elements; only preview[3] is transformed because it
    # holds the text:
    previews = [
        [preview[0], preview[1], preview[2],
         general_functions.html_escape(preview[3])] for preview in previews]
    if saving_changes:
        utility.save_file_manager(file_manager)
    data = {"data": previews}
    data = json.dumps(data)
    return data
Example #4
def do_scrubbing():
    """:return: a json object with a scrubbed preview
    """
    file_manager = utility.load_file_manager()
    # The 'Preview Scrubbing' or 'Apply Scrubbing' button is clicked on
    # scrub.html.
    session_manager.cache_alteration_files()
    session_manager.cache_scrub_options()
    # Save changes only if the 'Apply Scrubbing' button is clicked.
    saving_changes = request.form["formAction"] == "apply"
    # preview_info is a tuple of (id, file_name(label), class_label, preview)
    previews = file_manager.scrub_files(saving_changes=saving_changes)
    # Escape the HTML elements; only preview[3] is transformed because it
    # holds the text:
    previews = [[
        preview[0], preview[1], preview[2],
        general_functions.html_escape(preview[3])
    ] for preview in previews]
    if saving_changes:
        utility.save_file_manager(file_manager)
    data = {"data": previews}
    data = json.dumps(data)
    return data
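To separate the escaping step from the surrounding Flask plumbing, the snippet below is a self-contained sketch of what the list comprehension in do_scrubbing does. The sample previews and the html_escape sketch from Example #1 are assumptions made for illustration; in the project the previews come from file_manager.scrub_files and the function from general_functions.

# Each preview tuple is (id, file_name/label, class_label, preview text).
previews = [
    (1, "doc1.txt", "classA", "<p>scrubbed text</p>"),
    (2, "doc2.txt", "classB", 'she said "hi" & left'),
]
# Only the fourth element holds text, so only it is escaped.
escaped = [
    [file_id, label, class_label, html_escape(preview)]
    for file_id, label, class_label, preview in previews
]
# escaped[0][3] == '&lt;p&gt;scrubbed text&lt;/p&gt;'
# escaped[1][3] == 'she said &quot;hi&quot; &amp; left'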
Example #5
 def test_lt(self):
     assert html_escape('<') == "&lt;"
Example #6
 def test_gt(self):
     assert html_escape('>') == "&gt;"
Example #7
 def test_apos(self):
     assert html_escape("'") == "&apos;"
Example #8
 def test_quot(self):
     assert html_escape('"') == "&quot;"
Example #9
 def test_amp(self):
     assert html_escape('&') == "&amp;"
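Examples #5 through #9 exercise one character each. If pytest is the test runner (an assumption; the class-based style above also works under pytest), the same coverage can be written as a single parametrized case:

import pytest

# html_escape is assumed to be importable from the module under test,
# e.g. the sketch shown after Example #1.
@pytest.mark.parametrize("raw, escaped", [
    ('<', "&lt;"),
    ('>', "&gt;"),
    ("'", "&apos;"),
    ('"', "&quot;"),
    ('&', "&amp;"),
])
def test_single_character_escapes(raw, escaped):
    assert html_escape(raw) == escaped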
Example #10
def tokenizer():
    """Handles the functionality on the tokenizer page.

    :return: a response object (often a render_template call) to flask and
    eventually to the browser.
    """
    # Use timeit to test performance
    from timeit import default_timer as timer
    start_t = timer()
    print("Initialising GET request.")
    import pandas as pd
    from operator import itemgetter
    # Detect the number of active documents.
    num_active_docs = detect_active_docs()
    file_manager = utility.load_file_manager()
    if request.method == "GET":
        # Get the active labels and sort them
        labels = file_manager.get_active_labels_with_id()
        header_labels = []
        for fileID in labels:
            header_labels.append(file_manager.files[int(fileID)].label)
        header_labels = natsorted(header_labels)
        # Get the starting options from the session
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALYZE_OPTIONS
        if 'csvoptions' not in session:
            session['csvoptions'] = constants.DEFAULT_CSV_OPTIONS
        csv_orientation = session['csvoptions']['csvorientation']
        csv_delimiter = session['csvoptions']['csvdelimiter']
        cull_number = session['analyoption']['cullnumber']
        token_type = session['analyoption']['tokenType']
        normalize_type = session['analyoption']['normalizeType']
        token_size = session['analyoption']['tokenSize']
        norm = session['analyoption']['norm']
        data = {
            'cullnumber': cull_number,
            'tokenType': token_type,
            'normalizeType': normalize_type,
            'csvdelimiter': csv_delimiter,
            'mfwnumber': '1',
            'csvorientation': csv_orientation,
            'tokenSize': token_size,
            'norm': norm
        }
        # If there are active documents, generate a DTM matrix
        if num_active_docs > 0:
            end_t = timer()
            elapsed = end_t - start_t
            print("before generateCSVMatrixFromAjax")
            print(elapsed)
            # Get the DTM with the session options and convert it to a list of
            # lists
            dtm = utility.generate_csv_matrix_from_ajax(data,
                                                        file_manager,
                                                        round_decimal=True)
            end_t = timer()
            elapsed = end_t - start_t
            print("after generateCSVMatrixFromAjax")
            print(elapsed)
            # Print the first five rows for testing
            # print dtm[0:5]
            # #dtm[0] += (0,0,)
            # for i,row in enumerate(dtm[1:]):
            #     dtm[i+1] += (0,0,)
            # print dtm[0:5]
            # Create a pandas dataframe with the correct orientation.
            # Convert it to a list of lists (matrix)
            if csv_orientation == "filerow":
                df = pd.DataFrame(dtm)
                # Create the matrix
                matrix = df.values.tolist()
            else:
                df = pd.DataFrame(dtm)
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame created.")
                print(elapsed)
                # Calculate the sums and averages
                length = len(df.index)
                sums = [0] * (length - 1)
                sums.insert(0, "Total")
                averages = [0] * (length - 1)
                averages.insert(0, "Average")
                end_t = timer()
                elapsed = end_t - start_t
                print("Sum and averages calculated.")
                print(elapsed)
                # Concatenate the total and average columns to the dataframe
                df = pd.concat([df, pd.DataFrame(sums, columns=['Total'])],
                               axis=1)
                df = pd.concat(
                    [df, pd.DataFrame(averages, columns=['Average'])], axis=1)
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame modified.")
                print(elapsed)
                # Create the matrix
                matrix = df.values.tolist()
                matrix[0][0] = "Terms"
                end_t = timer()
                elapsed = end_t - start_t
                print("DataFrame converted to matrix.")
                print(elapsed)
            # Prevent Unicode errors in column headers
            for i, v in enumerate(matrix[0]):
                matrix[0][i] = v
            # Save the column headers and remove them from the matrix
            # columns = natsorted(matrix[0])
            columns = matrix[0]
            if csv_orientation == "filecolumn":
                columns[0] = "Terms"
            else:
                columns[0] = "Documents"
            del matrix[0]
            # Prevent Unicode errors in the row headers
            for i, v in enumerate(matrix):
                matrix[i][0] = v[0]
            # Calculate the number of rows in the matrix
            records_total = len(matrix)
            # Sort the matrix by column 0
            matrix = natsorted(matrix, key=itemgetter(0), reverse=False)
            # Set the table length -- at most 10 records for the initial load
            matrix = matrix[0:10]
            # Escape all the HTML characters in the matrix
            matrix = [[general_functions.html_escape(row[0])] + row[1:]
                      for row in matrix]
            # Escape all the HTML characters in the columns
            columns = [general_functions.html_escape(item) for item in columns]
            # The first 10 rows are sent to the template as an HTML string.
            # After the template renders, an ajax request fetches new data
            # to re-render the table with the correct number of rows.
            # Create the columns string
            cols = "<tr>"
            for s in columns:
                cols += "<th>" + str(s) + "</th>"
            cols += "</tr>"
            # Create the rows string
            rows = ""
            for l in matrix:
                row = "<tr>"
                for s in l:
                    row += "<td>" + str(s) + "</td>"
                row += "</tr>"
                rows += row
        # Catch instances where there is no active document (triggers the error
        # modal)
        else:
            cols = "<tr><th>Terms</th></tr>"
            rows = "<tr><td></td></tr>"
            records_total = 0
        # Render the template
        end_t = timer()
        elapsed = end_t - start_t
        print("Matrix generated. Rendering template.")
        print(elapsed)
        return render_template('tokenizer.html',
                               draw=1,
                               itm="tokenize",
                               labels=labels,
                               headers=header_labels,
                               columns=cols,
                               rows=rows,
                               numRows=records_total,
                               orientation=csv_orientation,
                               numActiveDocs=num_active_docs)
    if request.method == "POST":
        end_t = timer()
        elapsed = end_t - start_t
        print("POST received.")
        print(elapsed)
        session_manager.cache_analysis_option()
        session_manager.cache_csv_options()
        if 'get-csv' in request.form:
            # The 'Download Matrix' button is clicked on tokenizer.html.
            save_path, file_extension = utility.generate_csv(file_manager)
            utility.save_file_manager(file_manager)
            return send_file(save_path,
                             attachment_filename="frequency_matrix" +
                             file_extension,
                             as_attachment=True)
        else:
            # Get the active labels and sort them
            labels = file_manager.get_active_labels_with_id()
            header_labels = []
            for fileID in labels:
                header_labels.append(file_manager.files[int(fileID)].label)
            # Get the Tokenizer options from the request json object
            length = int(request.json["length"])
            # Increment for the ajax response
            draw = int(request.json["draw"]) + 1
            search = request.json["search"]
            order = str(request.json["order"][1])
            sort_column = int(request.json["order"][0])
            csv_orientation = request.json["csvorientation"]
            # Set the sorting order
            reverse = order == "desc"
            # Get the DTM with the requested options and convert it to a list
            # of lists
            dtm = utility.generate_csv_matrix_from_ajax(request.json,
                                                        file_manager,
                                                        round_decimal=True)
            end_t = timer()
            elapsed = end_t - start_t
            print("DTM received.")
            print(elapsed)
            if csv_orientation == "filerow":
                dtm[0][0] = "Documents"
                df = pd.DataFrame(dtm)
                footer_stats = df.drop(df.index[[0]], axis=0)
                footer_stats = footer_stats.drop(df.index[[0]], axis=1)
                footer_totals = footer_stats.sum().tolist()
                footer_totals = [round(total, 4) for total in footer_totals]
                footer_averages = footer_stats.mean().tolist()
                footer_averages = [round(ave, 4) for ave in footer_averages]
                sums = ["Total"]
                averages = ["Average"]
                # Discrepancy--this is used for tokenize/POST
                length = len(df.index)
                for i in range(0, length):
                    if i > 0:
                        rounded_sum = round(df.iloc[i][1:].sum(), 4)
                        sums.append(rounded_sum)
                        rounded_ave = round(df.iloc[i][1:].mean(), 4)
                        averages.append(rounded_ave)
                df = pd.concat([df, pd.DataFrame(sums, columns=['Total'])],
                               axis=1)
                df = pd.concat(
                    [df, pd.DataFrame(averages, columns=['Average'])], axis=1)
                # Populate the sum of sums and average of averages cells
                sum_of_sums = df['Total'].tolist()
                num_rows = len(df['Total'].tolist())
                num_rows = num_rows - 1
                sum_of_sums = sum(sum_of_sums[1:])
                sum_of_ave = df['Average'].tolist()
                sum_of_ave = sum(sum_of_ave[1:])
                footer_totals.append(round(sum_of_sums, 4))
                footer_totals.append(round(sum_of_ave, 4))
                ave_of_sums = sum_of_sums / num_rows
                ave_of_aves = ave_of_sums / num_rows
                footer_averages.append(round(ave_of_sums, 4))
                footer_averages.append(round(ave_of_aves, 4))
                # Change the DataFrame to a list
                matrix = df.values.tolist()
                # Prevent Unicode errors in column headers
                for i, v in enumerate(matrix[0]):
                    matrix[0][i] = v
                # Save the column headers and remove them from the matrix
                columns = natsorted(matrix[0][1:-2])
                columns.insert(0, "Documents")
                columns.append("Total")
                columns.append("Average")
                del matrix[0]
            else:
                df = pd.DataFrame(dtm)
                # print(df[0:3])
                end_t = timer()
                elapsed = end_t - start_t
                print("DTM created. Calculating footer stats")
                print(elapsed)
                footer_stats = df.drop(df.index[[0]], axis=0)
                # print(footer_stats[0:3])
                footer_stats = footer_stats.drop(df.index[[0]], axis=1)
                footer_totals = footer_stats.sum().tolist()
                footer_totals = [round(total, 4) for total in footer_totals]
                footer_averages = footer_stats.mean().tolist()
                footer_averages = [round(ave, 4) for ave in footer_averages]
                end_t = timer()
                elapsed = end_t - start_t
                print("Footer stats calculated. "
                      "Calculating totals and averages...")
                print(elapsed)
                # Compute the row totals and averages with nested for loops
                sums = []
                averages = []
                n_rows = len(df.index)
                # Every row has the same length, so any row gives the column count
                n_cols = len(df.iloc[1])
                for i in range(1, n_rows):
                    row_total = 0
                    for j in range(1, n_cols):
                        row_total += df.iloc[i][j]
                    sums.append(round(row_total, 4))
                    averages.append(round((row_total / (n_cols - 1)), 4))
                sums.insert(0, "Total")
                averages.insert(0, "Average")
                end_t = timer()
                elapsed = end_t - start_t
                print("Totals and averages calculated. Appending columns...")
                print(elapsed)
                # This seems to be the bottleneck
                df['Total'] = sums
                df['Average'] = averages
                end_t = timer()
                elapsed = end_t - start_t
                print("Populating columns with rounded values.")
                print(elapsed)
                # Populate the sum of sums and average of averages cells
                sum_of_sums = df['Total'].tolist()
                num_rows = len(df['Total'].tolist())
                num_rows = num_rows - 1
                sum_of_sums = sum(sum_of_sums[1:])
                sum_of_ave = df['Average'].tolist()
                sum_of_ave = sum(sum_of_ave[1:])
                footer_totals.append(round(sum_of_sums, 4))
                footer_totals.append(round(sum_of_ave, 4))
                ave_of_sums = sum_of_sums / num_rows
                ave_of_aves = ave_of_sums / num_rows
                footer_averages.append(round(ave_of_sums, 4))
                footer_averages.append(round(ave_of_aves, 4))
                end_t = timer()
                elapsed = end_t - start_t
                print("Rounded values added.")
                print(elapsed)
                matrix = df.values.tolist()
                matrix[0][0] = "Terms"
                # Prevent Unicode errors in column headers
                for i, v in enumerate(matrix[0]):
                    matrix[0][i] = v
                # Save the column headers and remove them from the matrix
                columns = natsorted(matrix[0])
                if csv_orientation == "filecolumn":
                    columns[0] = "Terms"
                else:
                    columns[0] = "Documents"
                del matrix[0]
        # Code for both orientations #
        end_t = timer()
        elapsed = end_t - start_t
        print("Starting common code.")
        print(elapsed)
        # Prevent Unicode errors in the row headers
        for i, v in enumerate(matrix):
            matrix[i][0] = v[0]
        # Calculate the number of rows in the matrix
        records_total = len(matrix)
        # Sort and Filter the cached DTM by column
        if len(search) != 0:
            matrix = [x for x in matrix if x[0].startswith(search)]
            matrix = natsorted(matrix,
                               key=itemgetter(sort_column),
                               reverse=reverse)
        else:
            matrix = natsorted(matrix,
                               key=itemgetter(sort_column),
                               reverse=reverse)
        # Get the number of filtered rows
        records_filtered = len(matrix)
        # Set the table length
        if length == -1:
            matrix = matrix[0:]
        else:
            start_index = int(request.json["start"])
            end_index = int(request.json["end"])
            matrix = matrix[start_index:end_index]
        # Correct the footer rows
        footer_totals = [float(Decimal("%.4f" % e)) for e in footer_totals]
        footer_averages = [float(Decimal("%.4f" % e)) for e in footer_averages]
        footer_totals.insert(0, "Total")
        footer_averages.insert(0, "Average")
        footer_totals.append("")
        footer_averages.append("")
        response = {
            "draw": draw,
            "records_total": records_total,
            "records_filtered": records_filtered,
            "length": int(length),
            "columns": columns,
            "data": matrix,
            "totals": footer_totals,
            "averages": footer_averages
        }
        end_t = timer()
        elapsed = end_t - start_t
        print("Returning table data to the browser.")
        print(elapsed)
        return json.dumps(response)
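The tokenizer view uses the same escape-then-concatenate pattern before it builds the <th>/<td> strings. The stripped-down sketch below uses made-up labels and the html_escape sketch from Example #1; as in the code above, every column header is escaped, but only the first cell of each row (the label) is.

columns = ["Terms", "<script>alert(1)</script>", "word&count"]
matrix = [["<b>doc1</b>", 3, 1], ["doc2", 0, 2]]

# Escape the user-controlled labels before wrapping them in table markup.
columns = [html_escape(str(c)) for c in columns]
matrix = [[html_escape(row[0])] + row[1:] for row in matrix]

# Concatenate the escaped values into the header and body strings that the
# template receives.
cols = "<tr>" + "".join("<th>" + str(c) + "</th>" for c in columns) + "</tr>"
rows = "".join(
    "<tr>" + "".join("<td>" + str(cell) + "</td>" for cell in row) + "</tr>"
    for row in matrix)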
Example #11
 def test_lt(self):
     assert html_escape('<') == "&lt;"
Example #12
 def test_gt(self):
     assert html_escape('>') == "&gt;"
Example #13
 def test_apos(self):
     assert html_escape("'") == "&apos;"
Example #14
 def test_quot(self):
     assert html_escape('"') == "&quot;"
Example #15
 def test_amp(self):
     assert html_escape('&') == "&amp;"