def write_basic_spreadsheet(df: pd.DataFrame, fpath: Path, column_widths: dict,
                            columns_to_center: List[str]):
    """Write ``df`` to a one-sheet workbook (sheet name 'Basic') at ``fpath``.

    Missing values are written as empty strings, the workbook default font
    size is raised to 14, every column gets a width (and optionally the
    centered format), the first row is frozen and the header row is rewritten
    with the HEADER format.

    Args:
        df: Data to export. Nothing is written when it is empty.
        fpath: Destination path of the .xlsx file.
        column_widths: Maps a column name to its width. Columns that are not
            listed get a width of 14. ``None`` is treated as "no overrides".
        columns_to_center: Names of the columns that use the CENTER format.
            ``None`` means no column is centered.

    Returns:
        None in all cases.
    """
    if df.empty:
        return None

    df = df.fillna('')
    default_column_width = 14  # if not specified in column_widths
    # Both formatting arguments are optional in practice; normalise them once
    # so the per-column code below does not need to special-case None.
    widths = column_widths if column_widths is not None else {}
    centered = columns_to_center if columns_to_center is not None else ()
    xsheet_name = 'Basic'

    with pd.ExcelWriter(fpath.as_posix(), engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        worksheet = writer.sheets[xsheet_name]
        xlfmts = add_workbook_formats(workbook)

        # https://stackoverflow.com/questions/43991505/xlsxwriter-set-global-font-size
        workbook.formats[0].set_font_size(14)  # to make it readable when printed

        # ----------------------------------------------
        # Collect per-column formatting, keyed by column name in sheet order.
        excel_letters = excel_columns()
        col_infos = {}
        for ix, col in enumerate(df.columns):
            col_infos[col] = {
                'format': xlfmts[Xlformat.CENTER] if col in centered else None,
                'width': widths.get(col, default_column_width),
                'xl_col_letter': excel_letters[ix],
            }

        # Set the column widths and formats using single-column ranges, e.g. 'C:C'.
        for col_info in col_infos.values():
            xl_col_letter = col_info['xl_col_letter']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}',
                                 col_info['width'], col_info['format'])

        # Make the sheet banded
        make_sheet_banded(worksheet, df)
        worksheet.freeze_panes(1, 0)  # Freeze the first row.

        # Rewrite the column headers with the HEADER format. The position in
        # col_infos is the sheet column, so no per-column index search is needed.
        header_fmt = xlfmts[Xlformat.HEADER]
        for col_num, col in enumerate(col_infos):
            worksheet.write(0, col_num, col, header_fmt)
def write_possible_translations_spreadsheet(translations_df, translations_xl_path):
    """Write the possible-translations table to a formatted workbook.

    The sheet is named 'Possible Translations'. Column widths come from a
    fixed positional list (seven leading columns followed by one 30/10 pair
    per additional alternative, driven by ``MAX_SPECIES_ALTERNATIVES``).
    Columns whose name starts with 'lev', plus 'match_whole_line', 'regex'
    and 'circle', use the CENTER format.

    Args:
        translations_df: Table to export. Nothing is written when it is empty.
        translations_xl_path: Destination of the .xlsx file; passed unchanged
            to ``pd.ExcelWriter``.

    Returns:
        None in all cases.
    """
    if translations_df.empty:
        return None

    # LocalSpeciesName eBirdSpeciesName levd match_whole_line regex circle AltName1 Lev2 AltName2
    col_widths = [50, 50, 12, 16, 12, 12, 30]
    for _ in range(MAX_SPECIES_ALTERNATIVES - 1):
        col_widths.extend([30, 10])

    xsheet_name = 'Possible Translations'

    center_cols = [
        col for col in translations_df.columns if col.startswith('lev')
    ]
    center_cols.extend(['match_whole_line', 'regex', 'circle'])

    with pd.ExcelWriter(translations_xl_path, engine='xlsxwriter') as writer:
        translations_df.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)
        excel_letters = excel_columns()
        worksheet = writer.sheets[xsheet_name]

        col_vals = translations_df.columns.values
        column_count = len(col_vals)

        # Set the column width and format.
        for ix, wid in enumerate(col_widths):
            col_letter = excel_letters[ix]
            # The width list is fixed, so it can be longer than the frame.
            # Positions past the last real column only get a width; looking
            # up their (non-existent) name would raise IndexError.
            is_centered = ix < column_count and col_vals[ix] in center_cols
            fmt = xlfmts[Xlformat.CENTER] if is_centered else None
            worksheet.set_column(f'{col_letter}:{col_letter}', wid, fmt)

        make_sheet_banded(worksheet, translations_df)

        # Write the column headers with the defined format.
        header_fmt = xlfmts[Xlformat.HEADER]
        for col_num, value in enumerate(col_vals):
            worksheet.write(0, col_num, value, header_fmt)
def write_categorized_lines_spreadsheet(df: pd.DataFrame, output_path: Path,
                                        col_widths: List[int],
                                        col_align: List[str], sheet_name: str):
    """Write categorized lines to a formatted single-sheet workbook.

    Args:
        df: Data to export. Nothing is written when it is empty.
        output_path: Destination path of the .xlsx file.
        col_widths: Column widths by position (index 0 is the first column).
        col_align: Alignment by position, parallel to ``col_widths``; the
            value 'center' selects the CENTER format, anything else leaves
            the column without a format.
        sheet_name: Name of the worksheet to create.

    Returns:
        None in all cases.

    Raises:
        IndexError: If ``col_align`` is shorter than ``col_widths``.
    """
    if df.empty:
        return None

    with pd.ExcelWriter(output_path.as_posix(), engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)
        worksheet = writer.sheets[sheet_name]

        # Set the column width and format. Columns are addressed by their
        # zero-based index rather than by letter, so this is not limited to
        # the 26 single-letter columns A..Z.
        for ix, wid in enumerate(col_widths):
            fmt = xlfmts[Xlformat.CENTER] if col_align[ix] == 'center' else None
            worksheet.set_column(ix, ix, wid, fmt)

        make_sheet_banded(worksheet, df)

        # ---------- Specific to categorized_lines ----------
        colorize_category_column(df, workbook, worksheet)

        # Write the column headers with the defined format.
        header_fmt = xlfmts[Xlformat.HEADER]
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_fmt)
def write_ground_truths(truths, out_path: Path):
    """Export the ground-truth table to a formatted 'Ground Truths' sheet.

    The frame must contain 'name' and 'Category' columns; they use the TEXT
    format. Every other column uses the NUMERIC_CENTERED format and is 5 wide
    when its name ends in 'v', otherwise 11 wide.

    Args:
        truths: Table to export. Nothing is written when it is empty.
        out_path: Destination path of the .xlsx file.

    Returns:
        None in all cases.

    Raises:
        KeyError: If 'name' or 'Category' is not a column of ``truths``.
    """
    if truths.empty:
        return None

    truths = truths.copy()
    sheet_label = 'Ground Truths'

    # Column layout:
    # name Category ABED-1 ABED-1v ABED-2 ABED-2v ABED-3 ABED-3v ABED-4 ABED-4v...
    label_cols = ['name', 'Category']  # forced to text so they are not read as formulas
    value_cols = truths.columns.drop('name').drop('Category')  # everything else

    width_by_col = {'name': 40, 'Category': 10}
    width_by_col.update(
        {col: (5 if col.endswith('v') else 11) for col in value_cols})

    final_data_row = truths.shape[0] + 1  # data starts at spreadsheet row 2

    with pd.ExcelWriter(out_path.as_posix(), engine='xlsxwriter') as writer:
        truths.to_excel(writer, index=False, sheet_name=sheet_label)

        book = writer.book
        formats = add_workbook_formats(book)
        letters = excel_columns()

        def pick_format(col):
            # Value columns are numeric/centered, label columns are text,
            # and anything else is left unformatted.
            if col in value_cols:
                return formats[Xlformat.NUMERIC_CENTERED]
            if col in label_cols:
                return formats[Xlformat.TEXT]
            return None

        # column name -> (letter, width, format), in sheet order
        layout = {}
        for position, col in enumerate(truths.columns):
            layout[col] = (f'{letters[position]}',
                           width_by_col.get(col, 10),
                           pick_format(col))

        sheet = writer.sheets[sheet_label]
        title = 'Ground Truths'
        sheet.set_header(f'&C&16&"Times New Roman,Regular"{title}')

        # Apply the GREEN format to every cell from C2 to the bottom-right
        # data cell whose value equals True.
        final_letter = letters[truths.shape[1] - 1]
        sheet.conditional_format(
            f'C2:{final_letter}{final_data_row}', {
                'type': 'cell',
                'criteria': 'equal to',
                'value': True,
                'format': formats[Xlformat.GREEN]
            })

        # Column widths and formats, one single-column range (e.g. 'C:C') at a time.
        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        for letter, width, cell_format in layout.values():
            sheet.set_column(f'{letter}:{letter}', width, cell_format)

        make_sheet_banded(sheet, truths)

        # Rewrite the header row with the HEADER format.
        header_format = formats[Xlformat.HEADER]
        for position, col in enumerate(layout):
            sheet.write(0, position, col, header_format)
def write_nlp_statistics(nlp_statistics, stats_path: Path):
    """Export the NLP statistics table to a formatted workbook.

    The sheet is named 'ParsePDF Statistics' and its print header reads
    'NLP Statistics'. Each column's format is chosen by name, in this order:
    numeric columns, the 'lev_len_pct' percentage column, text columns,
    centered columns; any other column is left unformatted. Columns without
    an explicit width are 10 wide.

    Args:
        nlp_statistics: Table to export. Nothing is written when it is empty.
        stats_path: Destination path of the .xlsx file.

    Returns:
        None in all cases.
    """
    if nlp_statistics.empty:
        return None

    nlp_statistics = nlp_statistics.copy()
    sheet_label = 'ParsePDF Statistics'

    centered_names = (
        'is_group', 'non_avian', 'intersections', 'line_token_count', 'levd',
        'tx_line_len', 'species_inferred', 'is_species_line', 'guess_correct',
        'source',
    )
    # NOTE(review): 'line_len' appears only here, while the width and centered
    # lists use 'tx_line_len' -- possibly a stale name; confirm against the
    # columns actually produced upstream.
    numeric_names = ('intersections', 'line_token_count', 'levd', 'line_len')
    # Forced to text, otherwise a line may be interpreted as a formula.
    text_names = (
        'classification', 'transformed_line', 'original_line', 'species',
        'closest_match',
    )
    width_by_col = {
        'classification': 20,
        'original_line': 45,
        'transformed_line': 45,
        'species': 45,
        'closest_match': 45,
        'is_group': 12,
        'non_avian': 12,
        'intersections': 12,
        'line_token_count': 12,
        'levd': 11,
        'tx_line_len': 11,
        'lev_len_pct': 11,
        'species_inferred': 14,
        'exact_match': 14,
        'verified': 14,
        'source': 14,
    }

    with pd.ExcelWriter(stats_path.as_posix(), engine='xlsxwriter') as writer:
        nlp_statistics.to_excel(writer, index=False, sheet_name=sheet_label)

        book = writer.book
        formats = add_workbook_formats(book)
        letters = excel_columns()

        def pick_format(col):
            # First matching rule wins; the order matters for names that
            # appear in more than one list (e.g. 'levd').
            if col in numeric_names:
                return formats[Xlformat.NUMERIC_CENTERED]
            if col == 'lev_len_pct':
                return formats[Xlformat.PERCENTAGE]
            if col in text_names:
                return formats[Xlformat.TEXT]
            if col in centered_names:
                return formats[Xlformat.CENTER]
            return None

        # column name -> (letter, width, format), in sheet order
        layout = {}
        for position, col in enumerate(nlp_statistics.columns):
            layout[col] = (f'{letters[position]}',
                           width_by_col.get(col, 10),
                           pick_format(col))

        sheet = writer.sheets[sheet_label]
        title = 'NLP Statistics'
        sheet.set_header(f'&C&16&"Times New Roman,Regular"{title}')

        # Column widths and formats, one single-column range (e.g. 'C:C') at a time.
        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        for letter, width, cell_format in layout.values():
            sheet.set_column(f'{letter}:{letter}', width, cell_format)

        make_sheet_banded(sheet, nlp_statistics)

        # Rewrite the header row with the HEADER format.
        header_format = formats[Xlformat.HEADER]
        for position, col in enumerate(layout):
            sheet.write(0, position, col, header_format)
def write_final_checklist_spreadsheet(checklist,
                                      checklist_path: Path,
                                      parameters: dict,
                                      additional_sheets: Optional[List[dict]],
                                      cols_to_hide: Optional[list] = None,
                                      cols_to_highlight: Optional[list] = None,
                                      header_cell_groups: Optional[List[str]] = None):
    """Write the filled-in checklist to a formatted 'Final Checklist' workbook.

    The checklist may be wrapped to a two-column (printing) layout. In that
    layout the second copy of a column carries the same name followed by
    trailing whitespace, so column names are matched with trailing whitespace
    stripped when deciding widths, hiding, highlighting and centering.

    Args:
        checklist: Table to export. Nothing is written when it is empty.
        checklist_path: Destination path of the .xlsx file.
        parameters: Must contain 'CountDate' (formatted '%Y-%m-%d'),
            'CircleAbbrev' and 'CircleID'. 'FinalChecklistTitle' and
            'page_breaks' are optional.
        additional_sheets: Optional extra sheets. Each dict supplies 'data'
            (a DataFrame), 'sheet_name', 'to_center' (column names) and
            'widths' (column name -> width).
        cols_to_hide: Column names to hide; defaults to
            ['Group', 'R', 'TaxonOrder'].
        cols_to_highlight: Column names whose values greater than 0 get the
            GREEN format; defaults to ['Total'].
        header_cell_groups: Optional cell ranges; the i-th range is formatted
            with ``choose_format_accent(xlfmts, i)``.

    Returns:
        None in all cases.

    Raises:
        KeyError: If a required entry of ``parameters`` is missing.
        ValueError: If 'CountDate' does not match '%Y-%m-%d', or a key of an
            additional sheet's 'widths' is not one of its columns.
        AssertionError: If there are more columns than ``excel_columns()``
            provides letters for.
    """
    if cols_to_highlight is None:
        cols_to_highlight = ['Total']
    if cols_to_hide is None:
        cols_to_hide = ['Group', 'R', 'TaxonOrder']
    if checklist.empty:
        return None

    checklist = checklist.copy()
    xsheet_name = 'Final Checklist'

    # Columns
    # Group, CommonName, R, Total, TaxonOrder, Group_, CommonName_, R_, Total_, TaxonOrder_
    #   A        B       C    D        E         F         G        H     I         J
    real_cols_to_hide = [
        x for x in checklist.columns if x.rstrip() in cols_to_hide
    ] if cols_to_hide else []
    real_cols_to_highlight = [
        x for x in checklist.columns if x.rstrip() in cols_to_highlight
    ] if cols_to_highlight else []

    cols_to_center = [
        'R', 'Total', 'TaxonOrder', 'Rare', 'Category', 'NACC_SORT_ORDER',
        'ABA_SORT_ORDER'
    ]
    stripped_widths = {
        'Group': 20,
        'CommonName': 40,
        'R': 5,
        'Total': 7,
        'TaxonOrder': 8,
        'LocalName': 35,
        'may_need_writeup': 35,
        'Rare': 10,
        'D': 3,
        'Adult': 6,
        'Immature': 6,
        'W-morph': 6,
        'B-Morph': 6,
        'Difficulty': 6,
        'Adult/White': 11,
        'Immature/Blue': 11,
        'Ad': 3,
        'Im': 3,
        'CountSpecial': 3,
        'Category': 10,
        'NACC_SORT_ORDER': 8,
        'ABA_SORT_ORDER': 8
    }
    xl_last_data_row = checklist.shape[0] + 1  # plus 1 is because data starts at row 2

    # Keys with a trailing space cover the second copy of each column in the
    # wrapped layout; keys that are not columns are ignored by fillna.
    fill_values = {
        'Group': '',
        'CommonName': '',
        'Rare': '',
        'TaxonOrder': 99999,
        'Group ': '',
        'CommonName ': '',
        'Rare ': '',
        'TaxonOrder ': 99999
    }
    checklist = checklist.fillna(value=fill_values)

    # Everything that is not a standard column (probably sector names) is
    # centered and gets a width of 14 unless one is already defined.
    standard_cols_base = [
        'CommonName', 'LocalName', 'Group', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Total'
    ]
    standard_cols = standard_cols_base.copy()
    for col in standard_cols_base:
        standard_cols.append(col + ' ')
    non_standard_cols = [
        col for col in checklist.columns if col not in standard_cols
    ]
    for col in non_standard_cols:
        cols_to_center.append(col)
        if not stripped_widths.get(col, None):
            stripped_widths[col] = 14

    # Best effort: coerce the Taxon* columns to numbers. On failure the
    # current table is dumped for inspection and the export continues.
    try:
        intcols = [col for col in checklist.columns if col.startswith('Taxon')]
        for col in intcols:
            checklist[col] = pd.to_numeric(checklist[col], errors='coerce')
    except Exception as ee:
        print(f'Failed to set type of column "{col}" to numeric', ee)
        checklist.to_csv(checklist_path.parent / f'failure-checklist.csv',
                         index=False)
        unknown_idxs = checklist.index[checklist[col] == 'UNKNOWN']
        display(checklist.loc[unknown_idxs])
        traceback.print_exc(file=sys.stdout)

    # NOTE: 'Total' is deliberately left numeric. The highlight rule below
    # compares the cell value with 0, which would not work on text.

    with pd.ExcelWriter(checklist_path.as_posix(), engine='xlsxwriter') as writer:
        checklist.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # https://stackoverflow.com/questions/43991505/xlsxwriter-set-global-font-size
        workbook.formats[0].set_font_size(14)  # to make it readable when printed

        # ----------------------------------------------
        # Collect per-column formatting, keyed by column name in sheet order.
        col_letters = excel_columns()
        assert len(checklist.columns) <= len(col_letters)  # 702
        col_infos = {}
        for ix, col in enumerate(checklist.columns):
            stripped_col = col.rstrip()
            # Match on the stripped name as well, so the second copy of a
            # column in the wrapped layout (e.g. 'Total ') is centered like
            # the first one, consistent with width/hide/highlight.
            is_centered = (col in cols_to_center
                           or stripped_col in cols_to_center)
            col_infos[col] = {
                'hide': col in real_cols_to_hide,
                'highlight': col in real_cols_to_highlight,
                'format': xlfmts[Xlformat.CENTER] if is_centered else None,
                'width': stripped_widths.get(stripped_col, 10),
                'xl_col_letter': col_letters[ix],
            }

        # ----------------------------------------------
        worksheet = writer.sheets[xsheet_name]

        # Print header (title and year) and footer (region, party, date).
        date_of_count = parameters['CountDate']
        dtcount = datetime.strptime(date_of_count, '%Y-%m-%d')
        dtcstr = dtcount.strftime("%d %b %Y")
        yr = dtcount.strftime("%Y")
        title = parameters.get('FinalChecklistTitle', '')
        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title} {yr}')

        region = parameters['CircleAbbrev']
        party = parameters['CircleID']
        footer_fmt = '&C&12&"Times New Roman,Regular"'
        worksheet.set_footer(
            f'{footer_fmt}Region: {region} Party: {party} Date: {dtcstr}')

        page_breaks = parameters.get('page_breaks', None)
        if page_breaks:
            # When splitting into 2 columns, the page breaks are known
            print(f'page_breaks: {page_breaks}')
            worksheet.set_h_pagebreaks(page_breaks)
        else:
            # https://xlsxwriter.readthedocs.io/page_setup.html
            # Fit the printed output to 1 page wide and let the height be as
            # long as necessary.
            worksheet.fit_to_pages(1, 0)

        # Highlight numbers > 0 for species count
        for col_info in col_infos.values():
            if not col_info['highlight']:
                continue
            xl_col_letter = col_info['xl_col_letter']
            v_total_cell_range = f'{xl_col_letter}2:{xl_col_letter}{xl_last_data_row}'
            worksheet.conditional_format(
                v_total_cell_range, {
                    'type': 'cell',
                    'criteria': '>',
                    'value': 0,
                    'format': xlfmts[Xlformat.GREEN]
                })

        # Make CommonName bold if Rare column is set. The i-th CommonName*
        # column is paired with the i-th Rare* column.
        cols_to_bold_idxs = [
            idx for idx, xs in enumerate(checklist.columns)
            if xs.startswith('CommonName')
        ]
        rare_cols_idxs = [
            idx for idx, xs in enumerate(checklist.columns)
            if xs.startswith('Rare')
        ]
        for cn_idx, ra_idx in zip(cols_to_bold_idxs, rare_cols_idxs):
            letter = col_letters[cn_idx]
            letter_rare = col_letters[ra_idx]
            format_rare = workbook.add_format({'bold': True})
            rare_name_cells = f'{letter}2:{letter}{xl_last_data_row}'
            rarity_criteria_cells = f'{letter_rare}2'
            rarity_criteria = f'=EXACT({rarity_criteria_cells},"X")'
            worksheet.conditional_format(
                rare_name_cells, {
                    'type': 'formula',
                    'criteria': rarity_criteria,
                    'format': format_rare
                })

        # Format CommonName with COUNTSPECIAL if CountSpecial column is set
        format_col_if_other_col(checklist, worksheet, ['CommonName'],
                                ['CountSpecial'], xlfmts[Xlformat.COUNTSPECIAL])

        # Color the 'D' (Difficulty) column based on value in 'Difficulty' column
        xformats = [
            xlfmts[idx]
            for idx in [Xlformat.EASY, Xlformat.MARGINAL, Xlformat.DIFFICULT]
        ]
        for to_match, xformat in zip(['E', 'M', 'D'], xformats):
            format_col_if_other_col(checklist, worksheet, ['D'],
                                    ['Difficulty'], xformat, to_match)

        # Color the 'Ad' (Adult) column based on value in 'Adult' column
        format_col_if_other_col(checklist, worksheet, ['Ad', 'Im'],
                                ['Adult', 'Immature'], xlfmts[Xlformat.AGE],
                                'X')

        # Highlight the 'Ad', 'Im' if non-zero values in 'W-morph', 'B-Morph'
        # The 'Ad', 'Im' columns are overloaded here since there is no species overlap
        format_col_if_other_col(checklist, worksheet, ['Ad', 'Im'],
                                ['W-morph', 'B-Morph'], xlfmts[Xlformat.MORPH],
                                'X')

        # Italicize non-species: CommonName* cells whose paired Category*
        # cell is anything other than "species".
        category_cols_idxs = []  # bound up front so the handler can print it
        try:
            if 'Category' in checklist.columns:
                cols_to_italicize_idxs = [
                    idx for idx, xs in enumerate(checklist.columns)
                    if xs.startswith('CommonName')
                ]
                category_cols_idxs = [
                    idx for idx, xs in enumerate(checklist.columns)
                    if xs.startswith('Category')
                ]
                for cn_idx, ca_idx in zip(cols_to_italicize_idxs,
                                          category_cols_idxs):
                    letter = col_letters[cn_idx]
                    letter_category = col_letters[ca_idx]
                    common_name_cells = f'{letter}2:{letter}{xl_last_data_row}'
                    category_criteria_cells = f'{letter_category}2'
                    category_criteria = f'={category_criteria_cells}<>"species"'
                    worksheet.conditional_format(
                        common_name_cells, {
                            'type': 'formula',
                            'criteria': category_criteria,
                            'format': xlfmts[Xlformat.ITALIC]
                        })
        except Exception as ee:
            print(ee)
            print(checklist.columns)
            print(category_cols_idxs)
            traceback.print_exc(file=sys.stdout)
            raise

        # Set the column widths and formats using single-column ranges, e.g. 'C:C'.
        for col_info in col_infos.values():
            xl_col_letter = col_info['xl_col_letter']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}',
                                 col_info['width'], col_info['format'])

        # Hide columns. This must follow the width pass above because it
        # issues a second set_column call for the same column.
        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        for col_info in col_infos.values():
            if not col_info['hide']:
                continue
            xl_col_letter = col_info['xl_col_letter']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None,
                                 None, {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, checklist)

        # Row 0 (zero indexed) is the header row; give it a height of 70.
        worksheet.set_row(0, 70, None, None)
        worksheet.freeze_panes(1, 0)  # Freeze the first row.

        # header_cell_groups: the always-true formula applies the accent
        # format to every cell of the given range.
        if header_cell_groups is not None:
            for ix, header_cell_group in enumerate(header_cell_groups):
                category_criteria = '=True'
                worksheet.conditional_format(
                    header_cell_group, {
                        'type': 'formula',
                        'criteria': category_criteria,
                        'format': choose_format_accent(xlfmts, ix)
                    })

        # Rewrite the column headers with the HEADER format. The position in
        # col_infos is the sheet column, so no per-column index search is needed.
        header_fmt = xlfmts[Xlformat.HEADER]
        for col_num, col in enumerate(col_infos):
            worksheet.write(0, col_num, col, header_fmt)

        # ***
        if additional_sheets is not None:
            for sheet_info in additional_sheets:
                df = sheet_info['data']
                df.to_excel(writer,
                            index=False,
                            sheet_name=sheet_info['sheet_name'])
                worksheet = writer.sheets[sheet_info['sheet_name']]
                make_sheet_banded(worksheet, df)

                center_cols = sheet_info['to_center']
                df_columns = list(df.columns)
                for col, wid in sheet_info['widths'].items():
                    col_index = df_columns.index(col)
                    col_letter = col_letters[col_index]
                    fmt = xlfmts[Xlformat.CENTER] if col in center_cols else None
                    worksheet.set_column(f'{col_letter}:{col_letter}', wid, fmt)

                worksheet.freeze_panes(1, 0)  # Freeze the first row.

                # Set the header format
                worksheet.write_row(0, 0, df_columns, header_fmt)