예제 #1
0
def write_basic_spreadsheet(df: pd.DataFrame, fpath: Path, column_widths: dict,
                            columns_to_center: List[str]):
    if df.empty:
        return None
    df = df.fillna('')

    default_column_width = 14  # if not specified in column_widths
    xsheet_name = 'Basic'

    with pd.ExcelWriter(fpath.as_posix(), engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        worksheet = writer.sheets[xsheet_name]
        xlfmts = add_workbook_formats(workbook)

        # https://stackoverflow.com/questions/43991505/xlsxwriter-set-global-font-size
        workbook.formats[0].set_font_size(
            14)  # to make it readable when printed

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}

        excel_letters = excel_columns()
        for ix, col in enumerate(df.columns):
            col_letter = excel_letters[ix]
            col_info = {}
            fmt = xlfmts[Xlformat.CENTER] if (
                columns_to_center is not None
                and col in columns_to_center) else None
            col_info['format'] = fmt
            col_info['width'] = column_widths.get(col, default_column_width)
            col_info['xl_col_letter'] = f'{col_letter}'
            col_infos[col] = col_info

        colspec = pd.DataFrame(col_infos).T

        # Set the column widths and format.
        # Set formats with e.g. 'C:C'
        for col_num, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)

        # Make the sheet banded
        make_sheet_banded(worksheet, df)

        worksheet.freeze_panes(1, 0)  # Freeze the first row.

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])
예제 #2
0
def extract_from_audubon_results(fpath: Path) -> Tuple[pd.DataFrame, dict]:
    # Kinda hacky, works for 121st count results (2020 season)
    # Use local version of open_workbook to fix:
    #   Suppress annoying xlrd warning "WARNING *** file size ..." #37
    encoding_override = None
    with open(fpath, 'rb') as fp:
        book = open_workbook(file_contents=fp.read(), encoding_override=encoding_override)
    sheet = book.sheet_by_index(0)

    # Count code is in cell $A6$AC, name in $A6$L, date in $A6$AO
    count_info_row = 5
    count_code_col = excel_columns().index('AC')
    count_name_col = excel_columns().index('L')
    count_date_col = excel_columns().index('AO')

    circle_code = sheet.cell_value(rowx=count_info_row, colx=count_code_col)
    circle_name = sheet.cell_value(rowx=count_info_row, colx=count_name_col)
    count_date = sheet.cell_value(rowx=count_info_row, colx=count_date_col)

    col_g = sheet.col_values(excel_columns().index('G'))
    col_s = sheet.col_values(excel_columns().index('S'))

    # Scan down column G to find boundaries for species list
    species_index = col_g.index('Species')
    total_individuals_index = col_g.index('Total Individuals')

    species = col_g[species_index + 1: total_individuals_index]
    totals = pd.Series(col_s[species_index + 1: total_individuals_index]).apply(
        str_with_comma_to_num).fillna(0).astype(int)

    # print(circle_name, circle_code, count_date)
    # print(f'Species count: {len(species)}')

    circle_info = {'Name': circle_name, 'Code': circle_code, 'Date': count_date}

    df = pd.DataFrame(list(zip(species, totals)), columns=['CommonName', 'Total'])

    return df, circle_info
def write_possible_translations_spreadsheet(translations_df,
                                            translations_xl_path):
    if translations_df.empty:
        return None

    # LocalSpeciesName	eBirdSpeciesName	levd	match_whole_line	regex	circle	AltName1	Lev2	AltName2

    col_widths = [50, 50, 12, 16, 12, 12, 30]
    for ix in range(MAX_SPECIES_ALTERNATIVES - 1):
        col_widths.append(30)
        col_widths.append(10)

    xsheet_name = 'Possible Translations'

    sheet_names = banded_sheets = [xsheet_name]
    center_cols = [
        col for col in translations_df.columns if col.startswith('lev')
    ]
    for col in ['match_whole_line', 'regex', 'circle']:
        center_cols.append(col)

    with pd.ExcelWriter(translations_xl_path, engine='xlsxwriter') as writer:
        translations_df.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        excel_letters = excel_columns()
        for sheet_num, sheet_name in enumerate(sheet_names):
            worksheet = writer.sheets[xsheet_name]

            # Set the column width and format.
            widths = col_widths
            col_vals = translations_df.columns.values  # df_columns[sheet_num].values

            for ix, wid in enumerate(widths):
                col_letter = excel_letters[ix]
                fmt = xlfmts[
                    Xlformat.CENTER] if col_vals[ix] in center_cols else None
                worksheet.set_column(f'{col_letter}:{col_letter}', wid, fmt)

            if sheet_name in banded_sheets:
                make_sheet_banded(worksheet, translations_df)

            # Write the column headers with the defined format.
            for col_num, value in enumerate(col_vals):
                worksheet.write(0, col_num, value, xlfmts[Xlformat.HEADER])
def format_col_if_other_col(checklist,
                            worksheet,
                            cols_to_format: List[str],
                            condition_cols: List[str],
                            xformat,
                            to_match: str = 'X'):
    # Make CommonName background yellow if CountSpecial column is set
    # Assumes format has been added to workbook already
    # condition_col is the "other col"

    xl_last_data_row = checklist.shape[
        0] + 1  # plus 1 is because data starts at row 2

    # Add variants with blanks to handle two column layouts (Double)
    # For example, both 'Adult' and 'Adult ' could be in columns
    cols_to_format.extend([xs + ' ' for xs in cols_to_format])
    condition_cols.extend([xs + ' ' for xs in condition_cols])

    cols_to_format_idxs = [
        idx for idx, xs in enumerate(checklist.columns) if xs in cols_to_format
    ]
    # What column do we look for an 'X' in? These are the condition columns
    x_cols_idxs = [
        idx for idx, xs in enumerate(checklist.columns) if xs in condition_cols
    ]
    col_letters = excel_columns()
    for cn_idx, cond_idx in zip(cols_to_format_idxs, x_cols_idxs):
        col2fmt_letter = col_letters[cn_idx]
        cond_col_letter = col_letters[cond_idx]
        to_format_cells = f'{col2fmt_letter}2:{col2fmt_letter}{xl_last_data_row}'
        criteria_cells = f'{cond_col_letter}2'
        criteria = f'=EXACT({criteria_cells}, {to_match})'
        # print(f'rarity_criteria: {rarity_criteria}')
        worksheet.conditional_format(to_format_cells, {
            'type': 'formula',
            'criteria': criteria,
            'format': xformat
        })
예제 #5
0
def create_ebird_summary(summary_base: pd.DataFrame,
                         personal_checklists: pd.DataFrame,
                         checklist_meta: pd.DataFrame, circle_abbrev,
                         parameters: Parameters, sector_name: str,
                         taxonomy: Taxonomy,
                         output_path) -> Tuple[Any, List[str]]:
    # Each checklist becomes a column in the summary sheet
    # Start of big processing loop
    summary = summary_base.copy()
    # team_cols = set()
    summary_common_names = list(summary.CommonName.values)

    checklist_meta = checklist_meta.copy()[
        checklist_meta.sharing != 'secondary']
    checklist_meta.sort_values(
        by=['location_group', 'locId', 'obsDt', 'groupId', 'Name'],
        na_position='first',
        inplace=True)

    sector_subids = list(personal_checklists.subId.values)
    sector_checklist_meta = checklist_meta[checklist_meta.subId.isin(
        sector_subids)]

    # Group	CommonName	Rare	Total	TaxonOrder	NACC_SORT_ORDER
    summary['FrozenTotal'] = 0  # placeholder

    if 'Category' not in summary.columns:
        summary['Category'] = create_category_column(summary, taxonomy)

    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'FrozenTotal', 'Category',
        'TaxonOrder', 'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    summary = summary[std_columns]

    # Sector checklists may have added species not on the template
    # Add on rows for these new species
    additional_rare = []
    names_to_add = set(
        personal_checklists.CommonName.values) - set(summary_common_names)
    if not names_to_add == set():
        # print(f'Need to add: {names_to_add}')
        # blank_row = pd.Series([''] * len(summary.columns), index=summary.columns)
        rows_to_add = []
        for common_name in names_to_add:
            row_rarity = create_row_for_missing_species(
                common_name, summary, taxonomy)
            if row_rarity is None:
                continue
            row, rarity = row_rarity
            rows_to_add.append(row)
            if rarity:
                additional_rare.append(common_name)

        summary = summary.append(rows_to_add, ignore_index=True)
        summary_common_names.extend(list(names_to_add))

    # Re-sort by TaxonOrder
    # Sorting has to be done before we create any formulae for Totals
    summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
        drop=True)

    # Use the order from checklist_meta and add a column to summary for each checklist
    # personal_columns = []
    for subid in sector_checklist_meta.subId.values:
        pcsub = personal_checklists[personal_checklists.subId == subid]
        species_totals = []
        for common_name in summary.CommonName.values:
            species_row = pcsub[pcsub.CommonName == common_name]
            species_total = 0 if species_row.empty else species_row.Total.values[
                0]
            species_totals.append(species_total)
        # Add the column to summary
        summary[subid] = species_totals

    # Don't think we need the filter any more, since that was done above
    rare_species = filter_additional_rare(taxonomy, additional_rare)
    if len(rare_species) > 0:
        print(
            f'   Requires rare bird form: {", ".join(rare_species)} [not on master list]'
        )

    # Re-sort by TaxonOrder
    # Sorting has to be done before we create any formulae for Totals
    summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
        drop=True)

    # We don't rename columns until right before we create Excel file
    team_cols = sector_checklist_meta.subId.values

    # The complexity here is because we can have cases where a single birder birded
    # near-duplicate locations. This means location_group is e.g. L13065376+L13065792
    # but each of these checklist should be considered separate (use SUM not MAX)
    # Example in CAMP 2020/Rancho San Carlos:
    # L13065376-S78154180-09:24-Jeff Manker | L13065792-S78156572-10:10-Jeff Manker |
    # L13065792-S78184574-10:44-Jeff Manker
    mask = sector_checklist_meta.location_group.isnull()
    usemaxtmp = sector_checklist_meta[~mask]
    single_birder_locids = set()
    for locgrp, grp in usemaxtmp.groupby(['location_group']):
        # print(locgrp)
        if len(set(grp.Name)) == 1:  # Same birder but possible location dups
            single_birder_locids |= set(grp.locId.values)
    mask_single = checklist_meta.locId.isin(single_birder_locids)

    mask |= mask_single
    use_sum_locids = sector_checklist_meta[mask].locId.values
    # Remove duplicates but keep in order
    use_max_locids = list(
        dict.fromkeys(sector_checklist_meta[~mask].locId.values))

    # These are the columns we can just total up
    use_sum_subids = sector_checklist_meta[sector_checklist_meta.locId.isin(
        use_sum_locids)].subId.values
    use_sum_total = summary[use_sum_subids].apply(
        pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    use_max_total = 0
    # ToDo: logic is duplicated below
    for locid in use_max_locids:
        # subIds are the column names right now
        # subids = sector_checklist_meta[sector_checklist_meta.locId == locid].subId.values
        mask = [(lg.startswith(locid) if lg is not None else False)
                for lg in checklist_meta.location_group.values]
        subids = checklist_meta[mask].subId.values
        # This can be empty if it is not the first in a set of duplicate locations
        if len(subids) == 0:
            continue
        # print(locid, subids)
        max_vals = summary[subids].apply(
            pd.to_numeric).fillna(0).max(axis=1).astype(int)
        use_max_total += max_vals

    summary_total = use_sum_total + use_max_total
    # print(sum(summary_total))

    # Values computed by formulae are only evaluated after a workbook has been opened and
    # saved by Excel. This means if we create these files but never open them, the Total
    # field will show up as 0 (a string formula converted to numeric)
    # Add this so that service_merge/merge_checklists has an actual value to use
    # ToDo: fix summary_total to use SUM/MAX
    summary['FrozenTotal'] = summary_total

    # Actually, make it a formula
    # Has to be after sorting
    #     base_columns = ['Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder']
    #     Group	CommonName	Rare	Total	Ad	Im	CountSpecial
    # =SUM($F5:$Q5)
    col_letters = excel_columns()
    # std_columns = ['Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
    #                'NACC_SORT_ORDER']
    sum_start_index = len(std_columns)
    sum_end_index = len(std_columns) + len(use_sum_locids) - 1
    sum_start_col = col_letters[sum_start_index]
    sum_end_col = col_letters[sum_end_index]
    # Start template for total with non-duplicate columns
    sum_formula_template = f'=SUM(${sum_start_col}INDEX:${sum_end_col}INDEX)'

    header_cell_groups = []
    max_formula_totals = []
    max_formula = None
    for locid in use_max_locids:
        # subIds are the column names right now
        # subids = sector_checklist_meta[sector_checklist_meta.locId == locid].subId.values
        mask = [(lg.startswith(locid) if lg is not None else False)
                for lg in checklist_meta.location_group.values]
        subids = checklist_meta[mask].subId.values
        # This can be empty if it is not the first in a set of duplicate locations
        if len(subids) == 0:
            continue
        max_start_index = list(summary.columns).index(subids[0])
        max_end_index = list(summary.columns).index(subids[-1])
        max_start_col = col_letters[max_start_index]
        max_end_col = col_letters[max_end_index]
        max_formula_template = f'MAX(${max_start_col}INDEX:${max_end_col}INDEX)'
        max_formula_totals.append(max_formula_template)
        # Collect up the header cells so we can color different groups
        header_cell_group = f'${max_start_col}1:${max_end_col}1'
        header_cell_groups.append(header_cell_group)

    if len(max_formula_totals):
        max_formula = '+'.join(max_formula_totals)

    total_formula = []
    for ix in range(2, summary.shape[0] + 2):
        sft = sum_formula_template.replace('INDEX', str(ix))
        tf_sum = f'{sft}'
        if max_formula is None:
            total_formula.append(tf_sum)
        else:
            mft = max_formula.replace('INDEX', str(ix))
            tf_max = f'{mft}'
            total_formula.append(tf_sum + '+' + tf_max)

    # print(f'    {total_formula[0]}')

    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[team_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    summary = summary.append(totals_row, ignore_index=True)

    # Rename columns to more human readable form
    newcols = create_personal_checklist_columns(sector_checklist_meta)
    summary.rename(columns=newcols, inplace=True)

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph', 'Ad',
        'Im', 'CountSpecial', 'FrozenTotal'
    ]
    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    outname = output_path / f'{circle_abbrev}-EBird-Summary-{sector_name}.xlsx'
    write_final_checklist_spreadsheet(summary,
                                      outname,
                                      parameters.parameters,
                                      additional_sheets=None,
                                      cols_to_hide=cols_to_hide,
                                      cols_to_highlight=cols_to_highlight,
                                      header_cell_groups=header_cell_groups)

    return summary, rare_species
def write_ground_truths(truths, out_path: Path):
    if truths.empty:
        return None

    truths = truths.copy()
    xsheet_name = 'Ground Truths'

    # Columns
    # name	Category	ABED-1	ABED-1v	ABED-2	ABED-2v	ABED-3	ABED-3v	ABED-4	ABED-4v...

    cols_to_center = truths.columns.drop('name').drop(
        'Category')  # everything else is centered
    column_widths = {'name': 40, 'Category': 10}
    for col in cols_to_center:
        column_widths[col] = 5 if col.endswith('v') else 11

    numeric_cols = cols_to_center
    text_cols = ['name', 'Category'
                 ]  # force otherwise may interpret original_line as a formula

    xl_last_data_row = truths.shape[
        0] + 1  # plus 1 is because data starts at row 2

    with pd.ExcelWriter(out_path.as_posix(), engine='xlsxwriter') as writer:
        truths.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}

        col_letters = excel_columns()
        for ix, col in enumerate(truths.columns):
            col_letter = col_letters[ix]
            col_info = {}
            # col_info['hide'] = col in real_cols_to_hide
            # col_info['highlight'] = col in real_cols_to_highlight
            if col in numeric_cols:
                fmt = xlfmts[Xlformat.NUMERIC_CENTERED]
            elif col in text_cols:
                fmt = xlfmts[Xlformat.TEXT]
            else:
                fmt = xlfmts[
                    Xlformat.CENTER] if col in cols_to_center else None
            col_info['format'] = fmt
            col_info['width'] = column_widths.get(col, 10)
            col_info['xl_col_letter'] = f'{col_letter}'
            col_infos[col] = col_info

        colspec = pd.DataFrame(col_infos).T
        # ----------------------------------------------

        worksheet = writer.sheets[xsheet_name]

        title = 'Ground Truths'
        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title}')

        # footer_fmt = f'&C&12&"Times New Roman,Regular"'
        # worksheet.set_footer(f'{footer_fmt}Region: {region} Party: {party}  Date: {dtcstr}')

        # https://xlsxwriter.readthedocs.io/page_setup.html
        # A common requirement is to fit the printed output to n pages wide but
        # have the height be as long as necessary
        # worksheet.fit_to_pages(1, 0)

        # Highlight numbers > 0 for species count
        last_column_letter = col_letters[truths.shape[1] - 1]
        v_total_cell_range = f'C2:{last_column_letter}{xl_last_data_row}'
        worksheet.conditional_format(
            v_total_cell_range, {
                'type': 'cell',
                'criteria': 'equal to',
                'value': True,
                'format': xlfmts[Xlformat.GREEN]
            })

        # Set the column width and format.
        # Set formats with e.g. 'C:C'
        for col_num, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)

        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        # for ix, col_info in colspec[colspec.hide].iterrows():
        #     xl_col_letter = col_info['xl_col_letter']
        #     worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None, None, {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, truths)

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            # fmt = col_info['format']
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])
def write_nlp_statistics(nlp_statistics, stats_path: Path):
    if nlp_statistics.empty:
        return None

    nlp_statistics = nlp_statistics.copy()
    xsheet_name = 'ParsePDF Statistics'

    # Columns
    # ['family', 'unknown', 'intersections', 'line_token_count', 'line', 'original_line', 'guess',
    #   'levd',  'line_len', 'lev_len_pct', 'species_inferred', 'is_species_line',
    #   'guess_correct', 'source']

    cols_to_center = [
        'is_group', 'non_avian', 'intersections', 'line_token_count', 'levd',
        'tx_line_len', 'species_inferred', 'is_species_line', 'guess_correct',
        'source'
    ]
    column_widths = {
        'classification': 20,
        'original_line': 45,
        'transformed_line': 45,
        'species': 45,
        'closest_match': 45,
        'is_group': 12,
        'non_avian': 12,
        'intersections': 12,
        'line_token_count': 12,
        'levd': 11,
        'tx_line_len': 11,
        'lev_len_pct': 11,
        'species_inferred': 14,
        'exact_match': 14,
        'verified': 14,
        'source': 14
    }

    numeric_cols = ['intersections', 'line_token_count', 'levd', 'line_len']
    text_cols = [
        'classification', 'transformed_line', 'original_line', 'species',
        'closest_match'
    ]  # force otherwise may interpret original_line as a formula

    with pd.ExcelWriter(stats_path.as_posix(), engine='xlsxwriter') as writer:
        nlp_statistics.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}

        excel_letters = excel_columns()
        for ix, col in enumerate(nlp_statistics.columns):
            col_letter = excel_letters[ix]
            col_info = {}
            # col_info['hide'] = col in real_cols_to_hide
            # col_info['highlight'] = col in real_cols_to_highlight
            if col in numeric_cols:
                fmt = xlfmts[Xlformat.NUMERIC_CENTERED]
            elif col == 'lev_len_pct':
                fmt = xlfmts[Xlformat.PERCENTAGE]
            elif col in text_cols:
                fmt = xlfmts[Xlformat.TEXT]
            else:
                fmt = xlfmts[
                    Xlformat.CENTER] if col in cols_to_center else None
            col_info['format'] = fmt
            col_info['width'] = column_widths.get(col, 10)
            col_info['xl_col_letter'] = f'{col_letter}'
            col_infos[col] = col_info

        colspec = pd.DataFrame(col_infos).T
        # ----------------------------------------------

        worksheet = writer.sheets[xsheet_name]

        title = 'NLP Statistics'
        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title}')

        # footer_fmt = f'&C&12&"Times New Roman,Regular"'
        # worksheet.set_footer(f'{footer_fmt}Region: {region} Party: {party}       Date: {dtcstr}')

        # https://xlsxwriter.readthedocs.io/page_setup.html
        # A common requirement is to fit the printed output to n pages wide but have the
        # height be as long as necessary
        # worksheet.fit_to_pages(1, 0)

        # rare_name_cells = f'G2:G{xl_last_data_row}'
        # rarity_criteria = '=EXACT(H2,"X")'
        # worksheet.conditional_format(rare_name_cells,
        #       {'type': 'formula', 'criteria': rarity_criteria, 'format': format_rare})

        # Set the column width and format.
        # Set formats with e.g. 'C:C'
        for col_num, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)

        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        # for ix, col_info in colspec[colspec.hide].iterrows():
        #     xl_col_letter = col_info['xl_col_letter']
        #     worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None, None, {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, nlp_statistics)

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            # fmt = col_info['format']
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])
def write_final_checklist_spreadsheet(checklist,
                                      checklist_path: Path,
                                      parameters: dict,
                                      additional_sheets: Optional[List[dict]],
                                      cols_to_hide: list = None,
                                      cols_to_highlight: list = None,
                                      header_cell_groups: List[str] = None):
    # updated_checklist is the filled-in local_checklist
    # It may be wrapped to a two column (printing) format
    if cols_to_highlight is None:
        cols_to_highlight = ['Total']
    if cols_to_hide is None:
        cols_to_hide = ['Group', 'R', 'TaxonOrder']
    if checklist.empty:
        return None

    checklist = checklist.copy()
    xsheet_name = 'Final Checklist'

    # Columns
    # Group, CommonName, R, Total, TaxonOrder, Group_, CommonName_, R_, Total_, TaxonOrder_
    #   A        B       C    D         E        F         G        H     I         J

    real_cols_to_hide = [
        x for x in checklist.columns if x.rstrip() in cols_to_hide
    ] if cols_to_hide else []
    real_cols_to_highlight = [x for x in checklist.columns if x.rstrip() in cols_to_highlight] \
        if cols_to_highlight else []
    cols_to_center = [
        'R', 'Total', 'TaxonOrder', 'Rare', 'Category', 'NACC_SORT_ORDER',
        'ABA_SORT_ORDER'
    ]
    stripped_widths = {
        'Group': 20,
        'CommonName': 40,
        'R': 5,
        'Total': 7,
        'TaxonOrder': 8,
        'LocalName': 35,
        'may_need_writeup': 35,
        'Rare': 10,
        'D': 3,
        'Adult': 6,
        'Immature': 6,
        'W-morph': 6,
        'B-Morph': 6,
        'Difficulty': 6,
        'Adult/White': 11,
        'Immature/Blue': 11,
        'Ad': 3,
        'Im': 3,
        'CountSpecial': 3,
        'Category': 10,
        'NACC_SORT_ORDER': 8,
        'ABA_SORT_ORDER': 8
    }
    xl_last_data_row = checklist.shape[
        0] + 1  # plus 1 is because data starts at row 2

    fill_values = {
        'Group': '',
        'CommonName': '',
        'Rare': '',
        'TaxonOrder': 99999,
        'Group ': '',
        'CommonName ': '',
        'Rare ': '',
        'TaxonOrder ': 99999
    }
    checklist = checklist.fillna(value=fill_values)

    # Probably sector names
    standard_cols_base = [
        'CommonName', 'LocalName', 'Group', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Total'
    ]
    standard_cols = standard_cols_base.copy()
    for col in standard_cols_base:
        standard_cols.append(col + ' ')
    non_standard_cols = [
        col for col in checklist.columns if col not in standard_cols
    ]
    for col in non_standard_cols:
        cols_to_center.append(col)
        if not stripped_widths.get(col, None):
            stripped_widths[col] = 14

    try:
        intcols = [col for col in checklist.columns if col.startswith('Taxon')]
        for col in intcols:
            checklist[col] = pd.to_numeric(checklist[col], errors='coerce')
            # checklist = checklist.astype({col: 'int32'}, errors='ignore')
    except Exception as ee:
        print(f'Failed to set type of column "{col}" to numeric', ee)
        checklist.to_csv(checklist_path.parent / f'failure-checklist.csv',
                         index=False)
        unknown_idxs = checklist.index[checklist[col] == 'UNKNOWN']
        display(checklist.loc[unknown_idxs])
        traceback.print_exc(file=sys.stdout)
        # pass

    checklist.astype({'Total': str})

    with pd.ExcelWriter(checklist_path.as_posix(),
                        engine='xlsxwriter') as writer:
        checklist.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # https://stackoverflow.com/questions/43991505/xlsxwriter-set-global-font-size
        workbook.formats[0].set_font_size(
            14)  # to make it readable when printed

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}

        col_letters = excel_columns()
        assert (len(checklist.columns) <= len(col_letters))  # 702
        for ix, col in enumerate(checklist.columns):
            stripped_col = col.rstrip()
            col_letter = col_letters[ix]
            col_info = {
                'hide': col in real_cols_to_hide,
                'highlight': col in real_cols_to_highlight,
                'format':
                xlfmts[Xlformat.CENTER] if col in cols_to_center else None,
                'width': stripped_widths.get(stripped_col, 10),
                'xl_col_letter': f'{col_letter}'
            }
            col_infos[col] = col_info

        colspec = pd.DataFrame(col_infos).T
        # ----------------------------------------------

        worksheet = writer.sheets[xsheet_name]

        date_of_count = parameters['CountDate']
        dtcount = datetime.strptime(date_of_count, '%Y-%m-%d')
        dtcstr = dtcount.strftime("%d %b %Y")
        yr = dtcount.strftime("%Y")
        title = parameters.get('FinalChecklistTitle', '')

        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title} {yr}')

        region = parameters['CircleAbbrev']
        party = parameters['CircleID']
        footer_fmt = f'&C&12&"Times New Roman,Regular"'
        worksheet.set_footer(
            f'{footer_fmt}Region: {region}       Party: {party}       Date: {dtcstr}'
        )

        # print(f'parameters: {parameters}')
        page_breaks = parameters.get('page_breaks', None)
        if page_breaks:
            # When splitting into 2 columns, the page breaks are known
            print(f'page_breaks: {page_breaks}')
            worksheet.set_h_pagebreaks(page_breaks)
        else:
            # https://xlsxwriter.readthedocs.io/page_setup.html
            # A common requirement is to fit the printed output to n pages wide but
            # have the height be as long as necessary
            # print('fitting to 1 page wide')
            worksheet.fit_to_pages(1, 0)

        # Highlight numbers > 0 for species count
        for ix, col_info in colspec[colspec.highlight].iterrows():
            xl_col_letter = col_info['xl_col_letter']
            v_total_cell_range = f'{xl_col_letter}2:{xl_col_letter}{xl_last_data_row}'
            worksheet.conditional_format(
                v_total_cell_range, {
                    'type': 'cell',
                    'criteria': '>',
                    'value': 0,
                    'format': xlfmts[Xlformat.GREEN]
                })

        excel_letters = excel_columns()
        # Make CommonName bold if Rare column is set
        cols_to_bold_idxs = [
            idx for idx, xs in enumerate(checklist.columns)
            if xs.startswith('CommonName')
        ]
        rare_cols_idxs = [
            idx for idx, xs in enumerate(checklist.columns)
            if xs.startswith('Rare')
        ]
        for cn_idx, ra_idx in zip(cols_to_bold_idxs, rare_cols_idxs):
            letter = excel_letters[cn_idx]
            letter_rare = excel_letters[ra_idx]
            format_rare = workbook.add_format({'bold':
                                               True})  # 'bg_color': '#FFC7CE',
            rare_name_cells = f'{letter}2:{letter}{xl_last_data_row}'
            rarity_criteria_cells = f'{letter_rare}2'
            rarity_criteria = f'=EXACT({rarity_criteria_cells},"X")'
            # print(f'rarity_criteria: {rarity_criteria}')
            worksheet.conditional_format(
                rare_name_cells, {
                    'type': 'formula',
                    'criteria': rarity_criteria,
                    'format': format_rare
                })

        # Make CommonName background yellow if CountSpecial column is set
        format_col_if_other_col(checklist, worksheet, ['CommonName'],
                                ['CountSpecial'],
                                xlfmts[Xlformat.COUNTSPECIAL])
        # format_col_if_other_col(checklist, worksheet, col_to_format, condition_cols, xformat)

        # Color the 'D' (Difficulty) column based on value in 'Difficulty' column
        xformats = [
            xlfmts[idx]
            for idx in [Xlformat.EASY, Xlformat.MARGINAL, Xlformat.DIFFICULT]
        ]
        for to_match, xformat in zip(['E', 'M', 'D'], xformats):
            format_col_if_other_col(checklist, worksheet, ['D'],
                                    ['Difficulty'], xformat, to_match)

        # Color the 'Ad' (Adult) column based on value in 'Adult' column
        format_col_if_other_col(checklist, worksheet, ['Ad', 'Im'],
                                ['Adult', 'Immature'], xlfmts[Xlformat.AGE],
                                'X')

        # Highlight the 'Ad', 'Im' if non-zero values in 'W-morph', 'B-Morph'
        # The 'Ad', 'Im' columns are overloaded here since there is no species overlap
        format_col_if_other_col(checklist, worksheet, ['Ad', 'Im'],
                                ['W-morph', 'B-Morph'], xlfmts[Xlformat.MORPH],
                                'X')

        # Italicize non-species
        try:
            if 'Category' in checklist.columns:
                cols_to_italicize_idxs = [
                    idx for idx, xs in enumerate(checklist.columns)
                    if xs.startswith('CommonName')
                ]
                category_cols_idxs = [
                    idx for idx, xs in enumerate(checklist.columns)
                    if xs.startswith('Category')
                ]
                for cn_idx, ca_idx in zip(cols_to_italicize_idxs,
                                          category_cols_idxs):
                    letter = excel_letters[cn_idx]
                    letter_category = excel_letters[ca_idx]
                    common_name_cells = f'{letter}2:{letter}{xl_last_data_row}'
                    category_criteria_cells = f'{letter_category}2'
                    # category_criteria = f'=EXACT({category_criteria_cells},"slash")'
                    category_criteria = f'={category_criteria_cells}<>"species"'
                    # print(f'category_criteria: {category_criteria}')
                    worksheet.conditional_format(
                        common_name_cells, {
                            'type': 'formula',
                            'criteria': category_criteria,
                            'format': xlfmts[Xlformat.ITALIC]
                        })
        except Exception as ee:
            print(ee)
            print(checklist.columns)
            print(category_cols_idxs)
            traceback.print_exc(file=sys.stdout)
            raise

        # rare_name_cells = f'G2:G{xl_last_data_row}'
        # rarity_criteria = '=EXACT(H2,"X")'
        # worksheet.conditional_format(rare_name_cells,
        #         {'type': 'formula', 'criteria': rarity_criteria, 'format': format_rare})

        # Set the column width and format.
        # Set formats with e.g. 'C:C'
        for col_num, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)

        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        for ix, col_info in colspec[colspec.hide].iterrows():
            xl_col_letter = col_info['xl_col_letter']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None,
                                 None, {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, checklist)

        # Set the width, and other properties of a row
        # row (int) – The worksheet row (zero indexed).
        # height (float) – The row height.
        worksheet.set_row(0, 70, None, None)
        worksheet.freeze_panes(1, 0)  # Freeze the first row.

        # header_cell_groups
        if header_cell_groups is not None:
            for ix, header_cell_group in enumerate(header_cell_groups):
                category_criteria = f'=True'
                # print(header_cell_group, ix, fmt)
                worksheet.conditional_format(
                    header_cell_group, {
                        'type': 'formula',
                        'criteria': category_criteria,
                        'format': choose_format_accent(xlfmts, ix)
                    })

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            # fmt = col_info['format']
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])

        # ***
        if additional_sheets is not None:
            for sheet_info in additional_sheets:
                # print(sheet_info['sheet_name'])
                df = sheet_info['data']

                df.to_excel(writer,
                            index=False,
                            sheet_name=sheet_info['sheet_name'])
                worksheet = writer.sheets[sheet_info['sheet_name']]
                make_sheet_banded(worksheet, df)

                center_cols = sheet_info['to_center']
                for col, wid in sheet_info['widths'].items():
                    col_index = list(df.columns).index(col)
                    col_letter = excel_letters[col_index]
                    fmt = xlfmts[
                        Xlformat.CENTER] if col in center_cols else None
                    worksheet.set_column(f'{col_letter}:{col_letter}', wid,
                                         fmt)

                # Set the width, and other properties of a row
                # row (int) – The worksheet row (zero indexed).
                # height (float) – The row height.
                # worksheet.set_row(0, 70, None, None)
                worksheet.freeze_panes(1, 0)  # Freeze the first row.
                # Set the header format
                worksheet.write_row(0, 0, list(df.columns),
                                    xlfmts[Xlformat.HEADER])