def show_lines_not_found_in_taxonomy(double_translated, taxonomy: Taxonomy):
    """Print every local name from double_translated that has no taxonomy row.

    double_translated is an iterable of (local_name, _) pairs; only the name
    is used for the lookup.
    """
    # ['Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder']
    unmatched = (name for name, _ in double_translated
                 if taxonomy.find_local_name_row(name) is None)
    for name in unmatched:
        print(f'Not found in taxonomy: {name}')
def create_row_for_missing_species(
        common_name: str,
        summary: pd.DataFrame,
        taxonomy: Taxonomy) -> Optional[Tuple[pd.Series, bool]]:
    """Build a blank summary row for a species missing from the official list.

    The name can also be a SPUH, ISSF etc. — anything that wasn't on the
    official list. Returns None when the name is not in the taxonomy at all;
    otherwise returns (row, is_rare) where is_rare is True for true species
    (which get marked 'X' in the 'Rare' column).
    """
    taxonomy_row = taxonomy.find_local_name_row(common_name)
    if taxonomy_row is None:  # not found anywhere, drop it
        return None

    # The number of columns may vary based on the checklist, but we fill
    # in the ones that we know must be there.
    new_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    known_fields = {
        'Group': taxonomy_row.SPECIES_GROUP,
        'CommonName': common_name,
        'TaxonOrder': taxonomy_row.TAXON_ORDER,
        'NACC_SORT_ORDER': taxonomy_row.NACC_SORT_ORDER,
        'ABA_SORT_ORDER': taxonomy_row.ABA_SORT_ORDER,
        'Category': taxonomy_row.Category,
        # Filled in later. This is the "Grand Total", not the total from an
        # individual checklist.
        'Total': 0,
    }
    for field, value in known_fields.items():
        new_row[field] = value

    # Not on official list, so mark it Rare if it's a species (not SPUH etc.)
    is_rare = taxonomy_row.Category == 'species'
    if is_rare:
        new_row['Rare'] = 'X'

    return new_row, is_rare
def create_category_column(summary: pd.DataFrame, taxonomy: Taxonomy) -> list:
    """Return a taxonomy Category for each CommonName in summary.

    Names with no taxonomy row map to the empty string. The returned list is
    parallel to summary.CommonName.
    """
    def category_for(name):
        row = taxonomy.find_local_name_row(name)
        return '' if row is None else row.Category

    return [category_for(name) for name in summary.CommonName.values]
def filter_additional_rare(taxonomy: Taxonomy, additional_rare: List[str]) -> List[str]:
    """Keep only the names that the taxonomy classifies as true species."""
    def is_species(name):
        row = taxonomy.find_local_name_row(name)
        return row is not None and row.Category == 'species'

    return [name for name in additional_rare if is_species(name)]
def clean_common_names(
        common_names: List[str],
        taxonomy: Taxonomy,
        local_translation_context: LocalTranslationContext) -> List[str]:
    """Normalize raw common-name strings to their proper taxonomy spelling.

    Pipeline: pre-process and secondary-process each line, run the local
    translations twice (double translation), then look each result up in the
    taxonomy to restore proper capitalization. Names that cannot be resolved
    come back as ''.
    """
    # skip tertiary_transformation() for now
    preprocessed = [
        secondary_species_processing(pre_process_line(raw))
        for raw in common_names
    ]
    # text_list = [tertiary_transformation(secondary_species_processing(pre_process_line(line))) \
    #              for line in text_list]

    # Processing 1 checklist here
    # tti = TaxonomyTokenIdentify(taxonomy, interim_data_path)
    #
    # # use text_list from above
    # text_list_lower = [x.lower() for x in text_list]
    # possibles = filter_to_possibles(tti, text_list_lower)
    # print(f'Possible species lines: {len(possibles)} (based on word intersections)')

    # Double translate; apply_translations returns a (text, ...) pair, so we
    # carry the pair through the first pass and unpack on the second.
    # print('Doing double translation')  # Can take a while
    first_pass = [
        local_translation_context.apply_translations(raw.lower(), True)
        for raw in preprocessed  # was: possibles
    ]
    second_pass = [
        local_translation_context.apply_translations(text.lower(), True)
        for text, _ in first_pass
    ]
    double_translated = [text for text, _ in second_pass]
    # print(double_translated)

    # They may be all lower case; return proper capitalization
    cleaned = []
    for name in double_translated:
        proper_name = ''
        if name != '':  # avoid most common exception
            try:
                proper_name = taxonomy.find_local_name_row(name).comName
            except AttributeError as ae:
                print(ae)
                print(f'no taxonomy entry for "{name}"')
        cleaned.append(proper_name)

    return cleaned
def build_full_tally_sheet(double_translated,
                           fpath: Path,
                           taxonomy: Taxonomy,
                           parameters: Parameters,
                           circle_prefix: str):
    """Build the full tally-sheet checklist for a count circle.

    Looks up every double-translated local name in the taxonomy, assembles a
    DataFrame of the matches, de-duplicates by CommonName, applies
    annotations/rarities, and returns the checklist with columns in the
    preferred display order.

    Args:
        double_translated: iterable of (local_name, _) pairs; only the name
            is used.
        fpath: source file path, passed through to the exception and
            annotation processors.
        taxonomy: taxonomy used for all lookups.
        parameters: run parameters (only referenced by the currently disabled
            write-out below; kept for interface compatibility).
        circle_prefix: circle identifier prefix for exception processing.

    Returns:
        pd.DataFrame sorted by TaxonOrder.
    """
    candidate_names = [name for name, _ in double_translated]
    local_names = process_exceptions(candidate_names, fpath, circle_prefix)

    # If an ISSF (subspecies-level form) is in the list, then its base
    # species must be present as well.
    issfs = taxonomy.filter_issf(local_names)
    for cn in issfs:
        base_species = taxonomy.report_as(cn)
        if base_species:
            local_names.append(base_species)

    entries = []
    for local_name in local_names:
        # common_name, taxon_order, species_group, NACC_SORT_ORDER
        record = taxonomy.find_local_name_row(local_name)
        if record is not None:
            # e.g. ('White-throated Sparrow', 31943, 'New World Sparrows', 1848.0)
            entry = (record.comName, record.TAXON_ORDER, record.SPECIES_GROUP,
                     record.NACC_SORT_ORDER, record.ABA_SORT_ORDER,
                     '', 0)  # trailing fields are 'Rare', 'Total'
            entries.append(entry)

    df = pd.DataFrame(entries, columns=[
        'CommonName', 'TaxonOrder', 'Group',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Rare', 'Total'
    ])

    # Re-order. The .copy() detaches from df so the in-place sort below does
    # not operate on a view (avoids pandas SettingWithCopyWarning and
    # undefined behavior under copy-on-write).
    cols = [
        'Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    local_checklist = df[cols].copy()
    local_checklist.sort_values(by='TaxonOrder', inplace=True)
    # local_checklist.shape

    # double_translated may have duplicates; keep the first occurrence
    local_checklist = local_checklist.drop_duplicates(subset=['CommonName'],
                                                      keep='first')

    local_checklist = process_annotations_or_rarities(local_checklist, fpath,
                                                      circle_prefix)

    # Re-order columns, keeping only those actually present
    preferred_order = [
        'Group', 'CommonName', 'Rare', 'D', 'Total', 'Ad', 'Im',
        'TaxonOrder', 'NACC_SORT_ORDER', 'ABA_SORT_ORDER',
        'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph',
        'CountSpecial'
    ]
    newcols = [
        col for col in preferred_order if col in local_checklist.columns
    ]
    local_checklist = local_checklist[newcols]

    # Write out full tally sheet (currently disabled)
    # circle_code = circle_prefix[0:4]
    # double_path = outputs_path / f'{circle_code}-DoubleX.xlsx'
    # write_local_checklist_with_group(local_checklist, double_path,
    #                                  parameters.parameters)

    return local_checklist
def merge_checklists(
        summary_base: Any,
        sector_files: List[Any],
        stem_to_colname: Union[dict, List[str]],
        taxonomy: Taxonomy,
        local_translation_context: LocalTranslationContext
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Merge per-sector checklists into one summary DataFrame.

    For each sector file/frame, adds a per-sector count column to the summary
    (adding any species the template lacked), back-fills Group/TaxonOrder/
    sort-order fields from the taxonomy, sorts by NACC_SORT_ORDER, replaces
    the 'Total' column with Excel SUM formulas, and appends a grand-totals
    row.

    Args:
        summary_base: a Path to a template checklist, or an existing
            single-column summary DataFrame.
        sector_files: list of Paths (read from disk) and/or DataFrames.
        stem_to_colname: maps a file stem (dict) or positional index (list)
            to the sector column name; unmapped entries get 'X1', 'X2', ...
        taxonomy: taxonomy used for lookups and species filtering.
        local_translation_context: used to clean common names.

    Returns:
        (summary, cols_to_hide, cols_to_highlight).
        NOTE(review): the `except TypeError` branch around the sort returns
        a bare DataFrame instead of this 3-tuple — callers unpacking the
        result would fail there; confirm intent.
    """
    # Easier to use single column summary_base, but this will transform it if needed
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single column master for summary
        summary_base = recombine_transformed_checklist(template, taxonomy)
    elif isinstance(summary_base, pd.DataFrame):
        summary_base = summary_base  # no-op: already in the desired form

    # Remember whether the base template carried Adult/Immature columns;
    # these drive the 'Ad'/'Im' roll-ups after the sector loop.
    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop
    summary = summary_base.copy()
    sector_unique = 1  # suffix for fallback 'X<n>' column names
    sector_cols = []
    for idx, fpath in enumerate(sector_files):
        # Resolve the sector column name; fall back to 'X<n>' on any failure.
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None

        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1
        sector_cols.append(sector_col)

        print(f'Processing {sector_col}')
        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [
            xs.lower() for xs in summary_common_names
        ]

        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand made,
            # so clean them up. Double translation takes a long time, so avoid when
            # possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(
                    checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
                # print(checklist.Total)
                xdtypes = {'CommonName': str, 'Total': int}
                checklist = checklist.astype(dtype=xdtypes)

            # so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
            # summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)
        else:  # isinstance(summary_base, pd.DataFrame):
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the checklist is a summary
        # report with a 'Total' row at the bottom, and 'Total' is not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template;
        # compare case-insensitively via a helper 'cnlower' column.
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set(
            [xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(
            summary_common_names_lower)
        if not names_to_add == set():  # i.e. there are new names
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns),
                                  index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                # True species not on the template are marked Rare
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower ==
                                  cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)
            # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and
            # removed in 2.0; this requires pandas < 2 (pd.concat is the
            # modern equivalent).
            summary = summary.append(rows_to_add, ignore_index=True)

        #
        has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector
        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0
        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # # S
        # # Fill in total for existing names
        # already_present_names = set(summary_common_names) & set(checklist.CommonName)
        # for cn in set(checklist.CommonName):
        #     total = checklist[checklist.CommonName == cn]['Total'].values[0]
        #     summary.loc[summary.CommonName == cn, sector_col] = total
        # print(summary.shape, len(summary_common_names_lower))

        # Copy each checklist row's total into this sector's column, matching
        # case-insensitively on the temporary 'cnlower' key.
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]
        summary['cnlower'] = summary_common_names_lower
        for ix, row in checklist.iterrows():
            # if row.Total:
            #     print(row)
            # 'FrozenTotal' (when present) takes precedence over 'Total'
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total
        summary.drop(['cnlower'], axis=1, inplace=True)

        # if has_adult_col:
        #     adult_total = checklist[checklist.CommonName == cn]['Ad'].values[0]
        #     summary.loc[summary.CommonName == cn, ad_col] = adult_total
        #
        # if has_immature_col:
        #     immature_total = checklist[checklist.CommonName == cn]['Im'].values[0]
        #     summary.loc[summary.CommonName == cn, im_col] = immature_total

        # Fill in zeros for missing sector_col values; may have blanks if species added
        # for col in sector_cols:
        #     summary[col] = summary[col].apply(pd.to_numeric).fillna(0)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)
    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been added species)
    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            # A sort order of 0 means "missing"; substitute the sentinel so
            # these rows sort to the end.
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
            drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        # NOTE(review): returns a bare DataFrame, not the documented 3-tuple.
        return summary

    # Now set the overall total field:
    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    # summary['Total'] = summary[sector_cols].apply(pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    col_letters = excel_columns()
    # team_start_col = col_letters[len(base_columns)]
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]

    # team_start_col = col_letters[index_of_first_subtotal_column(summary)]
    # Sector count columns occupy everything to the right of std_columns;
    # each 'Total' cell becomes an Excel SUM over that row's sector range.
    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    # Excel rows are 1-based and row 1 is the header, hence the offset of 2.
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999  # sentinel: sorts after all real taxa
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    # NOTE(review): DataFrame.append removed in pandas 2.0 (see above).
    summary = summary.append(totals_row, ignore_index=True)

    # Per-sector Ad-/Im- scratch columns have been rolled up; drop them.
    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)
    summary.rename(columns={
        'Ad': 'Adult/White',
        'Im': 'Immature/Blue'
    }, inplace=True)

    # Re-order columns
    # print(sector_cols)
    # print(summary.columns)
    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    # NOTE(review): 'Adult/White'/'Immature/Blue' are not in new_col_order,
    # so this projection drops them when present — which would make the
    # checks below always False. Confirm whether that is intended.
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph'
    ]
    # Hide the roll-up columns when they are all zero.
    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight