def process_exceptions(candidate_names: List[str], checklist_path: Path,
                       circle_prefix: str) -> List[str]:
    """Apply the circle's optional Exceptions spreadsheet to a species list.

    Looks for '<circle_prefix>Exceptions.xlsx' in the same directory as
    checklist_path. Rows with 'X' in the 'Add' column are added to the list;
    rows with 'X' in the 'Subtract' column are removed.

    :param candidate_names: species common names to adjust
    :param checklist_path: e.g. inputs_parse_path / 'CAPA-checklist.xlsx';
        only its parent directory is used here
    :param circle_prefix: file-name prefix, e.g. 'CAPA-'
    :return: adjusted list of names (set-based, so order is not preserved);
        candidate_names unchanged when no usable exceptions file exists
    """
    exceptions_dir = checklist_path.parent
    exceptions_path = exceptions_dir / f'{circle_prefix}Exceptions.xlsx'
    print(f'Exceptions path: {exceptions_path}')
    if not exceptions_path.exists():
        return candidate_names

    print(f'Exceptions: {exceptions_path}')
    exceptions_df = read_excel_or_csv_path(exceptions_path)
    if exceptions_df.empty:
        return candidate_names

    # Guard against a malformed exceptions file; the previous version raised
    # AttributeError when any of these columns was missing
    required_cols = {'Add', 'Subtract', 'CommonName'}
    missing_cols = required_cols - set(exceptions_df.columns)
    if missing_cols:
        print(f'Exceptions file missing columns {missing_cols}; ignoring file')
        return candidate_names

    mask_add = exceptions_df.Add == 'X'
    mask_sub = exceptions_df.Subtract == 'X'
    additions = set(exceptions_df[mask_add].CommonName.values)
    subtractions = set(exceptions_df[mask_sub].CommonName.values)
    addstr = ', '.join(additions)
    subst = ', '.join(subtractions)
    print(f'Additions: {addstr}\nSubtractions: {subst}')

    return list((set(candidate_names) | additions) - subtractions)
def load_annotations(annotations_path: Path) -> pd.DataFrame:
    """Load an optional annotations spreadsheet.

    When the file exists, any of the marker columns 'Easy'/'Marginal'/
    'Difficult' (cells marked 'X') are collapsed into a single 'Difficulty'
    column holding 'E', 'M' or 'D', and only the known annotation columns
    are kept.

    :param annotations_path: full path to the annotations file
    :return: annotations DataFrame, or an empty DataFrame when the file
        does not exist
    """
    annotations = pd.DataFrame()
    if annotations_path.exists():
        print(f'Annotations: {annotations_path}')
        annotations = read_excel_or_csv_path(annotations_path)

        # Don't output these columns unless present in Annotations file.
        # Iterate only over columns that actually exist: the previous code
        # looped over all three whenever ANY was present, raising KeyError
        # for a file that had e.g. only 'Easy'.
        difficulty_cols = [
            col for col in ['Easy', 'Marginal', 'Difficult']
            if col in annotations.columns
        ]
        if difficulty_cols:
            difficulty = pd.Series([''] * annotations.shape[0])
            for col in difficulty_cols:
                mask = [xs == 'X' for xs in annotations[col].values]
                difficulty[mask] = col[0]  # first letter: 'E', 'M' or 'D'
            annotations['Difficulty'] = difficulty

    cols_to_keep = [
        col for col in [
            'CommonName', 'Rare', 'Easy', 'Marginal', 'Difficult', 'Ranging',
            'Adult', 'Immature', 'W-morph', 'B-Morph', 'Difficulty',
            'CountSpecial'
        ] if col in annotations.columns
    ]
    annotations = annotations[cols_to_keep]
    return annotations
def _extract(self) -> str:
    """Flatten every cell of the spreadsheet at self.fpath into one string.

    Each column's values are stringified and newline-joined; columns are
    concatenated in order, each followed by a trailing newline.

    :return: newline-separated text of all cells ('' for an empty sheet)
    """
    df = read_excel_or_csv_path(self.fpath, None)
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # supported replacement and yields the same (label, Series) pairs.
    # Build via join instead of repeated '+=' to avoid quadratic copying.
    column_texts = [
        '\n'.join(map(str, column.values)) for _, column in df.items()
    ]
    return ''.join(text + '\n' for text in column_texts)
def raw_csv_to_checklist(fpath: Path,
                         taxonomy: Taxonomy,
                         local_translation_context: LocalTranslationContext,
                         observer_name: str,
                         xdates: List[str]) -> pd.DataFrame:
    """Read a raw species spreadsheet/CSV and convert it to a checklist.

    Prints a diagnostic and returns None when the file cannot be
    interpreted as a species data file.

    :param fpath: path to the raw CSV/Excel file
    :param taxonomy: taxonomy used for name resolution
    :param local_translation_context: translation context for common names
    :param observer_name: observer attributed in the checklist
    :param xdates: count dates associated with the checklist
    :return: checklist DataFrame, or None for an invalid input file
    """
    raw_df = read_excel_or_csv_path(fpath)
    checklist = csv_dataframe_to_checklist(raw_df, taxonomy,
                                           local_translation_context,
                                           observer_name, xdates)
    if checklist is None:
        print(f'File {fpath} is not a valid species data file')
    return checklist
def load_parameters(self, quiet: bool = True) -> dict:
    """Load the circle parameters spreadsheet and normalize it to a dict.

    The on-disk file is transposed (one parameter per row) for user
    convenience; it is read header-less, transposed back, and the first
    row becomes the column names. Missing 'NationalCode' defaults to 'US';
    a missing 'eBirdRegion' is an error (fatal when CRASH_ON_ERROR).

    :param quiet: when False, print the resolved parameters
    :return: parameter-name -> value dict
    """
    # Load and normalize parameters
    parameters_df = read_excel_or_csv_path(self.parameters_path, xheader=None)
    if parameters_df.empty:
        err_msg = f'Parameters file is required: {self.parameters_path.as_posix()}\n'
        # CRASH_ON_ERROR is a module-level switch: hard-exit vs. log-and-continue.
        # NOTE(review): on the non-crash path we fall through with an empty
        # frame, so the .iloc[0] below will raise — confirm intended behavior.
        if CRASH_ON_ERROR:
            sys.exit(err_msg)
        else:
            print(err_msg)
    # Do some manipulation on the data read in
    # The file is transposed on disk for user convenience
    parameters_df = parameters_df.T
    # First row holds the parameter names; promote it to column headers
    cols = list(parameters_df.iloc[0].values)
    parameters_df = parameters_df.drop([parameters_df.index[0]
                                        ]).reset_index(drop=True)
    parameters_df.columns = cols
    # Single remaining row holds the values
    parameters = parameters_df.iloc[0].to_dict()

    # Look for "NationalCode" first, so we can generate the region files
    # before possible exit below
    country = parameters.get('NationalCode', None)
    if not country:
        print(
            'Warning: no "NationalCode" field found in Parameters, assuming "US"'
        )
        country = 'US'
        parameters['NationalCode'] = country

    # May exit here
    region_code = parameters.get('eBirdRegion', None)
    if not region_code:
        line1 = f'Region code is required in eBirdRegion field of parameters file\n'
        line2 = f'Parameters file: {self.parameters_path.as_posix()}\n'
        line3 = f'Region files path: {interim_data_path}'
        err_msg = line1 + line2 + line3
        if CRASH_ON_ERROR:
            sys.exit(err_msg)
        else:
            print(err_msg)

    # Normalize CountDate to 'YYYY-MM-DD'; defaults to today when absent.
    # NOTE(review): assumes the spreadsheet cell was parsed as a datetime —
    # a plain string value would make .strftime raise; confirm upstream.
    date_of_count = parameters.get('CountDate', datetime.now())
    parameters['CountDate'] = date_of_count.strftime("%Y-%m-%d")

    if not quiet:
        print('Using these parameters:\n')
        for key, val in parameters.items():
            print(f'{key:<25s}{str(val):<80s}')
        print()

    return parameters
def merge_audubon_results(taxonomy: Taxonomy,
                          local_translation_context: LocalTranslationContext):
    """Convert every Audubon results file under raw_data_path/'AudubonResults'
    to a per-circle checklist, then merge them all into one summary workbook
    written to outputs_path / 'Merged-Audubon-Results.xlsx'.

    :param taxonomy: taxonomy used for checklist conversion/merging
    :param local_translation_context: common-name translation context
    """
    results_path = raw_data_path / 'AudubonResults'
    stem_to_colnames = {}   # output file stem -> summary column title
    parameters = {}         # last circle's parameters; reused for the merged output
    sector_files = []       # per-circle checklist files fed into merge_checklists
    for fpath in results_path.glob('*'):
        # Skip Office lock files ('~$...'), hidden files, and directories
        if fpath.stem.startswith('~$') or fpath.stem.startswith('.') or fpath.is_dir():
            continue
        print(fpath)
        df = read_excel_or_csv_path(fpath)
        # Circle name/code/date, from the dataframe or the parameters file
        ci = extract_circle_info(fpath, df)
        print(f'Name: {ci["Name"]}, Code: {ci["Code"]}, Date: {ci["Date"]}, Count: {df.shape[0]}')
        xparameters = {
            'CountDate': normalize_date_for_details(ci['Date']),
            'FinalChecklistTitle': ci['Name'],
            'CircleAbbrev': ci["Code"],
            'CircleID': ci["Code"],
        }
        # Just need something to pass to write_final_checklist_spreadsheet
        # ToDo: Service-Merge should not depend on loading parameters #36
        parameters = xparameters
        # cleaned_common_names = clean_common_names(df.CommonName, taxonomy,
        #                                           local_translation_context)
        # df.CommonName = cleaned_common_names
        dfcl = dataframe_to_checklist(df, taxonomy, local_translation_context)
        print(f'dfcl: {dfcl.shape}')
        # Output name like 'CAPA-2020-AudubonResults.xlsx'
        year = ci["Date"][0:4]
        fname = f'{ci["Code"]}-{year}-AudubonResults.xlsx'
        outpath = inputs_merge_path / fname
        # Column title shown for this circle in the merged summary
        col_name = f'{ci["Name"]} ({ci["Code"]}) {xparameters["CountDate"]}'
        stem_to_colnames[outpath.stem] = col_name
        sector_files.append(outpath)
        write_final_checklist_spreadsheet(dfcl, outpath, xparameters, None)

    # Pick something local as base
    from common_paths import outputs_path
    summary_base = outputs_path / 'CASJ-2020-Single.xlsx'
    summary, cols_to_hide, cols_to_highlight = merge_checklists(
        summary_base, sector_files, stem_to_colnames, taxonomy,
        local_translation_context)

    output_path = outputs_path / 'Merged-Audubon-Results.xlsx'
    write_final_checklist_spreadsheet(summary,
                                      output_path,
                                      parameters=parameters,
                                      additional_sheets=None,
                                      cols_to_hide=cols_to_hide,
                                      cols_to_highlight=cols_to_highlight
                                      )
def extract_circle_info(fpath: Path, df: pd.DataFrame) -> Optional[dict]:
    """Determine circle name, code and count date for a results file.

    Audubon 'CurrentYearResultsByCount' files carry the info in the
    dataframe itself; otherwise it is looked up in the circle's
    parameters file, matched by filename prefix. Falls back to
    placeholder values when no parameters file exists.

    :param fpath: path of the results file
    :param df: dataframe read from fpath
    :return: dict with 'Name', 'Code' and 'Date' keys
    """
    if 'CurrentYearResultsByCount' in fpath.stem:
        return extract_circle_info_from_audubon_dataframe(df)

    # Filename is expected to start with the circle prefix (first 9
    # characters), e.g. 'CAPA-2020' in CAPA-2020-pacbc_totals.xlsx
    name_prefix = fpath.stem[:9]
    params_path = local_parameters_path / f'{name_prefix}-Parameters.xlsx'
    # print(params_path)
    if not params_path.exists():
        # No parameters file: return recognizable placeholder values
        return {'Name': fpath.stem, 'Code': 'XXXX', 'Date': '2020-12-XX'}

    raw_params = read_excel_or_csv_path(params_path, xheader=None)
    prepared = Parameters().prepare_parameters_from_file(raw_params)
    # print(prepared)
    return {
        'Name': prepared['CircleName'],
        'Code': prepared['CircleAbbrev'],
        'Date': prepared['CountDate'].strftime("%Y-%m-%d"),
    }
def process_annotations_or_rarities(checklist: pd.DataFrame, checklist_path: Path,
                                    circle_prefix: str) -> pd.DataFrame:
    """Mark rare species in the checklist's 'Rare' column with an 'X'.

    An '<circle_prefix>Annotations.xlsx' file, when present, takes
    precedence and is handled by process_annotations. Otherwise the first
    existing '<circle_prefix>Rarities.{xlsx,csv,txt}' file is used:
    spreadsheet variants need 'CommonName' and 'Rare' columns, while the
    .txt variant is a plain list of rare species names.

    :param checklist: checklist to annotate
    :param checklist_path: full checklist path; its directory is searched
        for the annotations/rarities inputs
    :param circle_prefix: file-name prefix, e.g. 'CAPA-'
    :return: checklist with 'Rare' set to 'X' for rare species
    """
    base_dir = checklist_path.parent
    annotations_path = base_dir / f'{circle_prefix}Annotations.xlsx'
    print(f'Annotations path: {annotations_path}')
    if annotations_path.exists():
        return process_annotations(checklist, annotations_path)

    for ext in ['xlsx', 'csv', 'txt']:
        rarities_path = base_dir / f'{circle_prefix}Rarities.{ext}'
        if rarities_path.exists():
            if ext == 'txt':
                rare_species = load_rarities_text(rarities_path)
            else:
                rarities_df = read_excel_or_csv_path(rarities_path)
                rare_mask = rarities_df.Rare == 'X'
                rare_species = list(rarities_df[rare_mask].CommonName.values)
            return process_rarities(checklist, rare_species)

    return checklist
def ground_truths():
    """Load and return the ground-truths spreadsheet from base_path."""
    return read_excel_or_csv_path(base_path / 'ground_truths.xlsx')
def summarize_checklists(personal_checklists: pd.DataFrame,
                         taxonomy: Taxonomy,
                         template_path: Path,
                         parameters: Parameters,
                         checklist_meta: pd.DataFrame,
                         geo_data,
                         location_data,
                         location_meta):
    """Create per-sector eBird summaries and return the observed rarities.

    Builds a single-column summary base from the template, runs
    create_ebird_summary for each sector found in geo_data (writing
    outputs to reports_path), and collects every rare species seen.

    :param personal_checklists: all individual checklist observations
    :param taxonomy: taxonomy for checklist recombination
    :param template_path: summary template spreadsheet
    :param parameters: circle parameters (CircleAbbrev is read)
    :param checklist_meta: per-checklist metadata for the summaries
    :param geo_data: geo records with 'type' ('sector'/'circle') and GeoName
    :param location_data: used to resolve location names from locIds
    :param location_meta: maps GeoName to member locIds
    :return: DataFrame of rarity observations with 'Reason' and 'Where' added
    """
    # Try with up to date 2020 checklist
    # template_path = inputs_path / 'Merge' / 'CASJ-2-SingleChecklist-CASJ-2-checklist2020.xlsx'
    circle_code = parameters.parameters.get('CircleAbbrev', 'XXXX')
    # Load Summary template
    template = read_excel_or_csv_path(template_path)
    template_2col = template.copy()
    # Create a single column master for summary
    summary_base = recombine_transformed_checklist(template_2col, taxonomy)

    # Create EBird Summaries
    unlisted_rare_species = set()
    sectors = sorted(
        list(set(geo_data[geo_data['type'] == 'sector'].GeoName.values)))
    sectors.append('Unspecified')
    # NOTE(review): 'Unspecified' was just appended, so this branch looks
    # unreachable — confirm whether the append should come after this check
    if len(sectors) == 0:
        # Whole-circle summary when no sectors are defined
        sector = geo_data[geo_data['type'] == 'circle'].GeoName.values[0]
        summary, rare_species = create_ebird_summary(
            summary_base, personal_checklists, checklist_meta, circle_code,
            parameters, sector, taxonomy, reports_path)
        for species in rare_species:
            unlisted_rare_species.add(species)
    else:
        # One summary per sector, filtering observations by the sector's locIds
        for sector in sectors:
            sector_subids = location_meta[location_meta.GeoName ==
                                          sector].locId.values
            sector_checklists = personal_checklists[
                personal_checklists.locId.isin(sector_subids)]
            print(
                f'Sector: {sector:30} [{sector_checklists.shape[0]} observations]'
            )
            if sector_checklists.shape[0] == 0:
                continue
            summary, rare_species = create_ebird_summary(
                summary_base, sector_checklists, checklist_meta, circle_code,
                parameters, sector, taxonomy, reports_path)
            for species in rare_species:
                unlisted_rare_species.add(species)

    # Print out rarities (eventually move to somewhere useful)
    # Rarities are the union of species flagged rare in the template and
    # rare species reported but absent from the template
    rare_base = summary_base[summary_base.Rare != ''].CommonName.values
    all_rarities = list(unlisted_rare_species | set(rare_base))
    mask = [cn in all_rarities for cn in personal_checklists.CommonName.values]
    rarities_df = personal_checklists[mask].copy().reset_index(drop=True)
    rarities_df.drop(columns=['groupId',
                              'speciesCode'], inplace=True)
    # , add name from locId])
    rarities_df.sort_values(by=['Name'], inplace=True)
    # 'Missing' = rare species not in the template; 'Explicit' = flagged in template
    rarities_df['Reason'] = rarities_df.CommonName.apply(
        lambda cn: 'Missing' if cn in unlisted_rare_species else 'Explicit')
    rarities_df['Where'] = [
        find_location_name_with_locid(location_data, locid)
        for locid in rarities_df.locId.values
    ]
    # display(rarities_df)
    return rarities_df
def merge_checklists(
        summary_base: Any,
        sector_files: List[Any],
        stem_to_colname: Union[dict, List[str]],
        taxonomy: Taxonomy,
        local_translation_context: LocalTranslationContext
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Merge per-sector checklists into one summary DataFrame.

    Each sector becomes a column of totals; species present in a sector but
    not in the base are appended (rare additions flagged with 'X'). Sort
    orders are refreshed from the taxonomy, Excel SUM formulae are written
    into the 'Total' column, and a grand-totals row is appended.

    Fix: DataFrame.append was removed in pandas 2.0 — both row-append sites
    now use pd.concat with identical results.

    :param summary_base: Path to a template spreadsheet, or a ready
        single-column summary DataFrame
    :param sector_files: per-sector inputs (Paths or DataFrames)
    :param stem_to_colname: dict keyed by file stem, or list indexed by
        position, giving each sector's summary column name
    :param taxonomy: species lookup and sort orders
    :param local_translation_context: common-name cleanup for CSV inputs
    :return: (summary, cols_to_hide, cols_to_highlight) for
        write_final_checklist_spreadsheet
    """
    # Easier to use single column summary_base, but this will transform it if needed
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single column master for summary
        summary_base = recombine_transformed_checklist(template, taxonomy)

    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop
    summary = summary_base.copy()
    sector_unique = 1
    sector_cols = []

    for idx, fpath in enumerate(sector_files):
        # Resolve the summary column name for this sector; fall back to a
        # generated unique name ('X1', 'X2', ...) when no mapping exists
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None
        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1
        sector_cols.append(sector_col)
        print(f'Processing {sector_col}')

        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [xs.lower() for xs in summary_common_names]

        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand
            # made, so clean them up. Double translation takes a long time, so
            # avoid when possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
            xdtypes = {'CommonName': str, 'Total': int}
            checklist = checklist.astype(dtype=xdtypes)
        else:
            # Already a DataFrame
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the
        # checklist is a summary report with a 'Total' row at the bottom,
        # and 'Total' is not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set(
            [xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(
            summary_common_names_lower)
        if names_to_add:
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns),
                                  index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower ==
                                  cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)
            # DataFrame.append was removed in pandas 2.0; concat is equivalent
            summary = pd.concat([summary, pd.DataFrame(rows_to_add)],
                                ignore_index=True)

        has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector
        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0
        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # Fill in totals for existing names, matched case-insensitively
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]
        summary['cnlower'] = summary_common_names_lower
        for ix, row in checklist.iterrows():
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total
        summary.drop(['cnlower'], axis=1, inplace=True)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)
    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been
    # added species)
    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
            drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        return summary

    # Now set the overall total field as Excel SUM formulae over the
    # sector columns (row 1 is the header, so data rows start at 2)
    col_letters = excel_columns()
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]

    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    # DataFrame.append was removed in pandas 2.0; concat a one-row frame
    summary = pd.concat([summary, totals_row.to_frame().T], ignore_index=True)

    # Per-sector Ad/Im detail columns were only needed for the sums above
    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)
    summary.rename(columns={
        'Ad': 'Adult/White',
        'Im': 'Immature/Blue'
    }, inplace=True)

    # Re-order columns
    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph'
    ]
    # NOTE(review): the reorder above drops 'Adult/White'/'Immature/Blue',
    # so these checks appear unreachable — kept to preserve behavior
    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight