Example #1
def process_exceptions(candidate_names: List[str], checklist_path: Path,
                       circle_prefix: str) -> List[str]:
    # e.g. checklist_path = inputs_parse_path / 'CAPA-checklist.xlsx'; only the
    # parent directory and circle prefix matter here
    exceptions_dir = checklist_path.parent

    exceptions_path = exceptions_dir / f'{circle_prefix}Exceptions.xlsx'
    print(f'Exceptions path: {exceptions_path}')

    if not exceptions_path.exists():
        return candidate_names

    exceptions_df = read_excel_or_csv_path(exceptions_path)
    if exceptions_df.empty:
        return candidate_names

    mask_add = exceptions_df.Add == 'X'
    mask_sub = exceptions_df.Subtract == 'X'
    additions = set(exceptions_df[mask_add].CommonName.values)
    subtractions = set(exceptions_df[mask_sub].CommonName.values)
    addstr = ', '.join(additions)
    subst = ', '.join(subtractions)
    print(f'Additions: {addstr}\nSubtractions: {subst}')
    local_names = list((set(candidate_names) | additions) - subtractions)

    return local_names
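
# A minimal runnable sketch (not part of the original module) of the
# Add/Subtract semantics above, skipping the file I/O. The column layout
# ('CommonName' with 'X' marks in 'Add'/'Subtract') mirrors the masks used
# in process_exceptions; the species names are invented.
def _demo_process_exceptions() -> None:
    import pandas as pd
    exceptions_df = pd.DataFrame({
        'CommonName': ['Snow Goose', 'Rock Pigeon'],
        'Add': ['X', ''],
        'Subtract': ['', 'X'],
    })
    additions = set(exceptions_df[exceptions_df.Add == 'X'].CommonName.values)
    subtractions = set(exceptions_df[exceptions_df.Subtract == 'X'].CommonName.values)
    candidate_names = ['Rock Pigeon', 'Mallard']
    print((set(candidate_names) | additions) - subtractions)
    # -> {'Mallard', 'Snow Goose'}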
Example #2
def load_annotations(annotations_path: Path) -> pd.DataFrame:
    # Check for an annotations file
    annotations = pd.DataFrame()
    if annotations_path.exists():
        print(f'Annotations: {annotations_path}')

        annotations = read_excel_or_csv_path(annotations_path)

        # Derive a single-letter Difficulty column ('E'/'M'/'D') from
        # whichever of these columns are present in the Annotations file
        if {'Easy', 'Marginal', 'Difficult'} & set(annotations.columns):
            difficulty = pd.Series([''] * annotations.shape[0])
            for col in ['Easy', 'Marginal', 'Difficult']:
                if col not in annotations.columns:
                    continue  # guard: only some difficulty columns may exist
                mask = [xs == 'X' for xs in annotations[col].values]
                difficulty[mask] = col[0]
            annotations['Difficulty'] = difficulty

        cols_to_keep = [
            col for col in [
                'CommonName', 'Rare', 'Easy', 'Marginal', 'Difficult',
                'Ranging', 'Adult', 'Immature', 'W-morph', 'B-Morph',
                'Difficulty', 'CountSpecial'
            ] if col in annotations.columns
        ]
        annotations = annotations[cols_to_keep]

    return annotations
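
# Runnable sketch (invented data) of the Difficulty derivation above: each
# row keeps one letter, 'E', 'M', or 'D', from whichever of the
# Easy/Marginal/Difficult columns is marked with an 'X'.
def _demo_difficulty_column() -> None:
    import pandas as pd
    annotations = pd.DataFrame({
        'CommonName': ['Gadwall', 'Sora'],
        'Easy': ['X', ''],
        'Marginal': ['', 'X'],
    })
    difficulty = pd.Series([''] * annotations.shape[0])
    for col in ['Easy', 'Marginal', 'Difficult']:
        if col not in annotations.columns:
            continue
        mask = [xs == 'X' for xs in annotations[col].values]
        difficulty[mask] = col[0]
    annotations['Difficulty'] = difficulty
    print(annotations[['CommonName', 'Difficulty']])  # Gadwall -> E, Sora -> M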
Example #3
    def _extract(self) -> str:
        alltext = ''
        df = read_excel_or_csv_path(self.fpath, xheader=None)
        # Flatten every cell, column by column, into newline-separated text.
        # (df.items() replaces iteritems(), which was removed in pandas 2.0)
        for _, column in df.items():
            alltext += '\n'.join(map(str, column.values)) + '\n'

        return alltext
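
# Self-contained sketch of the flattening _extract performs: every cell,
# column by column, becomes one line of text (the frame below is made up,
# mimicking a headerless read).
def _demo_extract_text() -> None:
    import pandas as pd
    df = pd.DataFrame([['CommonName', 'Total'], ['Mallard', 7]])
    alltext = ''
    for _, column in df.items():
        alltext += '\n'.join(map(str, column.values)) + '\n'
    print(alltext)  # 'CommonName\nMallard\nTotal\n7\n'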
Example #4
def raw_csv_to_checklist(fpath: Path, taxonomy: Taxonomy,
                         local_translation_context: LocalTranslationContext,
                         observer_name: str,
                         xdates: List[str]) -> pd.DataFrame:
    csvdf = read_excel_or_csv_path(fpath)
    df = csv_dataframe_to_checklist(csvdf, taxonomy, local_translation_context,
                                    observer_name, xdates)
    if df is None:
        print(f'File {fpath} is not a valid species data file')
    return df
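
# Hypothetical call site (file name, observer, and dates invented):
# csv_dataframe_to_checklist returns None for files that do not parse as
# species data, so callers should check before using the result.
#
#     df = raw_csv_to_checklist(Path('CAPA-2020-sector1.csv'), taxonomy,
#                               local_translation_context, 'A. Observer',
#                               ['2020-12-19'])
#     if df is not None:
#         write_final_checklist_spreadsheet(df, out_path, parameters, None)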
Example #5
    def load_parameters(self, quiet: bool = True) -> dict:
        # Load and normalize parameters
        parameters_df = read_excel_or_csv_path(self.parameters_path,
                                               xheader=None)

        if parameters_df.empty:
            err_msg = f'Parameters file is required: {self.parameters_path.as_posix()}\n'
            if CRASH_ON_ERROR:
                sys.exit(err_msg)
            else:
                print(err_msg)
                return {}  # nothing below can work without parameters

        # The file is stored transposed (key/value rows) for user convenience,
        # so flip it back and promote the first row to column headers
        parameters_df = parameters_df.T
        cols = list(parameters_df.iloc[0].values)
        parameters_df = parameters_df.drop(parameters_df.index[0]).reset_index(drop=True)
        parameters_df.columns = cols

        parameters = parameters_df.iloc[0].to_dict()

        # Look for "NationalCode" first, so we can generate the region files
        # before possible exit below
        country = parameters.get('NationalCode', None)
        if not country:
            print(
                'Warning: no "NationalCode" field found in Parameters, assuming "US"'
            )
            country = 'US'
        parameters['NationalCode'] = country

        # May exit here
        region_code = parameters.get('eBirdRegion', None)
        if not region_code:
            line1 = 'Region code is required in the eBirdRegion field of the parameters file\n'
            line2 = f'Parameters file: {self.parameters_path.as_posix()}\n'
            line3 = f'Region files path: {interim_data_path}'
            err_msg = line1 + line2 + line3
            if CRASH_ON_ERROR:
                sys.exit(err_msg)
            else:
                print(err_msg)

        date_of_count = parameters.get('CountDate', datetime.now())
        parameters['CountDate'] = date_of_count.strftime("%Y-%m-%d")

        if not quiet:
            print('Using these parameters:\n')
            for key, val in parameters.items():
                print(f'{key:<25s}{str(val):<80s}')
            print()

        return parameters
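
# Runnable sketch (invented keys) of the transpose step above: on disk the
# parameters sheet holds key/value rows, which become a one-row frame whose
# first row supplies the column names.
def _demo_parameters_transpose() -> None:
    import pandas as pd
    raw = pd.DataFrame([['CircleAbbrev', 'CAPA'],
                        ['NationalCode', 'US']])  # as read with no header row
    raw = raw.T
    cols = list(raw.iloc[0].values)
    raw = raw.drop(raw.index[0]).reset_index(drop=True)
    raw.columns = cols
    print(raw.iloc[0].to_dict())  # {'CircleAbbrev': 'CAPA', 'NationalCode': 'US'}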
Example #6
def merge_audubon_results(taxonomy: Taxonomy,
                          local_translation_context: LocalTranslationContext):
    results_path = raw_data_path / 'AudubonResults'
    stem_to_colnames = {}
    parameters = {}

    sector_files = []
    for fpath in results_path.glob('*'):
        # Skip Excel lock files (~$), hidden files, and directories
        if fpath.stem.startswith('~$') or fpath.stem.startswith('.') or fpath.is_dir():
            continue
        print(fpath)
        df = read_excel_or_csv_path(fpath)
        ci = extract_circle_info(fpath, df)

        print(f'Name: {ci["Name"]}, Code: {ci["Code"]}, Date: {ci["Date"]}, Count: {df.shape[0]}')

        xparameters = {
            'CountDate': normalize_date_for_details(ci['Date']),
            'FinalChecklistTitle': ci['Name'],
            'CircleAbbrev': ci["Code"],
            'CircleID': ci["Code"],
        }
        # Just need something to pass to write_final_checklist_spreadsheet
        # ToDo: Service-Merge should not depend on loading parameters #36
        parameters = xparameters

        # cleaned_common_names = clean_common_names(df.CommonName, taxonomy,
        #                                           local_translation_context)
        # df.CommonName = cleaned_common_names
        dfcl = dataframe_to_checklist(df, taxonomy, local_translation_context)
        print(f'dfcl: {dfcl.shape}')
        year = ci["Date"][0:4]
        fname = f'{ci["Code"]}-{year}-AudubonResults.xlsx'
        outpath = inputs_merge_path / fname
        col_name = f'{ci["Name"]} ({ci["Code"]}) {xparameters["CountDate"]}'
        stem_to_colnames[outpath.stem] = col_name
        sector_files.append(outpath)

        write_final_checklist_spreadsheet(dfcl, outpath, xparameters, None)

    # Use a local single-column checklist as the merge base (hard-coded for now)
    from common_paths import outputs_path
    summary_base = outputs_path / 'CASJ-2020-Single.xlsx'
    summary, cols_to_hide, cols_to_highlight = merge_checklists(summary_base, sector_files,
                                                                stem_to_colnames, taxonomy,
                                                                local_translation_context)

    output_path = outputs_path / 'Merged-Audubon-Results.xlsx'
    write_final_checklist_spreadsheet(summary, output_path,
                                      parameters=parameters,
                                      additional_sheets=None,
                                      cols_to_hide=cols_to_hide,
                                      cols_to_highlight=cols_to_highlight
                                      )
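
# Runnable sketch (names invented) of the per-file naming scheme above: the
# circle info drives both the sector file stem and the merged column header.
# The real code formats the date via normalize_date_for_details first.
def _demo_audubon_naming() -> None:
    ci = {'Name': 'Palo Alto', 'Code': 'CAPA', 'Date': '2020-12-19'}
    year = ci['Date'][0:4]
    fname = f'{ci["Code"]}-{year}-AudubonResults.xlsx'
    col_name = f'{ci["Name"]} ({ci["Code"]}) {ci["Date"]}'
    print(fname)     # CAPA-2020-AudubonResults.xlsx
    print(col_name)  # Palo Alto (CAPA) 2020-12-19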
Example #7
def extract_circle_info(fpath: Path, df: pd.DataFrame) -> Optional[dict]:
    if 'CurrentYearResultsByCount' in fpath.stem:
        return extract_circle_info_from_audubon_dataframe(df)

    # This assumes the file stem starts with the circle code and year,
    # e.g. 'CAPA-2020-pacbc_totals.xlsx' yields the prefix 'CAPA-2020'
    name_prefix = fpath.stem[0:9]
    ppath = local_parameters_path / f'{name_prefix}-Parameters.xlsx'
    if not ppath.exists():
        return {'Name': fpath.stem, 'Code': 'XXXX', 'Date': '2020-12-XX'}

    parameters_df = read_excel_or_csv_path(ppath, xheader=None)
    parameters_df = Parameters().prepare_parameters_from_file(parameters_df)

    return {
        'Name': parameters_df['CircleName'],
        'Code': parameters_df['CircleAbbrev'],
        'Date': parameters_df['CountDate'].strftime("%Y-%m-%d")
    }
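
# Quick runnable check (file name invented) of the stem-prefix convention
# assumed above: the first nine characters of the stem, circle code plus
# year, select the matching parameters file.
def _demo_circle_prefix() -> None:
    from pathlib import Path
    stem = Path('CAPA-2020-pacbc_totals.xlsx').stem
    print(f'{stem[0:9]}-Parameters.xlsx')  # CAPA-2020-Parameters.xlsx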
Example #8
def process_annotations_or_rarities(checklist: pd.DataFrame,
                                    checklist_path: Path,
                                    circle_prefix: str) -> pd.DataFrame:
    """
    Look for Annotations or Rarities files and mark the 'Rare' column in checklist
    with an 'X'
    Annotations.xlsx must have these columns:
    Rarities.xlsx (or CSV) requires 'CommonName' and 'Rare' columns
    Rarities.txt is just a text list of rare species
    :param circle_prefix:
    :param checklist:
    :param checklist_path: full path for checklist. Used to construct names for inputs
    :return: checklist with 'Rare' column set to 'X' if species is rare
    """
    # Process annotations first: the {circle_prefix}Annotations.xlsx file, when
    # present, takes precedence over any rarities list

    annotations_dir = checklist_path.parent
    annotations_path = annotations_dir / f'{circle_prefix}Annotations.xlsx'
    print(f'Annotations path: {annotations_path}')

    if annotations_path.exists():
        return process_annotations(checklist, annotations_path)

    for ext in ['xlsx', 'csv', 'txt']:
        rarities_path = annotations_dir / f'{circle_prefix}Rarities.{ext}'
        if not rarities_path.exists():
            continue
        if ext == 'txt':
            rare_species = load_rarities_text(rarities_path)
        else:
            rarities_df = read_excel_or_csv_path(rarities_path)
            rare_species = list(
                rarities_df[rarities_df.Rare == 'X'].CommonName.values)

        checklist = process_rarities(checklist, rare_species)
        break

    return checklist
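
# Runnable sketch of the lookup order above: Annotations.xlsx wins outright;
# otherwise the first existing Rarities file among .xlsx/.csv/.txt is used.
# The prefix and species name are invented.
def _demo_rarities_lookup() -> None:
    import tempfile
    from pathlib import Path
    tmpdir = Path(tempfile.mkdtemp())
    (tmpdir / 'CAPA-2020-Rarities.txt').write_text('Snow Goose\n')
    for ext in ['xlsx', 'csv', 'txt']:
        rarities_path = tmpdir / f'CAPA-2020-Rarities.{ext}'
        if rarities_path.exists():
            print(f'Would load: {rarities_path.name}')  # CAPA-2020-Rarities.txt
            break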
Example #9
def ground_truths() -> pd.DataFrame:
    ground_truths_in_path = base_path / 'ground_truths.xlsx'
    ground_truths_in = read_excel_or_csv_path(ground_truths_in_path)

    return ground_truths_in
Example #10
def summarize_checklists(personal_checklists: pd.DataFrame, taxonomy: Taxonomy,
                         template_path: Path, parameters: Parameters,
                         checklist_meta: pd.DataFrame, geo_data, location_data,
                         location_meta) -> pd.DataFrame:

    circle_code = parameters.parameters.get('CircleAbbrev', 'XXXX')

    # Load Summary template
    template = read_excel_or_csv_path(template_path)
    template_2col = template.copy()
    # Create a single column master for summary
    summary_base = recombine_transformed_checklist(template_2col, taxonomy)

    # Create eBird summaries, one per sector; fall back to the whole circle
    # when the geo data defines no sectors
    unlisted_rare_species = set()
    sectors = sorted(set(geo_data[geo_data['type'] == 'sector'].GeoName.values))
    if not sectors:
        sector = geo_data[geo_data['type'] == 'circle'].GeoName.values[0]
        summary, rare_species = create_ebird_summary(
            summary_base, personal_checklists, checklist_meta, circle_code,
            parameters, sector, taxonomy, reports_path)
        for species in rare_species:
            unlisted_rare_species.add(species)
    else:
        sectors.append('Unspecified')
        for sector in sectors:
            sector_subids = location_meta[location_meta.GeoName ==
                                          sector].locId.values
            sector_checklists = personal_checklists[
                personal_checklists.locId.isin(sector_subids)]
            print(
                f'Sector: {sector:30} [{sector_checklists.shape[0]} observations]'
            )
            if sector_checklists.shape[0] == 0:
                continue

            summary, rare_species = create_ebird_summary(
                summary_base, sector_checklists, checklist_meta, circle_code,
                parameters, sector, taxonomy, reports_path)
            for species in rare_species:
                unlisted_rare_species.add(species)

    # Print out rarities (eventually move to somewhere useful)
    rare_base = summary_base[summary_base.Rare != ''].CommonName.values
    all_rarities = list(unlisted_rare_species | set(rare_base))
    mask = [cn in all_rarities for cn in personal_checklists.CommonName.values]
    rarities_df = personal_checklists[mask].copy().reset_index(drop=True)
    rarities_df.drop(columns=['groupId', 'speciesCode'], inplace=True)
    rarities_df.sort_values(by=['Name'], inplace=True)
    rarities_df['Reason'] = rarities_df.CommonName.apply(
        lambda cn: 'Missing' if cn in unlisted_rare_species else 'Explicit')
    rarities_df['Where'] = [
        find_location_name_with_locid(location_data, locid)
        for locid in rarities_df.locId.values
    ]

    return rarities_df
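
# Runnable sketch (made-up locIds) of the per-sector filtering above:
# location_meta maps locIds to sector names, and isin() selects each
# sector's share of the personal checklists.
def _demo_sector_filter() -> None:
    import pandas as pd
    location_meta = pd.DataFrame({'GeoName': ['North', 'North', 'South'],
                                  'locId': ['L1', 'L2', 'L3']})
    personal_checklists = pd.DataFrame({'locId': ['L1', 'L3', 'L2'],
                                        'CommonName': ['Mallard', 'Sora', 'Gadwall']})
    sector_subids = location_meta[location_meta.GeoName == 'North'].locId.values
    print(personal_checklists[personal_checklists.locId.isin(sector_subids)])
    # -> rows for L1 (Mallard) and L2 (Gadwall)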
Example #11
def merge_checklists(
    summary_base: Any, sector_files: List[Any],
    stem_to_colname: Union[dict, List[str]], taxonomy: Taxonomy,
    local_translation_context: LocalTranslationContext
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    # Easier to use a single-column summary_base; transform it if a Path was given
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single-column master for the summary
        summary_base = recombine_transformed_checklist(template, taxonomy)
    # otherwise summary_base is assumed to already be a DataFrame

    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop
    summary = summary_base.copy()
    sector_unique = 1
    sector_cols = []
    for idx, fpath in enumerate(sector_files):
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None

        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1

        sector_cols.append(sector_col)
        print(f'Processing {sector_col}')

        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [
            xs.lower() for xs in summary_common_names
        ]
        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand made,
            # so clean them up. Double translation takes a long time, so avoid when
            # possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(
                    checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
            # print(checklist.Total)
            xdtypes = {'CommonName': str, 'Total': int}
            checklist = checklist.astype(dtype=xdtypes)

            # so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
            # summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)

        else:  # isinstance(summary_base, pd.DataFrame):
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the checklist is a summary
        # report with a 'Total' row at the bottom, and 'Total' is not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set(
            [xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(
            summary_common_names_lower)
        if names_to_add:
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns),
                                  index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower ==
                                  cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)

            # DataFrame.append was removed in pandas 2.0; concat the new rows
            summary = pd.concat([summary, pd.DataFrame(rows_to_add)],
                                ignore_index=True)

        # Track optional age-class columns contributed by this sector
        has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector

        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0

        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # Fill in totals for names already present in the summary
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]

        summary['cnlower'] = summary_common_names_lower
        for _, row in checklist.iterrows():
            # Prefer 'FrozenTotal' over 'Total' when that column exists
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total

        summary.drop(['cnlower'], axis=1, inplace=True)
        # ToDo: the Ad-/Im- per-sector columns created above are never filled
        # in; the commented code below sketches the intended per-species fill
        #     if has_adult_col:
        #         adult_total = checklist[checklist.CommonName == cn]['Ad'].values[0]
        #         summary.loc[summary.CommonName == cn, ad_col] = adult_total
        #     if has_immature_col:
        #         immature_total = checklist[checklist.CommonName == cn]['Im'].values[0]
        #         summary.loc[summary.CommonName == cn, im_col] = immature_total

    # Fill in zeros for missing sector_col values; may have blanks if species added
    # for col in sector_cols:
    #     summary[col] = summary[col].apply(pd.to_numeric).fillna(0)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been added species)

    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)

    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
            drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        return summary, [], []  # keep the declared 3-tuple return shape

    # Now set the overall total field:
    #     sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    # summary['Total'] = summary[sector_cols].apply(pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    col_letters = excel_columns()
    #     team_start_col = col_letters[len(base_columns)]
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]
    # team_start_col = col_letters[index_of_first_subtotal_column(summary)]
    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    # DataFrame.append was removed in pandas 2.0; concat the totals row
    summary = pd.concat([summary, totals_row.to_frame().T], ignore_index=True)

    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)

    summary.rename(columns={'Ad': 'Adult/White', 'Im': 'Immature/Blue'},
                   inplace=True)

    # Re-order columns

    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph'
    ]

    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight
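
# Worked example (hypothetical layout) of the per-row SUM formula built in
# merge_checklists: with the eight standard columns present, sector data
# starts at column I, so with three sectors row 2 gets '=SUM($I2:$K2)'.
def _demo_total_formula() -> None:
    import string
    col_letters = list(string.ascii_uppercase)  # stand-in for excel_columns()
    std_columns = ['Group', 'CommonName', 'Rare', 'Total', 'Category',
                   'TaxonOrder', 'NACC_SORT_ORDER', 'ABA_SORT_ORDER']
    n_sector_cols = 3
    start = col_letters[len(std_columns)]
    end = col_letters[len(std_columns) + n_sector_cols - 1]
    print(f'=SUM(${start}2:${end}2)')  # =SUM($I2:$K2)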