Example #1
0
def show_lines_not_found_in_taxonomy(double_translated, taxonomy: Taxonomy):
    """Report (via print) every local name that has no taxonomy entry.

    double_translated is an iterable of (local_name, flag) pairs; only the
    name is used for the lookup. Purely diagnostic — nothing is returned.
    """
    # summary columns, for reference: ['Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder']
    for local_name, _ in double_translated:
        if taxonomy.find_local_name_row(local_name) is None:
            print(f'Not found in taxonomy: {local_name}')
Example #2
0
def create_row_for_missing_species(
        common_name: str, summary: pd.DataFrame,
        taxonomy: Taxonomy) -> Optional[Tuple[pd.Series, bool]]:
    """Build a blank summary row for a species absent from the official list.

    The name can also be a SPUH, ISSF etc. — anything that wasn't on the
    official list. Returns (row, is_species) where is_species indicates the
    taxonomy categorizes it as a true species (and it was marked Rare), or
    None when the name has no taxonomy entry at all.
    """
    tax_row = taxonomy.find_local_name_row(common_name)
    if tax_row is None:  # not found anywhere — caller should drop it
        return None

    # The number of columns varies by checklist; start with all blanks and
    # fill in only the ones that we know must be there.
    row = pd.Series([''] * len(summary.columns), index=summary.columns)
    row['Group'] = tax_row.SPECIES_GROUP
    row['CommonName'] = common_name
    row['TaxonOrder'] = tax_row.TAXON_ORDER
    row['NACC_SORT_ORDER'] = tax_row.NACC_SORT_ORDER
    row['ABA_SORT_ORDER'] = tax_row.ABA_SORT_ORDER
    row['Category'] = tax_row.Category
    # Filled in later. This is the "Grand Total", not the total from an
    # individual checklist.
    row['Total'] = 0

    # Not on the official list, so mark it Rare if it's a real species
    # (not a SPUH etc.)
    is_species = tax_row.Category == 'species'
    if is_species:
        row['Rare'] = 'X'

    return row, is_species
Example #3
0
def create_category_column(summary: pd.DataFrame, taxonomy: Taxonomy) -> list:
    """Return a taxonomy Category value for each CommonName in summary.

    Names with no taxonomy entry map to ''. Result order matches the
    row order of summary, so it can be assigned directly as a column.
    """
    def category_of(name) -> str:
        row = taxonomy.find_local_name_row(name)
        return '' if row is None else row.Category

    return [category_of(cn) for cn in summary.CommonName.values]
Example #4
0
def filter_additional_rare(taxonomy: Taxonomy,
                           additional_rare: List[str]) -> List[str]:
    """Keep only those names the taxonomy recognizes as true species.

    Names not found in the taxonomy, or categorized as something other
    than 'species' (SPUH, ISSF, ...), are filtered out. Input order is
    preserved.
    """
    def is_species(name: str) -> bool:
        row = taxonomy.find_local_name_row(name)
        return row is not None and row.Category == 'species'

    return [cn for cn in additional_rare if is_species(cn)]
Example #5
0
def clean_common_names(
        common_names: List[str], taxonomy: Taxonomy,
        local_translation_context: LocalTranslationContext) -> List[str]:
    """Normalize raw common-name strings to canonical taxonomy names.

    Each line is pre-processed, run through the local translation table
    twice (a translation's output may itself be translatable), then looked
    up in the taxonomy to restore proper capitalization via comName.
    Names that cannot be resolved become '' in the returned list, which
    is parallel to common_names.
    """
    # skip tertiary_transformation() for now
    preprocessed = [
        secondary_species_processing(pre_process_line(line))
        for line in common_names
    ]

    # Double translation — apply_translations returns a (text, flag) pair;
    # only the text is fed back in for the second pass. Can take a while.
    first_pass = [
        local_translation_context.apply_translations(line.lower(), True)
        for line in preprocessed
    ]
    second_pass = [
        local_translation_context.apply_translations(text.lower(), True)
        for text, _ in first_pass
    ]
    double_translated = [text for text, _ in second_pass]

    # Results may be all lower case; return proper capitalization from the
    # taxonomy. A missing entry yields row=None, whose attribute access
    # raises AttributeError — caught and reported, leaving ''.
    result = []
    for common_name in double_translated:
        xcn = ''
        if common_name != '':  # avoid most common exception
            try:
                row = taxonomy.find_local_name_row(common_name)
                xcn = row.comName
            except AttributeError as ae:
                print(ae)
                print(f'no taxonomy entry for "{common_name}"')
        result.append(xcn)

    return result
Example #6
0
def build_full_tally_sheet(double_translated, fpath: Path, taxonomy: Taxonomy,
                           parameters: Parameters, circle_prefix: str):
    """Build the local checklist ("tally sheet") DataFrame for one circle.

    Takes the (name, flag) pairs from double translation, applies local
    exceptions, ensures base species accompany any ISSF entries, resolves
    each name against the taxonomy, and returns a deduplicated DataFrame
    sorted by TaxonOrder with columns in the preferred display order.

    fpath / circle_prefix are passed through to process_exceptions and
    process_annotations_or_rarities; parameters is currently unused here
    (kept for interface compatibility with callers).
    """
    candidate_names = [x for x, y in double_translated]
    local_names = process_exceptions(candidate_names, fpath, circle_prefix)

    # if issf etc in list, then base species must be also
    issfs = taxonomy.filter_issf(local_names)
    for cn in issfs:
        base_species = taxonomy.report_as(cn)
        if base_species:
            local_names.append(base_species)

    entries = []
    for local_name in local_names:
        # common_name, taxon_order, species_group, NACC_SORT_ORDER
        record = taxonomy.find_local_name_row(local_name)
        if record is not None:
            # e.g. ('White-throated Sparrow', 31943, 'New World Sparrows', 1848.0)
            entry = (record.comName, record.TAXON_ORDER, record.SPECIES_GROUP,
                     record.NACC_SORT_ORDER, record.ABA_SORT_ORDER, '', 0
                     )  # append 'Rare', 'Total'
            entries.append(entry)

    df = pd.DataFrame(entries,
                      columns=[
                          'CommonName', 'TaxonOrder', 'Group',
                          'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Rare', 'Total'
                      ])

    # Re-order columns and sort by taxonomic order.
    # FIX: the original called sort_values(..., inplace=True) on the column
    # slice df[cols], which raises SettingWithCopyWarning and may not sort
    # the data actually used below. Sorting into a fresh frame avoids both.
    cols = [
        'Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    local_checklist = df[cols].sort_values(by='TaxonOrder')

    # double_translated may have duplicates; keep the first occurrence
    local_checklist = local_checklist.drop_duplicates(subset=['CommonName'],
                                                      keep='first')

    local_checklist = process_annotations_or_rarities(local_checklist, fpath,
                                                      circle_prefix)

    # Re-order columns to the preferred layout, keeping only those present
    preferred_order = [
        'Group', 'CommonName', 'Rare', 'D', 'Total', 'Ad', 'Im', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Difficulty', 'Adult', 'Immature',
        'W-morph', 'B-Morph', 'CountSpecial'
    ]
    newcols = [
        col for col in preferred_order if col in local_checklist.columns
    ]
    local_checklist = local_checklist[newcols]

    return local_checklist
Example #7
0
def merge_checklists(
    summary_base: Any, sector_files: List[Any],
    stem_to_colname: Union[dict, List[str]], taxonomy: Taxonomy,
    local_translation_context: LocalTranslationContext
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Merge per-sector checklists into one summary DataFrame.

    Parameters
    ----------
    summary_base : Path to a template checklist file, or a pre-built
        single-column summary DataFrame.
    sector_files : list of Paths (Excel/CSV checklists) and/or DataFrames.
    stem_to_colname : maps each sector file to its summary column name —
        either a dict keyed by file stem (when entries are Paths) or a
        list indexed by position.
    taxonomy : used to look up Group/TaxonOrder/sort orders per species.
    local_translation_context : used to normalize hand-made CSV names.

    Returns
    -------
    (summary, cols_to_hide, cols_to_highlight); on a sort failure only
    the summary DataFrame is returned (see the except branch below).

    Side effects: prints progress/diagnostics; the returned 'Total'
    column contains Excel SUM formulas, not numbers.
    """
    # Easier to use single column summary_base, but this will transform it if needed
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single column master for summary
        summary_base = recombine_transformed_checklist(template, taxonomy)
    elif isinstance(summary_base, pd.DataFrame):
        summary_base = summary_base

    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop: one iteration per sector checklist
    summary = summary_base.copy()
    sector_unique = 1  # counter for fabricating X1, X2, ... column names
    sector_cols = []
    for idx, fpath in enumerate(sector_files):
        # Resolve this sector's summary column name; fall back to a
        # generated unique name on any lookup failure.
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None

        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1

        sector_cols.append(sector_col)
        print(f'Processing {sector_col}')

        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [
            xs.lower() for xs in summary_common_names
        ]
        # Load this sector's checklist (or use the DataFrame directly)
        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand made,
            # so clean them up. Double translation takes a long time, so avoid when
            # possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(
                    checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
            # print(checklist.Total)
            xdtypes = {'CommonName': str, 'Total': int}
            checklist = checklist.astype(dtype=xdtypes)

            # so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
            # summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)

        else:  # isinstance(summary_base, pd.DataFrame):
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the checklist is a summary
        # report with a 'Total' row at the bottom, and 'Total' is not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template.
        # Compare case-insensitively via a temporary 'cnlower' column.
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set(
            [xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(
            summary_common_names_lower)
        if not names_to_add == set():
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns),
                                  index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                # True species not on the template are marked Rare
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower ==
                                  cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)

            # NOTE(review): DataFrame.append was deprecated in pandas 1.4 and
            # removed in 2.0 — this code assumes pandas < 2.0; verify.
            summary = summary.append(rows_to_add, ignore_index=True)

        #
        has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector

        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0

        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # # S
        # # Fill in total for existing names
        # already_present_names = set(summary_common_names) & set(checklist.CommonName)
        # for cn in set(checklist.CommonName):
        #     total = checklist[checklist.CommonName == cn]['Total'].values[0]
        #     summary.loc[summary.CommonName == cn, sector_col] = total

        # print(summary.shape, len(summary_common_names_lower))
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]

        # Copy each checklist row's total into this sector's column,
        # matching case-insensitively on common name.
        summary['cnlower'] = summary_common_names_lower
        for ix, row in checklist.iterrows():
            # if row.Total:
            #     print(row)
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total

        summary.drop(['cnlower'], axis=1, inplace=True)
        #     if has_adult_col:
        #         adult_total = checklist[checklist.CommonName == cn]['Ad'].values[0]
        #         summary.loc[summary.CommonName == cn, ad_col] = adult_total
        #
        #     if has_immature_col:
        #         immature_total = checklist[checklist.CommonName == cn]['Im'].values[0]
        #         summary.loc[summary.CommonName == cn, im_col] = immature_total

    # Fill in zeros for missing sector_col values; may have blanks if species added
    # for col in sector_cols:
    #     summary[col] = summary[col].apply(pd.to_numeric).fillna(0)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been added species)

    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            # A sort order of 0 means "missing"; substitute the sentinel
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            # NOTE(review): ABA fallback also uses INVALID_NACC_SORT_ORDER
            # (not an ABA-specific sentinel) — confirm this is intended.
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)

    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
            drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        # NOTE(review): early return here yields a bare DataFrame rather
        # than the documented 3-tuple; callers must tolerate both shapes.
        return summary

    # Now set the overall total field:
    #     sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    # summary['Total'] = summary[sector_cols].apply(pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Build per-row Excel SUM formulas spanning the sector columns.
    # Row indices start at 2 because Excel is 1-based with a header row.
    col_letters = excel_columns()
    #     team_start_col = col_letters[len(base_columns)]
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]
    # team_start_col = col_letters[index_of_first_subtotal_column(summary)]
    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    # NOTE(review): DataFrame.append removed in pandas 2.0 — see note above.
    summary = summary.append(totals_row, ignore_index=True)

    # Per-sector Ad-/Im- scratch columns were summed into 'Ad'/'Im'; drop them
    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)

    summary.rename(columns={
        'Ad': 'Adult/White',
        'Im': 'Immature/Blue'
    },
                   inplace=True)

    # Re-order columns
    # print(sector_cols)
    # print(summary.columns)

    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph'
    ]

    # Hide Adult/Immature columns when they are entirely zero
    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight