Example #1
    def __init__(self, annotated_filename, logger=None):
        """Constructor.

        Args:
            annotated_filename (str): full filename for annotated occurrence CSV file
            logger (object): logger for saving relevant processing messages

        Raises:
            FileNotFoundError: on missing annotated_filename
        """
        if not os.path.exists(annotated_filename):
            raise FileNotFoundError(
                f"File {annotated_filename} does not exist")

        datapath, _ = os.path.split(annotated_filename)
        self._datapath = datapath
        self._csvfile = annotated_filename
        if logger is None:
            logger = get_logger(datapath)
        # Hold all counties found in each state
        self.states = {}
        self._init_states()

        self._log = logger
        # {county_or_state: {species: count, ...}, ...}
        self.locations = {}
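
A minimal usage sketch for this constructor; the class name Aggregator below is a placeholder, since the snippet omits the class statement, and the path is illustrative.

annotated_csv = "/tmp/data/gbif_annotated.csv"    # placeholder path
agg = Aggregator(annotated_csv)    # raises FileNotFoundError if the file is missing
# After construction, agg.states maps each state to the counties found in it,
# and agg.locations will accumulate per-location species counts.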
Example #2
    def __init__(self, riis_filename, logger=None):
        """Constructor sets the authority and species files and headers expected for BISON-RIIS processing.

        Args:
            riis_filename (str): full filename of the RIIS species CSV file; its
                directory is also used to construct the full authority filename
                from relative path constants.
            logger (object): logger for writing messages to file and console

        Raises:
            Exception: on unexpected file header
        """
        datapath, fname_w_ext = os.path.split(riis_filename)
        basename, ext = os.path.splitext(fname_w_ext)

        self._datapath = datapath
        self._csvfile = riis_filename

        if logger is None:
            logger = get_logger(datapath)
        self._log = logger

        self.auth_fname = f"{os.path.join(self._datapath, RIIS_AUTHORITY.FNAME)}.{RIIS.DATA_EXT}"

        # Test and clean headers of non-ascii characters
        self._test_header(self.auth_fname, RIIS_AUTHORITY.HEADER)
        good_header = self._test_header(self._csvfile, RIIS_SPECIES.HEADER)
        if not good_header:
            raise Exception(f"Unexpected file header found in {self._csvfile}")

        # Trimmed and updated Non-native Species List, built from RIIS
        self.by_gbif_taxkey = None
        self.by_riis_id = None
        self.nnsl_header = None
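
A usage sketch based on Example #3, which constructs NNSL from the full species filename and then reads it; the path is a placeholder, and the effect of read_riis on the by_* attributes is inferred rather than shown here.

riis_filename = "/tmp/data/US-RIIS_MasterList.csv"    # placeholder path
nnsl = NNSL(riis_filename)               # raises Exception on an unexpected header
nnsl.read_riis(read_resolved=True)       # presumably fills by_gbif_taxkey and by_riis_id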
Example #3
    def __init__(self, gbif_occ_filename, nnsl=None, logger=None):
        """Constructor.

        Args:
            gbif_occ_filename (str): full path of CSV occurrence file to annotate
            nnsl (bison.common.riis.NNSL): object containing USGS RIIS data for annotating records
            logger (object): logger for saving relevant processing messages
        """
        datapath, _ = os.path.split(gbif_occ_filename)
        self._datapath = datapath
        self._csvfile = gbif_occ_filename

        if logger is None:
            logger = get_logger(datapath)
        self._log = logger

        if nnsl is not None:
            self.nnsl = nnsl
        else:
            riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME)
            self.nnsl = NNSL(riis_filename, logger=logger)
            self.nnsl.read_riis(read_resolved=True)

        # Must georeference points to add new, consistent state and county fields
        self._geo_county = GeoResolver(US_CENSUS_COUNTY.FILE,
                                       US_CENSUS_COUNTY.CENSUS_BISON_MAP,
                                       self._log)

        # Input reader
        self._dwcdata = DwcData(self._csvfile, logger=logger)
        # Output writer
        self._csv_writer = None

        self._conus_states = []
        for k, v in US_STATES.items():
            if k not in ("Alaska", "Hawaii"):
                self._conus_states.extend([k, v])
        self._all_states = self._conus_states.copy()
        self._all_states.extend(["Alaska", "Hawaii", "AK", "HI"])

        # Test DwC record contents
        self.good_locations = {}
        self.bad_locations = {}
        self.missing_states = 0
        self.matched_states = 0
        self.mismatched_states = 0
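
The CONUS list construction above is worth isolating: US_STATES maps full state names to abbreviations, and both forms are kept. A self-contained toy with a three-entry stand-in for the constant:

US_STATES = {"Alaska": "AK", "Hawaii": "HI", "Kansas": "KS"}

conus_states = []
for name, abbr in US_STATES.items():
    if name not in ("Alaska", "Hawaii"):
        conus_states.extend([name, abbr])
all_states = conus_states + ["Alaska", "Hawaii", "AK", "HI"]
print(conus_states)    # ['Kansas', 'KS']
print(all_states)      # ['Kansas', 'KS', 'Alaska', 'Hawaii', 'AK', 'HI']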
Example #4
    def __init__(self, occ_filename, logger=None):
        """Construct an object to read a GBIF datafile.

        Args:
            occ_filename (str): full path of CSV occurrence file to read
            logger (object): logger for saving relevant processing messages
        """
        datapath, _ = os.path.split(occ_filename)
        self._datapath = datapath
        self._csvfile = occ_filename
        if logger is None:
            logger = get_logger(datapath)
        self._log = logger

        # Open file
        self._inf = None

        # CSV DictReader and current record
        self._csv_reader = None
        self.dwcrec = None
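
The constructor only stores state; reading presumably happens later through the csv.DictReader named in the comment. An equivalent plain-Python pattern, with a placeholder path and assuming the tab delimiter of GBIF "simple" downloads:

import csv

occ_filename = "/tmp/data/gbif_occurrences.csv"    # placeholder path
with open(occ_filename, newline="", encoding="utf-8") as inf:
    csv_reader = csv.DictReader(inf, delimiter="\t")
    for dwcrec in csv_reader:
        pass    # each dwcrec is one Darwin Core occurrence record as a dict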
Example #5
                    "Record {} taxon key {} is not an accepted name {}".format(
                        rec[GBIF.ID_FLD], taxkey, taxstatus))
            # Make sure simple CSV data sciname matches name for taxonkey
            if rec[GBIF.NAME_FLD] != taxname:
                self.logit(
                    "Record {} name {} does not match taxon key {} name {}".format(
                        rec[GBIF.ID_FLD], rec[GBIF.NAME_FLD], taxkey, taxname))


# .............................................................................
if __name__ == "__main__":
    logname = "test_gbif"
    csvfile = GBIF.TEST_DATA

    # Test the taxonkey contents in GBIF simple CSV download file
    logger = get_logger(DATA_PATH, logname=logname)

    Tst = TestGBIFData(DATA_PATH, csvfile, logger)
    Tst.test_gbif_name_accepted()
"""
from test.test_gbif import *

logname = "test_gbif"
csvfile = GBIF.TEST_DATA
logger = get_logger(DATA_PATH, logname=logname)

Tst = TestGBIFData(DATA_PATH, csvfile, logger)
Tst.test_gbif_name_accepted()
"""
Example #6
        self.append_dwca_records()
        print(f"   Matched states: {self.matched_states}")
        print(f"   Mis-matched states: {self.mismatched_states}")
        print(f"   Missing states: {self.missing_states}")
        print("   Good states: ")
        for st, counties in self.good_locations.items():
            print(f"  {st}: {counties}")
        print("   Bad states: ")
        for st, counties in self.bad_locations.items():
            print(f"  {st}: {counties}")


# .............................................................................
if __name__ == "__main__":
    # Test record annotation on chunks of the GBIF simple CSV download file
    logger = get_logger(DATA_PATH, logname="test_annotate")
    nnsl_data = NNSL(DATA_PATH, logger=logger)
    big_gbif_fname = os.path.join(DATA_PATH, GBIF.TEST_DATA)

    chunk_fnames = chunk_files(big_gbif_fname)
    for fname in chunk_fnames:
        tst = TestAnnotator(fname, do_resolve=False, logger=logger)
        tst.test_annotate_records()
"""
from test.test_annotate import *

outpath = "/tmp"
logname = "test_annotate"
csvfile = GBIF.TEST_DATA
logger = get_logger(DATA_PATH, logname=logname)
Example #7
        if is_test:
            test_fname = RIIS_SPECIES.TEST_FNAME
        resolved_nnsl = NNSL(DATA_PATH, test_fname=test_fname)
        resolved_nnsl.read_riis(read_resolved=True)

        # Count originals
        for occid in self.nnsl_by_id:
            if occid not in resolved_nnsl.nnsl_by_id:
                logit(self._log, "Missing record {}".format(occid))


# .............................................................................
if __name__ == "__main__":
    logger = get_logger(DATA_PATH, "test_riis_resolve")

    tt = TestRIISTaxonomy(DATA_PATH, logger=logger)
    tt.test_missing_taxon_authority_resolution()
    tt.test_taxonomy_keys()
    tt.test_duplicate_name_localities()
    tt.test_gbif_resolution_inconsistency()
    tt.test_itis_resolution_inconsistency()
    tt = None

    # These overwrite resolved test data RIIS species w/ 100 records
    tt = TestRIISTaxonomy(DATA_PATH,
                          test_fname=RIIS_SPECIES.TEST_FNAME,
                          logger=logger)
    tt.test_resolve_gbif()
    tt.test_resolution_output(is_test=True)
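
The missing-record loop above amounts to a set difference between the original and resolved ID dictionaries. A toy illustration with placeholder contents:

original = {"R1": "rec1", "R2": "rec2"}    # stands in for self.nnsl_by_id
resolved = {"R1": "rec1"}                  # stands in for resolved_nnsl.nnsl_by_id
for occid in sorted(set(original) - set(resolved)):
    print(f"Missing record {occid}")       # -> Missing record R2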
Example #8
    args = parser.parse_args()
    cmd = args.cmd
    big_csv_filename = os.path.join(DATA_PATH, args.big_csv_filename)
    do_split = args.do_split.lower() in ("yes", "y", "true", "1")

    # ...............................................
    # Test data (hardcoded overrides of the parsed arguments above)
    # ...............................................
    cmd = "summarize_assessments"
    big_csv_filename = os.path.join(DATA_PATH, "gbif_2022-03-16_100k.csv")
    # ...............................................
    # ...............................................

    logger = get_logger(DATA_PATH, logname=f"main_{cmd}")
    logger.info(f"Command: {cmd}")
    if cmd == "resolve":
        resolved_riis_filename = resolve_riis_taxa(riis_filename, logger)
        print(resolved_riis_filename)
        log_output(logger, f"Resolved RIIS filename: {resolved_riis_filename}")
    elif cmd == "split":
        input_filenames = find_or_create_subset_files(big_csv_filename, logger)
        log_output(logger, "Input filenames:", outlist=input_filenames)
    else:
        if do_split is True:
            input_filenames = find_or_create_subset_files(
                big_csv_filename, logger)
        else:
            input_filenames = [big_csv_filename]
        # Make sure files to be processed exist
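
The fragment begins after the parser is built. A hedged reconstruction of that parser, inferred only from the attributes used above (cmd, big_csv_filename, do_split); argument choices and defaults are guesses.

import argparse

parser = argparse.ArgumentParser(
    description="Annotate GBIF occurrence data with USGS RIIS assessments.")
parser.add_argument("cmd", help="processing command, e.g. resolve or split")
parser.add_argument("big_csv_filename", help="occurrence CSV name, relative to DATA_PATH")
parser.add_argument("--do_split", default="True",
                    help="whether to split the big CSV into subset files")
args = parser.parse_args()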