def __init__(self, annotated_filename, logger=None):
    """Constructor.

    Args:
        annotated_filename (str): full filename for annotated occurrence CSV file
        logger (object): logger for saving relevant processing messages

    Raises:
        FileNotFoundError: if annotated_filename does not exist
    """
    if not os.path.exists(annotated_filename):
        raise FileNotFoundError(f"File {annotated_filename} does not exist")
    datapath, _ = os.path.split(annotated_filename)
    self._datapath = datapath
    self._csvfile = annotated_filename
    if logger is None:
        logger = get_logger(datapath)
    self._log = logger
    # Hold all counties found in each state
    self.states = {}
    self._init_states()
    # {county_or_state: {species: count, ...}, ...}
    self.locations = {}
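    # A minimal usage sketch for this constructor (hedged: the enclosing class
    # name is not shown in this excerpt, so `Aggregator` below is a
    # hypothetical stand-in):
    #
    #   logger = get_logger("/path/to/data")
    #   agg = Aggregator("/path/to/data/annotated_occurrences.csv", logger=logger)
    #   # After construction, agg.states holds the counties found in each state,
    #   # and agg.locations will map county_or_state -> {species: count, ...}.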
def __init__(self, riis_filename, logger=None):
    """Constructor sets the authority and species files and headers expected for BISON-RIIS processing.

    Args:
        riis_filename (str): full path to the RIIS species CSV file; its
            directory is used to construct full filenames for related data.
        logger (object): logger for writing messages to file and console

    Raises:
        Exception: on unexpected file header
    """
    datapath, fname_w_ext = os.path.split(riis_filename)
    basename, ext = os.path.splitext(fname_w_ext)
    self._datapath = datapath
    self._csvfile = riis_filename
    if logger is None:
        logger = get_logger(datapath)
    self._log = logger
    self.auth_fname = f"{os.path.join(self._datapath, RIIS_AUTHORITY.FNAME)}.{RIIS.DATA_EXT}"
    # Test and clean headers of non-ascii characters
    self._test_header(self.auth_fname, RIIS_AUTHORITY.HEADER)
    good_header = self._test_header(self._csvfile, RIIS_SPECIES.HEADER)
    if good_header is False:
        raise Exception(f"Unexpected file header found in {self._csvfile}")
    # Trimmed and updated Non-native Species List, built from RIIS
    self.by_gbif_taxkey = None
    self.by_riis_id = None
    self.nnsl_header = None
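    # A minimal usage sketch, grounded in how the annotator below constructs
    # and reads an NNSL (`read_riis` appears there); the dict-style lookups
    # are assumptions based on the attribute names, not confirmed here:
    #
    #   riis_filename = os.path.join(DATA_PATH, RIIS_SPECIES.FNAME)
    #   nnsl = NNSL(riis_filename)
    #   nnsl.read_riis(read_resolved=True)
    #   recs = nnsl.by_gbif_taxkey[taxkey]      # records for one GBIF taxon key
    #   rec = nnsl.by_riis_id[occurrence_id]    # record for one RIIS identifier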
def __init__(self, gbif_occ_filename, nnsl=None, logger=None):
    """Constructor.

    Args:
        gbif_occ_filename (str): full path of CSV occurrence file to annotate
        nnsl (bison.common.riis.NNSL): object containing USGS RIIS data for
            annotating records
        logger (object): logger for saving relevant processing messages
    """
    datapath, _ = os.path.split(gbif_occ_filename)
    self._datapath = datapath
    self._csvfile = gbif_occ_filename
    if logger is None:
        logger = get_logger(datapath)
    self._log = logger
    if nnsl is not None:
        self.nnsl = nnsl
    else:
        riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME)
        self.nnsl = NNSL(riis_filename, logger=logger)
        self.nnsl.read_riis(read_resolved=True)
    # Must georeference points to add new, consistent state and county fields
    self._geo_county = GeoResolver(
        US_CENSUS_COUNTY.FILE, US_CENSUS_COUNTY.CENSUS_BISON_MAP, self._log)
    # Input reader
    self._dwcdata = DwcData(self._csvfile, logger=logger)
    # Output writer
    self._csv_writer = None
    # Full names and abbreviations of continental US states
    self._conus_states = []
    for k, v in US_STATES.items():
        if k not in ("Alaska", "Hawaii"):
            self._conus_states.extend([k, v])
    self._all_states = self._conus_states.copy()
    self._all_states.extend(["Alaska", "Hawaii", "AK", "HI"])
    # Test DwC record contents
    self.good_locations = {}
    self.bad_locations = {}
    self.missing_states = 0
    self.matched_states = 0
    self.mismatched_states = 0
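    # A minimal usage sketch (hedged: the enclosing class name is not shown in
    # this excerpt, so `Annotator` is a hypothetical stand-in; the construction
    # arguments match this constructor's signature):
    #
    #   nnsl = NNSL(os.path.join(DATA_PATH, RIIS_SPECIES.FNAME))
    #   nnsl.read_riis(read_resolved=True)
    #   ant = Annotator("/path/to/gbif_subset.csv", nnsl=nnsl)
    #   # Omitting nnsl makes the constructor build and read the NNSL itself
    #   # from RIIS_SPECIES.FNAME in the input file's directory.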
def __init__(self, occ_filename, logger=None):
    """Construct an object to read a GBIF datafile.

    Args:
        occ_filename (str): full path of CSV occurrence file to read
        logger (object): logger for saving relevant processing messages
    """
    datapath, _ = os.path.split(occ_filename)
    self._datapath = datapath
    self._csvfile = occ_filename
    if logger is None:
        logger = get_logger(datapath)
    self._log = logger
    # Open file
    self._inf = None
    # CSV DictReader and current record
    self._csv_reader = None
    self.dwcrec = None
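    # A minimal usage sketch (hedged: only construction is confirmed, by the
    # annotator above; `open`/`get_record`/`close` are hypothetical method
    # names inferred from the _inf, _csv_reader, and dwcrec attributes):
    #
    #   dwcdata = DwcData("/path/to/gbif_subset.csv")
    #   dwcdata.open()
    #   rec = dwcdata.get_record()
    #   while rec is not None:
    #       ...                        # process one DwC record (a dict)
    #       rec = dwcdata.get_record()
    #   dwcdata.close()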
"Record {} taxon key {} is not an accepted name {}".format( rec[GBIF.ID_FLD], taxkey, taxstatus)) # Make sure simple CSV data sciname matches name for taxonkey if rec[GBIF.NAME_FLD] != taxname: self.logit( "Record {} name {} does not match taxon key {} name {}". format(rec[GBIF.ID_FLD], rec[GBIF.NAME_FLD], taxkey, taxname)) # ............................................................................. if __name__ == "__main__": logname = "test_gbif" csvfile = GBIF.TEST_DATA # Test the taxonkey contents in GBIF simple CSV download file logger = get_logger(DATA_PATH, logname=logname) Tst = TestGBIFData(DATA_PATH, csvfile, logger) Tst.test_gbif_name_accepted() """ from test.test_gbif import * logname = "test_gbif" csvfile = GBIF.TEST_DATA logger = get_logger(DATA_PATH, logname=logname) Tst = TestGBIFData(DATA_PATH, csvfile, logger) Tst.test_gbif_name_accepted() """
        self.append_dwca_records()
        print(f"  Matched states: {self.matched_states}")
        print(f"  Mis-matched states: {self.mismatched_states}")
        print(f"  Missing states: {self.missing_states}")
        print("  Good states:")
        for st, counties in self.good_locations.items():
            print(f"    {st}: {counties}")
        print("  Bad states:")
        for st, counties in self.bad_locations.items():
            print(f"    {st}: {counties}")


# .............................................................................
if __name__ == "__main__":
    # Test annotation of records in chunks of the GBIF simple CSV download file
    logger = get_logger(DATA_PATH, logname="test_annotate")
    nnsl_data = NNSL(DATA_PATH, logger=logger)
    big_gbif_fname = os.path.join(DATA_PATH, GBIF.TEST_DATA)
    chunk_fnames = chunk_files(big_gbif_fname)
    for fname in chunk_fnames:
        tst = TestAnnotator(fname, do_resolve=False, logger=logger)
        tst.test_annotate_records()

"""
from test.test_annotate import *

outpath = "/tmp"
logname = "test_annotate"
csvfile = GBIF.TEST_DATA
logger = get_logger(DATA_PATH, logname=logname)
        # Default to None when not testing, so the name is always bound
        test_fname = None
        if is_test:
            test_fname = RIIS_SPECIES.TEST_FNAME
        resolved_nnsl = NNSL(DATA_PATH, test_fname=test_fname)
        resolved_nnsl.read_riis(read_resolved=True)
        # Count originals
        for occid in self.nnsl_by_id.keys():
            try:
                resolved_nnsl.nnsl_by_id[occid]
            except KeyError:
                logit(self._log, "Missing record {}".format(occid))


# .............................................................................
if __name__ == "__main__":
    logger = get_logger(DATA_PATH, "test_riis_resolve")
    tt = TestRIISTaxonomy(DATA_PATH, logger=logger)
    tt.test_missing_taxon_authority_resolution()
    tt.test_taxonomy_keys()
    tt.test_duplicate_name_localities()
    tt.test_gbif_resolution_inconsistency()
    tt.test_itis_resolution_inconsistency()
    tt = None

    # These overwrite resolved test data RIIS species w/ 100 records
    tt = TestRIISTaxonomy(
        DATA_PATH, test_fname=RIIS_SPECIES.TEST_FNAME, logger=logger)
    tt.test_resolve_gbif()
    tt.test_resolution_output(is_test=True)
    args = parser.parse_args()
    cmd = args.cmd
    big_csv_filename = os.path.join(DATA_PATH, args.big_csv_filename)
    do_split = args.do_split.lower() in ("yes", "y", "true", "1")

    # ...............................................
    # Test data: these assignments override the parsed command-line arguments
    # ...............................................
    cmd = "summarize_assessments"
    big_csv_filename = os.path.join(DATA_PATH, "gbif_2022-03-16_100k.csv")
    # ...............................................
    # ...............................................

    logger = get_logger(DATA_PATH, logname=f"main_{cmd}")
    logger.info(f"Command: {cmd}")
    if cmd == "resolve":
        resolved_riis_filename = resolve_riis_taxa(riis_filename, logger)
        print(resolved_riis_filename)
        log_output(logger, f"Resolved RIIS filename: {resolved_riis_filename}")
    elif cmd == "split":
        input_filenames = find_or_create_subset_files(big_csv_filename, logger)
        log_output(logger, "Input filenames:", outlist=input_filenames)
    else:
        if do_split:
            input_filenames = find_or_create_subset_files(
                big_csv_filename, logger)
        else:
            input_filenames = [big_csv_filename]
        # Make sure files to be processed exist