def find_geoid(self, geoid: str, place: Loc) -> None:
    """
    Look up a place by its geoid and record the outcome in place.

    The lookup is tried once as-is; if it yields no rows it is repeated
    with admin=True.

    #Args:
        geoid: Geonames.org geoid
        place: Loc instance. Its geoid, georow_list and result_type are updated

    #Returns:
        None. place.result_type is STRONG_MATCH (and process_results has been
        applied) when a row was found, otherwise NO_MATCH
    """
    flags = ResultFlags(limited=False, filtered=False)
    place.geoid = geoid
    place.georow_list.clear()

    self.geo_build.geodb.s.lookup_geoid(
        georow_list=place.georow_list, geoid=place.geoid, place=place)
    if not place.georow_list:
        # First pass found nothing - retry with the admin flag set
        self.geo_build.geodb.s.lookup_geoid(
            georow_list=place.georow_list, geoid=place.geoid, place=place, admin=True)

    if not place.georow_list:
        place.result_type = GeoUtil.Result.NO_MATCH
        return

    place.result_type = GeoUtil.Result.STRONG_MATCH
    self.process_results(place=place, flags=flags)
def process_results(self, place: Loc, flags) -> None:
    """
    Update place from the lookup outcome it currently holds.

    * A NOT_SUPPORTED result sets place.place_type to COUNTRY.
    * If rows exist and the result is one of GeoUtil.successful_match, the first
      row of place.georow_list is copied into place.
    * If rows exist and the result is neither successful nor NOT_SUPPORTED, the
      result is changed to PARTIAL_MATCH.

    #Args:
        place: Loc instance
        flags: result flags; not read by this method

    #Returns:
        None. place instance fields are updated
    """
    result_codes = GeoUtil.Result

    if place.result_type == result_codes.NOT_SUPPORTED:
        place.place_type = Loc.PlaceType.COUNTRY

    if len(place.georow_list) > 0:
        if place.result_type in GeoUtil.successful_match:
            best_row = place.georow_list[0]
            self.geo_build.geodb.copy_georow_to_place(row=best_row, place=place, fast=False)
        elif place.result_type != result_codes.NOT_SUPPORTED:
            place.result_type = result_codes.PARTIAL_MATCH

    place.set_place_type_text()
def _find_type_as_city(self, place: Loc, typ) -> int:
    """
    Do a city lookup using the field selected by typ as the city name.
    E.g. if typ is PlaceType.ADMIN2 then place.admin2_name is moved into
    place.city (and admin2_name is blanked) before the lookup.

    #Args:
        place: Loc instance
        typ: Loc.PlaceType - Specifies which field to use as target for lookup

    #Returns:
        Best score reported by the lookup. 999 when no lookup was done
        (selected field was empty or typ was not recognised).
        place.georow_list is updated by the lookups
    """
    field_label = ''

    if typ == Loc.PlaceType.CITY:
        # City is already in place.city - use as-is
        field_label = 'City'
    elif typ == Loc.PlaceType.ADMIN2:
        if place.admin2_name != '':
            place.city = place.admin2_name
            place.admin2_name = ''
            field_label = 'Admin2'
    elif typ == Loc.PlaceType.PREFIX:
        if place.prefix != '':
            place.city = place.prefix
            field_label = 'Prefix'
    elif typ == Loc.PlaceType.ADVANCED_SEARCH:
        return self.geo_build.geodb.lookup_place(place=place)
    else:
        self.logger.warning(f'Unknown TYPE {typ}')

    if field_label == '':
        # Nothing to search with
        return 999

    self.logger.debug(
        f'2) Try {field_label} as City. Target={place.city} pref [{place.prefix}] ')
    place.place_type = Loc.PlaceType.CITY
    score = self.geo_build.geodb.s.lookup_place(place=place)
    self.logger.debug(f'best={score}')

    if score >= MatchScore.Score.POOR_CUTOFF:
        # Score at or above the poor cutoff - fall back to deep_lookup
        self.logger.debug('--- DEEP SEARCH ADM2 ---')
        score = self.geo_build.geodb.s.deep_lookup(place=place)
    return score
def run_test_score(idx) -> int:
    """
    Prepare test case idx, score the input place against the result place
    and log the score with both normalized titles.

    #Args:
        idx: test case identifier handed to TestScoring.prepare_test

    #Returns:
        Value returned by TestScoring.scoring.match_score
    """
    source_place = Loc.Loc()
    expected_place = Loc.Loc()
    TestScoring.prepare_test(idx, source_place, expected_place)

    match_value = TestScoring.scoring.match_score(source_place, expected_place)

    source_title = MatchScore.full_normalized_title(source_place)
    expected_title = MatchScore.full_normalized_title(expected_place)
    message = f' {idx}) {match_value:.1f} In=[{source_title}] Out=[{expected_title}]'
    TestScoring.logger.debug(message)
    return match_value
def __init__(self, directory_name: str, display_progress,
             show_message: bool, exit_on_error: bool,
             languages_list_dct, feature_code_list_dct,
             supported_countries_dct, volume=''):
    """
    Set up logging and state, then create the SpellBuild.GeodataBuild helper.

    #Args:
        directory_name: directory where geoname.org files are. DB will be in 'cache' folder under this
        display_progress: None or function to display progress(percent_done:int, msg:str)
        show_message: If True - show TKInter message dialog on error
        exit_on_error: If True - exit on significant error
        languages_list_dct: Dictionary of ISO-2 languages to import from AlternateNamesV2.txt
        feature_code_list_dct: Dictionary of Geoname Feature codes to import into DB
        supported_countries_dct: Dictionary of ISO-2 Country codes to import into DB
        volume: passed through unchanged to SpellBuild.GeodataBuild
    """
    self.logger = logging.getLogger(__name__)
    self.display_progress = display_progress
    self.save_place: Loc = Loc.Loc()
    self.miss_diag_file = None

    # Value to determine if two lat/longs are similar based on Rectilinear Distance
    self.distance_cutoff = 0.6

    builder_options = {
        'display_progress': self.display_progress,
        'show_message': show_message,
        'exit_on_error': exit_on_error,
        'languages_list_dct': languages_list_dct,
        'feature_code_list_dct': feature_code_list_dct,
        'supported_countries_dct': supported_countries_dct,
        'volume': volume,
    }
    self.geo_build = SpellBuild.GeodataBuild(str(directory_name), **builder_options)
def find_best_match(self, location: str, place: Loc) -> bool:
    """
    Find the best scoring match for this location.

    Runs find_matches, filters the results, and keeps only the first
    remaining row. In every successful case place.prefix is cleaned up
    against the long name of the place.

    #Args:
        location: location name, e.g. Los Angeles, California, USA
        place: Loc instance

    #Returns:
        True if at least one row remained after filtering, or if no row remained
        but find_matches returned a code in GeoUtil.successful_match. Otherwise False.
        place is updated with the match data
    """
    # Parse and look up the location, then reduce the candidates
    res = self.find_matches(location, place)
    flags = self.filter_results(place)

    if not place.georow_list:
        if res not in GeoUtil.successful_match:
            return False
    else:
        # Keep only the first entry and copy it into place
        place.georow_list = place.georow_list[:1]
        self.process_results(place=place, flags=flags)
        place.set_place_type()

    long_name = f'{place.get_long_name(self.geo_build.output_replace_dct)}'
    place.prefix = place.prefix_cleanup(place.prefix, long_name)
    return True
def run_test_inscore(idx) -> int:
    """
    Prepare test case idx, compute the weighted input score of the target
    tokens against the result tokens, and print it.

    #Args:
        idx: test case identifier handed to TestScoring.prepare_test

    #Returns:
        Value returned by TestScoring.scoring._calculate_weighted_score
    """
    def four_part(loc) -> str:
        # city,admin2,admin1,country - as shown in the debug log
        return f'{loc.city},{loc.admin2_name},{loc.admin1_name},{loc.country_name}'

    target_place = Loc.Loc()
    result_place = Loc.Loc()

    TestScoring.logger.debug('TEST INPUT SCORE:')
    TestScoring.prepare_test(idx, target_place, result_place)
    TestScoring.logger.debug(
        f'prepare_test: INP={four_part(target_place)} RES={four_part(result_place)}')

    # Only the token lists are needed; the two title strings are discarded
    _, result_tokens, _, target_tokens = MatchScore._prepare_input(target_place, result_place)

    # Score for the share of input target text that matched the result
    in_score = TestScoring.scoring._calculate_weighted_score(target_tokens, result_tokens)

    print(f'#{idx} SCORE={in_score:.1f} In={in_score:.1f}[{target_place.original_entry.title().lower()}] [{result_place.get_five_part_title()}]')
    return in_score
def _lookup_city_as_admin2(self, place: Loc, result_list) -> int:
    """
    Move place.city into place.admin2_name and look the place up as ADMIN2.

    #Args:
        place: Loc instance. city is blanked, admin2_name and place_type are overwritten
        result_list: list that the rows found by the lookup are appended to

    #Returns:
        Score returned by geodb.lookup_place
    """
    # Re-interpret the city token as an admin2 name
    place.admin2_name, place.city = place.city, ''
    place.place_type = Loc.PlaceType.ADMIN2

    self.logger.debug(
        f' Try admin2 [{place.admin2_name}] as city [{place.get_five_part_title()}]')

    score = self.geo_build.geodb.lookup_place(place=place)
    result_list.extend(place.georow_list)
    return score
def lookup_place(self, place: Loc) -> float:
    """
    **Lookup a place in geoname.org db**

    Lookup is based on place.place_type as follows:
        Loc.PlaceType.COUNTRY: no search is run. If place.georow_list already
            holds rows, place.country_name is refreshed through get_country_name
            and the score is MatchScore.Score.VERY_GOOD.
        Otherwise: place.georow_list is cleared and self._search is run on
            place.city (with feature forced to "ADM1" for Loc.PlaceType.ADMIN1).

    place.result_type is set to Result.STRONG_MATCH on entry in all cases.

    # Args:
        place: Loc instance. Call Loc.parse_place() before calling lookup_place()

    # Returns:
        Best score found (MatchScore.Score.VERY_POOR if nothing was done).
        place.georow_list contains the list of matching entries
    """
    place.result_type = Result.STRONG_MATCH
    best_score = MatchScore.Score.VERY_POOR

    if place.place_type == Loc.PlaceType.COUNTRY:
        # Country: only accepted when rows are already present
        if place.georow_list:
            place.country_name = self.get_country_name(place.country_name)
            best_score = MatchScore.Score.VERY_GOOD
    else:
        # General search
        if place.place_type == Loc.PlaceType.ADMIN1:
            place.feature = "ADM1"

        place.georow_list.clear()
        best_score = self._search(place=place,
                                  georow_list=place.georow_list,
                                  name=place.city,
                                  admin1_id=place.admin1_id,
                                  admin2_id=place.admin2_id,
                                  iso=place.country_iso,
                                  feature=place.feature,
                                  sdx=get_soundex(place.city))

    self.logger.debug(f'**LOOKUP PLACE score={best_score}')
    return best_score
def is_country_valid(self, place: Loc) -> bool:
    """
    Check that place has a country ISO code and that self._verify_iso accepts it.

    #Args:
        place: Loc instance. result_type is set to NO_COUNTRY when country_iso is
            empty; result_type is set to NOT_SUPPORTED and place_type to COUNTRY
            when the code is rejected

    #Returns:
        True if country is valid
    """
    iso = place.country_iso

    if iso == '':
        place.result_type = GeoUtil.Result.NO_COUNTRY
        return False

    if self._verify_iso(iso):
        return True

    self.logger.debug(f'Country [{place.country_iso}] not supported')
    place.result_type = GeoUtil.Result.NOT_SUPPORTED
    place.place_type = Loc.PlaceType.COUNTRY
    return False
def add_alias_to_db(self, ky: str, geo_build: SpellBuild):
    """
    Insert a copy of an alias's main entry into the database under the alias name.

    The alias row for ky is read from alias_list, its main entry is looked up
    as a city, and the first row found (columns up to and including
    GeoUtil.Entry.SDX) is renamed to ky and inserted.

    #Args:
        ky: alias name; key into alias_list
        geo_build: builder providing geodb, update_geo_row_name and insert

    #Returns:
        None. Nothing is inserted when the lookup finds no row or the first row is empty
    """
    # NOTE(review): alias_list.get() returns None for an unknown key, which
    # would fail on the subscripts below - confirm callers only pass known keys
    alias_row = alias_list.get(ky)

    place = Loc.Loc()
    place.country_iso = alias_row[ALIAS_ISO].lower()
    place.city = alias_row[ALIAS_NAME]
    place.feature = alias_row[ALIAS_FEAT]
    place.place_type = Loc.PlaceType.CITY

    # Lookup main entry and get GEOID
    geo_build.geodb.s.lookup_place(place)

    if len(place.georow_list) == 0 or len(place.georow_list[0]) == 0:
        return

    main_row = place.georow_list[0]
    geo_row = list(main_row[0:GeoUtil.Entry.SDX + 1])
    geo_build.update_geo_row_name(geo_row=geo_row, name=ky)
    geo_build.insert(geo_tuple=tuple(geo_row), feat_code=alias_row[ALIAS_FEAT])
def __init__(self, geodb, lang='ut8'):
    """
    Initialise search state and helper objects.

    Args:
        geodb: GeoDB instance
        lang: preferred ISO language code
    """
    # NOTE(review): default 'ut8' does not look like an ISO language code - confirm intent
    self.lang = lang
    self.geodb = geodb
    self.logger = logging.getLogger(__name__)

    # Flags and counters
    self.detailed_debug = True
    self.use_wildcards = True
    self.start = 0
    self.total_lookups = 0

    # Lookup state
    self.cache = {}
    self.place_type = ''
    self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'

    # Helper objects
    self.match = MatchScore.MatchScore()
    self.norm = Normalize.Normalize()
    self.place = Loc.Loc()
def find_matches(self, location: str, place: Loc):
    """
    Find a location in the geoname database. On successful match, place.georow_list
    will contain a list of georows that matched the name. Each georow can be copied
    to a Loc structure by calling process_result

    Search steps (only when the parsed place type is not COUNTRY, ADMIN1 or ADMIN2):
        1) Standard lookup based on the parsed fields, followed by a deep lookup
           when the best score is at or above MatchScore.Score.POOR_CUTOFF
        2) If an admin2 name was parsed, retry using it as the city, again followed
           by a deep lookup on a poor score
    Rows from all steps are merged, scored, filtered and processed.

    #Args:
        location: comma separated name of location to find, e.g. 'Los Angeles, California, USA'
        place: Loc structure

    #Returns:
        GeoUtil.Result code (place.result_type)
    """
    place.parse_place(place_name=location, geobuild=self.geo_build)

    self.is_country_valid(place)
    if place.result_type == GeoUtil.Result.NOT_SUPPORTED:
        place.georow_list.clear()
        # Return the result code like every other exit path does. Callers such as
        # find_best_match test the return value against GeoUtil.successful_match.
        return place.result_type

    # Create full entry text
    place.update_names(self.geo_build.output_replace_dct)

    flags = ResultFlags(limited=False, filtered=False)
    # We will do different search types and append all results into result_list
    result_list = []

    # Save a shallow copy of place so we can restore fields
    self.save_place = copy.copy(place)

    # After parsing, last token is either country or underscore.
    # Second to last is either Admin1 or underscore
    # If >2 tokens: token[0] is placed in City and in Prefix
    # If >3 tokens: token[1] is placed in Admin2 and appended to Prefix

    # 1) Try lookup based on standard parsing: lookup city, county, state/province, or country as parsed
    self.logger.debug(
        f' 1) Standard, based on parsing. pref [{place.prefix}] city [{place.city}]'
        f' sdx={GeoSearch.get_soundex(place.city)} '
        f'feat={place.feature} typ=[{place.place_type}]')

    admin_level_types = (Loc.PlaceType.COUNTRY, Loc.PlaceType.ADMIN1, Loc.PlaceType.ADMIN2)
    if place.place_type in admin_level_types:
        self.logger.debug('DONE. type is country, adm1, or adm2')
        return place.result_type

    self.logger.debug('find std place - not ADM* ')
    best_score = self.geo_build.geodb.s.lookup_place(place=place)
    self.logger.debug(f'std: best={best_score}')

    if place.georow_list:
        result_list.extend(place.georow_list)

    if best_score >= MatchScore.Score.POOR_CUTOFF:
        # No good matches found. Try a deep search on soundex of combinations of terms
        self.logger.debug('--- DEEP SEARCH city ---')
        best_score = self.geo_build.geodb.s.deep_lookup(place=place)
        if place.georow_list:
            result_list.extend(place.georow_list)

    # Restore fields
    self._restore_fields(place, self.save_place)

    # 2) Try second token (Admin2) as a city
    if place.admin2_name != '':
        self.logger.debug('try 2nd token as city')
        place.georow_list.clear()
        best_score = self._find_type_as_city(place, Loc.PlaceType.ADMIN2)
        self.logger.debug(f'2nd token best={best_score}')
        if place.georow_list:
            result_list.extend(place.georow_list)

        # See if we found any good scoring matches
        if best_score >= MatchScore.Score.POOR_CUTOFF:
            # No good matches found. Try a deep search on soundex of combinations of terms
            self.logger.debug('--- DEEP SEARCH city ---')
            best_score = self.geo_build.geodb.s.deep_lookup(place=place)
            if place.georow_list:
                result_list.extend(place.georow_list)

        self._restore_fields(place, self.save_place)

    # Move result_list into place georow list
    place.georow_list.clear()
    place.georow_list.extend(result_list)

    if len(place.georow_list) > 0:
        # The returned best score is not needed here
        self.geo_build.geodb._assign_scores(place.georow_list, place, '', fast=False, quiet=True)
        self.process_results(place=place, flags=flags)
        flags = self.filter_results(place)

    if len(place.georow_list) == 0:
        # NO MATCH - unless a country-level result code was already recorded
        if place.result_type != GeoUtil.Result.NO_COUNTRY and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
            place.result_type = GeoUtil.Result.NO_MATCH
            self.logger.debug('Not found.')
        else:
            self.logger.debug('Found country')
    elif len(place.georow_list) > 1:
        self.logger.debug(f'Success! {len(place.georow_list)} matches')
        place.result_type = GeoUtil.Result.MULTIPLE_MATCHES

    # Process the results
    self.process_results(place=place, flags=flags)
    return place.result_type
def copy_georow_to_place(self, row, place: Loc, fast: bool):
    """
    Copy data from DB row into place instance
    Country, admin1_id, admin2_id, city, lat/lon, feature, geoid are updated if available

    The place type is derived from the row's feature code: 'ADM0' -> COUNTRY,
    'ADM1' -> ADMIN1, 'ADM2' -> ADMIN2, anything else -> CITY. Admin names are
    looked up from the admin ids when admin1_id is not empty.

    #Args:
        row: georow from geoname database
        place: Loc instance
        fast: Currently ignored

    #Returns:
        None. Place instance is updated with data from georow
    """
    # Start from a clean slate so stale values from an earlier lookup don't survive
    place.admin1_id = ''
    place.admin2_id = ''
    place.admin1_name = ''
    place.admin2_name = ''
    place.city = ''

    place.country_iso = str(row[Entry.ISO])
    place.lat = row[Entry.LAT]
    place.lon = row[Entry.LON]
    place.feature = str(row[Entry.FEAT])
    place.geoid = str(row[Entry.ID])
    place.prefix = row[Entry.PREFIX]
    place.place_type = Loc.PlaceType.CITY

    if place.feature == 'ADM0':
        place.place_type = Loc.PlaceType.COUNTRY
    elif place.feature == 'ADM1':
        place.admin1_id = row[Entry.ADM1]
        place.place_type = Loc.PlaceType.ADMIN1
    elif place.feature == 'ADM2':
        place.admin1_id = row[Entry.ADM1]
        place.admin2_id = row[Entry.ADM2]
        place.place_type = Loc.PlaceType.ADMIN2
    else:
        place.admin1_id = row[Entry.ADM1]
        place.admin2_id = row[Entry.ADM2]
        place.city = row[Entry.NAME]

    if place.admin1_id != '':
        if place.admin1_name == '':
            place.admin1_name = self.get_admin1_name(place.admin1_id, place.country_iso)
        if place.admin2_name == '':
            place.admin2_name = self.get_admin2_name(place.admin1_id, place.admin2_id, place.country_iso)

    place.country_name = str(self.get_country_name(row[Entry.ISO]))

    if place.admin2_name is None:
        place.admin2_name = ''
    if place.admin1_name is None:
        place.admin1_name = ''

    # Test for None BEFORE converting: str(None) is the text 'None', which
    # would otherwise end up as the city name
    place.city = '' if place.city is None else str(place.city)

    # Rows that have not been scored yet have no SCORE column
    try:
        place.score = row[Entry.SCORE]
    except IndexError:
        pass
def setUp(self) -> None:
    """Store fresh Loc instances on the TestScoring class as in_place and out_place."""
    # Assigned on the class itself, not on this instance
    TestScoring.in_place: Loc.Loc = Loc.Loc()
    TestScoring.out_place: Loc.Loc = Loc.Loc()
def filter_results(self, place: Loc):
    """
    Sort place.georow_list by match score and eliminate duplicates

    Duplicates are removed by self.remove_duplicates. The remaining rows are sorted
    by (score, admin1) and only rows whose score is within a gap of the best
    (lowest) score are kept. The gap is
    VERY_GOOD + GOOD / 3 + 60% of the absolute best score.

    If exactly one row survives, the best score is VERY_GOOD or better and the
    result is not NOT_SUPPORTED, place.result_type is set to STRONG_MATCH.
    Otherwise a line is written to self.miss_diag_file when that file is set.

    If any row has no score column, the list is left as it is after duplicate
    removal and nothing is filtered.

    #Args:
        place: Loc instance whose georow_list is filtered in place

    #Returns:
        ResultFlags(limited=limited_flag, filtered=date_filtered)
        limited is True when more than 100 rows were passed in; filtered is always False
    """
    date_filtered = False  # No date based filtering is done in this method
    limited_flag = len(place.georow_list) > 100
    flags = ResultFlags(limited=limited_flag, filtered=date_filtered)

    if len(place.georow_list) == 0:
        self.logger.debug('EMPTY')
        return flags

    # Remove duplicate locations in list (have same name and lat/lon)
    self.remove_duplicates(place)
    if len(place.georow_list) == 0:
        self.logger.error('georow_list = 0')
        return flags

    # Rows without a score column cannot be ranked. This has to be checked before
    # sorting, because the sort key reads Entry.SCORE from every row and would
    # raise IndexError on a short row.
    score_idx = GeoUtil.Entry.SCORE
    unscored_row = next((row for row in place.georow_list if len(row) < score_idx + 1), None)
    if unscored_row is not None:
        self.logger.debug(f'len = {len(unscored_row)}')
        self.logger.debug(f'[{unscored_row}]')
        return flags

    # Sort places in match_score order (lowest score first)
    new_list = sorted(place.georow_list, key=itemgetter(score_idx, GeoUtil.Entry.ADM1))
    min_score = new_list[0][score_idx]
    place.georow_list.clear()

    # Range to keep around the best score; depends only on min_score
    base = MatchScore.Score.VERY_GOOD + (MatchScore.Score.GOOD / 3)
    gap_threshold = base + abs(min_score) * .6

    # Go through sorted list and only add items to georow_list that are close to the best score
    score = min_score
    for geo_row in new_list:
        score = geo_row[score_idx]
        if score > min_score + gap_threshold:
            self.logger.debug(
                f'SKIP Score={score:.1f} Min={min_score:.1f} Gap={gap_threshold:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}]'
                f' {geo_row[GeoUtil.Entry.NAME]},'
                f' {geo_row[GeoUtil.Entry.ADM2]},'
                f' {geo_row[GeoUtil.Entry.ADM1]} ')
        else:
            place.georow_list.append(geo_row)
            self.logger.debug(
                f'Score {score:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}] {geo_row[GeoUtil.Entry.NAME]}, '
                f'AD2={geo_row[GeoUtil.Entry.ADM2]},'
                f' AD1={geo_row[GeoUtil.Entry.ADM1]} {geo_row[GeoUtil.Entry.ISO]}')

    is_single_strong = (min_score <= MatchScore.Score.VERY_GOOD
                        and len(place.georow_list) == 1
                        and place.result_type != GeoUtil.Result.NOT_SUPPORTED)
    if is_single_strong:
        place.result_type = GeoUtil.Result.STRONG_MATCH
    elif self.miss_diag_file:
        # Log item that we couldnt match. score is that of the last row in sorted order
        self.miss_diag_file.write(
            f'Lookup {place.original_entry} thresh={gap_threshold} gap={score - min_score}\n\n')

    return flags
def setUp(self) -> None:
    """Give this test instance a fresh Loc instance in self.place."""
    self.place: Loc.Loc = Loc.Loc()