def find_best_match(self, location: str, place: Loc) -> bool:
    """
    Look up a location and reduce the result set to its single best match.

    #Args:
        location: location name, e.g. Los Angeles, California, USA
        place: Loc instance. It is modified in place: georow_list is cut down
            to at most one row and prefix is rewritten via prefix_cleanup().

    #Returns:
        True if a match was found, False otherwise.
        Per the original notes, place is updated with -- lat, lon, district,
        city, country_iso, result code (done by the helpers called here).
    """
    # Parse the location and look it up in the place db; keep the result code
    # for the case where no rows survive filtering.
    lookup_result = self.find_matches(location, place)

    # Filter the candidate rows; the returned flags are handed to process_results().
    result_flags = self.filter_results(place)

    if len(place.georow_list) > 0:
        # At least one row survived: keep only the first and finalize the place.
        place.georow_list = place.georow_list[:1]
        self.process_results(place=place, flags=result_flags)
        place.set_place_type()
    elif lookup_result not in GeoUtil.successful_match:
        # No rows left and the lookup itself did not report success.
        return False

    # Shared success path: clean the prefix against the final long name.
    long_name = f'{place.get_long_name(self.geo_build.output_replace_dct)}'
    place.prefix = place.prefix_cleanup(place.prefix, long_name)
    return True
def copy_georow_to_place(self, row, place: Loc, fast: bool):
    """
    Copy data from a DB row into a place instance.

    Country, admin1_id, admin2_id, city, lat/lon, feature, geoid and prefix
    are updated from the row. Which admin/city fields are filled depends on
    the row's feature code (ADM0, ADM1, ADM2, or anything else = city level).

    #Args:
        row: georow from geoname database, indexed by the Entry constants
        place: Loc instance
        fast: Currently ignored

    #Returns:
        None. Place instance is updated with data from georow
    """
    # Start from a clean slate so values from a previous row cannot leak through.
    place.admin1_id = ''
    place.admin2_id = ''
    place.admin1_name = ''
    place.admin2_name = ''
    place.city = ''

    # Fields that are copied for every feature type.
    place.country_iso = str(row[Entry.ISO])
    place.lat = row[Entry.LAT]
    place.lon = row[Entry.LON]
    place.feature = str(row[Entry.FEAT])
    place.geoid = str(row[Entry.ID])
    place.prefix = row[Entry.PREFIX]

    # Default to CITY; the administrative feature codes below override it.
    place.place_type = Loc.PlaceType.CITY

    if place.feature == 'ADM0':
        place.place_type = Loc.PlaceType.COUNTRY
    elif place.feature == 'ADM1':
        place.admin1_id = row[Entry.ADM1]
        place.place_type = Loc.PlaceType.ADMIN1
    elif place.feature == 'ADM2':
        place.admin1_id = row[Entry.ADM1]
        place.admin2_id = row[Entry.ADM2]
        place.place_type = Loc.PlaceType.ADMIN2
    else:
        place.admin1_id = row[Entry.ADM1]
        place.admin2_id = row[Entry.ADM2]
        place.city = row[Entry.NAME]

    # NOTE(review): update_names() presumably resolves admin1_name/admin2_name
    # from the ids set above; it can evidently leave them as None, so they are
    # normalized to empty strings afterwards.
    self.s.update_names(place)
    if place.admin2_name is None:
        place.admin2_name = ''
    if place.admin1_name is None:
        place.admin1_name = ''

    # Test for None BEFORE converting: str(None) is the text 'None', which
    # would otherwise end up as the city name.
    place.city = '' if place.city is None else str(place.city)

    # The score column is optional; rows without it keep the existing score.
    try:
        place.score = row[Entry.SCORE]
    except IndexError:
        pass
def match_score(self, target_place: Loc, result_place: Loc, fast=False) -> float:
    """
    Calculate a heuristic score for how well a result place name matches a target place name.

    The score is based on percent of characters that didn't match in input and output
    (plus other items described below). Mismatch score is 0-100% reflecting the percent
    mismatch between the user input and the result. This is then adjusted by Feature type
    (large city gives best score) plus other items to give a final heuristic where
    -10 is perfect match of a large city and 100 is no match.

    A) Heuristic:
        1) Create 5 part title (prefix, city, county, state/province, country)
        2) Normalize text - self.norm.normalize_for_scoring()
        3) Remove sequences of 2 chars or more that match in target and result
        4) Calculate inscore - percent of characters in input that didn't match result.
           Weight by term (city, county, state, ctry). Exact match of city term gets a bonus
        5) Calculate result score - percent of characters in db result that didn't match input

    B) Score components (all are weighted in final score):
        in_score - (0-100) - score for input that didn't match output
        feature_score - (0-100) More important features get lower score.
            City with 1M population is zero. Valley is 100. Geodata.feature_priority().
        wildcard_penalty - score is raised by X if it includes a wildcard
        prefix_penalty - score is raised by length of Prefix

    C) A standard text difference, such as Levenshtein, was not used because those treat
       both strings as equal, whereas this treats the User text as more important than DB
       result text and also weights each token. A user's text might commonly be something
       like: Paris, France and a DB result of Paris, Paris, Ile De France, France.
       The Levenshtein distance would be large, but with this heuristic, the middle terms
       can have lower weights, and having all the input matched can be weighted higher than
       mismatches on the county and province. This heuristic gives a score of -9 for
       Paris, France.

    Side effects: self.score_diags and self.timing are reset. target_place.prefix is
    rewritten while scoring and always restored before returning, even if scoring raises.
    For ADVANCED_SEARCH targets, target_place.updated_entry is set and left in place.

    #Args:
        target_place: Loc with users entry.
        result_place: Loc with DB result.
        fast: if True, delegate to self.fast_score() and skip everything below.

    #Returns:
        score: the weighted sum of the components plus a constant offset of 8.
    """
    if fast:
        return self.fast_score(target_place, result_place)

    self.score_diags = ''  # Diagnostic text for scoring
    self.timing = 0

    # The prefix is overwritten during scoring; remember the caller's value.
    save_prefix = target_place.prefix
    try:
        # Remove items in prefix that are in result
        if target_place.place_type != Loc.PlaceType.ADVANCED_SEARCH:
            target_place.prefix = self.norm.normalize_for_scoring(
                target_place.prefix)
            result_name = result_place.get_long_name(None)
            target_place.prefix = Loc.Loc.fast_prefix(target_place.prefix, result_name)
        else:
            target_place.updated_entry = target_place.get_long_name(None)

        # Create full, normalized titles (prefix,city,county,state,country).
        # The target title is not needed here, only its tokens.
        result_title, result_tokens, _, target_tokens = self._prepare_input(
            target_place, result_place)

        # Calculate Prefix score. Prefix is not used in search and longer is generally worse
        prefix_score = _calculate_prefix_penalty(target_place.prefix)

        # Calculate score for percent of input target text that matched result
        in_score = self._calculate_weighted_score(target_tokens, result_tokens)

        # Calculate score for wildcard search - wildcard searches are missing letters
        # and need special handling
        wildcard_score = self._calculate_wildcard_score(
            target_place.original_entry)

        # Calculate Feature score - this ensures "important" places get higher rank
        # (large city, etc)
        feature_score = Geodata.Geodata._feature_priority(result_place.feature)

        # Weight and add up scores - Each item is 0-100 and then weighted,
        # except wildcard penalty
        score: float = in_score * self.input_weight + feature_score * self.feature_weight + \
            prefix_score * self.prefix_weight + wildcard_score

        # Note: the logged SCORE is the value before the +8 offset applied on return.
        self.logger.debug(
            f'SCORE {score:.1f} res=[{result_title}] pref=[{target_place.prefix}]'
            f'inSc={in_score * self.input_weight:.1f}% feat={feature_score * self.feature_weight:.1f} {result_place.feature} '
            f'wild={wildcard_score} pref={prefix_score * self.prefix_weight:.1f}'
        )
        self.logger.debug(self.score_diags)
    finally:
        # Always hand the caller's prefix back, including when a helper raised.
        target_place.prefix = save_prefix

    return score + 8