def _calculate_prefix_penalty(prefix): # If the location has a prefix, it is not as good a match prefix_len = len(prefix) if prefix_len > 0: # reduce penalty if prefix is a street (contains digits or 'street' or 'road') penalty = 5 + prefix_len if GeoUtil.is_street(prefix): penalty *= 0.1 else: penalty = 0 return penalty
def parse_place(self, place_name: str, geo_db: GeoDB.GeoDB):
    """
    Parse a comma separated place name into city, admin2, admin1 and country.

    Tokens are read right to left: the last token is validated as a country
    and the one before it as an admin1 (state/province). Any tokens left at
    the front are taken as city / admin2 and are also copied into the prefix.
    A dummy '_' token is put in the country or admin1 position when that
    item is missing, so the remaining tokens keep a fixed offset from the end.

    If the name contains '--' it is handled as an advanced search: the work
    is handed to get_filter_parameters() and no further parsing is done.

    #Args:
        place_name: The place name to parse
        geo_db: GeoDB instance; its `s` attribute is used for the country and admin1 lookups

    #Returns:
        None. Fields in Loc (city, admin2_name, admin1_name, admin1_id, country_name,
        country_iso, prefix, place_type) are updated based on parsing.
        self.result_type is set to NO_COUNTRY when the last token is not a country,
        and to PARTIAL_MATCH when a country or an admin1 was recognised.
    """
    self.geo_db = geo_db
    self.logger.debug(f'PARSE {place_name}\n')
    self.clear()
    self.original_entry = place_name

    # An opening bracket or paren acts as one more separator. The closing one
    # stays in the token (original note: normalize() strips it later).
    name = place_name.replace('[', ',').replace('(', ',')

    tokens = name.split(',')
    if not tokens[-1]:
        # Last item is blank (trailing comma or empty name), so remove it
        tokens = tokens[:-1]

    token_count = len(tokens)
    self.place_type = PlaceType.CITY

    if '--' in place_name:
        # Advanced Search - pull out filter flags; nothing else is parsed
        self.logger.debug('filter')
        self.get_filter_parameters(place_name)
        return

    if token_count > 0:
        # COUNTRY - right-most token should be country
        self.country_name = self.norm.normalize(tokens[-1], False)
        self.country_iso = geo_db.s.get_country_iso(self.country_name)
        self.logger.debug(
            f'1) Lookup COUNTRY [{self.country_name}] Found ISO [{self.country_iso}] *******'
        )

        if self.country_iso != '':
            self.place_type = PlaceType.COUNTRY
            self.result_type = GeoUtil.Result.PARTIAL_MATCH
        else:
            # Last token is not a country. Append a dummy token so we now have
            # <tokens>, x. token_count must be refreshed here: it is what lets a
            # single-token name still be tried as admin1 below.
            tokens.append('_')
            token_count = len(tokens)
            self.result_type = GeoUtil.Result.NO_COUNTRY
            self.country_name = ''

    if token_count > 1:
        # See if 2nd to last token is Admin1
        val = tokens[-2]
        self.logger.debug(f'Get ADM1 from tkn-2 [{val}]')
        self.admin1_name = self.norm.admin1_normalize(val, self.country_iso)

        if self.admin1_name:
            self.logger.debug(f'2) Find ADMIN1 [{self.admin1_name}] *******')
            self.admin1_id = geo_db.s.get_admin1_id(self.admin1_name, self.country_iso)

            if self.admin1_id != '':
                # Found Admin1
                self.place_type = PlaceType.ADMIN1
                self.georow_list = []

                self.admin1_name = geo_db.s.get_admin1_name(self.admin1_id, self.country_iso)
                self.logger.debug(
                    f'2) Find iso for admin1 id [{self.admin1_id}] *******'
                )
                self.country_iso = geo_db.s.get_iso_from_admin1_id(self.admin1_id, self.country_iso)
                self.result_type = GeoUtil.Result.PARTIAL_MATCH

                # Get country if blank
                if self.country_name == '':
                    self.country_name = geo_db.s.get_country_name(self.country_iso)
            else:
                # Token is not an Admin1. Insert a dummy token in the admin1
                # position so we have <tokens>, admin1, country
                self.admin1_name = ''
                tokens.insert(-1, '_')
        else:
            # Token normalized to nothing; keep the slot but mark it as a dummy
            tokens[-2] = '_'

    # Last two tokens are now Admin1, Country (although they may have dummy value '_')
    # If >2 tokens: Put first non-blank token in City and in Prefix
    # If >3 tokens: Put second non-blank token in Admin2 and also append to Prefix

    # Remove all empty tokens (whitespace-only tokens are kept)
    tokens = [tkn for tkn in tokens if tkn]
    token_count = len(tokens)

    if token_count >= 3:
        # Possible Formats: City, Admin1, Country or Admin2, Admin1, Country
        # Take first token as city
        self.city = self.norm.normalize(tokens[0], False)
        self.place_type = PlaceType.CITY

        # Also place token[0] into Prefix unless it holds a wildcard
        if '*' not in tokens[0]:
            self.prefix = tokens[0].strip(' ')

    if token_count >= 4:
        # Admin2 is 2nd. Note - if Admin2 isn't found, it will look it up as city
        if GeoUtil.is_street(tokens[-4].lower()):
            # Format: Prefix, City, Admin1, Country
            self.city = self.norm.normalize(tokens[-3], False)
        else:
            # Format: City, Admin2, Admin1, Country
            self.admin2_name = self.norm.normalize(tokens[-3], False)
            self.city = self.norm.normalize(tokens[-4], False)

        self.place_type = PlaceType.CITY

        # Put token[0] and token[1] into Prefix unless token[1] holds a wildcard
        if '*' not in tokens[1]:
            self.prefix = tokens[0].strip(' ') + ' ' + tokens[1].strip(' ')

    self.prefix = self.norm.normalize(self.prefix, False)

    # Fill in country name if still missing - finding Admin1 will find country ISO
    if self.country_name == '' and self.country_iso != '':
        self.country_name = geo_db.s.get_country_name(self.country_iso)

    self.logger.debug(
        f" ======= PARSED: {place_name} \nCity [{self.city}] Adm2 [{self.admin2_name}]"
        f" Adm1 [{self.admin1_name}] adm1_id [{self.admin1_id}] Cntry [{self.country_name}] Pref=[{self.prefix}]"
        f" type_id={self.place_type}\n")