示例#1
0
    def find_geoid(self, geoid: str, place: Loc) -> None:
        """
        Look up one entry by geoid and load the result into place.

        The plain geoid lookup is tried first.  If it produces no rows the
        lookup is repeated with admin=True.
        #Args:
            geoid:  Geonames.org geoid
            place:  Loc instance.  Its geoid, georow_list and result_type are overwritten

        #Returns: None.
            On a hit place.result_type is STRONG_MATCH and process_results() is run on place.
            On a miss place.result_type is NO_MATCH.

        """
        flags = ResultFlags(limited=False, filtered=False)
        place.geoid = geoid
        place.georow_list.clear()

        lookup = self.geo_build.geodb.s.lookup_geoid
        lookup(georow_list=place.georow_list, geoid=place.geoid, place=place)
        if not place.georow_list:
            # Nothing found by the plain lookup - retry with the admin flag set
            lookup(georow_list=place.georow_list,
                   geoid=place.geoid,
                   place=place,
                   admin=True)

        if not place.georow_list:
            place.result_type = GeoUtil.Result.NO_MATCH
            return

        place.result_type = GeoUtil.Result.STRONG_MATCH
        self.process_results(place=place, flags=flags)
示例#2
0
    def process_results(self, place: Loc, flags) -> None:
        """
            Finalise place from the first entry in place.georow_list.

            - result_type NOT_SUPPORTED: place_type is forced to COUNTRY.
            - rows present and result_type is in GeoUtil.successful_match: the first
              georow is copied into place via copy_georow_to_place().
            - rows present, result_type is neither successful nor NOT_SUPPORTED:
              result_type is downgraded to PARTIAL_MATCH.
            place.set_place_type_text() is always called last.
        #Args:
            place: Loc instance
            flags: Flags tuple as returned by filter_results (not read by this method)

        #Returns:
            None.  place instance fields are updated
        """
        unsupported = place.result_type == GeoUtil.Result.NOT_SUPPORTED
        if unsupported:
            place.place_type = Loc.PlaceType.COUNTRY

        if len(place.georow_list) > 0:
            if place.result_type in GeoUtil.successful_match:
                best_row = place.georow_list[0]
                self.geo_build.geodb.copy_georow_to_place(row=best_row,
                                                          place=place,
                                                          fast=False)
            elif not unsupported:
                place.result_type = GeoUtil.Result.PARTIAL_MATCH

        place.set_place_type_text()
示例#3
0
    def _find_type_as_city(self, place: Loc, typ) -> int:
        """
            Run a city lookup using the field selected by typ as the city name.
            E.g. if typ is PlaceType.ADMIN2 then place.admin2_name is moved into
            place.city (and admin2_name is blanked) before the lookup.
        #Args:
            place: Loc instance
            typ: Loc.PlaceType - Specifies which field to use as target for lookup.
                 CITY uses place.city as-is, PREFIX copies place.prefix into place.city,
                 ADVANCED_SEARCH delegates directly to geodb.lookup_place()

        #Returns:
            Best score reported by the lookup.  999 is returned when no lookup was run
            (unknown typ, or the selected source field was empty).
            place.georow_list is updated with matches
        """
        if typ == Loc.PlaceType.ADVANCED_SEARCH:
            # Advanced search needs no field shuffling
            return self.geo_build.geodb.lookup_place(place=place)

        source = ''
        if typ == Loc.PlaceType.CITY:
            # City is already in place.city
            source = 'City'
        elif typ == Loc.PlaceType.ADMIN2:
            if place.admin2_name != '':
                place.city = place.admin2_name
                place.admin2_name = ''
                source = 'Admin2'
        elif typ == Loc.PlaceType.PREFIX:
            if place.prefix != '':
                place.city = place.prefix
                source = 'Prefix'
        else:
            self.logger.warning(f'Unknown TYPE {typ}')

        if source == '':
            # Nothing to search with
            return 999

        self.logger.debug(
            f'2) Try {source} as City.  Target={place.city}  pref [{place.prefix}] '
        )
        place.place_type = Loc.PlaceType.CITY
        searcher = self.geo_build.geodb.s
        score = searcher.lookup_place(place=place)
        self.logger.debug(f'best={score}')

        if score >= MatchScore.Score.POOR_CUTOFF:
            # Score at or past the poor cutoff - fall back to the deep lookup
            self.logger.debug('--- DEEP SEARCH ADM2 ---')
            score = searcher.deep_lookup(place=place)
        return score
示例#4
0
    def run_test_score(idx) -> int:
        """
        Score test case idx and log the score with both normalized titles.

        #Args:
            idx: test case index handed to TestScoring.prepare_test()

        #Returns:
            Score returned by TestScoring.scoring.match_score() for the prepared pair
        """
        target = Loc.Loc()
        result = Loc.Loc()
        TestScoring.prepare_test(idx, target, result)

        score = TestScoring.scoring.match_score(target, result)
        target_title = MatchScore.full_normalized_title(target)
        result_title = MatchScore.full_normalized_title(result)

        TestScoring.logger.debug(f'     {idx}) {score:.1f} In=[{target_title}] Out=[{result_title}]')
        return score
示例#5
0
    def __init__(self,
                 directory_name: str,
                 display_progress,
                 show_message: bool,
                 exit_on_error: bool,
                 languages_list_dct,
                 feature_code_list_dct,
                 supported_countries_dct,
                 volume=''):
        """
            Set up lookup state and create the GeodataBuild helper.

        #Args:
            directory_name: directory where geoname.org files are.  DB will be in 'cache' folder under this
            display_progress: None or function to display progress(percent_done:int, msg:str)
            show_message: If True - show TKInter message dialog on error
            exit_on_error: If True - exit on significant error
            languages_list_dct: Dictionary of ISO-2 languages to import from AlternateNamesV2.txt
            feature_code_list_dct: Dictionary of Geoname Feature codes to import into DB
            supported_countries_dct: Dictionary of ISO-2 Country codes to import into DB
            volume: passed through unchanged to GeodataBuild (meaning not visible here)
        """
        self.logger = logging.getLogger(__name__)
        self.display_progress = display_progress

        # Scratch copy of the place being looked up, used to restore fields between searches
        self.save_place: Loc = Loc.Loc()
        # Optional file-like object that receives diagnostics for lookups that missed
        self.miss_diag_file = None
        # Value to determine if two lat/longs are similar based on Rectilinear Distance
        self.distance_cutoff = 0.6

        build_options = dict(
            display_progress=self.display_progress,
            show_message=show_message,
            exit_on_error=exit_on_error,
            languages_list_dct=languages_list_dct,
            feature_code_list_dct=feature_code_list_dct,
            supported_countries_dct=supported_countries_dct,
            volume=volume,
        )
        self.geo_build = SpellBuild.GeodataBuild(str(directory_name), **build_options)
示例#6
0
    def find_best_match(self, location: str, place: Loc) -> bool:
        """
            Find the best scoring match for this location in the geoname dictionary.

            find_matches() parses and looks up the location, filter_results() ranks the
            rows, and only the first remaining row is kept and applied to place.
        #Args:
            location:  location name, e.g. Los Angeles, California, USA
            place:  Loc instance
        #Returns: True if a match was found
            True when at least one row survived filtering, or when find_matches()
            returned a code listed in GeoUtil.successful_match.  In both cases
            place.prefix is cleaned up against the long place name.  False otherwise.
        """
        res = self.find_matches(location, place)
        flags = self.filter_results(place)

        if len(place.georow_list) > 0:
            # Keep only the top ranked row and copy it into place
            place.georow_list = place.georow_list[:1]
            self.process_results(place=place, flags=flags)
            place.set_place_type()
        elif res not in GeoUtil.successful_match:
            return False

        nm = f'{place.get_long_name(self.geo_build.output_replace_dct)}'
        place.prefix = place.prefix_cleanup(place.prefix, nm)
        return True
示例#7
0
    def run_test_inscore(idx) -> int:
        """
        Compute the weighted input score for test case idx and print a summary line.

        #Args:
            idx: test case index handed to TestScoring.prepare_test()

        #Returns:
            Value returned by TestScoring.scoring._calculate_weighted_score() for the
            target and result tokens
        """
        target = Loc.Loc()
        result = Loc.Loc()

        log = TestScoring.logger
        log.debug(f'TEST INPUT SCORE:')

        TestScoring.prepare_test(idx, target, result)
        log.debug(f'prepare_test: INP={target.city},{target.admin2_name},'
                  f'{target.admin1_name},{target.country_name}'
                  f' RES={result.city},{result.admin2_name},'
                  f'{result.admin1_name},{result.country_name}')

        # Only the token lists are needed; the two titles are discarded
        _, result_tokens, _, target_tokens = MatchScore._prepare_input(target, result)

        # Calculate score for percent of input target text that matched result
        sc = TestScoring.scoring._calculate_weighted_score(target_tokens, result_tokens)

        print(f'#{idx} SCORE={sc:.1f} In={sc:.1f}[{target.original_entry.title().lower()}] [{result.get_five_part_title()}]')
        return sc
示例#8
0
    def _lookup_city_as_admin2(self, place: Loc, result_list) -> int:
        """
        Retry the lookup treating place.city as an admin2 name.

        place.city is moved into place.admin2_name, place.city is blanked and
        place.place_type is set to ADMIN2 before geodb.lookup_place() is called.
        #Args:
            place: Loc instance; modified in place as described above
            result_list: list that receives the rows found by this lookup (appended)

        #Returns:
            Best score returned by geodb.lookup_place()

        """
        place.admin2_name, place.city = place.city, ''
        place.place_type = Loc.PlaceType.ADMIN2
        self.logger.debug(
            f'  Try admin2  [{place.admin2_name}] as city [{place.get_five_part_title()}]'
        )

        score = self.geo_build.geodb.lookup_place(place=place)
        result_list.extend(place.georow_list)
        return score
示例#9
0
    def lookup_place(self, place: Loc) -> float:
        """
            **Lookup a place in geoname.org db**
            place.result_type is first set to Result.STRONG_MATCH, then the lookup
            depends on place.place_type:
                Loc.PlaceType.COUNTRY: no query is run.  If place.georow_list already
                    holds rows, place.country_name is normalised with get_country_name()
                    and the score is MatchScore.Score.VERY_GOOD; otherwise the score
                    stays at MatchScore.Score.VERY_POOR.
                Anything else: place.georow_list is cleared and self._search() is run
                    on place.city (plus its soundex), admin ids, country iso and feature.
                    For Loc.PlaceType.ADMIN1 the feature is forced to "ADM1" first.
        # Args:
            place: Loc instance.  Call Loc.parse_place() before calling lookup_place()

        # Returns:
            Best score found (numeric; NOTE(review): other code in this file treats
            lower scores as better matches - confirm against MatchScore)
            place.georow_list contains a list of matching entries.

        """
        place.result_type = Result.STRONG_MATCH
        best_score = MatchScore.Score.VERY_POOR

        if place.place_type == Loc.PlaceType.COUNTRY:
            # Country: rely on rows already present in georow_list, no DB query here
            if place.georow_list:
                place.country_name = self.get_country_name(place.country_name)
                best_score = MatchScore.Score.VERY_GOOD
        else:
            # General search
            if place.place_type == Loc.PlaceType.ADMIN1:
                place.feature = "ADM1"
            place.georow_list.clear()
            best_score = self._search(place=place,
                                      georow_list=place.georow_list,
                                      name=place.city,
                                      admin1_id=place.admin1_id,
                                      admin2_id=place.admin2_id,
                                      iso=place.country_iso,
                                      feature=place.feature,
                                      sdx=get_soundex(place.city))
        self.logger.debug(f'**LOOKUP PLACE  score={best_score}')
        return best_score
示例#10
0
    def is_country_valid(self, place: Loc) -> bool:
        """
        Check that place has a country ISO code and that _verify_iso() accepts it.

        #Args:
            place: Loc instance.  On failure place.result_type is set to NO_COUNTRY
                   (empty iso) or NOT_SUPPORTED (iso rejected; place_type also becomes COUNTRY)

        #Returns:
            True if country is valid
        """
        if place.country_iso == '':
            place.result_type = GeoUtil.Result.NO_COUNTRY
            return False

        if self._verify_iso(place.country_iso):
            return True

        self.logger.debug(f'Country [{place.country_iso}] not supported')
        place.result_type = GeoUtil.Result.NOT_SUPPORTED
        place.place_type = Loc.PlaceType.COUNTRY
        return False
示例#11
0
    def add_alias_to_db(self, ky: str, geo_build: SpellBuild):
        """
        Insert alias ky into the DB as a renamed copy of the entry it refers to.

        The alias row for ky is read from alias_list, the main entry is looked up as
        a city, and the first matching georow (columns up to and including SDX) is
        re-inserted under the name ky.  Nothing is inserted when the lookup finds no
        rows or the first row is empty.
        #Args:
            ky: alias key in alias_list; also the name given to the inserted row
            geo_build: build object providing geodb, update_geo_row_name() and insert()
        """
        alias_row = alias_list.get(ky)

        target = Loc.Loc()
        target.country_iso = alias_row[ALIAS_ISO].lower()
        target.city = alias_row[ALIAS_NAME]
        target.feature = alias_row[ALIAS_FEAT]
        target.place_type = Loc.PlaceType.CITY

        # Lookup main entry and get GEOID
        geo_build.geodb.s.lookup_place(target)
        matches = target.georow_list
        if len(matches) == 0 or len(matches[0]) == 0:
            return

        renamed_row = list(matches[0][0:GeoUtil.Entry.SDX + 1])
        geo_build.update_geo_row_name(geo_row=renamed_row, name=ky)
        geo_build.insert(geo_tuple=tuple(renamed_row),
                         feat_code=alias_row[ALIAS_FEAT])
示例#12
0
 def __init__(self, geodb, lang='ut8'):
     """
     Initialise search state around an existing GeoDB instance.

     Args:
         geodb: GeoDB instance
         lang: preferred ISO language code
               (NOTE(review): the default 'ut8' does not look like a language code - confirm)
     """
     self.logger = logging.getLogger(__name__)

     # Collaborators
     self.geodb = geodb
     self.lang = lang

     # Behaviour switches
     self.detailed_debug = True
     self.use_wildcards = True

     # Counters and caches
     self.start = 0
     self.total_lookups = 0
     self.cache = {}

     # Query configuration
     self.place_type = ''
     self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'

     # Helper objects
     self.match = MatchScore.MatchScore()
     self.norm = Normalize.Normalize()
     self.place = Loc.Loc()
示例#13
0
    def find_matches(self, location: str, place: Loc):
        """
            Find a location in the geoname database.  On successful match, place.georow_list will contain
            a list of georows that matched the name.  Each georow can be copied to a Loc structure by
            calling process_results

            Search order for places that did not parse as country / admin1 / admin2:
              1) standard lookup on the parsed city, followed by a deep lookup if the best
                 score is at or past MatchScore.Score.POOR_CUTOFF
              2) if an admin2 token exists, retry with that token as the city
            All rows found are merged, scored, filtered and applied to place.
            Places that parsed as country, admin1 or admin2 return immediately with the
            result type left by parsing/validation.

        #Args:
            location: comma separated name of location to find, e.g. 'Los Angeles, California, USA'
            place: Loc structure
        #Returns:
            GeoUtil.Result code (place.result_type) on every path, including the
            unsupported-country early exit
        """
        place.parse_place(place_name=location, geobuild=self.geo_build)

        self.is_country_valid(place)
        if place.result_type == GeoUtil.Result.NOT_SUPPORTED:
            place.georow_list.clear()
            # Return the result code, not a score, so callers can test it against
            # GeoUtil.successful_match like every other return value of this method
            return place.result_type

        # Create full entry text
        place.update_names(self.geo_build.output_replace_dct)

        flags = ResultFlags(limited=False, filtered=False)
        # We will do different search types and append all results into result_list
        result_list = []

        # Save a shallow copy of place so we can restore fields
        self.save_place = copy.copy(place)

        # After parsing, last token is either country or underscore.
        # Second to last is either Admin1 or underscore
        # If >2 tokens:  token[0] is placed in City and in Prefix
        # If >3 tokens:  token[1] is placed in Admin2 and appended to Prefix

        # 1) Try lookup based on standard parsing: lookup city, county, state/province, or country as parsed
        self.logger.debug(
            f'  1) Standard, based on parsing.  pref [{place.prefix}] city [{place.city}]'
            f' sdx={GeoSearch.get_soundex(place.city)} '
            f'feat={place.feature} typ=[{place.place_type}]')

        if place.place_type != Loc.PlaceType.COUNTRY and place.place_type != Loc.PlaceType.ADMIN1 \
                and place.place_type != Loc.PlaceType.ADMIN2:
            self.logger.debug('find std place  - not ADM*  ')
            best_score = self.geo_build.geodb.s.lookup_place(place=place)
            self.logger.debug(f'std: best={best_score}')

            if place.georow_list:
                result_list.extend(place.georow_list)

            if best_score >= MatchScore.Score.POOR_CUTOFF:
                # No good matches found.  Try a deep search on soundex of combinations of terms
                self.logger.debug('--- DEEP SEARCH city ---')
                best_score = self.geo_build.geodb.s.deep_lookup(place=place)
                if place.georow_list:
                    result_list.extend(place.georow_list)

            # Restore fields
            self._restore_fields(place, self.save_place)

            # 2) Try second token (Admin2) as a city
            if place.admin2_name != '':
                self.logger.debug(f'try 2nd token as city')
                place.georow_list.clear()
                best_score = self._find_type_as_city(place,
                                                     Loc.PlaceType.ADMIN2)
                self.logger.debug(f'2nd token best={best_score}')

                if place.georow_list:
                    result_list.extend(place.georow_list)

                    # See if we found any good scoring matches
                    if best_score >= MatchScore.Score.POOR_CUTOFF:
                        # No good matches found.  Try a deep search on soundex of combinations of terms
                        self.logger.debug('--- DEEP SEARCH city ---')
                        best_score = self.geo_build.geodb.s.deep_lookup(
                            place=place)
                        if place.georow_list:
                            result_list.extend(place.georow_list)

                self._restore_fields(place, self.save_place)

            #  Move result_list into place georow list
            place.georow_list.clear()
            place.georow_list.extend(result_list)
        else:
            self.logger.debug('DONE.  type is country, adm1, or adm2')
            return place.result_type

        if len(place.georow_list) > 0:
            best_score = self.geo_build.geodb._assign_scores(place.georow_list,
                                                             place,
                                                             '',
                                                             fast=False,
                                                             quiet=True)

            self.process_results(place=place, flags=flags)
            flags = self.filter_results(place)

        if len(place.georow_list) == 0:
            # NO MATCH
            if place.result_type != GeoUtil.Result.NO_COUNTRY and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
                place.result_type = GeoUtil.Result.NO_MATCH
                self.logger.debug(f'Not found.')
            else:
                self.logger.debug('Found country')
        elif len(place.georow_list) > 1:
            self.logger.debug(f'Success!  {len(place.georow_list)} matches')
            place.result_type = GeoUtil.Result.MULTIPLE_MATCHES

        # Process the results
        self.process_results(place=place, flags=flags)
        return place.result_type
示例#14
0
    def copy_georow_to_place(self, row, place: Loc, fast: bool):
        """
        Copy data from DB row into place instance
        Country, admin1_id, admin2_id, city, lat/lon, feature, geoid and prefix are taken from
        the row; admin and country names are resolved through the get_*_name() helpers.
        place.place_type is derived from the feature code: ADM0 -> COUNTRY, ADM1 -> ADMIN1,
        ADM2 -> ADMIN2, anything else -> CITY (only then is the row name copied into place.city).
        #Args:
            row: georow from geoname database
            place: Loc instance
            fast: Currently ignored
        #Returns:
            None.  Place instance is updated with data from georow
        """
        # Start from a clean slate so stale names from a previous lookup cannot leak through
        place.admin1_id = ''
        place.admin2_id = ''
        place.admin1_name = ''
        place.admin2_name = ''
        place.city = ''

        place.country_iso = str(row[Entry.ISO])
        place.lat = row[Entry.LAT]
        place.lon = row[Entry.LON]
        place.feature = str(row[Entry.FEAT])
        place.geoid = str(row[Entry.ID])
        place.prefix = row[Entry.PREFIX]
        place.place_type = Loc.PlaceType.CITY

        if place.feature == 'ADM0':
            place.place_type = Loc.PlaceType.COUNTRY
        elif place.feature == 'ADM1':
            place.admin1_id = row[Entry.ADM1]
            place.place_type = Loc.PlaceType.ADMIN1
        elif place.feature == 'ADM2':
            place.admin1_id = row[Entry.ADM1]
            place.admin2_id = row[Entry.ADM2]
            place.place_type = Loc.PlaceType.ADMIN2
        else:
            place.admin1_id = row[Entry.ADM1]
            place.admin2_id = row[Entry.ADM2]
            name = row[Entry.NAME]
            # Test for None BEFORE converting: str(None) is the text 'None', which
            # would otherwise end up as the city name
            place.city = '' if name is None else str(name)

        if place.admin1_id != '':
            # Both names were blanked above, so they are always resolved here
            place.admin1_name = self.get_admin1_name(
                place.admin1_id, place.country_iso)
            place.admin2_name = self.get_admin2_name(
                place.admin1_id, place.admin2_id, place.country_iso)
        place.country_name = str(self.get_country_name(row[Entry.ISO]))

        # The name helpers may hand back None; normalise to empty strings
        if place.admin2_name is None:
            place.admin2_name = ''
        if place.admin1_name is None:
            place.admin1_name = ''

        # Rows that have not been scored yet are shorter and carry no SCORE column
        try:
            place.score = row[Entry.SCORE]
        except IndexError:
            pass
示例#15
0
 def setUp(self) -> None:
     """Give TestScoring fresh input and output Loc instances before each test."""
     TestScoring.in_place = Loc.Loc()
     TestScoring.out_place = Loc.Loc()
示例#16
0
    def filter_results(self, place: Loc):
        """
            Rank place.georow_list by match score and keep only rows close to the best one.

        Steps:
        1) remove_duplicates() is applied to place.
        2) Rows are sorted by (SCORE, ADM1); the first row's score is the minimum.
        3) A row is kept when its score is not greater than
           min_score + gap_threshold, where
           gap_threshold = VERY_GOOD + GOOD / 3 + abs(min_score) * 0.6.
        4) If exactly one row remains, min_score <= VERY_GOOD and the result type is not
           NOT_SUPPORTED, place.result_type becomes STRONG_MATCH.  Otherwise a diagnostic
           line is written to self.miss_diag_file when that file is set.

        The limited flag is set when the incoming list holds more than 100 rows.
        The filtered flag is always False in this implementation.
        #Args:
            place: Loc instance whose georow_list is filtered in place

        #Returns:
            ResultFlags(limited=limited_flag, filtered=date_filtered)
        """
        date_filtered = False  # No date based filtering is performed here
        limited_flag = len(place.georow_list) > 100
        flags = ResultFlags(limited=limited_flag, filtered=date_filtered)

        if len(place.georow_list) == 0:
            self.logger.debug('EMPTY')
            return flags

        # Remove duplicate locations in list
        self.remove_duplicates(place)
        if len(place.georow_list) == 0:
            self.logger.error(f'georow_list = 0')
            return flags

        # Sort places in match_score order (ties broken by ADM1)
        score_col = GeoUtil.Entry.SCORE
        ranked = sorted(place.georow_list,
                        key=itemgetter(score_col, GeoUtil.Entry.ADM1))
        if len(ranked) == 0:
            self.logger.error(f'new_list = 0')
            return flags

        if len(ranked[0]) < score_col + 1:
            # Row is too short to carry a score column - leave the list untouched
            self.logger.debug(f'len = {len(ranked[0])}')
            self.logger.debug(f'[{ranked[0]}]')
            return flags

        min_score = ranked[0][score_col]
        place.georow_list.clear()

        # The acceptance window depends only on the best score, so compute it once
        base = MatchScore.Score.VERY_GOOD + (MatchScore.Score.GOOD / 3)
        gap_threshold = base + abs(min_score) * .6
        cutoff = min_score + gap_threshold

        score = 0
        for geo_row in ranked:
            score = geo_row[score_col]
            if score > cutoff:
                self.logger.debug(
                    f'SKIP Score={score:.1f} Min={min_score:.1f} Gap={gap_threshold:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}]'
                    f' {geo_row[GeoUtil.Entry.NAME]},'
                    f' {geo_row[GeoUtil.Entry.ADM2]},'
                    f' {geo_row[GeoUtil.Entry.ADM1]} ')
                continue

            place.georow_list.append(geo_row)
            self.logger.debug(
                f'Score {score:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}] {geo_row[GeoUtil.Entry.NAME]}, '
                f'AD2={geo_row[GeoUtil.Entry.ADM2]},'
                f' AD1={geo_row[GeoUtil.Entry.ADM1]} {geo_row[GeoUtil.Entry.ISO]}'
            )

        if min_score <= MatchScore.Score.VERY_GOOD and len(
                place.georow_list
        ) == 1 and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
            place.result_type = GeoUtil.Result.STRONG_MATCH
        elif self.miss_diag_file:
            # Log item that we couldn't match; score here is that of the last row examined
            self.miss_diag_file.write(
                f'Lookup {place.original_entry} thresh={gap_threshold} gap={score - min_score}\n\n'
            )

        return flags
示例#17
0
 def setUp(self) -> None:
     """Create a fresh Loc instance for each test."""
     self.place = Loc.Loc()