def find_first_match(self, location: str, place: Loc.Loc):
    """
    Find the first match for this location in the geoname dictionary.

    First parse the location into <prefix>, city, <district2>, district1, country.
    Then look it up in the place db.
    Updates place in-place with lat, lon, district, city, country_iso, result code.

    :param location: raw location text to parse and look up
    :param place: Loc record to fill in
    """
    place.parse_place(place_name=location, geo_files=self.geo_files)
    place.country_name = self.geo_files.geodb.get_country_name(place.country_iso)
    # (removed a no-op `place.country_iso = place.country_iso` self-assignment)

    # Lookup location
    self.geo_files.geodb.lookup_place(place=place)
    self.update_rowlist_prefix(place=place)

    # Clear to a single entry - keep only the best (first) row and call it a strong match
    if len(place.georow_list) > 1:
        row = copy.copy(place.georow_list[0])
        place.georow_list.clear()
        place.georow_list.append(row)
        place.result_type = GeoKeys.Result.STRONG_MATCH

    self.process_result(place=place, flags=ResultFlags(limited=False, filtered=False))
def process_result(self, place: Loc.Loc, flags) -> None:
    """
    Copy geodata into the place record and put together the status text.

    :param place: Loc record to update in-place
    :param flags: ResultFlags tuple with .limited and .filtered booleans
    :return: None
    """
    # Copy geodata to place record and Put together status text
    # self.logger.debug(f'**PROCESS RESULT: Res={place.result_type} Targ={place.target} Georow_list={place.georow_list}')
    if place.result_type == GeoKeys.Result.NOT_SUPPORTED:
        # Country not loaded in DB - treat the whole entry as country-level
        place.place_type = Loc.PlaceType.COUNTRY

    if place.result_type in GeoKeys.successful_match and len(place.georow_list) > 0:
        # Successful match - copy the top DB row into place and rebuild display name
        self.geo_files.geodb.copy_georow_to_place(row=place.georow_list[0], place=place)
        place.format_full_nm(self.geo_files.output_replace_dct)
    elif len(place.georow_list) > 0:
        # Rows exist but match type was not successful - downgrade to partial
        self.logger.debug(f'***RESULT={place.result_type} Setting to Partial')
        place.result_type = GeoKeys.Result.PARTIAL_MATCH

    place.prefix = place.prefix.strip(' ')
    place.set_place_type_text()
    place.status = f'{place.result_type_text} {result_text_list.get(place.result_type)} '
    if flags.limited:
        place.status = ' First 100 matches shown...'
    if flags.filtered:
        # NOTE(review): when both flags are set this overwrites the 'limited'
        # message with the default status text - confirm that is intended
        place.status = f'{place.result_type_text} {result_text_list.get(place.result_type)} '
        # place.status += ' ***VERIFY EVENT DATE***'
        # place.result_type = GeoKeys.Result.PARTIAL_MATCH

    self.update_rowlist_prefix(place=place)
def find_geoid(self, geoid: str, place: Loc.Loc):
    """
    Look up a location by its geonames GEOID and fill in the place record.

    :param geoid: geonames.org numeric id (as a string)
    :param place: Loc record to update in-place
    """
    place.target = geoid
    place.georow_list.clear()
    self.geo_files.geodb.lookup_geoid(place=place)

    if not place.georow_list:
        # GEOID not in database
        place.result_type = GeoKeys.Result.NO_MATCH
        return

    # Found - copy the first DB row into the Place record
    top_row = place.georow_list[0]
    self.geo_files.geodb.copy_georow_to_place(row=top_row, place=place)
    place.original_entry = place.format_full_nm(None)
    place.result_type = GeoKeys.Result.STRONG_MATCH
def __init__(self, directory_name: str, filename: str, progress_bar, geo_files: GeodataFiles, lang_list):
    """
    :param directory_name: base data directory
    :param filename: input file name
    :param progress_bar: progress bar widget passed through to the base class
    :param geo_files: GeodataFiles instance providing the geoname DB
    :param lang_list: list of language codes to support
    """
    super().__init__(directory_name, filename, progress_bar)
    self.sub_dir = GeoKeys.get_cache_directory(directory_name)
    self.geo_files: GeodataFiles.GeodataFiles = geo_files
    self.lang_list = lang_list
    # Loc is the module; the class is Loc.Loc (plain `Loc()` would attempt to
    # call the module object, as seen in every other use in this file)
    self.loc = Loc.Loc()
def update_rowlist_prefix(self, place: Loc.Loc):
    """
    Set all the prefix values in the georow_list.

    For each result row, any leading tokens of the user's original entry that do
    not appear in the formatted result name are collected into the row's prefix.
    :param place: Loc record whose georow_list is updated in-place
    """
    temp_place = Loc.Loc()
    tokens = place.original_entry.split(',')
    for idx, rw in enumerate(place.georow_list):
        update = list(rw)
        # Put unused fields into prefix:
        # build the normalized full name of this result row for comparison
        self.geo_files.geodb.copy_georow_to_place(rw, temp_place)
        temp_place.prefix = ''
        nm = GeoKeys.search_normalize(temp_place.format_full_nm(self.geo_files.output_replace_dct), place.country_iso)
        # self.logger.debug(f'NAME ={nm}')
        place.prefix = ''
        # Only the first two tokens of the user entry are candidates for the prefix
        for num, fld in enumerate(tokens[:2]):
            item = GeoKeys.search_normalize(fld, place.country_iso)
            add_item = False
            # self.logger.debug(f'item={item} ')
            if num == 0 and item not in nm:
                add_item = True
            if num == 1 and item not in nm and len(tokens) == 2:
                # We only add the second token if there are only 2 tokens
                add_item = True
            if '*' in item:
                # Don't add as prefix if item is a wildcard search
                add_item = False
            if add_item:
                if len(place.prefix) > 0:
                    place.prefix += ' '
                place.prefix += item.title()
        if len(place.prefix) > 0:
            place.prefix_commas = ', '
        update[GeoKeys.Entry.PREFIX] = place.prefix
        # self.logger.debug(f'PREFIX={place.prefix} ')
        # Rows are tuples - rebuild from the edited list
        place.georow_list[idx] = tuple(update)
def lookup_as_admin2(self, place: Loc.Loc, result_list, save_place: Loc.Loc):
    """
    Retry the lookup treating the city name as an Admin2 (county) name.

    :param place: Loc record to query (temporarily mutated, then restored)
    :param result_list: list extended with any rows found
    :param save_place: shallow copy of place used to restore overwritten fields
    """
    # Try City as ADMIN2
    place.extra = place.admin2_name
    place.target = place.city1
    place.admin2_name = place.city1
    place.city1 = ''
    place.place_type = Loc.PlaceType.ADMIN2
    self.geo_files.geodb.lookup_place(place=place)
    result_list.extend(place.georow_list)
    self.update_rowlist_prefix(place=place)

    # Restore the fields we overwrote from the saved copy
    for field in ('city1', 'admin2_name', 'admin1_name', 'prefix'):
        setattr(place, field, getattr(save_place, field))
def write_updated_place(self, place: Loc.Loc, entry):
    """
    Write out the updated location and lat/lon to the output ancestry file.

    :param place: resolved Loc record to write
    :param entry: original input text (echoed to the input diagnostic file)
    """
    # Write out updated location and lat/lon to file
    self.geodata.geo_files.geodb.set_display_names(place)
    place.original_entry = place.format_full_nm(self.geodata.geo_files.output_replace_dct)
    # Capitalize the prefix of the place being written.
    # (Was `self.place.prefix`, which read a different object than the
    # `place` parameter used everywhere else in this method.)
    prefix = GeoKeys.capwords(place.prefix)
    if self.diagnostics:
        self.in_diag_file.write(f'{entry}\n')

    if place.result_type != GeoKeys.Result.DELETE:
        # self.logger.debug(f'Write Updated - name={place.name} pref=[{place.prefix}]')
        self.ancestry_file_handler.write_updated(prefix + place.prefix_commas + place.original_entry, place)
        self.ancestry_file_handler.write_lat_lon(lat=place.lat, lon=place.lon)
        text = prefix + place.prefix_commas + place.original_entry + '\n'
        # NOTE(review): str(bytes) yields the "b'...'" repr; also this write is
        # not gated on self.diagnostics unlike the DELETE branch - confirm intended
        text = str(text.encode('utf-8', errors='replace'))
        self.out_diag_file.write(text)
    else:
        # Entry marked for deletion - emit nothing to the output file
        # self.logger.debug('zero len, no output')
        if self.diagnostics:
            self.out_diag_file.write('DELETE\n')
def create_enclosed_by(self, place: Loc.Loc): """ Create EnclosedBy elements in Dictionary for CSV file :return: None """ self.logger.debug(f'\nCREATE ENCLOSURE FOR {place.original_entry}') enclosure_place = copy.copy(place) enclosure_place.id = '' # Move up to enclosure level success = self.move_up_level(enclosure_place, idx=self.get_dict_id(enclosure_place)) if success: place.enclosed_by = enclosure_place.id self.update_enclosure_id(place) return
def set_CSV_place_type(place: Loc.Loc): place.set_place_type() if len(place.prefix) > 0: place.place_type = Loc.PlaceType.PREFIX
def retrieve_csv_place(admin_table, geodata, place: Loc.Loc, key, idx):
    """
    Fill in a Loc record from a CSV admin-table row.

    :param admin_table: list of dicts, one per hierarchy level
    :param geodata: Geodata instance (used for country/admin name lookups)
    :param place: Loc record to fill in
    :param key: row key within the table ('_'-separated hierarchy key)
    :param idx: which hierarchy-level dict to read
    """
    # 0Place (ID), 1Title, 2Name, 3Type, 4latitude, 5longitude, 6enclosed_by
    row = admin_table[idx].get(key)
    key_tokens = key.split("_")
    # Number of '_' separators in the key encodes the hierarchy depth / place type
    place.place_type = len(key_tokens) - 1

    place.feature = row[CSVEntry.FEAT]
    place.original_entry = row[CSVEntry.TITLE]
    place.country_iso = row[CSVEntry.ISO]
    place.country_name = geodata.geo_files.geodb.get_country_name(place.country_iso)
    place.enclosed_by = row[CSVEntry.ENCLOSED_BY]
    place.lat: float = float(row[CSVEntry.LAT])
    place.lon: float = float(row[CSVEntry.LON])
    place.admin2_id = row[CSVEntry.ADMIN2_ID]
    place.admin1_id = row[CSVEntry.ADMIN1_ID]

    # Normalize missing admin names to '' BEFORE converting to str.
    # (Previously str() was applied first, so a None lookup became the literal
    # string 'None' and the subsequent `is None` checks could never fire.)
    admin1 = geodata.geo_files.geodb.get_admin1_name(place)
    admin2 = geodata.geo_files.geodb.get_admin2_name(place)
    place.admin1_name = str(admin1) if admin1 is not None else ''
    place.admin2_name = str(admin2) if admin2 is not None else ''

    tokens = place.original_entry.split(',')
    # Title is <prefix>,<city>,<admin2>,<admin1>,<country> - pull city/prefix if present
    if len(tokens) > 3:
        place.city1 = tokens[-4]
    if len(tokens) > 4:
        place.prefix = tokens[-5]
    place.id = row[CSVEntry.PLACE_ID]
def create_csv_node(self, place: Loc.Loc):
    """
    Create CSV row in Dictionary:
    Place (ID), Title, Name, Type, latitude, longitude, enclosed_by
    :param place: Loc record to store (place.id may be updated from an existing node)
    :return: None
    """
    if place.original_entry == '':
        return
    row = [''] * 11
    self.set_CSV_place_type(place)
    if place.id == '':
        # No ID yet - derive one from the hierarchy key
        # (removed a redundant second set_CSV_place_type call here)
        place.id = self.get_csv_key(place)
    row[CSVEntry.PLACE_ID] = place.id
    row[CSVEntry.ENCLOSED_BY] = place.enclosed_by
    row[CSVEntry.TITLE] = place.prefix + place.prefix_commas + place.original_entry
    row[CSVEntry.FEAT] = place.feature
    # (removed a duplicated LAT assignment)
    row[CSVEntry.LAT] = f'{float(place.lat):.4f}'
    row[CSVEntry.LON] = f'{float(place.lon):.4f}'
    row[CSVEntry.ADMIN2_ID] = place.admin2_id
    row[CSVEntry.ADMIN1_ID] = place.admin1_id
    row[CSVEntry.ISO] = place.country_iso
    place.set_place_type_text()
    row[CSVEntry.NAME] = self.get_csv_name(place)
    row[CSVEntry.TYPE] = place.result_type_text

    key = self.get_csv_key(place)
    dict_idx = self.get_dict_id(place)
    if dict_idx == 0:
        # Top level (country) has no enclosure
        place.enclosed_by = ''
        row[CSVEntry.ENCLOSED_BY] = ''

    if place.enclosed_by != '':
        # Sanity check: the enclosure key must be at a higher level
        # ('_' count encodes depth) than this node's key
        if key.count('_') <= row[CSVEntry.ENCLOSED_BY].count('_') and key.count('_') > 0:
            msg = f'Incorrect Enclosure for [{place.original_entry}]. Key= [{key}] Enclosure= [{row[CSVEntry.ENCLOSED_BY]}]'
            self.logger.warning(msg)
        elif key.count('_') < row[CSVEntry.ENCLOSED_BY].count('_') and key.count('_') == 0:
            msg = f'Incorrect Enclosure for [{place.original_entry}]. Key= [{key}] Enclosure= [{row[CSVEntry.ENCLOSED_BY]}]'
            self.logger.warning(msg)

    if re.match(r'P\d\d\d\d', place.id):
        # our item has an ID with P9999, add this row
        self.admin_table[dict_idx][key.upper()] = row
    else:
        res = self.admin_table[dict_idx].get(key.upper())
        if res is None:
            # Nothing there, add this row
            self.admin_table[dict_idx][key.upper()] = row
        else:
            # A node is already there and we don't have a P id - keep the existing ID
            place.id = res[CSVEntry.PLACE_ID]
def lookup_place(self, place: Loc.Loc) -> None:
    """
    Lookup a place in our geoname.org dictionary and update place with Geo_result with lat, long, District, etc
    The dictionary geo_result entry contains: Lat, Long, districtID (County or State or Province ID)
    There can be multiple entries if a city name isnt unique in a country

    :param place: Loc record - place_type selects which lookup is performed;
        georow_list and result_type are updated in-place
    """
    result_place: Loc.Loc = Loc.Loc()
    self.start = time.time()
    place.result_type = Result.STRONG_MATCH
    #place.admin2_name, modified = GeoKeys.admin2_normalize(place.admin2_name, place.country_iso)

    if place.country_iso != '' and place.country_name == '':
        place.country_name = self.get_country_name(place.country_iso)

    # Lookup Place based on Place Type
    if place.place_type == Loc.PlaceType.ADMIN1:
        self.select_admin1(place)
    elif place.place_type == Loc.PlaceType.ADMIN2:
        if place.admin1_id == '':
            self.get_admin1_id(place=place)
        self.select_admin2(place)
        #if len(place.georow_list) == 0:
        # Try search with some text replacements
        #place.admin2_name, modified = GeoKeys.admin2_normalize(place.admin2_name, place.country_iso)
        #if modified:
        #    self.select_admin2(place)
    elif place.place_type == Loc.PlaceType.COUNTRY:
        self.select_country(place)
    elif place.place_type == Loc.PlaceType.ADVANCED_SEARCH:
        self.advanced_search(place)
    else:
        # Lookup as City
        if place.admin2_id == '':
            self.get_admin2_id(place=place)
        self.select_city(place)

    # nm = place.original_entry
    # self.logger.debug(f'Search results for {place.target} pref[{place.prefix}]')
    min_score = 9999
    # Add search quality score to each entry
    for idx, rw in enumerate(place.georow_list):
        self.copy_georow_to_place(row=rw, place=result_place)
        if len(place.prefix) > 0 and result_place.prefix == '':
            result_place.prefix = ' '
            result_place.prefix_commas = ','
        else:
            result_place.prefix = ''
        score = self.match.match_score(inp_place=place, res_place=result_place)
        if score < min_score:
            min_score = score
        # Convert row tuple to list and extend so we can assign score
        update = list(rw)
        update.append(1)  # placeholder so the SCORE slot exists
        update[GeoKeys.Entry.SCORE] = score
        place.georow_list[idx] = tuple(update)  # Convert back from list to tuple
        # Remove items in prefix that are in result
        tk_list = result_place.original_entry.split(",")
        if place.place_type != Loc.PlaceType.ADVANCED_SEARCH:
            for item in tk_list:
                # NOTE(review): item is used as a regex pattern here; tokens
                # containing regex metacharacters would need re.escape - confirm inputs
                place.prefix = re.sub(item.strip(' ').lower(), '', place.prefix)

    # Downgrade a strong match if leftover prefix text or a poor best score remains
    if place.result_type == Result.STRONG_MATCH and len(place.prefix) > 0:
        place.result_type = Result.PARTIAL_MATCH
    if place.result_type == Result.STRONG_MATCH and min_score > 10:
        place.result_type = Result.PARTIAL_MATCH
def find_location(self, location: str, place: Loc.Loc, shutdown):
    """
    Find a location in the geoname dictionary.
    First parse the location into <prefix>, city, <district2>, district1, country.
    Then look it up in the place dictionary
    Update place with -- lat, lon, district, city, country_iso, result code

    :param location: raw location text
    :param place: Loc record updated in-place
    :param shutdown: True when app is shutting down (disables wildcard searches)
    """
    place.parse_place(place_name=location, geo_files=self.geo_files)
    # Successful Admin1 will also fill in country_iso
    # Use this to fill in country name if missing
    if place.country_name == '' and place.country_iso != '':
        place.country_name = self.geo_files.geodb.get_country_name(place.country_iso)
    if shutdown:
        # During shutdown there is no user verification and no reason to try wildcard searches
        self.geo_files.geodb.db.use_wildcards = False
    flags = ResultFlags(limited=False, filtered=False)
    result_list = []
    #self.logger.debug(f'== FIND LOCATION City=[{place.city1}] Adm2=[{place.admin2_name}]\
    #Adm1=[{place.admin1_name}] Pref=[{place.prefix}] Cntry=[{place.country_name}] iso=[{place.country_iso}] Type={place.place_type} ')

    # Save a shallow copy so we can restore fields
    self.save_place = copy.copy(place)
    if place.place_type == Loc.PlaceType.ADVANCED_SEARCH:
        # Lookup location with advanced search params
        self.logger.debug('Advanced Search')
        self.lookup_by_type(place, result_list, place.place_type, self.save_place)
        place.georow_list.clear()
        place.georow_list.extend(result_list)
        if len(place.georow_list) > 0:
            # Build list - sort and remove duplicates
            self.process_result(place=place, flags=flags)
            flags = self.build_result_list(place)
        return

    # Validate the country (sets result code checked just below)
    self.country_is_valid(place)
    # The country in this entry is not supported (not loaded into DB)
    if place.result_type == GeoKeys.Result.NOT_SUPPORTED:
        self.process_result(place=place, flags=flags)
        return

    # 1) Try standard lookup: city, county, state/province, country
    place.standard_parse = True
    #self.logger.debug(f' 1) Standard based on parsing. pref [{place.prefix}] type={place.place_type}')
    self.geo_files.geodb.lookup_place(place=place)
    result_list.extend(place.georow_list)
    self.update_rowlist_prefix(place=place)
    # Restore items
    place.city1 = self.save_place.city1
    place.admin2_name = self.save_place.admin2_name
    place.prefix = self.save_place.prefix
    place.extra = self.save_place.extra

    # try alternatives since parsing can be wrong
    # 2) Try a) Prefix as city, b) Admin2 as city
    place.standard_parse = False
    for ty in [Loc.PlaceType.PREFIX, Loc.PlaceType.ADMIN2]:
        self.lookup_by_type(place, result_list, ty, self.save_place)

    # 3) Try city as Admin2
    #self.logger.debug(f' 3) Lkp w Cit as Adm2. Target={place.city1} pref [{place.prefix}] ')
    self.lookup_as_admin2(place=place, result_list=result_list, save_place=self.save_place)

    # Move result list into place georow list
    place.georow_list.clear()
    place.georow_list.extend(result_list)

    if len(place.georow_list) > 0:
        # Sort and remove duplicates
        self.process_result(place=place, flags=flags)
        flags = self.build_result_list(place)

    if len(place.georow_list) == 0:
        # NO MATCH
        self.logger.debug(f'Not found.')
        # place = self.save_place
        if place.result_type != GeoKeys.Result.NO_COUNTRY and place.result_type != GeoKeys.Result.NOT_SUPPORTED:
            place.result_type = GeoKeys.Result.NO_MATCH
    elif len(place.georow_list) > 1:
        self.logger.debug(f'Success! {len(place.georow_list)} matches')
        place.result_type = GeoKeys.Result.MULTIPLE_MATCHES

    # Process the results
    self.process_result(place=place, flags=flags)
def display_result(self, place: Loc.Loc):
    """
    Display result details for an item.

    Sets up the review buttons, status label color, match listbox and
    GEDCOM event info in the Tk UI based on the lookup result in place.
    :param place: Loc record with the lookup result to display
    """
    # Enable buttons so user can either click Skip, or edit the item and Click Verify.
    place.safe_strings()
    TKHelper.enable_buttons(self.w.review_buttons)
    # Enable action buttons based on type of result
    if place.result_type == GeoKeys.Result.MULTIPLE_MATCHES or \
            place.result_type == GeoKeys.Result.NO_MATCH or \
            place.result_type == GeoKeys.Result.NO_COUNTRY:
        # Disable the Save & Map button until user clicks Verify and item is found
        self.set_save_allowed(False)
        TKHelper.set_preferred_button(self.w.verify_button, self.w.review_buttons, "Preferred.TButton")
    elif place.result_type == GeoKeys.Result.NOT_SUPPORTED:
        # Unsupported country - steer user toward Skip
        # self.set_save_allowed(True)  # Enable save button
        TKHelper.set_preferred_button(self.w.skip_button, self.w.review_buttons, "Preferred.TButton")
    else:
        # Found a match or Not supported - enable save and verify
        self.set_save_allowed(True)  # Enable save button
        TKHelper.set_preferred_button(self.w.save_button, self.w.review_buttons, "Preferred.TButton")

    # Display status and color based on success
    self.set_status_text(place.get_status())
    if place.result_type in GeoKeys.successful_match:
        if place.place_type == Loc.PlaceType.CITY:
            self.w.status.configure(style="Good.TLabel")
        else:
            self.w.status.configure(style="GoodCounty.TLabel")
    else:
        self.w.status.configure(style="Error.TLabel")

    # set Verify as preferred button when there are multiple candidates
    if len(place.georow_list) > 1:
        TKHelper.set_preferred_button(self.w.verify_button, self.w.review_buttons, "Preferred.TButton")
        self.set_save_allowed(False)

    if len(place.georow_list) > 0:
        # Display matches in listbox
        self.w.tree.focus()  # Set focus to listbox
        self.display_georow_list(place)
    else:
        # No matches
        self.w.user_entry.focus()  # Set focus to text edit widget
        self.display_one_georow(place.status_detail, place.geoid, score=9999, feat='')

    # Display GEDCOM person and event that this location refers to
    self.w.ged_event_info.set_text(
        f'{self.ancestry_file_handler.get_name(self.ancestry_file_handler.id)}: '
        f'{self.ancestry_file_handler.event_name} {self.ancestry_file_handler.date}')
    self.w.root.update_idletasks()
def match_score(self, inp_place: Loc.Loc, res_place: Loc.Loc) -> float:
    """
    Compute a match-quality score between user input and a DB result.

    :param inp_place: Input place structure with users text
    :param res_place: Result place structure with DB result
    :return: score 0-100 reflecting the difference between the user input and the result.
        0 is perfect match, 100 is no match
        Score is also adjusted based on Feature type.  More important features (large city) get lower result
    """
    inp_len = [0] * 5
    num_inp_tokens = 0.0
    in_score = 0

    # Create full place title (prefix,city,county,state,country) from input place.
    inp_title = inp_place.get_five_part_title()
    inp_title = GeoKeys.normalize_match_title(inp_title, inp_place.country_iso)
    inp_tokens = inp_title.split(',')

    # Create full place title (prefix,city,county,state,country) from result place
    res_place.prefix = ' '
    res_title = res_place.get_five_part_title()
    res_title = GeoKeys.normalize_match_title(res_title, res_place.country_iso)
    res_tokens = res_title.split(',')

    # Store length of original input tokens.  This is used for percent unmatched calculation
    for it, tk in enumerate(inp_tokens):
        inp_tokens[it] = inp_tokens[it].strip(' ')
        inp_len[it] = len(inp_tokens[it])

    # Create a list of all the words in result and save result len for percent calc
    res_word_list = ', '.join(map(str, res_tokens))
    orig_res_len = len(res_word_list)

    # Create a list of all the words in input
    input_words = ', '.join(map(str, inp_tokens))

    # Remove any matching sequences in input list and result
    res_word_list, input_words = self.remove_matching_sequences(res_word_list, input_words)

    # For each input token calculate percent of new (unmatched) size vs original size
    unmatched_input_tokens = input_words.split(',')

    # Each token in place hierarchy gets a different weighting
    # Prefix, city,county, state, country
    score_diags = ''

    # Calculate percent of USER INPUT text that was unmatched, then apply weighting
    for idx, tk in enumerate(inp_tokens):
        if inp_len[idx] > 0:
            unmatched_percent = int(100.0 * len(unmatched_input_tokens[idx].strip(' ')) / inp_len[idx])
            in_score += unmatched_percent * self.weight[idx]
            score_diags += f' {idx}) [{tk}]{inp_len[idx]} {unmatched_percent}% * {self.weight[idx]} '
            # self.logger.debug(f'{idx}) Rem=[{unmatched_input_tokens[idx].strip(" " )}] wgtd={unmatched_percent * self.weight[idx]}')
            num_inp_tokens += 1.0 * self.weight[idx]
            # self.logger.debug(f'{idx} [{inp_tokens2[idx]}:{inp_tokens[idx]}] rawscr={sc}% orig_len={inp_len[idx]} wgt={self.weight[idx]}')
            if idx < 2:
                # If the full first or second token of the result is in input then improve score
                # Bonus for a full match as against above partial matches
                if res_tokens[idx] in inp_tokens[idx]:
                    in_score -= self.first_token_match_bonus

    # Average over number of tokens (with fractional weight).  Gives 0-100% regardless of weighting and number of tokens
    in_score = in_score / num_inp_tokens
    # self.logger.debug(f'raw in={in_score} numtkn={num_inp_tokens}')

    # Calculate percent of DB RESULT text that was unmatched
    if orig_res_len > 0:
        out_score = int(100.0 * len(res_word_list.strip(' ')) / orig_res_len)
        # self.logger.debug(f"Out=[{res_word_list.strip(' ')}] orig_len={orig_res_len}")
    else:
        out_score = 0

    if not inp_place.standard_parse:
        # If Tokens were not in hierarchical order, give penalty
        parse_penalty = self.wrong_order_penalty
    else:
        parse_penalty = 0.0

    if '*' in inp_place.original_entry:
        # if it was a wildcard search it's hard to rank - add a penalty
        wildcard_penalty = self.wildcard_penalty
    else:
        wildcard_penalty = 0.0

    # Feature score is to ensure "important" places get higher rank (large city, etc)
    feature_score = Geodata.Geodata.get_priority(res_place.feature)

    # Add up scores - Each item is 0-100 and weighed as below
    in_weight = 1.0 - self.out_weight - self.feature_weight
    score = in_score * in_weight + out_score * self.out_weight + feature_score * self.feature_weight + parse_penalty + wildcard_penalty

    # self.logger.debug(f'SCORE {score:.1f} [{res_title}] out={out_score * out_weight:.1f} '
    #                   f'in={in_score:.1f} feat={feature_score * feature_weight:.1f} parse={parse_penalty}\n {score_diags}')
    return score