def __init__(self, components, country_rtree, debug=False):
    """Store the collaborators passed in and create an address formatter.

    Args:
        components: Kept unchanged on ``self.components``.
        country_rtree: Kept unchanged on ``self.country_rtree``.
        debug: Flag kept on ``self.debug``; defaults to ``False``.
    """
    self.debug = debug
    self.country_rtree = country_rtree
    self.components = components

    # A fresh formatter is built per instance rather than passed in.
    self.formatter = AddressFormatter()
def __init__(self, geoplanet_db):
    """Load GeoPlanet places and aliases from SQLite into memory.

    Args:
        geoplanet_db: Database location handed to ``sqlite3.connect``.
            The queries below read its ``places`` and ``aliases`` tables.

    Attributes set on the instance:
        db: The open SQLite connection (left open; not closed here).
        places: Dict keyed by the first column of each ``places`` row
            (presumably the place id -- confirm against the schema),
            mapping to a tuple of the remaining columns.
        coterminous_admins: Maps an admin place id to the id of its
            same-named child Town when that Town is the admin's only child.
        admins_with_ambiguous_city: Ids of every non-Town place that has a
            child Town carrying the same name.
        aliases: Plain dict mapping place id to a list of alias rows with
            the leading id column stripped.
        formatter: A new ``AddressFormatter``.
    """
    self.db = sqlite3.connect(geoplanet_db)

    # These aren't too large and it's easier to have them in memory
    self.places = {
        row[0]: row[1:]
        for row in self.db.execute('select * from places')
    }
    self.aliases = defaultdict(list)

    self.coterminous_admins = {}
    self.admins_with_ambiguous_city = set()

    print('Doing admin ambiguities')
    # String values are single-quoted: in SQLite a double-quoted token is an
    # identifier first and only falls back to a string literal as a legacy
    # quirk that can be disabled at build time.
    admin_query = """
        select p.id,
               (select count(*) from places
                where parent_id = p.id) as num_places,
               (select count(*) from places
                where parent_id = p.id and place_type = 'Town') as num_towns,
               p2.id
        from places p
        join places p2
            on p2.parent_id = p.id
            and p.name = p2.name
            and p.place_type != 'Town'
            and p2.place_type = 'Town'
        group by p.id
    """
    for row in self.db.execute(admin_query):
        place_id, num_places, num_towns, coterminous_town_id = row
        num_places = int(num_places)
        num_towns = int(num_towns)

        # The admin's sole child is the same-named town.
        if num_places == 1 and num_towns == 1:
            self.coterminous_admins[place_id] = coterminous_town_id
        # Every row is an admin that shares its name with a child Town.
        self.admins_with_ambiguous_city.add(place_id)

    print('num coterminous: {}'.format(len(self.coterminous_admins)))
    print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

    print('Doing aliases')
    # The explanatory remarks live here instead of as SQL "--" comments
    # inside the query text, so the statement cannot be truncated if the
    # string's line breaks are ever lost:
    #   * name_type 'S' is skipped: no colloquial aliases like
    #     "The Big Apple".
    #   * name_type 'V' is skipped: variants can often be demonyms like
    #     "Welsh" or "English" for UK (they are vetted in the next pass).
    #   * "p.id is NULL" turns the left join into an anti-join that
    #     excludes foreign-language state/county names.
    alias_query = """
        select a.*
        from aliases a
        left join places p
            on a.id = p.id
            and p.place_type in ('State', 'County')
            and a.language != p.language
        where name_type != 'S'
        and name_type != 'V'
        and p.id is NULL
        order by id, language,
            case name_type
                when 'P' then 1
                when 'Q' then 2
                when 'V' then 3
                when 'A' then 4
                when 'S' then 5
                else 6
            end
    """
    for row in self.db.execute(alias_query):
        # Skip aliases whose place is missing from the in-memory table.
        if not self.places.get(row[0]):
            continue
        self.aliases[row[0]].append(row[1:])

    print('Doing variant aliases')
    variant_query = """
        select a.*, p.name, p.country_code
        from aliases a
        join places p using(id)
        where a.name_type = 'V'
        and a.language = p.language
    """
    variant_aliases = 0
    for i, row in enumerate(self.db.execute(variant_query)):
        # The last two columns come from places; the rest is the alias row.
        place_name, country_code = row[-2:]
        country = country_code.lower()

        alias_row = row[:-2]
        place_id, alias, _name_type, language_code = alias_row

        # self.language_codes is defined outside this method; a value of
        # 'unk' means no language is passed to the comparison.
        language = self.language_codes[language_code]
        if language != 'unk':
            # Compare with affixes replaced when a replacement exists.
            alias_sans_affixes = name_affixes.replace_affixes(
                alias, language, country=country)
            if alias_sans_affixes:
                alias = alias_sans_affixes

            place_name_sans_affixes = name_affixes.replace_affixes(
                place_name, language, country=country)
            if place_name_sans_affixes:
                place_name = place_name_sans_affixes
        else:
            language = None

        # Keep a variant only when it is equivalent to the place's own
        # name; the stored row is the untouched alias row, not the
        # affix-stripped strings used for the comparison.
        if equivalent(place_name, alias,
                      toponym_abbreviations_gazetteer, language):
            self.aliases[place_id].append(alias_row[1:])
            variant_aliases += 1

        if i % 10000 == 0 and i > 0:
            print('tested {} variant aliases with {} positives'.format(
                i, variant_aliases))

    # Convert to a plain dict so later lookups of unknown ids raise
    # KeyError instead of silently inserting empty lists.
    self.aliases = dict(self.aliases)

    self.formatter = AddressFormatter()
def __init__(self):
    """Create the instance's own ``AddressFormatter``."""
    # No arguments are taken; the formatter is the only state set here.
    self.formatter = AddressFormatter()