def init_disambiguation():
    """
    (Re)load the module-level script lookup tables in place.

    ``char_scripts`` is overwritten with the result of
    ``get_chars_by_script()`` and ``script_languages`` is updated with a
    ``script -> set(languages)`` entry for every script returned by
    ``get_script_languages()``. Both containers are mutated rather than
    rebound, so references to them held elsewhere see the new contents.
    Entries already in ``script_languages`` for scripts that are not
    returned again are left untouched.
    """
    global char_scripts, script_languages

    # Slice assignment swaps the contents while keeping the same list object.
    char_scripts[:] = get_chars_by_script()

    # .items() works on both Python 2 and 3 (dict.iteritems() is Python 2 only).
    script_languages.update({
        script: set(langs)
        for script, langs in get_script_languages().items()
    })
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
    """
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Note: the input file is expected to have been created using
    osmfilter. Use fetch_osm_address_data.sh for planet or copy the
    admin borders commands if using other geometries.

    The build runs in four passes:

    1. Build the Quattroshapes and Zetashapes neighborhood indices.
    2. Feed every polygon name and every OSM name tag into an IDF index.
    3. For each named OSM element, pick the candidate polygon with the most
       similar name; if the similarity reaches ``cls.DUPE_THRESHOLD`` the
       polygon is stored together with the OSM attributes.
    4. Store the polygons that were never matched, provided they are
       Zetashapes polygons or Quattroshapes polygons at neighborhood level.

    :param filename: OSM file; it is read twice through ``parse_osm``
    :param quattroshapes_dir: handed to
        ``QuattroshapesReverseGeocoder.create_neighborhoods_index``
    :param output_dir: ``save_dir`` of the index being built
    :param scratch_dir: working directory, passed to ``ensure_dir``; a
        ``qs_neighborhoods`` sub-directory is used for Quattroshapes
    :return: the populated index (an instance of ``cls``)
    """
    index = cls(save_dir=output_dir)

    ensure_dir(scratch_dir)

    logger = logging.getLogger('neighborhoods')
    logger.setLevel(logging.INFO)

    qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)

    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
    logger.info('Creating Zetashapes neighborhoods')
    zs = cls.create_zetashapes_neighborhoods_index()

    logger.info('Creating IDF index')
    idf = IDFIndex()

    char_scripts = get_chars_by_script()

    # IDF pass 1: names of the polygons themselves.
    for idx in (zs, qs):
        for props, _poly in idx.polygons:
            name = props.get('name')
            if name is not None:
                idf.update(cls.count_words(name))

    # IDF pass 2: every OSM tag whose key starts with one of the name tags.
    # str.startswith accepts a tuple, so the prefixes are built once.
    name_prefixes = tuple(OSM_NAME_TAGS)
    for _key, attrs, _deps in parse_osm(filename):
        for k, v in attrs.items():
            if k.startswith(name_prefixes):
                idf.update(cls.count_words(v))

    qs.matched = [False] * qs.i
    zs.matched = [False] * zs.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
    num_polys = 0
    for _node_id, attrs, _deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue

        if not attrs.get('name'):
            continue

        is_neighborhood = attrs.get('place') == 'neighbourhood'

        # Plain name tags first, then the "<tag>:<suffix>" variants, in tag order.
        osm_names = [name for name in (attrs.get(key) for key in OSM_NAME_TAGS) if name]
        for name_key in OSM_NAME_TAGS:
            prefix = '{}:'.format(name_key)
            osm_names.extend(v for k, v in attrs.items() if k.startswith(prefix))

        ranks = []
        for idx in (zs, qs):
            candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)
            if not candidates:
                continue

            max_sim = 0.0
            arg_max = None

            # Polygon names after cls.regex_replacements, cached per candidate.
            normalized_qs_names = {}

            for osm_name in osm_names:
                contains_ideographs = any(
                    (char_scripts[ord(c)] or '').lower() in ideographic_scripts
                    for c in safe_decode(osm_name)
                )

                for i in candidates:
                    props, poly = idx.polygons[i]
                    name = normalized_qs_names.get(i)
                    if not name:
                        name = props.get('name')
                        if not name:
                            continue
                        for pattern, repl in cls.regex_replacements:
                            name = pattern.sub(repl, name)
                        normalized_qs_names[i] = name

                    # The level filter only applies to Quattroshapes. Zetashapes
                    # polygons are treated as neighborhoods regardless of level
                    # everywhere else in this function, so filtering them here
                    # would wrongly discard them for place=neighbourhood nodes.
                    if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                        continue

                    if contains_ideographs:
                        # Many Han/Hangul characters are common, shouldn't use IDF
                        sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
                    else:
                        sim = NeighborhoodDeduper.compare(osm_name, name, idf)

                    if sim > max_sim:
                        max_sim = sim
                        arg_max = (max_sim, props, poly.context, idx, i)

            if arg_max:
                ranks.append(arg_max)

        # max() returns the first of equally scored entries, which is the same
        # element a stable descending sort would put first.
        best = max(ranks, key=operator.itemgetter(0)) if ranks else None
        if best is not None and best[0] >= cls.DUPE_THRESHOLD:
            _score, props, poly, idx, i = best

            if idx is zs:
                attrs['polygon_type'] = 'neighborhood'
            elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                attrs['polygon_type'] = 'neighborhood'
            else:
                attrs['polygon_type'] = 'local_admin'

            attrs['source'] = 'osm'
            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True

        # Counts every named element processed, matched or not.
        num_polys += 1
        if num_polys % 1000 == 0 and num_polys > 0:
            logger.info('did {} neighborhoods'.format(num_polys))

    for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
        for i, (props, poly) in enumerate(idx.polygons):
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue
            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

    return index
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
    """
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Note: the input file is expected to have been created using
    osmfilter. Use fetch_osm_address_data.sh for planet or copy the
    admin borders commands if using other geometries.

    The build runs in five passes:

    1. Build the ClickThatHood, OSM-neighborhood and Quattroshapes indices and
       load the country and OSM admin R-trees.
    2. Feed every polygon name and every OSM name tag into an IDF index.
    3. Add every OSM neighborhood polygon to the result as a suburb.
    4. For each named OSM element, pick the ClickThatHood/Quattroshapes
       candidate with the most similar name; if the similarity reaches
       ``cls.DUPE_THRESHOLD`` and no existing OSM boundary rules the element
       out, store the polygon together with the OSM attributes.
    5. Store the ClickThatHood/Quattroshapes polygons that were never matched
       and that qualify as neighborhoods or city districts.

    :param filename: OSM file; it is read twice through ``parse_osm``
    :param quattroshapes_dir: Quattroshapes input directory; a
        ``qs_neighborhoods`` scratch sub-directory is created inside it
    :param country_rtree_dir: passed to ``OSMCountryReverseGeocoder.load``
    :param osm_rtree_dir: passed to ``OSMReverseGeocoder.load``
    :param osm_neighborhood_borders_file: passed to
        ``OSMNeighborhoodReverseGeocoder.create_neighborhoods_index``
    :param output_dir: ``save_dir`` of the index being built
    :return: the populated index (an instance of ``cls``)
    """
    index = cls(save_dir=output_dir)

    logger = logging.getLogger('neighborhoods')

    qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)

    logger.info('Creating ClickThatHood neighborhoods')
    cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()

    logger.info('Creating OSM neighborhoods')
    osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)

    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)

    country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)

    osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
    osm_admin_rtree.cache_size = 1000

    logger.info('Creating IDF index')
    idf = IDFIndex()

    char_scripts = get_chars_by_script()

    # IDF pass 1: names of the polygons themselves.
    for idx in (cth, qs, osmn):
        for i in six.moves.xrange(idx.i):
            name = idx.get_properties(i).get('name')
            if name is not None:
                idf.update(cls.count_words(name))

    # IDF pass 2: every OSM tag whose key starts with one of the name tags.
    # str.startswith accepts a tuple, so the prefixes are built once.
    name_prefixes = tuple(OSM_NAME_TAGS)
    for _key, attrs, _deps in parse_osm(filename):
        for k, v in six.iteritems(attrs):
            if k.startswith(name_prefixes):
                idf.update(cls.count_words(v))

    # OSM's own neighborhood polygons always go into the index.
    for i in six.moves.xrange(osmn.i):
        props = osmn.get_properties(i)
        poly = osmn.get_polygon(i)

        props['source'] = 'osm'
        props['component'] = AddressFormatter.SUBURB
        props['polygon_type'] = 'neighborhood'

        index.index_polygon(poly.context)
        index.add_polygon(poly.context, props)

    qs.matched = [False] * qs.i
    cth.matched = [False] * cth.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
    num_polys = 0
    for osm_key, attrs, _deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue

        if not attrs.get('name'):
            continue

        is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)

        # Element keys look like "<type>:<numeric id>". int() works on both
        # Python 2 and 3 (Python 2 promotes to long automatically).
        id_type, raw_id = osm_key.split(':')
        element_id = int(raw_id)

        # Record the element's identity on its own attributes. The original
        # wrote these to ``props``, a stale variable left over from an earlier
        # loop, which clobbered an unrelated polygon's properties. OSM boundary
        # dicts used below are read through the same 'type'/'id' keys.
        attrs['type'] = id_type
        attrs['id'] = element_id

        country, _candidate_languages = country_rtree.country_and_languages(lat, lon)
        component_name = osm_address_components.component_from_properties(country, attrs)

        # Plain name tags first, then the "<tag>:<suffix>" variants, in tag order.
        osm_names = [name for name in (attrs.get(key) for key in OSM_NAME_TAGS) if name]
        for name_key in OSM_NAME_TAGS:
            prefix = '{}:'.format(name_key)
            osm_names.extend(v for k, v in six.iteritems(attrs) if k.startswith(prefix))

        ranks = []
        for idx in (cth, qs):
            candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
            if not candidates:
                continue

            max_sim = 0.0
            arg_max = None

            # Polygon names after cls.regex_replacements, cached per candidate.
            normalized_qs_names = {}

            for osm_name in osm_names:
                contains_ideographs = any(
                    (char_scripts[ord(c)] or '').lower() in ideographic_scripts
                    for c in safe_decode(osm_name)
                )

                for i in candidates:
                    props = idx.get_properties(i)
                    name = normalized_qs_names.get(i)
                    if not name:
                        name = props.get('name')
                        if not name:
                            continue
                        for pattern, repl in cls.regex_replacements:
                            name = pattern.sub(repl, name)
                        normalized_qs_names[i] = name

                    if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                        continue

                    if contains_ideographs:
                        # Many Han/Hangul characters are common, shouldn't use IDF
                        sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
                    else:
                        sim = NeighborhoodDeduper.compare(osm_name, name, idf)

                    if sim > max_sim:
                        max_sim = sim
                        poly = idx.get_polygon(i)
                        arg_max = (max_sim, props, poly.context, idx, i)

            if arg_max:
                ranks.append(arg_max)

        # max() returns the first of equally scored entries, which is the same
        # element a stable descending sort would put first.
        best = max(ranks, key=operator.itemgetter(0)) if ranks else None
        if best is not None and best[0] >= cls.DUPE_THRESHOLD:
            _score, props, poly, idx, i = best

            existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
            existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)

            node_name = attrs.get('name', '').lower()
            city_order = AddressFormatter.component_order[AddressFormatter.CITY]

            skip_node = False

            for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
                for poly_index, osm_props in enumerate(boundaries):
                    name = osm_props.get('name')
                    # Only exact name matches here since we're comparing OSM to OSM
                    if name and name.lower() != node_name:
                        continue

                    if boundaries is existing_neighborhood_boundaries:
                        # An OSM neighborhood polygon already covers this element.
                        skip_node = True
                        break

                    # Boundaries listed after this one are passed as its containers.
                    containing_ids = [(boundary['type'], boundary['id'])
                                      for boundary in existing_osm_boundaries[poly_index + 1:]]
                    containing_component = osm_address_components.component_from_properties(
                        country, osm_props, containing=containing_ids)

                    if (containing_component and
                            containing_component != component_name and
                            AddressFormatter.component_order[containing_component] <= city_order):
                        skip_node = True
                        break

                if skip_node:
                    break

            # Skip this element
            if skip_node:
                continue

            if idx is cth:
                if props['component'] == AddressFormatter.SUBURB:
                    attrs['polygon_type'] = 'neighborhood'
                elif props['component'] == AddressFormatter.CITY_DISTRICT:
                    attrs['polygon_type'] = 'local_admin'
                else:
                    continue
                source = 'osm_cth'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                source = 'osm_quattro'
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'

            containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
            component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
            attrs['component'] = component

            attrs['source'] = source
            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True

        # Not reached for elements skipped with ``continue`` above.
        num_polys += 1
        if num_polys % 1000 == 0 and num_polys > 0:
            logger.info('did {} neighborhoods'.format(num_polys))

    for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
        for i in six.moves.xrange(idx.i):
            # Check the flag first so matched polygons are never loaded.
            if idx.matched[i]:
                continue

            props = idx.get_properties(i)
            props['source'] = source

            if idx is cth:
                component = props['component']
                if component == AddressFormatter.SUBURB:
                    props['polygon_type'] = 'neighborhood'
                elif component == AddressFormatter.CITY_DISTRICT:
                    props['polygon_type'] = 'local_admin'
                else:
                    continue
            elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                component = AddressFormatter.SUBURB
                name = props.get('name')
                if not name:
                    continue
                for pattern, repl in cls.regex_replacements:
                    name = pattern.sub(repl, name)

                props['name'] = name

                if cls.quattroshapes_city_district_regex.match(name):
                    component = AddressFormatter.CITY_DISTRICT

                props['component'] = component
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue

            poly = idx.get_polygon(i)
            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

    return index
def init_disambiguation():
    """
    (Re)load the module-level script lookup tables in place.

    ``char_scripts`` is overwritten with the result of
    ``get_chars_by_script()`` and ``script_languages`` is updated with a
    ``script -> set(languages)`` entry for every script returned by
    ``get_script_languages()``. Both containers are mutated rather than
    rebound, so references to them held elsewhere see the new contents.
    Entries already in ``script_languages`` for scripts that are not
    returned again are left untouched.
    """
    global char_scripts, script_languages

    # Slice assignment swaps the contents while keeping the same list object.
    char_scripts[:] = get_chars_by_script()

    # .items() works on both Python 2 and 3 (dict.iteritems() is Python 2 only).
    script_languages.update({
        script: set(langs)
        for script, langs in get_script_languages().items()
    })
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
    """
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Note: the input file is expected to have been created using
    osmfilter. Use fetch_osm_address_data.sh for planet or copy the
    admin borders commands if using other geometries.

    The build runs in four passes:

    1. Build the Quattroshapes and Zetashapes neighborhood indices.
    2. Feed every polygon name and every OSM name tag into an IDF index.
    3. For each named OSM element, pick the candidate polygon with the most
       similar name; if the similarity reaches ``cls.DUPE_THRESHOLD`` the
       polygon is stored together with the OSM attributes and a source of
       ``'osm_zeta'`` or ``'osm_quattro'``.
    4. Store the polygons that were never matched, provided they are
       Zetashapes polygons or Quattroshapes polygons at neighborhood level.

    :param filename: OSM file; it is read twice through ``parse_osm``
    :param quattroshapes_dir: handed to
        ``QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index``
    :param output_dir: ``save_dir`` of the index being built
    :param scratch_dir: working directory, passed to ``ensure_dir``; a
        ``qs_neighborhoods`` sub-directory is used for Quattroshapes
    :return: the populated index (an instance of ``cls``)
    """
    index = cls(save_dir=output_dir)

    ensure_dir(scratch_dir)

    logger = logging.getLogger('neighborhoods')
    logger.setLevel(logging.INFO)

    qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)

    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(
        quattroshapes_dir, qs_scratch_dir)
    logger.info('Creating Zetashapes neighborhoods')
    zs = cls.create_zetashapes_neighborhoods_index()

    logger.info('Creating IDF index')
    idf = IDFIndex()

    char_scripts = get_chars_by_script()

    # IDF pass 1: names of the polygons themselves.
    for idx in (zs, qs):
        for props, _poly in idx.polygons:
            name = props.get('name')
            if name is not None:
                idf.update(cls.count_words(name))

    # IDF pass 2: every OSM tag whose key starts with one of the name tags.
    # str.startswith accepts a tuple, so the prefixes are built once.
    # .items() works on both Python 2 and 3 (dict.iteritems() is Python 2 only).
    name_prefixes = tuple(OSM_NAME_TAGS)
    for _key, attrs, _deps in parse_osm(filename):
        for k, v in attrs.items():
            if k.startswith(name_prefixes):
                idf.update(cls.count_words(v))

    qs.matched = [False] * qs.i
    zs.matched = [False] * zs.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
    num_polys = 0
    for _node_id, attrs, _deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue

        if not attrs.get('name'):
            continue

        is_neighborhood = attrs.get('place') == 'neighbourhood'

        # Plain name tags first, then the "<tag>:<suffix>" variants, in tag order.
        osm_names = [name for name in (attrs.get(key) for key in OSM_NAME_TAGS) if name]
        for name_key in OSM_NAME_TAGS:
            prefix = '{}:'.format(name_key)
            osm_names.extend(v for k, v in attrs.items() if k.startswith(prefix))

        ranks = []
        for idx in (zs, qs):
            candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
            if not candidates:
                continue

            max_sim = 0.0
            arg_max = None

            # Polygon names after cls.regex_replacements, cached per candidate.
            normalized_qs_names = {}

            for osm_name in osm_names:
                contains_ideographs = any(
                    (char_scripts[ord(c)] or '').lower() in ideographic_scripts
                    for c in safe_decode(osm_name)
                )

                for i in candidates:
                    props, poly = idx.polygons[i]
                    name = normalized_qs_names.get(i)
                    if not name:
                        name = props.get('name')
                        if not name:
                            continue
                        for pattern, repl in cls.regex_replacements:
                            name = pattern.sub(repl, name)
                        normalized_qs_names[i] = name

                    # The level filter is applied to Quattroshapes candidates only.
                    if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                        continue

                    if contains_ideographs:
                        # Many Han/Hangul characters are common, shouldn't use IDF
                        sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
                    else:
                        sim = NeighborhoodDeduper.compare(osm_name, name, idf)

                    if sim > max_sim:
                        max_sim = sim
                        arg_max = (max_sim, props, poly.context, idx, i)

            if arg_max:
                ranks.append(arg_max)

        # max() returns the first of equally scored entries, which is the same
        # element a stable descending sort would put first.
        best = max(ranks, key=operator.itemgetter(0)) if ranks else None
        if best is not None and best[0] >= cls.DUPE_THRESHOLD:
            _score, props, poly, idx, i = best

            if idx is zs:
                attrs['polygon_type'] = 'neighborhood'
                source = 'osm_zeta'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                source = 'osm_quattro'
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'

            attrs['source'] = source
            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True

        # Counts every named element processed, matched or not.
        num_polys += 1
        if num_polys % 1000 == 0 and num_polys > 0:
            logger.info('did {} neighborhoods'.format(num_polys))

    for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
        for i, (props, poly) in enumerate(idx.polygons):
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue
            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

    return index
if prefix_search and self.trie.get(token[:prefix_len]): yield (token_types.PHRASE, [(c, ) + t], prefix_search) continue yield c, t, data street_types_gazetteer = DictionaryPhraseFilter( 'street_types.txt', 'directionals.txt', 'concatenated_suffixes_separable.txt', 'concatenated_suffixes_inseparable.txt', 'concatenated_prefixes_separable.txt', 'stopwords.txt', ) char_scripts = get_chars_by_script() script_languages = { script: set(langs) for script, langs in get_script_languages().iteritems() } UNKNOWN_SCRIPT = 'Unknown' COMMON_SCRIPT = 'Common' MAX_ASCII = 127 def get_string_script(s): s = safe_decode(s) str_len = len(s) script = last_script = UNKNOWN_SCRIPT is_ascii = True
# For toponyms, we want to limit the countries we consider to those where # the place names can themselves be considered training examples of the language WELL_REPRESENTED_LANGUAGE_COUNTRIES = { 'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']), 'fr': set(['fr']), 'it': set(['it']), 'de': set(['de', 'at']), 'nl': set(['nl']), 'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy', 've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa', 'ni', 'hn']), 'pt': set(['pt', 'br']), } char_scripts = get_chars_by_script() script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())} lang_scripts = defaultdict(set) for script, langs in six.iteritems(script_languages): for lang in langs: lang_scripts[lang].add(script) lang_scripts = dict(lang_scripts) UNKNOWN_SCRIPT = 'Unknown' COMMON_SCRIPT = 'Common' MAX_ASCII = 127 def get_string_script(s):