def create_neighborhoods_index(cls):
    '''
    Clone the click_that_hood repo and build a polygon index of its
    neighborhood GeoJSON files, tagging every feature with the address
    component configured for its source file.

    Returns the populated index (an instance of cls).
    '''
    scratch_dir = cls.SCRATCH_DIR
    repo_path = os.path.join(scratch_dir, 'click_that_hood')
    cls.clone_repo(repo_path)
    data_path = os.path.join(repo_path, 'public', 'data')

    neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods')
    ensure_dir(neighborhoods_dir)

    index = cls(save_dir=neighborhoods_dir)
    for c in cls.config['files']:
        filename = c['filename']
        component = c['component']
        path = os.path.join(data_path, filename)
        # Fix: use a context manager so the GeoJSON file handle is closed
        # (previously json.load(open(path)) leaked one handle per file)
        with open(path) as geojson_file:
            features = json.load(geojson_file)['features']
        # Stamp each feature with the configured address component
        for feature in features:
            feature['properties']['component'] = component
        try:
            index.add_geojson_like_file(features)
        except ValueError:
            # Skip files whose geometries the index rejects; best-effort load
            continue
    return index
def download_cldr(temp_dir=None):
    '''
    Download the CLDR archive and extract it into CLDR_DIR, replacing
    any existing copy.

    :param temp_dir: directory for the downloaded archive; defaults to
        the system temp directory
    '''
    # Start from a clean destination directory
    if os.path.exists(CLDR_DIR):
        shutil.rmtree(CLDR_DIR)
    ensure_dir(CLDR_DIR)

    download_dir = temp_dir or tempfile.gettempdir()
    archive_name = CLDR_URL.rsplit('/', 1)[-1]
    archive_path = os.path.join(download_dir, archive_name)

    # Shell out to wget/unzip, raising CalledProcessError on failure
    subprocess.check_call(['wget', CLDR_URL, '-O', archive_path])
    subprocess.check_call(['unzip', archive_path, '-d', CLDR_DIR])
def download_file(self, wof_id):
    '''
    Fetch a single Who's On First record from S3 into the local cache.

    Best-effort: returns True when the object exists remotely and is
    present locally afterwards (downloading it if needed), False if the
    object is missing or any S3/filesystem error occurs.
    '''
    s3_path, filename = self.path_and_filename(wof_id)
    local_path = self.local_path(wof_id)
    s3_key = six.u('/').join(('data', s3_path, filename))
    try:
        # Probe first so a missing key turns into a clean False
        self.s3.head_object(Bucket=self.WOF_S3_BUCKET, Key=s3_key)
        ensure_dir(os.path.dirname(local_path))
        # Skip the transfer when the file is already cached locally
        if not os.path.exists(local_path):
            self.s3.download_file(self.WOF_S3_BUCKET, s3_key, local_path)
        return True
    except Exception:
        # Deliberately broad: any failure means "not downloaded"
        return False
def download_wof_postcodes(wof_dir):
    '''
    Clone the main Who's On First place-data repo into wof_dir, then walk
    the seed-URLs listing and clone every repo that reports a non-zero
    record count.

    :param wof_dir: local directory to clone the repos into
    '''
    ensure_dir(wof_dir)
    clone_repo(wof_dir, WOF_PLACE_DATA_REPO)

    response = requests.get(SEED_URLS_JSON)
    # Silently does nothing when the listing can't be fetched (best-effort,
    # matching the original behavior)
    if response.ok:
        content = json.loads(response.content)
        for d in content:
            repo_name = d['name']
            # Only clone repos that actually contain records
            if int(d.get('count', 0)) > 0:
                repo = d['url']
                print('doing {}'.format(repo_name))
                # Fix: dropped the unused `repo_dir =` binding
                clone_repo(wof_dir, repo)
            else:
                print('skipping {}'.format(repo_name))
def create_zetashapes_neighborhoods_index(cls):
    '''
    Clone the zetashapes neighborhoods repo and build a geohash polygon
    index from its GeoJSON files.

    Preference order per base name: the GeoNames-matched file
    ("gn-<name>.geojson") when one exists, otherwise the plain file if
    its metadata marks it as a neighborhood.
    '''
    scratch_dir = cls.SCRATCH_DIR
    repo_path = os.path.join(scratch_dir, 'neighborhoods')
    cls.clone_repo(repo_path)
    neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
    ensure_dir(neighborhoods_dir)

    index = GeohashPolygonIndex()

    have_geonames = set()
    is_neighborhood = set()
    # First pass: note which base names have GeoNames variants and which
    # metadata files mark the shape as a neighborhood
    for filename in os.listdir(repo_path):
        path = os.path.join(repo_path, filename)
        base_name = filename.split('.')[0].split('gn-')[-1]
        if filename.endswith('.geojson') and filename.startswith('gn-'):
            have_geonames.add(base_name)
        elif filename.endswith('metadata.json'):
            # Fix: close the metadata file handle (was json.load(open(...)))
            with open(path) as meta_file:
                data = json.load(meta_file)
            # NOTE(review): accepts a missing noun or 'rione' — looks
            # intentional but confirm against the metadata schema
            if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'):
                is_neighborhood.add(base_name)

    # Second pass: load the chosen GeoJSON file for each base name
    for filename in os.listdir(repo_path):
        if not filename.endswith('.geojson'):
            continue
        base_name = filename.rsplit('.geojson')[0]
        if base_name in have_geonames:
            geojson_path = os.path.join(repo_path, 'gn-{}'.format(filename))
        elif base_name in is_neighborhood:
            geojson_path = os.path.join(repo_path, filename)
        else:
            continue
        # Fix: context managers so neither GeoJSON handle leaks
        with open(geojson_path) as geojson_file:
            index.add_geojson_like_file(json.load(geojson_file)['features'])
    return index
def __init__(self, filename, db_dir):
    '''
    Set up state for extracting intersections from an OSM input file.

    :param filename: path to the OSM input file to process
    :param db_dir: directory under which the 'ways' and 'nodes'
        LevelDB stores are created
    '''
    self.filename = filename
    # Signed-long array keeps node ids compact vs. a Python list
    self.node_ids = array.array('l')
    self.logger = logging.getLogger('osm.intersections')

    # Store these in a LevelDB
    ensure_dir(db_dir)
    ways_dir = os.path.join(db_dir, 'ways')
    ensure_dir(ways_dir)
    nodes_dir = os.path.join(db_dir, 'nodes')
    ensure_dir(nodes_dir)
    self.way_props = LevelDB(ways_dir)
    self.node_props = LevelDB(nodes_dir)

    # These form a graph and should always have the same length
    self.intersection_edges_nodes = array.array('l')
    self.intersection_edges_ways = array.array('l')
def __init__(self, wof_dir, cache_size=10000, **s3_args):
    '''
    Set up local Who's On First admin-data access backed by S3.

    :param wof_dir: base local directory for WOF data
    :param cache_size: NOTE(review): accepted but never used in this
        initializer — presumably meant to bound a record cache; confirm
        whether it should be stored or forwarded to WhosOnFirst
    :param s3_args: extra keyword arguments passed through to the
        WhosOnFirst client
    '''
    self.wof_dir = wof_dir
    self.admin_dir = os.path.join(wof_dir, 'admin')
    ensure_dir(self.admin_dir)
    self.client = WhosOnFirst(self.admin_dir, **s3_args)
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
    '''
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Builds Quattroshapes and Zetashapes neighborhood indices, trains an IDF
    model over all known neighborhood names, then fuzzily matches each OSM
    neighborhood/suburb point to a candidate polygon. Matched polygons are
    added with the OSM point's attributes; unmatched neighborhood-level
    polygons are added with their own properties.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other geometries.
    '''
    index = cls(save_dir=output_dir)
    ensure_dir(scratch_dir)
    logger = logging.getLogger('neighborhoods')
    logger.setLevel(logging.INFO)
    qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)
    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
    logger.info('Creating Zetashapes neighborhoods')
    zs = cls.create_zetashapes_neighborhoods_index()
    logger.info('Creating IDF index')
    idf = IDFIndex()
    char_scripts = get_chars_by_script()

    # Train IDF on every polygon name from both indices...
    for idx in (zs, qs):
        for i, (props, poly) in enumerate(idx.polygons):
            name = props.get('name')
            if name is not None:
                doc = cls.count_words(name)
                idf.update(doc)
    # ...and on every name-like tag value in the OSM file
    for key, attrs, deps in parse_osm(filename):
        for k, v in attrs.iteritems():
            if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                doc = cls.count_words(v)
                idf.update(doc)

    # Track which polygons have been claimed by an OSM point
    qs.matched = [False] * qs.i
    zs.matched = [False] * zs.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
    num_polys = 0
    for node_id, attrs, deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue
        osm_name = attrs.get('name')
        if not osm_name:
            continue
        is_neighborhood = attrs.get('place') == 'neighbourhood'
        ranks = []
        # Collect all name variants: base name tags plus localized "name:xx" tags
        osm_names = []
        for key in OSM_NAME_TAGS:
            name = attrs.get(key)
            if name:
                osm_names.append(name)
        for name_key in OSM_NAME_TAGS:
            osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))])
        for idx in (zs, qs):
            candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)
            if candidates:
                max_sim = 0.0
                arg_max = None
                normalized_qs_names = {}  # per-candidate normalized name cache
                for osm_name in osm_names:
                    contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts for c in safe_decode(osm_name)))
                    for i in candidates:
                        props, poly = idx.polygons[i]
                        name = normalized_qs_names.get(i)
                        if not name:
                            name = props.get('name')
                            if not name:
                                continue
                            for pattern, repl in cls.regex_replacements:
                                name = pattern.sub(repl, name)
                            normalized_qs_names[i] = name
                        level = props.get(QuattroshapesReverseGeocoder.LEVEL)
                        # NOTE(review): zetashapes polygons have no LEVEL, so this
                        # also skips all zs candidates for neighborhood points; a
                        # later variant guards this with `idx is qs` — confirm intent
                        if is_neighborhood and level != 'neighborhood':
                            continue
                        if not contains_ideographs:
                            sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                        else:
                            # Many Han/Hangul characters are common, shouldn't use IDF
                            sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
                        if sim > max_sim:
                            max_sim = sim
                            arg_max = (max_sim, props, poly.context, idx, i)
                if arg_max:
                    ranks.append(arg_max)
        # Best candidate across both indices wins if it clears the threshold
        ranks.sort(key=operator.itemgetter(0), reverse=True)
        if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
            score, props, poly, idx, i = ranks[0]
            if idx is zs:
                attrs['polygon_type'] = 'neighborhood'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'
            attrs['source'] = 'osm'
            # Store the polygon with the OSM point's attributes
            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True
            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

    # Add leftover (unmatched) neighborhood-level polygons under their own names
    for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
        for i, (props, poly) in enumerate(idx.polygons):
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue
            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)
    return index
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
    '''
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Combines three polygon sources (ClickThatHood, OSM neighborhood borders,
    Quattroshapes), trains an IDF model over neighborhood names, then fuzzily
    matches OSM neighborhood/suburb points to candidate polygons, skipping
    points already covered by existing OSM admin/neighborhood boundaries.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other geometries.
    '''
    index = cls(save_dir=output_dir)
    logger = logging.getLogger('neighborhoods')
    qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)
    logger.info('Creating ClickThatHood neighborhoods')
    cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
    logger.info('Creating OSM neighborhoods')
    osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
    country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
    osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
    # Keep the admin r-tree's polygon cache small for this batch job
    osm_admin_rtree.cache_size = 1000
    logger.info('Creating IDF index')
    idf = IDFIndex()
    char_scripts = get_chars_by_script()

    # Train IDF on every polygon name from all three sources...
    for idx in (cth, qs, osmn):
        for i in xrange(idx.i):
            props = idx.get_properties(i)
            name = props.get('name')
            if name is not None:
                doc = cls.count_words(name)
                idf.update(doc)
    # ...and on every name-like tag value in the OSM file
    for key, attrs, deps in parse_osm(filename):
        for k, v in six.iteritems(attrs):
            if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                doc = cls.count_words(v)
                idf.update(doc)

    # OSM neighborhood border polygons are trusted as-is: add them all up front
    for i in six.moves.xrange(osmn.i):
        props = osmn.get_properties(i)
        poly = osmn.get_polygon(i)
        props['source'] = 'osm'
        props['component'] = AddressFormatter.SUBURB
        props['polygon_type'] = 'neighborhood'
        index.index_polygon(poly.context)
        index.add_polygon(poly.context, props)

    # Track which polygons have been claimed by an OSM point
    qs.matched = [False] * qs.i
    cth.matched = [False] * cth.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
    num_polys = 0
    for element_id, attrs, deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue
        osm_name = attrs.get('name')
        if not osm_name:
            continue
        id_type, element_id = element_id.split(':')
        element_id = long(element_id)
        # NOTE(review): `props` here is stale — it is whatever the preceding
        # osmn loop last bound, not this element's properties. These two writes
        # look like they were meant to target `attrs` (or a fresh dict); confirm.
        props['type'] = id_type
        props['id'] = element_id
        # NOTE(review): possible_neighborhood is computed but never used below
        possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
        is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)
        country, candidate_languages = country_rtree.country_and_languages(lat, lon)
        component_name = None  # immediately overwritten below
        component_name = osm_address_components.component_from_properties(country, attrs)
        ranks = []
        # Collect all name variants: base name tags plus localized "name:xx" tags
        osm_names = []
        for key in OSM_NAME_TAGS:
            name = attrs.get(key)
            if name:
                osm_names.append(name)
        for name_key in OSM_NAME_TAGS:
            osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
        for idx in (cth, qs):
            candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
            if candidates:
                max_sim = 0.0
                arg_max = None
                normalized_qs_names = {}  # per-candidate normalized name cache
                for osm_name in osm_names:
                    contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts for c in safe_decode(osm_name)))
                    for i in candidates:
                        props = idx.get_properties(i)
                        name = normalized_qs_names.get(i)
                        if not name:
                            name = props.get('name')
                            if not name:
                                continue
                            for pattern, repl in cls.regex_replacements:
                                name = pattern.sub(repl, name)
                            normalized_qs_names[i] = name
                        # Neighborhood points should only match neighborhood-level QS polygons
                        if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                            continue
                        if not contains_ideographs:
                            sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                        else:
                            # Many Han/Hangul characters are common, shouldn't use IDF
                            sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
                        if sim > max_sim:
                            max_sim = sim
                            poly = idx.get_polygon(i)
                            arg_max = (max_sim, props, poly.context, idx, i)
                if arg_max:
                    ranks.append(arg_max)
        # Best candidate across both indices wins if it clears the threshold
        ranks.sort(key=operator.itemgetter(0), reverse=True)
        if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
            score, props, poly, idx, i = ranks[0]
            # Skip this point when an existing same-named OSM boundary already covers it
            existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
            existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)
            skip_node = False
            for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
                for poly_index, osm_props in enumerate(boundaries):
                    containing_component = None
                    name = osm_props.get('name')
                    # Only exact name matches here since we're comparing OSM to OSM
                    if name and name.lower() != attrs.get('name', '').lower():
                        continue
                    if boundaries is existing_neighborhood_boundaries:
                        containing_component = AddressFormatter.SUBURB
                        skip_node = True
                        break
                    else:
                        containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]
                        containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)
                        # Skip when a same-named boundary is at city level or above
                        if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
                            skip_node = True
                            break
                if skip_node:
                    break
            # Skip this element
            if skip_node:
                continue
            if idx is cth:
                if props['component'] == AddressFormatter.SUBURB:
                    attrs['polygon_type'] = 'neighborhood'
                elif props['component'] == AddressFormatter.CITY_DISTRICT:
                    attrs['polygon_type'] = 'local_admin'
                else:
                    continue
                source = 'osm_cth'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                source = 'osm_quattro'
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'
            containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
            component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
            attrs['component'] = component
            attrs['source'] = source
            # Store the polygon with the OSM point's attributes
            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True
            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

    # Add leftover (unmatched) neighborhood-level polygons under their own names
    for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
        for i in xrange(idx.i):
            props = idx.get_properties(i)
            poly = idx.get_polygon(i)
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is cth:
                component = props['component']
                if component == AddressFormatter.SUBURB:
                    props['polygon_type'] = 'neighborhood'
                elif component == AddressFormatter.CITY_DISTRICT:
                    props['polygon_type'] = 'local_admin'
                else:
                    continue
            elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                component = AddressFormatter.SUBURB
                name = props.get('name')
                if not name:
                    continue
                for pattern, repl in cls.regex_replacements:
                    name = pattern.sub(repl, name)
                props['name'] = name
                # Names matching the city-district pattern get reclassified
                if cls.quattroshapes_city_district_regex.match(name):
                    component = AddressFormatter.CITY_DISTRICT
                props['component'] = component
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue
            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)
    return index
def create_neighborhoods_index(cls, osm_neighborhoods_file):
    '''
    Build the OSM neighborhoods index from the given borders file,
    writing it under the class scratch directory.
    '''
    index_dir = os.path.join(cls.SCRATCH_DIR, 'neighborhoods', 'index')
    ensure_dir(index_dir)
    return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=index_dir)
zip_path = filename + '.zip' zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)]) url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path) download_pre_release_downloads(out_dir) print(six.u('doing {}').format(safe_decode(source))) success = download_and_unzip_file(url, out_dir) if not success: print(six.u('ERR: could not download {}').format(source)) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-o', '--out-dir', required=True, help='Output directory') parser.add_argument('--all', action='store_true', default=False, help='Download all completed OpenAddresses files') args = parser.parse_args() ensure_dir(args.out_dir) if args.all: openaddresses_download_all_files(args.out_dir) else: openaddresses_download_configured_files(args.out_dir)
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
    '''
    Given an OSM file (planet or some other bounds) containing neighborhoods
    as points (some suburbs have boundaries) and their dependencies, create
    an R-tree index for coarse-grained reverse geocoding.

    Builds Quattroshapes and Zetashapes neighborhood indices, trains an IDF
    model over all known neighborhood names, then fuzzily matches each OSM
    neighborhood/suburb point to a candidate polygon, recording the source
    ('osm_zeta'/'osm_quattro') of each match.

    Note: the input file is expected to have been created using osmfilter.
    Use fetch_osm_address_data.sh for planet or copy the admin borders
    commands if using other geometries.
    '''
    index = cls(save_dir=output_dir)
    ensure_dir(scratch_dir)
    logger = logging.getLogger('neighborhoods')
    logger.setLevel(logging.INFO)
    qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
    ensure_dir(qs_scratch_dir)
    logger.info('Creating Quattroshapes neighborhoods')
    qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
    logger.info('Creating Zetashapes neighborhoods')
    zs = cls.create_zetashapes_neighborhoods_index()
    logger.info('Creating IDF index')
    idf = IDFIndex()
    char_scripts = get_chars_by_script()

    # Train IDF on every polygon name from both indices...
    for idx in (zs, qs):
        for i, (props, poly) in enumerate(idx.polygons):
            name = props.get('name')
            if name is not None:
                doc = cls.count_words(name)
                idf.update(doc)
    # ...and on every name-like tag value in the OSM file
    for key, attrs, deps in parse_osm(filename):
        for k, v in attrs.iteritems():
            if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                doc = cls.count_words(v)
                idf.update(doc)

    # Track which polygons have been claimed by an OSM point
    qs.matched = [False] * qs.i
    zs.matched = [False] * zs.i

    logger.info('Matching OSM points to neighborhood polygons')
    # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
    num_polys = 0
    for node_id, attrs, deps in parse_osm(filename):
        try:
            lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
        except ValueError:
            continue
        osm_name = attrs.get('name')
        if not osm_name:
            continue
        is_neighborhood = attrs.get('place') == 'neighbourhood'
        ranks = []
        # Collect all name variants: base name tags plus localized "name:xx" tags
        osm_names = []
        for key in OSM_NAME_TAGS:
            name = attrs.get(key)
            if name:
                osm_names.append(name)
        for name_key in OSM_NAME_TAGS:
            osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))])
        for idx in (zs, qs):
            candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
            if candidates:
                max_sim = 0.0
                arg_max = None
                normalized_qs_names = {}  # per-candidate normalized name cache
                for osm_name in osm_names:
                    contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts for c in safe_decode(osm_name)))
                    for i in candidates:
                        props, poly = idx.polygons[i]
                        name = normalized_qs_names.get(i)
                        if not name:
                            name = props.get('name')
                            if not name:
                                continue
                            for pattern, repl in cls.regex_replacements:
                                name = pattern.sub(repl, name)
                            normalized_qs_names[i] = name
                        # Neighborhood points should only match neighborhood-level QS polygons
                        if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                            continue
                        if not contains_ideographs:
                            sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                        else:
                            # Many Han/Hangul characters are common, shouldn't use IDF
                            sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
                        if sim > max_sim:
                            max_sim = sim
                            arg_max = (max_sim, props, poly.context, idx, i)
                if arg_max:
                    ranks.append(arg_max)
        # Best candidate across both indices wins if it clears the threshold
        ranks.sort(key=operator.itemgetter(0), reverse=True)
        if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
            score, props, poly, idx, i = ranks[0]
            if idx is zs:
                attrs['polygon_type'] = 'neighborhood'
                source = 'osm_zeta'
            else:
                level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                source = 'osm_quattro'
                if level == 'neighborhood':
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    attrs['polygon_type'] = 'local_admin'
            attrs['source'] = source
            # Store the polygon with the OSM point's attributes
            index.index_polygon(poly)
            index.add_polygon(poly, attrs)
            idx.matched[i] = True
            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

    # Add leftover (unmatched) neighborhood-level polygons under their own names
    for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
        for i, (props, poly) in enumerate(idx.polygons):
            if idx.matched[i]:
                continue
            props['source'] = source
            if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                props['polygon_type'] = 'neighborhood'
            else:
                # We don't actually care about local admin polygons unless they match OSM
                continue
            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)
    return index
url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path) download_pre_release_downloads(out_dir) print(six.u('doing {}').format(safe_decode(source))) success = download_and_unzip_file(url, out_dir) if not success: print(six.u('ERR: could not download {}').format(source)) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-o', '--out-dir', required=True, help='Output directory') parser.add_argument('--all', action='store_true', default=False, help='Download all completed OpenAddresses files') args = parser.parse_args() ensure_dir(args.out_dir) if args.all: openaddresses_download_all_files(args.out_dir) else: openaddresses_download_configured_files(args.out_dir)