def extract_geonames(data_path, dump_path):
    """Join the sorted GeoNames tables into place records and write them out.

    Rows read from ``allCountries.sorted.txt`` are walked in step with rows
    read from ``alternateNames.sorted.txt``.  Every feature whose latitude
    and longitude parse as floats is written to the dump as one place record
    keyed by its ``http://geonames.org/<geoname_id>`` URI; features with
    unparseable coordinates are skipped.

    Args:
        data_path: Directory containing the two ``*.sorted.txt`` inputs.
        dump_path: Directory used to build the
            ``geonames/geonames.%04d.json.gz`` pattern handed to ``Dump``.

    NOTE(review): the join assumes both inputs are ordered by
    ``geoname_id`` in the same way, as the file names suggest -- confirm
    against whatever step produces the sorted files.
    """
    alt_names = tab_file(data_path + "/alternateNames.sorted.txt",
                         columns["alternate"])
    geonames = tab_file(data_path + "/allCountries.sorted.txt",
                        columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")

    # Flag column -> emitted name type.  Checked in order and a later match
    # overrides an earlier one, so "short" wins over "preferred", and so on.
    name_flags = (
        ("is_colloquial", "colloquial"),
        ("is_historic", "historic"),
        ("is_preferred", "preferred"),
        ("is_short", "short"),
    )

    try:
        # Alternate-name row read past the end of the previous feature's run.
        pending = {}
        for geoname in geonames:
            geoname_id = geoname["geoname_id"]

            alt_name_list = []
            if pending.get("geoname_id") == geoname_id:
                alt_name_list.append(pending)
                pending = {}
            # NOTE(review): if the held-over row belongs to some other
            # feature, the first row pulled below replaces it and the
            # held-over row is dropped.  Confirm whether the inputs can
            # contain features without alternate names ahead of it.
            for row in alt_names:
                if row["geoname_id"] == geoname_id:
                    alt_name_list.append(row)
                else:
                    pending = row
                    break
            geoname["alternate_names"] = alt_name_list

            try:
                for col in ("latitude", "longitude"):
                    geoname[col] = float(geoname[col])
            except ValueError:
                # Busted coordinates: nothing useful to emit for this row.
                continue
            centroid = [geoname["longitude"], geoname["latitude"]]

            try:
                population = int(geoname["population"])
            except ValueError:
                population = None

            uri = "http://geonames.org/" + geoname_id

            # The primary name goes through the same normalisation as the
            # alternates.  This is the same list stored on the source row
            # above, so the source record also carries this entry.
            alt_name_list.append({
                "name": geoname["name"],
                "lang": "",
                "type": "preferred",
            })

            names = []
            for entry in alt_name_list:
                # Start from any type the entry already declares so the
                # primary name keeps "preferred"; it has no is_* flags and
                # would otherwise be emitted with an empty type.
                name_type = entry.get("type") or ""
                for flag, label in name_flags:
                    if entry.get(flag):
                        name_type = label
                record = {
                    "lang": entry["lang"],
                    "type": name_type,
                    "name": entry["name"],
                }
                names.append(record)
                ascii_name = transliterate(record)
                if ascii_name:
                    names.append(ascii_name)

            place = {
                "name": geoname["name"],
                "centroid": centroid,
                "feature_code": geoname["feature_code"],
                "geometry": {"type": "Point", "coordinates": centroid},
                "is_primary": True,
                "source": geoname,
                "alternate": names,
                "updated": geoname["changed_at"],
                "population": population,
                "uris": [uri],
                "relationships": [],
                "timeframe": {},
                "admin": [],
            }
            dump.write(uri, place)
    finally:
        # Close the dump even when a row blows up part-way through.
        dump.close()
def extract_lc_auth(data_path, dump_path):
    """Convert line-delimited JSON authority records into place records.

    Each line of the input file is parsed as one JSON object.  Records
    without an ``authoritativeLabel`` are skipped; every other record is
    written to the dump keyed by its ``id``.

    Args:
        data_path: Path of the input file (one JSON document per line).
        dump_path: Directory used to build the
            ``lc_auth/lc_auth.%04d.json.gz`` pattern handed to ``Dump``.

    Raises:
        KeyError: If a record has no ``id``.
    """
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    try:
        # ``with open`` closes the input handle; the old ``file(...)`` call
        # left it open and does not exist on Python 3.
        with open(data_path) as records:
            for line in records:
                auth = json.loads(line)

                # Drop fields that are not needed before going further.
                for unwanted in ("isMemberOfMADSScheme", "adminMetadata"):
                    if unwanted in auth:
                        del auth[unwanted]

                alt_names = []
                for label in get_labels(auth.get("hasVariant")):
                    alt_names.append({"name": label})
                for label in get_labels(auth.get("hasEarlierEstablishedForm")):
                    alt_names.append({"name": label, "type": "historical"})

                geom = fcode = None
                has_source = auth.get("hasSource")
                if has_source:
                    note = has_source.get("citation-note")
                    if note:
                        lat = coords_to_dms(ns_coords, note)
                        lon = coords_to_dms(ew_coords, note)
                        # NOTE(review): a truthiness test also rejects a
                        # coordinate of exactly 0 -- confirm what
                        # coords_to_dms returns when nothing is found.
                        if lat and lon:
                            geom = {"type": "Point", "coordinates": [lon, lat]}
                        # Search the citation note for a feature code first.
                        fcode = extract_fcode(note)
                    if not fcode:
                        source = has_source.get("citation-source", "")
                        fcode = extract_fcode(source)
                if not fcode:
                    fcode = "AUTH"

                uri = auth["id"]
                if "authoritativeLabel" not in auth:
                    continue

                updated = datetime.datetime.utcnow().replace(
                    second=0, microsecond=0).isoformat()

                # Built fresh for every record so the default list is never
                # shared between records.
                auth_source = {
                    "type": [],
                    "id": "",
                    "authoritativeLabel": "",
                    "note": "",
                    "editorialNote": "",
                }
                for key in list(auth_source):
                    if key in auth:
                        auth_source[key] = auth[key]

                place = {
                    "name": auth["authoritativeLabel"],
                    "feature_code": fcode,
                    "alternate": alt_names,
                    "is_primary": True,
                    "updated": updated,
                    "source": auth_source,
                    "uris": [uri],
                    "relationships": [],
                    "timeframe": {},
                    "admin": [],
                }
                if geom:
                    place["geometry"] = geom
                    place["centroid"] = geom["coordinates"]
                else:
                    place["geometry"] = {}
                    place["centroid"] = []
                dump.write(uri, place)
    finally:
        # Close the dump even when a record fails to parse or convert.
        dump.close()
def extract_geonames(data_path, dump_path):
    """Join the sorted GeoNames tables into place records and write them out.

    Rows read from ``allCountries.sorted.txt`` are walked in step with rows
    read from ``alternateNames.sorted.txt``.  Every feature whose latitude
    and longitude parse as floats is written to the dump as one place record
    keyed by its ``http://geonames.org/<geoname_id>`` URI; features with
    unparseable coordinates are skipped.

    Args:
        data_path: Directory containing the two ``*.sorted.txt`` inputs.
        dump_path: Directory used to build the
            ``geonames/geonames.%04d.json.gz`` pattern handed to ``Dump``.

    NOTE(review): the join assumes both inputs are ordered by
    ``geoname_id`` in the same way, as the file names suggest -- confirm
    against whatever step produces the sorted files.
    """
    alt_names = tab_file(data_path + "/alternateNames.sorted.txt",
                         columns["alternate"])
    geonames = tab_file(data_path + "/allCountries.sorted.txt",
                        columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")

    # Flag column -> emitted name type.  Checked in order and a later match
    # overrides an earlier one, so "short" wins over "preferred", and so on.
    name_flags = (
        ("is_colloquial", "colloquial"),
        ("is_historic", "historic"),
        ("is_preferred", "preferred"),
        ("is_short", "short"),
    )

    try:
        # Alternate-name row read past the end of the previous feature's run.
        pending = {}
        for geoname in geonames:
            geoname_id = geoname["geoname_id"]

            alt_name_list = []
            if pending.get("geoname_id") == geoname_id:
                alt_name_list.append(pending)
                pending = {}
            # NOTE(review): if the held-over row belongs to some other
            # feature, the first row pulled below replaces it and the
            # held-over row is dropped.  Confirm whether the inputs can
            # contain features without alternate names ahead of it.
            for row in alt_names:
                if row["geoname_id"] == geoname_id:
                    alt_name_list.append(row)
                else:
                    pending = row
                    break
            geoname["alternate_names"] = alt_name_list

            try:
                for col in ("latitude", "longitude"):
                    geoname[col] = float(geoname[col])
            except ValueError:
                # Busted coordinates: nothing useful to emit for this row.
                continue
            centroid = [geoname["longitude"], geoname["latitude"]]

            try:
                population = int(geoname["population"])
            except ValueError:
                population = None

            uri = "http://geonames.org/" + geoname_id

            # The primary name goes through the same normalisation as the
            # alternates.  This is the same list stored on the source row
            # above, so the source record also carries this entry.
            alt_name_list.append({
                "name": geoname["name"],
                "lang": "",
                "type": "preferred",
            })

            names = []
            for entry in alt_name_list:
                # Start from any type the entry already declares so the
                # primary name keeps "preferred"; it has no is_* flags and
                # would otherwise be emitted with an empty type.
                name_type = entry.get("type") or ""
                for flag, label in name_flags:
                    if entry.get(flag):
                        name_type = label
                record = {
                    "lang": entry["lang"],
                    "type": name_type,
                    "name": entry["name"],
                }
                names.append(record)
                ascii_name = transliterate(record)
                if ascii_name:
                    names.append(ascii_name)

            place = {
                "name": geoname["name"],
                "centroid": centroid,
                "feature_code": geoname["feature_code"],
                "geometry": {"type": "Point", "coordinates": centroid},
                "is_primary": True,
                "source": geoname,
                "alternate": names,
                "updated": geoname["changed_at"],
                "population": population,
                "uris": [uri],
                "relationships": [],
                "timeframe": {},
                "admin": [],
            }
            dump.write(uri, place)
    finally:
        # Close the dump even when a row blows up part-way through.
        dump.close()
def extract_lc_auth(data_path, dump_path):
    """Convert line-delimited JSON authority records into place records.

    Each line of the input file is parsed as one JSON object.  Records
    without an ``authoritativeLabel`` are skipped; every other record is
    written to the dump keyed by its ``id``.

    Args:
        data_path: Path of the input file (one JSON document per line).
        dump_path: Directory used to build the
            ``lc_auth/lc_auth.%04d.json.gz`` pattern handed to ``Dump``.

    Raises:
        KeyError: If a record has no ``id``.
    """
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    try:
        # ``with open`` closes the input handle; the old ``file(...)`` call
        # left it open and does not exist on Python 3.
        with open(data_path) as records:
            for line in records:
                auth = json.loads(line)

                # Drop fields that are not needed before going further.
                for unwanted in ("isMemberOfMADSScheme", "adminMetadata"):
                    if unwanted in auth:
                        del auth[unwanted]

                alt_names = []
                for label in get_labels(auth.get("hasVariant")):
                    alt_names.append({"name": label})
                for label in get_labels(auth.get("hasEarlierEstablishedForm")):
                    alt_names.append({"name": label, "type": "historical"})

                geom = fcode = None
                has_source = auth.get("hasSource")
                if has_source:
                    note = has_source.get("citation-note")
                    if note:
                        lat = coords_to_dms(ns_coords, note)
                        lon = coords_to_dms(ew_coords, note)
                        # NOTE(review): a truthiness test also rejects a
                        # coordinate of exactly 0 -- confirm what
                        # coords_to_dms returns when nothing is found.
                        if lat and lon:
                            geom = {"type": "Point", "coordinates": [lon, lat]}
                        # Search the citation note for a feature code first.
                        fcode = extract_fcode(note)
                    if not fcode:
                        source = has_source.get("citation-source", "")
                        fcode = extract_fcode(source)
                if not fcode:
                    fcode = "AUTH"

                uri = auth["id"]
                if "authoritativeLabel" not in auth:
                    continue

                updated = datetime.datetime.utcnow().replace(
                    second=0, microsecond=0).isoformat()

                # Built fresh for every record so the default list is never
                # shared between records.
                auth_source = {
                    "type": [],
                    "id": "",
                    "authoritativeLabel": "",
                    "note": "",
                    "editorialNote": "",
                }
                for key in list(auth_source):
                    if key in auth:
                        auth_source[key] = auth[key]

                place = {
                    "name": auth["authoritativeLabel"],
                    "feature_code": fcode,
                    "alternate": alt_names,
                    "is_primary": True,
                    "updated": updated,
                    "source": auth_source,
                    "uris": [uri],
                    "relationships": [],
                    "timeframe": {},
                    "admin": [],
                }
                if geom:
                    place["geometry"] = geom
                    place["centroid"] = geom["coordinates"]
                else:
                    place["geometry"] = {}
                    place["centroid"] = []
                dump.write(uri, place)
    finally:
        # Close the dump even when a record fails to parse or convert.
        dump.close()