예제 #1
0
def extract_geonames(data_path, dump_path):
    """Convert the sorted GeoNames tables into gazetteer place records.

    Rows from ``allCountries.sorted.txt`` are joined with the rows of
    ``alternateNames.sorted.txt`` that share their ``geoname_id``; one place
    dict per geoname is written to a ``Dump`` under
    ``dump_path + "/geonames/"``, keyed by
    ``http://geonames.org/<geoname_id>``. Geonames whose latitude or
    longitude is not a valid float are skipped.

    data_path -- directory holding the two ``*.sorted.txt`` files.
    dump_path -- directory prefix handed to ``Dump``.

    NOTE(review): the join is a single forward pass, so it assumes both
    files list geoname ids in the same order and that every alternate-name
    row refers to a geoname present in ``allCountries.sorted.txt`` --
    confirm against how the sorted files are produced.
    """
    # iter() makes explicit that the alternate-name rows are consumed once,
    # each geoname resuming where the previous one stopped.
    alt_names = iter(tab_file(data_path + "/alternateNames.sorted.txt",
                              columns["alternate"]))
    geonames = tab_file(data_path + "/allCountries.sorted.txt",
                        columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")
    try:
        # Alternate-name row that was read ahead and belongs to a geoname
        # that has not been reached yet.
        pending_alt_name = None
        for geoname in geonames:
            geoname_id = geoname["geoname_id"]
            alt_name_list = []
            if (pending_alt_name is not None
                    and pending_alt_name["geoname_id"] == geoname_id):
                alt_name_list.append(pending_alt_name)
                pending_alt_name = None
            # Only read further while nothing is held back: a geoname
            # without alternate names must not consume (and thereby drop)
            # the row that is waiting for a later geoname.
            if pending_alt_name is None:
                for alt_name in alt_names:
                    if alt_name["geoname_id"] == geoname_id:
                        alt_name_list.append(alt_name)
                    else:
                        pending_alt_name = alt_name
                        break
            geoname["alternate_names"] = alt_name_list

            try:
                for col in ("latitude", "longitude"):
                    geoname[col] = float(geoname[col])
            except ValueError:
                # Busted coordinates: skip the record entirely.
                continue
            centroid = [geoname["longitude"], geoname["latitude"]]

            try:
                population = int(geoname["population"])
            except ValueError:
                population = None

            uri = "http://geonames.org/" + geoname_id

            # The primary name is treated as one more alternate name. The
            # list is the same object stored in geoname["alternate_names"],
            # so the entry also shows up in the "source" record (unchanged
            # from the previous behaviour).
            alt_name_list.append({
                "name": geoname["name"],
                "lang": "",
                "type": "preferred"
            })

            names = []
            for alt_name in alt_name_list:
                # Start from an explicit "type" so the primary-name entry
                # keeps its "preferred" marker; rows from the alternate
                # names file are classified by their is_* flags, the last
                # flag that is set winning.
                name_type = alt_name.get("type", "")
                if alt_name.get("is_colloquial"):
                    name_type = "colloquial"
                if alt_name.get("is_historic"):
                    name_type = "historic"
                if alt_name.get("is_preferred"):
                    name_type = "preferred"
                if alt_name.get("is_short"):
                    name_type = "short"
                name_entry = {
                    "lang": alt_name["lang"],
                    "type": name_type,
                    "name": alt_name["name"]
                }
                names.append(name_entry)
                # transliterate() may return a falsy value, in which case
                # no extra entry is added.
                ascii_name = transliterate(name_entry)
                if ascii_name:
                    names.append(ascii_name)

            place = {
                "name": geoname["name"],
                "centroid": centroid,
                "feature_code": geoname["feature_code"],
                "geometry": {"type": "Point", "coordinates": centroid},
                "is_primary": True,
                "source": geoname,
                "alternate": names,
                "updated": geoname["changed_at"],
                "population": population,
                "uris": [uri],
                "relationships": [],
                "timeframe": {},
                "admin": []
            }
            dump.write(uri, place)
    finally:
        # Close the dump even when a record raises, so it is not left open.
        dump.close()
예제 #2
0
파일: lc_auth.py 프로젝트: NYPL/gazetteer
def extract_lc_auth(data_path, dump_path):
    """Convert a file of JSON authority records into gazetteer place records.

    Each line of ``data_path`` is parsed as one JSON object. Records without
    an ``authoritativeLabel`` are skipped; every other record is written to a
    ``Dump`` under ``dump_path + "/lc_auth/"``, keyed by the record's ``id``.

    data_path -- path of the line-delimited JSON input file.
    dump_path -- directory prefix handed to ``Dump``.

    Raises KeyError if a record has no ``id``.
    """
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    try:
        # open() replaces the Python-2-only file() builtin, and the with
        # block closes the input even when a record fails to parse.
        with open(data_path) as records:
            for line in records:
                auth = json.loads(line)
                auth.pop("isMemberOfMADSScheme", None)
                auth.pop("adminMetadata", None)

                alt_names = [
                    {"name": label}
                    for label in get_labels(auth.get("hasVariant"))
                ]
                alt_names.extend(
                    {"name": label, "type": "historical"}
                    for label in get_labels(
                        auth.get("hasEarlierEstablishedForm"))
                )

                geom = fcode = None
                has_source = auth.get("hasSource")
                if has_source:
                    note = has_source.get("citation-note")
                    if note:
                        lat = coords_to_dms(ns_coords, note)
                        lon = coords_to_dms(ew_coords, note)
                        # NOTE(review): a coordinate of exactly 0 is falsy
                        # and would drop the geometry -- confirm what
                        # coords_to_dms returns when nothing is found.
                        if lat and lon:
                            geom = {"type": "Point",
                                    "coordinates": [lon, lat]}
                        # Search the citation-note for a feature code first.
                        fcode = extract_fcode(note)
                    if not fcode:
                        source = has_source.get("citation-source", "")
                        fcode = extract_fcode(source)
                if not fcode:
                    fcode = "AUTH"

                uri = auth["id"]
                if "authoritativeLabel" not in auth:
                    continue

                # Current UTC time truncated to the minute.
                updated = datetime.datetime.utcnow().replace(
                    second=0, microsecond=0).isoformat()

                # Keep only a fixed subset of the record as "source",
                # with empty defaults for keys the record lacks.
                auth_source = {
                    "type": [],
                    "id": "",
                    "authoritativeLabel": "",
                    "note": "",
                    "editorialNote": ""
                }
                for key in auth_source:
                    # "in" replaces dict.has_key(), removed in Python 3.
                    if key in auth:
                        auth_source[key] = auth[key]

                place = {
                    "name": auth["authoritativeLabel"],
                    "feature_code": fcode,
                    "alternate": alt_names,
                    "is_primary": True,
                    "updated": updated,
                    "source": auth_source,
                    "uris": [uri],
                    "relationships": [],
                    "timeframe": {},
                    "admin": [],
                }
                if geom:
                    place["geometry"] = geom
                    place["centroid"] = geom["coordinates"]
                else:
                    place["geometry"] = {}
                    place["centroid"] = []
                dump.write(uri, place)
    finally:
        # Close the dump even when a record raises, so it is not left open.
        dump.close()
예제 #3
0
def extract_geonames(data_path, dump_path):
    """Convert the sorted GeoNames tables into gazetteer place records.

    Rows from ``allCountries.sorted.txt`` are joined with the rows of
    ``alternateNames.sorted.txt`` that share their ``geoname_id``; one place
    dict per geoname is written to a ``Dump`` under
    ``dump_path + "/geonames/"``, keyed by
    ``http://geonames.org/<geoname_id>``. Geonames whose latitude or
    longitude is not a valid float are skipped.

    data_path -- directory holding the two ``*.sorted.txt`` files.
    dump_path -- directory prefix handed to ``Dump``.

    NOTE(review): the join is a single forward pass, so it assumes both
    files list geoname ids in the same order and that every alternate-name
    row refers to a geoname present in ``allCountries.sorted.txt`` --
    confirm against how the sorted files are produced.
    """
    # iter() makes explicit that the alternate-name rows are consumed once,
    # each geoname resuming where the previous one stopped.
    alt_names = iter(tab_file(data_path + "/alternateNames.sorted.txt",
                              columns["alternate"]))
    geonames = tab_file(data_path + "/allCountries.sorted.txt",
                        columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")
    try:
        # Alternate-name row that was read ahead and belongs to a geoname
        # that has not been reached yet.
        pending_alt_name = None
        for geoname in geonames:
            geoname_id = geoname["geoname_id"]
            alt_name_list = []
            if (pending_alt_name is not None
                    and pending_alt_name["geoname_id"] == geoname_id):
                alt_name_list.append(pending_alt_name)
                pending_alt_name = None
            # Only read further while nothing is held back: a geoname
            # without alternate names must not consume (and thereby drop)
            # the row that is waiting for a later geoname.
            if pending_alt_name is None:
                for alt_name in alt_names:
                    if alt_name["geoname_id"] == geoname_id:
                        alt_name_list.append(alt_name)
                    else:
                        pending_alt_name = alt_name
                        break
            geoname["alternate_names"] = alt_name_list

            try:
                for col in ("latitude", "longitude"):
                    geoname[col] = float(geoname[col])
            except ValueError:
                # Busted coordinates: skip the record entirely.
                continue
            centroid = [geoname["longitude"], geoname["latitude"]]

            try:
                population = int(geoname["population"])
            except ValueError:
                population = None

            uri = "http://geonames.org/" + geoname_id

            # The primary name is treated as one more alternate name. The
            # list is the same object stored in geoname["alternate_names"],
            # so the entry also shows up in the "source" record (unchanged
            # from the previous behaviour).
            alt_name_list.append({
                "name": geoname["name"],
                "lang": "",
                "type": "preferred"
            })

            names = []
            for alt_name in alt_name_list:
                # Start from an explicit "type" so the primary-name entry
                # keeps its "preferred" marker; rows from the alternate
                # names file are classified by their is_* flags, the last
                # flag that is set winning.
                name_type = alt_name.get("type", "")
                if alt_name.get("is_colloquial"):
                    name_type = "colloquial"
                if alt_name.get("is_historic"):
                    name_type = "historic"
                if alt_name.get("is_preferred"):
                    name_type = "preferred"
                if alt_name.get("is_short"):
                    name_type = "short"
                name_entry = {
                    "lang": alt_name["lang"],
                    "type": name_type,
                    "name": alt_name["name"]
                }
                names.append(name_entry)
                # transliterate() may return a falsy value, in which case
                # no extra entry is added.
                ascii_name = transliterate(name_entry)
                if ascii_name:
                    names.append(ascii_name)

            place = {
                "name": geoname["name"],
                "centroid": centroid,
                "feature_code": geoname["feature_code"],
                "geometry": {
                    "type": "Point",
                    "coordinates": centroid
                },
                "is_primary": True,
                "source": geoname,
                "alternate": names,
                "updated": geoname["changed_at"],
                "population": population,
                "uris": [uri],
                "relationships": [],
                "timeframe": {},
                "admin": []
            }
            dump.write(uri, place)
    finally:
        # Close the dump even when a record raises, so it is not left open.
        dump.close()
예제 #4
0
def extract_lc_auth(data_path, dump_path):
    """Convert a file of JSON authority records into gazetteer place records.

    Each line of ``data_path`` is parsed as one JSON object. Records without
    an ``authoritativeLabel`` are skipped; every other record is written to a
    ``Dump`` under ``dump_path + "/lc_auth/"``, keyed by the record's ``id``.

    data_path -- path of the line-delimited JSON input file.
    dump_path -- directory prefix handed to ``Dump``.

    Raises KeyError if a record has no ``id``.
    """
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    try:
        # open() replaces the Python-2-only file() builtin, and the with
        # block closes the input even when a record fails to parse.
        with open(data_path) as records:
            for line in records:
                auth = json.loads(line)
                auth.pop("isMemberOfMADSScheme", None)
                auth.pop("adminMetadata", None)

                alt_names = [
                    {"name": label}
                    for label in get_labels(auth.get("hasVariant"))
                ]
                alt_names.extend(
                    {"name": label, "type": "historical"}
                    for label in get_labels(
                        auth.get("hasEarlierEstablishedForm"))
                )

                geom = fcode = None
                has_source = auth.get("hasSource")
                if has_source:
                    note = has_source.get("citation-note")
                    if note:
                        lat = coords_to_dms(ns_coords, note)
                        lon = coords_to_dms(ew_coords, note)
                        # NOTE(review): a coordinate of exactly 0 is falsy
                        # and would drop the geometry -- confirm what
                        # coords_to_dms returns when nothing is found.
                        if lat and lon:
                            geom = {"type": "Point",
                                    "coordinates": [lon, lat]}
                        # Search the citation-note for a feature code first.
                        fcode = extract_fcode(note)
                    if not fcode:
                        source = has_source.get("citation-source", "")
                        fcode = extract_fcode(source)
                if not fcode:
                    fcode = "AUTH"

                uri = auth["id"]
                if "authoritativeLabel" not in auth:
                    continue

                # Current UTC time truncated to the minute.
                updated = datetime.datetime.utcnow().replace(
                    second=0, microsecond=0).isoformat()

                # Keep only a fixed subset of the record as "source",
                # with empty defaults for keys the record lacks.
                auth_source = {
                    "type": [],
                    "id": "",
                    "authoritativeLabel": "",
                    "note": "",
                    "editorialNote": ""
                }
                for key in auth_source:
                    # "in" replaces dict.has_key(), removed in Python 3.
                    if key in auth:
                        auth_source[key] = auth[key]

                place = {
                    "name": auth["authoritativeLabel"],
                    "feature_code": fcode,
                    "alternate": alt_names,
                    "is_primary": True,
                    "updated": updated,
                    "source": auth_source,
                    "uris": [uri],
                    "relationships": [],
                    "timeframe": {},
                    "admin": []
                }
                if geom:
                    place["geometry"] = geom
                    place["centroid"] = geom["coordinates"]
                else:
                    place["geometry"] = {}
                    place["centroid"] = []
                dump.write(uri, place)
    finally:
        # Close the dump even when a record raises, so it is not left open.
        dump.close()