def process(file, datasetName, outputDir, rawData=None, removeInvalidProperties=False):
    rawData = rawData if rawData is not None else utils.getEncodedFileContent(file)[0]
    kml2geojson.main.GEOTYPES = ["Polygon", "LineString", "Point"]
    try:
        data = rawData
        # strip attributes from the Document tag, which can break the XML parser
        if removeInvalidProperties:
            data = re.compile("<Documen[^>]*>").sub("<Document>", data)
        geojson = kml2geojson.main.build_feature_collection(md.parseString(data))
        for element in geojson["features"]:
            # extract data from HTML tables nested inside the feature properties
            properties = {}
            for prop in element["properties"]:
                value = str(element["properties"][prop])
                if "<table" in value:
                    page = htmlParser.document_fromstring(value)
                    for row in page.xpath("body/table")[0].findall("tr"):
                        childs = row.findall("td")
                        if len(childs) == 2:
                            variableName = utils.normalizeText(childs[0].text)
                            properties[utils.normalizeText(variableName)] = utils.getValidTextValue(childs[1].text)
                elif prop != "styleUrl":
                    properties[utils.normalizeText(prop)] = utils.getValidTextValue(value)
            element["properties"] = properties
        sql = geojsonHandler.writeSQLScript(file, datasetName, outputDir, geojsonObject=geojson)
    except xml.parsers.expat.ExpatError as err:
        if removeInvalidProperties:
            print("could not parse the file", Logs.ERROR, {"error": str(err)})
        else:
            # retry once, stripping the Document attributes that broke the parser
            process(file, datasetName, outputDir, rawData=rawData, removeInvalidProperties=True)
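# A minimal, hedged sketch of the KML -> GeoJSON step used by process() above.
# It assumes the same kml2geojson version whose main module exposes GEOTYPES and
# build_feature_collection(); the sample KML string is illustrative only.
import xml.dom.minidom as md
import kml2geojson.main

sample_kml = """<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
  <Document>
    <Placemark>
      <name>ejemplo</name>
      <Point><coordinates>-99.13,19.43,0</coordinates></Point>
    </Placemark>
  </Document>
</kml>"""

kml2geojson.main.GEOTYPES = ["Polygon", "LineString", "Point"]
feature_collection = kml2geojson.main.build_feature_collection(md.parseString(sample_kml))
# each feature carries the Placemark data as "properties" and a GeoJSON "geometry"
print(feature_collection["features"][0]["geometry"]["type"])  # -> "Point"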
def extract_deaths(self, html):
    text = html.find("b", string="Character Deaths")
    if text:
        result = []
        rows = text.find_all_next("tr")
        for item in rows:
            # stop once the next page section starts
            if item.text == "Search Character" or item.text == "Account Information":
                break
            timestamp = normalizeText(item.select_one("td:nth-of-type(1)").text.strip())
            description = normalizeText(item.select_one("td:nth-of-type(2)").text.strip())
            result.append({
                "timestamp": timestamp,
                "description": description
            })
        return result
def export(self):
    while len(self.fileList) > 0:
        file = self.fileList.pop()
        self.counter["total"] += 1
        print("--------------------------------------------------------------------")
        print("processing " + file["path"], Logs.INFO)
        if file["type"] not in acceptedFiles and file["type"] not in zippedFiles:
            print("unsupported file", Logs.ERROR)
            self.counter["archivo_no_soportado"] += 1
            continue
        if self.arguments["forceUpdates"] or self.arguments["skipUpdates"] or not geoserver.featureExists(file["tableName"]):
            # the layer is not in geoserver
            metadata = file["metadata"]
            if self.arguments["forceUpdates"] or self.arguments["skipUpdates"] or self.arguments["featureName"] or metadata.readFeatureData():
                if file["type"] in zippedFiles:
                    # is a zipped directory: extract it and queue the contained files
                    extractedFiles = zip.extract(file["path"], file["ckanID"])
                    if extractedFiles is not None:
                        namePreffix = "" if len(extractedFiles) == 1 else "{}__".format(file["tableName"])
                        for extractedFile in extractedFiles:
                            configObject = self.buildConfigObject(extractedFile, metadata=metadata)
                            name = utils.normalizeText(configObject["name"])
                            name = name[len("conjunto_de_datos_"):] if "conjunto_de_datos_" in name else name
                            configObject["tableName"] = namePreffix + name
                            configObject["extracted"] = True
                            self.fileList.append(configObject)
                else:
                    # process file
                    self.processFile(file)
            else:
                print("no metadata found in CKAN/DGM", Logs.ERROR, {"metadata": str(metadata)})
                self.counter["metadatos_error"] += 1
        else:
            print("the layer already exists in geoserver", Logs.ERROR)
            self.counter["procesados_anteriormente"] += 1
    # print counters
    print("--------------------------------------------------------------------")
    print("counters", Logs.INFO, self.counter)
def buildConfigObject(self, path, additionalValues=None, metadata=None):
    # avoid a shared mutable default argument
    additionalValues = additionalValues if additionalValues is not None else {}
    config = {}
    fileName, fileType = utils.getFileNameAndType(path)
    config["path"] = path
    config["name"] = fileName
    config["ckanID"] = fileName
    config["type"] = fileType
    config["tableName"] = utils.normalizeText(config["ckanID"])
    config["metadata"] = DGMMetadata(config["ckanID"], self.arguments) if metadata is None else metadata
    config["extracted"] = False
    for key in additionalValues:
        config[key] = additionalValues[key]
    return config
def extract_guild_membership(self, html):
    result = html.find("td", string=re.compile(GUILD_MEMBERSHIP_REGEX))
    return normalizeText(self._getInformation(result))
def extract_last_login(self, html):
    result = html.find("td", string="Last Login:")
    return normalizeText(self._getInformation(result))
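# Hedged sketch of the lookup pattern used by the extractors above: they appear to
# receive a BeautifulSoup tree and locate a label cell with find(..., string=...).
# The HTML fragment and the read of the following <td> are illustrative only;
# normalizeText and _getInformation belong to the surrounding module/class.
from bs4 import BeautifulSoup

fragment = "<table><tr><td>Last Login:</td><td>Jan 05 2020, 10:00:00 CET</td></tr></table>"
soup = BeautifulSoup(fragment, "html.parser")
label_cell = soup.find("td", string="Last Login:")
value = label_cell.find_next("td").text  # the sibling cell holding the value
print(value)  # -> "Jan 05 2020, 10:00:00 CET"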
def writeSQLScript(file, datasetName, outputDir, geojsonObject=None, srid=GEOMETRY_COLUMN_SRID):
    scriptFileName = "{outputDir}/{script}.sql".format(outputDir=outputDir, script=datasetName)
    data = geojsonObject if geojsonObject is not None else json.loads(utils.getEncodedFileContent(file)[0])
    print("writing script to " + scriptFileName, Logs.INFO)
    createTableSQL = ""
    geometrySQL = []
    insertSQL = []
    # table columns
    columns = []
    validColumns = []
    columnsType = {}
    columnString = ""
    textColumns = ["cve_ent", "cve_mun", "cve_loc", "cvegeo"]  # numeric columns that should be treated as text (catalog keys)
    for column in data["features"][0]["properties"]:
        validColumnName = utils.normalizeText(column)
        columns.append(column)
        validColumns.append(validColumnName)
        columnsType[validColumnName] = "text" if validColumnName in textColumns else utils.getObjType(data["features"][0]["properties"][column])
        columnString += validColumnName + ","
    sqlColumns = "gid serial PRIMARY KEY"
    for header in validColumns:
        sqlColumns += ",{column} {column_type}".format(column=header, column_type=columnsType[header])
    createTableSQL = sqlCreateTable.format(datasetName=datasetName, columns=sqlColumns)
    geometryColumns = []
    columnsCreated = {}
    counter = 0
    for element in data["features"]:
        geometryType = element["geometry"]["type"]
        properties = element["properties"]
        featuresToProcess = []
        if geometryType == "GeometryCollection":
            for geometry in element["geometry"]["geometries"]:
                featuresToProcess.append({"geometry": geometry})
        else:
            featuresToProcess.append(element)
        for feature in featuresToProcess:
            geometryType = feature["geometry"]["type"]
            # validate geometry coordinates
            if geometryType.lower() == "point":
                feature["geometry"]["coordinates"] = feature["geometry"]["coordinates"][0:2]
            elif geometryType.lower() == "polygon":
                coordinates = [[]]
                for batchCoordinates in feature["geometry"]["coordinates"]:
                    # coordinates depth
                    if isinstance(batchCoordinates[0][0], list):
                        batchCoordinates = batchCoordinates[0]
                    for index in range(0, len(batchCoordinates)):
                        coordinates[0].append(batchCoordinates[index][0:2])
                feature["geometry"]["coordinates"] = coordinates
            elif geometryType.lower() == "linestring":
                for index in range(0, len(feature["geometry"]["coordinates"])):
                    feature["geometry"]["coordinates"][index] = feature["geometry"]["coordinates"][index][0:2]
            if geometryType not in columnsCreated:
                query, columnName = createGeometryColumn(datasetName, geometryType.upper(), "_{}".format(geometryType.lower()), srid=srid)
                geometryColumns.append(columnName)
                columnsCreated[geometryType] = True
                geometrySQL.append(query)
            values = []
            for index in range(0, len(columns)):
                try:
                    value = "" if properties[columns[index]] is None else utils.getValidTextValue(properties[columns[index]])
                except:
                    # missing or unreadable property: fall back to an empty value
                    value = ""
                values.append(utils.getValidSQLValue(value, columnsType[validColumns[index]]))
            values.append("ST_SetSRID(ST_GeomFromGeoJSON('" + json.dumps(feature["geometry"]) + "')," + srid + ")")
            sql = sqlInsert.format(dataset=datasetName, columns=columnString, values=",".join(values), geometry_column=GEOMETRY_COLUMN_NAME, suffix="_" + geometryType.lower())
            insertSQL.append(sql)
    # WRITE SQL FILE
    with open(scriptFileName, "w") as sqlScript:
        # create table
        sqlScript.write(createTableSQL + utils.QUERY_DELIMITER)
        # create geometries
        for query in geometrySQL:
            sqlScript.write(query + utils.QUERY_DELIMITER)
        # insert features
        for query in insertSQL:
            sqlScript.write(query + utils.QUERY_DELIMITER)
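# Hedged sketch of the insert pattern writeSQLScript() builds: the feature geometry is
# embedded as GeoJSON inside ST_SetSRID(ST_GeomFromGeoJSON(...)). The table and column
# names below are illustrative, not the module's sqlInsert/sqlCreateTable templates.
import json

feature = {
    "geometry": {"type": "Point", "coordinates": [-99.13, 19.43]},
    "properties": {"nombre": "ejemplo"},
}
srid = "4326"
geometry_value = ("ST_SetSRID(ST_GeomFromGeoJSON('"
                  + json.dumps(feature["geometry"]) + "')," + srid + ")")
insert_sql = ("INSERT INTO mi_dataset (nombre,geom_point) VALUES ("
              + "'" + feature["properties"]["nombre"] + "'," + geometry_value + ");")
print(insert_sql)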
def __init__(self, tableName):
    self.table = utils.normalizeText(tableName)