def cacheRegionCities2Json(self, limit, showDone=False):
    '''
    cache the cities of wikidata regions as JSON files in the regions cache directory

    Args:
        limit(int): maximum number of regions to process
        showDone(bool): if True also print a progress message for regions that are already cached
    '''
    # TODO - refactor to Locator/LocationContext - make available via command line
    wd = Wikidata()
    config = LocationContext.getDefaultConfig()
    countryManager = CountryManager(config=config)
    countryManager.fromCache()
    regionManager = RegionManager(config=config)
    regionManager.fromCache()
    regions = regionManager.getList()
    regionCount = len(regions)
    cacheDir = f"{config.getCachePath()}/regions"
    if not os.path.exists(cacheDir):
        os.makedirs(cacheDir)
    for i, region in enumerate(regions):
        if i >= limit:
            break
        progressMsg = f"{i+1:4d}/{regionCount:4d}:getting cities for {region.name} {region.iso} {region.wikidataid}"
        targetPath = f"{cacheDir}/{region.iso}.json"
        if os.path.isfile(targetPath):
            # already cached - optionally report progress
            if showDone:
                print(progressMsg)
        else:
            try:
                regionCities = wd.getCitiesForRegion(region.wikidataid, progressMsg)
                with open(targetPath, "w") as jsonFile:
                    jsonFile.write(json.dumps(regionCities))
            except Exception as ex:
                self.handleWikidataException(ex)
def testWikidataRegions(self):
    '''
    test getting region information from wikidata
    '''
    wikidata = Wikidata()
    try:
        regions = wikidata.getRegions()
        # expect a substantial number of first-level administrative regions
        self.assertTrue(len(regions) >= 3000)
    except Exception as ex:
        self.handleWikidataException(ex)
def testWikidataCityStates(self):
    '''
    test getting city state information from wikidata
    '''
    wikidata = Wikidata()
    try:
        cityStates = wikidata.getCityStates()
        self.assertTrue(len(cityStates) >= 2)
        # Singapore is the canonical example of a city state
        names = [record.get('name') for record in cityStates]
        self.assertTrue("Singapore" in names)
    except Exception as ex:
        self.handleWikidataException(ex)
def testWikidataCities(self):
    '''
    test getting city information from wikidata
    '''
    # Wikidata time outs in CI environment need to be avoided
    if getpass.getuser() != "wf":
        return
    config = StorageConfig.getSQL(debug=self.debug)
    config.cacheRootDir = "/tmp/wdhs"
    cachedir = config.getCachePath()
    config.cacheFile = f"{cachedir}/hs.db"
    wikidata = Wikidata()
    # query a wikidata copy instead of the public endpoint to avoid timeouts
    wikidata.endpoint = "https://confident.dbis.rwth-aachen.de/jena/wdhs/sparql"
    #wikidata.endpoint="http://jena.bitplan.com/wdhs/sparql"
    # sample regions and the city counts expected for them (kept for reference)
    regions = [{
        "name": "Singapore",
        "country": "Q334",
        "region": None,
        "cities": 46
    }, {
        "name": "Beijing",
        "country": None,
        "region": "Q956",
        "cities": 25
    }, {
        "name": "Paris",
        "country": None,
        "region": "Q13917",
        "cities": 1242
    }, {
        "name": "Barcelona",
        "country": None,
        "region": "Q5705",
        "cities": 1242
    }, {
        "name": "Rome",
        "country": None,
        "region": "Q1282",
        "cities": 1242
    }]
    limit = 1000000  #if self.inCI() else 100
    cityList = wikidata.getCities(limit=limit)
    # persist the retrieved cities to the SQL cache
    sqlDB = SQLDB(config.cacheFile)
    entityInfo = sqlDB.createTable(cityList, "hs", withDrop=True)
    sqlDB.store(cityList, entityInfo, fixNone=True)
    expected = 200000  # if self.inCI() else limit
    self.assertTrue(len(cityList) >= expected)
def testCityFromCityStates(self):
    '''
    tests if city states are queried correctly if given the region

    For city states the city is region and city (in some cases also country).
    This test ensures that by querying for the cities of a region the city
    states include themself in the result (the result for cities in
    city-states often includes the municipalities)
    '''
    wd = Wikidata()
    for record in wd.getCityStates():
        regionId = record.get('wikidataid')
        cities = wd.getCitiesForRegion(
            regionId,
            msg=f"Query for cities in {record.get('name')}")
        # the city state itself must be among the cities of its own region
        cityIds = [city.get('wikidataid') for city in cities]
        self.assertTrue(regionId in cityIds)
def testWikidataCountries(self):
    '''
    test getting country information from wikidata
    '''
    wikidata = Wikidata()
    try:
        countries = wikidata.getCountries()
        self.assertTrue(len(countries) >= 200)
        # every country must expose all attributes of the sample record
        sampleAttrs = Country.getSamples()[0].keys()
        for country in countries:
            if self.debug:
                print(country)
            for attr in sampleAttrs:
                self.assertTrue(hasattr(country, attr))
    except Exception as ex:
        self.handleWikidataException(ex)
def testGetWikidataId(self):
    '''
    test getting a wikiDataId from a given URL
    '''
    # entity URL yields a Q-identifier
    entityUrl = "https://www.wikidata.org/wiki/Q1"
    self.assertEqual(Wikidata.getWikidataId(entityUrl), "Q1")
    # property URL yields a P-identifier
    propertyUrl = "https://www.wikidata.org/wiki/Property:P31"
    self.assertEqual("P31", Wikidata.getWikidataId(propertyUrl))
    # an invalid (empty) entry yields no identifier
    self.assertIsNone(Wikidata.getWikidataId(""))
def testCacheLocationLabels(self):
    '''
    Generates the location label tables in the SQL db for countries, regions
    and cities by querying wikidata for the rdfs:label and skos:altLabel of
    each location. A view containing all location labels is also created.
    '''
    # disabled by default - the label extraction is a long-running operation
    testLocationLabelExtraction = False
    if not testLocationLabelExtraction:
        return
    wd = Wikidata()
    config = LocationContext.getDefaultConfig()
    countryManager = CountryManager(config=config)
    regionManager = RegionManager(config=config)
    cityManager = CityManager(config=config)
    sqlDb = SQLDB(dbname=config.cacheFile, debug=self.debug)
    for manager in countryManager, regionManager, cityManager:
        manager.fromCache()
        # collect the wikidata ids of all entities of this manager
        idQuery = f"SELECT DISTINCT wikidataid FROM {manager.entityPluralName}"
        wikidataIds = [row['wikidataid'] for row in sqlDb.query(idQuery)]
        # query the labels in chunks to keep each SPARQL query manageable
        chunkSize = 1000
        chunks = math.ceil(len(wikidataIds) / chunkSize)
        progress = 0
        labelRecords = []
        for chunkNo in range(chunks):
            chunkIds = wikidataIds[chunkNo * chunkSize:(chunkNo + 1) * chunkSize]
            progress += len(chunkIds)
            values = ""
            for pos, location in enumerate(chunkIds):
                # start a fresh indented line every 10 values for readability
                spacer = " \n\t\t\t" if pos % 10 == 0 else " "
                values += f"{spacer}wd:{wd.getWikidataId(location)}"
            query = self.getLablesQuery(values)
            labelRecords.extend(
                wd.query(
                    f"Query {chunkNo}/{chunks} - Querying {manager.entityName} Labels",
                    queryString=query))
        wd.store2DB(labelRecords, tableName=f"{manager.entityName}_labels", sqlDB=sqlDb)
    self.createViews(sqlDB=sqlDb)
def populate_Regions(self, sqlDB):
    '''
    populate database with regions from wikiData

    Args:
        sqlDB(SQLDB): target SQL database
    '''
    print("retrieving Region data from wikidata ... (this might take a minute)")
    wikidata = Wikidata()
    wikidata.getRegions()
    # use the sampleRecordCount parameter (as populate_Countries does) instead
    # of slicing the list - the sampling only affects schema derivation; the
    # full regionList is still stored below
    entityInfo = sqlDB.createTable(wikidata.regionList, "regions", primaryKey=None,
                                   withDrop=True, sampleRecordCount=5000)
    sqlDB.store(wikidata.regionList, entityInfo, fixNone=True)
def populate_Countries(self, sqlDB):
    '''
    populate database with countries from wikiData

    Args:
        sqlDB(SQLDB): target SQL database
    '''
    print("retrieving Country data from wikidata ... (this might take a few seconds)")
    wikidata = Wikidata()
    wikidata.getCountries()
    countries = wikidata.countryList
    # derive the schema from up to 200 sample records and recreate the table
    entityInfo = sqlDB.createTable(countries, "countries", None, withDrop=True,
                                   sampleRecordCount=200)
    sqlDB.store(countries, entityInfo, fixNone=True)
def testGetCoordinateComponents(self):
    '''
    test the splitting of coordinate components in WikiData query results
    '''
    # WKT Point syntax is "Point(<longitude> <latitude>)" and the expected
    # tuple is given in that same (lon, lat) order. The original code
    # unpacked it into swapped names (expLat received the longitude value);
    # the names below match the values while keeping the exact same
    # positional pairing of the assertions.
    cList = [{
        "coordinate": 'Point(-118.25 35.05694444)',
        "expected": (-118.25, 35.05694444)
    }]
    for c in cList:
        coordinate = c["coordinate"]
        expLon, expLat = c["expected"]
        # NOTE(review): the assertion pairing implies getCoordinateComponents
        # returns (lat, lon) - confirm against the Wikidata implementation
        lat, lon = Wikidata.getCoordinateComponents(coordinate)
        self.assertEqual(expLat, lat)
        self.assertEqual(expLon, lon)
def getWikidataCityPopulation(self, sqlDB, endpoint=None):
    '''
    make wikidata city population data available in the given database

    Args:
        sqlDB(SQLDB): target SQL database
        endpoint(str): url of the wikidata endpoint or None if default should be used
    '''
    dbFile = self.db_path + "/city_wikidata_population.db"
    rawTableName = "cityPops"
    # is the wikidata population database available?
    if not os.path.exists(dbFile):
        if endpoint is not None:
            # create it from a wikidata query against the given endpoint
            wikidata = Wikidata()
            wikidata.endpoint = endpoint
            cityList = wikidata.getCityPopulations()
            wikiCitiesDB = SQLDB(dbFile)
            # derive the schema from the first 300 records
            entityInfo = wikiCitiesDB.createTable(cityList[:300], rawTableName,
                                                  primaryKey=None, withDrop=True)
            wikiCitiesDB.store(cityList, entityInfo, fixNone=True)
        else:
            # just download a copy
            print("Downloading %s ... this might take a few seconds" % dbFile)
            dbUrl = "http://wiki.bitplan.com/images/confident/city_wikidata_population.db"
            urllib.request.urlretrieve(dbUrl, dbFile)
    # (re) open the database
    wikiCitiesDB = SQLDB(dbFile)
    # check whether the joined table is already populated
    tableName = "citiesWithPopulation"
    tableList = sqlDB.getTableList()
    if self.db_recordCount(tableList, tableName) < 10000:
        # check that database is writable
        # https://stackoverflow.com/a/44707371/1497139
        sqlDB.execute("pragma user_version=0")
        # makes sure both tables are in target sqlDB
        wikiCitiesDB.copyTo(sqlDB)
        # create joined table
        sqlQuery = """
        select geoname_id, city_name, cp.cityLabel,
        country_iso_code, country_name,
        subdivision_1_iso_code, subdivision_1_name,
        cp.city as wikidataurl, cp.cityPop
        from cities c
        join cityPops cp
        on c.geoname_id=cp.geoNameId
        union
        select geoNameId as geoname_id, null as city_name, cityLabel,
        countryIsoCode as country_iso_code, countryLabel as country_name,
        null as subdivision_1_iso_code, null as subdivision_1_name,
        city as wikidataurl, cityPop
        from cityPops
        where cityPop is not Null
        group by geoNameId
        order by cityPop desc
        """
        cityList = sqlDB.query(sqlQuery)
        entityInfo = sqlDB.createTable(cityList, tableName, primaryKey=None,
                                       withDrop=True, sampleRecordCount=500)
        sqlDB.store(cityList, entityInfo, fixNone=True)