def cacheRegionCities2Json(self, limit, showDone=False):
     '''
     cache the cities of the known regions as one JSON file per region

     The files are written to the "regions" folder below the cache path of the
     default configuration and are named after the iso code of the region.
     Regions for which such a file already exists are not queried again.

     Args:
         limit(int): maximum number of regions (in list order) to look at
         showDone(bool): if True print the progress message for regions that are already cached
     '''
     # TODO - refactor to Locator/LocationContext - make available via command line
     wikidata = Wikidata()
     config = LocationContext.getDefaultConfig()
     # the country manager is loaded from cache although it is not referenced below
     countryManager = CountryManager(config=config)
     countryManager.fromCache()
     regionManager = RegionManager(config=config)
     regionManager.fromCache()
     regions = regionManager.getList()
     regionCount = len(regions)
     cachePath = f"{config.getCachePath()}/regions"
     if not os.path.exists(cachePath):
         os.makedirs(cachePath)
     for position, region in enumerate(regions):
         if position >= limit:
             break
         wikidataId = region.wikidataid
         progressMsg = f"{position+1:4d}/{regionCount:4d}:getting cities for {region.name} {region.iso} {region.wikidataid}"
         targetFile = f"{cachePath}/{region.iso}.json"
         if os.path.isfile(targetFile):
             # already cached - optionally report it and move on
             if showDone:
                 print(progressMsg)
             continue
         try:
             cities = wikidata.getCitiesForRegion(wikidataId, progressMsg)
             # serialize before opening the file so that a failing dump
             # does not leave an empty file behind
             serialized = json.dumps(cities)
             with open(targetFile, "w") as jsonFile:
                 jsonFile.write(serialized)
         except Exception as ex:
             self.handleWikidataException(ex)
Пример #2
0
 def testWikidataRegions(self):
     '''
     check that the wikidata region query delivers at least 3000 regions
     '''
     wd = Wikidata()
     try:
         regionCount = len(wd.getRegions())
         self.assertTrue(regionCount >= 3000)
     except Exception as ex:
         # NOTE(review): a failed assertion above is an Exception too and
         # therefore also ends up in the handler - confirm this is intended
         self.handleWikidataException(ex)
Пример #3
0
 def testWikidataCityStates(self):
     '''
     test getting city state information from wikidata

     At least two city states are expected and one of them has to be named Singapore.
     '''
     wd = Wikidata()
     try:
         cityStates = wd.getCityStates()
         self.assertTrue(len(cityStates) >= 2)
         names = [cityState.get('name') for cityState in cityStates]
         self.assertTrue("Singapore" in names)
     except Exception as ex:
         # NOTE(review): failed assertions are passed to the handler as well
         self.handleWikidataException(ex)
Пример #4
0
 def testWikidataCities(self):
     '''
     test getting city information from wikidata

     The retrieved cities are stored in the table "hs" of the database
     file hs.db below the cache path and at least 200000 cities are expected.
     The test only runs for the user "wf".
     '''
     # Wikidata time outs in CI environment need to be avoided
     if getpass.getuser() != "wf":
         return
     config = StorageConfig.getSQL(debug=self.debug)
     config.cacheRootDir = "/tmp/wdhs"
     config.cacheFile = f"{config.getCachePath()}/hs.db"
     wd = Wikidata()
     # use 2020 wikidata copy
     # alternatives used before:
     #   2018 wikidata copy: http://blazegraph.bitplan.com/sparql
     #   http://jena.bitplan.com/wdhs/sparql
     wd.endpoint = "https://confident.dbis.rwth-aachen.de/jena/wdhs/sparql"
     # NOTE(review): these sample regions are not referenced by the rest of
     # this test - confirm whether a per region check is still planned
     regions = [
         dict(name="Singapore", country="Q334", region=None, cities=46),
         dict(name="Beijing", country=None, region="Q956", cities=25),
         dict(name="Paris", country=None, region="Q13917", cities=1242),
         dict(name="Barcelona", country=None, region="Q5705", cities=1242),
         dict(name="Rome", country=None, region="Q1282", cities=1242),
     ]
     maxCities = 1000000  #if self.inCI() else 100
     cities = wd.getCities(limit=maxCities)
     db = SQLDB(config.cacheFile)
     entityInfo = db.createTable(cities, "hs", withDrop=True)
     db.store(cities, entityInfo, fixNone=True)
     minExpected = 200000  # if self.inCI() else limit
     self.assertTrue(len(cities) >= minExpected)
 def testCityFromCityStates(self):
     '''
     tests that querying the cities of a city state also delivers the city state itself

     For city states the city is region and city (in some cases also country).
     The result for the cities of a city state often includes the municipalities,
     this test makes sure the wikidata id of the city state is part of the result too.
     '''
     wikidata = Wikidata()
     for cityState in wikidata.getCityStates():
         cityStateId = cityState.get('wikidataid')
         queryMsg = f"Query for cities in {cityState.get('name')}"
         cities = wikidata.getCitiesForRegion(cityStateId, msg=queryMsg)
         cityIds = [city.get('wikidataid') for city in cities]
         self.assertTrue(cityStateId in cityIds)
Пример #6
0
 def testWikidataCountries(self):
     '''
     test getting country information from wikidata

     At least 200 countries are expected. In debug mode each country is
     printed and checked for the attributes of the first Country sample.
     '''
     wd = Wikidata()
     try:
         countries = wd.getCountries()
         self.assertTrue(len(countries) >= 200)
         attrNames = Country.getSamples()[0].keys()
         for country in countries:
             # NOTE(review): the attribute checks only run when self.debug
             # is set - confirm that this is intended
             if not self.debug:
                 continue
             print(country)
             for attrName in attrNames:
                 self.assertTrue(hasattr(country, attrName))
     except Exception as ex:
         self.handleWikidataException(ex)
Пример #7
0
 def testGetWikidataId(self):
     '''
     test extracting the wikidata id from entity urls, property urls and invalid input
     '''
     # an entity url yields the Q-id
     entityId = Wikidata.getWikidataId("https://www.wikidata.org/wiki/Q1")
     self.assertEqual(entityId, "Q1")
     # a property url yields the P-id
     propertyId = Wikidata.getWikidataId("https://www.wikidata.org/wiki/Property:P31")
     self.assertEqual("P31", propertyId)
     # an empty string yields no id at all
     self.assertIsNone(Wikidata.getWikidataId(""))
Пример #8
0
    def testCacheLocationLabels(self):
        '''
        Generates the location label tables in the SQL db for countries, regions and cities by querying wikidata for
        the rdfs:label and skos:altLabel of each location.
        A view containing all location labels is also created.

        The extraction is switched off by the local flag testLocationLabelExtraction.
        '''
        testLocationLabelExtraction = False
        if not testLocationLabelExtraction:
            return
        wd = Wikidata()
        config = LocationContext.getDefaultConfig()
        countryManager = CountryManager(config=config)
        regionManager = RegionManager(config=config)
        cityManager = CityManager(config=config)
        sqlDb = SQLDB(dbname=config.cacheFile, debug=self.debug)
        # number of wikidata ids that are handled by a single label query
        chunkSize = 1000
        # separator that starts a new indented line in the list of ids
        lineBreak = "  \n\t\t\t"
        for manager in countryManager, regionManager, cityManager:
            manager.fromCache()
            wikidataIdQuery = f"SELECT DISTINCT wikidataid FROM {manager.entityPluralName}"
            wikidataIds = [record['wikidataid'] for record in sqlDb.query(wikidataIdQuery)]
            iterations = math.ceil(len(wikidataIds) / chunkSize)
            res = []
            for i in range(iterations):
                workOnIds = wikidataIds[i * chunkSize:(i + 1) * chunkSize]
                # every tenth id starts a new line, all other ids are separated by a blank
                values = "".join(
                    f"{lineBreak if index % 10 == 0 else ' '}wd:{wd.getWikidataId(location)}"
                    for index, location in enumerate(workOnIds))
                query = self.getLablesQuery(values)
                res.extend(
                    wd.query(
                        f"Query {i}/{iterations} - Querying {manager.entityName} Labels",
                        queryString=query))
            # one label table per location type
            wd.store2DB(res,
                        tableName=f"{manager.entityName}_labels",
                        sqlDB=sqlDb)
        self.createViews(sqlDB=sqlDb)
Пример #9
0
 def populate_Regions(self,sqlDB):
     '''
     fill the "regions" table of the given database with the regions retrieved from wikidata

     Args:
         sqlDB(SQLDB): target SQL database
     '''
     print("retrieving Region data from wikidata ... (this might take a minute)")
     wd=Wikidata()
     # the return value is ignored - the regions are read from wd.regionList below
     wd.getRegions()
     # only the first 5000 records are handed to createTable while all records are stored
     sampleRegions=wd.regionList[:5000]
     entityInfo=sqlDB.createTable(sampleRegions,"regions",primaryKey=None,withDrop=True)
     sqlDB.store(wd.regionList,entityInfo,fixNone=True)
Пример #10
0
 def populate_Countries(self,sqlDB):
     '''
     fill the "countries" table of the given database with the countries retrieved from wikidata

     Args:
         sqlDB(SQLDB): target SQL database
     '''
     print("retrieving Country data from wikidata ... (this might take a few seconds)")
     wd=Wikidata()
     # the return value is ignored - the countries are read from wd.countryList below
     wd.getCountries()
     countries=wd.countryList
     entityInfo=sqlDB.createTable(countries,"countries",None,withDrop=True,sampleRecordCount=200)
     sqlDB.store(wd.countryList,entityInfo,fixNone=True)
Пример #11
0
 def testGetCoordinateComponents(self):
     '''
     test the splitting of coordinate components in WikiData query results
     '''
     samples = [{
         "coordinate": 'Point(-118.25 35.05694444)',
         "expected": (-118.25, 35.05694444)
     }]
     for sample in samples:
         point = sample["coordinate"]
         # the expected pair is compared crosswise: its first value against the
         # second returned component and its second value against the first one
         expectedSecond, expectedFirst = sample["expected"]
         first, second = Wikidata.getCoordinateComponents(point)
         self.assertEqual(expectedSecond, second)
         self.assertEqual(expectedFirst, first)
0
    def getWikidataCityPopulation(self,sqlDB,endpoint=None):
        '''
        Args:
            sqlDB(SQLDB): target SQL database
            endpoint(str): url of the wikidata endpoint or None if default should be used
        '''
        dbFile=self.db_path+"/city_wikidata_population.db"
        rawTableName="cityPops"
        # is the wikidata population database available?
        if not os.path.exists(dbFile):
            # shall we created it from a wikidata query?
            if endpoint is not None:
                wikidata=Wikidata()
                wikidata.endpoint=endpoint
                cityList=wikidata.getCityPopulations()
                wikiCitiesDB=SQLDB(dbFile) 
                entityInfo=wikiCitiesDB.createTable(cityList[:300],rawTableName,primaryKey=None,withDrop=True)
                wikiCitiesDB.store(cityList,entityInfo,fixNone=True)
            else:
                # just download a copy 
                print("Downloading %s ... this might take a few seconds" % dbFile)
                dbUrl="http://wiki.bitplan.com/images/confident/city_wikidata_population.db"
                urllib.request.urlretrieve(dbUrl,dbFile)
        # (re) open the database
        wikiCitiesDB=SQLDB(dbFile) 
          
        # check whether the table is populated
        tableList=sqlDB.getTableList()        
        tableName="citiesWithPopulation"     
      
        if self.db_recordCount(tableList, tableName)<10000:
            # check that database is writable
            # https://stackoverflow.com/a/44707371/1497139
            sqlDB.execute("pragma user_version=0")
            # makes sure both tables are in target sqlDB
            wikiCitiesDB.copyTo(sqlDB)
            # create joined table
            sqlQuery="""
              select 
    geoname_id,
    city_name,
    cp.cityLabel,
    country_iso_code,
    country_name,
    subdivision_1_iso_code,
    subdivision_1_name,
    cp.city as wikidataurl,
    cp.cityPop 
  from cities c 
  join cityPops cp 
  on c.geoname_id=cp.geoNameId 
union  
  select 
    geoNameId as geoname_id,
    null as city_name,
    cityLabel,
    countryIsoCode as country_iso_code,
    countryLabel as country_name,
    null as subdivision_1_iso_code,
    null as subdivision_1_name,
    city as wikidataurl,
    cityPop 
  from cityPops 
  where cityPop is not Null
group by geoNameId
order by cityPop desc
            """
            cityList=sqlDB.query(sqlQuery) 
            entityInfo=sqlDB.createTable(cityList,tableName,primaryKey=None,withDrop=True,sampleRecordCount=500)
            sqlDB.store(cityList,entityInfo,fixNone=True)