def testGeoTextAndGrapy(self):
    '''
    test the GeoText and geograpy3 library

    reads the most frequently used localities from the Event_wikicfp table
    and checks how many of them geograpy can resolve to a city
    '''
    debug = True
    limit = 100
    # the top <limit> localities by usage count
    sqlQuery = """select count(*) as count, locality from Event_wikicfp where locality is not null group by locality order by 1 desc LIMIT %d """ % limit
    dbFile = Lookup.getDBFile()
    # reuse an existing database file or build the lookup from scratch
    if os.path.isfile(dbFile):
        sqlDB = SQLDB(dbFile)
    else:
        lookup = Lookup.ensureAllIsAvailable("testGeoText")
        sqlDB = lookup.getSQLDB()
    if sqlDB is not None:
        print("testGeoText from database %s " % sqlDB.dbname)
        # total number of events that have a locality at all
        totalResult = sqlDB.query("""select count(*) as count from event_wikicfp where locality is not null""")
        total = totalResult[0]['count']
        listOfDicts = sqlDB.query(sqlQuery)
        index = 0
        rsum = 0  # running sum of locality counts (coverage percentage)
        found = 0  # localities geograpy could resolve to a city
        problems = []  # localities that could not be resolved
        for record in listOfDicts:
            locality = record['locality']
            count = record['count']
            index += 1
            rsum += count
            print("%5d: %5d/%5d %5.1f%%=%s" % (index, count, rsum, rsum / total * 100, locality))
            geo = GeoText(locality)
            if debug:
                print(" %s" % geo.countries)
                print(" %s" % geo.cities)
            city = geograpy.locateCity(locality)
            if city is not None:
                found += 1
            else:
                problems.append(locality)
            if debug:
                print(" %s%s" % (city, '✅' if city is not None else '❌'))
        # NOTE(review): the summary is guarded by self.debug while the loop
        # uses the local debug flag set above - confirm which flag is intended
        if self.debug:
            print("found %d/%d = %5.1f%%" % (found, limit, found / limit * 100))
            print("problems: %s" % problems)
        # NOTE(review): the success ratio divides by limit, not by the number
        # of rows actually returned (index) - with fewer than <limit> rows the
        # 80% threshold becomes harder to reach; confirm this is intended
        self.assertTrue(found / limit > 0.8)
    pass
def testLookup(self):
    '''
    check the number of sources and that they can be stored to "Event_all"
    '''
    # skip on the travis CI user - the full store is too expensive there
    if getpass.getuser() == "travis":
        return
    lookup = Lookup("test")
    # all eight event manager sources are expected to be registered
    self.assertEqual(8, len(lookup.ems))
    storeErrors = lookup.store('Event_all')
    self.assertEqual(0, len(storeErrors))
    lookup.createView()
def testExamples(self):
    '''
    make sure the example titles can be read
    '''
    exampleDict = Lookup.getExamples()
    print(exampleDict)
    # the examples are expected to contain 16 entries
    self.assertEqual(16, len(exampleDict))
def getWordUsageDB(self):
    '''
    get the word usage database

    Returns:
        SQLDB: the word usage database or None if its file does not exist
    '''
    dbPath = Lookup.getDBFile("wordusage")
    return SQLDB(dbPath) if os.path.isfile(dbPath) else None
def testPlantUml(self):
    '''
    test generating a plantuml class diagram for the event tables

    merges the SMW schema (when available) with the schemas of all
    Event_* tables and checks the resulting plantuml markup
    '''
    schemaManager = None
    # the SMW ontology is only reachable outside the travis CI environment
    if getpass.getuser() != "travis":
        o = Ontology()
        schemaManager = o.getRQSchema( fromCache=False)  # to force SMW query
    lookup = Lookup("plantuml", getAll=False, butNot='or')
    dbfile = lookup.getDBFile('Event_all')
    sqlDB = SQLDB(dbfile)
    tableList = sqlDB.getTableList()
    eventTableList = []
    eventSchemas = lookup.getEventSchemas()
    # collect the Event_* tables, attach their schema and instance count
    for table in tableList:
        tableName = table['name']
        if tableName.startswith("Event_"):
            table['schema'] = eventSchemas[tableName]
            eventTableList.append(table)
            countQuery = "SELECT count(*) as count from %s" % tableName
            countResult = sqlDB.query(countQuery)
            table['instances'] = countResult[0]['count']
    # one table per event source is expected
    self.assertEqual(8, len(eventTableList))
    uml = UML()
    now = datetime.now()
    nowYMD = now.strftime("%Y-%m-%d")
    title = """ConfIDent Entities %s [[https://projects.tib.eu/en/confident/ © 2019-2020 ConfIDent project]] see also [[http://ptp.bitplan.com/settings Proceedings Title Parser]] """ % nowYMD
    plantUml = uml.mergeSchema(schemaManager, eventTableList, title=title, packageName='DataDonations', generalizeTo="Event")
    print(plantUml)
    # the generalization and the base class must show up in the diagram
    self.assertTrue("Event <|-- Event_confref" in plantUml)
    self.assertTrue("class Event " in plantUml)
def testWordParser(self):
    '''
    compute word count quantiles per proceedings title source
    see https://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
    '''
    lookup = Lookup("test Word parser")
    sqlDB = lookup.getSQLDB()
    if sqlDB is None:
        return
    allWordUsages = []
    for source in ['wikidata', 'crossref', 'dblp', 'CEUR-WS']:
        titleRecords = TestWordParser.getProceedingsTitles(
            sqlDB, source)
        corpusParser = CorpusWordParser()
        usages = corpusParser.parse(titleRecords)
        # number of words per eventId
        wordCounts = {}
        for usage in usages:
            allWordUsages.append(usage.__dict__)
            wordCounts[usage.eventId] = wordCounts.get(usage.eventId, 0) + 1
        frame = DataFrame(wordCounts.values())
        # maximum word count followed by the 90% quantile
        print(frame.quantile(1))
        quantileValues = frame.quantile(.90)
        print(quantileValues)
        histPlot = Plot(wordCounts.values(), "%s wordcount histogram" % source, xlabel="wordcount", ylabel="frequency")
        histPlot.hist(mode='save')
    # persist all collected word usages in the word usage database
    wordUsageDBFile = Lookup.getDBFile("wordusage")
    usageDB = SQLDB(wordUsageDBFile)
    entityInfo = usageDB.createTable(allWordUsages, "wordusage", withDrop=True)
    usageDB.store(allWordUsages, entityInfo)
def index(titles="", tc=None, errs=None, result=None, message=None, metadebug=True):
    """ render index page with the given parameters"""
    templateParams = {
        'titles': titles,
        'tc': tc,
        'errs': errs,
        'result': result,
        'message': message,
        'metadebug': metadebug,
        # the example titles are always shown on the index page
        'examples': Lookup.getExamples()
    }
    return render_template('index.html', **templateParams)
def test_SQL(self):
    '''
    run the predefined SQL queries and render them as wiki markup
    '''
    queryManager = QueryManager(lang='sql', debug=False)
    # 17 named queries are expected to be configured
    self.assertEqual(17, len(queryManager.queriesByName))
    lookup = Lookup.ensureAllIsAvailable()
    sqlDB = lookup.getSQLDB()
    for queryName, query in queryManager.queriesByName.items():
        resultRows = sqlDB.query(query.query)
        resultMarkup = query.asWikiMarkup(resultRows)
        print("== %s ==" % (queryName))
        print("=== query ===")
        print(query.asWikiSourceMarkup())
        print("=== result ===")
        print(resultMarkup)
def testError(self):
    '''
    test error handling according to
    https://github.com/WolfgangFahl/ProceedingsTitleParser/issues/4
    '''
    lookup = Lookup("testError")
    titleParser = lookup.tp
    # the dictionary needs to know Innsbruck for this example
    self.assertTrue("Innsbruck" in titleParser.dictionary.tokens)
    titles = ['Tagungsband des 17. Workshops "Software Engineering im Unterricht der Hochschulen" 2020 (SEUH 2020),Innsbruck, Österreich, 26. - 27.02.2020.']
    titleParser.fromLines(titles, 'line')
    tc, errs, result = titleParser.parseAll()
    # there should be a failed entry in the counter
    self.assertEqual(1, tc["fail"])
    self.assertEqual(1, len(errs))
    firstError = errs[0]
    self.assertTrue("Expected" in str(firstError))
    self.assertEqual(1, len(result))
    parsedTitle = result[0]
    print(parsedTitle.metadata())
    # the city should still have been extracted despite the parse failure
    self.assertTrue('city' in parsedTitle.metadata())
    print(parsedTitle.notfound)
def testExtractMode(self):
    '''
    test extract mode with CEUR-WS volume urls
    '''
    lookup = Lookup("testExtractMode")
    volumeUrls = [
        'http://ceur-ws.org/Vol-2635/',
        'http://ceur-ws.org/Vol-2599/',
        'http://ceur-ws.org/Vol-2553/',
        'http://ceur-ws.org/Vol-2512/',
        'http://ceur-ws.org/Vol-2489/',
        'http://ceur-ws.org/',
        'http://ceur-ws.org/Vol-9999/'
    ]
    titleParser = lookup.tp
    titleParser.fromLines(volumeUrls, 'line')
    tc, errs, result = titleParser.parseAll()
    print(tc)
    print(errs)
    print(result)
    # expect 4 ok 1 fail and 2 invalid/ignored
    self.assertEqual(4, tc['success'])
    self.assertEqual(1, tc['fail'])
def testCreateEventAll(self):
    '''
    check that the event all database is created correctly
    '''
    withWikiData = True
    lookup = Lookup("CreateEventAll")
    # all eight event managers need to be in place
    self.assertEqual(8, len(lookup.ems))
    checkErrors = lookup.check(lookup.getSQLDB(), debug=True)
    if checkErrors:
        print(checkErrors)
    # force a rebuild of the combined database (maxAgeMin=0)
    sqlDB = lookup.createEventAll(maxAgeMin=0, withWikiData=withWikiData)
    checkErrors = lookup.check(sqlDB, debug=True)
    if checkErrors:
        # fail showing the number of errors found
        self.assertEqual(0, len(checkErrors))
def testSpacy(self):
    '''
    test the spacy NLP library on proceedings titles
    '''
    nlp = spacy.load('en_core_web_sm')
    processed = 0
    limit = 100
    lookup = Lookup.ensureAllIsAvailable()
    sqlDB = lookup.getSQLDB()
    if sqlDB is None:
        return
    for source in ['wikidata', 'crossref', 'dblp', 'CEUR-WS']:
        titleRecords = TestWordParser.getProceedingsTitles(
            sqlDB, source)
        for titleRecord in titleRecords:
            title = titleRecord['title']
            doc = nlp(title)
            print("found %d entities in %s:%s" % (len(doc.ents), titleRecord['eventId'], title))
            for entity in doc.ents:
                print(" %s(%s)" % (entity, entity.label_))
            processed += 1
            # stop after <limit> titles in total
            if processed > limit:
                break
        if processed > limit:
            break
def testNERMode(self):
    '''
    test named entity recognition mode
    '''
    lookup = Lookup("testNerMode")
    titleParser = lookup.tp
    sampleTitles = [
        'ATVA 2020 18th International Symposium on Automated Technology for Verification and Analysis',
        'Proceedings of the 8th International Workshop on Bibliometric-enhanced Information Retrieval (BIR)co-located with the 41st European Conference on Information Retrieval (ECIR 2019)Cologne, Germany, April 14th, 2019.'
    ]
    titleParser.fromLines(sampleTitles, 'line')
    tc, errs, result = titleParser.parseAll()
    print(tc)
    print(errs)
    print(result)
    # make sure we have exactly one result per title
    self.assertEqual(2, len(result))
    for parsedTitle in result:
        print(parsedTitle)
        print(parsedTitle.info)
        print(parsedTitle.metadata())
        print(parsedTitle.notfound)
        # make sure we found the relevant event
        self.assertTrue(len(parsedTitle.events) > 0)
        print(parsedTitle.events)
        for event in parsedTitle.events:
            print(event.url)
from flask.helpers import send_from_directory
import argparse
import sys
from ptp.lookup import Lookup
#from json2xml import json2xml
#https://github.com/vinitkumar/json2xml/issues/59
#from flask_accept import accept

# NOTE(review): os, Flask and render_template are used below but not imported
# in this view - confirm the imports exist elsewhere at the top of the file

# directory of this script - static and template folders are derived from it
scriptdir = os.path.dirname(os.path.abspath(__file__))
app = Flask(__name__, static_url_path='', static_folder=scriptdir + '/../web', template_folder=scriptdir + '/../templates')
# create a most current eventall database
lookup = Lookup("webserver", singleDB=True)

@app.route('/')
def home():
    # the root url shows the index page with default parameters
    return index()

def index(titles="", tc=None, errs=None, result=None, message=None, metadebug=True):
    """ render index page with the given parameters"""
    return render_template('index.html',