def runTest(doi, transform):
    """Search for one randomly chosen equation of a document and check it is found.

    A random (eqnID, source) pair is taken from the document's 'source'
    mapping, the source is passed through ``transform`` to build the search
    term, and the local search index is queried over HTTP.  The outcome is
    printed together with the time spent waiting for the response to open.

    Parameters:
        doi: key of the document in the 'documents' database.
        transform: callable applied to the equation source to produce the
            search term that is sent to the index.

    Returns:
        True if the response contains an Article or Chapter element whose
        'doi' attribute equals decodeDoi(doi) and which holds an equation
        element whose 'id' attribute equals the chosen eqnID; False on a
        LatexParseError, TimedOut or LimitExceeded response, or when the
        equation is not among the results.
    """
    db = couchdb_server['documents']
    eqnID, source = rand.choice(db[doi]['source'].items())

    try:
        searchTerm = transform(source)
        url = "http://localhost:%s/documents/_external/index?searchTerm=\"%s\"&searchTimeout=20&limit=2500" % (port, urllib.quote(searchTerm))

        startTime = time.time()
        resultsFile = urllib.urlopen(url)
        try:
            # The clock stops once the response is open, before it is parsed.
            endTime = time.time()
            results = minidom.parse(resultsFile)
        finally:
            # Always release the HTTP connection, even if parsing fails.
            resultsFile.close()

        elapsed = endTime - startTime
        decodedDoi = decodeDoi(doi)

        # Error markers in the response, checked in this order.
        failureTags = (
            ("LatexParseError", "Latex parse error"),
            ("TimedOut", "Timed out"),
            ("LimitExceeded", "Limit exceeded"),
        )
        for tagName, label in failureTags:
            if results.getElementsByTagName(tagName):
                print("%s on doi: %s and eqnID: %s (%fs)" % (label, decodedDoi, eqnID, elapsed))
                return False

        candidates = results.getElementsByTagName("Article") + results.getElementsByTagName("Chapter")
        for result in candidates:
            # getAttribute returns "" for a missing attribute instead of
            # failing on None the way attributes.get(...).value does.
            if result.getAttribute('doi') != decodedDoi:
                continue
            for eqn in result.getElementsByTagName("equation"):
                if eqn.getAttribute('id') == eqnID:
                    print("Passed on doi: %s and eqnID: %s (%fs)" % (decodedDoi, eqnID, elapsed))
                    return True

        # NOTE(review): this message prints the raw doi key while every other
        # message prints decodeDoi(doi) - confirm whether that is intentional.
        print("Failed on doi: %s and eqnID: %s (%fs)" % (doi, eqnID, elapsed))
        print(searchTerm)
        return False
    except KeyboardInterrupt:
        # Bare raise keeps the original traceback (``raise e`` resets it).
        raise
def reprocess():
    """Rebuild the 'content' field of every document from its 'source' field.

    For each document in the 'documents' database, every (eqnID, latex)
    pair in doc['source'] is passed to preprocess(); the results are run
    through filterNone() and turned into a dict that replaces
    doc['content'].  The document is then written back to the database.
    """
    db = couchdb_server['documents']

    print("Reprocessing latex sources")
    for doi in db:
        print("Reprocessing %s" % decodeDoi(doi))
        doc = db[doi]

        processed = []
        for eqnID, latex in doc['source'].items():
            processed.append(preprocess(eqnID, latex))

        doc['content'] = dict(filterNone(processed))
        db[doi] = doc
def convert_journalID_containerID():
    """Rename the 'journalID' field of every document to 'containerID'.

    Documents without a 'journalID' field are left untouched; documents
    that have one get its value stored under 'containerID', lose the old
    key, and are written back to the 'documents' database.
    """
    db = couchdb_server['documents']

    print("Converting")
    for doi in db:
        print("Converting %s" % decodeDoi(doi))
        doc = db[doi]

        # Nothing to migrate for this document.
        if 'journalID' not in doc:
            continue

        # Move the value to the new key and drop the old one in one step.
        doc['containerID'] = doc.pop('journalID')
        db[doi] = doc
def check_dates():
    """Check every document's 'publicationYear' against ml_year() and fix mismatches.

    For each document in the 'documents' database the stored year is
    compared with ml_year(decodeDoi(doi)):

    * a non-empty expected year that differs from the stored one is
      reported, written into the document and saved;
    * a matching year is reported as ok;
    * an empty expected year is reported only when the document's 'format'
      (defaulting to 'article') is 'article', case-insensitively.

    Any exception other than KeyboardInterrupt raised while handling a
    document is printed and the loop moves on to the next document.
    """
    db = couchdb_server['documents']

    print("Checking dates")
    for doi in db:
        try:
            doc = db[doi]
            actual = doc['publicationYear']
            expected = ml_year(decodeDoi(doi))

            if expected != "":
                if expected != actual:
                    print("Doi: %s Expected: %s Actual: %s" % (doi, expected, actual))
                    doc['publicationYear'] = expected
                    db[doi] = doc
                else:
                    print("Doi: %s ok" % doi)
            elif doc.get('format', 'article').lower() == 'article':
                print("ML year not defined for article: %s" % doi)
        except KeyboardInterrupt:
            # Let Ctrl-C abort the whole run; a bare raise keeps the original
            # traceback, which ``raise e`` would have reset.
            raise
        except Exception as e:
            # Best effort: report the problem and carry on with the next doi.
            print("Failed on doi: %s" % doi)
            print(e)
if results.getElementsByTagName("LimitExceeded"): print "Limit exceeded on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime) return False for result in results.getElementsByTagName("Article") + results.getElementsByTagName("Chapter"): if result.attributes.get('doi').value == decodeDoi(doi): for eqn in result.getElementsByTagName("equation"): if eqn.attributes.get('id').value == eqnID: print "Passed on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime) return True print "Failed on doi: %s and eqnID: %s (%fs)" % (doi, eqnID, endTime-startTime) print searchTerm return False except KeyboardInterrupt, e: raise e except Exception, e: print "Error on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, 0) print e try: print "Searchterm: %s" % searchTerm except UnicodeEncodeError: pass return False def runTests(n,transform): db = couchdb_server['documents'] dois = list(db) for i in xrange(0,n): doi = None source = None while not source: try: