def createOrUpdateGroupPid(request, obj, change):
    """
    Creates or updates the identifier (PID) that mirrors group 'obj'.

    If 'change' is true the existing identifier obj.pid is updated via
    ezid.setMetadata; otherwise it is created via
    ezid.createIdentifier.  In both cases the call is made with the
    user returned by models.getAdminUser(), and the group's fields are
    written as "ezid.group.*" metadata elements.

    The outcome is reported to the user through the Django messages
    framework attached to 'request'.  'request' may be None, in which
    case no message is posted; a failure is still logged.
    """
    import ezid
    import log

    f = ezid.setMetadata if change else ezid.createIdentifier
    r = f(
        obj.pid,
        models.getAdminUser(),
        {
            "_ezid_role": "group",
            "_export": "no",
            "_profile": "ezid",
            "ezid.group.groupname": obj.groupname,
            "ezid.group.realm": obj.realm.name,
            "ezid.group.organizationName": obj.organizationName,
            "ezid.group.organizationAcronym": obj.organizationAcronym,
            "ezid.group.organizationUrl": obj.organizationUrl,
            "ezid.group.organizationStreetAddress":
                obj.organizationStreetAddress,
            "ezid.group.agreementOnFile": str(obj.agreementOnFile),
            "ezid.group.crossrefEnabled": str(obj.crossrefEnabled),
            # Shoulders are stored as a space-separated list of prefixes.
            "ezid.group.shoulders": " ".join(
                s.prefix for s in obj.shoulders.all()),
            "ezid.group.notes": obj.notes,
        },
    )
    if r.startswith("success:"):
        if request is not None:
            django.contrib.messages.success(
                request,
                "Group PID %s." % ("updated" if change else "created"))
    else:
        # The raw status string is logged; the user only sees a generic
        # error message.
        log.otherError(
            "admin.createOrUpdateGroupPid",
            Exception(
                "ezid.%s call failed: %s"
                % ("setMetadata" if change else "createIdentifier", r)))
        if request is not None:
            django.contrib.messages.error(
                request,
                "Error %s group PID." % ("updating" if change else "creating"))
def _linkcheckUpdateDaemon():
    """
    Daemon thread body: periodically brings SearchIdentifier.linkIsBroken
    in line with the link checker's results.

    Each cycle walks two _harvest() streams in parallel, one over
    SearchIdentifier and one over the LinkChecker rows whose numFailures
    is at least _notificationThreshold.  A search identifier is marked
    broken when it has a matching link checker row and unbroken when it
    does not.  The merge assumes both streams are ordered by identifier
    and yield None when exhausted -- TODO confirm against _harvest.

    The loop runs while _enabled is true and the current thread's name
    equals _threadName.  Errors are logged and the cycle is retried
    after the usual sleep.
    """
    if _resultsUploadSameTimeOfDay:
        django.db.connections["search"].close()
        time.sleep(_sameTimeOfDayDelta())
    else:
        # We arbitrarily sleep 10 minutes to avoid putting a burden on the
        # server near startup or reload.
        time.sleep(600)
    while _enabled and threading.currentThread().getName() == _threadName:
        start = time.time()
        try:
            siGenerator = _harvest(
                ezidapp.models.SearchIdentifier, ["identifier", "linkIsBroken"]
            )
            lcGenerator = _harvest(
                ezidapp.models.LinkChecker,
                ["identifier", "numFailures"],
                lambda lc: lc.numFailures >= _notificationThreshold,
            )
            si = next(siGenerator)
            lc = next(lcGenerator)
            while (
                si is not None
                and _enabled
                and threading.currentThread().getName() == _threadName
            ):
                # Skip link checker rows that sort before the current
                # search identifier.
                while lc is not None and lc.identifier < si.identifier:
                    lc = next(lcGenerator)
                newValue = None
                if lc is None or lc.identifier > si.identifier:
                    # No qualifying link checker row: the link is not
                    # (or is no longer) considered broken.
                    if si.linkIsBroken:
                        newValue = False
                else:
                    # Matching row: the link is considered broken.
                    if not si.linkIsBroken:
                        newValue = True
                    lc = next(lcGenerator)
                if newValue is not None:
                    # Before updating the SearchIdentifier, re-fetch it
                    # inside a transaction on the search database to
                    # ensure that the object still exists.
                    try:
                        with django.db.transaction.atomic(using="search"):
                            si2 = ezidapp.models.SearchIdentifier.objects.get(
                                identifier=si.identifier
                            )
                            si2.linkIsBroken = newValue
                            si2.computeHasIssues()
                            si2.save(update_fields=["linkIsBroken", "hasIssues"])
                    except ezidapp.models.SearchIdentifier.DoesNotExist:
                        pass
                si = next(siGenerator)
        except Exception as e:
            log.otherError("linkcheck_update._linkcheckUpdateDaemon", e)
        # Since we're going to be sleeping for potentially a long time,
        # release any memory held.
        siGenerator = lcGenerator = si = lc = si2 = None
        django.db.connections["search"].close()
        if _resultsUploadSameTimeOfDay:
            time.sleep(_sameTimeOfDayDelta())
        else:
            time.sleep(max(_resultsUploadCycle - (time.time() - start), 0))
def _backprocDaemon():
    """
    Registers the calling thread in _runningThreads and then waits for
    any previously registered daemon thread to go away.

    The wait ends when this thread is the only one registered, when
    _checkContinue() returns false, or when more than 60 seconds of
    waiting have accumulated; in the last case an error is logged and
    execution carries on.
    """
    _lock.acquire()
    try:
        logger.debug('Running background processing threads: count={}'.format(
            len(_runningThreads)))
        logger.debug('New thread: {}'.format(
            threading.currentThread().getName()))
        _runningThreads.add(threading.currentThread().getName())
        # NOTE(review): this logs the process-wide thread count, not
        # len(_runningThreads) -- confirm that is intended.
        logger.debug('New count: {}'.format(threading.active_count()))
    finally:
        _lock.release()
    # If we were started due to a reload, we wait for the previous
    # thread to terminate... but not forever.  60 seconds is arbitrary.
    totalWaitTime = 0
    try:
        while _checkContinue():
            _lock.acquire()
            try:
                n = len(_runningThreads)
            finally:
                _lock.release()
            if n == 1:
                break
            # Raised explicitly rather than asserted so that the timeout
            # still applies when Python runs with -O.
            if totalWaitTime > 60:
                raise AssertionError(
                    "new backproc daemon started before previous daemon "
                    "terminated")
            totalWaitTime += _idleSleep
            time.sleep(_idleSleep)
    except AssertionError as e:
        log.otherError("backproc._backprocDaemon", e)
def _daemonThread():
    """
    Daemon thread body for the batch download queue.

    Repeatedly takes the DownloadQueue entry with the lowest 'seq' and
    advances it by one processing stage, dispatching on the entry's
    'stage' field.  When the queue is empty or an error occurs, the
    thread closes its database connections and sleeps for _idleSleep
    seconds before trying again; after a successful step it continues
    immediately.  The loop ends when _AbortException is raised.
    """
    doSleep = True
    while True:
        if doSleep:
            django.db.connections["default"].close()
            django.db.connections["search"].close()
            time.sleep(_idleSleep)
        try:
            _checkAbort()
            head = list(
                ezidapp.models.DownloadQueue.objects.all().order_by("seq")[:1])
            if not head:
                doSleep = True
                continue
            r = head[0]
            _checkAbort()
            if r.stage == ezidapp.models.DownloadQueue.CREATE:
                _createFile(r)
            elif r.stage == ezidapp.models.DownloadQueue.HARVEST:
                _harvest(r)
            elif r.stage == ezidapp.models.DownloadQueue.COMPRESS:
                _compressFile(r)
            elif r.stage == ezidapp.models.DownloadQueue.DELETE:
                _deleteUncompressedFile(r)
            elif r.stage == ezidapp.models.DownloadQueue.MOVE:
                _moveCompressedFile(r)
            elif r.stage == ezidapp.models.DownloadQueue.NOTIFY:
                _notifyRequestor(r)
            else:
                # Raised explicitly rather than asserted: under -O an
                # assert would vanish and the loop would spin on the
                # same unprocessable entry without sleeping.
                raise AssertionError("unhandled case")
            doSleep = False
        except _AbortException:
            break
        except Exception as e:
            log.otherError("download._daemonThread", e)
            doSleep = True
def recomputeStatistics():
    """
    Recomputes and stores identifier statistics.  The old statistics
    are completely replaced.

    Non-test identifiers whose owner is a known SearchUser are counted
    per (creation month, owner, identifier type, hasMetadata).  The
    identifier table is read in pages of 1000 rows keyed on the
    identifier, and the Statistics table is rewritten in a single
    transaction.  Any exception is logged and swallowed.
    """
    try:
        # Owner id -> (user PID, group PID, realm name).
        users = {
            u.id: (u.pid, u.group.pid, u.realm.name)
            for u in ezidapp.models.SearchUser.objects.all().select_related(
                "group", "realm")
        }
        counts = {}
        lastIdentifier = ""
        while True:
            page = list(
                ezidapp.models.SearchIdentifier.objects.filter(
                    identifier__gt=lastIdentifier)
                .only("identifier", "owner_id", "createTime", "isTest",
                      "hasMetadata")
                .order_by("identifier")[:1000])
            if not page:
                break
            for ident in page:
                if not ident.isTest and ident.owner_id in users:
                    t = (_timestampToMonth(ident.createTime), ident.owner_id,
                         _identifierType(ident.identifier), ident.hasMetadata)
                    counts[t] = counts.get(t, 0) + 1
            # Resume the next page after the last identifier seen.
            lastIdentifier = page[-1].identifier
        with django.db.transaction.atomic():
            ezidapp.models.Statistics.objects.all().delete()
            for t, v in counts.items():
                owner, ownergroup, realm = users[t[1]]
                c = ezidapp.models.Statistics(
                    month=t[0], owner=owner, ownergroup=ownergroup,
                    realm=realm, type=t[2], hasMetadata=t[3], count=v)
                c.full_clean(validate_unique=False)
                c.save(force_insert=True)
    except Exception as e:
        log.otherError("stats.recomputeStatistics", e)
def withAutoReconnect(functionName, function, continuationCheck=None):
    """
    Calls 'function' and returns the result.  If an operational database
    error is encountered (e.g., a lost connection), the call is repeated
    until it succeeds.  'continuationCheck', if not None, should be
    another function that signals when the attempts should cease by
    raising an exception or returning False.  If 'continuationCheck'
    returns False, this function raises AbortException (defined in this
    module).  'functionName' is the name of 'function' for logging
    purposes.

    Exceptions other than django.db.OperationalError propagate to the
    caller unchanged.
    """
    firstError = True
    while True:
        try:
            return function()
        except django.db.OperationalError as e:
            # We're silent about the first error because it might simply be
            # due to the database connection having timed out.
            if not firstError:
                log.otherError(
                    "search_util.withAutoReconnect/" + functionName, e)
                time.sleep(_reconnectDelay)
            if continuationCheck is not None and not continuationCheck():
                raise AbortException()
            # In some cases a lost connection causes the thread's database
            # connection object to be permanently screwed up.  The following
            # call solves the problem.  (Note that Django's database
            # connection objects are indexed generically, but are stored
            # thread-local.)
            django.db.connections["search"].close()
            firstError = False
def clean(self):
    """
    Model validation hook: assigns a PID if the object does not have
    one yet.

    When self.pid is the empty string, a new identifier is formed from
    the agent shoulder's prefix followed by an identifier minted on
    that shoulder.  Any failure is logged and re-raised; an
    AssertionError is raised if the agent shoulder is not an ARK
    shoulder.
    """
    import log

    if self.pid == "":
        try:
            s = shoulder.getAgentShoulder()
            # Raised explicitly (not asserted) so the check survives -O.
            if not s.isArk:
                raise AssertionError("Agent shoulder type must be ARK")
            self.pid = "{}{}".format(s.prefix, minter.mint_id(s))
        except Exception as e:
            log.otherError("group.Group.clean", e)
            raise
def createOrUpdateUserPid(request, obj, change):
    """
    Creates or updates the identifier (PID) that mirrors user 'obj'.

    With 'change' true, ezid.setMetadata is applied to the existing
    identifier obj.pid; otherwise ezid.createIdentifier is used.  The
    user's fields are written as "ezid.user.*" metadata elements and
    the call is made with the user returned by models.getAdminUser().

    If 'request' is not None the outcome is reported through the Django
    messages framework.  A failed call is always logged.
    """
    import ezid
    import log

    if change:
        operation = ezid.setMetadata
    else:
        operation = ezid.createIdentifier
    pid = obj.pid
    adminUser = models.getAdminUser()
    metadata = {
        "_ezid_role": "user",
        "_export": "no",
        "_profile": "ezid",
        "ezid.user.username": obj.username,
        "ezid.user.group": "%s|%s " % (obj.group.groupname, obj.group.pid),
        "ezid.user.realm": obj.realm.name,
        "ezid.user.displayName": obj.displayName,
        "ezid.user.accountEmail": obj.accountEmail,
        "ezid.user.primaryContactName": obj.primaryContactName,
        "ezid.user.primaryContactEmail": obj.primaryContactEmail,
        "ezid.user.primaryContactPhone": obj.primaryContactPhone,
        "ezid.user.secondaryContactName": obj.secondaryContactName,
        "ezid.user.secondaryContactEmail": obj.secondaryContactEmail,
        "ezid.user.secondaryContactPhone": obj.secondaryContactPhone,
        "ezid.user.inheritGroupShoulders": str(obj.inheritGroupShoulders),
        # Space-separated shoulder prefixes.
        "ezid.user.shoulders": " ".join(
            s.prefix for s in obj.shoulders.all()),
        "ezid.user.crossrefEnabled": str(obj.crossrefEnabled),
        "ezid.user.crossrefEmail": obj.crossrefEmail,
        # Space-separated "username|pid" pairs.
        "ezid.user.proxies": " ".join(
            "%s|%s" % (u.username, u.pid) for u in obj.proxies.all()),
        "ezid.user.isGroupAdministrator": str(obj.isGroupAdministrator),
        "ezid.user.isRealmAdministrator": str(obj.isRealmAdministrator),
        "ezid.user.isSuperuser": str(obj.isSuperuser),
        "ezid.user.loginEnabled": str(obj.loginEnabled),
        "ezid.user.password": obj.password,
        "ezid.user.notes": obj.notes,
    }
    status = operation(pid, adminUser, metadata)

    if status.startswith("success:"):
        if request is not None:
            pastTense = "updated" if change else "created"
            django.contrib.messages.success(
                request, "User PID %s." % pastTense)
        return

    callName = "setMetadata" if change else "createIdentifier"
    log.otherError(
        "admin.createOrUpdateUserPid",
        Exception("ezid.%s call failed: %s" % (callName, status)),
    )
    if request is not None:
        gerund = "updating" if change else "creating"
        django.contrib.messages.error(
            request, "Error %s user PID." % gerund)
def clean(self):
    """
    Model validation hook: assigns a PID if the object does not have
    one yet.

    When self.pid is the empty string, a new identifier is formed as
    "ark:/" followed by an identifier minted from the agent shoulder's
    minter.  Any failure is logged and re-raised; an AssertionError is
    raised if the agent shoulder is not an ARK shoulder.
    """
    import log
    import noid_nog

    if self.pid == "":
        try:
            s = shoulder.getAgentShoulder()
            # Raised explicitly (not asserted) so the check survives -O.
            if not s.isArk:
                raise AssertionError("agent shoulder type must be ARK")
            self.pid = "ark:/" + noid_nog.getMinter(
                s.minter).mintIdentifier()
        except Exception as e:
            log.otherError("group.Group.clean", e)
            raise
def _workerThread(sh): # Sleep between 1x and 2x the idle sleep, to give the main daemon a # chance to load the row cache and to prevent the workers from # running synchronously. time.sleep(sh.idleSleep * (random.random() + 1)) while True: try: while True: rows = _nextUnprocessedLoadedRows(sh) if len(rows) > 0: break _sleep(sh) try: if len(rows) == 1: f = sh.functions["single"][rows[0].operation] f(sh, rows, rows[0].identifier, util.deblobify(rows[0].metadata)) else: f = sh.functions["batch"][rows[0].operation] f( sh, rows, [(r.identifier, util.deblobify(r.metadata)) for r in rows], ) except _AbortException: raise except Exception, e: # N.B.: on the assumption that the registrar-specific function # used callWrapper defined above, the error can only be # permanent. for r in rows: r.error = util.formatException(e) r.errorIsPermanent = True _checkAbort(sh) with django.db.transaction.atomic(): for r in rows: r.save() log.otherError("register_async._workerThread/" + sh.registrar, e) else: _checkAbort(sh) with django.db.transaction.atomic(): for r in rows: # Django "helpfully" sets seq, the primary key, to None # after deleting a row. But we need the seq value to # delete the row out of sh.loadedRows, ergo... t = r.seq r.delete() r.seq = t finally:
def _daemonThread(sh):
    """
    Main daemon loop for the asynchronous registration queue described
    by 'sh'.

    Alternates between two phases: polling _loadRows() until it reports
    that at least one row was loaded, then idling until
    _loadedRowsLength() drops back to zero.  _sleep() is called between
    polls.  The loop ends when _AbortException is raised inside the
    try block; any other exception is logged and followed by a sleep.
    """
    _sleep(sh)
    while True:
        try:
            # Phase 1: wait for rows to become available.
            while not (_loadRows(sh) > 0):
                _sleep(sh)
            # Phase 2: wait until the loaded rows have been consumed.
            while _loadedRowsLength(sh) > 0:
                _sleep(sh)
        except _AbortException:
            break
        except Exception as e:
            log.otherError("register_async._daemonThread/" + sh.registrar, e)
            _sleep(sh)
def _daemonThread():
    """
    Daemon thread body for the Crossref queue.

    Every cycle closes the thread's database connections, sleeps for
    _idleSleep seconds, and then scans the queue in 'seq' order:
    superseded entries are deleted, unsubmitted entries are deposited,
    and submitted entries are polled.  'maxSeq' remembers the highest
    'seq' seen by a scan that changed nothing, so that later cycles can
    be skipped cheaply while the queue is unchanged.  The loop ends
    when _AbortException is raised.
    """
    maxSeq = None
    while True:
        django.db.connections["default"].close()
        django.db.connections["search"].close()
        time.sleep(_idleSleep)
        try:
            _checkAbort()
            # First, a quick test to avoid retrieving the entire table if
            # nothing needs to be done.  Note that in the loop below, if
            # any entry is deleted or if any identifier is processed,
            # maxSeq is set to None, thus forcing another round of
            # processing.
            if maxSeq is not None and (
                _queue().objects.aggregate(
                    django.db.models.Max("seq"))["seq__max"] == maxSeq
            ):
                continue
            # Hopefully the queue will not grow so large that the
            # following query will cause a burden.
            query = _queue().objects.all().order_by("seq")
            n = len(query)
            maxSeq = query[n - 1].seq if n > 0 else None
            for entry in query:
                # If there are multiple entries for this identifier, we
                # are necessarily looking at the first, i.e., the
                # earliest, and the others must represent subsequent
                # modifications.  Hence we simply delete this entry
                # regardless of its status.
                sameIdentifier = _queue().objects.filter(
                    identifier=entry.identifier).count()
                if sameIdentifier > 1:
                    entry.delete()
                    maxSeq = None
                    continue
                if entry.status == ezidapp.models.CrossrefQueue.UNSUBMITTED:
                    _doDeposit(entry)
                elif entry.status == ezidapp.models.CrossrefQueue.SUBMITTED:
                    _doPoll(entry)
                else:
                    # Nothing to do for any other status.
                    continue
                maxSeq = None
        except _AbortException:
            break
        except Exception as e:
            log.otherError("crossref._daemonThread", e)
            maxSeq = None
def authenticate(username, password, request=None, coAuthenticate=True):
    """
    Authenticates a username and password.  Returns a StoreUser object
    if the authentication is successful, None if unsuccessful, or a
    string error message if an error occurs.  If 'request' is not None,
    the appropriate variables are added to the request session.  If
    'request' is not None and coAuthenticate is True, and if the user
    is an administrative user, the user is authenticated with the
    Django admin app as well.

    Easter egg: if the username has the form "@user" and the EZID
    administrator password is given, and if username "user" exists,
    then a StoreUser object for "user" is returned (even if logins are
    not enabled for the user).
    """
    # A leading "@" requests authentication as 'user' using the
    # administrator's password.
    sudo = username.startswith("@")
    if sudo:
        username = username[1:]
    username = username.strip()
    if username == "":
        return "error: bad request - username required"
    password = password.strip()
    if password == "":
        return "error: bad request - password required"

    user = ezidapp.models.getUserByUsername(username)
    if user is None or user.isAnonymous:
        return None

    if sudo:
        authenticated = ezidapp.models.getAdminUser().authenticate(password)
    else:
        authenticated = user.authenticate(password)
    if not authenticated:
        return None

    if request is not None:
        request.session[SESSION_KEY] = user.id
        # Add session variables to support the Django admin interface.
        if (
            coAuthenticate
            and not sudo
            and django.contrib.auth.models.User.objects.filter(
                username=username).exists()
        ):
            authUser = django.contrib.auth.authenticate(
                username=username, password=password)
            if authUser is not None:
                django.contrib.auth.login(request, authUser)
            else:
                # The EZID login succeeded but the Django admin login
                # did not; this is logged but does not fail the login.
                log.otherError(
                    "userauth.authenticate",
                    Exception(
                        "administrator password mismatch; run "
                        + "'django-admin ezidadminsetpassword' to correct"))
    return user
def _newsDaemon():
    """
    Daemon thread body: periodically refreshes the module-level _items
    from the feed at _url.

    Each cycle parses the feed and keeps the (title, link) pairs of up
    to the first three entries.  If the feed has no entries or an error
    occurs (which is logged), _noItems is used instead.  _items is only
    replaced, under _lock, if this thread is still the current daemon
    (its name equals _threadName).  The loop runs while _enabled is
    true and the thread is current, sleeping _pollingInterval seconds
    between cycles.
    """
    global _items
    while _enabled and threading.currentThread().getName() == _threadName:
        try:
            feed = feedparser.parse(_url)
            if len(feed.entries) > 0:
                items = [
                    (entry.title, entry.link) for entry in feed.entries[:3]
                ]
            else:
                items = _noItems
        except Exception as e:
            log.otherError("newsfeed._newsDaemon", e)
            items = _noItems
        _lock.acquire()
        try:
            # Re-check under the lock so a superseded thread does not
            # overwrite the items.
            if threading.currentThread().getName() == _threadName:
                _items = items
        finally:
            _lock.release()
        time.sleep(_pollingInterval)
def clean(self):
    """
    Model validation hook: checks that the user's realm matches the
    realm of the user's group, and assigns a PID if the object does not
    have one yet.

    Raises django.core.exceptions.ValidationError on a realm mismatch.
    When self.pid is the empty string, a new identifier is formed from
    the agent shoulder's prefix followed by an identifier minted on
    that shoulder; a failure there is logged and re-raised.
    """
    import log

    # The following two statements are here just to support the Django
    # admin app, which has its own rules about how model objects are
    # constructed.  If no group has been assigned, we can return
    # immediately because a validation error will already have been
    # triggered.
    if not hasattr(self, "group"):
        return
    if not hasattr(self, "realm"):
        self.realm = self.group.realm
    if self.realm != self.group.realm:
        raise django.core.exceptions.ValidationError(
            "User's realm does not match user's group's realm.")
    if self.pid == "":
        try:
            s = shoulder.getAgentShoulder()
            # Raised explicitly (not asserted) so the check survives -O.
            if not s.isArk:
                raise AssertionError("Agent shoulder type must be ARK")
            self.pid = "{}{}".format(s.prefix, nog.minter.mint_id(s))
        except Exception as e:
            log.otherError("user.User.clean", e)
            raise
def updateUserPids(request, users):
    """
    Pushes the shoulder and Crossref settings of each user in 'users'
    to that user's identifier (PID) via ezid.setMetadata.

    Every user is processed even if an earlier one fails; each failure
    is logged.  A single summary message (error if any call failed,
    success otherwise) is posted to 'request' through the Django
    messages framework.
    """
    import ezid
    import log

    anyFailed = False
    for user in users:
        pid = user.pid
        adminUser = models.getAdminUser()
        metadata = {
            # Space-separated shoulder prefixes.
            "ezid.user.shoulders": " ".join(
                s.prefix for s in user.shoulders.all()),
            "ezid.user.crossrefEnabled": str(user.crossrefEnabled),
            "ezid.user.crossrefEmail": user.crossrefEmail,
        }
        status = ezid.setMetadata(pid, adminUser, metadata)
        if status.startswith("success:"):
            continue
        anyFailed = True
        log.otherError(
            "admin.updateUserPids",
            Exception("ezid.setMetadata call failed: " + status))

    if not anyFailed:
        django.contrib.messages.success(request, "User PIDs updated.")
    else:
        django.contrib.messages.error(request, "Error updating user PIDs.")
def _backprocDaemon():
    """
    Registers the calling thread in _runningThreads and then waits for
    any previously registered daemon thread to go away.

    The wait ends when this thread is the only one registered, when
    _checkContinue() returns false, or when more than 60 seconds of
    waiting have accumulated; in the last case an error is logged and
    execution carries on.
    """
    _lock.acquire()
    try:
        _runningThreads.add(threading.currentThread().getName())
    finally:
        _lock.release()
    # If we were started due to a reload, we wait for the previous
    # thread to terminate... but not forever.  60 seconds is arbitrary.
    totalWaitTime = 0
    try:
        while _checkContinue():
            _lock.acquire()
            try:
                n = len(_runningThreads)
            finally:
                _lock.release()
            if n == 1:
                break
            # Raised explicitly rather than asserted so that the timeout
            # still applies when Python runs with -O.
            if totalWaitTime > 60:
                raise AssertionError(
                    "new backproc daemon started before previous daemon "
                    "terminated")
            totalWaitTime += _idleSleep
            time.sleep(_idleSleep)
    except AssertionError as e:
        log.otherError("backproc._backprocDaemon", e)
name=ns.name, minter=ns.minter, crossrefEnabled=(ns.get( "registration_agency", "") == "crossref")) if "datacenter" in ns: s.datacenter = datacenters[ns.datacenter] else: s.datacenter = None s.full_clean(validate_unique=False) s.save() shoulders[prefix] = s except Exception, e: # Log the error, but otherwise continue to run with the shoulders # and datacenters we have. log.otherError( "shoulder._reconcileShoulders", Exception("error %s external shoulder file: %s" % (stage, util.formatException(e)))) with django.db.transaction.atomic(): # In all cases, to fill the in-memory caches do fresh queries to # get proper dependent datacenter objects. _shoulders = dict((s.prefix, s) for s in Shoulder.objects.\ select_related("datacenter").all()) dc = dict((d.symbol, d) for d in\ store_datacenter.StoreDatacenter.objects.all()) _datacenters = (dc, dict((d.id, d) for d in dc.values())) def _lockAndLoad(f): # Decorator. def wrapped(*args, **kwargs): _lock.acquire()
requestTime=int(time.time()), rawRequest=request.urlencode(), requestor=requestor, format=_formatCode[format], compression=_compressionCode[compression], columns=_encode(columns), constraints=_encode(d), options=_encode(options), notify=_encode(notify), filename=filename, toHarvest=",".join(toHarvest)) r.save() return "success: %s/download/%s.%s" % (_ezidUrl, filename, _fileSuffix(r)) except Exception, e: log.otherError("download.enqueueRequest", e) return "error: internal server error" def getQueueLength(): """ Returns the length of the batch download queue. """ return ezidapp.models.DownloadQueue.objects.count() class _AbortException(Exception): pass def _checkAbort():
else: _checkAbort(sh) with django.db.transaction.atomic(): for r in rows: # Django "helpfully" sets seq, the primary key, to None # after deleting a row. But we need the seq value to # delete the row out of sh.loadedRows, ergo... t = r.seq r.delete() r.seq = t finally: _deleteLoadedRows(sh, rows) except _AbortException: break except Exception, e: log.otherError("register_async._workerThread/" + sh.registrar, e) _sleep(sh) def enqueueIdentifier(model, identifier, operation, blob): """ Adds an identifier to the asynchronous registration queue named by 'model'. 'identifier' should be the normalized, qualified identifier, e.g., "doi:10.5060/FOO". 'operation' is the identifier operation and should be one of the strings "create", "update", or "delete". 'blob' is the identifier's metadata dictionary in blob form. """ e = model( enqueueTime=int(time.time()), identifier=identifier,
def authenticate(username, password, request=None, coAuthenticate=True):
    """
    Authenticates a username and password.  Returns a StoreUser object
    if the authentication is successful, None if unsuccessful, or a
    string error message if an error occurs.  If 'request' is not None,
    the appropriate variables are added to the request session.  If
    'request' is not None and coAuthenticate is True, and if the user
    is an administrative user, the user is authenticated with the
    Django admin app as well.

    Easter egg: if the username has the form "@user" and the EZID
    administrator password is given, and if username "user" exists,
    then a StoreUser object for "user" is returned (even if logins are
    not enabled for the user).

    Progress is traced at debug level.  The user-identifying fields in
    those messages are literal asterisks in the message text, so only
    the non-identifying values (sudo flag, isAnonymous) are formatted
    in.
    """
    logger.debug('Authenticating user. username="******"')
    if username.startswith("@"):
        username = username[1:]
        sudo = True
        logger.debug('User is authenticating as an administrator')
    else:
        sudo = False
        logger.debug(
            'User is authenticating as a regular, non-privileged user')
    username = username.strip()
    if username == "":
        logger.debug('Auth failed due to missing username. username="******"')
        return "error: bad request - username required"
    password = password.strip()
    if password == "":
        logger.debug('Auth failed due to missing password. username="******"')
        return "error: bad request - password required"
    user = ezidapp.models.getUserByUsername(username)
    logger.debug('Username resolved. user="******"')
    if user is None or user.isAnonymous:
        # Only the isAnonymous value fills the single placeholder.
        logger.debug('Auth failed due unknown or anonymous user. '
                     'user="******" user.isAnonymous={}'.format(
                         None if not user else user.isAnonymous))
        return None
    if (sudo and ezidapp.models.getAdminUser().authenticate(password)) or (
            not sudo and user.authenticate(password)):
        logger.debug('Auth successful. user="******" sudo="{}"'.format(sudo))
        if request is not None:
            logger.debug('Auth in active request')
            request.session[SESSION_KEY] = user.id
            # Add session variables to support the Django admin interface.
            if (coAuthenticate and not sudo
                    and django.contrib.auth.models.User.objects.filter(
                        username=username).exists()):
                authUser = django.contrib.auth.authenticate(
                    username=username, password=password)
                if authUser is not None:
                    django.contrib.auth.login(request, authUser)
                else:
                    # The EZID login succeeded but the Django admin
                    # login did not; logged, but the login still
                    # succeeds.
                    log.otherError(
                        "userauth.authenticate",
                        Exception(
                            "administrator password mismatch; run " +
                            "'django-admin ezidadminsetpassword' to correct"),
                    )
        else:
            logger.debug('Auth without an active request')
        return user
    else:
        logger.debug('Auth failed. username="******" sudo="{}"'.format(sudo))
        return None
blob, ) elif update_model.actualObject.isCrossref: crossref.enqueueIdentifier( update_model.identifier, update_model.get_operation_display(), metadata, blob, ) update_model.delete() else: django.db.connections["default"].close() django.db.connections["search"].close() time.sleep(_idleSleep) except Exception, e: log.otherError("backproc._backprocDaemon", e) django.db.connections["default"].close() django.db.connections["search"].close() time.sleep(_idleSleep) _lock.acquire() try: _runningThreads.remove(threading.currentThread().getName()) finally: _lock.release() def loadConfig(): global _enabled, _idleSleep, _threadName _enabled = (django.conf.settings.DAEMON_THREADS_ENABLED and config.get("daemons.backproc_enabled").lower() == "true") if _enabled:
"unexpected return from metadata submission: " + r except urllib2.HTTPError, e: if e.fp != None: try: m = e.fp.read() except Exception: pass else: if not e.msg.endswith("\n"): e.msg += "\n" e.msg += m raise e finally: if c: c.close() except Exception, e: log.otherError("crossref._submitDeposit", _wrapException("error submitting deposit, doi %s, batch %s" %\ (doi, batchId), e)) return False else: return True def _pollDepositStatus(batchId, doi): """ Polls the status of the metadata submission identified by 'batchId'. 'doi' is the identifier in question. The return is one of the tuples: ("submitted", message) 'message' further indicates the status within Crossref, e.g., "in_process". The status may also be, somewhat confusingly,
def _statusDaemon():
    """
    Daemon thread body: periodically logs a one-line server status
    record and, if _cloudwatchEnabled is set, publishes the same
    figures as CloudWatch metrics.

    Each cycle gathers user/operation counts and queue lengths from the
    other modules, reads and resets the operation counter, and calls
    log.status().  CloudWatch publication is best-effort: any exception
    it raises is ignored.  Other errors are logged.  The loop runs
    while _enabled is true and the current thread's name equals
    _threadName, sleeping _reportingInterval seconds between cycles.
    """
    while _enabled and threading.currentThread().getName() == _threadName:
        try:
            activeUsers, waitingUsers, isPaused = ezid.getStatus()
            na = sum(activeUsers.values())
            nw = sum(waitingUsers.values())
            ndo = datacite.numActiveOperations()
            uql = ezidapp.models.UpdateQueue.objects.count()
            bql = binder_async.getQueueLength()
            daql = datacite_async.getQueueLength()
            cqs = crossref.getQueueStatistics()
            doql = download.getQueueLength()
            as_ = search_util.numActiveSearches()
            no = log.getOperationCount()
            log.resetOperationCount()
            log.status(
                "pid=%d" % os.getpid(),
                "threads=%d" % threading.activeCount(),
                "paused" if isPaused else "running",
                "activeOperations=%d%s"
                % (na, _formatUserCountList(activeUsers)),
                "waitingRequests=%d%s"
                % (nw, _formatUserCountList(waitingUsers)),
                "activeDataciteOperations=%d" % ndo,
                "updateQueueLength=%d" % uql,
                "binderQueueLength=%d" % bql,
                "dataciteQueueLength=%d" % daql,
                "crossrefQueue:archived/unsubmitted/submitted=%d/%d/%d"
                % (cqs[2] + cqs[3], cqs[0], cqs[1]),
                "downloadQueueLength=%d" % doql,
                "activeSearches=%d" % as_,
                "operationCount=%d" % no)
            if _cloudwatchEnabled:
                import boto3
                # Disable annoying boto3 logging.
                logging.getLogger("botocore").setLevel(logging.ERROR)
                try:
                    c = boto3.client(
                        "cloudwatch", region_name=_cloudwatchRegion)
                    d = [{"Name": "InstanceName",
                          "Value": _cloudwatchInstanceName}]
                    data = {
                        "ActiveOperations": na,
                        "WaitingRequests": nw,
                        "ActiveDataciteOperations": ndo,
                        "UpdateQueueLength": uql,
                        "BinderQueueLength": bql,
                        "DataciteQueueLength": daql,
                        "CrossrefQueueLength": cqs[0] + cqs[1],
                        "DownloadQueueLength": doql,
                        "ActiveSearches": as_,
                        # Operations per second over the reporting
                        # interval.
                        "OperationRate": float(no) / _reportingInterval,
                    }
                    r = c.put_metric_data(
                        Namespace=_cloudwatchNamespace,
                        MetricData=[
                            {
                                "MetricName": k,
                                "Dimensions": d,
                                "Value": float(v),
                                "Unit": "Count/Second"
                                if k == "OperationRate" else "Count",
                            }
                            for k, v in data.items()
                        ])
                    assert r["ResponseMetadata"]["HTTPStatusCode"] == 200
                except Exception:
                    # Ignore CloudWatch exceptions, as it's not essential.
                    pass
        except Exception as e:
            log.otherError("status._statusDaemon", e)
        django.db.connections["default"].close()
        time.sleep(_reportingInterval)