def processBadAssignedWorkers(self):
    """Reconnect users that are assigned to a worker nobody has registered.

    A worker name is treated as "bad" when it is returned by
    ``GoogleUser.get_assigned_workers()`` but appears in neither the running
    nor the death worker registry. For every bad worker, each user returned
    by ``GoogleUser.get_connected`` gets its ``worker`` attribute set to its
    own ``userid``, is saved, and has ``self.reloginUser`` scheduled on the
    reactor. A bad worker with no connected users is removed immediately via
    ``self.removeWorker``.

    This is a generator: each ``yield`` hands an asynchronous result to
    whatever drives it (presumably Twisted's ``inlineCallbacks`` -- the
    decorator is not visible here, confirm at the definition site).
    """

    def names_of(entries):
        # Keep only the entries that actually carry a name.
        candidates = (entry.get('name') for entry in entries)
        return [candidate for candidate in candidates
                if candidate is not None]

    assigned = yield GoogleUser.get_assigned_workers()

    running_entries = yield Worker.getWorkers(Worker.redis_workers_keys)
    running_names = names_of(running_entries)

    dead_entries = yield Worker.getWorkers(Worker.redis_death_workers_keys)
    dead_names = names_of(dead_entries)

    # Anything assigned to users but unknown to both registries is bad.
    known_names = set(running_names + dead_names)
    orphaned = set(assigned) - known_names

    if orphaned:
        self.log.warning(
            'BAD_WORKERS %s are assigned to users. Running %s Death %s',
            orphaned, len(running_names), len(dead_names))

    for worker_name in orphaned:
        users = yield GoogleUser.get_connected(worker_name=worker_name)
        total = len(users)

        if total == 0:
            # Remove worker and queue when no users were assigned.
            yield self.removeWorker(worker_name)
            continue

        self.log.info(
            'Reconnecting %s users assigned to bad worker %s',
            total, worker_name)

        for position, data in enumerate(users, 1):
            try:
                user = GoogleUser(**data)
                # Point the user at its own id instead of the bad worker.
                user.worker = user.userid
                yield user.save()
                # The trailing flag is True only for the last user of this
                # worker; what reloginUser does with it is defined elsewhere.
                reactor.callLater(0, self.reloginUser, user, worker_name,
                                  position == total)
                self.log.info(
                    '[%s] Reconnecting %s/%s user(s) of worker %s',
                    user.userid, position, total, worker_name)
            except Exception as e:
                # Best effort: log the failure and go on with the next user.
                self.log.err(
                    e,
                    '[%s] Exception while reconnecting' % (data['_userid']))
def processDeathWorkers(self):
    """Move the users of every death worker away from it and clean it up.

    Death workers are only fetched while ``self.running`` is truthy;
    otherwise an empty list is used. For each death worker:

    * if ``conf.DIST_QUEUE_LOGIN`` is not among its ``queues``, the worker is
      removed straight away through ``self.removeWorker``;
    * otherwise each user returned by ``GoogleUser.get_connected`` gets its
      ``worker`` attribute set to its own ``userid``, is saved, and has
      ``self.reloginUser`` scheduled on the reactor. When there are no
      connected users the worker is removed immediately.

    This is a generator: each ``yield`` hands an asynchronous result to
    whatever drives it (presumably Twisted's ``inlineCallbacks`` -- the
    decorator is not visible here, confirm at the definition site).
    """
    # Avoid processing death workers when the service is not running. The
    # value is yielded in both cases, so the driver always sees one yield.
    if self.running:
        pending = Worker.getWorkers(Worker.redis_death_workers_keys)
    else:
        pending = []
    death_workers = yield pending

    if not death_workers:
        return

    self.log.info('DEATH_WORKERS %s',
                  [entry.get('name') for entry in death_workers])

    for entry in death_workers:
        name = entry.get('name')

        if conf.DIST_QUEUE_LOGIN not in entry.get('queues', []):
            yield self.removeWorker(name)
            continue

        users = yield GoogleUser.get_connected(name)
        total = len(users)
        self.log.info(
            'Reconnecting %s connected user(s) of death worker %s',
            total, name)

        for position, data in enumerate(users, 1):
            try:
                user = GoogleUser(**data)
                # Update worker as userid to enqueue new jobs in the user's
                # own queue.
                user.worker = user.userid
                yield user.save()
                # The trailing flag is True only for the last user of this
                # worker; what reloginUser does with it is defined elsewhere.
                reactor.callLater(0, self.reloginUser, user, name,
                                  position == total)
                self.log.info(
                    '[%s] Reconnecting %s/%s user(s) of worker %s',
                    user.userid, position, total, name)
            except Exception as e:
                # Best effort: log the failure and go on with the next user.
                self.log.err(
                    e,
                    '[%s] Exception while reconnecting' % (data['_userid']))

        # Remove worker and queue when no users were assigned.
        if total == 0:
            yield self.removeWorker(name)
def checkRunningWorkers(self):
    """Register as dead every running worker whose heartbeat is too old.

    Each entry returned by ``Worker.getWorkers(Worker.redis_workers_keys)``
    must provide ``name``, ``key`` and ``lastTime``; entries missing any of
    them are logged as ``WORKER_DATA_WRONG`` and skipped. Entries that
    already carry a ``death`` value are left alone. For the rest, the time
    elapsed between ``lastTime`` and the current UTC time is compared with
    ``conf.SUPERVISOR_WORKER_REFRESH_TIME``; when it is exceeded a ``Worker``
    is built for that name and ``register_death()`` is called on it.

    This is a generator: each ``yield`` hands an asynchronous result to
    whatever drives it (presumably Twisted's ``inlineCallbacks`` -- the
    decorator is not visible here, confirm at the definition site).
    """
    workers = yield Worker.getWorkers(Worker.redis_workers_keys)
    if not workers:
        return

    self.log.info('CHECKING_RUNNING_WORKERS %s', len(workers))

    for worker in workers:
        name = worker.get('name')
        key = worker.get('key')
        lastTime = worker.get('lastTime')
        if key is None or name is None or lastTime is None:
            self.log.warning('WORKER_DATA_WRONG %s', worker)
            continue

        if worker.get('death') is not None:
            # Death already recorded for this worker.
            continue

        # NOTE(review): assumes lastTime parses to a naive UTC datetime, as
        # it is subtracted from datetime.utcnow() -- confirm with the writer.
        delta = datetime.utcnow() - parser.parse(lastTime)

        # Use total_seconds(), not .seconds: .seconds drops the days part
        # (a worker silent for 1 day + 5 s would look 5 s stale) and wraps to
        # a large positive value for negative deltas (timestamp slightly in
        # the future), which would wrongly flag a healthy worker as dead.
        elapsed = int(delta.total_seconds())
        if elapsed > conf.SUPERVISOR_WORKER_REFRESH_TIME:
            self.log.warning(
                'REGISTERING_WORKER_DEATH %s has not been updated since %s second(s)',
                name, elapsed)
            w = Worker([], name=name)
            w.log = self.log
            yield w.register_death()