class UIDStack(object):
    """LIFO stack of (uid, metadata) pairs backed by gevent.queue.LifoQueue.

    The metadata member of a pair may be None.

    NOTE(review): an identical ``UIDStack`` class is declared a second time
    later in this file; the later definition shadows this one at import time.
    """

    def __init__(self):
        self._stack = LifoQueue()

    def empty(self):
        """Return True when no entries remain."""
        return self._stack.empty()

    def get(self):
        """Pop and return the most recently pushed pair without blocking."""
        return self._stack.get_nowait()

    def peek(self):
        """Return the top pair without removing it.

        Reaches into the underlying queue list because LifoQueue.peek_nowait()
        is buggy in the pinned gevent release (fixed in gevent 1.0.2).
        """
        return self._stack.queue[-1]

    def put(self, uid, metadata):
        """Push a (uid, metadata) pair onto the stack."""
        self._stack.put((uid, metadata))

    def discard(self, objects):
        """Remove every stacked pair that appears in ``objects``."""
        kept = [entry for entry in self._stack.queue if entry not in objects]
        self._stack.queue = kept

    def qsize(self):
        """Return the number of stacked pairs."""
        return self._stack.qsize()

    def __iter__(self):
        """Iterate pairs in internal (bottom-to-top) storage order."""
        return iter(self._stack.queue)
class UIDStack(object):
    """Convenience LIFO wrapper over gevent.queue.LifoQueue.

    Entries are (uid, metadata) tuples; metadata may be None.

    NOTE(review): this is a duplicate of the ``UIDStack`` declared earlier in
    this file — this second definition is the one that survives import.
    """

    def __init__(self):
        self._lifoqueue = LifoQueue()

    def empty(self):
        # True when the stack holds nothing.
        return self._lifoqueue.empty()

    def get(self):
        # Non-blocking pop of the most recent entry.
        return self._lifoqueue.get_nowait()

    def peek(self):
        # LifoQueue.peek_nowait() is broken in the current gevent; read the
        # top of the backing list directly. Revisit once on gevent >= 1.0.2.
        top = self._lifoqueue.queue[-1]
        return top

    def put(self, uid, metadata):
        # Store uid together with its (possibly None) metadata.
        pair = (uid, metadata)
        self._lifoqueue.put(pair)

    def discard(self, objects):
        # Rebuild the backing storage, dropping anything listed in `objects`.
        survivors = []
        for pair in self._lifoqueue.queue:
            if pair not in objects:
                survivors.append(pair)
        self._lifoqueue.queue = survivors

    def qsize(self):
        # Current number of stacked entries.
        return self._lifoqueue.qsize()

    def __iter__(self):
        # Yield entries in storage order (bottom first).
        for pair in self._lifoqueue.queue:
            yield pair
class Stack(object):
    """Keyed LIFO stack on top of gevent.queue.LifoQueue.

    Elements are kept so that the one with the greatest ``key`` value sits on
    top: initial elements and bulk updates are sorted ascending by ``key``
    before being stacked, so pops come back largest-key first.
    """

    def __init__(self, key, initial_elements=None):
        self.key = key
        self._lifoqueue = LifoQueue()
        if initial_elements is not None:
            # Ascending sort => largest key ends up on top of the LIFO.
            self._lifoqueue.queue = sorted(list(initial_elements), key=self.key)

    def empty(self):
        """Return True when the stack is empty."""
        return self._lifoqueue.empty()

    def get(self):
        """Pop and return the top element without blocking."""
        return self._lifoqueue.get_nowait()

    def peek(self):
        """Return the top element without removing it.

        Direct access to the backing list works around the broken
        LifoQueue.peek_nowait() in the pinned gevent (fixed in 1.0.2).
        """
        return self._lifoqueue.queue[-1]

    def put(self, obj):
        """Push a single element onto the stack."""
        self._lifoqueue.put(obj)

    def update_from(self, objects):
        """Push every element of ``objects``, smallest key first."""
        ordered = sorted(list(objects), key=self.key)
        for element in ordered:
            self._lifoqueue.put(element)

    def discard(self, objects):
        """Drop every stacked element that appears in ``objects``."""
        remaining = [element for element in self._lifoqueue.queue
                     if element not in objects]
        self._lifoqueue.queue = remaining

    def qsize(self):
        """Return the number of stacked elements."""
        return self._lifoqueue.qsize()

    def __iter__(self):
        """Iterate elements in storage order (bottom first)."""
        return iter(self._lifoqueue.queue)
class ClientPool(object):
    """Fixed-size pool of lazily-created clients with idle-expiry GC.

    The pool is a LIFO of ClientHolder slots. Slots start empty; a client is
    created on first checkout. A background ScheduleTask periodically closes
    clients that have been idle longer than ``_client_expire_time`` seconds.

    :param pool_name: label used in logs and the GC task name.
    :param pool_size: number of holder slots (must be > 0).
    :param client_class: callable that builds a new client.
    :param close_client_handler: callable invoked with a client to close it,
        or None when clients need no explicit close.
    :param client_args/client_kwargs: forwarded to ``client_class``.
    """

    # Seconds a checked-in client may sit idle before the GC closes it.
    DEFAULT_CLIENT_EXPIRE_TIME = 300
    # Seconds between GC sweeps.
    DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL = 60

    def __init__(self, pool_name, pool_size, client_class,
                 close_client_handler, *client_args, **client_kwargs):
        assert pool_size > 0
        assert client_class is not None and hasattr(client_class, '__call__')
        assert close_client_handler is None or hasattr(close_client_handler,
                                                       '__call__')
        self._pool_name = pool_name
        self._pool_size = pool_size
        self._client_class = client_class
        self._close_client_handler = close_client_handler
        self._client_args = client_args
        self._client_kwargs = client_kwargs
        # Pre-fill the pool with empty holders; clients are created lazily.
        self._queue = LifoQueue(maxsize=pool_size)
        for _ in range(pool_size):
            self._queue.put(ClientHolder())
        self._client_expire_time = self.DEFAULT_CLIENT_EXPIRE_TIME
        self._gc_task = ScheduleTask(
            name='ClientPool-GC-%s' % pool_name,
            start_after_seconds=0,
            interval_seconds=self.DEFAULT_CLOSE_EXPIRE_CLIENT_INTERVAL,
            handler=self._close_expire_client)
        self._gc_task.run()

    def __del__(self):
        self._gc_task.stop()

    @contextmanager
    def get_client(self, block=True, pool_acquire_client_timeout=1,
                   req_timeout=5):
        """Context manager yielding a client, bounded by ``req_timeout``.

        On timeout (or any other exception) the underlying client is closed;
        either way the holder is returned to the pool.
        """
        client_holder = self._get_client(block, pool_acquire_client_timeout)
        tm = None
        try:
            tm = gevent.Timeout.start_new(req_timeout)
            yield client_holder.get_client()
        except BaseException as e:
            logger.error(
                'Client is out pool for too long %s seconds, raise exception: %s',
                req_timeout, e)
            # The client may be in an unknown state; drop it so the next
            # checkout builds a fresh one.
            self._close_client(client_holder)
            raise
        finally:
            if tm:
                tm.cancel()
            self.push(client_holder)

    def _get_client(self, block=True, timeout=1):
        """Check a holder out of the pool, creating its client if needed.

        Raises whatever ``_queue.get`` raises when no holder is available
        within ``timeout``, or whatever client construction raises (the empty
        holder is returned to the pool first in that case).
        """
        if self.is_empty():
            logger.info('ClientPool: %s is empty.', self._pool_name)
        client_holder = self._queue.get(block=block, timeout=timeout)
        if client_holder.get_client() is None:
            tm = None
            try:
                tm = gevent.Timeout.start_new(timeout)
                client_holder.set_client(self._create_client())
            except BaseException:
                # Creation failed: give the (empty) holder back and re-raise.
                client_holder.set_client(None)
                self.push(client_holder)
                raise
            finally:
                if tm:
                    tm.cancel()
        client_holder.set_access_time(time.time())
        return client_holder

    def push(self, client_holder):
        """Return a holder to the pool (dropped silently if already full)."""
        if not self.is_full():
            self._queue.put_nowait(client_holder)

    def is_full(self):
        return self._queue.qsize() >= self._pool_size

    def is_empty(self):
        return self._queue.qsize() <= 0

    def _create_client(self):
        return self._client_class(*self._client_args, **self._client_kwargs)

    def _close_client(self, client_holder):
        """Close the holder's client (best effort) and empty the holder."""
        if self._close_client_handler and client_holder.get_client():
            try:
                self._close_client_handler(client_holder.get_client())
            except Exception as e:
                logger.error('Close client raise exception: %s', e)
        client_holder.set_client(None)

    def _close_expire_client(self):
        """GC sweep: close clients idle longer than the expiry window.

        Bug fix: the original appended the *bound method*
        ``client_holder.get_client`` (missing call parentheses) and then
        invoked ``self._close_client_handler`` on it directly — which also
        crashed when the handler was None and never reset the holder. Collect
        the expired holders instead and route them through ``_close_client``,
        which guards a None handler and empties the holder.
        """
        cur_time = time.time()
        expired_holders = [
            holder for holder in self._queue.queue
            if holder.get_client() is not None
            and cur_time - holder.get_access_time() > self._client_expire_time
        ]
        for holder in expired_holders:
            self._close_client(holder)
class BaseProcessor(LoggerMixin):
    """Base class for gevent-driven task processors.

    Subclasses implement ``populate_tasks`` to feed callables into a shared
    LIFO task queue; ``run`` starts a pool of workers, a heartbeat logger, and
    blocks until the queue drains and every worker goes idle.
    """

    name = 'base-processor'

    @classmethod
    def from_engine(cls, engine, *args, **kwargs):
        # Alternate constructor: build a processor bound to `engine`.
        return cls(engine, *args, **kwargs)

    def _request(self):
        # Delegate HTTP access to the owning engine.
        return self.engine.request

    request = property(_request)

    def __init__(self, engine, *args, **kwargs):
        from time import time
        from hashlib import md5
        from threading import Lock
        from gevent.queue import LifoQueue

        # Unique-ish name: processor name + 6 hex chars hashed from the clock.
        # NOTE(review): md5(str(...)) without .encode() is Python 2 only.
        self.processor_name = '%s:%s' % (self.name, md5(str(
            time())).hexdigest()[:6])
        LoggerMixin.__init__(self)

        self.engine = engine
        self.__redis = None
        self.redis_lock = Lock()
        self.progress = 0
        self.total = 0
        # Count of items skipped rather than processed.
        self.bypassed_cnt = 0
        # When the task queue exceeds this size, add_task pauses (flow control).
        self.maxsize = 1000
        self.tasks = LifoQueue()
        self.workers = []
        # Default polling interval is 1 second.
        self.polling_interval = 1

        import argparse
        arg_parser = argparse.ArgumentParser()
        # Worker concurrency; overrides the value in dhaulagiri_settings.
        arg_parser.add_argument('--concur', type=int)
        args, leftover = arg_parser.parse_known_args()
        from core import dhaulagiri_settings
        if args.concur:
            dhaulagiri_settings['core']['concur'] = args.concur
        self.concur = dhaulagiri_settings['core']['concur']

        # Checkpoints used by the heartbeat to compute a processing rate.
        self.checkpoint_ts = None
        self.checkpoint_prog = None
        self.init_ts = time()
        # Heartbeat greenlet (spawned in _start_workers).
        self.heart_beat = None
        # Worker monitor: worker name -> timestamp of its last status update.
        self.worker_monitor = {}

    def update_worker_status(self, worker):
        """Record a liveness timestamp for `worker` in the monitor."""
        from time import time
        name = worker.worker_name
        self.worker_monitor[name] = time()

    def get_worker_stat(self):
        """Split monitored workers into active vs zombie.

        A worker with no status update in the last `time_window` seconds is
        considered a zombie. Returns {'zombie': {...}, 'active': {...}}.
        """
        from time import time
        # NOTE(review): comment said 60s but the code uses 90 — confirm which.
        time_window = 90
        cur = time()
        active = dict(
            filter(lambda item: item[1] >= cur - time_window,
                   self.worker_monitor.items()))
        zombie = dict(
            filter(lambda item: item[1] < cur - time_window,
                   self.worker_monitor.items()))
        return {'zombie': zombie, 'active': active}

    def incr_progress(self):
        # Bump the processed-item counter (called by workers).
        self.progress += 1

    def _start_workers(self):
        """Spawn the heartbeat greenlet and `self.concur` workers."""

        def timer():
            """Every 30 seconds, log progress, rate, and worker health."""
            import time
            while True:
                msg = 'Progress: %d / %d.' % (self.progress, self.total)
                cts = time.time()
                if self.checkpoint_prog is not None and self.checkpoint_ts is not None:
                    # Items per minute since the last heartbeat tick.
                    rate = (self.progress - self.checkpoint_prog) / (
                        cts - self.checkpoint_ts) * 60
                    msg = '%s %s' % (msg,
                                     'Processing rate: %d items/min' % int(rate))
                self.checkpoint_ts = cts
                self.checkpoint_prog = self.progress
                # Append worker-monitor statistics.
                stat = self.get_worker_stat()
                msg += ', active workers: %d, zombie workers: %d' % (len(
                    stat['active']), len(stat['zombie']))
                self.log(msg)
                gevent.sleep(30)

        self.heart_beat = gevent.spawn(timer)
        # NOTE(review): SIGKILL cannot be caught or handled by a process;
        # installing a handler for it normally raises — confirm whether this
        # line ever works and whether SIGTERM was intended.
        gevent.signal(signal.SIGKILL, gevent.kill)
        gevent.signal(signal.SIGQUIT, gevent.kill)
        for i in xrange(self.concur):
            worker = Worker.from_processor(self, self.tasks)
            self.workers.append(worker)

    def add_task(self, task, *args, **kwargs):
        """Queue `task(*args, **kwargs)` as a deferred callable.

        Applies flow control: while the queue holds more than `self.maxsize`
        entries, sleeps `polling_interval` before retrying.
        """
        # Whether flow control is enabled.
        flow_control = True
        while flow_control:
            # Too many pending tasks: pause before adding more.
            if self.tasks.qsize() > self.maxsize:
                gevent.sleep(self.polling_interval)
            else:
                break
        func = lambda: task(*args, **kwargs)
        # Propagate the task's key (if any) onto the wrapper for logging.
        task_key = getattr(task, 'task_key', None)
        if task_key:
            setattr(func, 'task_key', task_key)
        self.tasks.put(func, timeout=120)
        self.logger.debug(
            'New task%s added to the queue. Remaining: %d' %
            ('(%s)' % task_key if task_key else '', self.tasks.qsize()))
        # Yield to the hub so workers get a chance to run.
        gevent.sleep(0)

    def _wait_for_workers(self):
        """Block until all workers finish, then kill them and the heartbeat.

        Completion criterion: the task queue is empty AND every worker reports
        idle. Polls at `polling_interval` until both hold.
        """
        while True:
            if not self.tasks.empty():
                gevent.sleep(self.polling_interval)
                continue
            completed = True
            for w in self.workers:
                if not w.idle:
                    gevent.sleep(self.polling_interval)
                    completed = False
                    break
            if completed:
                break
        gevent.killall([w.gevent for w in self.workers])
        gevent.kill(self.heart_beat)

    def run(self):
        """Main entry point: start workers, populate tasks, wait, log summary."""
        self._start_workers()
        self.populate_tasks()
        self._wait_for_workers()
        import time
        self.log(
            'Processor ended: %d items processed(%d bypassed) in %d minutes' %
            (self.progress, self.bypassed_cnt,
             int((time.time() - self.init_ts) / 60.0)))

    def populate_tasks(self):
        # Subclasses must override to feed tasks via add_task().
        raise NotImplementedError