def __init__(self, index):
    super(GetWorker, self).__init__()
    self.index = index
    self._batch_param = {}
    self.manager = RedisManager(RECORD_REDIS, QUEUE_REDIS, CACHE_REDIS)
    # Rebuild the distributed queue for every batch that has not finished
    # yet, so this worker resumes where the previous instance stopped.
    for batch_id in Record.instance().get_unfinished_batch():
        parameter = Record.instance().get_parameter(batch_id)
        total_count = Record.instance().get_total_number(batch_id)
        if total_count is None:
            continue
        self.manager.worker_init_distributed_queue(batch_id, int(total_count))
        self._batch_param[batch_id] = parameter
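A minimal usage sketch: the super() call suggests GetWorker subclasses a Thread/Process-style class with start()/join(), and index looks like a worker slot number. Both points are assumptions, not confirmed by this snippet.

# Hypothetical usage sketch: spawn a few workers and wait for them.
# Assumes GetWorker subclasses threading.Thread (or similar); the worker
# count of 4 is arbitrary.
workers = [GetWorker(index=i) for i in range(4)]
for w in workers:
    w.start()    # each worker resumes every unfinished batch in __init__
for w in workers:
    w.join()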
def run(self, *args, **kwargs):
    """
    end, background_cleaning, status:
    None, None, begin
    None, 1,    begin cleaning
    None, 0,    finish cleaning
    time, 0,    begin delete
    time, None, finish delete
    None, 0,    finish cleaning then exception
    """
    for batch_id, queue_dict in self.manager.get_queue_with_priority():
        queue = queue_dict['queue']
        if Record.instance().is_finished(batch_id) is True:
            # This queue and its entry in distributed_queues could be
            # deleted here, but that is not needed: once run() and
            # background_cleaning() finish, this process ends.
            # BTW, the worker instance reboots every 10 minutes.
            # If another worker deletes the queue, nothing needs to be
            # done here.
            continue
        background = queue.get_background_cleaning_status()
        if background is None:
            self.work(batch_id, queue_dict, *args, **kwargs)
        elif background == '1':
            self.work(batch_id, queue_dict, *args, **kwargs)
        elif background == '0':
            pass
def run(self, *args, **kwargs):
    """
    end, background_cleaning, status:
    None, None, begin
    None, 1,    begin cleaning
    None, 0,    finish cleaning
    time, 0,    begin delete
    time, None, finish delete
    None, 0,    finish cleaning then exception
    """
    checkout_cache = {}
    for batch_id, queue_dict in self.manager.get_queue_with_priority():
        queue = queue_dict['queue']
        if Record.instance().is_finished(batch_id) is True:
            # This queue and its entry in distributed_queues could be
            # deleted here, but that is not needed: once run() and
            # background_cleaning() finish, this process ends.
            # BTW, the worker instance reboots every 10 minutes.
            # If another worker deletes the queue, nothing needs to be
            # done here.
            continue
        background = queue.get_background_cleaning_status()
        if background == '0':
            pass
        elif background is None or background == '1':
            checkout_cache[batch_id] = queue_dict

    today_str = datetime.now().strftime('%Y%m%d')
    while len(checkout_cache) > 0:
        removes = []
        batch_urlid = {}
        # Check out one chunk of url_ids from every active batch queue.
        for batch_id, queue_dict in checkout_cache.iteritems():
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                'begin get items from queue')
            results = queue_dict['queue'].get(block=True, timeout=3,
                                              interval=1)
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                'finish get items from queue')
            if not results:
                removes.append(batch_id)
                continue
            batch_urlid[batch_id] = results
        for batch_id in removes:
            checkout_cache.pop(batch_id)

        # Download and process in round-robin order: one url_id per batch
        # per pass, so a slow batch cannot starve the others.
        while len(batch_urlid) > 0:
            removes = []
            for batch_id, results in batch_urlid.iteritems():
                if len(results) > 0:
                    url_id = results.pop()
                    other_batch_process_time = self.get_other_batch_process_time(
                        set(batch_urlid.keys()) - set([batch_id]))
                    start = time.time()
                    self.work(batch_id, checkout_cache[batch_id], url_id,
                              other_batch_process_time, *args, **kwargs)
                    self.update_process_time_of_this_batch(batch_id, start)
                else:
                    removes.append(batch_id)
            for batch_id in removes:
                batch_urlid.pop(batch_id)
def get_status(batch_id):
    # Cache the Queue on the function itself; only its redis connection is
    # reused on later calls, since scard/hgetall are keyed by batch_id.
    if not hasattr(get_status, '_queue'):
        setattr(get_status, '_queue', Queue(batch_id))
    print('job {} has {} urls remaining to crawl'.format(
        batch_id, get_status._queue.conn.scard(batch_id)))
    for key, value in Record.instance().conn.hgetall(batch_id).iteritems():
        print('\t{}: {}'.format(key, value))
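For example, with a batch id of the '<name>-<suffix>' shape that work() splits on (the id and the output values below are made up for illustration):

# Hypothetical invocation; 'example-20160501' is a made-up batch id.
get_status('example-20160501')
# job example-20160501 has 1024 urls remaining to crawl
#     success: 4096        <- illustrative record-hash fields only;
#     failed: 12              the real field names are not shown here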
def worker_init_distributed_queue(self, batch_id, total_count):
    """ Init the distributed queue on the worker side. """
    priority = int(Record.instance().get_priority(batch_id))
    thinhash = ThinHash(batch_id, total_count)
    queue = Queue(batch_id, priority=priority)
    self.set_distributed_queue(batch_id, queue, thinhash, priority, True)
    return self.cache[batch_id]
def work(self, batch_id, queue_dict, *args, **kwargs):
    # Batch ids look like '<name>-<suffix>'; the worker module is named
    # after the '<name>' part with dashes replaced by underscores.
    batch_key_filename = batch_id.rsplit('-', 1)[0].replace('-', '_')
    module = __import__('workers.{}'.format(batch_key_filename),
                        fromlist=['process'])
    while 1:
        results = queue_dict['queue'].get(block=True, timeout=3, interval=1)
        if results == []:
            break
        for url_id, count in results:
            url = queue_dict['thinhash'].hget(url_id)
            process_status = module.process(url, self._batch_param[batch_id],
                                            self.manager, *args, **kwargs)
            if process_status:
                queue_dict['queue'].task_done(url_id)
            else:
                Record.instance().increase_failed(batch_id)
def init_distributed_queue(self, batch_id, parameter, total_count,
                           priority=1, timeout=180, failure_times=3):
    """
    Init the distributed queue on the master side.

    :param total_count: can be a predetermined number larger than the
        real total count.
    """
    # keep the step order
    Record.instance().begin(batch_id, parameter, total_count, priority)
    thinhash = ThinHash(batch_id, total_count)
    queue = Queue(batch_id, priority=priority, timeout=timeout,
                  failure_times=failure_times)
    self.set_distributed_queue(batch_id, queue, thinhash, priority, True)
    return self.cache[batch_id]
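A sketch of the master-side flow this enables. Only init_distributed_queue, ThinHash.hget, Queue.get and Queue.task_done appear in this file, so the loader calls below (thinhash.hset, queue.put) are assumed mirrors of the read-side API, not a confirmed interface:

# Hypothetical master-side setup; hset()/put() are assumptions about the
# ThinHash/Queue write API, named to mirror the hget()/get() calls above.
manager = RedisManager(RECORD_REDIS, QUEUE_REDIS, CACHE_REDIS)
urls = ['http://example.com/item/{}'.format(i) for i in range(100)]
queue_dict = manager.init_distributed_queue(
    'example-20160501', parameter={}, total_count=len(urls), priority=1)
for url_id, url in enumerate(urls):
    queue_dict['thinhash'].hset(url_id, url)   # assumed setter for hget()
    queue_dict['queue'].put(url_id)            # assumed enqueue call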
def work(self, batch_id, queue_dict, url_id, other_batch_process_time,
         *args, **kwargs):
    try:
        batch_key_filename = batch_id.rsplit('-', 1)[0].replace('-', '_')
        module = __import__('workers.{}'.format(batch_key_filename),
                            fromlist=['process'])
    except ImportError:
        # Fall back to the generic prefetch worker when no batch-specific
        # module exists.
        module = __import__('workers.prefetch', fromlist=['process'])
    today_str = datetime.now().strftime('%Y%m%d')
    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str, '/opt/service/log/').info(
            'begin get url from thinhash redis')
    # TODO change to hmget
    url = queue_dict['thinhash'].hget(url_id)
    if kwargs and kwargs.get("debug"):
        get_logger(batch_id, today_str, '/opt/service/log/').info(
            'end get url from thinhash redis')
    try:
        process_status = module.process(url, batch_id,
                                        self._batch_param[batch_id],
                                        self.manager,
                                        other_batch_process_time,
                                        *args, **kwargs)
    except Exception as e:
        # Record the exception and ack the task so it is not retried forever.
        Record.instance().add_exception(batch_id, url, repr(e))
        queue_dict['queue'].task_done(url_id)
        return
    if process_status:
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                'begin task done for record redis')
        queue_dict['queue'].task_done(url_id)
        Record.instance().increase_success(batch_id)
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                'end task done for record redis')
def delete_queue(self, batch_id):
    """
    :return: False on error, None if not finished yet, True if the queue
        was deleted (or had already been deleted).
    """
    distributed = self.get_distributed_queue(batch_id)
    if distributed is None:
        return False
    if distributed['queue'].get_background_cleaning_status() != '0':
        # Background cleaning has not finished yet.
        return
    # For each failed field, replace the stored failure times with the
    # original url, so failures stay readable after the thin hash is gone.
    for field, times in distributed['queue'].get_failed_fields().iteritems():
        url = distributed['thinhash'].hget(field)
        distributed['queue'].set_failed_times_to_url(field, url)
    if Record.instance().if_not_finish_set(batch_id) == 1:
        # This worker won the check-and-set race, so it drops the storage;
        # any other worker just returns True below.
        distributed['thinhash'].delete()
        distributed['queue'].flush()
        self.cache.pop(batch_id)
        return True
    return True
def run(self, *args, **kwargs):
    """
    end, background_cleaning, status:
    None, None, begin
    None, 1,    begin cleaning
    None, 0,    finish cleaning
    time, 0,    begin delete
    time, None, finish delete
    None, 0,    finish cleaning then exception
    """
    for batch_id, queue_dict in self.manager.cache.iteritems():
        queue = queue_dict['queue']
        if Record.instance().is_finished(batch_id) is True:
            # This queue and its entry in distributed_queues could be
            # deleted here, but that is not needed: once run() and
            # background_cleaning() finish, this process ends.
            # BTW, the worker instance reboots every 10 minutes.
            # If another worker deletes the queue, nothing needs to be
            # done here.
            continue
        tasks = []
        background = queue.get_background_cleaning_status()
        if background is None:
            # Fresh batch: start background cleaning alongside the worker.
            tasks.append(gevent.spawn(queue.background_cleaning))
            tasks.append(gevent.spawn(self.work, batch_id, queue_dict,
                                      *args, **kwargs))
        elif background == '1':
            # Cleaning is already running: just work the queue.
            tasks.append(gevent.spawn(self.work, batch_id, queue_dict,
                                      *args, **kwargs))
        elif background == '0':
            # Cleaning finished: the queue can be torn down.
            self.manager.delete_queue(batch_id)
        gevent.joinall(tasks)
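The comments above say the worker instance reboots every 10 minutes; a minimal sketch of one such driver, where the restart period comes from that comment and the single-pass entry point is an assumption:

# Hypothetical entry point: run one pass over all unfinished batches,
# then exit and let cron (or a process manager) restart the worker every
# 10 minutes, as the comment in run() describes.
if __name__ == '__main__':
    worker = GetWorker(index=0)
    worker.run()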
def work(self, batch_id, queue_dict, *args, **kwargs):
    batch_key_filename = batch_id.rsplit('-', 1)[0].replace('-', '_')
    try:
        module = __import__('workers.{}'.format(batch_key_filename),
                            fromlist=['process'])
    except ImportError:
        # Fall back to the generic prefetch worker when no batch-specific
        # module exists.
        module = __import__('workers.prefetch', fromlist=['process'])
    while 1:
        today_str = datetime.now().strftime('%Y%m%d')
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                'begin get items from queue')
        results = queue_dict['queue'].get(block=True, timeout=3, interval=1)
        if kwargs and kwargs.get("debug"):
            get_logger(batch_id, today_str, '/opt/service/log/').info(
                'finish get items from queue')
        if not results:
            break
        for url_id in results:
            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info(
                    'begin get url from thinhash redis')
            # TODO change to hmget
            url = queue_dict['thinhash'].hget(url_id)
            if kwargs and kwargs.get("debug"):
                get_logger(batch_id, today_str, '/opt/service/log/').info(
                    'end get url from thinhash redis')
            try:
                process_status = module.process(url, batch_id,
                                                self._batch_param[batch_id],
                                                self.manager,
                                                *args, **kwargs)
            except Exception as e:
                Record.instance().add_exception(batch_id, url, repr(e))
                queue_dict['queue'].task_done(url_id)
                continue
            if process_status:
                if kwargs and kwargs.get("debug"):
                    get_logger(batch_id, today_str, '/opt/service/log/').info(
                        'begin task done for record redis')
                queue_dict['queue'].task_done(url_id)
                Record.instance().increase_success(batch_id)
                if kwargs and kwargs.get("debug"):
                    get_logger(batch_id, today_str, '/opt/service/log/').info(
                        'end task done for record redis')
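work() resolves a workers.<batch_key> module at runtime and calls its process() hook. A skeleton of that contract, where only the signature is taken from the call above (the per-url variant in the other work() also passes other_batch_process_time); the body and the download()/save() helpers are purely illustrative:

# workers/example_site.py -- hypothetical batch module.
def process(url, batch_id, parameter, manager, *args, **kwargs):
    """Handle one url; return a truthy value on success."""
    try:
        html = download(url, **parameter)   # placeholder fetch helper
        save(batch_id, url, html)           # placeholder persistence helper
        return True
    except Exception:
        return False

Returning a falsy value leaves the url_id unacknowledged, so the timeout/failure_times machinery configured in init_distributed_queue can presumably requeue it via background cleaning.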