def listen_beanstalkd_reply_forever(self):
    """
    Process this orchestrator's job replies
    """
    self.logger.info('Connecting to the reply beanstalkd')
    while self.running:
        try:
            listener = BeanstalkdListener(
                addr=self.beanstalkd_reply_addr,
                tube=self.beanstalkd_reply_tube,
                logger=self.logger)
            break
        except ConnectionError:
            self.logger.error('Failed to connect to the reply beanstalkd')
            sleep(5)

    self.logger.info('Listening to replies on %s (tube=%s)',
                     self.beanstalkd_reply_addr, self.beanstalkd_reply_tube)

    # keep the job results in memory
    while self.running:
        connection_error = self.listen_loop(listener)
        # in case of a beanstalkd connection error,
        # sleep to avoid spamming
        if connection_error:
            sleep(2)

    self.logger.info('Exited listening thread')
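# A minimal, standalone sketch of the same connect-with-retry-then-poll
# pattern used above. The names poll_with_reconnect, make_listener,
# is_running and listener.poll() are invented for this example; they are
# not part of the oio API.
import time


def poll_with_reconnect(make_listener, is_running, logger,
                        connect_backoff=5.0, error_backoff=2.0):
    """Connect with retries, then poll until is_running() turns False."""
    listener = None
    while is_running():
        try:
            listener = make_listener()
            break
        except ConnectionError:
            logger.error('Connection failed, retrying in %.0fs',
                         connect_backoff)
            time.sleep(connect_backoff)
    while is_running() and listener is not None:
        had_error = listener.poll()  # assumed to return True on error
        if had_error:
            # Back off instead of spamming a broken server
            time.sleep(error_backoff)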
class BlobImprover(Rebuilder):
    """
    Move chunks of objects declared as "perfectible", if possible
    to improve them (increased distance between chunks or better
    hosting service).
    """

    supported_events = (EventTypes.CONTENT_PERFECTIBLE, )

    def __init__(self, conf, logger, beanstalkd_addr, **kwargs):
        super(BlobImprover, self).__init__(conf, logger, volume=None,
                                           **kwargs)
        self.content_factory = ContentFactory(self.conf, logger=self.logger)
        beanstalkd_tube = self.conf.get('beanstalkd_tube',
                                        DEFAULT_IMPROVER_TUBE)
        self.listener = BeanstalkdListener(beanstalkd_addr, beanstalkd_tube,
                                           self.logger, **kwargs)
        self.sender = BeanstalkdSender(beanstalkd_addr, beanstalkd_tube,
                                       self.logger, **kwargs)
        self.retry_delay = int_value(self.conf.get('retry_delay'), 30)
        self.reqid_prefix = 'blob-impr-'

    def exit_gracefully(self, signum, frame):
        super(BlobImprover, self).exit_gracefully(signum, frame)
        self.listener.running = False

    def _event_from_job(self, job_id, data, **kwargs):
        """Decode a JSON string into an event dictionary."""
        # pylint: disable=no-member
        event = json.loads(data)
        type_ = event.get('event')
        # Bury events that should not be there
        if type_ not in self.__class__.supported_events:
            msg = 'Discarding event %s (type=%s)' % (event.get('job_id'),
                                                     type_)
            self.logger.info(msg)
            raise exceptions.ExplicitBury(msg)
        yield event

    def _create_worker(self, **kwargs):
        return BlobImproverWorker(self, **kwargs)

    def _fill_queue(self, queue, **kwargs):
        max_events = kwargs.get('max_events', 0)
        sent_events = 0
        # Do not block more than 2 seconds
        events = self.listener.fetch_jobs(self._event_from_job,
                                          reserve_timeout=2, **kwargs)
        for event in events:
            queue.put(event)
            sent_events += 1
            if max_events > 0 and sent_events >= max_events:
                self.logger.info('Max events (%d) reached, exiting',
                                 max_events)
                break
            if not self.running:
                break
        events.close()

    def _read_retry_queue(self, queue, **kwargs):
        while True:
            # Reschedule jobs we were not able to handle.
            item = queue.get()
            sent = False
            while not sent:
                sent = self.sender.send_job(json.dumps(item),
                                            delay=self.retry_delay)
                if not sent:
                    sleep(1.0)
            self.sender.job_done()
            queue.task_done()

    def _item_to_string(self, item, **kwargs):
        try:
            url = item['url']
            fullpath = encode_fullpath(url['account'], url['user'],
                                       url['path'], url.get('version', 1),
                                       url['content'])
            # TODO(FVE): maybe tell some numbers about chunks
            if item.get('event') == EventTypes.CONTENT_PERFECTIBLE:
                return 'perfectible object %s' % (fullpath, )
            else:
                return 'object %s' % (fullpath, )
        except (KeyError, ValueError) as err:
            return '<unknown item> ({0})'.format(repr(err))

    def _get_report(self, status, end_time, counters, **kwargs):
        items_processed, errors, total_items_processed, total_errors = \
            counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        return ('%(status)s volume=%(volume)s '
                'last_report=%(last_report)s %(time_since_last_report).2fs '
                'chunks=%(chunks)d %(chunks_rate).2f/s '
                'errors=%(errors)d %(errors_rate).2f%% '
                'start_time=%(start_time)s %(total_time).2fs '
                'total_chunks=%(total_chunks)d '
                '%(total_chunks_rate).2f/s '
                'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                    'status': status,
                    'volume': self.volume,
                    'last_report': datetime.fromtimestamp(
                        int(self.last_report)).isoformat(),
                    'time_since_last_report': time_since_last_report,
                    'chunks': items_processed,
                    'chunks_rate': items_processed / time_since_last_report,
                    'errors': errors,
                    'errors_rate':
                        100 * errors / float(items_processed or 1),
                    'start_time': datetime.fromtimestamp(
                        int(self.start_time)).isoformat(),
                    'total_time': total_time,
                    'total_chunks': total_items_processed,
                    'total_chunks_rate': total_items_processed / total_time,
                    'total_errors': total_errors,
                    'total_errors_rate':
                        100 * total_errors /
                        float(total_items_processed or 1),
                })
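# A minimal sketch of the event filtering done by _event_from_job(), with a
# hand-written sample payload. The field layout below is an assumption for
# illustration, not the exact schema emitted by the oio event system, and
# decode_event() raises ValueError where the real code raises ExplicitBury
# so beanstalkd keeps the job aside instead of redelivering it forever.
import json

SUPPORTED_EVENTS = ('storage.content.perfectible', )

sample_job_data = json.dumps({
    'event': 'storage.content.perfectible',
    'job_id': 'example-job-1',
    'url': {'account': 'AUTH_demo', 'user': 'bucket',
            'path': 'object', 'content': '0123456789ABCDEF'},
})


def decode_event(data):
    """Decode a job body, rejecting unsupported event types."""
    event = json.loads(data)
    if event.get('event') not in SUPPORTED_EVENTS:
        raise ValueError('unsupported event type')
    return event


print(decode_event(sample_job_data)['job_id'])  # example-job-1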
class _DistributedDispatcher(_Dispatcher):
    """
    Dispatch tasks on the platform.
    """

    def __init__(self, conf, tool):
        super(_DistributedDispatcher, self).__init__(conf, tool)
        self.sending = None
        self.max_items_per_second = int_value(
            self.conf.get('items_per_second'),
            self.tool.DEFAULT_ITEM_PER_SECOND)

        # All available beanstalkd
        conscience_client = ConscienceClient(self.conf)
        all_beanstalkd = conscience_client.all_services('beanstalkd')
        all_available_beanstalkd = dict()
        for beanstalkd in all_beanstalkd:
            if beanstalkd['score'] <= 0:
                continue
            all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
        if not all_available_beanstalkd:
            raise OioException('No beanstalkd available')

        # Beanstalkd workers
        workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
            or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
        self.beanstalkd_workers = dict()
        for beanstalkd in locate_tube(all_available_beanstalkd.values(),
                                      workers_tube):
            beanstalkd_worker = BeanstalkdSender(beanstalkd['addr'],
                                                 workers_tube, self.logger)
            self.beanstalkd_workers[beanstalkd['addr']] = beanstalkd_worker
            self.logger.info(
                'Beanstalkd %s using tube %s is selected as a worker',
                beanstalkd_worker.addr, beanstalkd_worker.tube)
        if not self.beanstalkd_workers:
            raise OioException('No beanstalkd worker available')
        nb_workers = len(self.beanstalkd_workers)
        if self.max_items_per_second > 0:
            # Max 2 seconds in advance
            queue_size_per_worker = \
                self.max_items_per_second * 2 // nb_workers
        else:
            queue_size_per_worker = 64
        for _, beanstalkd_worker in self.beanstalkd_workers.items():
            beanstalkd_worker.low_limit = queue_size_per_worker // 2
            beanstalkd_worker.high_limit = queue_size_per_worker

        # Beanstalkd reply
        beanstalkd_reply = dict()
        try:
            local_services = conscience_client.local_services()
            for local_service in local_services:
                if local_service['type'] != 'beanstalkd':
                    continue
                beanstalkd = all_available_beanstalkd.get(
                    local_service['addr'])
                if beanstalkd is None:
                    continue
                if beanstalkd_reply \
                        and beanstalkd_reply['score'] >= beanstalkd['score']:
                    continue
                beanstalkd_reply = beanstalkd
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd locally: %s', exc)
        if not beanstalkd_reply:
            self.logger.warn('No beanstalkd available locally')
            try:
                beanstalkd = conscience_client.next_instance('beanstalkd')
                beanstalkd_reply = \
                    all_available_beanstalkd[beanstalkd['addr']]
            except Exception as exc:  # pylint: disable=broad-except
                self.logger.warning(
                    'ERROR when searching for beanstalkd: %s', exc)
        beanstalkd_reply_addr = beanstalkd_reply['addr']

        # If the tube exists, another service must have already used it
        tube_reply = workers_tube + '.reply.' + str(time.time())
        tubes = Beanstalk.from_url(
            'beanstalk://' + beanstalkd_reply_addr).tubes()
        if tube_reply in tubes:
            raise OioException(
                'Beanstalkd %s using tube %s is already used'
                % (beanstalkd_reply_addr, tube_reply))

        self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                                   tube_reply, self.logger)
        self.logger.info(
            'Beanstalkd %s using tube %s is selected for the replies',
            self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)

    def _fetch_tasks_events_to_send(self):
        items_with_beanstalkd_reply = \
            self.tool.fetch_items_with_beanstalkd_reply()
        for item, _ in items_with_beanstalkd_reply:
            yield self.tool.task_event_from_item(item)

    def _tasks_res_from_res_event(self, job_id, data, **kwargs):
        res_event = json.loads(data)
        beanstalkd_worker_addr = res_event['beanstalkd_worker']['addr']
        tasks_res = self.tool.tasks_res_from_res_event(res_event)
        self.beanstalkd_workers[beanstalkd_worker_addr].job_done()
        return tasks_res

    def _all_events_are_processed(self):
        """
        Tell whether all workers have finished processing their events.
        """
        if self.sending:
            return False
        total_events = 0
        for worker in self.beanstalkd_workers.values():
            total_events += worker.nb_jobs
        return total_events <= 0

    def _send_task_event(self, task_event, reply_loc, next_worker):
        """
        Send the event through a non-full sender.
        """
        task_event['beanstalkd_reply'] = reply_loc
        workers = list(self.beanstalkd_workers.values())
        nb_workers = len(workers)
        while True:
            for _ in range(nb_workers):
                success = workers[next_worker].send_job(
                    json.dumps(task_event))
                next_worker = (next_worker + 1) % nb_workers
                if success:
                    return next_worker
            self.logger.warn('All beanstalkd workers are full')
            sleep(5)

    def _distribute_events(self, reply_loc=None):
        next_worker = 0
        items_run_time = 0

        try:
            tasks_events = self._fetch_tasks_events_to_send()
            items_run_time = ratelimit(items_run_time,
                                       self.max_items_per_second)
            next_worker = self._send_task_event(next(tasks_events),
                                                reply_loc, next_worker)
            self.sending = True
            for task_event in tasks_events:
                items_run_time = ratelimit(items_run_time,
                                           self.max_items_per_second)
                next_worker = self._send_task_event(task_event, reply_loc,
                                                    next_worker)
                if not self.tool.running:
                    break
        except Exception as exc:
            if not isinstance(exc, StopIteration) and self.tool.running:
                self.logger.error('Failed to distribute events: %s', exc)
                self.tool.success = False
        finally:
            self.sending = False

    def run(self):
        self.tool.start_time = self.tool.last_report = time.time()
        self.tool.log_report('START', force=True)
        reply_loc = {'addr': self.beanstalkd_reply.addr,
                     'tube': self.beanstalkd_reply.tube}
        # pylint: disable=no-member
        thread = threading.Thread(target=self._distribute_events,
                                  args=[reply_loc])
        thread.start()

        # Wait until the thread has started sending events
        while self.sending is None:
            sleep(0.1)

        # Retrieve responses until all events are processed
        try:
            while not self._all_events_are_processed():
                tasks_res = self.beanstalkd_reply.fetch_job(
                    self._tasks_res_from_res_event,
                    timeout=DISTRIBUTED_DISPATCHER_TIMEOUT)
                for task_res in tasks_res:
                    self.tool.update_counters(task_res)
                    yield task_res
                self.tool.log_report('RUN')
        except OioTimeout:
            self.logger.error('No response for %d seconds',
                              DISTRIBUTED_DISPATCHER_TIMEOUT)
            self.tool.success = False
        except Exception:  # pylint: disable=broad-except
            self.logger.exception('ERROR in distributed dispatcher')
            self.tool.success = False

        self.tool.log_report('DONE', force=True)
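# A standalone sketch of the round-robin send with backpressure implemented
# by _send_task_event(): try every sender once per round, and sleep when all
# of them are full. FakeSender and its capacity field are invented here to
# keep the example self-contained; they are not part of the oio API.
import time


class FakeSender(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.pending = 0

    def send_job(self, data):
        """Accept the job unless the local queue is full."""
        if self.pending >= self.capacity:
            return False
        self.pending += 1
        return True


def send_round_robin(senders, data, next_index, backoff=5.0):
    """Try every sender once per round, sleeping when all are full."""
    while True:
        for _ in range(len(senders)):
            accepted = senders[next_index].send_job(data)
            next_index = (next_index + 1) % len(senders)
            if accepted:
                return next_index
        time.sleep(backoff)  # all queues full, let workers drain


senders = [FakeSender(capacity=2) for _ in range(3)]
cursor = 0
for job in ('a', 'b', 'c', 'd'):
    cursor = send_round_robin(senders, job, cursor)
print([s.pending for s in senders])  # [2, 1, 1]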
class Tool(object):
    """
    Process all found items.

    For the task_res variable, the following format must be respected:
    (item, info, error).
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-process'
    DEFAULT_REPORT_INTERVAL = 3600
    DEFAULT_RETRY_DELAY = 3600
    DEFAULT_ITEM_PER_SECOND = 30
    DEFAULT_CONCURRENCY = 1
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-process'

    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # exit gracefully
        self.running = True
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE,
                self.logger)

        # retry
        self.retryer = None
        self.retry_queue = None
        if self.beanstalkd:
            self.retryer = BeanstalkdSender(self.beanstalkd.addr,
                                            self.beanstalkd.tube,
                                            self.logger)
            self.retry_queue = eventlet.Queue()
        self.retry_delay = int_value(self.conf.get('retry_delay'),
                                     self.DEFAULT_RETRY_DELAY)

    @staticmethod
    def items_from_task_event(task_event):
        """
        Convert the task event into a list (generator) of items.
        """
        raise NotImplementedError()

    @staticmethod
    def task_event_from_item(item):
        """
        Convert the item into a task event.
        """
        raise NotImplementedError()

    @staticmethod
    def tasks_res_from_res_event(res_event):
        """
        Convert the result event into a list (generator) of task results.
        """
        raise NotImplementedError()

    @staticmethod
    def res_event_from_task_res(task_res):
        """
        Convert the task result into a result event.
        """
        raise NotImplementedError()

    @staticmethod
    def string_from_item(item):
        """
        Convert the item into a string.
        """
        raise NotImplementedError()

    def exit_gracefully(self, signum, frame):
        self.logger.info(
            'Stop sending and wait for all results already sent')
        self.success = False
        self.running = False
        if self.beanstalkd:
            self.beanstalkd.running = False

    def _item_with_beanstalkd_reply_from_task_event(self, job_id, data):
        task_event = json.loads(data)
        beanstalkd_reply = task_event.get('beanstalkd_reply')
        items = self.items_from_task_event(task_event)
        for item in items:
            yield (item, beanstalkd_reply)

    def _fetch_items_with_beanstalkd_reply_from_beanstalkd(self):
        # Do not block more than 2 seconds
        return self.beanstalkd.fetch_jobs(
            self._item_with_beanstalkd_reply_from_task_event,
            reserve_timeout=2)

    def _fetch_items(self):
        """
        Fetch items from inputs (other than the beanstalkd).
        """
        raise NotImplementedError()

    def _fetch_items_with_beanstalkd_reply(self):
        items = self._fetch_items()
        for item in items:
            yield (item, None)

    def fetch_items_with_beanstalkd_reply(self):
        """
        Fetch items with beanstalkd reply
        (useful if the task is distributed).
        """
        if self.beanstalkd:
            return self._fetch_items_with_beanstalkd_reply_from_beanstalkd()
        return self._fetch_items_with_beanstalkd_reply()

    def update_counters(self, task_res):
        """
        Update all counters of the tool.
        """
        _, _, error = task_res
        self.items_processed += 1
        if error is not None:
            self.errors += 1

    def _update_total_counters(self):
        items_processed = self.items_processed
        self.items_processed = 0
        self.total_items_processed += items_processed
        errors = self.errors
        self.errors = 0
        self.total_errors += errors
        return items_processed, self.total_items_processed, \
            errors, self.total_errors

    def _get_report(self, status, end_time, counters):
        raise NotImplementedError()

    def log_report(self, status, force=False):
        """
        Log a report with a fixed interval.
        """
        end_time = time.time()
        if force or (end_time - self.last_report >= self.report_interval):
            counters = self._update_total_counters()
            self.logger.info(self._get_report(status, end_time, counters))
            self.last_report = end_time

    def create_worker(self, queue_workers, queue_reply):
        """
        Create a worker to process the items.
        """
        raise NotImplementedError()

    def prepare_local_dispatcher(self):
        """
        The tool will dispatch the tasks locally.
        """
        self.dispatcher = _LocalDispatcher(self.conf, self)

    def prepare_distributed_dispatcher(self):
        """
        The tool will dispatch the tasks on the platform.
        """
        self.dispatcher = _DistributedDispatcher(self.conf, self)

    def _load_total_expected_items(self):
        raise NotImplementedError()

    def _read_retry_queue(self):
        if self.retry_queue is None:
            return
        while True:
            # Reschedule jobs we were not able to handle.
            item = self.retry_queue.get()
            if self.retryer:
                sent = False
                while not sent:
                    sent = self.retryer.send_job(
                        json.dumps(self.task_event_from_item(item)),
                        delay=self.retry_delay)
                    if not sent:
                        sleep(1.0)
                self.retryer.job_done()
            self.retry_queue.task_done()

    def run(self):
        """
        Start processing all found items.
        """
        if self.dispatcher is None:
            raise ValueError('No dispatcher')

        eventlet.spawn_n(self._load_total_expected_items)

        # spawn one worker for the retry queue
        eventlet.spawn_n(self._read_retry_queue)

        for task_res in self.dispatcher.run():
            yield task_res

        # block until the retry queue is empty
        if self.retry_queue:
            self.retry_queue.join()

    def is_success(self):
        """
        Check if there are any errors.
        """
        if not self.success:
            return False
        if self.total_items_processed == 0:
            self.logger.warn('No item to process')
        return self.total_errors == 0
def __init__(self, conf, tool):
    super(_DistributedDispatcher, self).__init__(conf, tool)
    self.sending = False

    # All available beanstalkd
    conscience_client = ConscienceClient(self.conf)
    all_beanstalkd = conscience_client.all_services('beanstalkd')
    all_available_beanstalkd = dict()
    for beanstalkd in all_beanstalkd:
        if beanstalkd['score'] <= 0:
            continue
        all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
    if not all_available_beanstalkd:
        raise OioException('No beanstalkd available')

    # Beanstalkd workers
    workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
        or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
    self.beanstalkd_workers = dict()
    for _, beanstalkd in all_available_beanstalkd.items():
        beanstalkd_worker_addr = beanstalkd['addr']

        # If the tube exists,
        # there should be a service that listens to this tube
        tubes = Beanstalk.from_url(
            'beanstalk://' + beanstalkd_worker_addr).tubes()
        if workers_tube not in tubes:
            continue

        beanstalkd_worker = BeanstalkdSender(beanstalkd_worker_addr,
                                             workers_tube, self.logger)
        self.beanstalkd_workers[beanstalkd_worker_addr] = beanstalkd_worker
        self.logger.info(
            'Beanstalkd %s using tube %s is selected as a worker',
            beanstalkd_worker.addr, beanstalkd_worker.tube)
    if not self.beanstalkd_workers:
        raise OioException('No beanstalkd worker available')

    # Beanstalkd reply
    beanstalkd_reply = dict()
    try:
        local_services = conscience_client.local_services()
        for local_service in local_services:
            if local_service['type'] != 'beanstalkd':
                continue
            beanstalkd = all_available_beanstalkd.get(
                local_service['addr'])
            if beanstalkd is None:
                continue
            if beanstalkd_reply \
                    and beanstalkd_reply['score'] >= beanstalkd['score']:
                continue
            beanstalkd_reply = beanstalkd
    except Exception as exc:  # pylint: disable=broad-except
        self.logger.warning(
            'ERROR when searching for beanstalkd locally: %s', exc)
    if not beanstalkd_reply:
        self.logger.warn('No beanstalkd available locally')
        try:
            beanstalkd = conscience_client.next_instance('beanstalkd')
            beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd: %s', exc)
    beanstalkd_reply_addr = beanstalkd_reply['addr']

    # If the tube exists, another service must have already used it
    tube_reply = workers_tube + '.reply.' + str(time.time())
    tubes = Beanstalk.from_url(
        'beanstalk://' + beanstalkd_reply_addr).tubes()
    if tube_reply in tubes:
        raise OioException(
            'Beanstalkd %s using tube %s is already used'
            % (beanstalkd_reply_addr, tube_reply))

    self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                               tube_reply, self.logger)
    self.logger.info(
        'Beanstalkd %s using tube %s is selected for the replies',
        self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
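# A small sketch of the reply-tube naming scheme used above: the tube name
# embeds a timestamp so that concurrent dispatchers replying through the
# same beanstalkd do not consume each other's replies. make_reply_tube is
# invented for this example; the real code builds the name inline.
import time


def make_reply_tube(workers_tube):
    """Derive a (very likely) unique reply tube from the worker tube."""
    return workers_tube + '.reply.' + str(time.time())


print(make_reply_tube('oio-process'))
# e.g. oio-process.reply.1700000000.123456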
class Tool(object):
    """
    Process all found items.

    For the task_res variable, the following format must be respected:
    (item, info, error).
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-process'
    DEFAULT_REPORT_INTERVAL = 3600
    DEFAULT_ITEM_PER_SECOND = 30
    DEFAULT_WORKERS = 1
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-process'

    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE,
                self.logger)

    @staticmethod
    def items_from_task_event(task_event):
        """
        Convert the task event into a list (generator) of items.
        """
        raise NotImplementedError()

    @staticmethod
    def task_event_from_item(item):
        """
        Convert the item into a task event.
        """
        raise NotImplementedError()

    @staticmethod
    def tasks_res_from_res_event(res_event):
        """
        Convert the result event into a list (generator) of task results.
        """
        raise NotImplementedError()

    @staticmethod
    def res_event_from_task_res(task_res):
        """
        Convert the task result into a result event.
        """
        raise NotImplementedError()

    @staticmethod
    def string_from_item(item):
        """
        Convert the item into a string.
        """
        raise NotImplementedError()

    def _item_with_beanstalkd_reply_from_task_event(self, job_id, data):
        task_event = json.loads(data)
        beanstalkd_reply = task_event.get('beanstalkd_reply')
        items = self.items_from_task_event(task_event)
        for item in items:
            yield (item, beanstalkd_reply)

    def _fetch_items_with_beanstalkd_reply_from_beanstalkd(self):
        return self.beanstalkd.fetch_jobs(
            self._item_with_beanstalkd_reply_from_task_event)

    def _fetch_items(self):
        """
        Fetch items from inputs (other than the beanstalkd).
        """
        raise NotImplementedError()

    def _fetch_items_with_beanstalkd_reply(self):
        items = self._fetch_items()
        for item in items:
            yield (item, None)

    def fetch_items_with_beanstalkd_reply(self):
        """
        Fetch items with beanstalkd reply
        (useful if the task is distributed).
        """
        if self.beanstalkd:
            return self._fetch_items_with_beanstalkd_reply_from_beanstalkd()
        return self._fetch_items_with_beanstalkd_reply()

    def update_counters(self, task_res):
        """
        Update all counters of the tool.
        """
        _, _, error = task_res
        self.items_processed += 1
        if error is not None:
            self.errors += 1

    def _update_total_counters(self):
        items_processed = self.items_processed
        self.items_processed = 0
        self.total_items_processed += items_processed
        errors = self.errors
        self.errors = 0
        self.total_errors += errors
        return items_processed, self.total_items_processed, \
            errors, self.total_errors

    def _get_report(self, status, end_time, counters):
        raise NotImplementedError()

    def log_report(self, status, force=False):
        """
        Log a report with a fixed interval.
        """
        end_time = time.time()
        if force or (end_time - self.last_report >= self.report_interval):
            counters = self._update_total_counters()
            self.logger.info(self._get_report(status, end_time, counters))
            self.last_report = end_time

    def create_worker(self, queue_workers, queue_reply):
        """
        Create a worker to process the items.
        """
        raise NotImplementedError()

    def prepare_local_dispatcher(self):
        """
        The tool will dispatch the tasks locally.
        """
        self.dispatcher = _LocalDispatcher(self.conf, self)

    def prepare_distributed_dispatcher(self):
        """
        The tool will dispatch the tasks on the platform.
        """
        self.dispatcher = _DistributedDispatcher(self.conf, self)

    def _load_total_expected_items(self):
        raise NotImplementedError()

    def run(self):
        """
        Start processing all found items.
        """
        if self.dispatcher is None:
            raise ValueError('No dispatcher')
        self._load_total_expected_items()
        return self.dispatcher.run()

    def is_success(self):
        """
        Check if there are any errors.
        """
        if not self.success:
            return False
        if self.total_items_processed == 0:
            self.logger.warn('No item to process')
        return self.total_errors == 0