import gc
import json
import socket
from datetime import datetime
from subprocess import PIPE

import psutil
from psutil import TimeoutExpired
from bson.objectid import ObjectId
from pymongo import ASCENDING

# Project-local dependencies (import paths depend on the project layout):
# settings, AbstractWorker, AggregatorPerformanceTicker, UnitOfWorkEntry,
# unit_of_work_helper, CollectionContext, ProcessContext, AbstractModel, DecimalEncoder


class IdentityWorker(AbstractWorker):
    """ Marks all unit_of_work as <complete> """

    def __init__(self, process_name):
        super(IdentityWorker, self).__init__(process_name)

    def __del__(self):
        super(IdentityWorker, self).__del__()

    # **************** Abstract Methods ************************
    def _init_performance_ticker(self, logger):
        self.performance_ticker = AggregatorPerformanceTicker(logger)
        self.performance_ticker.start()

    # ********************** thread-related methods ****************************
    def _mq_callback(self, message):
        """ try/except wrapper: in case an exception breaks the abstract method, this method:
        - catches the exception
        - logs the exception
        - marks the unit_of_work as INVALID """
        unit_of_work = None
        try:
            # message.body carries the ObjectId of the unit_of_work
            object_id = ObjectId(message.body)
            unit_of_work = unit_of_work_helper.retrieve_by_id(self.logger, object_id)
            if unit_of_work.get_state() == UnitOfWorkEntry.STATE_CANCELED \
                    or unit_of_work.get_state() == UnitOfWorkEntry.STATE_PROCESSED:
                # garbage collector might have reposted this UOW
                self.logger.warning('Skipping unit_of_work: id %s; state %s;'
                                    % (str(message.body), unit_of_work.get_state()), exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Cannot identify unit_of_work %s' % str(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        try:
            self.performance_ticker.start_uow(unit_of_work)
            unit_of_work.set_state(UnitOfWorkEntry.STATE_PROCESSED)
            unit_of_work.set_number_of_processed_documents(0)
            unit_of_work.set_started_at(datetime.utcnow())
            unit_of_work.set_finished_at(datetime.utcnow())
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.finish_uow()
        except Exception as e:
            unit_of_work.set_state(UnitOfWorkEntry.STATE_INVALID)
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.cancel_uow()
            self.logger.error('Safety fuse while processing unit_of_work %s in timeperiod %s : %r'
                              % (message.body, unit_of_work.get_timestamp(), e), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
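# Usage sketch (illustrative only): IdentityWorker is concrete, so it can be
# launched directly once its process is configured. The process name below and
# the start() entry point are assumptions -- the actual bootstrap depends on how
# AbstractWorker wires up logging, configuration and the MQ consumer.
#
#   if __name__ == '__main__':
#       worker = IdentityWorker('identity_worker')    # hypothetical process name
#       worker.start()                                # assumed AbstractWorker entry point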
class AbstractHadoopWorker(AbstractWorker):
    """ Abstract class inherited by all workers/aggregators that are aware of
    unit_of_work and delegate its processing to a Hadoop Map/Reduce job """

    def __init__(self, process_name):
        self.hadoop_process = None
        super(AbstractHadoopWorker, self).__init__(process_name)

    def __del__(self):
        super(AbstractHadoopWorker, self).__del__()

    # **************** Abstract Methods ************************
    def _init_performance_ticker(self, logger):
        self.performance_ticker = AggregatorPerformanceTicker(logger)
        self.performance_ticker.start()

    # **************** Process Supervisor Methods ************************
    def _start_process(self, start_timestamp, end_timestamp):
        try:
            self.logger.info('start: %s {' % self.process_name)
            p = psutil.Popen([settings['hadoop_command'],
                              'jar', settings['hadoop_jar'],
                              '-D', 'process.name=' + self.process_name,
                              '-D', 'timeperiod.working=' + str(start_timestamp),
                              '-D', 'timeperiod.next=' + str(end_timestamp)],
                             close_fds=True,
                             cwd=settings['process_cwd'],
                             stdin=PIPE,
                             stdout=PIPE,
                             stderr=PIPE)
            self.hadoop_process = p
            self.logger.info('Started %s with pid = %r' % (self.process_name, p.pid))
        except Exception:
            self.logger.error('Exception on starting: %s' % self.process_name, exc_info=True)
        finally:
            self.logger.info('}')

    def _poll_process(self):
        """ Between the death of a process and its actual termination lies a poorly
        documented requirement: purging the process' io pipes and reading its exit status.
        This can be done either by os.wait() or process.wait()
        @return tuple (boolean: alive, int: return_code) """
        try:
            self.logger.warning(self.hadoop_process.stderr.read())
            self.logger.info(self.hadoop_process.stdout.read())
            returncode = self.hadoop_process.wait(timeout=0.01)
            if returncode is None:
                # process is terminated; exit code could not be determined
                self.logger.info('Process %s is terminated' % self.process_name)
            else:
                # process is terminated; possibly by the OS
                self.logger.info('Process %s got terminated. Cleaning up' % self.process_name)
            self.hadoop_process = None
            return False, returncode
        except TimeoutExpired:
            # process is alive and OK
            return True, None
        except Exception:
            self.logger.error('Exception on polling: %s' % self.process_name, exc_info=True)
            return False, 999

    # ********************** thread-related methods ****************************
    def _mq_callback(self, message):
        """ try/except wrapper: in case an exception breaks the abstract method, this method:
        - catches the exception
        - logs the exception
        - marks the unit_of_work as INVALID """
        unit_of_work = None
        try:
            # message.body carries the ObjectId of the unit_of_work
            object_id = ObjectId(message.body)
            unit_of_work = unit_of_work_helper.retrieve_by_id(self.logger, object_id)
            if unit_of_work.get_state() == UnitOfWorkEntry.STATE_CANCELED \
                    or unit_of_work.get_state() == UnitOfWorkEntry.STATE_PROCESSED:
                # garbage collector might have reposted this UOW
                self.logger.warning('Skipping unit_of_work: id %s; state %s;'
                                    % (str(message.body), unit_of_work.get_state()), exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Cannot identify unit_of_work %s' % str(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        try:
            start_timestamp = unit_of_work.get_start_timestamp()
            end_timestamp = unit_of_work.get_end_timestamp()

            unit_of_work.set_state(UnitOfWorkEntry.STATE_IN_PROGRESS)
            unit_of_work.set_started_at(datetime.utcnow())
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.start_uow(unit_of_work)

            self._start_process(start_timestamp, end_timestamp)
            code = None
            alive = True
            while alive:
                alive, code = self._poll_process()

            if code == 0:
                unit_of_work.set_number_of_processed_documents(self.performance_ticker.posts_per_job)
                unit_of_work.set_finished_at(datetime.utcnow())
                unit_of_work.set_state(UnitOfWorkEntry.STATE_PROCESSED)
                self.performance_ticker.finish_uow()
            else:
                unit_of_work.set_state(UnitOfWorkEntry.STATE_INVALID)
                self.performance_ticker.cancel_uow()

            self.logger.info('Hadoop Map/Reduce return code is %r' % code)
            unit_of_work_helper.update(self.logger, unit_of_work)
        except Exception as e:
            unit_of_work.set_state(UnitOfWorkEntry.STATE_INVALID)
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.cancel_uow()
            self.logger.error('Safety fuse while processing unit_of_work %s in timeperiod %s : %r'
                              % (message.body, unit_of_work.get_timestamp(), e), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
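# A minimal sketch of a concrete subclass (illustrative only). AbstractHadoopWorker
# implements the full MQ-to-Hadoop lifecycle above, so a concrete worker typically
# only pins down its process name; the Hadoop command, jar and working directory all
# come from settings. 'SiteHourlyHadoopWorker' and 'site_hourly_hadoop' are
# hypothetical names, not part of the original codebase.
class SiteHourlyHadoopWorker(AbstractHadoopWorker):
    """ Hypothetical worker: runs the hourly site aggregation as a Hadoop Map/Reduce job """

    def __init__(self, process_name='site_hourly_hadoop'):
        super(SiteHourlyHadoopWorker, self).__init__(process_name)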
class AbstractAwareWorker(AbstractWorker):
    """ Abstract class inherited by all workers/aggregators that are aware of
    unit_of_work and capable of processing it """

    def __init__(self, process_name):
        super(AbstractAwareWorker, self).__init__(process_name)
        self.aggregated_objects = dict()

    def __del__(self):
        self._flush_aggregated_objects()
        super(AbstractAwareWorker, self).__del__()

    # **************** Abstract Methods ************************
    def _init_performance_ticker(self, logger):
        self.performance_ticker = AggregatorPerformanceTicker(logger)
        self.performance_ticker.start()

    def _get_tunnel_port(self):
        """ abstract method to retrieve the Python-HBase tunnel port """
        pass

    def _get_source_collection(self):
        """ collection with data for processing """
        return CollectionContext.get_collection(self.logger,
                                                ProcessContext.get_source_collection(self.process_name))

    def _get_target_collection(self):
        """ collection to store aggregated documents """
        return CollectionContext.get_collection(self.logger,
                                                ProcessContext.get_target_collection(self.process_name))

    def _flush_aggregated_objects(self):
        """ inserts aggregated objects into the HBaseTunnel
        @return number_of_aggregated_objects """
        if len(self.aggregated_objects) == 0:
            # nothing to do
            return 0

        total_transferred_bytes = 0
        number_of_aggregated_objects = len(self.aggregated_objects)
        self.logger.info('Aggregated %d documents. Performing flush.' % number_of_aggregated_objects)
        tunnel_address = (settings['tunnel_host'], self._get_tunnel_port())

        for key in self.aggregated_objects:
            client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client_socket.connect(tunnel_address)
            document = self.aggregated_objects[key]
            tunnel_obj = json.dumps(document.data, cls=DecimalEncoder)
            transferred_bytes = client_socket.send(tunnel_obj)
            if transferred_bytes == 0:
                raise RuntimeError('Transferred 0 bytes. Socket connection broken')
            total_transferred_bytes += transferred_bytes
            client_socket.shutdown(socket.SHUT_WR)
            client_socket.close()

        client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client_socket.connect(tunnel_address)
        transferred_bytes = client_socket.send('FLUSH')
        if transferred_bytes == 0:
            raise RuntimeError('Transferred 0 bytes. Socket connection broken')
        client_socket.close()

        self.logger.info('Flush successful. Transmitted %r bytes' % total_transferred_bytes)
        del self.aggregated_objects
        self.aggregated_objects = dict()
        gc.collect()
        return number_of_aggregated_objects

    def _get_aggregated_object(self, composite_key):
        """ looks up (and lazily creates) the aggregated object for the given key
        @param composite_key: tuple comprising domain_name and timestamp """
        if composite_key not in self.aggregated_objects:
            self.aggregated_objects[composite_key] = self._init_target_object(composite_key)
        return self.aggregated_objects[composite_key]

    def _init_target_key(self, *args):
        """ abstract method to create a composite key from source compounds
        such as domain_name and timestamp """
        pass

    def _init_target_object(self, composite_key):
        """ abstract method to instantiate a new object that will hold aggregated data """
        pass

    def _init_source_object(self, document):
        """ abstract method to initialise an object from a source-collection document """
        pass

    # ********************** thread-related methods ****************************
    def _process_not_empty_cursor(self, cursor):
        """ abstract method to process a cursor holding a result set from the DB
        @return tuple (boolean: shall_continue - True if the outer loop shall continue,
                       ObjectId: new_start_id - the next starting point) """
        pass

    def perform_post_processing(self, timestamp):
        """ abstract method to perform post-processing (before flushing) """
        pass

    def _mq_callback(self, message):
        """ try/except wrapper: in case an exception breaks the abstract method, this method:
        - catches the exception
        - logs the exception
        - marks the unit_of_work as INVALID """
        unit_of_work = None
        try:
            # message.body carries the ObjectId of the unit_of_work
            object_id = ObjectId(message.body)
            unit_of_work = unit_of_work_helper.retrieve_by_id(self.logger, object_id)
            if unit_of_work.get_state() == UnitOfWorkEntry.STATE_CANCELED \
                    or unit_of_work.get_state() == UnitOfWorkEntry.STATE_PROCESSED:
                # garbage collector might have reposted this UOW
                self.logger.warning('Skipping unit_of_work: id %s; state %s;'
                                    % (str(message.body), unit_of_work.get_state()), exc_info=False)
                self.consumer.acknowledge(message.delivery_tag)
                return
        except Exception:
            self.logger.error('Safety fuse. Cannot identify unit_of_work %s' % str(message.body), exc_info=True)
            self.consumer.acknowledge(message.delivery_tag)
            return

        try:
            start_id_obj = ObjectId(unit_of_work.get_start_id())
            end_id_obj = ObjectId(unit_of_work.get_end_id())
            start_timestamp = unit_of_work.get_start_timestamp()
            end_timestamp = unit_of_work.get_end_timestamp()

            unit_of_work.set_state(UnitOfWorkEntry.STATE_IN_PROGRESS)
            unit_of_work.set_started_at(datetime.utcnow())
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.start_uow(unit_of_work)

            bulk_threshold = settings['bulk_threshold']
            iteration = 0
            while True:
                source_collection = self._get_source_collection()
                if iteration == 0:
                    query = {'_id': {'$gte': start_id_obj, '$lte': end_id_obj}}
                else:
                    query = {'_id': {'$gt': start_id_obj, '$lte': end_id_obj}}
                if start_timestamp is not None and end_timestamp is not None:
                    # filter out stray objects that happen to fall into the [start_id_obj : end_id_obj] range
                    query[AbstractModel.TIMESTAMP] = {'$gte': start_timestamp, '$lt': end_timestamp}

                cursor = source_collection.find(query).sort('_id', ASCENDING).limit(bulk_threshold)
                count = cursor.count(with_limit_and_skip=True)
                if count == 0 and iteration == 0:
                    msg = 'No entries in %s at range [%s : %s]' \
                          % (str(source_collection.name), unit_of_work.get_start_id(), unit_of_work.get_end_id())
                    self.logger.warning(msg)
                    break
                else:
                    shall_continue, new_start_id = self._process_not_empty_cursor(cursor)
                    if shall_continue:
                        start_id_obj = new_start_id
                        iteration += 1
                    else:
                        break

            msg = 'Cursor exhausted after %s iterations' % str(iteration)
            self.logger.info(msg)
            self.perform_post_processing(unit_of_work.get_timestamp())
            number_of_aggregated_objects = self._flush_aggregated_objects()
            unit_of_work.set_number_of_aggregated_documents(number_of_aggregated_objects)
            unit_of_work.set_number_of_processed_documents(self.performance_ticker.posts_per_job)
            unit_of_work.set_finished_at(datetime.utcnow())
            unit_of_work.set_state(UnitOfWorkEntry.STATE_PROCESSED)
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.finish_uow()
        except Exception as e:
            unit_of_work.set_state(UnitOfWorkEntry.STATE_INVALID)
            unit_of_work_helper.update(self.logger, unit_of_work)
            self.performance_ticker.cancel_uow()
            del self.aggregated_objects
            self.aggregated_objects = dict()
            gc.collect()
            self.logger.error('Safety fuse while processing unit_of_work %s in timeperiod %s : %r'
                              % (message.body, unit_of_work.get_timestamp(), e), exc_info=True)
        finally:
            self.consumer.acknowledge(message.delivery_tag)
            self.consumer.close()
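# A minimal sketch of a concrete aggregator (illustrative only). It shows how the
# abstract methods cooperate: _process_not_empty_cursor walks one bulk of source
# documents, folds each into an aggregated object keyed by (domain_name, timestamp),
# and returns the last seen _id so that _mq_callback can issue the next '$gt' query.
# 'SiteHourlyAggregator', 'SiteHourlySummary', the 'tunnel_site_port' settings key
# and the 'number_of_visits'/'domain_name' fields are hypothetical; only the method
# signatures and the .data attribute (serialized by _flush_aggregated_objects)
# come from the code above.
class SiteHourlySummary(object):
    """ Hypothetical aggregation target: accumulates counters per (domain, timeperiod) """

    def __init__(self, composite_key):
        self.data = {'domain_name': composite_key[0],
                     'timestamp': composite_key[1],
                     'number_of_visits': 0}

    def add(self, document):
        self.data['number_of_visits'] += document.get('number_of_visits', 1)


class SiteHourlyAggregator(AbstractAwareWorker):
    """ Hypothetical worker: aggregates raw site documents into hourly summaries """

    def _get_tunnel_port(self):
        return settings['tunnel_site_port']    # hypothetical settings key

    def _init_target_key(self, domain_name, timestamp):
        return domain_name, timestamp

    def _init_target_object(self, composite_key):
        return SiteHourlySummary(composite_key)

    def _init_source_object(self, document):
        # for this sketch the raw mongo document is used as-is
        return document

    def _process_not_empty_cursor(self, cursor):
        shall_continue = False
        new_start_id = None
        for document in cursor:
            shall_continue = True
            new_start_id = document['_id']
            source = self._init_source_object(document)
            composite_key = self._init_target_key(source['domain_name'],
                                                  source[AbstractModel.TIMESTAMP])
            target = self._get_aggregated_object(composite_key)
            target.add(source)
        return shall_continue, new_start_id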