def insert_uow(self, process_name, start_time, end_time, iteration, time_record):
    """Create a unit_of_work covering ids [0, iteration] and insert it into MongoDB.

    On success the uow id is published to the process queue and logged.

    :param process_name: name of the process the unit_of_work belongs to
    :param start_time: timeperiod start; also stored as the uow timestamp
    :param end_time: timeperiod end
    :param iteration: used as the end id of the uow (start id is 0)
    :param time_record: time record handed through to the log helper
    :return: the inserted UnitOfWorkEntry
    :raise DuplicateKeyError: if a unit_of_work with the given parameters
        already exists; the exception is annotated with the uow coordinates
        before being re-raised
    """
    first_object_id = 0
    last_object_id = iteration

    unit_of_work = UnitOfWorkEntry()
    unit_of_work.set_timestamp(start_time)
    unit_of_work.set_start_id(first_object_id)
    unit_of_work.set_end_id(last_object_id)
    unit_of_work.set_start_timestamp(start_time)
    unit_of_work.set_end_timestamp(end_time)
    unit_of_work.set_created_at(datetime.utcnow())
    unit_of_work.set_source_collection(None)
    unit_of_work.set_target_collection(None)
    # reference the constant via the class, consistent with the other uow factories
    unit_of_work.set_state(UnitOfWorkEntry.STATE_REQUESTED)
    unit_of_work.set_process_name(process_name)
    unit_of_work.set_number_of_retries(0)

    try:
        uow_id = unit_of_work_helper.insert(self.logger, unit_of_work)
    except DuplicateKeyError as e:
        # annotate the exception so the caller can identify the colliding uow
        e.first_object_id = str(first_object_id)
        e.last_object_id = str(last_object_id)
        e.process_name = process_name
        e.timestamp = start_time
        raise  # bare raise preserves the original traceback

    self.publishers.get_publisher(process_name).publish(str(uow_id))
    msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow_id, process_name, start_time)
    self._log_message(INFO, process_name, time_record, msg)
    return unit_of_work
def create_unit_of_work(self, logger, process_name, first_object_id, last_object_id, timestamp):
    """Build a STATE_REQUESTED unit_of_work for the given id range and persist it.

    The same *timestamp* is used for the uow timestamp and for both the start
    and end timestamps; source/target collections are left unset.

    :return: the id of the inserted unit_of_work document
    """
    uow = UnitOfWorkEntry()
    uow.set_timestamp(timestamp)
    uow.set_start_timestamp(timestamp)
    uow.set_end_timestamp(timestamp)
    uow.set_start_id(first_object_id)
    uow.set_end_id(last_object_id)
    uow.set_source_collection(None)
    uow.set_target_collection(None)
    uow.set_state(UnitOfWorkEntry.STATE_REQUESTED)
    uow.set_process_name(process_name)
    uow.set_number_of_retries(0)
    return unit_of_work_helper.insert(logger, uow)
def create_unit_of_work(process_name, first_object_id, last_object_id):
    """Insert a STATE_REQUESTED unit_of_work for *process_name* and return its id.

    Helper used by unit tests: the timestamp is hard-coded to 'UNIT_TEST',
    while the collections and the logger are resolved from the ProcessContext.
    """
    uow = UnitOfWorkEntry()
    uow.set_timestamp('UNIT_TEST')
    uow.set_start_id(first_object_id)
    uow.set_end_id(last_object_id)
    uow.set_source_collection(ProcessContext.get_source_collection(process_name))
    uow.set_target_collection(ProcessContext.get_target_collection(process_name))
    uow.set_state(UnitOfWorkEntry.STATE_REQUESTED)
    uow.set_process_name(process_name)
    uow.set_number_of_retries(0)
    return unit_of_work_helper.insert(ProcessContext.get_logger(process_name), uow)
def compute_scope_of_processing(self, process_name, start_time, end_time, time_record):
    """Read the source collection and identify the id slice for processing.

    Finds the smallest and largest ``_id`` among documents whose TIMESTAMP
    falls in the half-open interval [start_time, end_time), builds a
    STATE_REQUESTED unit_of_work covering that id range, inserts it into
    MongoDB and publishes its id to the process queue.

    :return: the inserted UnitOfWorkEntry
    :raise LookupError: if the timeperiod holds no documents
    :raise DuplicateKeyError: if an identical unit_of_work already exists;
        the exception is annotated with the uow coordinates before re-raise
    """
    source_collection_name = ProcessContext.get_source_collection(process_name)
    target_collection_name = ProcessContext.get_target_collection(process_name)
    source_collection = CollectionContext.get_collection(self.logger, source_collection_name)

    # half-open interval: include start_time, exclude end_time
    query = {AbstractModel.TIMESTAMP: {'$gte': start_time, '$lt': end_time}}
    # NOTE(review): spec=/fields= are legacy PyMongo keyword args — kept as-is
    asc_search = source_collection.find(spec=query, fields='_id').sort('_id', ASCENDING).limit(1)
    if asc_search.count() == 0:
        raise LookupError('No messages in timeperiod: %s:%s in collection %s'
                          % (start_time, end_time, source_collection_name))
    first_object_id = asc_search[0]['_id']

    # if the ascending probe found a document, the descending one will too
    dec_search = source_collection.find(spec=query, fields='_id').sort('_id', DESCENDING).limit(1)
    last_object_id = dec_search[0]['_id']

    unit_of_work = UnitOfWorkEntry()
    unit_of_work.set_timestamp(start_time)
    unit_of_work.set_start_id(str(first_object_id))
    unit_of_work.set_end_id(str(last_object_id))
    unit_of_work.set_start_timestamp(start_time)
    unit_of_work.set_end_timestamp(end_time)
    unit_of_work.set_created_at(datetime.utcnow())
    unit_of_work.set_source_collection(source_collection_name)
    unit_of_work.set_target_collection(target_collection_name)
    # reference the constant via the class, consistent with the other uow factories
    unit_of_work.set_state(UnitOfWorkEntry.STATE_REQUESTED)
    unit_of_work.set_process_name(process_name)
    unit_of_work.set_number_of_retries(0)

    try:
        uow_id = unit_of_work_helper.insert(self.logger, unit_of_work)
    except DuplicateKeyError as e:
        # annotate the exception so the caller can identify the colliding uow
        e.first_object_id = str(first_object_id)
        e.last_object_id = str(last_object_id)
        e.process_name = process_name
        e.timestamp = start_time
        raise  # bare raise preserves the original traceback

    self.publishers.get_publisher(process_name).publish(str(uow_id))
    msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow_id, process_name, start_time)
    self._log_message(INFO, process_name, time_record, msg)
    return unit_of_work
def _process_single_document(self, document):
    """Inspect one uow document and decide whether it must be re-posted.

    INVALID uows are re-posted immediately; IN_PROGRESS/REQUESTED uows are
    re-posted once they have been inactive longer than REPOST_AFTER_HOURS.
    A uow older than LIFE_SUPPORT_HOURS is cancelled instead of re-posted.
    """
    unit_of_work = UnitOfWorkEntry(document)
    process_name = unit_of_work.get_process_name()
    state = unit_of_work.get_state()
    uow_id = str(document['_id'])

    repost = state == UnitOfWorkEntry.STATE_INVALID
    if not repost and state in (UnitOfWorkEntry.STATE_IN_PROGRESS,
                                UnitOfWorkEntry.STATE_REQUESTED):
        # fall back to creation time when the uow was never started
        last_activity = unit_of_work.get_started_at()
        if last_activity is None:
            last_activity = unit_of_work.get_created_at()
        repost = datetime.utcnow() - last_activity > timedelta(hours=REPOST_AFTER_HOURS)

    if not repost:
        return

    creation_time = unit_of_work.get_created_at()
    if datetime.utcnow() - creation_time < timedelta(hours=LIFE_SUPPORT_HOURS):
        # still within the life-support window: bump retries and re-publish
        unit_of_work.set_state(UnitOfWorkEntry.STATE_REQUESTED)
        unit_of_work.set_number_of_retries(unit_of_work.get_number_of_retries() + 1)
        unit_of_work_helper.update(self.logger, unit_of_work)
        self.publishers.get_publisher(process_name).publish(uow_id)
        self.logger.info(
            "UOW marked for re-processing: process %s; id %s; attempt %d"
            % (process_name, uow_id, unit_of_work.get_number_of_retries())
        )
        self.performance_ticker.increment()
    else:
        # too old to rescue: cancel instead of re-posting
        unit_of_work.set_state(UnitOfWorkEntry.STATE_CANCELED)
        unit_of_work_helper.update(self.logger, unit_of_work)
        self.logger.info(
            "UOW transfered to STATE_CANCELED: process %s; id %s; attempt %d"
            % (process_name, uow_id, unit_of_work.get_number_of_retries())
        )