def retrieve_for_timestamp(self, timestamp, unprocessed_only):
    """ method iterates through all timetable collections (hourly, daily, monthly, yearly)
    and returns records matching the given timestamp """
    resp = dict()
    resp.update(self._search_by_level(
        CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_HOURLY), timestamp, unprocessed_only))
    resp.update(self._search_by_level(
        CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_DAILY), timestamp, unprocessed_only))
    resp.update(self._search_by_level(
        CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_MONTHLY), timestamp, unprocessed_only))
    resp.update(self._search_by_level(
        CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_YEARLY), timestamp, unprocessed_only))
    return resp
def _get_timetable_collection(self, process_name):
    """ timetable stores timeperiods in 4 collections: hourly, daily, monthly and yearly;
    method looks up the proper timetable collection based on the process TIME_QUALIFIER """
    qualifier = ProcessContext.get_time_qualifier(process_name)
    if qualifier == ProcessContext.QUALIFIER_HOURLY:
        collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_HOURLY)
    elif qualifier == ProcessContext.QUALIFIER_DAILY:
        collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_DAILY)
    elif qualifier == ProcessContext.QUALIFIER_MONTHLY:
        collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_MONTHLY)
    elif qualifier == ProcessContext.QUALIFIER_YEARLY:
        collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_YEARLY)
    else:
        raise ValueError('unknown time qualifier: %s for %s' % (qualifier, process_name))
    return collection
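# The if/elif ladder above could also be expressed as a table-driven dispatch.
# A minimal sketch of that alternative - same qualifiers and collection names as
# above, but the mapping and the function name are illustrative, not part of the
# original module:
QUALIFIER_TO_COLLECTION = {
    ProcessContext.QUALIFIER_HOURLY: COLLECTION_TIMETABLE_HOURLY,
    ProcessContext.QUALIFIER_DAILY: COLLECTION_TIMETABLE_DAILY,
    ProcessContext.QUALIFIER_MONTHLY: COLLECTION_TIMETABLE_MONTHLY,
    ProcessContext.QUALIFIER_YEARLY: COLLECTION_TIMETABLE_YEARLY,
}

def _get_timetable_collection_mapped(self, process_name):
    """ hypothetical variant of _get_timetable_collection built on a lookup table """
    qualifier = ProcessContext.get_time_qualifier(process_name)
    try:
        return CollectionContext.get_collection(self.logger, QUALIFIER_TO_COLLECTION[qualifier])
    except KeyError:
        raise ValueError('unknown time qualifier: %s for %s' % (qualifier, process_name))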
def retrieve_configuration(logger, box_id):
    """ method reads box configuration from MongoDB """
    collection = CollectionContext.get_collection(logger, COLLECTION_BOX_CONFIGURATION)
    document = collection.find_one({BoxConfigurationEntry.BOX_ID: box_id})
    if document is None:
        raise LookupError('MongoDB has no process list for box_id = %r' % box_id)
    return BoxConfigurationEntry(document)
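# A minimal usage sketch for retrieve_configuration. The box_id value and the
# logging setup are illustrative assumptions, not part of the original code:
def _example_read_box_configuration():
    import logging
    logger = logging.getLogger(__name__)
    try:
        box_configuration = retrieve_configuration(logger, 'DEV_BOX_001')
        logger.info('box configuration loaded: %r' % box_configuration.get_document())
    except LookupError:
        logger.warning('no configuration registered for this box_id')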
def clean_session_entries():
    """ removes the single-session fixture entries created by the tests """
    connection = CollectionContext.get_collection(logging, COLLECTION_SINGLE_SESSION)
    for i in range(base_fixtures.TOTAL_ENTRIES):
        key = generate_session_composite_key(i, base_fixtures.TOTAL_ENTRIES)
        connection.remove({AbstractModel.DOMAIN_NAME: key[0],
                           AbstractModel.TIMESTAMP: key[1],
                           AbstractModel.FAMILY_USER_PROFILE + '.' + AbstractModel.SESSION_ID: 'session_id_' + str(i)})
def _mq_callback(self, message):
    """ wraps processing of the MQ message with try/except;
    in case an exception breaks the processing, this method:
    - catches the exception
    - logs the exception
    - rejects or cancels the message, depending on whether the error is recoverable """
    try:
        single_session_collection = CollectionContext.get_collection(self.logger, COLLECTION_SINGLE_SESSION)
        raw_data = RawData(message.body)
        query = {AbstractModel.DOMAIN_NAME: raw_data.get_key()[0],
                 AbstractModel.FAMILY_USER_PROFILE + '.' + AbstractModel.SESSION_ID: raw_data.get_session_id()}
        document = single_session_collection.find_one(query)
        if document is None:
            # insert the record
            session = SingleSessionStatistics()

            # input data constraints - both session_id and user_id must be present in MQ message
            session.composite_key(raw_data.get_key()[0], time_helper.raw_to_session(raw_data.get_key()[1]))
            session.set_session_id(raw_data.get_session_id())
            session.set_ip(raw_data.get_ip())
            session.set_total_duration(0)

            session = self.update_session_body(raw_data, session)
            self.add_entry(session, 0, raw_data)
            self.performance_ticker.increment_insert()
        else:
            # update the click_xxx info
            session = SingleSessionStatistics(document)
            session = self.update_session_body(raw_data, session)
            duration = raw_data.get_key()[1] - time_helper.session_to_epoch(session.get_key()[1])
            session.set_total_duration(duration)

            index = session.get_number_of_entries()
            self.add_entry(session, index, raw_data)
            self.performance_ticker.increment_update()

        # throttle acknowledged ("safe") saves to at most one per SAFE_SAVE_INTERVAL
        if time.time() - self._last_safe_save_time < self.SAFE_SAVE_INTERVAL:
            is_safe = False
        else:
            is_safe = True
            self._last_safe_save_time = time.time()

        single_session_collection.save(session.get_document(), safe=is_safe)
        self.consumer.acknowledge(message.delivery_tag)
    except AutoReconnect as e:
        self.logger.error('MongoDB connection error: %r\nRe-queueing message & exiting the worker' % e)
        self.consumer.reject(message.delivery_tag)
        raise e
    except (KeyError, IndexError) as e:
        self.logger.error('Error is considered Unrecoverable: %r\nCancelled message: %r' % (e, message.body))
        self.consumer.cancel(message.delivery_tag)
    except Exception as e:
        self.logger.error('Error is considered Recoverable: %r\nRe-queueing message: %r' % (e, message.body))
        self.consumer.reject(message.delivery_tag)
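# The safe-save throttling above can be read in isolation. A minimal sketch of the
# same idea extracted as a standalone helper - the class name and its fields are
# assumptions mirroring the worker's SAFE_SAVE_INTERVAL / _last_safe_save_time pair,
# for illustration only:
import time

class SafeSaveThrottle(object):
    """ hypothetical helper: permits an acknowledged (safe) write at most once per interval """
    def __init__(self, interval):
        self.interval = interval          # seconds between acknowledged writes
        self.last_safe_save_time = 0.0    # epoch of the last acknowledged write

    def should_save_safely(self):
        if time.time() - self.last_safe_save_time < self.interval:
            return False
        self.last_safe_save_time = time.time()
        return True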
def retrieve_by_id(logger, object_id):
    """ method finds unit_of_work record and returns it to the caller """
    query = {"_id": object_id}
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = "Unit_of_work with ID=%s was not found" % str(object_id)
        logger.warning(msg)
        raise LookupError(msg)
    return UnitOfWorkEntry(db_entry)
def retrieve(logger, process_name):
    """ method finds scheduler_configuration record and returns it to the caller """
    query = {"process_name": process_name}
    collection = CollectionContext.get_collection(logger, COLLECTION_SCHEDULER_CONFIGURATION)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = "SchedulerConfigurationEntry for process=%s was not found" % str(process_name)
        logger.warning(msg)
        raise LookupError(msg)
    return SchedulerConfigurationEntry(db_entry)
def update_scope_of_processing(self, process_name, unit_of_work, start_time, end_time, time_record):
    """ method re-reads the source collection and refines the upper bound of the processing slice """
    source_collection_name = unit_of_work.get_source_collection()
    source_collection = CollectionContext.get_collection(self.logger, source_collection_name)
    query = {AbstractModel.TIMESTAMP: {'$gte': start_time, '$lt': end_time}}
    dec_search = source_collection.find(spec=query, fields=['_id']).sort('_id', DESCENDING).limit(1)
    last_object_id = dec_search[0]['_id']
    unit_of_work.set_end_id(str(last_object_id))
    unit_of_work_helper.update(self.logger, unit_of_work)

    msg = 'Updated range to process for %s in timeperiod %s for collection %s: [%s : %s]' \
          % (process_name, time_record.get_timestamp(), source_collection_name,
             unit_of_work.get_start_id(), str(last_object_id))
    self._log_message(INFO, process_name, time_record, msg)
def retrieve_by_params(logger, process_name, timestamp, start_obj_id, end_obj_id):
    """ method finds the unit_of_work record matching the given parameters and returns it to the caller """
    query = {UnitOfWorkEntry.PROCESS_NAME: process_name,
             UnitOfWorkEntry.TIMESTAMP: timestamp,
             UnitOfWorkEntry.START_OBJ_ID: start_obj_id,
             UnitOfWorkEntry.END_OBJ_ID: end_obj_id}
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = "Unit_of_work satisfying query %r was not found" % query
        logger.warning(msg)
        raise LookupError(msg)
    return UnitOfWorkEntry(db_entry)
def compute_scope_of_processing(self, process_name, start_time, end_time, time_record):
    """ method reads the source collection and identifies the slice for processing """
    source_collection_name = ProcessContext.get_source_collection(process_name)
    target_collection_name = ProcessContext.get_target_collection(process_name)
    source_collection = CollectionContext.get_collection(self.logger, source_collection_name)

    query = {AbstractModel.TIMESTAMP: {'$gte': start_time, '$lt': end_time}}
    asc_search = source_collection.find(spec=query, fields=['_id']).sort('_id', ASCENDING).limit(1)
    if asc_search.count() == 0:
        raise LookupError('No messages in timeperiod: %s:%s in collection %s'
                          % (start_time, end_time, source_collection_name))
    first_object_id = asc_search[0]['_id']

    dec_search = source_collection.find(spec=query, fields=['_id']).sort('_id', DESCENDING).limit(1)
    last_object_id = dec_search[0]['_id']

    unit_of_work = UnitOfWorkEntry()
    unit_of_work.set_timestamp(start_time)
    unit_of_work.set_start_id(str(first_object_id))
    unit_of_work.set_end_id(str(last_object_id))
    unit_of_work.set_start_timestamp(start_time)
    unit_of_work.set_end_timestamp(end_time)
    unit_of_work.set_created_at(datetime.utcnow())
    unit_of_work.set_source_collection(source_collection_name)
    unit_of_work.set_target_collection(target_collection_name)
    unit_of_work.set_state(unit_of_work.STATE_REQUESTED)
    unit_of_work.set_process_name(process_name)
    unit_of_work.set_number_of_retries(0)

    try:
        uow_id = unit_of_work_helper.insert(self.logger, unit_of_work)
    except DuplicateKeyError as e:
        # attach slice details to the exception so the caller can locate the duplicate
        e.first_object_id = str(first_object_id)
        e.last_object_id = str(last_object_id)
        e.process_name = process_name
        e.timestamp = start_time
        raise e

    self.publishers.get_publisher(process_name).publish(str(uow_id))
    msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow_id, process_name, start_time)
    self._log_message(INFO, process_name, time_record, msg)
    return unit_of_work
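# The unit_of_work produced above carries its slice boundaries as stringified
# ObjectIds. A minimal sketch of how a worker might consume that slice - the
# function name and the per-document loop body are assumptions; only the
# start_id/end_id handling mirrors the code above:
def _example_consume_slice(logger, unit_of_work):
    from bson.objectid import ObjectId
    source_collection = CollectionContext.get_collection(logger, unit_of_work.get_source_collection())
    query = {'_id': {'$gte': ObjectId(unit_of_work.get_start_id()),
                     '$lte': ObjectId(unit_of_work.get_end_id())}}
    processed = 0
    for document in source_collection.find(query):
        # hypothetical per-document aggregation would go here
        processed += 1
    logger.info('processed %d documents from slice' % processed)
    return processed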
def create_site_stats(collection, composite_key_function, statistics_klass, seed='RANDOM_SEED_OBJECT'):
    """ inserts site-statistics fixture entries into the given collection
    and returns the list of inserted object ids """
    connection = CollectionContext.get_collection(logging, collection)
    random.seed(seed)
    object_ids = []
    for i in range(TOTAL_ENTRIES):
        key = composite_key_function(i, TOTAL_ENTRIES)
        site_stat = statistics_klass()
        site_stat.composite_key(key[0], key[1])
        site_stat.set_number_of_visits(random.randint(1, 1000))
        site_stat.set_total_duration(random.randint(0, 100))

        site_stat.set_os(_generate_entries('os_', 5, i))
        site_stat.set_browsers(_generate_entries('browser_', 5, i))
        site_stat.set_screen_res({'(320, 240)': 3, '(640, 480)': 5, '(1024, 960)': 7, '(1280, 768)': 9})
        site_stat.set_languages({'ca_en': 3, 'ca_fr': 5, 'ua_uk': 7, 'us_en': 9})
        site_stat.set_countries({'ca': 3, 'fr': 5, 'uk': 7, 'us': 9})

        stat_id = connection.insert(site_stat.get_document(), safe=True)
        object_ids.append(stat_id)
    return object_ids
def create_session_stats(composite_key_function, seed='RANDOM_SEED_OBJECT'):
    """ inserts single-session fixture entries and returns the list of inserted object ids """
    time_array = ['20010303102210', '20010303102212', '20010303102215', '20010303102250']
    connection = CollectionContext.get_collection(logging, COLLECTION_SINGLE_SESSION)
    random.seed(seed)
    object_ids = []
    for i in range(TOTAL_ENTRIES):
        key = composite_key_function(i, TOTAL_ENTRIES)
        session = SingleSessionStatistics()
        session.composite_key(key[0], key[1])
        session.set_session_id('session_id_' + str(i))
        session.set_ip('192.168.0.2')

        if i % 3 == 0:
            session.set_screen_res(240, 360)
        elif i % 5 == 0:
            session.set_screen_res(360, 480)
        else:
            session.set_screen_res(760, 980)

        if i % 2 == 0:
            session.set_os('Linux')
            session.set_browser('FF ' + str(i % 4))
            session.set_language('en_ca')
            session.set_country('ca')
        else:
            session.set_os('Windows')
            session.set_browser('IE ' + str(i % 9))
            session.set_language('ua_uk')
            session.set_country('eu')

        session.set_total_duration(random.randint(0, 200))
        session.set_number_of_pageviews(random.randint(1, 5))
        for index in range(random.randint(1, 4)):
            session.set_number_of_entries(index + 1)
            session.set_entry_timestamp(index, time_array[index])

        sess_id = connection.insert(session.get_document(), safe=True)
        object_ids.append(sess_id)
    return object_ids
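# The fixture creators above pair naturally with clean_session_entries. A minimal
# sketch of that pairing in a unittest case - the test class and assertion are
# hypothetical; only the fixture calls come from this module:
import unittest

class SingleSessionFixtureTest(unittest.TestCase):
    def setUp(self):
        self.object_ids = create_session_stats(generate_session_composite_key)

    def tearDown(self):
        clean_session_entries()

    def test_fixture_count(self):
        self.assertEqual(len(self.object_ids), TOTAL_ENTRIES)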
def start(self):
    """ reads scheduler configuration entries and starts timers to trigger events """
    collection = CollectionContext.get_collection(self.logger, COLLECTION_SCHEDULER_CONFIGURATION)
    cursor = collection.find({})
    if cursor.count() == 0:
        raise LookupError('MongoDB has no scheduler configuration entries')

    for entry in cursor:
        document = SchedulerConfigurationEntry(entry)
        interval = document.get_interval()
        is_active = document.get_process_state() == SchedulerConfigurationEntry.STATE_ON
        process_type = ProcessContext.get_type(document.get_process_name())
        parameters = [document.get_process_name(), document]

        if process_type == TYPE_ALERT:
            function = self.fire_alert
        elif process_type in (TYPE_HORIZONTAL_AGGREGATOR, TYPE_VERTICAL_AGGREGATOR):
            function = self.fire_worker
        elif process_type == TYPE_GARBAGE_COLLECTOR:
            function = self.fire_garbage_collector
        else:
            self.logger.error('Cannot start scheduler for %s since it has no processing function' % process_type)
            continue

        handler = RepeatTimer(interval, function, args=parameters)
        self.thread_handlers[document.get_process_name()] = handler

        if is_active:
            handler.start()
            self.logger.info('Started scheduler for %s:%s, triggering every %d seconds'
                             % (process_type, document.get_process_name(), interval))
        else:
            self.logger.info('Handler for %s:%s registered in Scheduler. Idle until activated.'
                             % (process_type, document.get_process_name()))

    # as Scheduler is now initialized and running - we can safely start its MX
    self.start_mx()
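# RepeatTimer is not shown in this module. A minimal sketch of the semantics the
# scheduler relies on - a timer that re-fires every `interval` seconds until
# cancelled - built on threading.Timer. This is an assumption for illustration,
# not the project's actual class:
import threading

class RepeatTimerSketch(object):
    """ hypothetical stand-in: calls function(*args) every interval seconds """
    def __init__(self, interval, function, args=None):
        self.interval = interval
        self.function = function
        self.args = args if args is not None else []
        self.timer = None

    def _fire(self):
        self.function(*self.args)
        self.start()  # re-arm for the next tick

    def start(self):
        self.timer = threading.Timer(self.interval, self._fire)
        self.timer.daemon = True
        self.timer.start()

    def cancel(self):
        if self.timer is not None:
            self.timer.cancel()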
def _get_source_collection(self):
    """ collection with data for processing """
    return CollectionContext.get_collection(self.logger,
                                            ProcessContext.get_source_collection(self.process_name))
def update_configuration(logger, box_configuration):
    """ method updates box configuration in MongoDB """
    w_number = CollectionContext.get_w_number(logger, COLLECTION_BOX_CONFIGURATION)
    collection = CollectionContext.get_collection(logger, COLLECTION_BOX_CONFIGURATION)
    collection.save(box_configuration.get_document(), safe=True, w=w_number)
def insert(logger, unit_of_work):
    """ inserts a unit of work into MongoDB
    @throws DuplicateKeyError if such a record already exists """
    w_number = CollectionContext.get_w_number(logger, COLLECTION_UNITS_OF_WORK)
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    uow_id = collection.insert(unit_of_work.get_document(), safe=True, w=w_number)
    return uow_id
def remove(logger, uow_id):
    """ removes the unit_of_work record identified by uow_id """
    w_number = CollectionContext.get_w_number(logger, COLLECTION_UNITS_OF_WORK)
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    collection.remove(uow_id, safe=True, w=w_number)
def update(logger, unit_of_work):
    """ method finds the unit_of_work record and persists its changed status """
    w_number = CollectionContext.get_w_number(logger, COLLECTION_UNITS_OF_WORK)
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    collection.save(unit_of_work.get_document(), safe=True, w=w_number)
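# A minimal sketch of a unit_of_work lifecycle using the helpers above: insert,
# re-read by id, change state, persist. STATE_PROCESSED is an assumption for
# illustration - the code in this section only shows STATE_REQUESTED:
def _example_uow_lifecycle(logger, unit_of_work):
    uow_id = insert(logger, unit_of_work)             # may raise DuplicateKeyError
    fresh_copy = retrieve_by_id(logger, uow_id)       # raises LookupError if missing
    fresh_copy.set_state(fresh_copy.STATE_PROCESSED)  # hypothetical terminal state
    update(logger, fresh_copy)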
def load_tree(self):
    """ method iterates through all objects in timetable collections and loads them into the timetable """
    self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_HOURLY))
    self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_DAILY))
    self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_MONTHLY))
    self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_YEARLY))
def update(logger, scheduler_configuration):
    """ method finds the scheduler_configuration record and updates its DB representation """
    w_number = CollectionContext.get_w_number(logger, COLLECTION_SCHEDULER_CONFIGURATION)
    collection = CollectionContext.get_collection(logger, COLLECTION_SCHEDULER_CONFIGURATION)
    collection.save(scheduler_configuration.get_document(), safe=True, w=w_number)
def _get_target_collection(self):
    """ collection to store aggregated documents """
    return CollectionContext.get_collection(self.logger,
                                            ProcessContext.get_target_collection(self.process_name))
def __init__(self, process_name):
    super(GarbageCollectorWorker, self).__init__(process_name)
    self.publishers = PublishersPool(self.logger)
    self.collection = CollectionContext.get_collection(self.logger, COLLECTION_UNITS_OF_WORK)
    self.lock = Lock()