def testStartMultipleModelRunnersAndStopThem(self):
  # Starts several ModelRunners and stops them gracefully to confirm that
  # they can all stop without conflicting with each other: if
  # ModelRunnerProxy doesn't configure subprocess.Popen with
  # `close_fds=True`, then graceful shutdown will fail because the stdin
  # of some child processes will be cloned into those that are started
  # after them, and closing stdin of an earlier ModelRunner child process
  # won't have the desired effect of terminating that process (since other
  # clones of that file descriptor will prevent it from fully closing)
  #
  # TODO send commands to models and verify output

  runners = []

  modelIDs = tuple("abcdef" + str(i) for i in xrange(5))

  with ModelSwapperInterface() as swapper:
    modelInputMQs = tuple(swapper._getModelInputQName(modelID=modelID)
                          for modelID in modelIDs)

  with amqp_test_utils.managedQueueDeleter(modelInputMQs):
    with MessageBusConnector() as bus:
      for mq in modelInputMQs:
        bus.createMessageQueue(mq, durable=True)

    for modelID in modelIDs:
      runners.append(
        slot_agent.ModelRunnerProxy(modelID=modelID,
                                    onTermination=lambda: None,
                                    logger=_LOGGER))

    returnCodes = [runner.stopGracefully() for runner in runners]

    self.assertEqual(returnCodes, [0] * len(runners))
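# The close_fds behavior referenced in the test above, as a minimal
# standalone sketch (an illustration, not part of the original source): with
# the default close_fds=False on Python 2, a child spawned later inherits a
# duplicate of an earlier child's stdin pipe, so closing the parent's end no
# longer delivers EOF to the first child. Passing close_fds=True prevents
# that leak.
import subprocess
import sys

def _spawnChild():
  # Child reads stdin until EOF; it exits only when its stdin fully closes
  return subprocess.Popen(
    [sys.executable, "-c", "import sys; sys.stdin.read()"],
    stdin=subprocess.PIPE,
    close_fds=True)  # don't clone our fds into subsequently spawned children

first = _spawnChild()
second = _spawnChild()  # with close_fds=True, doesn't inherit first.stdin

first.stdin.close()  # EOF actually reaches the first child now
first.wait()
second.stdin.close()
second.wait()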
def run(self):
  with ModelSwapperInterface() as modelSwapper:
    engine = repository.engineFactory()
    while True:
      with engine.connect() as conn:
        pendingStacks = repository.retryOnTransientErrors(
          repository.getAutostackMetricsPendingDataCollection)(conn)

      if not pendingStacks:
        time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
        continue

      # Build a sequence of autostack metric requests
      requests = []
      for autostack, metrics in pendingStacks:
        refBase = len(requests)
        requests.extend(
          AutostackMetricRequest(refID=refBase + i,
                                 autostack=autostack,
                                 metric=metric)
          for i, metric in enumerate(metrics))

      # Collect, aggregate, and stream metric data
      self._processAutostackMetricRequests(engine, requests, modelSwapper)
def runServer():
  # Get the current list of custom metrics
  appConfig = Config("application.conf",
                     os.environ["APPLICATION_CONFIG_PATH"])

  engine = repository.engineFactory(appConfig)
  global gCustomMetrics
  now = datetime.datetime.utcnow()

  with engine.connect() as conn:
    gCustomMetrics = dict(
      (m.name, [m, now]) for m in repository.getCustomMetrics(conn))

  queueName = appConfig.get("metric_listener", "queue_name")

  global gProfiling
  gProfiling = (appConfig.getboolean("debugging", "profiling") or
                LOGGER.isEnabledFor(logging.DEBUG))
  del appConfig

  metricStreamer = MetricStreamer()
  modelSwapper = ModelSwapperInterface()

  with MessageBusConnector() as bus:
    if not bus.isMessageQeueuePresent(queueName):
      bus.createMessageQueue(mqName=queueName, durable=True)
    LOGGER.info("Waiting for messages. To exit, press CTRL+C")

    with bus.consume(queueName) as consumer:
      messages = []
      messageRxTimes = []
      while True:
        message = consumer.pollOneMessage()
        if message is not None:
          messages.append(message)
          if gProfiling:
            messageRxTimes.append(time.time())

        if message is None or len(messages) >= MAX_MESSAGES_PER_BATCH:
          if messages:
            # Process the batch
            try:
              _handleBatch(engine,
                           messages,
                           messageRxTimes,
                           metricStreamer,
                           modelSwapper)
            except Exception:  # pylint: disable=W0703
              LOGGER.exception("Unknown failure in processing messages.")
              # Make sure that we ack messages when there is an unexpected
              # error to avoid getting hung forever on one bad record.

            # Ack all the messages
            messages[-1].ack(multiple=True)
            # Clear the message buffer
            messages = []
            messageRxTimes = []
          else:
            # Queue is empty, wait before retrying
            time.sleep(POLL_DELAY_SEC)
def runConsumerThread(destList, numExpected):
  with ModelSwapperInterface() as swapper:
    with swapper.consumeResults() as consumer:
      for batch in consumer:
        destList.append(batch)
        batch.ack()
        _LOGGER.info("Got result batch=%r", batch)
        if len(destList) == numExpected:
          break
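# A possible usage sketch for the helper above (the timeout and expected
# count are assumptions, not from the original source): run the consumer on
# a daemon thread so that a consumer blocked on an empty results queue
# cannot hang the caller indefinitely.
import threading

collectedBatches = []
consumerThread = threading.Thread(target=runConsumerThread,
                                  args=(collectedBatches, 3))
consumerThread.setDaemon(True)  # let the process exit even if consuming blocks
consumerThread.start()
consumerThread.join(timeout=30)
assert len(collectedBatches) == 3, "timed out waiting for result batches"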
def createHTMModel(modelId, params):
  """ Dispatch command to create HTM model

  :param modelId: unique identifier of the metric row

  :param params: model params for creating a scalar model per ModelSwapper
    interface
  """
  with ModelSwapperInterface() as modelSwapper:
    modelSwapper.defineModel(modelID=modelId, args=params,
                             commandID=createGuid())
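# A hedged sketch of building the params argument (modeled on how the tests
# in this excerpt construct defineModel args; the nupic import paths and the
# metric row ID are assumptions):
from nupic.data.fieldmeta import (FieldMetaInfo,
                                  FieldMetaType,
                                  FieldMetaSpecial)
from nupic.frameworks.opf.common_models.cluster_params import (
  getScalarMetricWithTimeOfDayAnomalyParams)

params = getScalarMetricWithTimeOfDayAnomalyParams(metricData=[0],
                                                   minVal=0,
                                                   maxVal=1000)
# Describe the flat input rows: a timestamp field and a scalar value field
params["inputRecordSchema"] = (
  FieldMetaInfo("c0", FieldMetaType.datetime, FieldMetaSpecial.timestamp),
  FieldMetaInfo("c1", FieldMetaType.float, FieldMetaSpecial.none),
)

createHTMModel("2f1ab6ad", params)  # hypothetical metric row ID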
def testSubmitResults(self, messageBusConnectorClassMock):
  results = [
    ModelCommandResult(commandID="abc", method="testMethod", status=0,
                       args={'key1': 4098, 'key2': 4139}),
    ModelInferenceResult(rowID="foo", status=0, anomalyScore=1),
    ModelInferenceResult(rowID="bar", status=0, anomalyScore=2)
  ]

  messageBusConnectorMock = messageBusConnectorClassMock.return_value

  interface = ModelSwapperInterface()
  modelID = "foofar"

  msg = ResultMessagePackager.marshal(
    modelID=modelID,
    batchState=BatchPackager.marshal(batch=results))

  interface.submitResults(modelID=modelID, results=results)

  mqName = interface._resultsQueueName
  messageBusConnectorMock.publish.assert_called_once_with(mqName, msg,
                                                          persistent=True)
def testStartModelRunnerAndStopIt(self):
  # Simple test that starts a ModelRunner and stops it gracefully
  # TODO send command to model and verify output

  modelID = "abcdef"

  with ModelSwapperInterface() as swapper:
    modelInputMQ = swapper._getModelInputQName(modelID=modelID)

  with amqp_test_utils.managedQueueDeleter(modelInputMQ):
    with MessageBusConnector() as bus:
      bus.createMessageQueue(modelInputMQ, durable=True)

    runner = slot_agent.ModelRunnerProxy(modelID=modelID,
                                         onTermination=lambda: None,
                                         logger=_LOGGER)

    returnCode = runner.stopGracefully()

    self.assertEqual(returnCode, 0)
class SwapController(object):
  """ Orchestrate model swapping across a fixed set of SlotAgent instances
  based on incoming model commands and records.
  """

  # Event methods for the SwapController's event loop. These are handled by
  # instance methods with names constructed as follows:
  # "_handle" + method + "Event" (e.g., _handleStopEventLoopRequestEvent)
  _NEW_INPUT_NOTIFY_METHOD = "NewInputNotify"
  _MODEL_DONE_NOTIFY_METHOD = "ModelDoneNotify"
  _STOP_EVENT_LOOP_REQUEST_METHOD = "StopEventLoopRequest"

  _NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC = 5
  _NOTIFICATION_READER_THREAD_JOIN_TIMEOUT_SEC = 10

  _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD = 1

  def __init__(self, concurrency):
    """
    concurrency: allowed number of model slots
    """
    self._logger = _getLogger()

    self._profiling = (
      ModelSwapperConfig().getboolean("debugging", "profiling") or
      self._logger.isEnabledFor(logging.DEBUG))

    # Allowed number of model slots
    self._concurrency = concurrency

    # Input-reader thread target function sets this when it starts running
    # to let our event loop know that things are off to a good start
    self._notificationReaderStartedEvent = threading.Event()

    self._notificationMutex = threading.Lock()
    # Mutex used to guarantee that no further model input notifications will
    # be added to the main event queue once self._stopNotificationReader is
    # set; _runNotificationReaderThread will not process any more
    # notifications once it detects that this flag is true
    self._stopNotificationReader = False

    # The event loop will exit some time after an event handler sets this
    # flag to True
    self._eventLoopStopPending = False

    # (non-thread-safe) The tuple of all slot agents
    self._slotAgents = tuple(SlotAgent(slotID=i)
                             for i in xrange(concurrency))
    assert self._slotAgents

    # Thread-safe event queue for SwapController
    self._eventQ = Queue.Queue()

    # Main event loop's ModelSwapperInterface instance. MUST NOT be used
    # from other threads because ModelSwapperInterface is not thread-safe
    self._mainSwapper = ModelSwapperInterface()

    # A (non-thread-safe) FIFO of models that are waiting to be scheduled
    # for running; there is incoming data for them that needs to be
    # processed
    self._waitingModelsFIFO = []

    # A (non-thread-safe) map of modelIDs to _RunningModelInfo instances
    self._runningModelsMap = dict()

    # A (non-thread-safe) list of free slot indexes into the
    # self._slotAgents tuple
    self._freeSlots = list(xrange(len(self._slotAgents)))

    # (non-thread-safe) Indexes of SlotAgents pending preemption
    self._pendingPreemptSlotsSet = set()

    self._notificationReaderThread = threading.Thread(
      target=self._runNotificationReaderThread,
      name="%s-input-reader-%s" % (self.__class__.__name__, id(self)))
    # Allow process to exit even if thread is still running
    self._notificationReaderThread.setDaemon(True)

  @logEntryExit(_getLogger, logging.INFO)
  @logExceptions(_getLogger)
  def run(self):
    """ Run SwapController; blocking """
    # Start our input-reader thread
    self._logger.info("Starting Notification Reader thread")
    self._notificationReaderThread.start()
    self._notificationReaderStartedEvent.wait(
      timeout=self._NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC)
    assert self._notificationReaderStartedEvent.is_set(), (
      "Notification-reader thread failed to start in time")
    self._logger.info("Notification Reader started, now entering Event Loop")

    requestedStopOfRemainingModels = False

    while True:
      if self._eventLoopStopPending:
        if not self._runningModelsMap and not self._waitingModelsFIFO:
          # All models are idle now, so close Slot Agents and bail out
          for sa in self._slotAgents:
            sa.close()

          self._logger.info("Closed all Slot Agents; leaving event loop")
          break

        elif (not self._waitingModelsFIFO and
              not requestedStopOfRemainingModels):
          # Only running models remain, so request to stop them gracefully
          assert self._runningModelsMap

          for modelInfo in self._runningModelsMap.itervalues():
            if modelInfo.slotIndex not in self._pendingPreemptSlotsSet:
              self._slotAgents[modelInfo.slotIndex].stopModel()
              self._pendingPreemptSlotsSet.add(modelInfo.slotIndex)

          requestedStopOfRemainingModels = True
          self._logger.info("Requested stop of remaining running models")

      # Get and handle next event
      evt = self._eventQ.get()
      method = evt["method"]
      handler = getattr(self, "_handle" + method + "Event")
      handler(**evt)

  def requestStopTS(self):
    """ [thread-safe; non-blocking] Enqueue a request to stop the
    SwapController. This will cause the instance's run() method to return
    eventually
    """
    # This will prevent the notification reader thread from processing any
    # more notifications. By the time we release the lock, no new
    # notifications will be placed in our event queue, so we can proceed
    # with graceful shutdown.
    with self._notificationMutex:
      self._stopNotificationReader = True

    self._logger.info("Requested stop of notification processing")

    # Asynchronously request graceful shutdown of our event loop. This
    # allows all remaining events from the notification reader to be
    # handled prior to commencing graceful shutdown.
    self._logger.info("Requesting event-loop shutdown")
    self._eventQ.put({"method": self._STOP_EVENT_LOOP_REQUEST_METHOD})

  def _newInputNotifyTS(self, modelID):
    """ [thread-safe] Notify Model Swapper that new input data arrived for
    the given model

    :param modelID: ID of the model for which new data arrived
    """
    self._eventQ.put(
      {"method": self._NEW_INPUT_NOTIFY_METHOD, "modelID": modelID})

  def _modelDoneNotifyTS(self, modelID, exitStatus):
    """ [thread-safe] Notify Model Swapper that a ModelRunner process
    completed. This method is passed as a callback to the SlotAgent that
    runs the model.

    :param modelID: model ID of model that completed execution

    :param exitStatus: the model's ModelRunner process exit status per
      os.WEXITSTATUS
    """
    self._eventQ.put({"method": self._MODEL_DONE_NOTIFY_METHOD,
                      "modelID": modelID,
                      "exitStatus": exitStatus,
                      "endTime": time.time()})

  def _handleStopEventLoopRequestEvent(self, method):  # pylint: disable=W0613
    """ Set a flag to signal our event loop that it's time for graceful
    shutdown of the event loop. The event is enqueued by the "stop request"
    handler after it blocks further processing of notifications by the
    notification reader.
    """
    self._eventLoopStopPending = True
    self._logger.info("Set _eventLoopStopPending")

  def _handleNewInputNotifyEvent(self, method,  # pylint: disable=W0613
                                 modelID):
    """ Notification that new input was queued up for a particular model """
    runningModelInfo = self._runningModelsMap.get(modelID)
    if runningModelInfo is not None:
      # This model is already running
      runningModelInfo.updateTimestamp()

    elif modelID not in self._waitingModelsFIFO:
      # This model was not running and is not awaiting execution

      # TODO: for a large _waitingModelsFIFO, the membership check may be
      #   slow. Consider employing a map of all models of interest with
      #   current state (waiting/pending/running/idle) reflected in the
      #   items for O(1) lookups, but profile first to see if it's a
      #   problem!

      # NOTE: it's possible that the model has already processed all its
      #   input and we're handling this notification belatedly, and this
      #   may result in unnecessary start-up of its ModelRunner. We should
      #   generally be pretty well caught up with notifications, and this
      #   shouldn't be an issue. However, if necessary, we can mitigate
      #   this by checking the model's input queue *just before* starting
      #   the model.

      if self._freeSlots:
        # No models should be waiting if we have a free slot
        assert not self._waitingModelsFIFO, repr(self._waitingModelsFIFO)

        # Assign the model to a free slot
        self._assignModelToFreeSlot(modelID)
      else:
        # This model needs to wait until resources become available
        self._waitingModelsFIFO.append(modelID)

        if self._profiling:
          self._logger.info("{TAG:SWAP.SC.MODEL.WAIT} model=%s; "
                            "numWaitingModels=%s; numPendingPreemptSlots=%s",
                            modelID, len(self._waitingModelsFIFO),
                            len(self._pendingPreemptSlotsSet))

        self._requestPreemptionOfRunningSlotIfNeededAndPossible()

  def _handleModelDoneNotifyEvent(self, method,  # pylint: disable=W0613
                                  modelID, exitStatus, endTime):
    """ Notification that a particular model completed execution and the
    SlotAgent that ran it is now available for a new assignment.

    exitStatus: the exit status of the ModelRunner process (per
      os.WEXITSTATUS)
    """
    doneModelInfo = self._runningModelsMap.pop(modelID)

    if self._profiling:
      self._logger.info(
        "{TAG:SWAP.SC.MODEL.DONE} model=%s; slot=%d; exitStatus=%d; "
        "duration=%s; numRunningModels=%s; numWaitingModels=%s",
        modelID, doneModelInfo.slotIndex, exitStatus,
        endTime - doneModelInfo.startTime, len(self._runningModelsMap),
        len(self._waitingModelsFIFO))

    assert doneModelInfo.slotIndex not in self._freeSlots
    assert 0 <= doneModelInfo.slotIndex < len(self._slotAgents)
    assert len(self._freeSlots) < len(self._slotAgents)

    self._freeSlots.append(doneModelInfo.slotIndex)

    # Drop the slot from pending-preempt set in case it was scheduled for
    # preemption
    self._pendingPreemptSlotsSet.discard(doneModelInfo.slotIndex)

    # Ack SlotAgent to ready it for the next model
    self._slotAgents[doneModelInfo.slotIndex].releaseSlot()

    if self._mainSwapper.modelInputPending(modelID):
      # There is more unprocessed input data for the completed model, so
      # notify ourselves asynchronously to schedule this model
      self._newInputNotifyTS(modelID)

    if self._waitingModelsFIFO:
      # Start a waiting model, now that we know there is a free slot
      newModelID = self._waitingModelsFIFO.pop(0)
      self._assignModelToFreeSlot(newModelID)

      self._requestPreemptionOfRunningSlotIfNeededAndPossible()

  def _assignModelToFreeSlot(self, modelID):
    """ Assign the given model to a free slot """
    assert modelID not in self._runningModelsMap
    assert modelID not in self._waitingModelsFIFO

    freeSlotIndex = self._freeSlots.pop()

    self._slotAgents[freeSlotIndex].startModel(
      modelID=modelID,
      modelFinishedCallback=partial(self._modelDoneNotifyTS, modelID))

    self._runningModelsMap[modelID] = _RunningModelInfo(freeSlotIndex)

    assert ((len(self._runningModelsMap) + len(self._freeSlots)) ==
            len(self._slotAgents)), (
      len(self._runningModelsMap), len(self._freeSlots),
      len(self._slotAgents))

    self._logger.debug(
      "{TAG:SWAP.SC.MODEL.ASSIGN} model=%s; slot=%s; numRunningModels=%s; "
      "numFreeSlots=%s; numWaitingModels=%s; numPendingPreemptSlots=%s",
      modelID, freeSlotIndex, len(self._runningModelsMap),
      len(self._freeSlots), len(self._waitingModelsFIFO),
      len(self._pendingPreemptSlotsSet))

  def _requestPreemptionOfRunningSlotIfNeededAndPossible(self):
    """ Schedule a single slot for preemption if needed and possible """
    # There shouldn't be any free slots when we're asked to preempt
    assert not self._freeSlots, repr(self._freeSlots)

    if (len(self._waitingModelsFIFO) <= len(self._pendingPreemptSlotsSet) or
        len(self._pendingPreemptSlotsSet) >= len(self._slotAgents)):
      # Not needed or no preemptable slots
      return

    # Find an LRU non-pending-preempt busy slot agent, and request to
    # preempt it
    lru = sorted(
      ((i.slotIndex, i.timestamp)
       for i in self._runningModelsMap.itervalues()
       if i.slotIndex not in self._pendingPreemptSlotsSet),
      key=lambda element: element[1])

    slotIndex, timestamp = lru.pop(0)

    # Request preemption of the LRU slot
    self._slotAgents[slotIndex].stopModel()
    self._pendingPreemptSlotsSet.add(slotIndex)

    if self._profiling:
      self._logger.info(
        "{TAG:SWAP.SC.SLOT.PREEMPT.REQ} slot=%d with timestamp=%s; "
        "numWaitingModels=%s; numPendingPreemptSlots=%s",
        slotIndex, timestamp, len(self._waitingModelsFIFO),
        len(self._pendingPreemptSlotsSet))

  @abortProgramOnAnyException(
    _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD,
    logger=_getLogger())
  @logEntryExit(_getLogger, logging.INFO)
  @logExceptions(_getLogger)
  def _runNotificationReaderThread(self):
    """ Read model data notifications and pass them to the event loop """
    self._logger.info("Notification Reader thread is running")

    # Let the main event loop know that this thread started successfully
    self._notificationReaderStartedEvent.set()

    with ModelSwapperInterface() as swapperAPI:
      # First, make sure our notification message queue exists, so we don't
      # miss any new notifications while we're checking for models with
      # pending input
      self._logger.info("SWAPPER_API: %r", swapperAPI)
      swapperAPI.initSchedulerNotification()

      # At start, notify main event loop of each model whose input is
      # non-empty
      self._logger.info("Checking for models with pending input")

      i = 0
      for i, modelID in enumerate(swapperAPI.getModelsWithInputPending(), 1):
        self._logger.debug("Input pending for model=%s", modelID)
        self._newInputNotifyTS(modelID=modelID)

      self._logger.info("%s model(s) had pending input", i)

      # Service the SwapController's input queue until stop is requested
      with swapperAPI.consumeModelSchedulerNotifications() as consumer:
        numHandledNotifications = 0
        try:
          for notification in consumer:
            with self._notificationMutex:
              if self._stopNotificationReader:
                self._logger.info(
                  "Notification reader exiting due to stop request")
                break

              self._newInputNotifyTS(modelID=notification.value)

            notification.ack()

            numHandledNotifications += 1
          else:
            raise Exception("Unexpected termination of consumer loop in "
                            "Notification Reader")
        finally:
          self._logger.info(
            "Control is leaving notification reader loop after processing "
            "%s notifications", numHandledNotifications)
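# A minimal wiring sketch (an assumption, not from the original source):
# SwapController.run() blocks in the main thread, while the thread-safe
# requestStopTS() is hooked to SIGTERM so the service can drain its event
# loop and shut down gracefully.
import signal

def _runModelSchedulerService(concurrency=4):  # slot count is illustrative
  swapController = SwapController(concurrency=concurrency)

  def _onSigterm(signum, frame):  # pylint: disable=W0613
    # Per requestStopTS() above, this only flips a flag under a lock and
    # enqueues a stop event for the event loop to handle
    swapController.requestStopTS()

  signal.signal(signal.SIGTERM, _onSigterm)

  swapController.run()  # returns after the event loop finishes shutdown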
def testModelSwapper(self):
  """Simple end-to-end test of the model swapper system."""

  modelSchedulerSubprocess = self._startModelSchedulerSubprocess()
  self.addCleanup(lambda: modelSchedulerSubprocess.kill()
                  if modelSchedulerSubprocess.returncode is None
                  else None)

  modelID = "foobar"

  resultBatches = []

  with ModelSwapperInterface() as swapperAPI:
    possibleModels = getScalarMetricWithTimeOfDayParams(metricData=[0],
                                                        minVal=0,
                                                        maxVal=1000)

    # Submit requests including a model creation command and two data rows.
    args = possibleModels[0]
    args["inputRecordSchema"] = (
      FieldMetaInfo("c0", FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo("c1", FieldMetaType.float,
                    FieldMetaSpecial.none),
    )

    # Define the model
    _LOGGER.info("Defining the model")
    swapperAPI.defineModel(modelID=modelID, args=args,
                           commandID="defineModelCmd1")

    # Attempt to define the same model again
    _LOGGER.info("Defining the model again")
    swapperAPI.defineModel(modelID=modelID, args=args,
                           commandID="defineModelCmd2")

    # Send input rows to the model
    inputRows = [
      ModelInputRow(rowID="rowfoo",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 00), 5.3]),
      ModelInputRow(rowID="rowbar",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 15), 2.4]),
    ]
    _LOGGER.info("Submitting batch of %d input rows...", len(inputRows))
    swapperAPI.submitRequests(modelID=modelID, requests=inputRows)

    _LOGGER.info("These models have pending input: %s",
                 swapperAPI.getModelsWithInputPending())

    # Retrieve all results.
    # NOTE: We collect results via a background thread to avoid deadlocking
    # the test runner in the event consuming blocks unexpectedly
    _LOGGER.info("Reading all batches of results...")

    numBatchesExpected = 3
    resultBatches.extend(self._consumeResults(numBatchesExpected,
                                              timeout=20))

    self.assertEqual(len(resultBatches), numBatchesExpected)

    with MessageBusConnector() as bus:
      # The results message queue should be empty now
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

    # Delete the model
    _LOGGER.info("Deleting the model")
    swapperAPI.deleteModel(modelID=modelID,
                           commandID="deleteModelCmd1")
    _LOGGER.info("Waiting for model deletion result")
    resultBatches.extend(self._consumeResults(1, timeout=20))

    self.assertEqual(len(resultBatches), 4)

    with MessageBusConnector() as bus:
      # The results message queue should be empty now
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

      # The model input queue should be deleted now
      self.assertFalse(
        bus.isMessageQeueuePresent(
          swapperAPI._getModelInputQName(modelID=modelID)))

    # Try deleting the model again, to make sure there are no exceptions
    _LOGGER.info("Attempting to delete the model again")
    swapperAPI.deleteModel(modelID=modelID,
                           commandID="deleteModelCmd1")

  # Verify results

  # First result batch should be the first defineModel result
  batch = resultBatches[0]
  self.assertEqual(batch.modelID, modelID)
  self.assertEqual(len(batch.objects), 1)

  result = batch.objects[0]
  self.assertIsInstance(result, ModelCommandResult)
  self.assertEqual(result.method, "defineModel")
  self.assertEqual(result.status, htmengineerrno.SUCCESS)
  self.assertEqual(result.commandID, "defineModelCmd1")

  # The second result batch should be for the second defineModel result for
  # the same model
  batch = resultBatches[1]
  self.assertEqual(batch.modelID, modelID)
  self.assertEqual(len(batch.objects), 1)

  result = batch.objects[0]
  self.assertIsInstance(result, ModelCommandResult)
  self.assertEqual(result.method, "defineModel")
  self.assertEqual(result.status, htmengineerrno.SUCCESS)
  self.assertEqual(result.commandID, "defineModelCmd2")

  # The third batch should be for the two input rows
  batch = resultBatches[2]
  self.assertEqual(batch.modelID, modelID)
  self.assertEqual(len(batch.objects), len(inputRows))

  for inputRow, result in zip(inputRows, batch.objects):
    self.assertIsInstance(result, ModelInferenceResult)
    self.assertEqual(result.status, htmengineerrno.SUCCESS)
    self.assertEqual(result.rowID, inputRow.rowID)
    self.assertIsInstance(result.anomalyScore, float)

  # The fourth batch should be for the "deleteModel"
  batch = resultBatches[3]
  self.assertEqual(batch.modelID, modelID)
  self.assertEqual(len(batch.objects), 1)

  result = batch.objects[0]
  self.assertIsInstance(result, ModelCommandResult)
  self.assertEqual(result.method, "deleteModel")
  self.assertEqual(result.status, htmengineerrno.SUCCESS)
  self.assertEqual(result.commandID, "deleteModelCmd1")

  # Signal Model Scheduler Service subprocess to shut down and wait for it
  waitResult = dict()

  def runWaiterThread():
    try:
      waitResult["returnCode"] = modelSchedulerSubprocess.wait()
    except:
      _LOGGER.exception("Waiting for modelSchedulerSubprocess failed")
      waitResult["exceptionInfo"] = traceback.format_exc()
      raise
    return

  modelSchedulerSubprocess.terminate()
  waiterThread = threading.Thread(target=runWaiterThread)
  waiterThread.setDaemon(True)
  waiterThread.start()
  waiterThread.join(timeout=30)
  self.assertFalse(waiterThread.isAlive())

  self.assertEqual(waitResult["returnCode"], 0, msg=repr(waitResult))
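# One plausible shape for the test class's _consumeResults helper used above
# (an assumption; the real helper isn't shown in this excerpt). It applies
# the same background-thread pattern as runConsumerThread so a stuck
# consumer can't deadlock the test runner.
def _consumeResults(self, numExpectedBatches, timeout):
  resultBatches = []
  consumerThread = threading.Thread(target=runConsumerThread,
                                    args=(resultBatches,
                                          numExpectedBatches))
  consumerThread.setDaemon(True)  # don't block test-process exit
  consumerThread.start()
  consumerThread.join(timeout=timeout)
  return resultBatches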
def testCollectAndPublishMetrics(self):
  # Start Metric Collector, create a set of Metrics, wait for it to collect
  # some metrics and to publish them to the metric_exchange, then validate
  # attributes of the published metrics.
  #
  # TODO Add more metric types
  # TODO Deeper validation of the published metrics

  # Start our own instance of metric collector and wait for data points
  with self._startModelSchedulerSubprocess() as modelSchedulerSubprocess, \
      self._startMetricCollectorSubprocess() as metricCollectorSubprocess:
    # Create some models for metric collector to harvest
    region = "us-west-2"
    namespace = "AWS/EC2"
    resourceType = ResourceTypeNames.EC2_INSTANCE

    engine = repository.engineFactory()
    adapter = createCloudwatchDatasourceAdapter()

    ec2Instances = adapter.describeResources(region=region,
                                             resourceType=resourceType)

    self.assertGreater(len(ec2Instances), 0)

    maxModels = 10

    ec2Instances = ec2Instances[:min(maxModels, Quota.getInstanceQuota())]

    metricInstances = []

    _LOGGER.info("Starting %d models", len(ec2Instances))
    self.assertGreater(len(ec2Instances), 0)
    for ec2Instance in ec2Instances:
      metricSpec = {"region": region,
                    "namespace": namespace,
                    "metric": "CPUUtilization",
                    "dimensions": {"InstanceId": ec2Instance["resID"]}}

      modelSpec = {"datasource": "cloudwatch",
                   "metricSpec": metricSpec}

      metricId = adapter.monitorMetric(modelSpec)

      with engine.connect() as conn:
        repository.setMetricStatus(conn, metricId, MetricStatus.ACTIVE)

      metricInstances.append(metricId)

    _LOGGER.info("Waiting for results from models...")

    seenMetricIDs = set()
    allMetricIDs = set(metricInstances)

    # Register a timeout so we won't deadlock the test
    def onTimeout(resultsQueueName):
      _LOGGER.error(
        "Timed out waiting to get results from models; numResults=%d; "
        "expected=%d", len(seenMetricIDs), len(allMetricIDs))

      # HACK delete model swapper results queue to abort the consumer
      try:
        with MessageBusConnector() as bus:
          bus.deleteMessageQueue(resultsQueueName)
      except Exception:
        _LOGGER.exception("Failed to delete results mq=%s",
                          resultsQueueName)
        raise

    with ModelSwapperInterface() as modelSwapper:
      with modelSwapper.consumeResults() as consumer:
        timer = threading.Timer(120, onTimeout,
                                args=[modelSwapper._resultsQueueName])
        timer.start()
        try:
          for batch in consumer:
            seenMetricIDs.add(batch.modelID)
            batch.ack()
            if seenMetricIDs == allMetricIDs:
              break
          else:
            self.fail(
              "Expected %d results, but got only %d: %s"
              % (len(allMetricIDs), len(seenMetricIDs), seenMetricIDs,))

          _LOGGER.info("Got %d results from models", len(seenMetricIDs))
        finally:
          timer.cancel()

    # Terminate the metric collector and model scheduler subprocesses
    # gracefully to avoid too much error logging junk on the terminal
    metricCollectorSubprocess.send_signal(signal.SIGINT)
    modelSchedulerSubprocess.send_signal(signal.SIGINT)
def run(self):
  """ Collect metric data and status for active metrics """
  # NOTE: the process pool must be created BEFORE this main (parent) process
  # creates any global or class-level shared resources (e.g., boto
  # connection) that would have undesirable consequences when replicated
  # into and used by forked child processes (e.g., the same MySQL connection
  # socket file descriptor used by multiple processes). And we can't take
  # advantage of the process Pool's maxtasksperchild feature either (for the
  # same reason)
  self._log.info("Starting grok Metric Collector")
  resultsQueue = multiprocessing.Manager().JoinableQueue()

  recvPipe, sendPipe = multiprocessing.Pipe(False)

  processPool = multiprocessing.Pool(
    processes=self._WORKER_PROCESS_POOL_SIZE,
    maxtasksperchild=None)

  try:
    with ModelSwapperInterface() as modelSwapper:
      engine = repository.engineFactory()
      while True:
        startTime = time.time()

        if startTime > self._nextCacheGarbageCollectionTime:
          # TODO: unit-test
          self._garbageCollectInfoCache()

        # Determine which metrics are due for an update
        metricsToUpdate = self._getCandidateMetrics(engine)

        filterDuration = time.time() - startTime

        if not metricsToUpdate:
          time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC)
          continue

        # Collect metric data
        collectionStartTime = time.time()

        poolResults = self._collectDataForMetrics(metricsToUpdate,
                                                  processPool,
                                                  resultsQueue)

        # Process/dispatch results in parallel in another thread as results
        # become available in resultsQueue
        dispatchThread = threading.Thread(
          target=self._processAndDispatchThreadTarget,
          args=(engine,
                metricsToUpdate,
                resultsQueue,
                modelSwapper,
                sendPipe))
        dispatchStartTime = time.time()
        dispatchThread.start()

        # Synchronize with processPool
        poolResults.wait()  # Wait for collection tasks to complete

        metricPollDuration = time.time() - collectionStartTime

        resultsQueue.join()  # Block until all tasks completed...

        # Synchronize with dispatchThread: signal to dispatchThread that
        # there are no more results to process
        resultsQueue.put(self._SENTINEL)
        resultsQueue.join()
        numEmpty, numErrors = recvPipe.recv()  # Get dispatchThread stats

        dispatchDuration = time.time() - dispatchStartTime

        self._log.info(
          "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; "
          "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)",
          len(metricsToUpdate), numEmpty, numErrors,
          time.time() - startTime, filterDuration, metricPollDuration,
          dispatchDuration)
  finally:
    self._log.info("Exiting Metric Collector run-loop")
    processPool.terminate()
    processPool.join()
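# A standalone sketch of the fork-ordering hazard described in the NOTE
# above (illustrative only, not from the original source): any
# connection-like resource created before the pool is forked into every
# worker, so the same socket file descriptor ends up shared by multiple
# processes.
import multiprocessing

def _worker(_arg):
  pass  # each worker would otherwise inherit the parent's open sockets

def main():
  # Correct order: fork the worker processes FIRST...
  pool = multiprocessing.Pool(processes=4)

  # ...THEN create shared resources (e.g., DB/boto connections), which now
  # exist only in the parent process:
  # engine = repository.engineFactory()

  try:
    pool.map(_worker, range(4))
  finally:
    pool.terminate()
    pool.join()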
def testCloneModel(self):
  modelSchedulerSubprocess = self._startModelSchedulerSubprocess()
  self.addCleanup(lambda: modelSchedulerSubprocess.kill()
                  if modelSchedulerSubprocess.returncode is None
                  else None)

  modelID = "abc"
  destModelID = "def"

  resultBatches = []

  with ModelSwapperInterface() as swapperAPI:
    args = getScalarMetricWithTimeOfDayAnomalyParams(metricData=[0],
                                                     minVal=0,
                                                     maxVal=1000)

    # Submit requests including a model creation command and two data rows.
    args["inputRecordSchema"] = (
      FieldMetaInfo("c0", FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo("c1", FieldMetaType.float,
                    FieldMetaSpecial.none),
    )

    # Define the model
    _LOGGER.info("Defining the model")
    swapperAPI.defineModel(modelID=modelID, args=args,
                           commandID="defineModelCmd1")

    resultBatches.extend(self._consumeResults(1, timeout=20))
    self.assertEqual(len(resultBatches), 1)

    # Clone the just-defined model
    _LOGGER.info("Cloning model")
    swapperAPI.cloneModel(modelID, destModelID,
                          commandID="cloneModelCmd1")

    resultBatches.extend(self._consumeResults(1, timeout=20))
    self.assertEqual(len(resultBatches), 2)

    # Send input rows to the clone
    inputRows = [
      ModelInputRow(rowID="rowfoo",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 00), 5.3]),
      ModelInputRow(rowID="rowbar",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 15), 2.4]),
    ]
    _LOGGER.info("Submitting batch of %d input rows...", len(inputRows))
    swapperAPI.submitRequests(modelID=destModelID, requests=inputRows)

    _LOGGER.info("These models have pending input: %s",
                 swapperAPI.getModelsWithInputPending())

    resultBatches.extend(self._consumeResults(1, timeout=20))
    self.assertEqual(len(resultBatches), 3)

    with MessageBusConnector() as bus:
      # The results message queue should be empty now
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

    # Delete the model
    _LOGGER.info("Deleting the model")
    swapperAPI.deleteModel(modelID=destModelID,
                           commandID="deleteModelCmd1")
    _LOGGER.info("Waiting for model deletion result")
    resultBatches.extend(self._consumeResults(1, timeout=20))
    self.assertEqual(len(resultBatches), 4)

    with MessageBusConnector() as bus:
      # The results message queue should be empty now
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

      # The model input queue should be deleted now
      self.assertFalse(
        bus.isMessageQeueuePresent(
          swapperAPI._getModelInputQName(modelID=destModelID)))

  # Verify results

  # First result batch should be the defineModel result
  batch = resultBatches[0]
  self.assertEqual(batch.modelID, modelID)
  self.assertEqual(len(batch.objects), 1)

  result = batch.objects[0]
  self.assertIsInstance(result, ModelCommandResult)
  self.assertEqual(result.method, "defineModel")
  self.assertEqual(result.status, htmengineerrno.SUCCESS)
  self.assertEqual(result.commandID, "defineModelCmd1")

  # The second result batch should be for the cloneModel result
  batch = resultBatches[1]
  self.assertEqual(batch.modelID, modelID)
  self.assertEqual(len(batch.objects), 1)

  result = batch.objects[0]
  self.assertIsInstance(result, ModelCommandResult)
  self.assertEqual(result.method, "cloneModel")
  self.assertEqual(result.status, htmengineerrno.SUCCESS)
  self.assertEqual(result.commandID, "cloneModelCmd1")

  # The third batch should be for the two input rows
  batch = resultBatches[2]
  self.assertEqual(batch.modelID, destModelID)
  self.assertEqual(len(batch.objects), len(inputRows))

  for inputRow, result in zip(inputRows, batch.objects):
    self.assertIsInstance(result, ModelInferenceResult)
    self.assertEqual(result.status, htmengineerrno.SUCCESS)
    self.assertEqual(result.rowID, inputRow.rowID)
    self.assertIsInstance(result.anomalyScore, float)
    self.assertIsInstance(result.multiStepBestPredictions, dict)

  # The fourth batch should be for the "deleteModel"
  batch = resultBatches[3]
  self.assertEqual(batch.modelID, destModelID)
  self.assertEqual(len(batch.objects), 1)

  result = batch.objects[0]
  self.assertIsInstance(result, ModelCommandResult)
  self.assertEqual(result.method, "deleteModel")
  self.assertEqual(result.status, htmengineerrno.SUCCESS)
  self.assertEqual(result.commandID, "deleteModelCmd1")

  # Signal Model Scheduler Service subprocess to shut down and wait for it
  waitResult = dict()

  def runWaiterThread():
    try:
      waitResult["returnCode"] = modelSchedulerSubprocess.wait()
    except:
      _LOGGER.exception("Waiting for modelSchedulerSubprocess failed")
      waitResult["exceptionInfo"] = traceback.format_exc()
      raise
    return

  modelSchedulerSubprocess.terminate()
  waiterThread = threading.Thread(target=runWaiterThread)
  waiterThread.setDaemon(True)
  waiterThread.start()
  waiterThread.join(timeout=30)
  self.assertFalse(waiterThread.isAlive())

  self.assertEqual(waitResult["returnCode"], 0, msg=repr(waitResult))
def deleteHTMModel(modelId):
  with ModelSwapperInterface() as modelSwapper:
    modelSwapper.deleteModel(modelID=modelId, commandID=createGuid())
def _auxTestRunModelWithFullThenIncrementalCheckpoints(self,
                                                       classifierEnabled):
  modelID = "foobar"

  checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()

  args = getScalarMetricWithTimeOfDayAnomalyParams(metricData=[0],
                                                   minVal=0,
                                                   maxVal=1000)

  args["modelConfig"]["modelParams"]["clEnable"] = classifierEnabled

  # Submit requests including a model creation command and two data rows.
  args["inputRecordSchema"] = (
    FieldMetaInfo("c0", FieldMetaType.datetime,
                  FieldMetaSpecial.timestamp),
    FieldMetaInfo("c1", FieldMetaType.float,
                  FieldMetaSpecial.none),
  )

  with ModelSwapperInterface() as swapperAPI:
    # Define the model
    _LOGGER.info("Defining the model")
    swapperAPI.defineModel(modelID=modelID, args=args,
                           commandID="defineModelCmd1")

    # Send input rows to the model
    inputRows = [
      ModelInputRow(rowID="rowfoo",
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 00), 5.3]),
      ModelInputRow(rowID="rowbar",
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 15), 2.4]),
    ]
    _LOGGER.info("Submitting batch of %d input rows with ids=[%s..%s]...",
                 len(inputRows), inputRows[0].rowID, inputRows[-1].rowID)
    swapperAPI.submitRequests(modelID=modelID, requests=inputRows)

    # Run model_runner and collect results
    with self._startModelRunnerSubprocess(modelID) as modelRunnerProcess:
      resultBatches = self._consumeResults(numExpectedBatches=2,
                                           timeout=15)
      self._waitForProcessToStopAndCheck(modelRunnerProcess)

    with MessageBusConnector() as bus:
      # The results message queue should be empty now
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

    self.assertEqual(len(resultBatches), 2, repr(resultBatches))

    # First result batch should be the first defineModel result
    batch = resultBatches[0]
    self.assertEqual(batch.modelID, modelID)
    self.assertEqual(len(batch.objects), 1)

    result = batch.objects[0]
    self.assertIsInstance(result, ModelCommandResult)
    self.assertEqual(result.method, "defineModel")
    self.assertEqual(result.status, htmengineerrno.SUCCESS)
    self.assertEqual(result.commandID, "defineModelCmd1")

    # The second result batch should be for the two input rows
    batch = resultBatches[1]
    self.assertEqual(batch.modelID, modelID)
    self.assertEqual(len(batch.objects), len(inputRows))

    for inputRow, result in zip(inputRows, batch.objects):
      self.assertIsInstance(result, ModelInferenceResult)
      self.assertEqual(result.status, htmengineerrno.SUCCESS)
      self.assertEqual(result.rowID, inputRow.rowID)
      self.assertIsInstance(result.anomalyScore, float)
      if classifierEnabled:
        self.assertIsInstance(result.multiStepBestPredictions, dict)
      else:
        self.assertIsNone(result.multiStepBestPredictions)

    # Verify model checkpoint
    model = checkpointMgr.load(modelID)
    del model

    attrs = checkpointMgr.loadCheckpointAttributes(modelID)

    self.assertIn(
      model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
      attrs, msg=repr(attrs))
    self.assertEqual(
      len(attrs[
        model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME]),
      2, msg=repr(attrs))

    self.assertNotIn(
      model_runner._ModelArchiver
      ._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
      attrs, msg=repr(attrs))

    # Now, check incremental checkpointing
    inputRows2 = [
      ModelInputRow(rowID=2,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 20), 2.7]),
      ModelInputRow(rowID=3,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 25), 3.9]),
    ]
    _LOGGER.info("Submitting batch of %d input rows with ids=[%s..%s]...",
                 len(inputRows2), inputRows2[0].rowID,
                 inputRows2[-1].rowID)
    inputBatchID = swapperAPI.submitRequests(modelID=modelID,
                                             requests=inputRows2)

    with self._startModelRunnerSubprocess(modelID) as modelRunnerProcess:
      resultBatches = self._consumeResults(numExpectedBatches=1,
                                           timeout=15)
      self._waitForProcessToStopAndCheck(modelRunnerProcess)

    with MessageBusConnector() as bus:
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

    batch = resultBatches[0]
    self.assertEqual(batch.modelID, modelID)
    self.assertEqual(len(batch.objects), len(inputRows2))

    for inputRow, result in zip(inputRows2, batch.objects):
      self.assertIsInstance(result, ModelInferenceResult)
      self.assertEqual(result.status, htmengineerrno.SUCCESS)
      self.assertEqual(result.rowID, inputRow.rowID)
      self.assertIsInstance(result.anomalyScore, float)
      if classifierEnabled:
        self.assertIsInstance(result.multiStepBestPredictions, dict)
      else:
        self.assertIsNone(result.multiStepBestPredictions)

    model = checkpointMgr.load(modelID)
    del model

    attrs = checkpointMgr.loadCheckpointAttributes(modelID)

    self.assertIn(
      model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
      attrs, msg=repr(attrs))
    self.assertSequenceEqual(
      attrs[model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME],
      [inputBatchID], msg=repr(attrs))

    self.assertIn(
      model_runner._ModelArchiver
      ._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
      attrs, msg=repr(attrs))
    self.assertSequenceEqual(
      model_runner._ModelArchiver._decodeDataSamples(
        attrs[model_runner._ModelArchiver
              ._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME]),
      [row.data for row in inputRows2], msg=repr(attrs))

    # Final run with incremental checkpointing
    inputRows3 = [
      ModelInputRow(rowID=4,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 30), 4.7]),
      ModelInputRow(rowID=5,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 35), 5.9]),
    ]
    _LOGGER.info("Submitting batch of %d input rows with ids=[%s..%s]...",
                 len(inputRows3), inputRows3[0].rowID,
                 inputRows3[-1].rowID)
    inputBatchID = swapperAPI.submitRequests(modelID=modelID,
                                             requests=inputRows3)

    with self._startModelRunnerSubprocess(modelID) as modelRunnerProcess:
      resultBatches = self._consumeResults(numExpectedBatches=1,
                                           timeout=15)
      self._waitForProcessToStopAndCheck(modelRunnerProcess)

    with MessageBusConnector() as bus:
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

    batch = resultBatches[0]
    self.assertEqual(batch.modelID, modelID)
    self.assertEqual(len(batch.objects), len(inputRows3))

    for inputRow, result in zip(inputRows3, batch.objects):
      self.assertIsInstance(result, ModelInferenceResult)
      self.assertEqual(result.status, htmengineerrno.SUCCESS)
      self.assertEqual(result.rowID, inputRow.rowID)
      self.assertIsInstance(result.anomalyScore, float)
      if classifierEnabled:
        self.assertIsInstance(result.multiStepBestPredictions, dict)
      else:
        self.assertIsNone(result.multiStepBestPredictions)

    model = checkpointMgr.load(modelID)
    del model

    attrs = checkpointMgr.loadCheckpointAttributes(modelID)

    self.assertIn(
      model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
      attrs, msg=repr(attrs))
    self.assertSequenceEqual(
      attrs[model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME],
      [inputBatchID], msg=repr(attrs))

    self.assertIn(
      model_runner._ModelArchiver
      ._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
      attrs, msg=repr(attrs))
    self.assertSequenceEqual(
      model_runner._ModelArchiver._decodeDataSamples(
        attrs[model_runner._ModelArchiver
              ._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME]),
      [row.data for row in itertools.chain(inputRows2, inputRows3)],
      msg=repr(attrs))

    # Delete the model
    _LOGGER.info("Deleting the model=%s", modelID)
    swapperAPI.deleteModel(modelID=modelID, commandID="deleteModelCmd1")

    with self._startModelRunnerSubprocess(modelID) as modelRunnerProcess:
      resultBatches = self._consumeResults(numExpectedBatches=1,
                                           timeout=15)
      self._waitForProcessToStopAndCheck(modelRunnerProcess)

    self.assertEqual(len(resultBatches), 1, repr(resultBatches))

    # The result batch should be for the deleteModel command
    batch = resultBatches[0]
    self.assertEqual(batch.modelID, modelID)
    self.assertEqual(len(batch.objects), 1)

    result = batch.objects[0]
    self.assertIsInstance(result, ModelCommandResult)
    self.assertEqual(result.method, "deleteModel")
    self.assertEqual(result.status, htmengineerrno.SUCCESS)
    self.assertEqual(result.commandID, "deleteModelCmd1")

    with MessageBusConnector() as bus:
      self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

      # The model input queue should be deleted now
      self.assertFalse(
        bus.isMessageQeueuePresent(
          swapperAPI._getModelInputQName(modelID=modelID)))

    # The model checkpoint should be gone too
    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
      checkpointMgr.load(modelID)

    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
      checkpointMgr.loadModelDefinition(modelID)

    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
      checkpointMgr.loadCheckpointAttributes(modelID)

    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
      checkpointMgr.remove(modelID)
class ModelRunner(object):

  # How many exception traceback tail characters to include in error results
  _MAX_TRACEBACK_TAIL = 400

  def __init__(self, modelID):
    """
    :param modelID: model ID; string
    """
    self._logger = _getLogger()

    self._modelID = modelID

    self._swapperAPI = ModelSwapperInterface()

    self._archiver = _ModelArchiver(self._modelID)

    # "deleteModel" command handler sets this flag to force our processing
    # loop to terminate
    self._done = False

    modelSwapperConfig = ModelSwapperConfig()

    self._targetMaxRequestsPerCheckpoint = modelSwapperConfig.getint(
      "model_runner", "target_requests_per_checkpoint")

    self._profiling = (
      modelSwapperConfig.getboolean("debugging", "profiling") or
      self._logger.isEnabledFor(logging.DEBUG))

    if self._profiling:
      self._logger.info("Profiling is turned on")

      self._modelLoadSec = 0

  @property
  def _model(self):
    """ An OPF Model object or None if not loaded yet """
    return self._archiver.model

  @property
  def _inputRowEncoder(self):
    """ An _InputRowEncoder object or None if model not loaded yet """
    return self._archiver.inputRowEncoder

  @property
  def _checkpointMgr(self):
    return self._archiver.checkpointMgr

  def __repr__(self):
    return "%s<modelID=%s>" % (self.__class__.__name__, self._modelID)

  def __enter__(self):
    """ Context Manager protocol method. Allows a ModelRunner instance to be
    used in a "with" statement for automatic clean-up

    retval: self
    """
    return self

  def __exit__(self, _excType, _excVal, _excTb):
    """ Context Manager protocol method. Allows a ModelRunner instance to be
    used in a "with" statement for automatic cleanup

    Returns: False so as not to suppress the exception, if any
    """
    self.close()
    return False

  def close(self):
    """ Clean up """
    self._logger.debug("%r: Closing...", self)
    self._swapperAPI.close()

  @logExceptions(_getLogger())
  def run(self):
    startTime = time.time()
    self._logger.debug("%r: Starting run", self)

    totalBatches = 0
    totalRequests = 0
    totalDupBatches = 0

    # Retrieve the batch IDs that went into the current checkpoint since
    # previous checkpoint
    modelCheckpointBatchIDSet = self._archiver.modelCheckpointBatchIDSet

    try:
      while not self._done:
        currentRunBatchIDSet = set()
        currentRunInputSamples = []
        currentRunNumRequests = 0
        lastRequestBatch = None

        with self._swapperAPI.consumeRequests(
            modelID=self._modelID, blocking=False) as consumer:

          if self._profiling:
            batchStartTime = time.time()

          # Process the next run of batches until
          # self._targetMaxRequestsPerCheckpoint is reached or exceeded
          for candidateBatch in consumer:
            if (candidateBatch.batchID in currentRunBatchIDSet or
                candidateBatch.batchID in modelCheckpointBatchIDSet):
              self._logger.warn(
                "%r: Already processed this batch=%s, skipping it (we must "
                "have lost channel/connection before or during ack)",
                self, candidateBatch.batchID)

              # Make it go away for good
              candidateBatch.ack()

              totalDupBatches += 1
              continue

            # NOTE: lastRequestBatch is used after this loop to ack this and
            # preceding batches
            lastRequestBatch = candidateBatch

            currentRunBatchIDSet.add(lastRequestBatch.batchID)
            totalBatches += 1

            inputObjects = lastRequestBatch.objects
            numItems = len(inputObjects)
            currentRunNumRequests += numItems
            totalRequests += numItems

            self._logger.debug(
              "%r: Processing input batch #%s; batch=%s, numItems=%s...",
              self, totalBatches, lastRequestBatch.batchID, numItems)

            if self._profiling:
              procStartTime = time.time()
              self._modelLoadSec = 0

            # Process the input batch
            results = self._processInputBatch(inputObjects,
                                              currentRunInputSamples)

            # Send results
            if self._profiling:
              submitStartTime = time.time()

            self._swapperAPI.submitResults(modelID=self._modelID,
                                           results=results)

            if self._profiling:
              now = time.time()

              tailRowTimestampISO = tailRowID = None
              # Assumption: no empty batches
              if isinstance(inputObjects[-1], ModelInputRow):
                # Assumption: entire batch consists of ModelInputRow objects
                tailRowTimestampISO = (inputObjects[-1].data[0].isoformat()
                                       + "Z")
                tailRowID = inputObjects[-1].rowID

              self._logger.info(
                "{TAG:SWAP.MR.BATCH.DONE} model=%s; batch=%s; numItems=%s; "
                "tailRowID=%s; tailRowTS=%s; duration=%.4fs; "
                "loadDuration=%.4fs; procDuration=%.4fs; "
                "submitDuration=%.4fs; totalBatches=%s; totalItems=%s",
                self._modelID, lastRequestBatch.batchID, len(results),
                tailRowID, tailRowTimestampISO, now - batchStartTime,
                self._modelLoadSec,
                submitStartTime - procStartTime - self._modelLoadSec,
                now - submitStartTime, totalBatches, totalRequests)

            if self._done:
              self._logger.debug("%r: command handler requested exit, "
                                 "leaving consumer loop", self)
              break

            if currentRunNumRequests >= self._targetMaxRequestsPerCheckpoint:
              self._logger.debug(
                "End of current run: currentRunNumRequests=%s",
                currentRunNumRequests)
              break
          else:  # for
            self._done = True
            self._logger.debug(
              "%r: No more input batches, leaving consumer loop", self)

          # Checkpoint the model and ack all request batches processed in
          # this run
          if lastRequestBatch is not None:
            assert len(currentRunBatchIDSet) > 0

            modelCheckpointBatchIDSet = currentRunBatchIDSet

            # Checkpoint the model
            if self._model is not None:
              if self._profiling:
                checkpointStartTime = time.time()

              self._archiver.saveModel(
                currentRunBatchIDSet=currentRunBatchIDSet,
                currentRunInputSamples=currentRunInputSamples)

              if self._profiling:
                self._logger.info(
                  "%r: {TAG:SWAP.MR.CHKPT.DONE} currentRunNumRequests=%s; "
                  "currentRunNumBatches=%s; duration=%.4fs", self,
                  currentRunNumRequests, len(currentRunBatchIDSet),
                  time.time() - checkpointStartTime)

            # Ack the last request batch and all unacked batches before it
            # consumed during this run
            lastRequestBatch.ack(multiple=True)

          if not self._done:
            # Check if SwapController wants to preempt us (it closes the
            # other end of our stdin to signal the intention)
            readReadyList = select.select((sys.stdin,), (), (), 0)[0]
            if readReadyList:
              self._logger.debug(
                "%r: SwapController wants to preempt us, leaving", self)
              self._done = True
    finally:
      if totalBatches == 0:
        self._logger.warn("%r: zero input batches were processed", self)

      self._logger.info(
        "%r: {TAG:SWAP.MR.FINAL.SUMMARY} totalBatches=%s; totalRequests=%s; "
        "totalDupBatches=%s; duration=%s", self, totalBatches,
        totalRequests, totalDupBatches, time.time() - startTime)

  def _processInputBatch(self, inputObjects, currentRunInputSamples):
    """ Process a batch of model commands and/or inference input data rows

    :param inputObjects: a sequence of ModelCommand or ModelInputRow objects

    :param currentRunInputSamples: a list; the data of input rows will be
      appended to this list for input rows that were processed successfully

    :returns: a sequence of the corresponding ModelCommandResult or
      ModelInferenceResult objects

    TODO: unit-test return value
    """
    results = []

    for request in inputObjects:
      if isinstance(request, ModelCommand):
        results.append(self._processModelCommand(request))

      elif isinstance(request, ModelInputRow):
        results.append(self._processInputRow(request,
                                             currentRunInputSamples))

      else:
        raise ValueError("Unexpected request: " + repr(request))

    return results

  def _processModelCommand(self, command):
    """
    command: ModelCommand instance

    Returns: a ModelCommandResult instance
    """
    self._logger.info("%r: Processing model command: %r", self, command)
    try:
      if command.method == "defineModel":
        return self._defineModel(command)
      elif command.method == "deleteModel":
        return self._deleteModel(command)
      elif command.method == "cloneModel":
        return self._cloneModel(command)
      else:
        raise _ModelRunnerError(
          errno=htmengineerrno.ERR_INVALID_ARG,
          msg="Unknown command method=%s" % (command.method,))
    except (Exception, _ModelRunnerError) as e:  # pylint: disable=W0703
      self._logger.exception("%r: %r failed;", self, command)
      return ModelCommandResult(
        commandID=command.commandID, method=command.method,
        status=(e.errno if isinstance(e, _ModelRunnerError)
                else htmengineerrno.ERR),
        errorMessage="%s - %r failed on modelID=%s: %s. (tb: ...%s)" % (
          datetime.utcnow().isoformat(), command, self._modelID,
          str(e) or repr(e),
          traceback.format_exc()[-self._MAX_TRACEBACK_TAIL:]))

  def _defineModel(self, command):
    """ Handle the "defineModel" command

    command: ModelCommand instance for the "defineModel" command

    retval: ModelCommandResult instance

    raises _ModelRunnerError
    """
    newModelDefinition = dict(
      modelParams=dict(modelConfig=command.args["modelConfig"],
                       inferenceArgs=command.args["inferenceArgs"]),
      inputSchema=command.args["inputRecordSchema"])

    try:
      self._checkpointMgr.define(modelID=self._modelID,
                                 definition=newModelDefinition)
      assert self._model is None
    except model_checkpoint_mgr.ModelAlreadyExists:
      # Check if the existing model has the same definition
      existingModelDefinition = self._checkpointMgr.loadModelDefinition(
        self._modelID)

      if newModelDefinition != existingModelDefinition:
        self._logger.error(
          "defineModel: modelID=%s already exists with different creation "
          "meta data; existing=%r; new=%r", self._modelID,
          existingModelDefinition, newModelDefinition)

        raise _ModelRunnerError(
          errno=htmengineerrno.ERR_MODEL_ALREADY_EXISTS,
          msg="defineModel: modelID=%s already exists with different "
              "creation meta data" % (self._modelID,))
      else:
        # newModelDefinition is the same as existingModelDefinition; might
        # be a side-effect of the at-least-once delivery guarantee of the
        # reliable data path, so treat it as success
        self._logger.warn(
          "defineModel: modelID=%s already exists with same creation meta "
          "data (side-effect of at-least-once delivery guarantee?)",
          self._modelID)

    return ModelCommandResult(commandID=command.commandID,
                              method=command.method, status=0)

  def _cloneModel(self, command):
    """ Handle the "cloneModel" command

    command: ModelCommand instance for the "cloneModel" command

    retval: ModelCommandResult instance
    """
    try:
      self._checkpointMgr.clone(self._modelID, command.args["modelID"])
    except model_checkpoint_mgr.ModelNotFound as e:
      errorMessage = "%r: Cloning failed - source model not found (%r)" % (
        self, e,)
      self._logger.error(errorMessage)
      return ModelCommandResult(commandID=command.commandID,
                                method=command.method,
                                status=htmengineerrno.ERR_NO_SUCH_MODEL,
                                errorMessage=errorMessage)
    except model_checkpoint_mgr.ModelAlreadyExists as e:
      # Assuming it's the result of at-least-once delivery semantics
      self._logger.warn(
        "%r: Cloning destination already exists, suppressing (%r)",
        self, e)

    return ModelCommandResult(commandID=command.commandID,
                              method=command.method, status=0)

  def _deleteModel(self, command):
    """ Handle the "deleteModel" command

    command: ModelCommand instance for the "deleteModel" command

    retval: ModelCommandResult instance

    raises _ModelRunnerError
    """
    try:
      # Delete the model's checkpoint
      try:
        self._checkpointMgr.remove(modelID=self._modelID)
      except model_checkpoint_mgr.ModelNotFound:
        # Suppress it: could be a side-effect of the at-least-once delivery
        # guarantee, so treat it as success
        self._logger.warn(
          "deleteModel: modelID=%s not found (side-effect of at-least-once "
          "delivery guarantee?); suppressing error", self._modelID)

      # Clean up model's resources in ModelSwapperInterface
      self._swapperAPI.cleanUpAfterModelDeletion(self._modelID)
    finally:
      self._archiver = _ModelArchiver(self._modelID)
      self._done = True

    return ModelCommandResult(commandID=command.commandID,
                              method=command.method, status=0)

  def _processInputRow(self, row, currentRunInputSamples):
    """
    :param row: ModelInputRow instance

    :param currentRunInputSamples: a list; the input row's data will be
      appended to this list if the row is processed successfully

    :returns: a ModelInferenceResult instance
    """
    try:
      if self._model is None:
        self._loadModel()

      # Convert a flat input row into a format that is consumable by an OPF
      # model
      self._inputRowEncoder.appendRecord(row.data)
      inputRecord = self._inputRowEncoder.getNextRecordDict()

      # Infer
      r = self._model.run(inputRecord)

      currentRunInputSamples.append(row.data)

      return ModelInferenceResult(
        rowID=row.rowID,
        status=0,
        anomalyScore=r.inferences["anomalyScore"],
        multiStepBestPredictions=r.inferences.get(
          "multiStepBestPredictions"))
    except (Exception, _ModelRunnerError) as e:  # pylint: disable=W0703
      self._logger.exception("%r: Inference failed for row=%r", self, row)
      return ModelInferenceResult(
        rowID=row.rowID,
        status=(e.errno if isinstance(e, _ModelRunnerError)
                else htmengineerrno.ERR),
        errorMessage="%s - Inference failed for rowID=%s of modelID=%s "
                     "(%r): (tb: ...%s)" % (
                       datetime.utcnow().isoformat(), row.rowID,
                       self._modelID, e,
                       traceback.format_exc()[-self._MAX_TRACEBACK_TAIL:]))

  def _loadModel(self):
    """ Load the model and construct the input row encoder

    Side-effect: self._model and self._inputRowEncoder are loaded;
      self._modelLoadSec is set if profiling is turned on
    """
    if self._model is None:
      if self._profiling:
        startTime = time.time()

      # Load the model
      self._archiver.loadModel()

      if self._profiling:
        self._modelLoadSec = time.time() - startTime
class ModelRunner(object):

  # How many exception traceback tail characters to include in error results
  _MAX_TRACEBACK_TAIL = 400

  def __init__(self, modelID):
    """
    :param modelID: model ID; string
    """
    self._logger = _getLogger()

    self._modelID = modelID

    self._swapperAPI = ModelSwapperInterface()

    self._archiver = _ModelArchiver(self._modelID)

    # "deleteModel" command handler sets this flag to force our processing
    # loop to terminate
    self._done = False

    modelSwapperConfig = ModelSwapperConfig()

    self._targetMaxRequestsPerCheckpoint = modelSwapperConfig.getint(
      "model_runner", "target_requests_per_checkpoint")

    self._profiling = (
      modelSwapperConfig.getboolean("debugging", "profiling") or
      self._logger.isEnabledFor(logging.DEBUG))

    if self._profiling:
      self._logger.info("Profiling is turned on")

      self._modelLoadSec = 0

  @property
  def _model(self):
    """ An OPF Model object or None if not loaded yet """
    return self._archiver.model

  @property
  def _inputRowEncoder(self):
    """ An _InputRowEncoder object or None if model not loaded yet """
    return self._archiver.inputRowEncoder

  @property
  def _checkpointMgr(self):
    return self._archiver.checkpointMgr

  def __repr__(self):
    return "%s<modelID=%s>" % (self.__class__.__name__, self._modelID)

  def __enter__(self):
    """ Context Manager protocol method. Allows a ModelRunner instance
    to be used in a "with" statement for automatic clean-up

    Parameters:
    ------------------------------------------------------------------------
    retval:     self.
    """
    return self

  def __exit__(self, _excType, _excVal, _excTb):
    """ Context Manager protocol method. Allows a ModelRunner instance
    to be used in a "with" statement for automatic cleanup

    :returns: False so as not to suppress the exception, if any
    """
    self.close()
    return False

  def close(self):
    """ Clean up """
    self._logger.debug("%r: Closing...", self)
    self._swapperAPI.close()

  @logExceptions(_getLogger())
  def run(self):
    startTime = time.time()

    self._logger.debug("%r: Starting run", self)

    totalBatches = 0
    totalRequests = 0
    totalDupBatches = 0

    # Retrieve the batch IDs that went into the current checkpoint since
    # previous checkpoint
    modelCheckpointBatchIDSet = self._archiver.modelCheckpointBatchIDSet

    try:
      while not self._done:
        currentRunBatchIDSet = set()
        currentRunInputSamples = []
        currentRunNumRequests = 0
        lastRequestBatch = None

        with self._swapperAPI.consumeRequests(
            modelID=self._modelID, blocking=False) as consumer:

          if self._profiling:
            batchStartTime = time.time()

          # Process the next run of batches until
          # self._targetMaxRequestsPerCheckpoint is reached or exceeded
          for candidateBatch in consumer:
            if (candidateBatch.batchID in currentRunBatchIDSet or
                candidateBatch.batchID in modelCheckpointBatchIDSet):
              self._logger.warn(
                "%r: Already processed this batch=%s, skipping it (we must "
                "have lost channel/connection before or during ack)",
                self, candidateBatch.batchID)

              # Make it go away for good
              candidateBatch.ack()
              totalDupBatches += 1
              continue

            # NOTE: lastRequestBatch is used after this loop to ack this and
            # preceding batches
            lastRequestBatch = candidateBatch

            currentRunBatchIDSet.add(lastRequestBatch.batchID)
            totalBatches += 1

            inputObjects = lastRequestBatch.objects
            numItems = len(inputObjects)
            currentRunNumRequests += numItems
            totalRequests += numItems

            self._logger.debug(
              "%r: Processing input batch #%s; batch=%s, numItems=%s...",
              self, totalBatches, lastRequestBatch.batchID, numItems)

            if self._profiling:
              procStartTime = time.time()
              self._modelLoadSec = 0

            # Process the input batch
            results = self._processInputBatch(inputObjects,
                                              currentRunInputSamples)
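
            # NOTE: ordering matters for crash safety here. Results are
            # submitted and the model is checkpointed *before* the batches
            # are acked; if this process dies in between, the broker
            # redelivers the un-acked batches and the batch-ID check at the
            # top of this loop discards them as duplicates.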
            # Send results
            if self._profiling:
              submitStartTime = time.time()

            self._swapperAPI.submitResults(modelID=self._modelID,
                                           results=results)

            if self._profiling:
              now = time.time()

              tailRowTimestampISO = tailRowID = None
              # Assumption: no empty batches
              if isinstance(inputObjects[-1], ModelInputRow):
                # Assumption: entire batch consists of ModelInputRow objects
                tailRowTimestampISO = (inputObjects[-1].data[0].isoformat() +
                                       "Z")
                tailRowID = inputObjects[-1].rowID

              self._logger.info(
                "{TAG:SWAP.MR.BATCH.DONE} model=%s; batch=%s; numItems=%s; "
                "tailRowID=%s; tailRowTS=%s; duration=%.4fs; "
                "loadDuration=%.4fs; procDuration=%.4fs; "
                "submitDuration=%.4fs; totalBatches=%s; totalItems=%s",
                self._modelID, lastRequestBatch.batchID, len(results),
                tailRowID, tailRowTimestampISO, now - batchStartTime,
                self._modelLoadSec,
                submitStartTime - procStartTime - self._modelLoadSec,
                now - submitStartTime, totalBatches, totalRequests)

            if self._done:
              self._logger.debug("%r: command handler requested exit, "
                                 "leaving consumer loop", self)
              break

            if currentRunNumRequests >= self._targetMaxRequestsPerCheckpoint:
              self._logger.debug(
                "End of current run: currentRunNumRequests=%s",
                currentRunNumRequests)
              break
          else:  # for
            self._done = True
            self._logger.debug("%r: No more input batches, leaving consumer "
                               "loop", self)

          # Checkpoint the model and ack all request batches processed in
          # this run
          if lastRequestBatch is not None:
            assert len(currentRunBatchIDSet) > 0

            modelCheckpointBatchIDSet = currentRunBatchIDSet

            # Checkpoint the model.
            if self._model is not None:
              if self._profiling:
                checkpointStartTime = time.time()

              self._archiver.saveModel(
                currentRunBatchIDSet=currentRunBatchIDSet,
                currentRunInputSamples=currentRunInputSamples)

              if self._profiling:
                self._logger.info(
                  "%r: {TAG:SWAP.MR.CHKPT.DONE} currentRunNumRequests=%s; "
                  "currentRunNumBatches=%s; duration=%.4fs", self,
                  currentRunNumRequests, len(currentRunBatchIDSet),
                  time.time() - checkpointStartTime)

            # Ack the last request batch and all unacked batches before it
            # that were consumed during this run
            lastRequestBatch.ack(multiple=True)

        if not self._done:
          # Check if SwapController wants to preempt us (it closes the other
          # end of our stdin to signal the intention)
          readReadyList = select.select((sys.stdin,), (), (), 0)[0]
          if readReadyList:
            self._logger.debug("%r: SwapController wants to preempt us, "
                               "leaving", self)
            self._done = True
    finally:
      if totalBatches == 0:
        self._logger.warn("%r: zero input batches were processed", self)

      self._logger.info(
        "%r: {TAG:SWAP.MR.FINAL.SUMMARY} totalBatches=%s; totalRequests=%s; "
        "totalDupBatches=%s; duration=%s", self, totalBatches,
        totalRequests, totalDupBatches, time.time() - startTime)

  def _processInputBatch(self, inputObjects, currentRunInputSamples):
    """ Process a batch of model commands and/or inference input data rows

    :param inputObjects: a sequence of ModelCommand or ModelInputRow objects

    :param currentRunInputSamples: a list; the data of input rows will be
      appended to this list for input rows that were processed successfully

    :returns: a sequence of the corresponding ModelCommandResult or
      ModelInferenceResult objects

    TODO: unit-test return value
    """
    results = []
    for request in inputObjects:
      if isinstance(request, ModelCommand):
        results.append(self._processModelCommand(request))
      elif isinstance(request, ModelInputRow):
        results.append(self._processInputRow(request, currentRunInputSamples))
      else:
        raise ValueError("Unexpected request: " + repr(request))

    return results

  def _processModelCommand(self, command):
    """
    command: ModelCommand instance

    Returns: a ModelCommandResult instance
    """
    self._logger.info("%r: Processing model command: %r", self, command)
    try:
      if command.method == "defineModel":
        return self._defineModel(command)
      elif command.method == "deleteModel":
        return self._deleteModel(command)
      elif command.method == "cloneModel":
        return self._cloneModel(command)
      else:
        raise _ModelRunnerError(
          errno=htmengineerrno.ERR_INVALID_ARG,
          msg="Unknown command method=%s" % (command.method,))
    except (Exception, _ModelRunnerError) as e:  # pylint: disable=W0703
      self._logger.exception("%r: %r failed;", self, command)
      return ModelCommandResult(
        commandID=command.commandID, method=command.method,
        status=(e.errno if isinstance(e, _ModelRunnerError)
                else htmengineerrno.ERR),
        errorMessage="%s - %r failed on modelID=%s: %s. (tb: ...%s)" % (
          datetime.utcnow().isoformat(), command, self._modelID,
          str(e) or repr(e),
          traceback.format_exc()[-self._MAX_TRACEBACK_TAIL:]))

  def _defineModel(self, command):
    """ Handle the "defineModel" command

    command: ModelCommand instance for the "defineModel" command

    retval: ModelCommandResult instance

    raises _ModelRunnerError
    """
    newModelDefinition = dict(
      modelParams=dict(modelConfig=command.args["modelConfig"],
                       inferenceArgs=command.args["inferenceArgs"]),
      inputSchema=command.args["inputRecordSchema"])

    try:
      self._checkpointMgr.define(modelID=self._modelID,
                                 definition=newModelDefinition)
      assert self._model is None
    except model_checkpoint_mgr.ModelAlreadyExists:
      # Check if the existing model has the same definition
      existingModelDefinition = self._checkpointMgr.loadModelDefinition(
        self._modelID)

      if newModelDefinition != existingModelDefinition:
        self._logger.error(
          "defineModel: modelID=%s already exists with different "
          "creation meta data; existing=%r; new=%r", self._modelID,
          existingModelDefinition, newModelDefinition)

        raise _ModelRunnerError(
          errno=htmengineerrno.ERR_MODEL_ALREADY_EXISTS,
          msg="defineModel: modelID=%s already exists with different "
              "creation meta data" % (self._modelID,))
      else:
        # newModelDefinition is the same as existingModelDefinition; might be
        # side-effect of at-least-once delivery guarantee of the reliable
        # data path, so treat it as success
        self._logger.warn(
          "defineModel: modelID=%s already exists with same creation meta "
          "data (side-effect of at-least-once delivery guarantee?)",
          self._modelID)

    return ModelCommandResult(commandID=command.commandID,
                              method=command.method, status=0)

  def _cloneModel(self, command):
    """ Handle the "cloneModel" command

    command: ModelCommand instance for the "cloneModel" command

    retval: ModelCommandResult instance
    """
    try:
      self._checkpointMgr.clone(self._modelID, command.args["modelID"])
    except model_checkpoint_mgr.ModelNotFound as e:
      errorMessage = "%r: Cloning failed - source model not found (%r)" % (
        self, e,)
      self._logger.error(errorMessage)
      return ModelCommandResult(commandID=command.commandID,
                                method=command.method,
                                status=htmengineerrno.ERR_NO_SUCH_MODEL,
                                errorMessage=errorMessage)
    except model_checkpoint_mgr.ModelAlreadyExists as e:
      # Assuming it's the result of at-least-once delivery semantics
      self._logger.warn(
        "%r: Cloning destination already exists, suppressing (%r)", self, e)

    return ModelCommandResult(commandID=command.commandID,
                              method=command.method, status=0)

  def _deleteModel(self, command):
    """ Handle the "deleteModel" command

    command: ModelCommand instance for the "deleteModel" command

    retval: ModelCommandResult instance

    raises _ModelRunnerError
    """
    try:
      # Delete the model's checkpoint
      try:
        self._checkpointMgr.remove(modelID=self._modelID)
      except model_checkpoint_mgr.ModelNotFound:
        # Suppress it: could be side-effect of at-least-once delivery
        # guarantee, so treat it as success
        self._logger.warn("deleteModel: modelID=%s not found (side-effect "
                          "of at-least-once delivery guarantee?); "
                          "suppressing error", self._modelID)

      # Clean up model's resources in ModelSwapperInterface
      self._swapperAPI.cleanUpAfterModelDeletion(self._modelID)
    finally:
      self._archiver = _ModelArchiver(self._modelID)
      self._done = True

    return ModelCommandResult(commandID=command.commandID,
                              method=command.method, status=0)

  def _processInputRow(self, row, currentRunInputSamples):
    """
    :param row: ModelInputRow instance

    :param currentRunInputSamples: a list; the input row's data will be
      appended to this list if the row is processed successfully

    :returns: a ModelInferenceResult instance
    """
    try:
      if self._model is None:
        self._loadModel()

      # Convert a flat input row into a format that is consumable by an OPF
      # model
      self._inputRowEncoder.appendRecord(row.data)
      inputRecord = self._inputRowEncoder.getNextRecordDict()

      # Infer
      r = self._model.run(inputRecord)

      currentRunInputSamples.append(row.data)

      return ModelInferenceResult(
        rowID=row.rowID, status=0,
        anomalyScore=r.inferences["anomalyScore"],
        multiStepBestPredictions=r.inferences.get(
          "multiStepBestPredictions"))
    except (Exception, _ModelRunnerError) as e:  # pylint: disable=W0703
      self._logger.exception("%r: Inference failed for row=%r", self, row)
      return ModelInferenceResult(
        rowID=row.rowID,
        status=(e.errno if isinstance(e, _ModelRunnerError)
                else htmengineerrno.ERR),
        errorMessage="%s - Inference failed for rowID=%s of modelID=%s "
                     "(%r): (tb: ...%s)" % (
                       datetime.utcnow().isoformat(), row.rowID,
                       self._modelID, e,
                       traceback.format_exc()[-self._MAX_TRACEBACK_TAIL:]))

  def _loadModel(self):
    """ Load the model and construct the input row encoder

    Side-effect: self._model and self._inputRowEncoder are loaded;
      self._modelLoadSec is set if profiling is turned on
    """
    if self._model is None:
      if self._profiling:
        startTime = time.time()

      # Load the model
      self._archiver.loadModel()

      if self._profiling:
        self._modelLoadSec = time.time() - startTime
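
# A minimal, self-contained sketch (not part of this module's API; the helper
# name below is ours) of the preemption handshake that ModelRunner.run()
# relies on: on POSIX, when the parent process closes the write end of the
# child's stdin pipe, select() reports the child's stdin as readable and a
# read would return EOF. SlotAgent signals "preempt" to a ModelRunner child
# process in exactly this way.
def _demoStdinClosePreemptionSignal():  # pragma: no cover (illustrative)
  import subprocess

  childCode = (
    "import select, sys\n"
    "select.select([sys.stdin], [], [])\n"  # blocks until stdin is readable
    "sys.stdout.write('child: stdin at EOF, exiting gracefully\\n')\n")

  child = subprocess.Popen([sys.executable, "-c", childCode],
                           stdin=subprocess.PIPE)

  # Signal "preemption" by closing our end of the child's stdin; the child's
  # select() call unblocks on EOF and it exits on its own
  child.stdin.close()
  child.wait()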
def run(self):
  """ Consumes pending results. Once a result batch arrives, it is
  dispatched to the appropriate model command result handler.

  :see: `_processModelCommandResult` and `_processModelInferenceResults`
  """
  # Properties for publishing model command results on RabbitMQ exchange
  modelCommandResultProperties = MessageProperties(
    deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE,
    headers=dict(dataType="model-cmd-result"))

  # Properties for publishing model inference results on RabbitMQ exchange
  modelInferenceResultProperties = MessageProperties(
    deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE)

  # Declare an exchange for forwarding our results
  with amqp.synchronous_amqp_client.SynchronousAmqpClient(
      amqp.connection.getRabbitmqConnectionParameters()) as amqpClient:
    amqpClient.declareExchange(self._modelResultsExchange,
                               exchangeType="fanout",
                               durable=True)

  with ModelSwapperInterface() as modelSwapper, MessageBusConnector() as bus:
    with modelSwapper.consumeResults() as consumer:
      for batch in consumer:
        if self._profiling:
          batchStartTime = time.time()

        inferenceResults = []
        for result in batch.objects:
          try:
            if isinstance(result, ModelCommandResult):
              self._processModelCommandResult(batch.modelID, result)
              # Construct model command result message for consumption by
              # downstream processes
              try:
                cmdResultMessage = self._composeModelCommandResultMessage(
                  modelID=batch.modelID,
                  cmdResult=result)
              except (ObjectNotFoundError, MetricNotMonitoredError):
                pass
              else:
                bus.publishExg(
                  exchange=self._modelResultsExchange,
                  routingKey="",
                  body=self._serializeModelResult(cmdResultMessage),
                  properties=modelCommandResultProperties)
            elif isinstance(result, ModelInferenceResult):
              inferenceResults.append(result)
            else:
              self._log.error("Unsupported ModelResult=%r", result)
          except ObjectNotFoundError:
            self._log.exception("Error processing result=%r from model=%s",
                                result, batch.modelID)

        if inferenceResults:
          result = self._processModelInferenceResults(
            inferenceResults, metricID=batch.modelID)

          if result is not None:
            # Construct model results payload for consumption by
            # downstream processes
            metricRow, dataRows = result
            resultsMessage = self._composeModelInferenceResultsMessage(
              metricRow, dataRows)

            payload = self._serializeModelResult(resultsMessage)

            bus.publishExg(
              exchange=self._modelResultsExchange,
              routingKey="",
              body=payload,
              properties=modelInferenceResultProperties)

        batch.ack()

        if self._profiling:
          if inferenceResults:
            if result is not None:  # pylint: disable=W0633
              metricRow, rows = result
              rowIdRange = ("%s..%s" % (rows[0].rowid, rows[-1].rowid)
                            if len(rows) > 1
                            else str(rows[0].rowid))
              self._log.info(
                "{TAG:ANOM.BATCH.INF.DONE} model=%s; numItems=%d; "
                "rows=[%s]; tailRowTS=%s; duration=%.4fs; ds=%s; name=%s",
                batch.modelID, len(batch.objects), rowIdRange,
                rows[-1].timestamp.isoformat() + "Z",
                time.time() - batchStartTime, metricRow.datasource,
                metricRow.name)
          else:
            self._log.info(
              "{TAG:ANOM.BATCH.CMD.DONE} model=%s; numItems=%d; "
              "duration=%.4fs", batch.modelID, len(batch.objects),
              time.time() - batchStartTime)

  self._log.info("Stopped processing model results")
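
# For readers unfamiliar with the AMQP pattern used in run() above, here is
# a rough equivalent of the declare-and-publish sequence expressed with the
# third-party `pika` client instead of this codebase's amqp wrappers. It is
# an illustrative sketch only: the exchange name and message body are made
# up, and pika is not a dependency of this module.
def _demoFanoutPublishWithPika():  # pragma: no cover (illustrative)
  import pika

  connection = pika.BlockingConnection(
    pika.ConnectionParameters("localhost"))
  try:
    channel = connection.channel()

    # Durable fanout exchange: every queue bound to it receives a copy of
    # each published message, which is how multiple downstream consumers can
    # observe the same model results
    channel.exchange_declare(exchange="demo.model.results",
                             exchange_type="fanout",
                             durable=True)

    # delivery_mode=2 marks the message persistent, mirroring
    # AMQPDeliveryModes.PERSISTENT_MESSAGE in the code above
    channel.basic_publish(exchange="demo.model.results",
                          routing_key="",
                          body="serialized-model-result",
                          properties=pika.BasicProperties(delivery_mode=2))
  finally:
    connection.close()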
class SwapController(object):
  """ Orchestrate model swapping across a fixed set of SlotAgent instances
  based on incoming model commands and records.
  """

  # Event methods for the SwapController's event loop. These are handled by
  # instance methods with names constructed as follows:
  # "_handle" + method + "Event" (e.g., _handleStopEventLoopRequestEvent)
  _NEW_INPUT_NOTIFY_METHOD = "NewInputNotify"
  _MODEL_DONE_NOTIFY_METHOD = "ModelDoneNotify"
  _STOP_EVENT_LOOP_REQUEST_METHOD = "StopEventLoopRequest"

  _NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC = 5
  _NOTIFICATION_READER_THREAD_JOIN_TIMEOUT_SEC = 10

  _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD = 1

  def __init__(self, concurrency):
    """
    concurrency: allowed number of model slots
    """
    self._logger = _getLogger()

    self._profiling = (
      ModelSwapperConfig().getboolean("debugging", "profiling") or
      self._logger.isEnabledFor(logging.DEBUG))

    # Allowed number of model slots
    self._concurrency = concurrency

    # Input-reader thread target function sets this when it starts running to
    # let our event loop know that things are off to a good start
    self._notificationReaderStartedEvent = threading.Event()

    # Mutex used to guarantee that no further model input notifications will
    # be added to the main event queue once self._stopNotificationReader is
    # set
    self._notificationMutex = threading.Lock()

    # _runNotificationReaderThread will not process any more notifications
    # once it detects that this flag is true
    self._stopNotificationReader = False

    # The event loop will exit some time after an event handler sets this
    # flag to True
    self._eventLoopStopPending = False

    # (non-thread-safe) The tuple of all slot agents
    self._slotAgents = tuple(SlotAgent(slotID=i)
                             for i in xrange(concurrency))
    assert self._slotAgents

    # Thread-safe event queue for SwapController
    self._eventQ = Queue.Queue()

    # Main event loop's ModelSwapperInterface instance. MUST NOT use from
    # threads because ModelSwapperInterface is not thread-safe
    self._mainSwapper = ModelSwapperInterface()

    # A (non-thread-safe) FIFO of models that are waiting to be scheduled for
    # running; there is incoming data for them that needs to be processed
    self._waitingModelsFIFO = []

    # A (non-thread-safe) map of modelIDs to _RunningModelInfo instances
    self._runningModelsMap = dict()

    # A (non-thread-safe) list of free slot indexes into the self._slotAgents
    # tuple
    self._freeSlots = list(xrange(len(self._slotAgents)))

    # (non-thread-safe) Indexes of SlotAgents pending preemption
    self._pendingPreemptSlotsSet = set()

    self._notificationReaderThread = threading.Thread(
      target=self._runNotificationReaderThread,
      name="%s-input-reader-%s" % (self.__class__.__name__, id(self)))
    # Allow process to exit even if thread is still running
    self._notificationReaderThread.setDaemon(True)

  @logEntryExit(_getLogger, logging.INFO)
  @logExceptions(_getLogger())
  def run(self):
    """ Run SwapController; blocking """
    # Start our input-reader thread
    self._logger.info("Starting Notification Reader thread")
    self._notificationReaderThread.start()
    self._notificationReaderStartedEvent.wait(
      timeout=self._NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC)
    assert self._notificationReaderStartedEvent.is_set(), \
      "Notification-reader thread failed to start in time"
    self._logger.info("Notification Reader started, now entering Event Loop")

    requestedStopOfRemainingModels = False
    while True:
      if self._eventLoopStopPending:
        if not self._runningModelsMap and not self._waitingModelsFIFO:
          # All models are idle now, so close Slot Agents and bail out
          for sa in self._slotAgents:
            sa.close()
          self._logger.info("Closed all Slot Agents; leaving event loop")
          break
        elif (not self._waitingModelsFIFO and
              not requestedStopOfRemainingModels):
          # Only running models remain, so request to stop them gracefully
          assert self._runningModelsMap
          for modelInfo in self._runningModelsMap.itervalues():
            if modelInfo.slotIndex not in self._pendingPreemptSlotsSet:
              self._slotAgents[modelInfo.slotIndex].stopModel()
              self._pendingPreemptSlotsSet.add(modelInfo.slotIndex)
          requestedStopOfRemainingModels = True
          self._logger.info("Requested stop of remaining running models")

      # Get and handle next event
      evt = self._eventQ.get()
      method = evt["method"]
      handler = getattr(self, "_handle" + method + "Event")
      handler(**evt)

  def requestStopTS(self):
    """ [thread-safe; non-blocking] Enqueue a request to stop the
    SwapController. This will cause the instance's run() method to return
    eventually
    """
    # This will prevent the notification reader thread from processing any
    # more notifications. By the time we release the lock, no new
    # notifications will be placed in our event queue, so we can proceed
    # with graceful shutdown.
    with self._notificationMutex:
      self._stopNotificationReader = True
      self._logger.info("Requested stop of notification processing")

    # Asynchronously request graceful shutdown of our event loop. This allows
    # all remaining events from the notification-reader to be handled
    # prior to commencing graceful shutdown.
self._logger.info("Requesting event-loop shutdown") self._eventQ.put({"method": self._STOP_EVENT_LOOP_REQUEST_METHOD}) def _newInputNotifyTS(self, modelID): """ [thread-safe] Notify Model Swapper that new input data arrived for the given model :param modelID: ID of the model for which new data arrived """ self._eventQ.put({ "method": self._NEW_INPUT_NOTIFY_METHOD, "modelID": modelID }) def _modelDoneNotifyTS(self, modelID, exitStatus): """ [thread-safe] Notify Model Swapper that a ModelRunner process completed. This method is passed as a callback to the SlotAgent that runs the model. :param modelID: model ID of model that completed execution :param exitStatus: the model's ModelRunner process exit status per os.WEXITSTATUS """ self._eventQ.put({ "method": self._MODEL_DONE_NOTIFY_METHOD, "modelID": modelID, "exitStatus": exitStatus, "endTime": time.time() }) def _handleStopEventLoopRequestEvent(self, method): # pylint: disable=W0613 """ Set a flag to signal our event loop that it's time for graceful shutdown of the event loop. The event is enqueued by the "stop request" hander after it blocks further processing of notifications by notification reader. """ self._eventLoopStopPending = True self._logger.info("Set _eventLoopStopPending") def _handleNewInputNotifyEvent( self, method, # pylint: disable=W0613 modelID): """ Notification that new input was queued up for a particular model """ runningModelInfo = self._runningModelsMap.get(modelID) if runningModelInfo is not None: # This model is already running runningModelInfo.updateTimestamp() elif modelID not in self._waitingModelsFIFO: # This model was not running and is not awaiting execution # TODO: for a large _waitingModelsFIFO, the membership check may be slow. # Consider employing a map of all models of interest with current state # (waiting/pending/running/idle) reflected in the items for O(1) lookups, # but profile first to see if it's a problem! # NOTE: it's possible that the model has already processed all its input # and we're handling this notification belatedly, and this may result in # unnecessary start-up of its ModelRunner. We should generally be pretty # well caught up with notifications, and this shouldn't be an issue. # However, if necessary, we can mitigate this by checking the model's # input queue *just before* starting the model. if self._freeSlots: # No models should be waiting if we have a free slot assert not self._waitingModelsFIFO, repr( self._waitingModelsFIFO) # Assign the model to a free slot self._assignModelToFreeSlot(modelID) else: # This model needs to wait until resources become available self._waitingModelsFIFO.append(modelID) if self._profiling: self._logger.info( "{TAG:SWAP.SC.MODEL.WAIT} model=%s; " "numWaitingModels=%s; numPendingPreemptSlots=%s", modelID, len(self._waitingModelsFIFO), len(self._pendingPreemptSlotsSet)) self._requestPreemptionOfRunningSlotIfNeededAndPossible() def _handleModelDoneNotifyEvent( self, method, # pylint: disable=W0613 modelID, exitStatus, endTime): """ Notification that a particular model completed execution and the SlotAgent that ran it is now available for a new assignment. 
exitStatus: the exit status of the ModelRunner process (per os.WEXITSTATUS) """ doneModelInfo = self._runningModelsMap.pop(modelID) if self._profiling: self._logger.info( "{TAG:SWAP.SC.MODEL.DONE} model=%s; slot=%d; exitStatus=%d; " "duration=%s; numRunningModels=%s; numWaitingModels=%s", modelID, doneModelInfo.slotIndex, exitStatus, endTime - doneModelInfo.startTime, len(self._runningModelsMap), len(self._waitingModelsFIFO)) assert doneModelInfo.slotIndex not in self._freeSlots assert 0 <= doneModelInfo.slotIndex < len(self._slotAgents) assert len(self._freeSlots) < len(self._slotAgents) self._freeSlots.append(doneModelInfo.slotIndex) # Drop the slot from pending-preempt set in case it was scheduled for # preemption self._pendingPreemptSlotsSet.discard(doneModelInfo.slotIndex) # Ack SlotAgent to ready it for the next model self._slotAgents[doneModelInfo.slotIndex].releaseSlot() if self._mainSwapper.modelInputPending(modelID): # There is more unprocessed input data for the completed model, # so notify ourselves asynchronously to schedule this model self._newInputNotifyTS(modelID) if self._waitingModelsFIFO: # Start a waiting model, now that we know there is a free slot newModelID = self._waitingModelsFIFO.pop(0) self._assignModelToFreeSlot(newModelID) self._requestPreemptionOfRunningSlotIfNeededAndPossible() def _assignModelToFreeSlot(self, modelID): """ Assign the given model to a free slot """ assert modelID not in self._runningModelsMap assert modelID not in self._waitingModelsFIFO freeSlotIndex = self._freeSlots.pop() self._slotAgents[freeSlotIndex].startModel( modelID=modelID, modelFinishedCallback=partial(self._modelDoneNotifyTS, modelID)) self._runningModelsMap[modelID] = _RunningModelInfo(freeSlotIndex) assert ((len(self._runningModelsMap) + len(self._freeSlots)) == len( self._slotAgents)), (len(self._runningModelsMap), len(self._freeSlots), len(self._slotAgents)) self._logger.debug( "{TAG:SWAP.SC.MODEL.ASSIGN} model=%s; slot=%s; numRunningModels=%s; " "numFreeSlots=%s; numWaitingModels=%s; numPendingPreemptSlots=%s", modelID, freeSlotIndex, len(self._runningModelsMap), len(self._freeSlots), len(self._waitingModelsFIFO), len(self._pendingPreemptSlotsSet)) def _requestPreemptionOfRunningSlotIfNeededAndPossible(self): """ Schedule a single slot for preemption if needed and possible """ # There shouldn't be any free slots when we're asked to preempt assert not self._freeSlots, repr(self._freeSlots) if (len(self._waitingModelsFIFO) <= len(self._pendingPreemptSlotsSet) or len(self._pendingPreemptSlotsSet) >= len(self._slotAgents)): # Not needed or no preemptable slots return # Find an LRU non-pending-preempt busy slot agent, and request to # preempt it lru = sorted(((i.slotIndex, i.timestamp) for i in self._runningModelsMap.itervalues() if i.slotIndex not in self._pendingPreemptSlotsSet), key=lambda element: element[1]) slotIndex, timestamp = lru.pop(0) # Request preemption of the LRU slot self._slotAgents[slotIndex].stopModel() self._pendingPreemptSlotsSet.add(slotIndex) if self._profiling: self._logger.info( "{TAG:SWAP.SC.SLOT.PREEMPT.REQ} slot=%d with timestamp=%s; " "numWaitingModels=%s; numPendingPreemptSlots=%s", slotIndex, timestamp, len(self._waitingModelsFIFO), len(self._pendingPreemptSlotsSet)) @abortProgramOnAnyException( _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD, logger=_getLogger()) @logEntryExit(_getLogger, logging.INFO) @logExceptions(_getLogger()) def _runNotificationReaderThread(self): """ Read model data notifications and pass them to the event loop """ 
self._logger.info("Notification Reader thread is running") # Let the main event loop know that this thread started successfully self._notificationReaderStartedEvent.set() with ModelSwapperInterface() as swapperAPI: # First, make sure our notification message queue exists, so we don't # miss any new notifications while we're checking for models with pending # input self._logger.info("SWAPPER_API: %r", swapperAPI) swapperAPI.initSchedulerNotification() # At start, notify main event loop of each model whose input is non-empty self._logger.info("Checking for models with pending input") i = 0 for i, modelID in enumerate(swapperAPI.getModelsWithInputPending(), 1): self._logger.debug("Input pending for model=%s", modelID) self._newInputNotifyTS(modelID=modelID) self._logger.info("%s model(s) had pending input", i) # Service the SwapController's input queue util stop is requested with swapperAPI.consumeModelSchedulerNotifications() as consumer: numHandledNotifications = 0 try: for notification in consumer: with self._notificationMutex: if self._stopNotificationReader: self._logger.info( "Notification reader exiting due to stop request" ) break self._newInputNotifyTS(modelID=notification.value) notification.ack() numHandledNotifications += 1 else: raise Exception( "Unexpected termination of consumer loop in " "Notification Reader") finally: self._logger.info( "Control is leaving notification reader loop after processing %s " "notifications", numHandledNotifications)