def _handleMetricCollectionError(self, engine, metricObj, startTime, error):
  """ Update the metric's collector_error member and promote the metric to
  ERROR state if needed

  :param engine: SQLAlchemy engine object
  :type engine: sqlalchemy.engine.Engine

  :param metricObj: Metric instance

  :param startTime: metric collection start time; unix epoch timestamp

  :param error: exception object that is the result of metric data collection
    or (possibly pending) resource status check
  """
  # Quarantine the metric temporarily
  self._metricInfoCache[metricObj.uid].quarantineEndTime = (
    startTime +
    metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO)

  try:
    if metricObj.collector_error is None:
      # TODO: unit-test
      # Begin error grace period for this metric
      deadline = time.time() + self._metricErrorGracePeriod
      with engine.connect() as conn:
        repository.retryOnTransientErrors(
          repository.setMetricCollectorError)(
            conn,
            metricObj.uid,
            utils.jsonEncode(dict(deadline=deadline,
                                  message=repr(error))))
      self._log.error(
        "Started error grace period on metric=<%r> through %sZ due to "
        "error=%r",
        metricObj, datetime.utcfromtimestamp(deadline).isoformat(), error)
    elif (time.time() >
          utils.jsonDecode(metricObj.collector_error)["deadline"]):
      # TODO: unit-test
      # Error grace period expired: promote the metric to ERROR state
      with engine.connect() as conn:
        repository.retryOnTransientErrors(repository.setMetricStatus)(
          conn, metricObj.uid, MetricStatus.ERROR, repr(error))
      self._log.error(
        "Metric Collector: grace period expired; placed metric=<%r> in "
        "ERROR state due to error=%r", metricObj, error)
  except app_exceptions.ObjectNotFoundError:
    # TODO: unit-test
    self._log.warning("Metric deleted? metric=%r", metricObj, exc_info=True)
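# The two-phase error handling above (start a grace period on the first
# failure; promote to ERROR only once the deadline passes) can be exercised
# in isolation. A minimal sketch, assuming a plain dict stands in for the
# metric row and no repository is involved; all names here are hypothetical:
import json
import time


def stepErrorState(metric, error, gracePeriodSec=3600.0):
  """Return the metric's next error state: 'grace' or 'error'."""
  now = time.time()
  if metric.get("collector_error") is None:
    # First failure: record the grace-period deadline, like
    # setMetricCollectorError() does in the real code
    metric["collector_error"] = json.dumps(
      dict(deadline=now + gracePeriodSec, message=repr(error)))
    return "grace"
  elif now > json.loads(metric["collector_error"])["deadline"]:
    # Grace period expired: promote to ERROR
    metric["status"] = "ERROR"
    return "error"
  return "grace"


if __name__ == "__main__":
  m = {"collector_error": None}
  assert stepErrorState(m, IOError("boom"), gracePeriodSec=0.01) == "grace"
  time.sleep(0.02)
  assert stepErrorState(m, IOError("boom"), gracePeriodSec=0.01) == "error"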
def _getCandidateMetrics(self, engine):
  """ Return the metrics that are due for an update

  :param engine: SQLAlchemy engine object
  :type engine: sqlalchemy.engine.Engine

  :returns: a (possibly empty) OrderedDict of Metric instances, keyed by
    metric uid, that are due for an update (after removing quarantined
    metrics)

  TODO: unit-test
  """
  with engine.connect() as conn:
    metricsToUpdate = repository.retryOnTransientErrors(
      repository.getCloudwatchMetricsPendingDataCollection)(conn)

  # Remove quarantined metrics
  quarantinedIndexes = None
  if metricsToUpdate:
    now = time.time()
    quarantinedIndexes = set(
      i for i, m in enumerate(metricsToUpdate)
      if now < self._metricInfoCache[m.uid].quarantineEndTime)

    metricsToUpdate = OrderedDict(
      (m.uid, m) for i, m in enumerate(metricsToUpdate)
      if i not in quarantinedIndexes)

    if not metricsToUpdate:
      # TODO: unit-test
      self._log.debug("All candidate numMetrics=%d are quarantined",
                      len(quarantinedIndexes))

  return metricsToUpdate
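# The quarantine filter above reduces to: drop any metric whose
# quarantineEndTime is still in the future, and key the survivors by uid.
# A minimal sketch with a namedtuple standing in for a Metric row and a
# plain dict standing in for _metricInfoCache (hypothetical names):
import time
from collections import OrderedDict, namedtuple

FakeMetric = namedtuple("FakeMetric", "uid")


def filterQuarantined(metrics, quarantineEndTimes):
  """Return an OrderedDict of uid -> metric for non-quarantined metrics."""
  now = time.time()
  return OrderedDict(
    (m.uid, m) for m in metrics
    if now >= quarantineEndTimes.get(m.uid, 0))


if __name__ == "__main__":
  metrics = [FakeMetric("a"), FakeMetric("b")]
  ends = {"a": time.time() + 60}  # "a" is quarantined for another minute
  assert list(filterQuarantined(metrics, ends)) == ["b"]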
def unmonitorMetric(self, metricId):
  """ Unmonitor a metric

  :param metricId: unique identifier of the metric row

  :raises YOMP.app.exceptions.ObjectNotFoundError: if metric with the
    referenced metric uid doesn't exist
  """
  # Delete the metric from the database
  with self.connectionFactory() as conn:
    repository.retryOnTransientErrors(repository.deleteMetric)(
      conn, metricId)

  # Send request to delete HTM model
  model_swapper_utils.deleteHTMModel(metricId)

  self._log.info("Cloudwatch Metric unmonitored: metric=%r", metricId)
def exportModel(self, metricId):
  """ Export the given model.

  :param metricId: datasource-specific unique metric identifier

  :returns: Model-export specification for Cloudwatch model
  :rtype: dict

  ::

      {
        "datasource": "cloudwatch",

        "metricSpec": {
          "region": "us-west-2",
          "namespace": "AWS/EC2",
          "metric": "CPUUtilization",
          "dimensions": {
            "InstanceId": "i-12d67826"
          }
        },

        # Same modelParams with which model was created, if any
        "modelParams": {
          "min": 0,  # optional
          "max": 100  # optional
        }
      }

  :raises YOMP.app.exceptions.ObjectNotFoundError: if referenced metric
    doesn't exist
  """
  with self.connectionFactory() as conn:
    metricObj = repository.retryOnTransientErrors(repository.getMetric)(
      conn,
      metricId,
      fields=[schema.metric.c.parameters])

  parameters = htmengine.utils.jsonDecode(metricObj.parameters)
  return parameters
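# exportModel() relies on the invariant that the metric row's `parameters`
# column holds the JSON-encoded creation request, so export is just a JSON
# decode of that column. A round-trip sketch with the standard json module,
# using the example document from the docstring above:
import json

stored = json.dumps({
  "datasource": "cloudwatch",
  "metricSpec": {
    "region": "us-west-2",
    "namespace": "AWS/EC2",
    "metric": "CPUUtilization",
    "dimensions": {"InstanceId": "i-12d67826"},
  },
  "modelParams": {"min": 0, "max": 100},
})

exported = json.loads(stored)  # what exportModel() effectively returns
assert exported["metricSpec"]["metric"] == "CPUUtilization"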
def run(self):
  with ModelSwapperInterface() as modelSwapper:
    engine = repository.engineFactory()
    while True:
      with engine.connect() as conn:
        pendingStacks = repository.retryOnTransientErrors(
          repository.getAutostackMetricsPendingDataCollection)(conn)

      if not pendingStacks:
        time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
        continue

      # Build a sequence of autostack metric requests
      requests = []
      for autostack, metrics in pendingStacks:
        refBase = len(requests)
        requests.extend(
          AutostackMetricRequest(refID=refBase + i,
                                 autostack=autostack,
                                 metric=metric)
          for i, metric in enumerate(metrics))

      # Collect, aggregate, and stream metric data
      self._processAutostackMetricRequests(engine, requests, modelSwapper)
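# The refID bookkeeping in run() guarantees that each request's refID equals
# its index in the flat `requests` list, which is what lets
# _processAutostackMetricRequests() look up `requests[metricCollection.refID]`
# directly. A minimal sketch of that fan-out, with a hypothetical stand-in
# for AutostackMetricRequest:
from collections import namedtuple

Request = namedtuple("Request", "refID autostack metric")


def buildRequests(pendingStacks):
  """Flatten (autostack, metrics) pairs into index-aligned requests."""
  requests = []
  for autostack, metrics in pendingStacks:
    refBase = len(requests)
    requests.extend(
      Request(refID=refBase + i, autostack=autostack, metric=metric)
      for i, metric in enumerate(metrics))
  return requests


if __name__ == "__main__":
  reqs = buildRequests([("stack1", ["m1", "m2"]), ("stack2", ["m3"])])
  assert all(r.refID == i for i, r in enumerate(reqs))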
def _processCollectedData(self, engine, metricsToUpdate, modelSwapper,
                          collectResult):
  """ Process collected metrics data; publish it on the destination
  message queue, and update metric instance status and metric collection
  state.

  :param engine: SQLAlchemy engine object
  :type engine: sqlalchemy.engine.Engine

  :param metricsToUpdate: a sequence of Metric instances which were due for
    an update
  :type metricsToUpdate: collections.OrderedDict

  :param modelSwapper: ModelSwapperInterface object for running models

  :param collectResult: _DataCollectionResult object

  :returns: 2-tuple (numEmpty, numErrors), where numEmpty is the count of
    metrics with empty data (not ready or data gap) and numErrors is the
    count of metrics with errors
  """
  numEmpty = numErrors = 0

  metricObj = metricsToUpdate[collectResult.metricID]

  # Update cached resource status
  resourceInfoCacheItem = self._resourceInfoCache[metricObj.server]
  newResourceStatus = None
  if collectResult.resourceStatus is not None:
    if isinstance(collectResult.resourceStatus, Exception):
      resourceInfoCacheItem.error = collectResult.resourceStatus
    else:
      newResourceStatus = collectResult.resourceStatus
      resourceInfoCacheItem.error = None

  if isinstance(collectResult.data, Exception):
    dataError = collectResult.data
    newData = None
    nextStartTime = None
  else:
    dataError = None
    newData = collectResult.data
    nextStartTime = collectResult.nextCallStart

  if newData is not None or newResourceStatus is not None:
    rangeStart = metricObj.last_timestamp

    if not newData:
      # Data is not ready yet or data gap
      numEmpty += 1
      self._metricInfoCache[metricObj.uid].quarantineEndTime = (
        collectResult.creationTime +
        metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO)
    else:
      headTS = newData[0][0]
      tailTS = newData[-1][0]

    startTime = time.time()

    # Update resource status and next metric start time
    @repository.retryOnTransientErrors
    def runSQL(engine):
      with engine.begin() as conn:
        try:
          # Save resource status
          if newResourceStatus is not None:
            repository.saveMetricInstanceStatus(conn,
                                                metricObj.server,
                                                newResourceStatus)

          # Save starting time for the next data query in metric row
          if nextStartTime is not None:
            repository.setMetricLastTimestamp(conn,
                                              metricObj.uid,
                                              nextStartTime)
        except app_exceptions.ObjectNotFoundError:
          self._log.warning("Metric deleted?", exc_info=True)

    if newResourceStatus is not None or nextStartTime is not None:
      runSQL(engine)

    if newData:
      # This is done outside the cursor context to minimize lock duration.
      #
      # NOTE: It is still possible for duplicate data to be sent to the
      # exchange, so the recipient will need to de-dupe it. It is also
      # possible for the metric to be updated in the database but data not
      # sent to the exchange.
      try:
        self.metricStreamer.streamMetricData(newData,
                                             metricID=metricObj.uid,
                                             modelSwapper=modelSwapper)
      except ValueError:
        self._log.exception("Failed to stream metric data %r", newData)
      except app_exceptions.ObjectNotFoundError:
        # We expect that the model exists, but in the odd case that it has
        # already been deleted we don't want to crash the process.
        self._log.info("Metric not found when adding data.")

      if self._profiling:
        self._log.info(
          "{TAG:APP.MC.DATA.TO_STRM.DONE} numItems=%d for metric=%r; "
          "rangeStart=%s; ts=[%s]; duration=%.4fs; ",
          len(newData), metricObj,
          rangeStart.isoformat() + "Z" if rangeStart is not None else None,
          (("%sZ..%sZ" % (headTS.isoformat(), tailTS.isoformat()))
           if len(newData) > 1
           else (headTS.isoformat() + "Z")),
          time.time() - startTime + collectResult.duration)
    else:
      if self._log.isEnabledFor(logging.DEBUG):
        self._log.debug(
          "{TAG:APP.MC.DATA.NONE} No data for metric=%r; rangeStart=%s; "
          "duration=%.4fs", metricObj,
          rangeStart.isoformat() + "Z" if rangeStart is not None else None,
          time.time() - startTime + collectResult.duration)

  # Take care of errors
  error = dataError or resourceInfoCacheItem.error
  if error is not None:
    # Metric data or resource status collection error
    numErrors += 1
    # Update metric error grace period and quarantine info
    self._handleMetricCollectionError(engine,
                                      metricObj,
                                      startTime=collectResult.creationTime,
                                      error=error)
  else:
    if metricObj.collector_error is not None:
      oldErrorInfo = metricObj.collector_error

      # There was a pending collector error before, but no error this time,
      # so clear the metric's collector_error
      try:
        with engine.connect() as conn:
          repository.retryOnTransientErrors(
            repository.setMetricCollectorError)(conn, metricObj.uid, None)
      except app_exceptions.ObjectNotFoundError:
        self._log.warning("Metric deleted?", exc_info=True)

      self._log.info("metric=<%r> exited error grace state %s",
                     metricObj, oldErrorInfo)

  return (numEmpty, numErrors)
def _processAutostackMetricRequests(self, engine, requests, modelSwapper):
  """ Execute autostack metric requests, aggregate and stream collected
  metric data

  :param engine: SQLAlchemy engine object
  :type engine: sqlalchemy.engine.Engine
  :param requests: sequence of AutostackMetricRequest objects
  :param modelSwapper: Model Swapper
  """
  # Start collecting requested metric data
  collectionIter = self._metricGetter.collectMetricData(requests)

  # Aggregate each collection and dispatch to app MetricStreamer
  for metricCollection in collectionIter:
    request = requests[metricCollection.refID]

    metricObj = request.metric
    data = None

    if metricCollection.slices:
      aggregationFn = getAggregationFn(metricObj)
      if aggregationFn:
        data = aggregate(metricCollection.slices,
                         aggregationFn=aggregationFn)
      else:
        data = aggregate(metricCollection.slices)

    try:
      with engine.connect() as conn:
        repository.retryOnTransientErrors(
          repository.setMetricLastTimestamp)(
            conn, metricObj.uid, metricCollection.nextMetricTime)
    except ObjectNotFoundError:
      self._log.warning("Processing autostack data collection results for "
                        "unknown model=%s (model deleted?)", metricObj.uid)
      continue

    if data:
      try:
        self.metricStreamer.streamMetricData(data,
                                             metricID=metricObj.uid,
                                             modelSwapper=modelSwapper)
      except ObjectNotFoundError:
        # We expect that the model exists, but in the odd case that it has
        # already been deleted we don't want to crash the process.
        self._log.info("Metric not found when adding data. metric=%s",
                       metricObj.uid)

      self._log.debug(
        "{TAG:APP.AGG.DATA.PUB} Published numItems=%d for metric=%s; "
        "timeRange=[%sZ-%sZ]; headTS=%sZ; tailTS=%sZ",
        len(data),
        getMetricLogPrefix(metricObj),
        metricCollection.timeRange.start.isoformat(),
        metricCollection.timeRange.end.isoformat(),
        data[0][0].isoformat(),
        data[-1][0].isoformat())
    else:
      self._log.info(
        "{TAG:APP.AGG.DATA.NONE} No data for metric=%s; "
        "timeRange=[%sZ-%sZ]",
        getMetricLogPrefix(metricObj),
        metricCollection.timeRange.start.isoformat(),
        metricCollection.timeRange.end.isoformat())
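# The aggregation dispatch above picks a metric-specific function when one
# is configured and otherwise falls back to the aggregator's default. A
# minimal sketch of that pattern over plain (timestamp, value) slices; this
# is a hypothetical helper, not the real getAggregationFn()/aggregate(),
# which operate on per-instance CloudWatch slices:
def aggregateSlices(slices, aggregationFn=sum):
  """Combine per-instance values at each timestamp using aggregationFn."""
  byTimestamp = {}
  for sliceData in slices:
    for ts, value in sliceData:
      byTimestamp.setdefault(ts, []).append(value)
  return sorted((ts, aggregationFn(values))
                for ts, values in byTimestamp.items())


if __name__ == "__main__":
  s1 = [(0, 1.0), (60, 3.0)]   # instance 1
  s2 = [(0, 5.0), (60, 7.0)]   # instance 2
  assert aggregateSlices([s1, s2], aggregationFn=max) == [(0, 5.0),
                                                          (60, 7.0)]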
def messageHandler(self, message):
  """ Inspect all inbound model results in a batch for anomaly thresholds
  and trigger notifications where applicable.

  :param amqp.messages.ConsumerMessage message: ``message.body`` is a
    serialized batch of model inference results generated in
    ``AnomalyService`` and must be deserialized using
    ``AnomalyService.deserializeModelResult()``. The message conforms to
    htmengine/runtime/json_schema/model_inference_results_msg_schema.json
  """
  if message.properties.headers and "dataType" in message.properties.headers:
    # Not a model inference result
    return

  YOMP.app.config.loadConfig()  # reload config on every batch

  engine = repository.engineFactory()

  # Cache minimum threshold to trigger any notification to avoid permuting
  # settings x metricDataRows
  try:
    try:
      batch = AnomalyService.deserializeModelResult(message.body)
    except Exception:
      self._log.exception("Error deserializing model result")
      raise

    # Load all settings for all users (once per incoming batch)
    with engine.connect() as conn:
      settings = repository.retryOnTransientErrors(
        repository.getAllNotificationSettings)(conn)

    self._log.debug("settings: %r", settings)

    if settings:
      minThreshold = min(setting.sensitivity for setting in settings)
    else:
      minThreshold = 0.99999

    metricInfo = batch["metric"]
    metricId = metricInfo["uid"]
    resource = metricInfo["resource"]

    for row in batch["results"]:
      if row["anomaly"] >= minThreshold:
        for settingObj in settings:
          if row["rowid"] <= 1000:
            continue  # Not enough data

          rowDatetime = datetime.utcfromtimestamp(row["ts"])

          if rowDatetime < datetime.utcnow() - timedelta(seconds=3600):
            continue  # Skip old data

          if row["anomaly"] >= settingObj.sensitivity:
            # First, clear any stale notification devices out of the
            # database.
            with engine.connect() as conn:
              repository.retryOnTransientErrors(
                repository.deleteStaleNotificationDevices)(
                  conn, _NOTIFICATION_DEVICE_STALE_DAYS)

            # If anomaly_score meets or exceeds any of the device
            # notification sensitivity settings, trigger notification.
            # repository.addNotification() will handle throttling.
            notificationId = str(uuid.uuid4())

            with engine.connect() as conn:
              result = repository.retryOnTransientErrors(
                repository.addNotification)(conn,
                                            uid=notificationId,
                                            server=resource,
                                            metric=metricId,
                                            rowid=row["rowid"],
                                            device=settingObj.uid,
                                            windowsize=settingObj.windowsize,
                                            timestamp=rowDatetime,
                                            acknowledged=0,
                                            seen=0)

            self._log.info("NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                           "Notification generated.",
                           notificationId, resource, metricId,
                           settingObj.uid)

            if (result is not None and
                result.rowcount > 0 and
                settingObj.email_addr):
              # Notification was generated. Attempt to send email
              with engine.connect() as conn:
                notificationObj = repository.getNotification(conn,
                                                             notificationId)
              self.sendNotificationEmail(engine,
                                         settingObj,
                                         notificationObj)

    if not settings:
      # There are no device notification settings stored on this server, so
      # no notifications will be generated. However, log that an anomaly was
      # detected and a notification would have been sent if any devices were
      # configured.
      self._log.info("<%r> {TAG:APP.NOTIFICATION} Anomaly detected at %s, "
                     "but no devices are configured.",
                     metricInfo, rowDatetime)
  finally:
    message.ack()

  # Do cleanup
  with engine.connect() as conn:
    # Delete all notifications outside the retention window
    repository.clearOldNotifications(conn)
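# The minThreshold caching above is a cheap prefilter: a row can only
# produce a notification if its anomaly score meets the *most* sensitive
# device setting, so rows below min(sensitivities) are skipped without
# iterating over settings at all. A sketch of that filter with hypothetical
# names (0.99999 mirrors the no-settings default used above):
def notifiableRows(rows, sensitivities):
  """Return only the result rows that could trigger any notification."""
  minThreshold = min(sensitivities) if sensitivities else 0.99999
  return [row for row in rows if row["anomaly"] >= minThreshold]


if __name__ == "__main__":
  rows = [{"anomaly": 0.2}, {"anomaly": 0.95}, {"anomaly": 0.99998}]
  assert notifiableRows(rows, [0.99, 0.999]) == [{"anomaly": 0.99998}]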