def start(self):
    """Apply the config patch and create the temporary repository database.

    Steps, in order: reset any cached engine, start the config patch, check
    that the temporary database is absent, call ``repository.reset()`` and
    check that the temporary database is now present.  If anything inside the
    guarded section fails, ``self.stop()`` is called and the original error
    is re-raised.
    """
    # Discard an engine that may have been cached by an earlier, non-patched
    # run
    repository.engineFactory(reset=True)

    try:
        # Redirect the repository to the temporary database name
        self._configPatch.start()
        self._configPatchApplied = True

        # Verify that the database doesn't exist yet
        existsBefore = self.tempDatabaseName in getAllDatabaseNames()
        assert not existsBefore, (
            "Temp repo db=%s already existed" % (self.tempDatabaseName,))

        # Create the temporary repository database; the flag is set first so
        # that it is recorded even when reset() fails
        self._attemptedToCreateRepository = True
        repository.reset()

        # Verify that the temporary repository database got created
        existsAfter = self.tempDatabaseName in getAllDatabaseNames()
        assert existsAfter, (
            "Temp repo db=%s not found" % (self.tempDatabaseName,))
    except:
        # Attempt to clean up, then propagate the original error
        self.stop()
        raise
def testMetricDataForRandomRowID(uid):
    '''
    Check that the metric data returned by the GET call :
      _models/<uid>/data
    agrees with the rows stored in the repository, comparing both the
    anomaly score and the timestamp against repository.getMetricData().

    The comparison is repeated for 5 randomly chosen row ids of the uid.
    Uses ``self`` and ``getMetricDataWithRowID`` from the enclosing scope.

    Algorithm :
    - Query the MetricDataHandler GET call for the uid
    - Check that the response is OK
    - Find the last row id for the uid
    - Five times over:
      - Pick a random row id with randrange(1, last row id)
      - Fetch that row from the repository
      - Assert that the row is present on both sides
      - Assert on the anomaly score and the timestamp
    '''
    response = self.app.get("/%s/data" % uid, headers=self.headers)
    assertions.assertSuccess(self, response)
    getAllModelsResult = utils.jsonDecode(response.body)

    with repository.engineFactory().connect() as conn:
        lastRowID = repository.getMetric(conn, uid).last_rowid

    for _ in range(5):
        # NOTE: randrange() excludes its upper bound, so lastRowID itself is
        # never sampled
        randomRowID = randrange(1, lastRowID)

        with repository.engineFactory().connect() as conn:
            singleMetricData = repository.getMetricData(
                conn, uid, rowid=randomRowID).first()

        metricData = getMetricDataWithRowID(getAllModelsResult['data'],
                                            randomRowID)

        # Fail with a readable message instead of a TypeError/AttributeError
        # when the sampled row is missing on either side
        self.assertIsNotNone(
            singleMetricData,
            "No metric_data row in repository for uid=%s rowid=%s"
            % (uid, randomRowID))
        self.assertIsNotNone(
            metricData,
            "GET response has no row for uid=%s rowid=%s"
            % (uid, randomRowID))

        self.assertEqual(metricData[2], singleMetricData.anomaly_score)
        self.assertEqual(
            datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
            singleMetricData.timestamp)
def testMetricDataForRandomRowID(uid):
    '''
    Check that the metric data returned by the GET call :
      _models/<uid>/data
    agrees with the rows stored in the repository, comparing both the
    anomaly score and the timestamp against repository.getMetricData().

    The comparison is repeated for 5 randomly chosen row ids of the uid.
    Uses ``self`` and ``getMetricDataWithRowID`` from the enclosing scope.

    Algorithm :
    - Query the MetricDataHandler GET call for the uid
    - Check that the response is OK
    - Find the last row id for the uid
    - Five times over:
      - Pick a random row id with randrange(1, last row id)
      - Fetch that row from the repository
      - Assert that the row is present on both sides
      - Assert on the anomaly score and the timestamp
    '''
    response = self.app.get("/%s/data" % uid, headers=self.headers)
    assertions.assertSuccess(self, response)
    getAllModelsResult = utils.jsonDecode(response.body)

    with repository.engineFactory().connect() as conn:
        lastRowID = repository.getMetric(conn, uid).last_rowid

    for _ in range(5):
        # NOTE: randrange() excludes its upper bound, so lastRowID itself is
        # never sampled
        randomRowID = randrange(1, lastRowID)

        with repository.engineFactory().connect() as conn:
            singleMetricData = repository.getMetricData(
                conn, uid, rowid=randomRowID).first()

        metricData = getMetricDataWithRowID(getAllModelsResult['data'],
                                            randomRowID)

        # Fail with a readable message instead of a TypeError/AttributeError
        # when the sampled row is missing on either side
        self.assertIsNotNone(
            singleMetricData,
            "No metric_data row in repository for uid=%s rowid=%s"
            % (uid, randomRowID))
        self.assertIsNotNone(
            metricData,
            "GET response has no row for uid=%s rowid=%s"
            % (uid, randomRowID))

        self.assertEqual(metricData[2], singleMetricData.anomaly_score)
        self.assertEqual(
            datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
            singleMetricData.timestamp)
def start(self):
    """Apply the config patch and create the temporary repository database.

    Steps, in order: reset any cached engine, start the config patch, check
    that the temporary database is absent, call ``repository.reset()`` and
    check that the temporary database is now present.  If anything inside the
    guarded section fails, ``self.stop()`` is called and the original error
    is re-raised.
    """
    # Discard an engine that may have been cached by an earlier, non-patched
    # run
    repository.engineFactory(reset=True)

    try:
        # Redirect the repository to the temporary database name
        self._configPatch.start()
        self._configPatchApplied = True

        # Verify that the database doesn't exist yet
        existsBefore = self.tempDatabaseName in getAllDatabaseNames()
        assert not existsBefore, (
            "Temp repo db=%s already existed" % (self.tempDatabaseName,))

        # Create the temporary repository database; the flag is set first so
        # that it is recorded even when reset() fails
        self._attemptedToCreateRepository = True
        repository.reset()

        # Verify that the temporary repository database got created
        existsAfter = self.tempDatabaseName in getAllDatabaseNames()
        assert existsAfter, (
            "Temp repo db=%s not found" % (self.tempDatabaseName,))
    except:
        # Attempt to clean up, then propagate the original error
        self.stop()
        raise
def GET(self):
    """
    Get model data stats

    ::

        GET /_models/data/stats

    Returns:

    ::

        {
            "processing_time_remaining": 37
        }

    The value is the unprocessed model data count multiplied by
    ``_PROCESSING_TIME_PER_RECORD``, rounded up to an integer.
    """
    with repository.engineFactory().connect() as conn:
        pendingCount = repository.getUnprocessedModelDataCount(conn)

    estimate = math.ceil(pendingCount * _PROCESSING_TIME_PER_RECORD)
    payload = {"processing_time_remaining": int(estimate)}

    self.addStandardHeaders()
    return utils.jsonEncode(payload)
def GET(self, autostackId, *args):  # pylint: disable=C0103,W0613
    """
    Get Metrics associated with autostack

    ::

        GET /_autostacks/{autostackId}/metrics

    NOTE: args is ignored.  Function signature for all method handlers must
    be compatible with the regexp pattern that matches.  POST optionally
    takes a second argument, DELETE requires it.

    Returns the JSON-encoded list of metric dicts.  An unknown autostack is
    reported through ``web.notfound``; any other non-HTTP error is wrapped
    in ``web.internalerror``.
    """
    try:
        self.addStandardHeaders()

        engine = repository.engineFactory()
        displayFields = getMetricDisplayFields(engine)
        rows = repository.getAutostackMetrics(engine, autostackId,
                                              displayFields)

        return utils.jsonEncode(
            [convertMetricRowToMetricDict(row) for row in rows])

    except ObjectNotFoundError:
        raise web.notfound(
            "Autostack not found: Autostack ID: %s" % autostackId)
    except web.HTTPError as ex:
        # Log 400-599 status codes as errors, ignoring 200-399
        isErrorStatus = re.match(r"([45][0-9][0-9])\s?", web.ctx.status)
        if isErrorStatus is not None:
            log.error(str(ex) or repr(ex))
        raise
    except Exception as ex:
        raise web.internalerror(str(ex) or repr(ex))
def testMetricDataTimeStampQueryParams(uid):
    '''
    Exercise the MetricDataHandler GET call with the from and to params :
      _models/<uid>/data?from=<>&to=<>

    The bounds are the timestamps of the earliest and latest metric_data rows
    of the uid; every row in the response must fall within them.  Uses
    ``self`` from the enclosing scope.
    '''
    timestampColumn = schema.metric_data.c.timestamp

    def _edgeRows(conn, ordering):
        # Single row of this uid at one end of the timestamp ordering
        query = (sql.select([schema.metric_data])
                 .where(schema.metric_data.c.uid == uid)
                 .order_by(ordering(timestampColumn))
                 .limit(1))
        return conn.execute(query).fetchall()

    with repository.engineFactory().connect() as conn:
        firstMetricData = _edgeRows(conn, sql.expression.asc)
        lastMetricData = _edgeRows(conn, sql.expression.desc)

    firstTimeStamp = firstMetricData[0].timestamp
    lastTimeStamp = lastMetricData[0].timestamp

    url = "/%s/data?from=%s&to=%s" % (uid, firstTimeStamp, lastTimeStamp)
    response = self.app.get(url, headers=self.headers)
    assertions.assertSuccess(self, response)

    getAllModelsResult = utils.jsonDecode(response.body)
    for metricData in getAllModelsResult['data']:
        rowTimeStamp = datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S')
        self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
        self.assertLessEqual(rowTimeStamp, lastTimeStamp)
def testMetricDataTimeStampQueryParams(uid):
    '''
    Exercise the MetricDataHandler GET call with the from and to params :
      _models/<uid>/data?from=<>&to=<>

    The bounds are the timestamps of the earliest and latest metric_data rows
    of the uid; every row in the response must fall within them.  Uses
    ``self`` from the enclosing scope.
    '''
    timestampColumn = schema.metric_data.c.timestamp

    def _edgeRows(conn, ordering):
        # Single row of this uid at one end of the timestamp ordering
        query = (sql.select([schema.metric_data])
                 .where(schema.metric_data.c.uid == uid)
                 .order_by(ordering(timestampColumn))
                 .limit(1))
        return conn.execute(query).fetchall()

    with repository.engineFactory().connect() as conn:
        firstMetricData = _edgeRows(conn, sql.expression.asc)
        lastMetricData = _edgeRows(conn, sql.expression.desc)

    firstTimeStamp = firstMetricData[0].timestamp
    lastTimeStamp = lastMetricData[0].timestamp

    url = "/%s/data?from=%s&to=%s" % (uid, firstTimeStamp, lastTimeStamp)
    response = self.app.get(url, headers=self.headers)
    assertions.assertSuccess(self, response)

    getAllModelsResult = utils.jsonDecode(response.body)
    for metricData in getAllModelsResult['data']:
        rowTimeStamp = datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S')
        self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
        self.assertLessEqual(rowTimeStamp, lastTimeStamp)
def testMetricDataQueryParams(uid):
    '''
    Exercise the MetricDataHandler GET call with the from, to and anomaly
    params combined :
      _models/<uid>/data?from=<>&to=<>&anomaly=<>

    The time bounds come from the earliest and latest metric_data rows of the
    uid whose anomaly score differs from 0 by more than 1e-5; the anomaly
    threshold is the score of the earliest such row.  Uses ``self`` from the
    enclosing scope.
    '''
    earliestQuery = (
        "SELECT * FROM `metric_data` WHERE `uid`='%s' "
        "and abs(`anomaly_score` - 0) > 1e-5 "
        "ORDER BY `timestamp` ASC LIMIT 1" % uid)
    latestQuery = (
        "SELECT * FROM `metric_data` WHERE `uid`='%s' "
        "and abs(`anomaly_score` - 0) > 1e-5 "
        "ORDER BY `timestamp` DESC LIMIT 1" % uid)

    with repository.engineFactory().connect() as conn:
        firstMetricData = conn.execute(earliestQuery).fetchall()
        lastMetricData = conn.execute(latestQuery).fetchall()

    earliestRow = firstMetricData[0]
    firstTimeStamp = earliestRow.timestamp
    lastTimeStamp = lastMetricData[0].timestamp
    anomalyScore = earliestRow.anomaly_score

    url = ("/%s/data?from=%s&to=%s&anomaly=%s"
           % (uid, firstTimeStamp, lastTimeStamp, anomalyScore))
    response = self.app.get(url, headers=self.headers)
    assertions.assertSuccess(self, response)

    getAllModelsResult = utils.jsonDecode(response.body)
    for metricData in getAllModelsResult['data']:
        self.assertGreaterEqual(metricData[2], anomalyScore)
        rowTimeStamp = datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S')
        self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
        self.assertLessEqual(rowTimeStamp, lastTimeStamp)
def GET(self, autostackId, *args):  # pylint: disable=C0103,W0613
    """
    Get Metrics associated with autostack

    ::

        GET /_autostacks/{autostackId}/metrics

    NOTE: args is ignored.  Function signature for all method handlers must
    be compatible with the regexp pattern that matches.  POST optionally
    takes a second argument, DELETE requires it.

    Returns the JSON-encoded list of metric dicts.  An unknown autostack is
    reported through ``web.notfound``; any other non-HTTP error is wrapped
    in ``web.internalerror``.
    """
    try:
        self.addStandardHeaders()

        engine = repository.engineFactory()
        displayFields = getMetricDisplayFields(engine)
        rows = repository.getAutostackMetrics(engine, autostackId,
                                              displayFields)

        return utils.jsonEncode(
            [convertMetricRowToMetricDict(row) for row in rows])

    except ObjectNotFoundError:
        raise web.notfound(
            "Autostack not found: Autostack ID: %s" % autostackId)
    except web.HTTPError as ex:
        # Log 400-599 status codes as errors, ignoring 200-399
        isErrorStatus = re.match(r"([45][0-9][0-9])\s?", web.ctx.status)
        if isErrorStatus is not None:
            log.error(str(ex) or repr(ex))
        raise
    except Exception as ex:
        raise web.internalerror(str(ex) or repr(ex))
def GET(self):
    """
    Get model data stats

    ::

        GET /_models/data/stats

    Returns:

    ::

        {
            "processing_time_remaining": 37
        }

    The value is the unprocessed model data count multiplied by
    ``_PROCESSING_TIME_PER_RECORD``, rounded up to an integer.
    """
    with repository.engineFactory().connect() as conn:
        pendingCount = repository.getUnprocessedModelDataCount(conn)

    estimate = math.ceil(pendingCount * _PROCESSING_TIME_PER_RECORD)
    payload = {"processing_time_remaining": int(estimate)}

    self.addStandardHeaders()
    return utils.jsonEncode(payload)
def testMetricDataQueryParams(uid):
    '''
    Exercise the MetricDataHandler GET call with the from, to and anomaly
    params combined :
      _models/<uid>/data?from=<>&to=<>&anomaly=<>

    The time bounds come from the earliest and latest metric_data rows of the
    uid whose anomaly score differs from 0 by more than 1e-5; the anomaly
    threshold is the score of the earliest such row.  Uses ``self`` from the
    enclosing scope.
    '''
    earliestQuery = (
        "SELECT * FROM `metric_data` WHERE `uid`='%s' "
        "and abs(`anomaly_score` - 0) > 1e-5 "
        "ORDER BY `timestamp` ASC LIMIT 1" % uid)
    latestQuery = (
        "SELECT * FROM `metric_data` WHERE `uid`='%s' "
        "and abs(`anomaly_score` - 0) > 1e-5 "
        "ORDER BY `timestamp` DESC LIMIT 1" % uid)

    with repository.engineFactory().connect() as conn:
        firstMetricData = conn.execute(earliestQuery).fetchall()
        lastMetricData = conn.execute(latestQuery).fetchall()

    earliestRow = firstMetricData[0]
    firstTimeStamp = earliestRow.timestamp
    lastTimeStamp = lastMetricData[0].timestamp
    anomalyScore = earliestRow.anomaly_score

    url = ("/%s/data?from=%s&to=%s&anomaly=%s"
           % (uid, firstTimeStamp, lastTimeStamp, anomalyScore))
    response = self.app.get(url, headers=self.headers)
    assertions.assertSuccess(self, response)

    getAllModelsResult = utils.jsonDecode(response.body)
    for metricData in getAllModelsResult['data']:
        self.assertGreaterEqual(metricData[2], anomalyScore)
        rowTimeStamp = datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S')
        self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
        self.assertLessEqual(rowTimeStamp, lastTimeStamp)
def run(self):
    """Loop forever collecting data for pending autostack metrics.

    Each iteration asks the repository (with retries on transient errors)
    for the autostack metrics pending data collection.  When nothing is
    pending it sleeps for ``self._NOTHING_READY_SLEEP_TIME_SEC``; otherwise
    it builds one ``AutostackMetricRequest`` per metric and hands the batch
    to ``self._processAutostackMetricRequests``.
    """
    with ModelSwapperInterface() as modelSwapper:
        engine = repository.engineFactory()
        fetchPending = repository.retryOnTransientErrors(
            repository.getAutostackMetricsPendingDataCollection)

        while True:
            with engine.connect() as conn:
                pendingStacks = fetchPending(conn)

            if not pendingStacks:
                time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
                continue

            # Build a sequence of autostack metric requests; refID is the
            # request's position in the sequence
            requests = []
            for autostack, metrics in pendingStacks:
                for metric in metrics:
                    requests.append(
                        AutostackMetricRequest(refID=len(requests),
                                               autostack=autostack,
                                               metric=metric))

            # Collect, aggregate, and stream metric data
            self._processAutostackMetricRequests(engine, requests,
                                                 modelSwapper)
def checkModelIsActive(self, uid):
    """Assert that the metric with the given uid has status ACTIVE.

    Only the status column is fetched from the repository.
    """
    statusOnly = [schema.metric.c.status]
    with repository.engineFactory().begin() as conn:
        metricObj = repository.getMetric(conn, uid, fields=statusOnly)

    self.assertEqual(metricObj.status, MetricStatus.ACTIVE)
def tearDownClass(cls):
    """Delete the metric ``cls.uid`` and log whether the deletion took effect.

    After deleting, the metric is looked up again: an ObjectNotFoundError
    counts as a successful clean-up, while a lookup that succeeds is logged
    as an error.
    """
    try:
        engine = repository.engineFactory()

        with engine.connect() as conn:
            repository.deleteMetric(conn, cls.uid)

        # The lookup is expected to raise now that the metric is gone
        with engine.connect() as conn:
            repository.getMetric(conn, cls.uid)
    except ObjectNotFoundError:
        g_logger.info("Successful clean-up")
        return

    g_logger.error("Test failed to delete metric=%s", cls.uid)
def checkMetricUnmonitoredById(self, uid):
    """Assert that the metric with the given uid is unmonitored.

    Checks that its status is UNMONITORED, that its parameters are None and
    that loading its model definition raises ModelNotFound.
    """
    wantedFields = [schema.metric.c.status, schema.metric.c.parameters]
    with repository.engineFactory().begin() as conn:
        metricObj = repository.getMetric(conn, uid, fields=wantedFields)

    self.assertEqual(metricObj.status, MetricStatus.UNMONITORED)
    self.assertIsNone(metricObj.parameters)

    checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()
    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
        checkpointMgr.loadModelDefinition(uid)
def checkStats(self, metricName, mn, mx):
    """Check that stats are computed correctly from the database.

    Looks up the custom metric named ``metricName``, fetches its stats and
    asserts that they consist of exactly "min" and "max", approximately
    equal to ``mn`` and ``mx`` respectively.
    """
    wantedFields = [schema.metric.c.uid, schema.metric.c.parameters]

    with repository.engineFactory().begin() as conn:
        metricObj = repository.getCustomMetricByName(conn,
                                                     metricName,
                                                     fields=wantedFields)
        stats = repository.getMetricStats(conn, metricObj.uid)

    expectedKeys = set(("min", "max"))
    self.assertSetEqual(set(stats.keys()), expectedKeys)
    self.assertAlmostEqual(stats["min"], mn)
    self.assertAlmostEqual(stats["max"], mx)
def checkMetricUnmonitoredById(self, uid):
    """Assert that the metric with the given uid is unmonitored.

    Checks that its status is UNMONITORED, that its parameters are None and
    that loading its model definition raises ModelNotFound.
    """
    wantedFields = [schema.metric.c.status, schema.metric.c.parameters]
    with repository.engineFactory().begin() as conn:
        metricObj = repository.getMetric(conn, uid, fields=wantedFields)

    self.assertEqual(metricObj.status, MetricStatus.UNMONITORED)
    self.assertIsNone(metricObj.parameters)

    checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()
    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
        checkpointMgr.loadModelDefinition(uid)
def checkStats(self, metricName, mn, mx):
    """Check that stats are computed correctly from the database.

    Looks up the custom metric named ``metricName``, fetches its stats and
    asserts that they consist of exactly "min" and "max", approximately
    equal to ``mn`` and ``mx`` respectively.
    """
    wantedFields = [schema.metric.c.uid, schema.metric.c.parameters]

    with repository.engineFactory().begin() as conn:
        metricObj = repository.getCustomMetricByName(conn,
                                                     metricName,
                                                     fields=wantedFields)
        stats = repository.getMetricStats(conn, metricObj.uid)

    expectedKeys = set(("min", "max"))
    self.assertSetEqual(set(stats.keys()), expectedKeys)
    self.assertAlmostEqual(stats["min"], mn)
    self.assertAlmostEqual(stats["max"], mx)
def testMetricDataAnomalyAsQueryParams(uid):
    '''
    Exercise the MetricDataHandler GET call with the anomaly param :
      _models/<uid>/data?anomaly=<>

    The threshold is the anomaly score of one metric_data row of the uid
    whose score differs from 0 by more than 1e-5; every row in the response
    must score at least that much.  Uses ``self`` from the enclosing scope.
    '''
    queryString = ("SELECT * FROM metric_data WHERE uid='%s' "
                   "   and abs(anomaly_score - 0) > 1e-5 LIMIT 1") % uid

    with repository.engineFactory().connect() as conn:
        sampleMetricData = conn.execute(queryString).first()

    anomalyScore = sampleMetricData.anomaly_score

    url = "/%s/data?anomaly=%s" % (uid, anomalyScore)
    response = self.app.get(url, headers=self.headers)
    assertions.assertSuccess(self, response)

    rows = utils.jsonDecode(response.body)['data']
    for metricData in rows:
        self.assertGreaterEqual(metricData[2], anomalyScore)
def testTablesCreatedWithInnoDBEngine(self):
    """
    Tests to make sure that all of the tables in the grok table_schema were
    created using the InnoDB engine to preserve referential integrity.

    At this time, it is checking all tables in the DB; in the future, if we
    do not require referential integrity, we can explicitly whitelist
    specific tables to allow those to use `MyISAM` or another engine.
    """
    query = ("SELECT table_name, engine "
             "FROM information_schema.tables "
             "WHERE table_schema = 'grok'")

    for row in repository.engineFactory().execute(query):
        failureMessage = ("Table %s was created with the wrong engine type"
                          % row["table_name"])
        self.assertEqual(row.engine, "InnoDB", failureMessage)
def checkEncoderResolution(self, uid, minVal, maxVal):
    """Check that encoder resolution is computed correctly.

    Loads the model params of the metric ``uid`` and asserts that the
    resolution of encoder "c1" lies strictly between
    ``(maxVal - minVal) / 300.0`` and ``(maxVal - minVal) / 80.0``.

    Fails with "No model exists for metric <name>" when the metric has no
    model params, whether the stored value is missing or decodes to None.
    """
    engine = repository.engineFactory()
    with engine.begin() as conn:
        metricObj = repository.getMetric(
            conn,
            uid,
            fields=[schema.metric.c.name, schema.metric.c.model_params])

    noModelMessage = "No model exists for metric %s" % metricObj.name

    # json.loads() raises TypeError on None, so check the raw value first to
    # get the intended failure message
    self.assertIsNotNone(metricObj.model_params, noModelMessage)

    modelParams = json.loads(metricObj.model_params)
    # A stored JSON "null" also means that there is no model
    self.assertNotEqual(modelParams, None, noModelMessage)

    sensorParams = modelParams["modelConfig"]["modelParams"]["sensorParams"]
    encoderParams = sensorParams["encoders"]["c1"]

    # Estimate and check the bounds for the resolution based on min and max
    lower = (maxVal - minVal) / 300.0
    upper = (maxVal - minVal) / 80.0
    self.assertGreater(encoderParams["resolution"], lower)
    self.assertLess(encoderParams["resolution"], upper)
def testTablesCreatedWithInnoDBEngine(self):
    """
    Tests to make sure that all of the tables in the grok table_schema were
    created using the InnoDB engine to preserve referential integrity.

    At this time, it is checking all tables in the DB; in the future, if we
    do not require referential integrity, we can explicitly whitelist
    specific tables to allow those to use `MyISAM` or another engine.
    """
    query = ("SELECT table_name, engine "
             "FROM information_schema.tables "
             "WHERE table_schema = 'grok'")

    for row in repository.engineFactory().execute(query):
        failureMessage = ("Table %s was created with the wrong engine type"
                          % row["table_name"])
        self.assertEqual(row.engine, "InnoDB", failureMessage)
def _runBasicChecksOnModel(self, modelId, _adapter, modelSpec):
    """Run the common sanity checks on a newly created model.

    Asserts that the metric's status is CREATE_PENDING, ACTIVE or
    PENDING_DATA, that its stored parameters decode to ``modelSpec``, then
    delegates to ``checkModelIsActive`` and to ``checkModelResultsSize``
    (at least one result).  ``_adapter`` is not used.
    """
    with repository.engineFactory().connect() as conn:
        metricObj = repository.getMetric(conn, modelId)

    _LOG.info("Making sure metric is CREATE_PENDING or ACTIVE or PENDING_DATA")
    acceptableStatuses = [MetricStatus.CREATE_PENDING,
                          MetricStatus.ACTIVE,
                          MetricStatus.PENDING_DATA]
    self.assertIn(metricObj.status, acceptableStatuses)

    _LOG.info("Checking modelSpec")
    storedSpec = jsonDecode(metricObj.parameters)
    self.assertEqual(storedSpec, modelSpec)

    _LOG.info("Waiting for model to become active")
    self.checkModelIsActive(modelId)

    _LOG.info("Waiting at least one model result")
    self.checkModelResultsSize(modelId, 1, atLeast=True)
def checkEncoderResolution(self, uid, minVal, maxVal):
    """Check that encoder resolution is computed correctly.

    Loads the model params of the metric ``uid`` and asserts that the
    resolution of encoder "c1" lies strictly between
    ``(maxVal - minVal) / 300.0`` and ``(maxVal - minVal) / 80.0``.

    Fails with "No model exists for metric <name>" when the metric has no
    model params, whether the stored value is missing or decodes to None.
    """
    engine = repository.engineFactory()
    with engine.begin() as conn:
        metricObj = repository.getMetric(
            conn,
            uid,
            fields=[schema.metric.c.name, schema.metric.c.model_params])

    noModelMessage = "No model exists for metric %s" % metricObj.name

    # json.loads() raises TypeError on None, so check the raw value first to
    # get the intended failure message
    self.assertIsNotNone(metricObj.model_params, noModelMessage)

    modelParams = json.loads(metricObj.model_params)
    # A stored JSON "null" also means that there is no model
    self.assertNotEqual(modelParams, None, noModelMessage)

    sensorParams = modelParams["modelConfig"]["modelParams"]["sensorParams"]
    encoderParams = sensorParams["encoders"]["c1"]

    # Estimate and check the bounds for the resolution based on min and max
    lower = (maxVal - minVal) / 300.0
    upper = (maxVal - minVal) / 80.0
    self.assertGreater(encoderParams["resolution"], lower)
    self.assertLess(encoderParams["resolution"], upper)
def setUpClass(cls):
    """Load the EC2 test resource and prepare class-level fixtures.

    Reads the first EC2 instance entry from test_resources.yaml and stores
    its region and instance id, the grok API key, a model spec built from
    the entry (without min/max) and a repository engine on the class.
    """
    resourcesPath = os.path.join(
        grok.app.GROK_HOME,
        "tests/py/integration/app/test_resources.yaml")

    # NOTE(review): yaml.load() is called without an explicit Loader; if this
    # file only holds plain data, yaml.safe_load() looks like the safer
    # choice - confirm before changing
    with open(resourcesPath) as fin:
        resources = yaml.load(fin)

    testCase = resources[aws_base.ResourceTypeNames.EC2_INSTANCE][0]

    cls._testRegion = testCase["region"]
    cls._testId = testCase["dimensions"]["InstanceId"]

    # Load grok API Key as required by TestCaseBase
    cls.apiKey = grok.app.config.get("security", "apikey")

    metricSpec = {
        "region": testCase["region"],
        "namespace": testCase["namespace"],
        "metric": testCase["metric"],
        "dimensions": testCase["dimensions"],
    }
    cls._modelSpecNoMinMax = {
        "datasource": testCase["datasource"],
        "metricSpec": metricSpec,
    }

    cls.engine = repository.engineFactory()
def formatMetricRowProxy(metricObj):
    """Convert a metric row into a dict restricted to the display fields.

    The result holds every column of ``metricObj`` whose name is among the
    names returned by ``getMetricDisplayFields``, plus:

    - "display_name": "<tag_name> (<server>)" when tag_name is non-empty,
      otherwise the server
    - "parameters": the row's parameters, JSON-decoded when stored as a
      string; None when the row carries no ``parameters`` attribute

    :param metricObj: row object exposing ``tag_name``, ``server``,
      ``keys()`` and attribute access by column name
    :returns: dict describing the metric
    """
    if metricObj.tag_name:
        displayName = "%s (%s)" % (metricObj.tag_name, metricObj.server)
    else:
        displayName = metricObj.server

    # The attribute may be absent (the original guarded it with hasattr, yet
    # still read it unconditionally in the fallback branch); fall back to
    # None instead of raising AttributeError
    parameters = getattr(metricObj, "parameters", None)
    if isinstance(parameters, basestring):
        parameters = json.loads(parameters)

    engine = repository.engineFactory()
    allowedKeys = set(col.name for col in getMetricDisplayFields(engine))

    metricDict = dict((col, getattr(metricObj, col))
                      for col in metricObj.keys()
                      if col in allowedKeys)
    metricDict["display_name"] = displayName
    metricDict["parameters"] = parameters

    return metricDict
def fixUpGrokDB():
    """Manually switch the grok database over to alembic and migrate it.

    Drops the DATABASECHANGELOG tables, creates ``alembic_version`` seeded
    with revision 3a7e06671df4, then runs ``repository.migrate`` to revision
    2f1ee984f978.
    """
    g_log.info("******* UPDATING GROKDB *******")

    # Perform manual db migration to switch to sqlalchemy
    statements = (
        "DROP TABLE IF EXISTS DATABASECHANGELOG",
        "DROP TABLE IF EXISTS DATABASECHANGELOGLOCK",
        "CREATE TABLE `alembic_version` (`version_num` varchar(32) NOT NULL) "
        "ENGINE=InnoDB DEFAULT CHARSET=utf8;",
        # This fools Alembic into thinking the first migration, which goes
        # from an empty database to the 1.6 setup, has already been
        # completed so it doesn't attempt to perform it.
        "INSERT INTO alembic_version (version_num) VALUES('3a7e06671df4');",
    )

    with repository.engineFactory().connect() as connection:
        for statement in statements:
            connection.execute(statement)

    # Now we can run the migration script to upgrade from 1.6 to 1.7.
    repository.migrate("2f1ee984f978")

    g_log.info("******* GROKDB UPDATED *******")
def formatMetricRowProxy(metricObj):
    """Convert a metric row into a dict restricted to the display fields.

    The result holds every column of ``metricObj`` whose name is among the
    names returned by ``getMetricDisplayFields``, plus:

    - "display_name": "<tag_name> (<server>)" when tag_name is non-empty,
      otherwise the server
    - "parameters": the row's parameters, JSON-decoded when stored as a
      string; None when the row carries no ``parameters`` attribute

    :param metricObj: row object exposing ``tag_name``, ``server``,
      ``keys()`` and attribute access by column name
    :returns: dict describing the metric
    """
    if metricObj.tag_name:
        displayName = "%s (%s)" % (metricObj.tag_name, metricObj.server)
    else:
        displayName = metricObj.server

    # The attribute may be absent (the original guarded it with hasattr, yet
    # still read it unconditionally in the fallback branch); fall back to
    # None instead of raising AttributeError
    parameters = getattr(metricObj, "parameters", None)
    if isinstance(parameters, basestring):
        parameters = json.loads(parameters)

    engine = repository.engineFactory()
    allowedKeys = set(col.name for col in getMetricDisplayFields(engine))

    metricDict = dict((col, getattr(metricObj, col))
                      for col in metricObj.keys()
                      if col in allowedKeys)
    metricDict["display_name"] = displayName
    metricDict["parameters"] = parameters

    return metricDict
def setUpClass(cls):
    """Load the EC2 test resource and prepare class-level fixtures.

    Reads the first EC2 instance entry from test_resources.yaml and stores
    its region and instance id, the grok API key, a model spec built from
    the entry (without min/max) and a repository engine on the class.
    """
    resourcesPath = os.path.join(
        grok.app.GROK_HOME,
        "tests/py/integration/app/test_resources.yaml")

    # NOTE(review): yaml.load() is called without an explicit Loader; if this
    # file only holds plain data, yaml.safe_load() looks like the safer
    # choice - confirm before changing
    with open(resourcesPath) as fin:
        resources = yaml.load(fin)

    testCase = resources[aws_base.ResourceTypeNames.EC2_INSTANCE][0]

    cls._testRegion = testCase["region"]
    cls._testId = testCase["dimensions"]["InstanceId"]

    # Load grok API Key as required by TestCaseBase
    cls.apiKey = grok.app.config.get("security", "apikey")

    metricSpec = {
        "region": testCase["region"],
        "namespace": testCase["namespace"],
        "metric": testCase["metric"],
        "dimensions": testCase["dimensions"],
    }
    cls._modelSpecNoMinMax = {
        "datasource": testCase["datasource"],
        "metricSpec": metricSpec,
    }

    cls.engine = repository.engineFactory()
def run(self):
    """Loop forever collecting data for pending autostack metrics.

    Each iteration asks the repository (with retries on transient errors)
    for the autostack metrics pending data collection.  When nothing is
    pending it sleeps for ``self._NOTHING_READY_SLEEP_TIME_SEC``; otherwise
    it builds one ``AutostackMetricRequest`` per metric and hands the batch
    to ``self._processAutostackMetricRequests``.
    """
    with ModelSwapperInterface() as modelSwapper:
        engine = repository.engineFactory()
        fetchPending = repository.retryOnTransientErrors(
            repository.getAutostackMetricsPendingDataCollection)

        while True:
            with engine.connect() as conn:
                pendingStacks = fetchPending(conn)

            if not pendingStacks:
                time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
                continue

            # Build a sequence of autostack metric requests; refID is the
            # request's position in the sequence
            requests = []
            for autostack, metrics in pendingStacks:
                for metric in metrics:
                    requests.append(
                        AutostackMetricRequest(refID=len(requests),
                                               autostack=autostack,
                                               metric=metric))

            # Collect, aggregate, and stream metric data
            self._processAutostackMetricRequests(engine, requests,
                                                 modelSwapper)
def setUpClass(cls):
    """Store the grok API key and a repository engine on the class."""
    # Load grok API Key as required by TestCaseBase
    apiKey = grok.app.config.get("security", "apikey")
    cls.apiKey = apiKey

    engine = repository.engineFactory()
    cls.engine = engine
def POST(cls):
    """Upload the metric info and metric data as a compressed tarfile to S3.

    The request must include the uid of the metric and may include other
    JSON keys as well.  For instance, it is likely that a request from the
    mobile application will include information about the current view and
    data being displayed when the feedback request is sent.  Any fields in
    addition to uid will be stored with the feedback archive file that is
    uploaded to S3.

    The archive holds ``metric.json`` (the remaining request fields, the
    server id and the JSON-encoded metric record) and ``metric_data.csv``
    (one row per metric_data record).  Returns whatever
    ``cls._uploadTarfile`` returns; the temporary directory is always
    removed.
    """
    inputData = json.loads(web.data())

    # Move the uid out of the payload and stamp the payload with this server
    uid = inputData["uid"]
    del inputData["uid"]
    inputData["server_id"] = _MACHINE_ID

    # Data is written to a temporary directory before uploading
    path = tempfile.mkdtemp()

    try:
        # Retrieve the metric table record and add it to the other input
        # parameters
        metricColumns = schema.metric.c
        metricFields = [metricColumns.uid,
                        metricColumns.datasource,
                        metricColumns.name,
                        metricColumns.description,
                        metricColumns.server,
                        metricColumns.location,
                        metricColumns.parameters,
                        metricColumns.status,
                        metricColumns.message,
                        metricColumns.last_timestamp,
                        metricColumns.poll_interval,
                        metricColumns.tag_name,
                        metricColumns.last_rowid]

        with repository.engineFactory().connect() as conn:
            metricRow = repository.getMetric(conn, uid, metricFields)

        metric = {}
        for col in metricFields:
            value = getattr(metricRow, col.name)
            if col.name == "parameters":
                # Parameters are stored JSON-encoded
                value = utils.jsonDecode(value)
            metric[col.name] = value

        if metric["tag_name"]:
            metric["display_name"] = "%s (%s)" % (metric["tag_name"],
                                                   metric["server"])
        else:
            metric["display_name"] = metric["server"]

        inputData["metric"] = utils.jsonEncode(metric)

        metricPath = os.path.join(path, "metric.json")
        with open(metricPath, "w") as f:
            json.dump(inputData, f)

        # Retrieve the metric data
        with repository.engineFactory().connect() as conn:
            metricDataRows = repository.getMetricData(conn, uid)

        metricData = [
            dict((col.name, getattr(dataRow, col.name))
                 for col in schema.metric_data.columns)
            for dataRow in metricDataRows]

        metricDataPath = os.path.join(path, "metric_data.csv")
        with open(metricDataPath, "w") as f:
            writer = csv.writer(f)
            if len(metricData) > 0:
                # Write the field names first, then one line per row in the
                # same field order
                header = metricData[0].keys()
                writer.writerow(header)
                for dataDict in metricData:
                    writer.writerow([dataDict[h] for h in header])

        # Create a tarfile to upload
        ts = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S")
        filename = "metric_dump_%s_%s.tar.gz" % (uid, ts)
        tfPath = os.path.join(path, filename)

        with tarfile.open(tfPath, "w:gz") as tf:
            for member in (metricPath, metricDataPath):
                tf.add(member, arcname=os.path.basename(member))

        # Upload the tarfile
        return cls._uploadTarfile(filename, tfPath)

    finally:
        shutil.rmtree(path)
def POST(cls):
    """Upload the metric info and metric data as a compressed tarfile to S3.

    The request must include the uid of the metric and may include other
    JSON keys as well.  For instance, it is likely that a request from the
    mobile application will include information about the current view and
    data being displayed when the feedback request is sent.  Any fields in
    addition to uid will be stored with the feedback archive file that is
    uploaded to S3.

    The archive holds ``metric.json`` (the remaining request fields, the
    server id and the JSON-encoded metric record) and ``metric_data.csv``
    (one row per metric_data record).  Returns whatever
    ``cls._uploadTarfile`` returns; the temporary directory is always
    removed.
    """
    inputData = json.loads(web.data())

    # Move the uid out of the payload and stamp the payload with this server
    uid = inputData["uid"]
    del inputData["uid"]
    inputData["server_id"] = _MACHINE_ID

    # Data is written to a temporary directory before uploading
    path = tempfile.mkdtemp()

    try:
        # Retrieve the metric table record and add it to the other input
        # parameters
        metricColumns = schema.metric.c
        metricFields = [metricColumns.uid,
                        metricColumns.datasource,
                        metricColumns.name,
                        metricColumns.description,
                        metricColumns.server,
                        metricColumns.location,
                        metricColumns.parameters,
                        metricColumns.status,
                        metricColumns.message,
                        metricColumns.last_timestamp,
                        metricColumns.poll_interval,
                        metricColumns.tag_name,
                        metricColumns.last_rowid]

        with repository.engineFactory().connect() as conn:
            metricRow = repository.getMetric(conn, uid, metricFields)

        metric = {}
        for col in metricFields:
            value = getattr(metricRow, col.name)
            if col.name == "parameters":
                # Parameters are stored JSON-encoded
                value = utils.jsonDecode(value)
            metric[col.name] = value

        if metric["tag_name"]:
            metric["display_name"] = "%s (%s)" % (metric["tag_name"],
                                                   metric["server"])
        else:
            metric["display_name"] = metric["server"]

        inputData["metric"] = utils.jsonEncode(metric)

        metricPath = os.path.join(path, "metric.json")
        with open(metricPath, "w") as f:
            json.dump(inputData, f)

        # Retrieve the metric data
        with repository.engineFactory().connect() as conn:
            metricDataRows = repository.getMetricData(conn, uid)

        metricData = [
            dict((col.name, getattr(dataRow, col.name))
                 for col in schema.metric_data.columns)
            for dataRow in metricDataRows]

        metricDataPath = os.path.join(path, "metric_data.csv")
        with open(metricDataPath, "w") as f:
            writer = csv.writer(f)
            if len(metricData) > 0:
                # Write the field names first, then one line per row in the
                # same field order
                header = metricData[0].keys()
                writer.writerow(header)
                for dataDict in metricData:
                    writer.writerow([dataDict[h] for h in header])

        # Create a tarfile to upload
        ts = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S")
        filename = "metric_dump_%s_%s.tar.gz" % (uid, ts)
        tfPath = os.path.join(path, filename)

        with tarfile.open(tfPath, "w:gz") as tf:
            for member in (metricPath, metricDataPath):
                tf.add(member, arcname=os.path.basename(member))

        # Upload the tarfile
        return cls._uploadTarfile(filename, tfPath)

    finally:
        shutil.rmtree(path)
def testCollectMetricStatistics(self): expectedStatisticNames = ["min", "max"] def validateStats(stats): self.assertIsInstance(stats, (list, tuple)) timestamps = [] for instanceMetrics in stats: self.assertEqual(len(instanceMetrics.records), 1) record = instanceMetrics.records[0] self.assertIsInstance(record.value, dict) self.assertGreater(len(record.value), 0) self.assertTrue( set(record.value.iterkeys()).issubset(expectedStatisticNames), msg=record.value) for metricValue in record.value.itervalues(): self.assertIsInstance(metricValue, float, msg=instanceMetrics) self.assertFalse(math.isnan(metricValue)) timestamps.append(record.timestamp) # Verify that all the stats timestamps are the same if timestamps: self.assertSequenceEqual(timestamps, [timestamps[0]] * len(timestamps)) # Collection data for both autostack/metric combinations collector = EC2InstanceMetricGetter() self.addCleanup(collector.close) def _createAutostackMetric(conn, name, region, filters): autostackDict = repository.addAutostack(conn, name=name, region=region, filters=json.dumps(filters)) modelSpec = {"modelParams": {}, "datasource": "autostack", "metricSpec": {"slaveDatasource": "cloudwatch", "slaveMetric": {"metric": "CPUUtilization", "namespace": "AWS/EC2"}, "autostackId": autostackDict["uid"]}} metricDict = repository.addMetric( conn, datasource="autostack", name="CPUUtilization", description=("CPUUtilization on Grok Autostack {0} in us-west-2 " "region").format(name), server="Autostacks/{0}".format(autostackDict["uid"]), location=region, tag_name=name, parameters=htmengine.utils.jsonEncode(modelSpec), poll_interval=300, status=MetricStatus.UNMONITORED) repository.addMetricToAutostack(conn, autostackDict["uid"], metricDict["uid"]) autostackObj = type("MutableAutostack", (object,), autostackDict)() autostackObj.filters = json.loads(autostackObj.filters) metricObj = type("MutableMetric", (object,), metricDict)() return autostackObj, metricObj # All instances in us-east-1 engine = 
repository.engineFactory() with engine.begin() as conn: autostack1, m1 = ( _createAutostackMetric(conn, name="testCollectMetricStats1", region="us-east-1", filters={"tag:Name": ["*"]})) stats1 = collector.collectMetricStatistics( autostack=autostack1, metric=m1) print "STATS1:", stats1 validateStats(stats1) self.assertGreaterEqual(len(stats1), 1) # All instances in us-west-2 autostack2, m2 = _createAutostackMetric(conn, name="testCollectMetricStats2", region="us-west-2", filters={"tag:Name": ["*"]}) stats2 = collector.collectMetricStatistics( autostack=autostack2, metric=m2) print "STATS2:", stats2 validateStats(stats2) self.assertGreater(len(stats2), 1) # No matching instances in us-west-2 autostack3, m3 = ( _createAutostackMetric( conn, name="testCollectMetricStatistics3", region="us-west-2", filters={"tag:Name": ["NothingShouldMatchThis"]})) stats3 = collector.collectMetricStatistics( autostack=autostack3, metric=m3) print "STATS3:", stats3 validateStats(stats3) self.assertEqual(len(stats3), 0)
def testModelInferencesLifeCycle(self):
    """Exercise model creation, inference and the metric-data query API.

    Long-running integration test:

    1. PUT a model for every entry of ``self.data`` and confirm that GET
       lists exactly that many models.
    2. Poll ``metric_data`` until the number of rows with ``rowid == 1``
       equals the number of models; exit the process with status 42 after
       20 minutes.
    3. Sleep 15 minutes, then poll until the row at each model's
       ``last_rowid`` has a non-NULL anomaly score; exit the process with
       status 42 after 2 hours.
    4. For every model, compare ``/<uid>/data`` responses (unfiltered, and
       filtered by ``anomaly``, by ``from``/``to``, and by all three) with
       the database.
    5. Print a timing summary.
    """
    timestampFormat = '%Y-%m-%d %H:%M:%S'
    startTime = time()

    for model in sorted(self.data):
        # Create a model; post is forwarded to put
        print("Creating metric for %s : " % model)
        response = self.app.put("/", json.dumps(model), headers=self.headers)
        assertions.assertSuccess(self, response, code=201)

    response = self.app.get("/", headers=self.headers)
    assertions.assertSuccess(self, response)
    getAllModelsResult = utils.jsonDecode(response.body)
    totalMetricCount = len(getAllModelsResult)
    self.assertEqual(totalMetricCount, len(self.data))

    # Get the uids of all the metrics created.
    uids = [metric['uid'] for metric in getAllModelsResult]

    while True:
        with repository.engineFactory().connect() as conn:
            initialModelCount = conn.execute(
                sql.select([sql.func.count()], from_obj=schema.metric_data)
                .where(schema.metric_data.c.rowid == 1)).scalar()

        if initialModelCount == totalMetricCount:
            print("Done creating all the initial models.")
            break

        # Exit the test with some non-zero status if the test has run for
        # more than 20 minutes to just create the initial models.
        # Should not take more than that.
        currentElapsedTime = (time() - startTime) / 60
        print("Current elapsed time %s" % currentElapsedTime)
        if currentElapsedTime > 20:
            print("More than 20 minutes has elapsed. Timing out.")
            sys.exit(42)

        print("%s initial models created." % initialModelCount)
        print("Creating initial models for rest of the %s metrics"
              "..." % (totalMetricCount - initialModelCount))
        sleep(60)

    # Sleep for a long time.
    minutes = 15
    print("Sleeping for %s minutes to let things settled down." % minutes)
    while minutes > 0:
        print("Resume in %s minutes." % minutes)
        minutes -= 1
        sleep(60)

    modelCreationDuration = (time() - startTime) / 60

    with repository.engineFactory().connect() as conn:
        lastRowIds = {uid: repository.getMetric(conn, uid).last_rowid
                      for uid in uids}

    # uids whose last row already carries an anomaly score; a set so that a
    # model seen on several polling rounds is only counted once
    modelInferenceWithNonNullAnomalyScore = set()
    modelIds = lastRowIds.keys()
    while True:
        print(modelInferenceWithNonNullAnomalyScore)
        if len(modelIds) == len(modelInferenceWithNonNullAnomalyScore):
            print("Model inferences created for last_rowids for all the "
                  "models.")
            break

        for uid in modelIds:
            with repository.engineFactory().connect() as conn:
                # "== None" is intentional: SQLAlchemy renders it as IS NULL
                anomalyNullCount = conn.execute(
                    sql.select([sql.func.count()],
                               from_obj=schema.metric_data)
                    .where(schema.metric_data.c.rowid == lastRowIds[uid])
                    .where(schema.metric_data.c.uid == uid)
                    .where(schema.metric_data.c.anomaly_score == None)
                ).scalar()

            print("Model (%s) - Last Row ID (%s) : %s"
                  % (uid, lastRowIds[uid], anomalyNullCount))
            if anomalyNullCount == 0:
                modelInferenceWithNonNullAnomalyScore.add(uid)

        # Exit the test with some non-zero status if the test has run for
        # more than 2 hours
        currentElapsedTime = (time() - startTime) / 60
        print("Current elapsed time %s" % currentElapsedTime)
        if currentElapsedTime > 120:
            print("More than 2 hours has elapsed. Timing out.")
            sys.exit(42)

        print("Going back to sleep for 60s...")
        sleep(60)

    self.assertEqual(anomalyNullCount, 0)
    timeToCalculateAllInferences = time()

    def getMetricDataWithRowID(metricDataList, rowid):
        """Return the entry whose rowid field (index 3) equals ``rowid``.

        Returns None when ``metricDataList`` has no such entry.
        """
        for metricData in metricDataList:
            if metricData[3] == rowid:
                return metricData
        return None

    def testMetricDataForRandomRowID(uid):
        """Compare 5 random rows of ``/<uid>/data`` with the database.

        - Query the MetricDataHandler GET call for the uid
        - Check if response is OK
        - Find the last row id for the uid
        - Five times: pick a random row id, fetch that row through
          ``repository.getMetricData`` and assert that anomaly score and
          timestamp match the API response
        """
        response = self.app.get("/%s/data" % uid, headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)

        with repository.engineFactory().connect() as conn:
            lastRowID = repository.getMetric(conn, uid).last_rowid

        for _ in range(5):
            # randrange excludes the upper bound, so lastRowID itself is
            # never sampled (and lastRowID == 1 raises ValueError)
            randomRowID = randrange(1, lastRowID)
            with repository.engineFactory().connect() as conn:
                singleMetricData = repository.getMetricData(
                    conn, uid, rowid=randomRowID).first()

            metricData = getMetricDataWithRowID(getAllModelsResult['data'],
                                                randomRowID)
            # Fail with a readable message instead of a TypeError on None
            self.assertIsNotNone(
                metricData,
                "rowid=%s of uid=%s missing from API response"
                % (randomRowID, uid))
            self.assertEqual(metricData[2], singleMetricData.anomaly_score)
            self.assertEqual(
                datetime.strptime(metricData[0], timestampFormat),
                singleMetricData.timestamp)

    # Plain loops rather than map(): the calls are made for side effects
    for uid in uids:
        testMetricDataForRandomRowID(uid)

    def testMetricDataAnomalyAsQueryParams(uid):
        """Check ``/<uid>/data?anomaly=<score>`` returns only scores >= it."""
        queryString = ("SELECT * FROM metric_data WHERE uid='%s' "
                       "   and abs(anomaly_score - 0) > 1e-5 LIMIT 1") % uid
        with repository.engineFactory().connect() as conn:
            sampleMetricData = conn.execute(queryString).first()
        anomalyScore = sampleMetricData.anomaly_score

        response = self.app.get("/%s/data?anomaly=%s" % (uid, anomalyScore),
                                headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        for metricData in getAllModelsResult['data']:
            self.assertGreaterEqual(metricData[2], anomalyScore)

    for uid in uids:
        testMetricDataAnomalyAsQueryParams(uid)

    def testMetricDataTimeStampQueryParams(uid):
        """Check ``/<uid>/data?from=<>&to=<>`` stays within the time range.

        The range is the earliest and latest timestamp stored for the uid.
        """
        with repository.engineFactory().connect() as conn:
            firstMetricData = conn.execute(
                sql.select([schema.metric_data])
                .where(schema.metric_data.c.uid == uid)
                .order_by(sql.expression.asc(schema.metric_data.c.timestamp))
                .limit(1)).fetchall()

            lastMetricData = conn.execute(
                sql.select([schema.metric_data])
                .where(schema.metric_data.c.uid == uid)
                .order_by(sql.expression.desc(schema.metric_data.c.timestamp))
                .limit(1)).fetchall()

        firstTimeStamp = firstMetricData[0].timestamp
        lastTimeStamp = lastMetricData[0].timestamp

        response = self.app.get(
            "/%s/data?from=%s&to=%s" % (uid, firstTimeStamp, lastTimeStamp),
            headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        for metricData in getAllModelsResult['data']:
            rowTimestamp = datetime.strptime(metricData[0], timestampFormat)
            self.assertGreaterEqual(rowTimestamp, firstTimeStamp)
            self.assertLessEqual(rowTimestamp, lastTimeStamp)

    for uid in uids:
        testMetricDataTimeStampQueryParams(uid)

    def testMetricDataQueryParams(uid):
        """Check ``/<uid>/data?from=<>&to=<>&anomaly=<>`` honours all three.

        Range and score come from the earliest and latest rows of the uid
        whose anomaly score differs from zero.
        """
        with repository.engineFactory().connect() as conn:
            firstMetricData = conn.execute(
                "SELECT * FROM `metric_data` WHERE `uid`='%s' "
                "and abs(`anomaly_score` - 0) > 1e-5 "
                "ORDER BY `timestamp` ASC LIMIT 1" % uid).fetchall()
            lastMetricData = conn.execute(
                "SELECT * FROM `metric_data` WHERE `uid`='%s' "
                "and abs(`anomaly_score` - 0) > 1e-5 "
                "ORDER BY `timestamp` DESC LIMIT 1" % uid).fetchall()

        firstTimeStamp = firstMetricData[0].timestamp
        lastTimeStamp = lastMetricData[0].timestamp
        anomalyScore = firstMetricData[0].anomaly_score

        response = self.app.get(
            "/%s/data?from=%s&to=%s&anomaly=%s"
            % (uid, firstTimeStamp, lastTimeStamp, anomalyScore),
            headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        for metricData in getAllModelsResult['data']:
            rowTimestamp = datetime.strptime(metricData[0], timestampFormat)
            self.assertGreaterEqual(metricData[2], anomalyScore)
            self.assertGreaterEqual(rowTimestamp, firstTimeStamp)
            self.assertLessEqual(rowTimestamp, lastTimeStamp)

    for uid in uids:
        testMetricDataQueryParams(uid)

    # Wall-clock finish time. This used to hold the elapsed minutes, which
    # made localtime() below report a moment just after the epoch.
    endTime = time()

    print("Test started at        : %s"
          % strftime(timestampFormat, localtime(startTime)))
    print("Test finished at       : %s"
          % strftime(timestampFormat, localtime(endTime)))
    print("Total metric count     : %s" % totalMetricCount)
    print("Initial models created : %s" % initialModelCount)
    print("Approximate time taken to create inital models : %s minutes"
          % modelCreationDuration)
    print("Approximate time taken to calculate all inferences : %s minutes"
          % ((timeToCalculateAllInferences - startTime) / 60))
    print("Approximate time taken for all the tests to finish : %s minutes"
          % ((endTime - startTime) / 60))
def testCollectAndPublishMetrics(self):
    """Run collector and scheduler, then wait for results from every model.

    Starts private metric-collector and model-scheduler subprocesses,
    monitors CPUUtilization for up to ``maxModels`` EC2 instances in
    us-west-2 (also capped by ``Quota.getInstanceQuota()``), activates each
    metric and consumes model-swapper results until one result per model has
    been seen. A 120 second timer deletes the results queue to abort the
    consumer if that never happens.
    """
    # TODO Add more metric types
    # TODO Deeper validation of the published metrics

    # Start our own instance of metric collector and wait for data points
    with self._startModelSchedulerSubprocess() as modelSchedulerSubprocess, \
            self._startMetricCollectorSubprocess() as metricCollectorSubprocess:
        # Create some models for metric collector to harvest
        awsRegion = "us-west-2"
        cloudwatchNamespace = "AWS/EC2"

        engine = repository.engineFactory()
        adapter = createCloudwatchDatasourceAdapter()

        candidates = adapter.describeResources(
            region=awsRegion,
            resourceType=ResourceTypeNames.EC2_INSTANCE)
        self.assertGreater(len(candidates), 0)

        maxModels = 10
        limit = min(maxModels, Quota.getInstanceQuota())
        candidates = candidates[:limit]

        _LOGGER.info("Starting %d models", len(candidates))
        # Still need at least one instance after applying the limit
        self.assertGreater(len(candidates), 0)

        monitoredIds = []
        for instance in candidates:
            metricId = adapter.monitorMetric({
                "datasource": "cloudwatch",
                "metricSpec": {
                    "region": awsRegion,
                    "namespace": cloudwatchNamespace,
                    "metric": "CPUUtilization",
                    "dimensions": {
                        "InstanceId": instance["resID"]
                    }
                }
            })

            with engine.connect() as conn:
                repository.setMetricStatus(conn, metricId,
                                           MetricStatus.ACTIVE)

            monitoredIds.append(metricId)

        _LOGGER.info("Waiting for results from models...")

        pendingTotal = set(monitoredIds)
        received = set()

        # Register a timeout so we won't deadlock the test
        def onTimeout(resultsQueueName):
            _LOGGER.error(
                "Timed out waiting to get results from models; numResults=%d; "
                "expected=%d", len(received), len(pendingTotal))

            # HACK delete model swapper results queue to abort the consumer
            try:
                with MessageBusConnector() as bus:
                    bus.deleteMessageQueue(resultsQueueName)
            except Exception:
                _LOGGER.exception("Failed to delete results mq=%s",
                                  resultsQueueName)
                raise

        with ModelSwapperInterface() as modelSwapper:
            with modelSwapper.consumeResults() as consumer:
                watchdog = threading.Timer(
                    120, onTimeout, args=[modelSwapper._resultsQueueName])
                watchdog.start()
                try:
                    for batch in consumer:
                        received.add(batch.modelID)
                        batch.ack()
                        if received == pendingTotal:
                            break
                    else:
                        # Consumer ended before every model reported
                        self.fail(
                            "Expected %d results, but got only %d: %s"
                            % (len(pendingTotal), len(received), received,))

                    _LOGGER.info("Got %d results from models", len(received))
                finally:
                    watchdog.cancel()

        # Terminate metric_collector subprocess gracefully to avoid too much
        # error logging junk on the terminal
        metricCollectorSubprocess.send_signal(signal.SIGINT)

        # Same for the model scheduler subprocess
        modelSchedulerSubprocess.send_signal(signal.SIGINT)
def GET(self, period):
    """
    Get metrics, sorted by anomalies over specified period (hours)

    Metrics are ordered by the largest display value found on their server
    (descending), then by server, then by metric name.

    :param period: Period (hours) over which to consider anomalies for sort
      order
    :type period: int
    :returns: JSON-encoded list of metrics
    :rtype: str
    :raises: web.HTTPError is logged and re-raised unchanged; any other
      exception is logged and converted to ``web.internalerror``

    Example request::

      GET /_anomalies/period/{period}

    Example response::

      [
        {
          "status": 1,
          "last_rowid": 4033,
          "display_name": "jenkins-master (us-west-2/AWS/EC2/i-12345678)",
          "description": "NetworkIn on EC2 instance i-12345678 in us-west-2",
          "name": "AWS/EC2/NetworkIn",
          "last_timestamp": "2014-04-14 20:29:00",
          "poll_interval": 300,
          "server": "us-west-2/AWS/EC2/i-12345678",
          "tag_name": "jenkins-master",
          "datasource": "cloudwatch",
          "location": "us-west-2",
          "message": null,
          "parameters": {
            "InstanceId": "i-12345678",
            "region": "us-west-2"
          },
          "uid": "0b6b97022fdb4134936aae92aa67393b"
        },
        ...
      ]
    """
    try:
        self.addStandardHeaders()

        # Keep track of the largest model display value for each server
        serverValues = defaultdict(float)
        modelsList = []

        engine = repository.engineFactory()
        with engine.connect() as conn:
            modelIterator = repository.getAllMetrics(
                conn, fields=getMetricDisplayFields(conn))
            displayValuesMap = repository.getMetricIdsSortedByDisplayValue(
                conn, period)

            # Consume the rows while the connection is still open
            for model in modelIterator:
                val = displayValuesMap.get(model.uid)
                if val is not None:
                    serverValues[model.server] = max(
                        float(val), serverValues[model.server])
                modelsList.append(convertMetricRowToMetricDict(model))

        def getModelRankByServer(model):
            # Highest server value first, then server, then metric name
            return (-serverValues[model["server"]],
                    model["server"],
                    model["name"])

        modelsList.sort(key=getModelRankByServer)

        return utils.jsonEncode(modelsList)

    except (web.HTTPError) as ex:
        log.info(str(ex) or repr(ex))
        # Bare raise keeps the original traceback
        raise

    except Exception as ex:
        log.exception("GET Failed")
        raise web.internalerror(str(ex) or repr(ex))
def messageHandler(self, message):
    """ Inspect all inbound model results in a batch for anomaly thresholds
    and trigger notifications where applicable.

    Messages whose headers contain "dataType" are ignored (and not
    acknowledged here). Every other message is acknowledged once processing
    finishes, whether or not it raised.

    :param amqp.messages.ConsumerMessage message: ``message.body`` is a
      serialized batch of model inference results generated in
      ``AnomalyService`` and must be deserialized using
      ``AnomalyService.deserializeModelResult()``. The message conforms to
      htmengine/runtime/json_schema/model_inference_results_msg_schema.json
    """
    if message.properties.headers and "dataType" in message.properties.headers:
        # Not a model inference result
        return

    grok.app.config.loadConfig()  # reload config on every batch
    engine = repository.engineFactory()

    try:
        try:
            batch = AnomalyService.deserializeModelResult(message.body)
        except Exception:
            self._log.exception("Error deserializing model result")
            raise

        # Load all settings for all users (once per incoming batch)
        with engine.connect() as conn:
            settings = repository.retryOnTransientErrors(
                repository.getAllNotificationSettings)(conn)

        self._log.debug("settings: %r", settings)

        # Cache minimum threshold to trigger any notification to avoid
        # permuting settings x metricDataRows
        if settings:
            minThreshold = min(setting.sensitivity for setting in settings)
        else:
            minThreshold = 0.99999

        metricInfo = batch["metric"]
        metricId = metricInfo["uid"]
        resource = metricInfo["resource"]

        for row in batch["results"]:
            if row["anomaly"] < minThreshold:
                continue

            # Computed once per row, before any branch that needs it. It used
            # to be assigned only inside the per-setting loop, so the
            # "no devices" log below raised NameError when there were no
            # settings.
            rowDatetime = datetime.utcfromtimestamp(row["ts"])

            if not settings:
                # There are no device notification settings stored on this
                # server, no notifications will be generated. However, log
                # that an anomaly was detected and notification would be sent
                # if there were any configured devices
                self._log.info(
                    "<%r>{TAG:APP.NOTIFICATION} Anomaly detected at %s, but "
                    "no devices are configured.", metricInfo, rowDatetime)
                continue

            # These checks depend only on the row, not on the setting
            if row["rowid"] <= 1000:
                continue  # Not enough data

            if rowDatetime < datetime.utcnow() - timedelta(seconds=3600):
                continue  # Skip old

            for settingObj in settings:
                if row["anomaly"] < settingObj.sensitivity:
                    continue

                # First let's clear any old users out of the database.
                with engine.connect() as conn:
                    repository.retryOnTransientErrors(
                        repository.deleteStaleNotificationDevices)(
                            conn, _NOTIFICATION_DEVICE_STALE_DAYS)

                # If anomaly_score meets or exceeds any of the device
                # notification sensitivity settings, trigger notification.
                # repository.addNotification() will handle throttling.
                notificationId = str(uuid.uuid4())

                with engine.connect() as conn:
                    result = repository.retryOnTransientErrors(
                        repository.addNotification)(
                            conn,
                            uid=notificationId,
                            server=resource,
                            metric=metricId,
                            rowid=row["rowid"],
                            device=settingObj.uid,
                            windowsize=settingObj.windowsize,
                            timestamp=rowDatetime,
                            acknowledged=0,
                            seen=0)

                self._log.info(
                    "NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                    "Notification generated. ",
                    notificationId, resource, metricId, settingObj.uid)

                if (result is not None and result.rowcount > 0 and
                        settingObj.email_addr):
                    # Notification was generated. Attempt to send email
                    with engine.connect() as conn:
                        notificationObj = repository.getNotification(
                            conn, notificationId)

                    self.sendNotificationEmail(engine,
                                               settingObj,
                                               notificationObj)
    finally:
        message.ack()

    # Do cleanup: purge old notifications
    with engine.connect() as conn:
        repository.clearOldNotifications(conn)
def testModelInferencesLifeCycle(self):
    """Exercise model creation, inference and the metric-data query API.

    Long-running integration test:

    1. PUT a model for every entry of ``self.data`` and confirm that GET
       lists exactly that many models.
    2. Poll ``metric_data`` until the number of rows with ``rowid == 1``
       equals the number of models; exit the process with status 42 after
       20 minutes.
    3. Sleep 15 minutes, then poll until the row at each model's
       ``last_rowid`` has a non-NULL anomaly score; exit the process with
       status 42 after 2 hours.
    4. For every model, compare ``/<uid>/data`` responses (unfiltered, and
       filtered by ``anomaly``, by ``from``/``to``, and by all three) with
       the database.
    5. Print a timing summary.
    """
    timestampFormat = '%Y-%m-%d %H:%M:%S'
    startTime = time()

    for model in sorted(self.data):
        # Create a model; post is forwarded to put
        print("Creating metric for %s : " % model)
        response = self.app.put("/", json.dumps(model), headers=self.headers)
        assertions.assertSuccess(self, response, code=201)

    response = self.app.get("/", headers=self.headers)
    assertions.assertSuccess(self, response)
    getAllModelsResult = utils.jsonDecode(response.body)
    totalMetricCount = len(getAllModelsResult)
    self.assertEqual(totalMetricCount, len(self.data))

    # Get the uids of all the metrics created.
    uids = [metric['uid'] for metric in getAllModelsResult]

    while True:
        with repository.engineFactory().connect() as conn:
            initialModelCount = conn.execute(
                sql.select([sql.func.count()], from_obj=schema.metric_data)
                .where(schema.metric_data.c.rowid == 1)).scalar()

        if initialModelCount == totalMetricCount:
            print("Done creating all the initial models.")
            break

        # Exit the test with some non-zero status if the test has run for
        # more than 20 minutes to just create the initial models.
        # Should not take more than that.
        currentElapsedTime = (time() - startTime) / 60
        print("Current elapsed time %s" % currentElapsedTime)
        if currentElapsedTime > 20:
            print("More than 20 minutes has elapsed. Timing out.")
            sys.exit(42)

        print("%s initial models created." % initialModelCount)
        print("Creating initial models for rest of the %s metrics"
              "..." % (totalMetricCount - initialModelCount))
        sleep(60)

    # Sleep for a long time.
    minutes = 15
    print("Sleeping for %s minutes to let things settled down." % minutes)
    while minutes > 0:
        print("Resume in %s minutes." % minutes)
        minutes -= 1
        sleep(60)

    modelCreationDuration = (time() - startTime) / 60

    with repository.engineFactory().connect() as conn:
        lastRowIds = {uid: repository.getMetric(conn, uid).last_rowid
                      for uid in uids}

    # uids whose last row already carries an anomaly score; a set so that a
    # model seen on several polling rounds is only counted once
    modelInferenceWithNonNullAnomalyScore = set()
    modelIds = lastRowIds.keys()
    while True:
        print(modelInferenceWithNonNullAnomalyScore)
        if len(modelIds) == len(modelInferenceWithNonNullAnomalyScore):
            print("Model inferences created for last_rowids for all the "
                  "models.")
            break

        for uid in modelIds:
            with repository.engineFactory().connect() as conn:
                # "== None" is intentional: SQLAlchemy renders it as IS NULL
                anomalyNullCount = conn.execute(
                    sql.select([sql.func.count()],
                               from_obj=schema.metric_data)
                    .where(schema.metric_data.c.rowid == lastRowIds[uid])
                    .where(schema.metric_data.c.uid == uid)
                    .where(schema.metric_data.c.anomaly_score == None)
                ).scalar()

            print("Model (%s) - Last Row ID (%s) : %s"
                  % (uid, lastRowIds[uid], anomalyNullCount))
            if anomalyNullCount == 0:
                modelInferenceWithNonNullAnomalyScore.add(uid)

        # Exit the test with some non-zero status if the test has run for
        # more than 2 hours
        currentElapsedTime = (time() - startTime) / 60
        print("Current elapsed time %s" % currentElapsedTime)
        if currentElapsedTime > 120:
            print("More than 2 hours has elapsed. Timing out.")
            sys.exit(42)

        print("Going back to sleep for 60s...")
        sleep(60)

    self.assertEqual(anomalyNullCount, 0)
    timeToCalculateAllInferences = time()

    def getMetricDataWithRowID(metricDataList, rowid):
        """Return the entry whose rowid field (index 3) equals ``rowid``.

        Returns None when ``metricDataList`` has no such entry.
        """
        for metricData in metricDataList:
            if metricData[3] == rowid:
                return metricData
        return None

    def testMetricDataForRandomRowID(uid):
        """Compare 5 random rows of ``/<uid>/data`` with the database.

        - Query the MetricDataHandler GET call for the uid
        - Check if response is OK
        - Find the last row id for the uid
        - Five times: pick a random row id, fetch that row through
          ``repository.getMetricData`` and assert that anomaly score and
          timestamp match the API response
        """
        response = self.app.get("/%s/data" % uid, headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)

        with repository.engineFactory().connect() as conn:
            lastRowID = repository.getMetric(conn, uid).last_rowid

        for _ in range(5):
            # randrange excludes the upper bound, so lastRowID itself is
            # never sampled (and lastRowID == 1 raises ValueError)
            randomRowID = randrange(1, lastRowID)
            with repository.engineFactory().connect() as conn:
                singleMetricData = repository.getMetricData(
                    conn, uid, rowid=randomRowID).first()

            metricData = getMetricDataWithRowID(getAllModelsResult['data'],
                                                randomRowID)
            # Fail with a readable message instead of a TypeError on None
            self.assertIsNotNone(
                metricData,
                "rowid=%s of uid=%s missing from API response"
                % (randomRowID, uid))
            self.assertEqual(metricData[2], singleMetricData.anomaly_score)
            self.assertEqual(
                datetime.strptime(metricData[0], timestampFormat),
                singleMetricData.timestamp)

    # Plain loops rather than map(): the calls are made for side effects
    for uid in uids:
        testMetricDataForRandomRowID(uid)

    def testMetricDataAnomalyAsQueryParams(uid):
        """Check ``/<uid>/data?anomaly=<score>`` returns only scores >= it."""
        queryString = ("SELECT * FROM metric_data WHERE uid='%s' "
                       "   and abs(anomaly_score - 0) > 1e-5 LIMIT 1") % uid
        with repository.engineFactory().connect() as conn:
            sampleMetricData = conn.execute(queryString).first()
        anomalyScore = sampleMetricData.anomaly_score

        response = self.app.get("/%s/data?anomaly=%s" % (uid, anomalyScore),
                                headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        for metricData in getAllModelsResult['data']:
            self.assertGreaterEqual(metricData[2], anomalyScore)

    for uid in uids:
        testMetricDataAnomalyAsQueryParams(uid)

    def testMetricDataTimeStampQueryParams(uid):
        """Check ``/<uid>/data?from=<>&to=<>`` stays within the time range.

        The range is the earliest and latest timestamp stored for the uid.
        """
        with repository.engineFactory().connect() as conn:
            firstMetricData = conn.execute(
                sql.select([schema.metric_data])
                .where(schema.metric_data.c.uid == uid)
                .order_by(sql.expression.asc(schema.metric_data.c.timestamp))
                .limit(1)).fetchall()

            lastMetricData = conn.execute(
                sql.select([schema.metric_data])
                .where(schema.metric_data.c.uid == uid)
                .order_by(sql.expression.desc(schema.metric_data.c.timestamp))
                .limit(1)).fetchall()

        firstTimeStamp = firstMetricData[0].timestamp
        lastTimeStamp = lastMetricData[0].timestamp

        response = self.app.get(
            "/%s/data?from=%s&to=%s" % (uid, firstTimeStamp, lastTimeStamp),
            headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        for metricData in getAllModelsResult['data']:
            rowTimestamp = datetime.strptime(metricData[0], timestampFormat)
            self.assertGreaterEqual(rowTimestamp, firstTimeStamp)
            self.assertLessEqual(rowTimestamp, lastTimeStamp)

    for uid in uids:
        testMetricDataTimeStampQueryParams(uid)

    def testMetricDataQueryParams(uid):
        """Check ``/<uid>/data?from=<>&to=<>&anomaly=<>`` honours all three.

        Range and score come from the earliest and latest rows of the uid
        whose anomaly score differs from zero.
        """
        with repository.engineFactory().connect() as conn:
            firstMetricData = conn.execute(
                "SELECT * FROM `metric_data` WHERE `uid`='%s' "
                "and abs(`anomaly_score` - 0) > 1e-5 "
                "ORDER BY `timestamp` ASC LIMIT 1" % uid).fetchall()
            lastMetricData = conn.execute(
                "SELECT * FROM `metric_data` WHERE `uid`='%s' "
                "and abs(`anomaly_score` - 0) > 1e-5 "
                "ORDER BY `timestamp` DESC LIMIT 1" % uid).fetchall()

        firstTimeStamp = firstMetricData[0].timestamp
        lastTimeStamp = lastMetricData[0].timestamp
        anomalyScore = firstMetricData[0].anomaly_score

        response = self.app.get(
            "/%s/data?from=%s&to=%s&anomaly=%s"
            % (uid, firstTimeStamp, lastTimeStamp, anomalyScore),
            headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        for metricData in getAllModelsResult['data']:
            rowTimestamp = datetime.strptime(metricData[0], timestampFormat)
            self.assertGreaterEqual(metricData[2], anomalyScore)
            self.assertGreaterEqual(rowTimestamp, firstTimeStamp)
            self.assertLessEqual(rowTimestamp, lastTimeStamp)

    for uid in uids:
        testMetricDataQueryParams(uid)

    # Wall-clock finish time. This used to hold the elapsed minutes, which
    # made localtime() below report a moment just after the epoch.
    endTime = time()

    print("Test started at        : %s"
          % strftime(timestampFormat, localtime(startTime)))
    print("Test finished at       : %s"
          % strftime(timestampFormat, localtime(endTime)))
    print("Total metric count     : %s" % totalMetricCount)
    print("Initial models created : %s" % initialModelCount)
    print("Approximate time taken to create inital models : %s minutes"
          % modelCreationDuration)
    print("Approximate time taken to calculate all inferences : %s minutes"
          % ((timeToCalculateAllInferences - startTime) / 60))
    print("Approximate time taken for all the tests to finish : %s minutes"
          % ((endTime - startTime) / 60))
def getStatistics(metric):
    """Get aggregate statistics for an Autostack metric.

    The per-instance "min" and "max" values returned by
    ``EC2InstanceMetricGetter.collectMetricStatistics`` are averaged, skipping
    instances whose values are not numbers or are NaN. When no usable
    instance statistics remain, the values come from
    ``repository.getMetricStats`` instead. The resulting range is then widened
    by 20% of its width on each side.

    :param metric: the Autostack metric to get statistics for; must expose
      ``datasource`` and ``uid``
    :type metric: TODO
    :returns: metric statistics
    :rtype: dict {"min": minVal, "max": maxVal}
    :raises: ValueError if the metric doesn't belong to an Autostack
    :raises: grok.app.exceptions.ObjectNotFoundError if the metric or the
      corresponding autostack doesn't exist; this may happen if it got deleted
      by another process in the meantime.
    :raises: grok.app.exceptions.MetricStatisticsNotReadyError if there are no
      or insufficient samples at this time; this may also happen if the metric
      and its data were deleted by another process in the meantime
    """
    # Validate the argument before acquiring any database resources
    if metric.datasource != "autostack":
        raise ValueError(
            "Metric must belong to an Autostack but has datasource=%r"
            % (metric.datasource,))

    engine = repository.engineFactory()

    metricGetter = EC2InstanceMetricGetter()
    try:
        with engine.connect() as conn:
            autostack = repository.getAutostackFromMetric(conn, metric.uid)
        instanceMetricList = metricGetter.collectMetricStatistics(autostack,
                                                                  metric)
    finally:
        metricGetter.close()

    n = 0
    mins = 0.0
    maxs = 0.0
    for instanceMetric in instanceMetricList:
        assert len(instanceMetric.records) == 1
        stats = instanceMetric.records[0].value

        if (not isinstance(stats["min"], numbers.Number) or
                math.isnan(stats["min"]) or
                not isinstance(stats["max"], numbers.Number) or
                math.isnan(stats["max"])):
            # Cloudwatch gave us bogus data for this metric so we will
            # exclude it
            continue

        mins += stats["min"]
        maxs += stats["max"]
        n += 1

    if n == 0:
        # Fall back to metric_data when we don't get anything from AWS. This
        # may raise a MetricStatisticsNotReadyError if there is no or not
        # enough data.
        with engine.connect() as conn:
            dbStats = repository.getMetricStats(conn, metric.uid)
        minVal = dbStats["min"]
        maxVal = dbStats["max"]
    else:
        minVal = mins / n
        maxVal = maxs / n

    # Now add the 20% buffer on the range
    buff = (maxVal - minVal) * 0.2
    minVal -= buff
    maxVal += buff

    return {"min": minVal,
            "max": maxVal}
def run(self): """ Collect metric data and status for active metrics """ # NOTE: the process pool must be created BEFORE this main (parent) process # creates any global or class-level shared resources (e.g., boto # connection) that would have undersirable consequences when # replicated into and used by forked child processes (e.g., the same MySQL # connection socket file descriptor used by multiple processes). And we # can't take advantage of the process Pool's maxtasksperchild feature # either (for the same reason) self._log.info("Starting grok Metric Collector") resultsQueue = multiprocessing.Manager().JoinableQueue() recvPipe, sendPipe = multiprocessing.Pipe(False) processPool = multiprocessing.Pool( processes=self._WORKER_PROCESS_POOL_SIZE, maxtasksperchild=None) try: with ModelSwapperInterface() as modelSwapper: engine = repository.engineFactory() while True: startTime = time.time() if startTime > self._nextCacheGarbageCollectionTime: # TODO: unit-test self._garbageCollectInfoCache() # Determine which metrics are due for an update metricsToUpdate = self._getCandidateMetrics(engine) filterDuration = time.time() - startTime if not metricsToUpdate: time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC) continue # Collect metric data collectionStartTime = time.time() poolResults = self._collectDataForMetrics(metricsToUpdate, processPool, resultsQueue) # Process/dispatch results in parallel in another thread as results # become available in resultsQueue dispatchThread = ( threading.Thread(target=self._processAndDispatchThreadTarget, args=(engine, metricsToUpdate, resultsQueue, modelSwapper, sendPipe))) dispatchStartTime = time.time() dispatchThread.start() # Syncronize with processPool poolResults.wait() # Wait for collection tasks to complete metricPollDuration = time.time() - collectionStartTime resultsQueue.join() # Block until all tasks completed... 
# Syncronize with dispatchThread resultsQueue.put(self._SENTINEL) # Signal to dispatchThread that # there are no more results to # process. resultsQueue.join() numEmpty, numErrors = recvPipe.recv() # Get dispatchThread stats dispatchDuration = time.time() - dispatchStartTime self._log.info( "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; " "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)", len(metricsToUpdate), numEmpty, numErrors, time.time() - startTime, filterDuration, metricPollDuration, dispatchDuration) finally: self._log.info("Exiting Metric Collector run-loop") processPool.terminate() processPool.join()
def run(self): """ Collect metric data and status for active metrics """ # NOTE: the process pool must be created BEFORE this main (parent) process # creates any global or class-level shared resources (e.g., boto # connection) that would have undersirable consequences when # replicated into and used by forked child processes (e.g., the same MySQL # connection socket file descriptor used by multiple processes). And we # can't take advantage of the process Pool's maxtasksperchild feature # either (for the same reason) self._log.info("Starting grok Metric Collector") resultsQueue = multiprocessing.Manager().JoinableQueue() recvPipe, sendPipe = multiprocessing.Pipe(False) processPool = multiprocessing.Pool(processes=self._WORKER_PROCESS_POOL_SIZE, maxtasksperchild=None) try: with ModelSwapperInterface() as modelSwapper: engine = repository.engineFactory() while True: startTime = time.time() if startTime > self._nextCacheGarbageCollectionTime: # TODO: unit-test self._garbageCollectInfoCache() # Determine which metrics are due for an update metricsToUpdate = self._getCandidateMetrics(engine) filterDuration = time.time() - startTime if not metricsToUpdate: time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC) continue # Collect metric data collectionStartTime = time.time() poolResults = self._collectDataForMetrics(metricsToUpdate, processPool, resultsQueue) # Process/dispatch results in parallel in another thread as results # become available in resultsQueue dispatchThread = threading.Thread( target=self._processAndDispatchThreadTarget, args=(engine, metricsToUpdate, resultsQueue, modelSwapper, sendPipe), ) dispatchStartTime = time.time() dispatchThread.start() # Syncronize with processPool poolResults.wait() # Wait for collection tasks to complete metricPollDuration = time.time() - collectionStartTime resultsQueue.join() # Block until all tasks completed... 
# Syncronize with dispatchThread resultsQueue.put(self._SENTINEL) # Signal to dispatchThread that # there are no more results to # process. resultsQueue.join() numEmpty, numErrors = recvPipe.recv() # Get dispatchThread stats dispatchDuration = time.time() - dispatchStartTime self._log.info( "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; " "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)", len(metricsToUpdate), numEmpty, numErrors, time.time() - startTime, filterDuration, metricPollDuration, dispatchDuration, ) finally: self._log.info("Exiting Metric Collector run-loop") processPool.terminate() processPool.join()
def _connect():
    """ Expose the repository engine's ``connect`` callable to web handlers.

    Stores the bound ``connect`` method of the sqlalchemy engine on
    ``web.ctx.connFactory``. No connection is opened here; a handler obtains
    one by calling the factory.
    """
    sqlEngine = repository.engineFactory()
    web.ctx.connFactory = sqlEngine.connect
def getStatistics(metric):
    """Compute the padded min/max value range of an Autostack metric.

    The per-instance min/max statistics reported by
    ``EC2InstanceMetricGetter.collectMetricStatistics`` are averaged across
    instances, skipping any instance whose min or max is not a number or is
    NaN. When no instance yields usable statistics, the values stored in the
    database (``repository.getMetricStats``) are used instead. The resulting
    range is then widened by 20% of its span on each side.

    :param metric: the Autostack metric to get statistics for; its
      ``datasource`` and ``uid`` attributes are read
    :returns: metric statistics
    :rtype: dict {"min": minVal, "max": maxVal}
    :raises: ValueError if the metric does not belong to an Autostack
    :raises: grok.app.exceptions.ObjectNotFoundError if the metric or the
      corresponding autostack doesn't exist; this may happen if it got deleted
      by another process in the meantime.
    :raises: grok.app.exceptions.MetricStatisticsNotReadyError if there are no
      or insufficient samples at this time; this may also happen if the metric
      and its data were deleted by another process in the meantime
    """
    engine = repository.engineFactory()

    if metric.datasource != "autostack":
        raise ValueError(
            "Metric must belong to an Autostack but has datasource=%r"
            % metric.datasource)

    def isUsable(value):
        # A usable statistic is a real number that is not NaN
        return isinstance(value, numbers.Number) and not math.isnan(value)

    statsGetter = EC2InstanceMetricGetter()
    try:
        with engine.connect() as conn:
            autostack = repository.getAutostackFromMetric(conn, metric.uid)
        instanceMetricList = statsGetter.collectMetricStatistics(autostack,
                                                                 metric)
    finally:
        statsGetter.close()

    lows = []
    highs = []
    for instanceMetric in instanceMetricList:
        assert len(instanceMetric.records) == 1
        stats = instanceMetric.records[0].value

        if not (isUsable(stats["min"]) and isUsable(stats["max"])):
            # Cloudwatch gave us bogus data for this metric so we will
            # exclude it
            continue

        lows.append(stats["min"])
        highs.append(stats["max"])

    sampleCount = len(lows)
    if sampleCount:
        # Float start value keeps the accumulation (and the division below)
        # in floating point
        minVal = sum(lows, 0.0) / sampleCount
        maxVal = sum(highs, 0.0) / sampleCount
    else:
        # Fall back to metric_data when we don't get anything from AWS. This
        # may raise a MetricStatisticsNotReadyError if there is no or not
        # enough data.
        with engine.connect() as conn:
            dbStats = repository.getMetricStats(conn, metric.uid)
        minVal = dbStats["min"]
        maxVal = dbStats["max"]

    # Widen the range by 20% of its span on both ends
    margin = (maxVal - minVal) * 0.2
    return {"min": minVal - margin, "max": maxVal + margin}
def GET(self, period):
    """
    Get metrics, sorted by anomalies over specified period (hours)

    Metrics are grouped by server. Servers are ordered by the largest display
    value among their metrics (highest first), then by server name; metrics of
    the same server are ordered by metric name.

    :param period: Period (hours) over which to consider anomalies for sort
      order
    :type period: int
    :returns: List of metrics
    :rtype: list

    Example request::

      GET /_anomalies/period/{period}

    Example response::

      [
        {
          "status": 1,
          "last_rowid": 4033,
          "display_name": "jenkins-master (us-west-2/AWS/EC2/i-12345678)",
          "description": "NetworkIn on EC2 instance i-12345678 in us-west-2",
          "name": "AWS/EC2/NetworkIn",
          "last_timestamp": "2014-04-14 20:29:00",
          "poll_interval": 300,
          "server": "us-west-2/AWS/EC2/i-12345678",
          "tag_name": "jenkins-master",
          "datasource": "cloudwatch",
          "location": "us-west-2",
          "message": null,
          "parameters": {
            "InstanceId": "i-12345678",
            "region": "us-west-2"
          },
          "uid": "0b6b97022fdb4134936aae92aa67393b"
        },
        ...
      ]
    """
    try:
        self.addStandardHeaders()
        dbEngine = repository.engineFactory()

        # Largest display value seen per server; a server with no display
        # value at all implicitly ranks at 0.0
        topValueByServer = defaultdict(float)
        metricDicts = []

        with dbEngine.connect() as conn:
            metricRows = repository.getAllMetrics(
                conn, fields=getMetricDisplayFields(conn))
            displayValueByUid = repository.getMetricIdsSortedByDisplayValue(
                conn, period)

            for row in metricRows:
                displayValue = displayValueByUid.get(row.uid)
                if displayValue is not None:
                    topValueByServer[row.server] = max(
                        float(displayValue), topValueByServer[row.server])
                metricDicts.append(convertMetricRowToMetricDict(row))

        def rankKey(metricDict):
            # Highest server value first, then server name, then metric name
            server = metricDict["server"]
            return (-topValueByServer[server], server, metricDict["name"])

        metricDicts.sort(key=rankKey)

        return utils.jsonEncode(metricDicts)

    except (web.HTTPError) as ex:
        log.info(str(ex) or repr(ex))
        raise ex

    except Exception as ex:
        log.exception("GET Failed")
        raise web.internalerror(str(ex) or repr(ex))
def setUpClass(cls):
    """
    Setup steps for all test cases.
    Focus for these is to cover all API checks for ModelDataHandler.
    Hence, this does all setup creating metric, waiting for
    metricData across all testcases, all API call for querying metricData
    will be against single metric created in setup

    Setup Process
    1) Update conf with aws credentials, ManagedTempRepository will not
       work in this test
    2) Select test instance such that its running from longer time,
       We are using instance older than 15 days
    3) Create Metric, wait for min metricData rows to become available
       Set to 100, configurable
    4) Pick testRowId, set it lower value this will make sure to have
       Non NULL value for anomaly_score field for given row while invoking
       GET with conditions, set to 5
    5) Decide queryParams for anomalyScore, to and from timestamp

    Both waits are bounded by 600 seconds; when a wait times out, setup
    continues with whatever data is available at that point.

    :raises: AssertionError if the row with rowid ``cls.testRowId`` is still
      missing after the wait for metric data
    """
    cls.headers = getDefaultHTTPHeaders(grok.app.config)

    # All other services need AWS credentials to work
    # Set AWS credentials
    grok.app.config.loadConfig()

    # Select test instance such that its running from longer time
    g_logger.info("Getting long-running EC2 Instances")
    instances = aws_utils.getLongRunningEC2Instances(
        "us-west-2",
        grok.app.config.get("aws", "aws_access_key_id"),
        grok.app.config.get("aws", "aws_secret_access_key"),
        15)
    # randrange(len(...)) can select every instance, including the first one,
    # and also works when exactly one instance was returned
    testInstance = instances[randrange(len(instances))]

    createModelData = {
        "region": "us-west-2",
        "namespace": "AWS/EC2",
        "datasource": "cloudwatch",
        "metric": "CPUUtilization",
        "dimensions": {
            "InstanceId": testInstance.id
        }
    }

    # Number of minimum rows
    cls.minDataRows = 100

    cls.app = TestApp(models_api.app.wsgifunc())

    # create test metric
    g_logger.info("Creating test metric; modelSpec=%s", createModelData)
    response = cls.app.put("/", utils.jsonEncode(createModelData),
                           headers=cls.headers)
    postResult = utils.jsonDecode(response.body)
    maxWaitTime = 600
    waitTimeMetricData = 0
    waitAnomalyScore = 0

    cls.uid = postResult[0]["uid"]
    engine = repository.engineFactory()

    def fetchMetricData():
        # All metric data rows currently stored for the test metric
        with engine.connect() as conn:
            return list(repository.getMetricData(conn, cls.uid))

    def fetchTestRow():
        # The single metric data row with rowid == cls.testRowId, or None
        with engine.connect() as conn:
            return (repository.getMetricData(conn,
                                             cls.uid,
                                             rowid=cls.testRowId)
                    .fetchone())

    # Wait for enough metric data to be available
    cls.metricData = fetchMetricData()
    with engine.connect() as conn:
        cls.testMetric = repository.getMetric(conn, cls.uid)

    # Confirm that we have enough metricData
    g_logger.info("Waiting for metric data")
    while (len(cls.metricData) < cls.minDataRows and
           waitTimeMetricData < maxWaitTime):
        g_logger.info("not ready, waiting for metric data: got %d of %d ...",
                      len(cls.metricData), cls.minDataRows)
        time.sleep(5)
        waitTimeMetricData += 5
        cls.metricData = fetchMetricData()

    # taking lower value for testRowId, this will make sure to have
    # Non NULL value for anomaly_score field for given row
    cls.testRowId = 5
    cls.testMetricRow = fetchTestRow()
    if cls.testMetricRow is None:
        # Fail with a readable message instead of an AttributeError below
        raise AssertionError(
            "Metric data row rowid=%r not available for metric=%s after "
            "waiting %d seconds" % (cls.testRowId, cls.uid, waitTimeMetricData))

    # Make sure we did not receive None etc for anomaly score
    g_logger.info("cls.testMetricRow.anomaly_score=%r",
                  cls.testMetricRow.anomaly_score)
    g_logger.info("waitAnomalyScore=%r", waitAnomalyScore)
    while (cls.testMetricRow.anomaly_score is None and
           waitAnomalyScore < maxWaitTime):
        g_logger.info("anomaly_score not ready, sleeping...")
        time.sleep(5)
        waitAnomalyScore += 5
        cls.testMetricRow = fetchTestRow()

    # Decide queryParams for anomalyScore, to and from timestamp
    cls.testAnomalyScore = cls.testMetricRow.anomaly_score
    cls.testTimeStamp = cls.testMetricRow.timestamp
def testCollectMetricData(self):
    """Collect data for four autostack/metric requests and validate the result.

    Three autostacks are created: one in us-east-1 matching any Name tag (with
    two metrics, the second being Autostacks/InstanceCount), one in us-west-2
    with a wildcard Name filter, and one in us-west-2 whose filter is not
    expected to match any instance. For every returned collection the test
    checks that ``nextMetricTime`` equals the end of the requested time range;
    for the first three it also checks that each slice is sorted by timestamp
    with unique ``datetime`` timestamps.
    """
    self.engine = repository.engineFactory(reset=True)

    with self.engine.connect() as conn:
        autostack1 = self._addAutostack(name="testCollectMetricData1",
                                        region="us-east-1",
                                        filters='{"tag:Name": ["*"]}')
        m1a = self._addAutostackMetric(conn, autostack1)
        m1b = self._addAutostackMetric(conn, autostack1,
                                       name="Autostacks/InstanceCount")

        autostack2 = self._addAutostack(name="testCollectMetricData2",
                                        region="us-west-2",
                                        filters='{"tag:Name": ["*?*"]}')
        m2 = self._addAutostackMetric(conn, autostack2)

        autostack3 = self._addAutostack(
            name="testCollectMetricData3",
            region="us-west-2",
            filters='{"tag:Name": ["NothingShouldMatchThis"]}')
        m3 = self._addAutostackMetric(conn, autostack3)

    # Collect data for all autostack/metric combinations
    collector = EC2InstanceMetricGetter()
    self.addCleanup(collector.close)

    requests = [
        AutostackMetricRequest(refID=1, autostack=autostack1, metric=m1a),
        AutostackMetricRequest(refID=2, autostack=autostack1, metric=m1b),
        AutostackMetricRequest(refID=3, autostack=autostack2, metric=m2),
        AutostackMetricRequest(refID=4, autostack=autostack3, metric=m3)
    ]

    metricCollections = dict(
        (collection.refID, collection)
        for collection in collector.collectMetricData(requests=requests))

    self.assertEqual(len(metricCollections), len(requests))

    def checkSliceSorted(records):
        # Records of a slice must already be ordered by timestamp
        sortedRecords = sorted(records, key=lambda record: record.timestamp)
        self.assertSequenceEqual(records, sortedRecords)

    def checkSliceUniqueTimestamps(records):
        # Every timestamp is a datetime and occurs once within the slice
        timestamps = tuple(record.timestamp for record in records)
        for timestamp in timestamps:
            self.assertIsInstance(timestamp, datetime)
        self.assertItemsEqual(set(timestamps), timestamps)

    def groupRecordsByTimestamp(collection):
        # Validate every slice of the collection and return a mapping of
        # timestamp -> [(instanceID, value), ...] across all of its slices
        groups = defaultdict(list)
        for metricSlice in collection.slices:
            checkSliceSorted(metricSlice.records)
            checkSliceUniqueTimestamps(metricSlice.records)
            for record in metricSlice.records:
                groups[record.timestamp].append(
                    (metricSlice.instanceID, record.value))
        return groups

    collection1 = metricCollections[1]
    collection2 = metricCollections[2]
    collection3 = metricCollections[3]
    collection4 = metricCollections[4]

    # COLLECTION-1 and COLLECTION-2: at least one data point must be present
    for collection in (collection1, collection2):
        self.assertEqual(collection.nextMetricTime, collection.timeRange.end)

        metricGroups = groupRecordsByTimestamp(collection)
        self.assertTrue(
            any(len(values) > 0 for values in metricGroups.values()))

    # COLLECTION-3: at least one timestamp must be shared by several instances
    self.assertEqual(collection3.nextMetricTime, collection3.timeRange.end)

    metricGroups = groupRecordsByTimestamp(collection3)
    self.assertTrue(any(len(values) > 1 for values in metricGroups.values()))

    # Make sure no instance reported the same timestamp more than once
    for values in metricGroups.values():
        instances = [instanceID for instanceID, _value in values]
        self.assertItemsEqual(instances, set(instances))

    # COLLECTION-4 (there should be no matching instances for it):
    self.assertEqual(len(collection4.slices), 0)
    self.assertEqual(collection4.nextMetricTime, collection4.timeRange.end)
def messageHandler(self, message):
    """ Inspect all inbound model results in a batch for anomaly thresholds and
    trigger notifications where applicable.

    Messages whose headers contain a "dataType" entry are acknowledged and
    ignored. For any other message the batch is deserialized, all notification
    settings are loaded once, and each result row whose anomaly value reaches
    a setting's sensitivity produces a notification for that setting's device
    (plus an email attempt when the notification row was actually inserted and
    the setting has an email address). Rows with ``rowid <= 1000`` or older
    than one hour are skipped. The message is acknowledged whether or not
    processing succeeded.

    :param amqp.messages.ConsumerMessage message: ``message.body`` is a
      serialized batch of model inference results generated in
      ``AnomalyService`` and must be deserialized using
      ``AnomalyService.deserializeModelResult()``. The message conforms to
      htmengine/runtime/json_schema/model_inference_results_msg_schema.json
    """
    if message.properties.headers and "dataType" in message.properties.headers:
        # Not a model inference result
        message.ack()
        return

    grok.app.config.loadConfig()  # reload config on every batch
    engine = repository.engineFactory()
    # Cache minimum threshold to trigger any notification to avoid permuting
    # settings x metricDataRows
    try:
        try:
            batch = AnomalyService.deserializeModelResult(message.body)
        except Exception:
            # Log here for context; the outer finally still acks the message
            self._log.exception("Error deserializing model result")
            raise

        # Load all settings for all users (once per incoming batch)
        with engine.connect() as conn:
            settings = repository.retryOnTransientErrors(
                repository.getAllNotificationSettings)(conn)

        # NOTE(review): eager %-formatting; this would raise TypeError if
        # settings were ever a tuple -- confirm the type returned by
        # getAllNotificationSettings
        self._log.debug("settings: %r" % settings)

        if settings:
            # Lowest sensitivity among all settings: rows below it cannot
            # trigger a notification for any device
            minThreshold = min(setting.sensitivity for setting in settings)
        else:
            # No settings: only near-certain anomalies are logged below
            minThreshold = 0.99999

        metricInfo = batch["metric"]
        metricId = metricInfo["uid"]
        resource = metricInfo["resource"]

        for row in batch["results"]:

            if row["anomaly"] >= minThreshold:
                rowDatetime = datetime.utcfromtimestamp(row["ts"])

                if not settings:
                    # There are no device notification settings stored on this
                    # server, so no notifications will be generated. However,
                    # log that an anomaly was detected and that a notification
                    # would have been sent if there were any configured devices
                    self._log.info("<%r>" % (metricInfo) + ("{TAG:APP.NOTIFICATION} Anomaly "
                                   "detected at %s, but no devices are "
                                   "configured.") % rowDatetime)
                    continue

                for settingObj in settings:
                    # NOTE(review): the next two checks depend only on the row,
                    # not on settingObj, yet are re-evaluated for each setting
                    if row["rowid"] <= 1000:
                        continue  # Not enough data

                    if rowDatetime < datetime.utcnow() - timedelta(
                            seconds=3600):
                        continue  # Skip old

                    if row["anomaly"] >= settingObj.sensitivity:
                        # First let's clear any old users out of the database.
                        with engine.connect() as conn:
                            repository.retryOnTransientErrors(
                                repository.deleteStaleNotificationDevices)(
                                    conn, _NOTIFICATION_DEVICE_STALE_DAYS)

                        # If anomaly_score meets or exceeds any of the device
                        # notification sensitivity settings, trigger
                        # notification. repository.addNotification() will
                        # handle throttling.
                        notificationId = str(uuid.uuid4())

                        with engine.connect() as conn:
                            result = repository.retryOnTransientErrors(
                                repository.addNotification)(
                                    conn,
                                    uid=notificationId,
                                    server=resource,
                                    metric=metricId,
                                    rowid=row["rowid"],
                                    device=settingObj.uid,
                                    windowsize=(settingObj.windowsize),
                                    timestamp=rowDatetime,
                                    acknowledged=0,
                                    seen=0)

                        self._log.info(
                            "NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                            "Notification generated. " % (notificationId,
                                                          resource,
                                                          metricId,
                                                          settingObj.uid))

                        if (result is not None and
                                result.rowcount > 0 and
                                settingObj.email_addr):
                            # Notification was generated. Attempt to send email
                            with engine.connect() as conn:
                                notificationObj = repository.getNotification(
                                    conn, notificationId)

                            self.sendNotificationEmail(
                                engine, settingObj, notificationObj)
    finally:
        # Always acknowledge, even when processing failed
        message.ack()

    # Do cleanup
    with engine.connect() as conn:
        # Delete old notifications; what counts as "old" is decided inside
        # repository.clearOldNotifications
        repository.clearOldNotifications(conn)
def testCollectAndPublishMetrics(self):
    """Verify that metric collector output reaches the model results queue.

    Starts model scheduler and metric collector subprocesses, begins
    monitoring CPUUtilization on up to 10 EC2 instances in us-west-2 (also
    capped by the instance quota), marks each metric ACTIVE, then consumes
    model results until a result has been seen for every monitored metric.
    A 120-second timer deletes the results queue to abort the consumer if
    results don't arrive in time, which makes the test fail.

    TODO Add more metric types
    TODO Deeper validation of the published metrics
    """
    # Start our own instance of metric collector and wait for data points
    with self._startModelSchedulerSubprocess() as modelSchedulerSubprocess, \
            self._startMetricCollectorSubprocess() as metricCollectorSubprocess:
        # Create some models for metric collector to harvest
        region = "us-west-2"
        namespace = "AWS/EC2"
        resourceType = ResourceTypeNames.EC2_INSTANCE

        engine = repository.engineFactory()
        cloudwatch = createCloudwatchDatasourceAdapter()

        candidates = cloudwatch.describeResources(region=region,
                                                  resourceType=resourceType)
        self.assertGreater(len(candidates), 0)

        maxModels = 10
        modelLimit = min(maxModels, Quota.getInstanceQuota())
        candidates = candidates[:modelLimit]

        monitoredIds = []

        _LOGGER.info("Starting %d models", len(candidates))
        self.assertGreater(len(candidates), 0)
        for instance in candidates:
            modelSpec = {
                "datasource": "cloudwatch",
                "metricSpec": {
                    "region": region,
                    "namespace": namespace,
                    "metric": "CPUUtilization",
                    "dimensions": {"InstanceId": instance["resID"]},
                },
            }

            metricId = cloudwatch.monitorMetric(modelSpec)

            with engine.connect() as conn:
                repository.setMetricStatus(conn, metricId,
                                           MetricStatus.ACTIVE)

            monitoredIds.append(metricId)

        _LOGGER.info("Waiting for results from models...")

        seenMetricIDs = set()
        allMetricIDs = set(monitoredIds)

        def onTimeout(resultsQueueName):
            # Timer callback that keeps the test from blocking forever
            _LOGGER.error(
                "Timed out waiting to get results from models; numResults=%d; "
                "expected=%d", len(seenMetricIDs), len(allMetricIDs))

            # HACK delete model swapper results queue to abort the consumer
            try:
                with MessageBusConnector() as bus:
                    bus.deleteMessageQueue(resultsQueueName)
            except Exception:
                _LOGGER.exception("Failed to delete results mq=%s",
                                  resultsQueueName)
                raise

        with ModelSwapperInterface() as modelSwapper:
            with modelSwapper.consumeResults() as consumer:
                watchdog = threading.Timer(
                    120, onTimeout, args=[modelSwapper._resultsQueueName])
                watchdog.start()
                try:
                    for batch in consumer:
                        seenMetricIDs.add(batch.modelID)
                        batch.ack()
                        if seenMetricIDs == allMetricIDs:
                            break
                    else:
                        # The consumer ran dry before every metric reported
                        self.fail(
                            "Expected %d results, but got only %d: %s"
                            % (len(allMetricIDs), len(seenMetricIDs),
                               seenMetricIDs,))
                    _LOGGER.info("Got %d results from models",
                                 len(seenMetricIDs))
                finally:
                    watchdog.cancel()

        # Interrupt the metric collector subprocess so it can shut down
        # gracefully; this avoids too much error logging junk on the terminal
        metricCollectorSubprocess.send_signal(signal.SIGINT)

        # Same graceful shutdown for the model scheduler subprocess
        modelSchedulerSubprocess.send_signal(signal.SIGINT)
def GET(self):
    """
    Get metrics, sorted by AWS name tag / instance ID

    Metrics that have a ``tag_name`` come before those without one and are
    ordered by that name. Metrics sharing a name are ordered by the instance
    id found in their ``parameters`` (those with an id first).

    :returns: List of metrics
    :rtype: list

    Example request::

      GET /_anomalies/name

    Example response::

      [
        {
          "status": 1,
          "last_rowid": 4033,
          "display_name": "jenkins-master (us-west-2/AWS/EC2/i-12345678)",
          "description": "NetworkIn on EC2 instance i-12345678 in us-west-2",
          "name": "AWS/EC2/NetworkIn",
          "last_timestamp": "2014-04-14 20:29:00",
          "poll_interval": 300,
          "server": "us-west-2/AWS/EC2/i-12345678",
          "tag_name": "jenkins-master",
          "datasource": "cloudwatch",
          "location": "us-west-2",
          "message": null,
          "parameters": {
            "InstanceId": "i-12345678",
            "region": "us-west-2"
          },
          "uid": "0b6b97022fdb4134936aae92aa67393b"
        },
        ...
      ]
    """
    try:
        self.addStandardHeaders()

        engine = repository.engineFactory()

        with engine.connect() as conn:
            modelIterator = repository.getAllMetrics(
                conn, fields=getMetricDisplayFields(conn))
            modelsList = [convertMetricRowToMetricDict(model)
                          for model in modelIterator]

        def sortKey(model):
            # Sort by tag_name, and then parameters=>InstanceId. The leading
            # booleans put models that have a name (or id) ahead of those
            # that don't; the `or ""` keeps missing values comparable.
            tagName = model["tag_name"]
            parameters = model["parameters"]
            # The key is spelled "InstanceId" in metric parameters (see the
            # example response above); the old "InstanceID" spelling is still
            # honored as a fallback
            instanceId = (parameters.get("InstanceId") or
                          parameters.get("InstanceID"))
            return (not tagName, tagName or "",
                    not instanceId, instanceId or "")

        modelsList.sort(key=sortKey)

        return utils.jsonEncode(modelsList)

    except (web.HTTPError) as ex:
        log.info(str(ex) or repr(ex))
        raise ex

    except Exception as ex:
        log.exception("GET Failed")
        raise web.internalerror(str(ex) or repr(ex))