def start(self):
    """
    Apply the config patch and create the temporary repository database.

    Steps, in order: reset the cached engine, start the config patch, assert
    that the temporary database does not exist yet, call repository.reset()
    to create it, and assert that it now exists. If any step raises,
    self.stop() is called and the exception is re-raised.
    """
    # Removes possible left over cached engine
    # (needed if non-patched engine is run prior)
    repository.engineFactory(reset=True)

    # Override the Repository database name
    try:
      self._configPatch.start()
      # Recorded right after the patch starts; presumably consulted by stop()
      # to decide what to undo -- TODO confirm against stop()
      self._configPatchApplied = True

      # Verify that the database doesn't exist yet
      assert self.tempDatabaseName not in getAllDatabaseNames(), (
        "Temp repo db=%s already existed" % (self.tempDatabaseName,))

      # Now create the temporary repository database
      # (the flag is set before the attempt, so it is set even if reset() fails)
      self._attemptedToCreateRepository = True
      repository.reset()

      # Verify that the temporary repository database got created
      assert self.tempDatabaseName in getAllDatabaseNames(), (
        "Temp repo db=%s not found" % (self.tempDatabaseName,))
    except:
      # Catch-all is not swallowing anything: the exception is re-raised below
      # Attempt to clean up
      self.stop()

      raise
    def testMetricDataForRandomRowID(uid):
      '''
        Check that the metric data returned by the GET call :
          _models/<uid>/data
        agrees with the database: for 5 randomly chosen row ids of the given
        uid, the anomaly score and timestamp in the response must equal the
        values returned by repository.getMetricData().

        Algorithm :
        - Query the MetricDataHandler GET call for a certain uid
        - Check if response is OK
        - Find the last row id for the uid
        - Select a random row between 1 and last row id (both inclusive)
        - Look that row up in the database and in the response
        - Assert on the anomaly score and on the timestamp
      '''
      response = self.app.get("/%s/data" % uid, headers=self.headers)
      assertions.assertSuccess(self, response)
      getAllModelsResult = utils.jsonDecode(response.body)
      with repository.engineFactory().connect() as conn:
        lastRowID = repository.getMetric(conn, uid).last_rowid
      for _ in range(5):
        # randrange() excludes its stop value, so add 1 to make the last row
        # eligible; this also keeps a single-row metric (lastRowID == 1) from
        # raising ValueError on an empty range.
        randomRowID = randrange(1, lastRowID + 1)
        with repository.engineFactory().connect() as conn:
          singleMetricData = repository.getMetricData(
            conn,
            uid,
            rowid=randomRowID).first()
        metricData = getMetricDataWithRowID(getAllModelsResult['data'],
          randomRowID)
        # Response rows are compared positionally: index 2 against the
        # anomaly score, index 0 against the timestamp
        self.assertEqual(metricData[2], singleMetricData.anomaly_score)
        self.assertEqual(datetime.strptime(metricData[0],
          '%Y-%m-%d %H:%M:%S'), singleMetricData.timestamp)
        def testMetricDataForRandomRowID(uid):
            '''
            Check that the metric data returned by the GET call :
              _models/<uid>/data
            agrees with the database: for 5 randomly chosen row ids of the
            given uid, the anomaly score and timestamp in the response must
            equal the values returned by repository.getMetricData().

            Algorithm :
            - Query the MetricDataHandler GET call for a certain uid
            - Check if response is OK
            - Find the last row id for the uid
            - Select a random row between 1 and last row id (both inclusive)
            - Look that row up in the database and in the response
            - Assert on the anomaly score and on the timestamp
            '''
            response = self.app.get("/%s/data" % uid, headers=self.headers)
            assertions.assertSuccess(self, response)
            getAllModelsResult = utils.jsonDecode(response.body)
            with repository.engineFactory().connect() as conn:
                lastRowID = repository.getMetric(conn, uid).last_rowid
            for _ in range(5):
                # randrange() excludes its stop value, so add 1 to make the
                # last row eligible; this also keeps a single-row metric
                # (lastRowID == 1) from raising ValueError on an empty range.
                randomRowID = randrange(1, lastRowID + 1)
                with repository.engineFactory().connect() as conn:
                    singleMetricData = repository.getMetricData(
                        conn, uid, rowid=randomRowID).first()
                metricData = getMetricDataWithRowID(getAllModelsResult['data'],
                                                    randomRowID)
                # Response rows are compared positionally: index 2 against
                # the anomaly score, index 0 against the timestamp
                self.assertEqual(metricData[2], singleMetricData.anomaly_score)
                self.assertEqual(
                    datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
                    singleMetricData.timestamp)
    def start(self):
        """
        Apply the config patch and create the temporary repository database.

        Steps, in order: reset the cached engine, start the config patch,
        assert that the temporary database does not exist yet, call
        repository.reset() to create it, and assert that it now exists. If
        any step raises, self.stop() is called and the exception is re-raised.
        """
        # Removes possible left over cached engine
        # (needed if non-patched engine is run prior)
        repository.engineFactory(reset=True)

        # Override the Repository database name
        try:
            self._configPatch.start()
            # Recorded right after the patch starts; presumably consulted by
            # stop() to decide what to undo -- TODO confirm against stop()
            self._configPatchApplied = True

            # Verify that the database doesn't exist yet
            assert self.tempDatabaseName not in getAllDatabaseNames(), (
                "Temp repo db=%s already existed" % (self.tempDatabaseName, ))

            # Now create the temporary repository database
            # (the flag is set before the attempt, so it is set even if
            # reset() fails)
            self._attemptedToCreateRepository = True
            repository.reset()

            # Verify that the temporary repository database got created
            assert self.tempDatabaseName in getAllDatabaseNames(), (
                "Temp repo db=%s not found" % (self.tempDatabaseName, ))
        except:
            # Catch-all is not swallowing anything: the exception is
            # re-raised below
            # Attempt to clean up
            self.stop()

            raise
예제 #5
0
  def GET(self):
    """
    Report the estimated processing time remaining for model data

    ::

        GET /_models/data/stats

    Returns:

    ::

        {
            "processing_time_remaining": 37
        }
    """
    with repository.engineFactory().connect() as conn:
      pendingCount = repository.getUnprocessedModelDataCount(conn)

    # Scale the unprocessed-record count by the per-record cost and round up
    # to a whole number
    estimate = math.ceil(pendingCount * _PROCESSING_TIME_PER_RECORD)
    payload = {"processing_time_remaining": int(estimate)}

    self.addStandardHeaders()
    return utils.jsonEncode(payload)
예제 #6
0
    def GET(self, autostackId, *args):  # pylint: disable=C0103,W0613
        """
        List the metrics that belong to an autostack

        ::

            GET /_autostacks/{autostackId}/metrics

        NOTE: args is ignored.  Function signature for all method handlers
        must be compatible with the regexp pattern that matches.  POST
        optionally takes a second argument, DELETE requires it.

        Returns the JSON-encoded list of metric dicts.  An
        ObjectNotFoundError becomes web.notfound, a web.HTTPError is
        re-raised (and logged when the status is 4xx/5xx), and any other
        exception becomes web.internalerror.
        """
        try:
            self.addStandardHeaders()
            engine = repository.engineFactory()
            displayFields = getMetricDisplayFields(engine)
            rows = repository.getAutostackMetrics(engine, autostackId,
                                                  displayFields)
            return utils.jsonEncode(
                [convertMetricRowToMetricDict(row) for row in rows])

        except ObjectNotFoundError:
            notFoundMsg = ("Autostack not found: Autostack ID: %s" %
                           autostackId)
            raise web.notfound(notFoundMsg)
        except web.HTTPError as ex:
            # Log 400-599 status codes as errors, ignoring 200-399
            isErrorStatus = re.match(r"([45][0-9][0-9])\s?", web.ctx.status)
            if isErrorStatus:
                log.error(str(ex) or repr(ex))
            raise
        except Exception as ex:
            raise web.internalerror(str(ex) or repr(ex))
    def testMetricDataTimeStampQueryParams(uid):
      '''
        Exercise MetricDataHandler GET calls with the from and to params :
          _models/<uid>/data?from=<>&to=<>

        The earliest and latest timestamps stored for the uid are used as the
        range, and every returned row must fall inside it (bounds included).
      '''
      timestampCol = schema.metric_data.c.timestamp
      rowsForUid = (sql.select([schema.metric_data])
                    .where(schema.metric_data.c.uid == uid))
      with repository.engineFactory().connect() as conn:
        earliestRows = conn.execute(
          rowsForUid.order_by(sql.expression.asc(timestampCol))
          .limit(1)).fetchall()

        latestRows = conn.execute(
          rowsForUid.order_by(sql.expression.desc(timestampCol))
          .limit(1)).fetchall()
      firstTimeStamp = earliestRows[0].timestamp
      lastTimeStamp = latestRows[0].timestamp

      url = "/%s/data?from=%s&to=%s" % (uid, firstTimeStamp, lastTimeStamp)
      response = self.app.get(url, headers=self.headers)
      assertions.assertSuccess(self, response)

      for metricData in utils.jsonDecode(response.body)['data']:
        # Index 0 of each response row holds the timestamp string
        rowTimeStamp = datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S')
        self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
        self.assertLessEqual(rowTimeStamp, lastTimeStamp)
        def testMetricDataTimeStampQueryParams(uid):
            '''
            Exercise MetricDataHandler GET calls with the from and to params :
              _models/<uid>/data?from=<>&to=<>

            The earliest and latest timestamps stored for the uid are used as
            the range, and every returned row must fall inside it (bounds
            included).
            '''
            timestampCol = schema.metric_data.c.timestamp
            rowsForUid = sql.select([schema.metric_data]).where(
                schema.metric_data.c.uid == uid)
            with repository.engineFactory().connect() as conn:
                earliestQuery = rowsForUid.order_by(
                    sql.expression.asc(timestampCol)).limit(1)
                earliestRows = conn.execute(earliestQuery).fetchall()

                latestQuery = rowsForUid.order_by(
                    sql.expression.desc(timestampCol)).limit(1)
                latestRows = conn.execute(latestQuery).fetchall()
            firstTimeStamp = earliestRows[0].timestamp
            lastTimeStamp = latestRows[0].timestamp

            url = "/%s/data?from=%s&to=%s" % (uid, firstTimeStamp,
                                              lastTimeStamp)
            response = self.app.get(url, headers=self.headers)
            assertions.assertSuccess(self, response)

            for metricData in utils.jsonDecode(response.body)['data']:
                # Index 0 of each response row holds the timestamp string
                rowTimeStamp = datetime.strptime(metricData[0],
                                                 '%Y-%m-%d %H:%M:%S')
                self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
                self.assertLessEqual(rowTimeStamp, lastTimeStamp)
 def testMetricDataQueryParams(uid):
   '''
     Exercise MetricDataHandler GET calls with the from, to and anomaly
     params together :
       _models/<uid>/data?from=<>&to=<>&anomaly=<>

     The range and threshold come from the earliest and latest rows of the
     uid whose anomaly score differs from 0 by more than 1e-5.
   '''
   with repository.engineFactory().connect() as conn:
     earliestRows = conn.execute(
       "SELECT * FROM `metric_data` WHERE `uid`='%s' "
       "and abs(`anomaly_score` - 0) > 1e-5 "
       "ORDER BY `timestamp` ASC LIMIT 1" % uid).fetchall()
     latestRows = conn.execute(
       "SELECT * FROM `metric_data` WHERE `uid`='%s' "
       "and abs(`anomaly_score` - 0) > 1e-5 "
       "ORDER BY `timestamp` DESC LIMIT 1" % uid).fetchall()
   earliestRow = earliestRows[0]
   firstTimeStamp = earliestRow.timestamp
   lastTimeStamp = latestRows[0].timestamp
   anomalyScore = earliestRow.anomaly_score

   url = ("/%s/data?from=%s&to=%s&anomaly=%s"
     % (uid, firstTimeStamp, lastTimeStamp, anomalyScore))
   response = self.app.get(url, headers=self.headers)
   assertions.assertSuccess(self, response)

   for metricData in utils.jsonDecode(response.body)['data']:
     # Index 2 is checked against the anomaly score, index 0 is the timestamp
     self.assertGreaterEqual(metricData[2], anomalyScore)
     rowTimeStamp = datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S')
     self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
     self.assertLessEqual(rowTimeStamp, lastTimeStamp)
예제 #10
0
  def GET(self, autostackId, *args): # pylint: disable=C0103,W0613
    """
      List the metrics that belong to an autostack

      ::

          GET /_autostacks/{autostackId}/metrics

      NOTE: args is ignored.  Function signature for all method handlers must
      be compatible with the regexp pattern that matches.  POST optionally
      takes a second argument, DELETE requires it.

      Returns the JSON-encoded list of metric dicts.  An ObjectNotFoundError
      becomes web.notfound, a web.HTTPError is re-raised (and logged when the
      status is 4xx/5xx), and any other exception becomes web.internalerror.
    """
    try:
      self.addStandardHeaders()
      engine = repository.engineFactory()
      displayFields = getMetricDisplayFields(engine)
      rows = repository.getAutostackMetrics(engine, autostackId, displayFields)
      return utils.jsonEncode(
        [convertMetricRowToMetricDict(row) for row in rows])

    except ObjectNotFoundError:
      notFoundMsg = "Autostack not found: Autostack ID: %s" % autostackId
      raise web.notfound(notFoundMsg)
    except web.HTTPError as ex:
      # Log 400-599 status codes as errors, ignoring 200-399
      isErrorStatus = re.match(r"([45][0-9][0-9])\s?", web.ctx.status)
      if isErrorStatus:
        log.error(str(ex) or repr(ex))
      raise
    except Exception as ex:
      raise web.internalerror(str(ex) or repr(ex))
예제 #11
0
    def GET(self):
        """
        Report the estimated processing time remaining for model data

        ::

            GET /_models/data/stats

        Returns:

        ::

            {
                "processing_time_remaining": 37
            }
        """
        with repository.engineFactory().connect() as conn:
            pendingCount = repository.getUnprocessedModelDataCount(conn)

        # Scale the unprocessed-record count by the per-record cost and round
        # up to a whole number
        estimate = math.ceil(pendingCount * _PROCESSING_TIME_PER_RECORD)
        payload = {"processing_time_remaining": int(estimate)}

        self.addStandardHeaders()
        return utils.jsonEncode(payload)
   def testMetricDataQueryParams(uid):
       '''
       Exercise MetricDataHandler GET calls with the from, to and anomaly
       params together :
         _models/<uid>/data?from=<>&to=<>&anomaly=<>

       The range and threshold come from the earliest and latest rows of the
       uid whose anomaly score differs from 0 by more than 1e-5.
       '''
       with repository.engineFactory().connect() as conn:
           earliestRows = conn.execute(
               "SELECT * FROM `metric_data` WHERE `uid`='%s' "
               "and abs(`anomaly_score` - 0) > 1e-5 "
               "ORDER BY `timestamp` ASC LIMIT 1" % uid).fetchall()
           latestRows = conn.execute(
               "SELECT * FROM `metric_data` WHERE `uid`='%s' "
               "and abs(`anomaly_score` - 0) > 1e-5 "
               "ORDER BY `timestamp` DESC LIMIT 1" % uid).fetchall()
       earliestRow = earliestRows[0]
       firstTimeStamp = earliestRow.timestamp
       lastTimeStamp = latestRows[0].timestamp
       anomalyScore = earliestRow.anomaly_score

       url = ("/%s/data?from=%s&to=%s&anomaly=%s" %
              (uid, firstTimeStamp, lastTimeStamp, anomalyScore))
       response = self.app.get(url, headers=self.headers)
       assertions.assertSuccess(self, response)

       for metricData in utils.jsonDecode(response.body)['data']:
           # Index 2 is checked against the anomaly score, index 0 is the
           # timestamp
           self.assertGreaterEqual(metricData[2], anomalyScore)
           rowTimeStamp = datetime.strptime(metricData[0],
                                            '%Y-%m-%d %H:%M:%S')
           self.assertGreaterEqual(rowTimeStamp, firstTimeStamp)
           self.assertLessEqual(rowTimeStamp, lastTimeStamp)
예제 #13
0
    def run(self):
        """
        Endlessly collect data for autostack metrics that are pending
        collection.

        Each pass fetches the pending (autostack, metrics) pairs; when there
        are none it sleeps for _NOTHING_READY_SLEEP_TIME_SEC, otherwise it
        builds one AutostackMetricRequest per metric and hands the batch to
        _processAutostackMetricRequests.  The loop has no exit condition.
        """
        with ModelSwapperInterface() as modelSwapper:
            engine = repository.engineFactory()
            while True:
                with engine.connect() as conn:
                    fetchPending = repository.retryOnTransientErrors(
                        repository.getAutostackMetricsPendingDataCollection)
                    pendingStacks = fetchPending(conn)

                if pendingStacks:
                    # Flatten to (autostack, metric) pairs; refID is the
                    # position of the request within the whole batch
                    stackMetricPairs = (
                        (autostack, metric)
                        for autostack, metrics in pendingStacks
                        for metric in metrics)
                    requests = [
                        AutostackMetricRequest(refID=refID,
                                               autostack=autostack,
                                               metric=metric)
                        for refID, (autostack, metric)
                        in enumerate(stackMetricPairs)
                    ]

                    # Collect, aggregate, and stream metric data
                    self._processAutostackMetricRequests(
                        engine, requests, modelSwapper)
                else:
                    time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
예제 #14
0
  def checkModelIsActive(self, uid):
    """Assert that the metric identified by uid has status ACTIVE."""
    # Only the status column is requested
    with repository.engineFactory().begin() as conn:
      metricRow = repository.getMetric(
        conn, uid, fields=[schema.metric.c.status])

    self.assertEqual(metricRow.status, MetricStatus.ACTIVE)
예제 #15
0
    def checkModelIsActive(self, uid):
        """Assert that the metric identified by uid has status ACTIVE."""
        # Only the status column is requested
        with repository.engineFactory().begin() as conn:
            metricRow = repository.getMetric(
                conn, uid, fields=[schema.metric.c.status])

        self.assertEqual(metricRow.status, MetricStatus.ACTIVE)
예제 #16
0
    def tearDownClass(cls):
        """
        Delete the metric cls.uid and log whether the deletion took effect.

        After deleting, the metric is looked up again; an ObjectNotFoundError
        is logged as a successful clean-up, while a lookup that succeeds is
        logged as an error.
        """
        try:
            engine = repository.engineFactory()
            with engine.connect() as deleteConn:
                repository.deleteMetric(deleteConn, cls.uid)

            # Separate connection for the follow-up lookup; the returned row
            # itself is not needed
            with engine.connect() as lookupConn:
                repository.getMetric(lookupConn, cls.uid)
        except ObjectNotFoundError:
            g_logger.info("Successful clean-up")
        else:
            g_logger.error("Test failed to delete metric=%s", cls.uid)
예제 #17
0
  def tearDownClass(cls):
    """
    Delete the metric cls.uid and log whether the deletion took effect.

    After deleting, the metric is looked up again; an ObjectNotFoundError is
    logged as a successful clean-up, while a lookup that succeeds is logged
    as an error.
    """
    try:
      engine = repository.engineFactory()
      with engine.connect() as deleteConn:
        repository.deleteMetric(deleteConn, cls.uid)

      # Separate connection for the follow-up lookup; the returned row itself
      # is not needed
      with engine.connect() as lookupConn:
        repository.getMetric(lookupConn, cls.uid)
    except ObjectNotFoundError:
      g_logger.info("Successful clean-up")
    else:
      g_logger.error("Test failed to delete metric=%s", cls.uid)
예제 #18
0
  def checkMetricUnmonitoredById(self, uid):
    """
    Assert that the metric identified by uid is unmonitored: its status is
    UNMONITORED, its parameters are None, and loading its model definition
    raises ModelNotFound.
    """
    with repository.engineFactory().begin() as conn:
      metricRow = repository.getMetric(
        conn,
        uid,
        fields=[schema.metric.c.status, schema.metric.c.parameters])

    self.assertEqual(metricRow.status, MetricStatus.UNMONITORED)
    self.assertIsNone(metricRow.parameters)

    # The checkpoint manager is created inside the assertRaises block
    with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
      checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()
      checkpointMgr.loadModelDefinition(uid)
예제 #19
0
    def checkStats(self, metricName, mn, mx):
        """
        Check that stats are computed correctly from the database

        The stats of the custom metric named metricName must contain exactly
        the keys "min" and "max", approximately equal to mn and mx.
        """
        with repository.engineFactory().begin() as conn:
            metricRow = repository.getCustomMetricByName(
                conn,
                metricName,
                fields=[schema.metric.c.uid, schema.metric.c.parameters])
            stats = repository.getMetricStats(conn, metricRow.uid)

        self.assertSetEqual(set(stats.keys()), {"min", "max"})
        for statName, expected in (("min", mn), ("max", mx)):
            self.assertAlmostEqual(stats[statName], expected)
예제 #20
0
    def checkMetricUnmonitoredById(self, uid):
        """
        Assert that the metric identified by uid is unmonitored: its status
        is UNMONITORED, its parameters are None, and loading its model
        definition raises ModelNotFound.
        """
        with repository.engineFactory().begin() as conn:
            metricRow = repository.getMetric(
                conn,
                uid,
                fields=[schema.metric.c.status, schema.metric.c.parameters])

        self.assertEqual(metricRow.status, MetricStatus.UNMONITORED)
        self.assertIsNone(metricRow.parameters)

        # The checkpoint manager is created inside the assertRaises block
        with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
            checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()
            checkpointMgr.loadModelDefinition(uid)
예제 #21
0
  def checkStats(self, metricName, mn, mx):
    """
    Check that stats are computed correctly from the database

    The stats of the custom metric named metricName must contain exactly the
    keys "min" and "max", approximately equal to mn and mx.
    """
    with repository.engineFactory().begin() as conn:
      metricRow = repository.getCustomMetricByName(
        conn,
        metricName,
        fields=[schema.metric.c.uid, schema.metric.c.parameters])
      stats = repository.getMetricStats(conn, metricRow.uid)

    self.assertSetEqual(set(stats.keys()), {"min", "max"})
    for statName, expected in (("min", mn), ("max", mx)):
      self.assertAlmostEqual(stats[statName], expected)
예제 #22
0
 def testMetricDataAnomalyAsQueryParams(uid):
   '''
     Exercise MetricDataHandler GET calls with the anomaly param :
       _models/<uid>/data?anomaly=<>

     The threshold is the anomaly score of one row of the uid whose score
     differs from 0 by more than 1e-5; every returned row must score at least
     that much.
   '''
   queryString = ("SELECT * FROM metric_data WHERE uid='%s' "
                  "   and abs(anomaly_score - 0) > 1e-5 LIMIT 1") % uid
   with repository.engineFactory().connect() as conn:
     anomalyScore = conn.execute(queryString).first().anomaly_score

   url = "/%s/data?anomaly=%s" % (uid, anomalyScore)
   response = self.app.get(url, headers=self.headers)
   assertions.assertSuccess(self, response)

   for metricData in utils.jsonDecode(response.body)['data']:
     # Index 2 of each response row is checked against the anomaly score
     self.assertGreaterEqual(metricData[2], anomalyScore)
예제 #23
0
  def testTablesCreatedWithInnoDBEngine(self):
    """
    Verify that every table in the grok table_schema was created with the
    InnoDB engine, which preserves referential integrity.

    At this time, it is checking all tables in the DB; in the future, if we do
    not require referential integrity, we can explicitly whitelist specific
    tables to allow those to use `MyISAM` or another engine.
    """
    tableEnginesQuery = ("SELECT table_name, engine "
                         "FROM information_schema.tables "
                         "WHERE table_schema = 'grok'")
    tableRows = repository.engineFactory().execute(tableEnginesQuery)

    for tableRow in tableRows:
      self.assertEqual(
        tableRow.engine,
        "InnoDB",
        "Table %s was created with the wrong engine type"
        % tableRow["table_name"])
예제 #24
0
  def checkEncoderResolution(self, uid, minVal, maxVal):
    """
    Check that encoder resolution is computed correctly.

    The resolution of encoder "c1" in the metric's model params must lie
    strictly between (maxVal - minVal) / 300 and (maxVal - minVal) / 80.

    :param uid: id of the metric whose model params are inspected
    :param minVal: lower end of the value range the bounds are derived from
    :param maxVal: upper end of the value range the bounds are derived from
    """
    engine = repository.engineFactory()
    with engine.begin() as conn:
      metricObj = repository.getMetric(conn,
                                       uid,
                                       fields=[schema.metric.c.name,
                                               schema.metric.c.model_params])

    noModelMsg = "No model exists for metric %s" % metricObj.name
    # json.loads(None) raises TypeError, so a metric without model params must
    # be caught before decoding for the message above to be reported.
    self.assertIsNotNone(metricObj.model_params, noModelMsg)
    modelParams = json.loads(metricObj.model_params)
    # A stored JSON "null" also decodes to None
    self.assertIsNotNone(modelParams, noModelMsg)

    sensorParams = modelParams["modelConfig"]["modelParams"]["sensorParams"]
    encoderParams = sensorParams["encoders"]["c1"]
    # Estimate and check the bounds for the resolution based on min and max
    valueRange = maxVal - minVal
    lower = valueRange / 300.0
    upper = valueRange / 80.0
    self.assertGreater(encoderParams["resolution"], lower)
    self.assertLess(encoderParams["resolution"], upper)
예제 #25
0
    def testTablesCreatedWithInnoDBEngine(self):
        """
        Verify that every table in the grok table_schema was created with the
        InnoDB engine, which preserves referential integrity.

        At this time, it is checking all tables in the DB; in the future, if
        we do not require referential integrity, we can explicitly whitelist
        specific tables to allow those to use `MyISAM` or another engine.
        """
        tableEnginesQuery = ("SELECT table_name, engine "
                             "FROM information_schema.tables "
                             "WHERE table_schema = 'grok'")
        tableRows = repository.engineFactory().execute(tableEnginesQuery)

        for tableRow in tableRows:
            self.assertEqual(
                tableRow.engine,
                "InnoDB",
                "Table %s was created with the wrong engine type"
                % tableRow["table_name"])
  def _runBasicChecksOnModel(self, modelId, _adapter, modelSpec):
    """
    Run the common sanity checks on a model.

    Verifies that the metric's status is CREATE_PENDING, ACTIVE or
    PENDING_DATA, that its decoded parameters equal modelSpec, and then
    delegates to checkModelIsActive and checkModelResultsSize (at least one
    result).  _adapter is not used.
    """
    with repository.engineFactory().connect() as conn:
      metricRow = repository.getMetric(conn, modelId)

    _LOG.info("Making sure metric is CREATE_PENDING or ACTIVE or PENDING_DATA")
    acceptableStatuses = [
      MetricStatus.CREATE_PENDING,
      MetricStatus.ACTIVE,
      MetricStatus.PENDING_DATA,
    ]
    self.assertIn(metricRow.status, acceptableStatuses)

    _LOG.info("Checking modelSpec")
    storedSpec = jsonDecode(metricRow.parameters)
    self.assertEqual(storedSpec, modelSpec)

    _LOG.info("Waiting for model to become active")
    self.checkModelIsActive(modelId)

    _LOG.info("Waiting at least one model result")
    self.checkModelResultsSize(modelId, 1, atLeast=True)
예제 #27
0
    def checkEncoderResolution(self, uid, minVal, maxVal):
        """
        Check that encoder resolution is computed correctly.

        The resolution of encoder "c1" in the metric's model params must lie
        strictly between (maxVal - minVal) / 300 and (maxVal - minVal) / 80.

        :param uid: id of the metric whose model params are inspected
        :param minVal: lower end of the value range the bounds are derived
          from
        :param maxVal: upper end of the value range the bounds are derived
          from
        """
        engine = repository.engineFactory()
        with engine.begin() as conn:
            metricObj = repository.getMetric(
                conn,
                uid,
                fields=[schema.metric.c.name, schema.metric.c.model_params])

        noModelMsg = "No model exists for metric %s" % metricObj.name
        # json.loads(None) raises TypeError, so a metric without model params
        # must be caught before decoding for the message above to be reported.
        self.assertIsNotNone(metricObj.model_params, noModelMsg)
        modelParams = json.loads(metricObj.model_params)
        # A stored JSON "null" also decodes to None
        self.assertIsNotNone(modelParams, noModelMsg)

        sensorParams = modelParams["modelConfig"]["modelParams"][
            "sensorParams"]
        encoderParams = sensorParams["encoders"]["c1"]
        # Estimate and check the bounds for the resolution based on min and max
        valueRange = maxVal - minVal
        lower = valueRange / 300.0
        upper = valueRange / 80.0
        self.assertGreater(encoderParams["resolution"], lower)
        self.assertLess(encoderParams["resolution"], upper)
  def setUpClass(cls):
    """
    Load the first EC2 instance entry from test_resources.yaml and derive
    the class-level fixtures from it: _testRegion, _testId, apiKey,
    _modelSpecNoMinMax and engine.
    """
    resourcesPath = os.path.join(
      grok.app.GROK_HOME,
      "tests/py/integration/app/test_resources.yaml")
    with open(resourcesPath) as fin:
      # safe_load only builds plain Python objects.  A bare yaml.load(fin) can
      # construct arbitrary objects, and newer PyYAML releases reject it
      # without an explicit Loader argument.
      resources = yaml.safe_load(fin)
    testCase = resources[aws_base.ResourceTypeNames.EC2_INSTANCE][0]

    cls._testRegion = testCase["region"]
    cls._testId = testCase["dimensions"]["InstanceId"]
    # Load grok API Key as required by TestCaseBase
    cls.apiKey = grok.app.config.get("security", "apikey")

    # Model spec built from the test case; it carries no min/max values
    cls._modelSpecNoMinMax = {"datasource":testCase["datasource"],
                              "metricSpec":{
                                "region":testCase["region"],
                                "namespace":testCase["namespace"],
                                "metric":testCase["metric"],
                                "dimensions":testCase["dimensions"]}}

    cls.engine = repository.engineFactory()
예제 #29
0
def formatMetricRowProxy(metricObj):
    """
    Convert a metric row into a dict suitable for display.

    The result holds every column of metricObj whose name is among the
    metric display fields, plus two derived entries:

    - "display_name": "<tag_name> (<server>)" when tag_name is non-empty,
      otherwise just the server
    - "parameters": the row's parameters, JSON-decoded when stored as a
      string; None when the row has no such attribute

    :param metricObj: metric row exposing tag_name, server, keys() and
      attribute access to its columns
    :returns: dict as described above
    """
    if metricObj.tag_name:
        displayName = "%s (%s)" % (metricObj.tag_name, metricObj.server)
    else:
        displayName = metricObj.server

    # The row may have been fetched without the parameters column; fall back
    # to None rather than raising AttributeError on the missing attribute.
    parameters = getattr(metricObj, "parameters", None)
    if isinstance(parameters, basestring):
        parameters = json.loads(parameters)

    engine = repository.engineFactory()

    allowedKeys = set(col.name for col in getMetricDisplayFields(engine))

    metricDict = dict((col, getattr(metricObj, col))
                      for col in metricObj.keys()
                      if col in allowedKeys)

    metricDict["display_name"] = displayName
    metricDict["parameters"] = parameters

    return metricDict
예제 #30
0
def fixUpGrokDB():
  """
  Hand the existing database over to Alembic and migrate it.

  Drops the DATABASECHANGELOG and DATABASECHANGELOGLOCK tables if they exist,
  creates the alembic_version table, records revision 3a7e06671df4 in it, and
  then calls repository.migrate("2f1ee984f978").  Progress is logged before
  and after.
  """
  g_log.info("******* UPDATING GROKDB *******")

  # Perform manual db migration to switch to sqlalchemy
  engine = repository.engineFactory()
  with engine.connect() as connection:
    connection.execute("DROP TABLE IF EXISTS DATABASECHANGELOG")
    connection.execute("DROP TABLE IF EXISTS DATABASECHANGELOGLOCK")
    # NOTE(review): no IF NOT EXISTS here, so this statement fails if
    # alembic_version already exists -- confirm this only ever runs once
    connection.execute(
        "CREATE TABLE `alembic_version` (`version_num` varchar(32) NOT NULL) "
        "ENGINE=InnoDB DEFAULT CHARSET=utf8;")
    # This fools Alembic into thinking the first migration, which goes from an
    # empty database to the 1.6 setup, has already been completed so it doesn't
    # attempt to perform it.
    connection.execute(
        "INSERT INTO alembic_version (version_num) VALUES('3a7e06671df4');")

  # Now we can run the migration script to upgrade from 1.6 to 1.7.
  repository.migrate("2f1ee984f978")

  g_log.info("******* GROKDB UPDATED *******")
예제 #31
0
def formatMetricRowProxy(metricObj):
    """Return the display-field dict for a metric row.

    Keys of ``metricObj`` are kept only when they name a column returned by
    ``getMetricDisplayFields``. ``display_name`` and ``parameters`` are then
    set explicitly: the former combines ``tag_name`` and ``server`` (or is
    just ``server`` when there is no tag), the latter is JSON-decoded when
    the stored value is a string.

    :param metricObj: row-like object with ``tag_name``, ``server``,
      ``parameters`` attributes and a ``keys()`` method
    :returns: dict ready for presentation
    """
    hasTag = (metricObj.tag_name is not None
              and len(metricObj.tag_name) > 0)
    displayName = ("%s (%s)" % (metricObj.tag_name, metricObj.server)
                   if hasTag else metricObj.server)

    needsDecoding = (hasattr(metricObj, "parameters")
                     and isinstance(metricObj.parameters, basestring))
    parameters = metricObj.parameters
    if needsDecoding:
        parameters = json.loads(parameters)

    displayFields = getMetricDisplayFields(repository.engineFactory())
    allowedKeys = frozenset(col.name for col in displayFields)

    metricDict = {}
    for col in metricObj.keys():
        if col in allowedKeys:
            metricDict[col] = getattr(metricObj, col)

    metricDict.update(display_name=displayName, parameters=parameters)
    return metricDict
    def setUpClass(cls):
        """Load the EC2 test case from test_resources.yaml into class state.

        Sets ``_testRegion``, ``_testId``, ``apiKey``, ``_modelSpecNoMinMax``
        and ``engine`` on the class.
        """
        resourcesPath = os.path.join(
            grok.app.GROK_HOME,
            "tests/py/integration/app/test_resources.yaml")
        with open(resourcesPath) as resourcesFile:
            # NOTE(review): yaml.load is called without an explicit Loader;
            # acceptable only while this file stays a trusted test fixture.
            allResources = yaml.load(resourcesFile)

        # Only the first EC2 instance entry is used
        ec2Case = allResources[aws_base.ResourceTypeNames.EC2_INSTANCE][0]

        cls._testRegion = ec2Case["region"]
        cls._testId = ec2Case["dimensions"]["InstanceId"]

        # Load grok API Key as required by TestCaseBase
        cls.apiKey = grok.app.config.get("security", "apikey")

        cls._modelSpecNoMinMax = {
            "datasource": ec2Case["datasource"],
            "metricSpec": {
                key: ec2Case[key]
                for key in ("region", "namespace", "metric", "dimensions")
            },
        }

        cls.engine = repository.engineFactory()
예제 #33
0
  def run(self):
    """Poll for autostack metrics pending data collection and process them.

    Loops forever. Each pass fetches the pending (autostack, metrics) pairs
    with transient-error retries; when nothing is pending it sleeps for
    ``_NOTHING_READY_SLEEP_TIME_SEC`` seconds, otherwise it builds one
    ``AutostackMetricRequest`` per metric and hands the batch to
    ``_processAutostackMetricRequests``.
    """
    with ModelSwapperInterface() as modelSwapper:
      engine = repository.engineFactory()
      while True:
        with engine.connect() as conn:
          fetchPending = repository.retryOnTransientErrors(
            repository.getAutostackMetricsPendingDataCollection)
          pendingStacks = fetchPending(conn)

        if pendingStacks:
          # refID is the request's position in the flattened request list
          requests = []
          for autostack, metrics in pendingStacks:
            for metric in metrics:
              requests.append(
                AutostackMetricRequest(refID=len(requests),
                                       autostack=autostack,
                                       metric=metric))

          # Collect, aggregate, and stream metric data
          self._processAutostackMetricRequests(engine, requests, modelSwapper)
        else:
          time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
 def setUpClass(cls):
     """Store the configured API key and the repository engine on the class."""
     # TestCaseBase requires the grok API key to be available as ``apiKey``
     configuredApiKey = grok.app.config.get("security", "apikey")
     cls.apiKey = configuredApiKey

     sharedEngine = repository.engineFactory()
     cls.engine = sharedEngine
예제 #35
0
    def POST(cls):
        """Archive a metric and its data, then upload the archive.

        The JSON request body must contain the metric ``uid``. Any other keys
        (for example, details about the view the mobile application was
        showing when feedback was sent) are stored alongside the metric
        record in ``metric.json``. The metric data rows are written to
        ``metric_data.csv``; both files are packed into a gzipped tarfile
        which is passed to ``cls._uploadTarfile`` (the S3 upload).

        :returns: whatever ``cls._uploadTarfile`` returns
        """
        inputData = json.loads(web.data())

        # The uid selects the metric and is not stored in the archive itself
        uid = inputData["uid"]
        del inputData["uid"]

        inputData["server_id"] = _MACHINE_ID

        # All files are staged in a scratch directory that is always removed
        path = tempfile.mkdtemp()

        try:
            # Fetch the metric record, restricted to these columns
            metricFields = [
                getattr(schema.metric.c, columnName)
                for columnName in ("uid", "datasource", "name", "description",
                                   "server", "location", "parameters",
                                   "status", "message", "last_timestamp",
                                   "poll_interval", "tag_name", "last_rowid")
            ]

            with repository.engineFactory().connect() as conn:
                metricRow = repository.getMetric(conn, uid, metricFields)

            metric = {}
            for field in metricFields:
                value = getattr(metricRow, field.name)
                if field.name == "parameters":
                    # Stored JSON-encoded; decode before re-encoding below
                    value = utils.jsonDecode(value)
                metric[field.name] = value

            metric["display_name"] = (
                "%s (%s)" % (metric["tag_name"], metric["server"])
                if metric["tag_name"] else metric["server"])

            inputData["metric"] = utils.jsonEncode(metric)

            metricPath = os.path.join(path, "metric.json")
            with open(metricPath, "w") as metricFile:
                json.dump(inputData, metricFile)

            # Fetch the metric data rows
            with repository.engineFactory().connect() as conn:
                metricDataRows = repository.getMetricData(conn, uid)
            metricData = [
                dict((col.name, getattr(row, col.name))
                     for col in schema.metric_data.columns)
                for row in metricDataRows
            ]

            metricDataPath = os.path.join(path, "metric_data.csv")
            with open(metricDataPath, "w") as csvFile:
                writer = csv.writer(csvFile)
                if metricData:
                    # Column order is taken from the first row's keys
                    header = metricData[0].keys()
                    writer.writerow(header)
                    writer.writerows(
                        [[rowDict[name] for name in header]
                         for rowDict in metricData])

            # Bundle both files into a timestamped tarfile
            ts = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S")
            filename = "metric_dump_%s_%s.tar.gz" % (uid, ts)
            tfPath = os.path.join(path, filename)
            with tarfile.open(tfPath, "w:gz") as archive:
                for memberPath in (metricPath, metricDataPath):
                    archive.add(memberPath,
                                arcname=os.path.basename(memberPath))

            return cls._uploadTarfile(filename, tfPath)

        finally:
            shutil.rmtree(path)
예제 #36
0
  def POST(cls):
    """Upload the metric info and metric data as a compressed tarfile to S3.

    The request body is JSON and must carry the metric ``uid``. Extra keys
    are allowed (a mobile client may, for instance, describe the view it was
    displaying when the feedback was sent) and are saved in the archive next
    to the metric record.

    :returns: the result of ``cls._uploadTarfile``
    """
    inputData = json.loads(web.data())
    uid = inputData["uid"]
    del inputData["uid"]

    inputData["server_id"] = _MACHINE_ID

    # Scratch directory for the files being archived; removed in ``finally``
    path = tempfile.mkdtemp()

    try:
      # Columns of the metric table to include in the archive
      metricColumns = schema.metric.c
      metricFields = [
        metricColumns.uid,
        metricColumns.datasource,
        metricColumns.name,
        metricColumns.description,
        metricColumns.server,
        metricColumns.location,
        metricColumns.parameters,
        metricColumns.status,
        metricColumns.message,
        metricColumns.last_timestamp,
        metricColumns.poll_interval,
        metricColumns.tag_name,
        metricColumns.last_rowid,
      ]

      with repository.engineFactory().connect() as conn:
        metricRow = repository.getMetric(conn, uid, metricFields)

      def readField(col):
        # ``parameters`` is JSON-decoded; other fields are copied verbatim
        value = getattr(metricRow, col.name)
        return utils.jsonDecode(value) if col.name == "parameters" else value

      metric = dict((col.name, readField(col)) for col in metricFields)

      if metric["tag_name"]:
        displayName = "%s (%s)" % (metric["tag_name"], metric["server"])
      else:
        displayName = metric["server"]
      metric["display_name"] = displayName

      inputData["metric"] = utils.jsonEncode(metric)

      metricPath = os.path.join(path, "metric.json")
      with open(metricPath, "w") as jsonFile:
        json.dump(inputData, jsonFile)

      # Retrieve the metric data
      with repository.engineFactory().connect() as conn:
        metricDataRows = repository.getMetricData(conn, uid)

      dataColumns = schema.metric_data.columns
      metricData = []
      for dataRow in metricDataRows:
        metricData.append(
          dict((col.name, getattr(dataRow, col.name)) for col in dataColumns))

      metricDataPath = os.path.join(path, "metric_data.csv")
      with open(metricDataPath, "w") as csvFile:
        writer = csv.writer(csvFile)
        if metricData:
          header = metricData[0].keys()
          # Field names first, then one line per data row
          writer.writerow(header)
          for record in metricData:
            writer.writerow([record[fieldName] for fieldName in header])

      # Create the tarfile to upload
      timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S")
      filename = "metric_dump_%s_%s.tar.gz" % (uid, timestamp)
      tfPath = os.path.join(path, filename)
      with tarfile.open(tfPath, "w:gz") as tarball:
        tarball.add(metricPath, arcname=os.path.basename(metricPath))
        tarball.add(metricDataPath, arcname=os.path.basename(metricDataPath))

      return cls._uploadTarfile(filename, tfPath)

    finally:
      shutil.rmtree(path)
예제 #37
0
  def testCollectMetricStatistics(self):

    expectedStatisticNames = ["min", "max"]

    def validateStats(stats):
      self.assertIsInstance(stats, (list, tuple))

      timestamps = []
      for instanceMetrics in stats:
        self.assertEqual(len(instanceMetrics.records), 1)
        record = instanceMetrics.records[0]
        self.assertIsInstance(record.value, dict)
        self.assertGreater(len(record.value), 0)
        self.assertTrue(
          set(record.value.iterkeys()).issubset(expectedStatisticNames),
          msg=record.value)

        for metricValue in record.value.itervalues():
          self.assertIsInstance(metricValue, float, msg=instanceMetrics)
          self.assertFalse(math.isnan(metricValue))

        timestamps.append(record.timestamp)


      # Verify that all the stats timestamps are the same
      if timestamps:
        self.assertSequenceEqual(timestamps, [timestamps[0]] * len(timestamps))



    # Collection data for both autostack/metric combinations
    collector = EC2InstanceMetricGetter()
    self.addCleanup(collector.close)

    def _createAutostackMetric(conn, name, region, filters):
      autostackDict = repository.addAutostack(conn,
                                              name=name,
                                              region=region,
                                              filters=json.dumps(filters))

      modelSpec = {"modelParams": {},
                   "datasource": "autostack",
                   "metricSpec": {"slaveDatasource": "cloudwatch",
                                  "slaveMetric": {"metric": "CPUUtilization",
                                                  "namespace": "AWS/EC2"},
                                  "autostackId": autostackDict["uid"]}}

      metricDict = repository.addMetric(
          conn,
          datasource="autostack",
          name="CPUUtilization",
          description=("CPUUtilization on Grok Autostack {0} in us-west-2 "
                       "region").format(name),
          server="Autostacks/{0}".format(autostackDict["uid"]),
          location=region,
          tag_name=name,
          parameters=htmengine.utils.jsonEncode(modelSpec),
          poll_interval=300,
          status=MetricStatus.UNMONITORED)

      repository.addMetricToAutostack(conn,
                                      autostackDict["uid"],
                                      metricDict["uid"])

      autostackObj = type("MutableAutostack", (object,), autostackDict)()
      autostackObj.filters = json.loads(autostackObj.filters)

      metricObj = type("MutableMetric", (object,), metricDict)()

      return autostackObj, metricObj

    # All instances in us-east-1
    engine = repository.engineFactory()
    with engine.begin() as conn:
      autostack1, m1 = (
        _createAutostackMetric(conn,
                               name="testCollectMetricStats1",
                               region="us-east-1",
                               filters={"tag:Name": ["*"]}))

      stats1 = collector.collectMetricStatistics(
        autostack=autostack1,
        metric=m1)
      print "STATS1:", stats1

      validateStats(stats1)
      self.assertGreaterEqual(len(stats1), 1)


      # All instances in us-west-2
      autostack2, m2 = _createAutostackMetric(conn,
                                              name="testCollectMetricStats2",
                                              region="us-west-2",
                                              filters={"tag:Name": ["*"]})

      stats2 = collector.collectMetricStatistics(
        autostack=autostack2,
        metric=m2)
      print "STATS2:", stats2
      validateStats(stats2)
      self.assertGreater(len(stats2), 1)


      # No matching instances in us-west-2
      autostack3, m3 = (
        _createAutostackMetric(
          conn,
          name="testCollectMetricStatistics3",
          region="us-west-2",
          filters={"tag:Name": ["NothingShouldMatchThis"]}))

      stats3 = collector.collectMetricStatistics(
        autostack=autostack3,
        metric=m3)
      print "STATS3:", stats3
      validateStats(stats3)
      self.assertEqual(len(stats3), 0)
    def testModelInferencesLifeCycle(self):
        startTime = time()
        for model in sorted(self.data):
            #create a model; post is forwarded to put
            print "Creating metric for %s : " % model
            response = self.app.put("/",
                                    json.dumps(model),
                                    headers=self.headers)
            assertions.assertSuccess(self, response, code=201)

        response = self.app.get("/", headers=self.headers)
        assertions.assertSuccess(self, response)
        getAllModelsResult = utils.jsonDecode(response.body)
        totalMetricCount = len(getAllModelsResult)
        self.assertEqual(totalMetricCount, len(self.data))

        #Get the uids of all the metrics created.
        uids = [metric['uid'] for metric in getAllModelsResult]

        while True:
            with repository.engineFactory().connect() as conn:
                initialModelCount = conn.execute(
                    sql.select([sql.func.count()],
                               from_obj=schema.metric_data).where(
                                   schema.metric_data.c.rowid == 1)).scalar()
            if initialModelCount == totalMetricCount:
                print "Done creating all the initial models."
                break

            # Exit the test with some non-zero status if the test has run for more
            # than 20 minutes to just create the initial models.
            # Should not take more than that.

            currentElapsedTime = (time() - startTime) / 60
            print "Current elapsed time %s" % currentElapsedTime
            if currentElapsedTime > 20:
                print "More than 20 minutes has elapsed. Timing out."
                sys.exit(42)
            print "%s initial models created." % initialModelCount
            print "Creating initial models for rest of the %s metrics" \
              "..." % (totalMetricCount - initialModelCount)
            sleep(60)

        #Sleep for a long time.
        minutes = 15
        print "Sleeping for %s minutes to let things settled down." % minutes
        while minutes > 0:
            print "Resume in %s minutes." % minutes
            minutes -= 1
            sleep(60)

        modelCreationDuration = (time() - startTime) / 60

        with repository.engineFactory().connect() as conn:
            lastRowIds = {
                uid: repository.getMetric(conn, uid).last_rowid
                for uid in uids
            }
        modelInferenceWithNonNullAnomalyScore = []
        modelIds = lastRowIds.keys()
        while True:
            print set(modelInferenceWithNonNullAnomalyScore)
            if len(modelIds) == len(
                    set(modelInferenceWithNonNullAnomalyScore)):
                print "Model inferences created for last_rowids for all the models."
                break
            for uid in modelIds:
                with repository.engineFactory().connect() as conn:
                    anomalyNullCount = conn.execute(
                        sql.select(
                            [sql.func.count()],
                            from_obj=schema.metric_data).where(
                                schema.metric_data.c.rowid == lastRowIds[uid]).
                        where(schema.metric_data.c.uid == uid).where(
                            schema.metric_data.c.anomaly_score ==
                            None)).scalar()
                print "Model (%s) - Last Row ID (%s) : %s" \
                  % (uid, lastRowIds[uid], anomalyNullCount)
                if anomalyNullCount == 0:
                    modelInferenceWithNonNullAnomalyScore.append(uid)

            # Exit the test with some non-zero status if the test has run for more
            # than 2 hours

            currentElapsedTime = (time() - startTime) / 60
            print "Current elapsed time %s" % currentElapsedTime
            if currentElapsedTime > 120:
                print "More than 2 hours has elapsed. Timing out."
                sys.exit(42)
            print "Going back to sleep for 60s..."
            sleep(60)

        self.assertEqual(anomalyNullCount, 0)
        timeToCalculateAllInferences = time()

        def getMetricDataWithRowID(metricDataList, rowid):
            '''
        Helper method to get the metric data of the nth row for a certain uid
      '''
            for metricData in metricDataList:
                if metricData[3] == rowid:
                    return metricData

        def testMetricDataForRandomRowID(uid):
            '''
        This tests if the metric data returned by the GET call :
          _models/<uid>/data
        has anomaly_score consistent with what is there in the actual
        database by asserting it against a dao.MetricData.get() call
        It repeats the process for 5 random sample rows for each uid
        in the database.

        Algorithm :
        - Query the MetricDataHandler GET call for a certain uid
        - Check if response is OK
        - Find the last row id for the uid
        - Select a random row between 1 and last row id
        - Find the anomaly score for that row id
        - Assert on the anomaly score
      '''
            response = self.app.get("/%s/data" % uid, headers=self.headers)
            assertions.assertSuccess(self, response)
            getAllModelsResult = utils.jsonDecode(response.body)
            with repository.engineFactory().connect() as conn:
                lastRowID = repository.getMetric(conn, uid).last_rowid
            for _ in range(5):
                randomRowID = randrange(1, lastRowID)
                with repository.engineFactory().connect() as conn:
                    singleMetricData = repository.getMetricData(
                        conn, uid, rowid=randomRowID).first()
                metricData = getMetricDataWithRowID(getAllModelsResult['data'],
                                                    randomRowID)
                self.assertEqual(metricData[2], singleMetricData.anomaly_score)
                self.assertEqual(
                    datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
                    singleMetricData.timestamp)

        map(testMetricDataForRandomRowID, uids)

        def testMetricDataAnomalyAsQueryParams(uid):
            '''
        This test makes MetricDataHandler GET calls with anomaly param :
          _models/<uid>/data?anomaly=<>
      '''
            queryString = (
                "SELECT * FROM metric_data WHERE uid='%s' "
                "   and abs(anomaly_score - 0) > 1e-5 LIMIT 1") % uid
            with repository.engineFactory().connect() as conn:
                sampleMetricData = conn.execute(queryString).first()
            anomalyScore = sampleMetricData.anomaly_score
            response = self.app.get("/%s/data?anomaly=%s" %
                                    (uid, anomalyScore),
                                    headers=self.headers)
            assertions.assertSuccess(self, response)
            getAllModelsResult = utils.jsonDecode(response.body)
            for metricData in getAllModelsResult['data']:
                self.assertGreaterEqual(metricData[2], anomalyScore)

        map(testMetricDataAnomalyAsQueryParams, uids)

        def testMetricDataTimeStampQueryParams(uid):
            '''
        This test makes MetricDataHandler GET calls with from and to params :
          _models/<uid>/data?from=<>&to=<>
      '''
            with repository.engineFactory().connect() as conn:
                firstMetricData = conn.execute(
                    sql.select([schema.metric_data
                                ]).where(schema.metric_data.c.uid == uid).
                    order_by(sql.expression.asc(
                        schema.metric_data.c.timestamp)).limit(1)).fetchall()

                lastMetricData = conn.execute(
                    sql.select([
                        schema.metric_data
                    ]).where(schema.metric_data.c.uid == uid).order_by(
                        sql.expression.desc(
                            schema.metric_data.c.timestamp)).limit(
                                1)).fetchall()
            firstTimeStamp = firstMetricData[0].timestamp
            lastTimeStamp = lastMetricData[0].timestamp
            response = self.app.get("/%s/data?from=%s&to=%s" %
                                    (uid, firstTimeStamp, lastTimeStamp),
                                    headers=self.headers)
            assertions.assertSuccess(self, response)
            getAllModelsResult = utils.jsonDecode(response.body)
            for metricData in getAllModelsResult['data']:
                self.assertGreaterEqual(
                    datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
                    firstTimeStamp)
                self.assertLessEqual(
                    datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
                    lastTimeStamp)

        map(testMetricDataTimeStampQueryParams, uids)

        def testMetricDataQueryParams(uid):
            '''
        This test makes MetricDataHandler GET calls with various params :
          _models/<uid>/data?from=<>&to=<>&anomaly=<>
      '''
            with repository.engineFactory().connect() as conn:
                firstMetricData = conn.execute(
                    "SELECT * FROM `metric_data` WHERE `uid`='%s' "
                    "and abs(`anomaly_score` - 0) > 1e-5 "
                    "ORDER BY `timestamp` ASC LIMIT 1" % uid).fetchall()
                lastMetricData = conn.execute(
                    "SELECT * FROM `metric_data` WHERE `uid`='%s' "
                    "and abs(`anomaly_score` - 0) > 1e-5 "
                    "ORDER BY `timestamp` DESC LIMIT 1" % uid).fetchall()
            firstTimeStamp = firstMetricData[0].timestamp
            lastTimeStamp = lastMetricData[0].timestamp
            anomalyScore = firstMetricData[0].anomaly_score
            response = self.app.get(
                "/%s/data?from=%s&to=%s&anomaly=%s" %
                (uid, firstTimeStamp, lastTimeStamp, anomalyScore),
                headers=self.headers)
            assertions.assertSuccess(self, response)
            getAllModelsResult = utils.jsonDecode(response.body)
            for metricData in getAllModelsResult['data']:
                self.assertGreaterEqual(metricData[2], anomalyScore)
                self.assertGreaterEqual(
                    datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
                    firstTimeStamp)
                self.assertLessEqual(
                    datetime.strptime(metricData[0], '%Y-%m-%d %H:%M:%S'),
                    lastTimeStamp)

        map(testMetricDataQueryParams, uids)

        endTime = (time() - startTime) / 60

        print "Test started at        : %s" % \
              strftime('%Y-%m-%d %H:%M:%S', localtime(startTime))
        print "Test finished at       : %s" % \
              strftime('%Y-%m-%d %H:%M:%S', localtime(endTime))
        print "Total metric count     : %s" % totalMetricCount
        print "Initial models created : %s" % initialModelCount
        print "Approximate time taken to create inital models : %s minutes" \
          % modelCreationDuration
        print "Approximate time taken to calculate all inferences : %s minutes" \
          % ((timeToCalculateAllInferences - startTime) / 60)
        print "Approximate time taken for all the tests to finish : %s minutes" \
          % ((time() - startTime) / 60)
예제 #39
0
    def testCollectAndPublishMetrics(self):
        """Check that every monitored EC2 metric produces a model result.

        Starts model scheduler and metric collector subprocesses, monitors
        CPUUtilization for up to 10 EC2 instances in us-west-2 (further
        capped by the instance quota), activates each metric, and consumes
        model swapper results until a batch has been seen for every metric.
        A 120 second timer deletes the results queue to abort the consumer
        if the results do not arrive.
        """
        # TODO Add more metric types
        # TODO Deeper validation of the published metrics

        # Run our own scheduler and collector instances for the test
        with self._startModelSchedulerSubprocess() as schedulerProcess, \
                self._startMetricCollectorSubprocess() as collectorProcess:
            region = "us-west-2"

            engine = repository.engineFactory()
            adapter = createCloudwatchDatasourceAdapter()

            ec2Instances = adapter.describeResources(
                region=region,
                resourceType=ResourceTypeNames.EC2_INSTANCE)

            self.assertGreater(len(ec2Instances), 0)

            # Never start more models than the quota allows
            modelLimit = min(10, Quota.getInstanceQuota())
            ec2Instances = ec2Instances[:modelLimit]

            _LOGGER.info("Starting %d models", len(ec2Instances))
            self.assertGreater(len(ec2Instances), 0)

            # Create some models for metric collector to harvest
            metricInstances = []
            for instance in ec2Instances:
                metricId = adapter.monitorMetric({
                    "datasource": "cloudwatch",
                    "metricSpec": {
                        "region": region,
                        "namespace": "AWS/EC2",
                        "metric": "CPUUtilization",
                        "dimensions": {
                            "InstanceId": instance["resID"]
                        }
                    }
                })

                with engine.connect() as conn:
                    repository.setMetricStatus(conn, metricId,
                                               MetricStatus.ACTIVE)

                metricInstances.append(metricId)

            _LOGGER.info("Waiting for results from models...")

            seenMetricIDs = set()
            allMetricIDs = set(metricInstances)

            def onTimeout(resultsQueueName):
                # Timer callback that keeps the test from blocking forever
                _LOGGER.error(
                    "Timed out waiting to get results from models; numResults=%d; "
                    "expected=%d", len(seenMetricIDs), len(allMetricIDs))

                # HACK delete model swapper results queue to abort the consumer
                try:
                    with MessageBusConnector() as bus:
                        bus.deleteMessageQueue(resultsQueueName)
                except Exception:
                    _LOGGER.exception("Failed to delete results mq=%s",
                                      resultsQueueName)
                    raise

            with ModelSwapperInterface() as modelSwapper, \
                    modelSwapper.consumeResults() as consumer:
                watchdog = threading.Timer(
                    120, onTimeout, args=[modelSwapper._resultsQueueName])
                watchdog.start()
                try:
                    receivedAll = False
                    for resultBatch in consumer:
                        seenMetricIDs.add(resultBatch.modelID)
                        resultBatch.ack()
                        if seenMetricIDs == allMetricIDs:
                            receivedAll = True
                            break

                    # The consumer ran dry before every model reported
                    if not receivedAll:
                        self.fail(
                            "Expected %d results, but got only %d: %s" % (
                                len(allMetricIDs),
                                len(seenMetricIDs),
                                seenMetricIDs,
                            ))
                    _LOGGER.info("Got %d results from models",
                                 len(seenMetricIDs))
                finally:
                    watchdog.cancel()

            # Terminate metric_collector subprocess gracefully to avoid too much
            # error logging junk on the terminal
            collectorProcess.send_signal(signal.SIGINT)

            # Likewise terminate the model scheduler subprocess gracefully
            schedulerProcess.send_signal(signal.SIGINT)
예제 #40
0
  def GET(self, period):
    """
    Return all metrics as JSON, ordered by per-server anomaly display value
    over the given period (hours).

    Metrics are sorted by the largest display value found among the metrics
    of their server (descending), then by server, then by metric name.

    :param period: Period (hours) over which to consider anomalies for sort
      order
    :type period: int
    :returns: JSON-encoded list of metrics
    :raises web.HTTPError: re-raised unchanged; any other exception is
      reported as ``web.internalerror``

    Example request::

      GET /_anomalies/period/{period}

    Example response::

      [
        {
          "status": 1,
          "last_rowid": 4033,
          "display_name": "jenkins-master (us-west-2/AWS/EC2/i-12345678)",
          "description": "NetworkIn on EC2 instance i-12345678 in us-west-2",
          "name": "AWS/EC2/NetworkIn",
          "last_timestamp": "2014-04-14 20:29:00",
          "poll_interval": 300,
          "server": "us-west-2/AWS/EC2/i-12345678",
          "tag_name": "jenkins-master",
          "datasource": "cloudwatch",
          "location": "us-west-2",
          "message": null,
          "parameters": {
            "InstanceId": "i-12345678",
            "region": "us-west-2"
          },
          "uid": "0b6b97022fdb4134936aae92aa67393b"
        },
        ...
      ]

    """

    try:
      self.addStandardHeaders()

      with repository.engineFactory().connect() as conn:
        metricRows = repository.getAllMetrics(
          conn, fields=getMetricDisplayFields(conn))
        displayValueByUid = repository.getMetricIdsSortedByDisplayValue(
          conn, period)

      # Largest display value seen per server; servers without any value
      # default to 0.0
      peakByServer = defaultdict(float)
      metricDicts = []

      for row in metricRows:
        displayValue = displayValueByUid.get(row.uid)
        if displayValue is not None:
          peakByServer[row.server] = max(float(displayValue),
                                         peakByServer[row.server])
        metricDicts.append(convertMetricRowToMetricDict(row))

      # Highest-ranked servers first; ties fall back to server and name
      metricDicts.sort(
        key=lambda entry: (-peakByServer[entry["server"]],
                           entry["server"],
                           entry["name"]))

      return utils.jsonEncode(metricDicts)

    except web.HTTPError as ex:
      log.info(str(ex) or repr(ex))
      raise ex

    except Exception as ex:
      log.exception("GET Failed")
      raise web.internalerror(str(ex) or repr(ex))
예제 #41
0
  def messageHandler(self, message):
    """ Inspect all inbound model results in a batch for anomaly thresholds and
        trigger notifications where applicable.

        Once processing of a model-result batch has been attempted the message
        is acknowledged, whether or not processing succeeded. Messages carrying
        a "dataType" header are ignored and returned without being
        acknowledged here.

        :param amqp.messages.ConsumerMessage message: ``message.body`` is a
          serialized batch of model inference results generated in
          ``AnomalyService`` and must be deserialized using
          ``AnomalyService.deserializeModelResult()``. The message conforms to
          htmengine/runtime/json_schema/model_inference_results_msg_schema.json
    """
    if message.properties.headers and "dataType" in message.properties.headers:
      # Not a model inference result
      # NOTE(review): this path returns without message.ack() -- confirm that
      # is intended for the consumer's acknowledgement mode.
      return

    grok.app.config.loadConfig() # reload config on every batch
    engine = repository.engineFactory()
    try:
      try:
        batch = AnomalyService.deserializeModelResult(message.body)
      except Exception:
        self._log.exception("Error deserializing model result")
        raise

      # Load all settings for all users (once per incoming batch)
      with engine.connect() as conn:
        settings = repository.retryOnTransientErrors(
            repository.getAllNotificationSettings)(conn)

      self._log.debug("settings: %r" % settings)

      # Cache minimum threshold to trigger any notification to avoid permuting
      # settings x metricDataRows
      if settings:
        minThreshold = min(setting.sensitivity for setting in settings)
      else:
        minThreshold = 0.99999

      metricInfo = batch["metric"]
      metricId = metricInfo["uid"]
      resource = metricInfo["resource"]

      for row in batch["results"]:
        if not (row["anomaly"] >= minThreshold):
          continue

        # Computed once per row; previously this was only assigned inside the
        # per-setting loop, which left it undefined when no settings existed.
        rowDatetime = datetime.utcfromtimestamp(row["ts"])

        if not settings:
          # There are no device notification settings stored on this server,
          # no notifications will be generated.  However, log that a
          # an anomaly was detected and notification would be sent if there
          # were any configured devices
          self._log.info("<%r>" % (metricInfo) + (
                                        "{TAG:APP.NOTIFICATION} Anomaly "
                                        "detected at %s, but no devices are "
                                        "configured.") % rowDatetime)
          continue

        # The following checks depend only on the row, so they are evaluated
        # once instead of once per setting.
        if row["rowid"] <= 1000:
          continue # Not enough data

        if rowDatetime < datetime.utcnow() - timedelta(seconds=3600):
          continue # Skip old

        for settingObj in settings:
          if row["anomaly"] >= settingObj.sensitivity:
            # First let's clear any old users out of the database.
            with engine.connect() as conn:
              repository.retryOnTransientErrors(
                  repository.deleteStaleNotificationDevices)(
                      conn, _NOTIFICATION_DEVICE_STALE_DAYS)

            # If anomaly_score meets or exceeds any of the device
            # notification sensitivity settings, trigger notification.
            # repository.addNotification() will handle throttling.
            notificationId = str(uuid.uuid4())

            with engine.connect() as conn:
              result = repository.retryOnTransientErrors(
                  repository.addNotification)(conn,
                                              uid=notificationId,
                                              server=resource,
                                              metric=metricId,
                                              rowid=row["rowid"],
                                              device=settingObj.uid,
                                              windowsize=(
                                                settingObj.windowsize),
                                              timestamp=rowDatetime,
                                              acknowledged=0,
                                              seen=0)

            self._log.info("NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                           "Notification generated. " % (notificationId,
                           resource, metricId,
                           settingObj.uid))

            if (result is not None and
                result.rowcount > 0 and
                settingObj.email_addr):
              # Notification was generated.  Attempt to send email
              with engine.connect() as conn:
                notificationObj = repository.getNotification(conn,
                                                             notificationId)

              self.sendNotificationEmail(engine,
                                         settingObj,
                                         notificationObj)

    finally:
      message.ack()

    # Do cleanup (only reached when the batch was processed without error)
    with engine.connect() as conn:
      repository.clearOldNotifications(conn) # Delete old notifications
  def testModelInferencesLifeCycle(self):
    startTime = time()
    for model in sorted(self.data):
      #create a model; post is forwarded to put
      print "Creating metric for %s : " % model
      response = self.app.put("/", json.dumps(model),
          headers=self.headers)
      assertions.assertSuccess(self, response, code=201)

    response = self.app.get("/", headers=self.headers)
    assertions.assertSuccess(self, response)
    getAllModelsResult = utils.jsonDecode(response.body)
    totalMetricCount = len(getAllModelsResult)
    self.assertEqual(totalMetricCount, len(self.data))

    #Get the uids of all the metrics created.
    uids = [metric['uid'] for metric in getAllModelsResult]

    while True:
      with repository.engineFactory().connect() as conn:
        initialModelCount = conn.execute(
          sql.select([sql.func.count()], from_obj=schema.metric_data)
          .where(schema.metric_data.c.rowid == 1)).scalar()
      if initialModelCount == totalMetricCount:
        print "Done creating all the initial models."
        break

      # Exit the test with some non-zero status if the test has run for more
      # than 20 minutes to just create the initial models.
      # Should not take more than that.

      currentElapsedTime = (time() - startTime) / 60
      print "Current elapsed time %s" % currentElapsedTime
      if currentElapsedTime > 20:
        print "More than 20 minutes has elapsed. Timing out."
        sys.exit(42)
      print "%s initial models created." % initialModelCount
      print "Creating initial models for rest of the %s metrics" \
        "..." % (totalMetricCount - initialModelCount)
      sleep(60)


    #Sleep for a long time.
    minutes = 15
    print "Sleeping for %s minutes to let things settled down." % minutes
    while minutes > 0:
      print "Resume in %s minutes." % minutes
      minutes -= 1
      sleep(60)

    modelCreationDuration = (time() - startTime) / 60

    with repository.engineFactory().connect() as conn:
      lastRowIds = {uid: repository.getMetric(conn, uid).last_rowid
                    for uid in uids}
    modelInferenceWithNonNullAnomalyScore = []
    modelIds = lastRowIds.keys()
    while True:
      print set(modelInferenceWithNonNullAnomalyScore)
      if len(modelIds) == len(set(modelInferenceWithNonNullAnomalyScore)):
        print "Model inferences created for last_rowids for all the models."
        break
      for uid in modelIds:
        with repository.engineFactory().connect() as conn:
          anomalyNullCount = conn.execute(
            sql.select([sql.func.count()], from_obj=schema.metric_data)
            .where(schema.metric_data.c.rowid == lastRowIds[uid])
            .where(schema.metric_data.c.uid == uid)
            .where(schema.metric_data.c.anomaly_score == None)).scalar()
        print "Model (%s) - Last Row ID (%s) : %s" \
          % (uid, lastRowIds[uid], anomalyNullCount)
        if anomalyNullCount == 0:
          modelInferenceWithNonNullAnomalyScore.append(uid)

      # Exit the test with some non-zero status if the test has run for more
      # than 2 hours

      currentElapsedTime = (time() - startTime) / 60
      print "Current elapsed time %s" % currentElapsedTime
      if currentElapsedTime > 120:
        print "More than 2 hours has elapsed. Timing out."
        sys.exit(42)
      print "Going back to sleep for 60s..."
      sleep(60)

    self.assertEqual(anomalyNullCount, 0)
    timeToCalculateAllInferences = time()


    def getMetricDataWithRowID(metricDataList, rowid):
      '''
        Helper method to get the metric data of the nth row for a certain uid
      '''
      for metricData in metricDataList:
        if metricData[3] == rowid:
          return metricData


    def testMetricDataForRandomRowID(uid):
      '''
        This tests if the metric data returned by the GET call :
          _models/<uid>/data
        has anomaly_score consistent with what is there in the actual
        database by asserting it against a dao.MetricData.get() call
        It repeats the process for 5 random sample rows for each uid
        in the database.

        Algorithm :
        - Query the MetricDataHandler GET call for a certain uid
        - Check if response is OK
        - Find the last row id for the uid
        - Select a random row between 1 and last row id
        - Find the anomaly score for that row id
        - Assert on the anomaly score
      '''
      response = self.app.get("/%s/data" %uid, headers=self.headers)
      assertions.assertSuccess(self, response)
      getAllModelsResult = utils.jsonDecode(response.body)
      with repository.engineFactory().connect() as conn:
        lastRowID = repository.getMetric(conn, uid).last_rowid
      for _ in range(5):
        randomRowID = randrange(1, lastRowID)
        with repository.engineFactory().connect() as conn:
          singleMetricData = repository.getMetricData(
            conn,
            uid,
            rowid=randomRowID).first()
        metricData = getMetricDataWithRowID(getAllModelsResult['data'],
          randomRowID)
        self.assertEqual(metricData[2], singleMetricData.anomaly_score)
        self.assertEqual(datetime.strptime(metricData[0],
          '%Y-%m-%d %H:%M:%S'), singleMetricData.timestamp)

    map(testMetricDataForRandomRowID, uids)


    def testMetricDataAnomalyAsQueryParams(uid):
      '''
        This test makes MetricDataHandler GET calls with anomaly param :
          _models/<uid>/data?anomaly=<>
      '''
      queryString = ("SELECT * FROM metric_data WHERE uid='%s' "
                     "   and abs(anomaly_score - 0) > 1e-5 LIMIT 1") % uid
      with repository.engineFactory().connect() as conn:
        sampleMetricData = conn.execute(queryString).first()
      anomalyScore = sampleMetricData.anomaly_score
      response = self.app.get("/%s/data?anomaly=%s"
        % (uid, anomalyScore), headers=self.headers)
      assertions.assertSuccess(self, response)
      getAllModelsResult = utils.jsonDecode(response.body)
      for metricData in getAllModelsResult['data']:
        self.assertGreaterEqual(metricData[2], anomalyScore)

    map(testMetricDataAnomalyAsQueryParams, uids)


    def testMetricDataTimeStampQueryParams(uid):
      '''
        This test makes MetricDataHandler GET calls with from and to params :
          _models/<uid>/data?from=<>&to=<>
      '''
      with repository.engineFactory().connect() as conn:
        firstMetricData = conn.execute(
          sql.select([schema.metric_data])
          .where(schema.metric_data.c.uid == uid)
          .order_by(sql.expression.asc(schema.metric_data.c.timestamp))
          .limit(1)).fetchall()

        lastMetricData = conn.execute(
          sql.select([schema.metric_data])
          .where(schema.metric_data.c.uid == uid)
          .order_by(sql.expression.desc(schema.metric_data.c.timestamp))
          .limit(1)).fetchall()
      firstTimeStamp = firstMetricData[0].timestamp
      lastTimeStamp = lastMetricData[0].timestamp
      response = self.app.get("/%s/data?from=%s&to=%s"
        % (uid, firstTimeStamp, lastTimeStamp), headers=self.headers)
      assertions.assertSuccess(self, response)
      getAllModelsResult = utils.jsonDecode(response.body)
      for metricData in getAllModelsResult['data']:
        self.assertGreaterEqual(datetime.strptime(metricData[0],
          '%Y-%m-%d %H:%M:%S'), firstTimeStamp)
        self.assertLessEqual(datetime.strptime(metricData[0],
          '%Y-%m-%d %H:%M:%S'), lastTimeStamp)

    map(testMetricDataTimeStampQueryParams, uids)


    def testMetricDataQueryParams(uid):
      '''
        This test makes MetricDataHandler GET calls with various params :
          _models/<uid>/data?from=<>&to=<>&anomaly=<>
      '''
      with repository.engineFactory().connect() as conn:
        firstMetricData = conn.execute(
          "SELECT * FROM `metric_data` WHERE `uid`='%s' "
          "and abs(`anomaly_score` - 0) > 1e-5 "
          "ORDER BY `timestamp` ASC LIMIT 1" % uid).fetchall()
        lastMetricData = conn.execute(
          "SELECT * FROM `metric_data` WHERE `uid`='%s' "
          "and abs(`anomaly_score` - 0) > 1e-5 "
          "ORDER BY `timestamp` DESC LIMIT 1" % uid).fetchall()
      firstTimeStamp = firstMetricData[0].timestamp
      lastTimeStamp = lastMetricData[0].timestamp
      anomalyScore = firstMetricData[0].anomaly_score
      response = self.app.get("/%s/data?from=%s&to=%s&anomaly=%s"
        % (uid, firstTimeStamp, lastTimeStamp, anomalyScore),
        headers=self.headers)
      assertions.assertSuccess(self, response)
      getAllModelsResult = utils.jsonDecode(response.body)
      for metricData in getAllModelsResult['data']:
        self.assertGreaterEqual(metricData[2], anomalyScore)
        self.assertGreaterEqual(datetime.strptime(metricData[0],
          '%Y-%m-%d %H:%M:%S'), firstTimeStamp)
        self.assertLessEqual(datetime.strptime(metricData[0],
          '%Y-%m-%d %H:%M:%S'), lastTimeStamp)

    map(testMetricDataQueryParams, uids)


    endTime = (time() - startTime) / 60

    print "Test started at        : %s" % \
          strftime('%Y-%m-%d %H:%M:%S', localtime(startTime))
    print "Test finished at       : %s" % \
          strftime('%Y-%m-%d %H:%M:%S', localtime(endTime))
    print "Total metric count     : %s" % totalMetricCount
    print "Initial models created : %s" % initialModelCount
    print "Approximate time taken to create inital models : %s minutes" \
      % modelCreationDuration
    print "Approximate time taken to calculate all inferences : %s minutes" \
      % ((timeToCalculateAllInferences - startTime) / 60)
    print "Approximate time taken for all the tests to finish : %s minutes" \
      % ((time() - startTime) / 60)
예제 #43
0
def getStatistics(metric):
  """Compute padded min/max statistics for an Autostack metric.

  Per-instance statistics are collected through ``EC2InstanceMetricGetter``
  and averaged. Instances whose min or max is not a number (or is NaN) are
  left out. When no instance yields usable values, the statistics are taken
  from ``repository.getMetricStats()`` instead. The resulting range is then
  widened by 20% on each side.

  :param metric: the Autostack metric to get statistics for; its
    ``datasource`` and ``uid`` attributes are read

  :returns: metric statistics
  :rtype: dict {"min": minVal, "max": maxVal}

  :raises: ValueError if the metric does not belong to an Autostack

  :raises: grok.app.exceptions.ObjectNotFoundError if the metric or the
      corresponding autostack doesn't exist; this may happen if it got deleted
      by another process in the meantime.

  :raises: grok.app.exceptions.MetricStatisticsNotReadyError if there are no or
      insufficent samples at this time; this may also happen if the metric and
      its data were deleted by another process in the meantime
  """
  engine = repository.engineFactory()

  if metric.datasource != "autostack":
    raise ValueError(
      "Metric must belong to an Autostack but has datasource=%r"
      % metric.datasource)

  def _isUsable(value):
    # A value counts only if it is a real, non-NaN number
    return isinstance(value, numbers.Number) and not math.isnan(value)

  getter = EC2InstanceMetricGetter()
  try:
    with engine.connect() as conn:
      autostack = repository.getAutostackFromMetric(conn, metric.uid)
    perInstanceStats = getter.collectMetricStatistics(autostack, metric)
  finally:
    getter.close()

  usableCount = 0
  minTotal = 0.0
  maxTotal = 0.0
  for instanceMetric in perInstanceStats:
    assert len(instanceMetric.records) == 1
    stats = instanceMetric.records[0].value

    if not (_isUsable(stats["min"]) and _isUsable(stats["max"])):
      # Cloudwatch gave us bogus data for this metric so we will exclude it
      continue

    minTotal += stats["min"]
    maxTotal += stats["max"]
    usableCount += 1

  if usableCount:
    minVal = minTotal / usableCount
    maxVal = maxTotal / usableCount
  else:
    # Nothing usable came back from AWS, so fall back to metric_data. This may
    # raise an MetricStatisticsNotReadyError if there is no or not enough data.
    with engine.connect() as conn:
      dbStats = repository.getMetricStats(conn, metric.uid)
    minVal = dbStats["min"]
    maxVal = dbStats["max"]

  # Pad both ends of the range by 20% of its width
  padding = (maxVal - minVal) * 0.2

  return {"min": minVal - padding,
          "max": maxVal + padding}
 def setUpClass(cls):
     """Store the grok API key and a repository engine on the test class."""
     # API key is required by TestCaseBase; read from the "security" section
     securityApiKey = grok.app.config.get("security", "apikey")
     cls.apiKey = securityApiKey
     cls.engine = repository.engineFactory()
예제 #45
0
  def run(self):
    """ Collect metric data and status for active metrics

    Loops indefinitely: each pass selects the metrics that are due via
    ``self._getCandidateMetrics()``, hands collection to the process pool, and
    starts a thread that processes and dispatches results from the results
    queue. The loop only ends when an exception propagates; the ``finally``
    clause then terminates and joins the process pool.
    """
    # NOTE: the process pool must be created BEFORE this main (parent) process
    # creates any global or class-level shared resources (e.g., boto
    # connection) that would have undesirable consequences when
    # replicated into and used by forked child processes (e.g., the same MySQL
    # connection socket file descriptor used by multiple processes). And we
    # can't take advantage of the process Pool's maxtasksperchild feature
    # either (for the same reason)
    self._log.info("Starting grok Metric Collector")
    resultsQueue = multiprocessing.Manager().JoinableQueue()

    # One-way pipe (duplex=False): the dispatch thread writes its stats to
    # sendPipe and this loop reads them from recvPipe
    recvPipe, sendPipe = multiprocessing.Pipe(False)

    processPool = multiprocessing.Pool(
      processes=self._WORKER_PROCESS_POOL_SIZE,
      maxtasksperchild=None)

    try:
      with ModelSwapperInterface() as modelSwapper:
        engine = repository.engineFactory()
        while True:
          startTime = time.time()

          if startTime > self._nextCacheGarbageCollectionTime:
            # TODO: unit-test
            self._garbageCollectInfoCache()

          # Determine which metrics are due for an update
          metricsToUpdate = self._getCandidateMetrics(engine)

          filterDuration = time.time() - startTime

          if not metricsToUpdate:
            # Nothing is due; back off before polling again
            time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC)
            continue

          # Collect metric data
          collectionStartTime = time.time()

          poolResults = self._collectDataForMetrics(metricsToUpdate,
                                                    processPool,
                                                    resultsQueue)

          # Process/dispatch results in parallel in another thread as results
          # become available in resultsQueue
          dispatchThread = (
            threading.Thread(target=self._processAndDispatchThreadTarget,
                             args=(engine,
                                   metricsToUpdate,
                                   resultsQueue,
                                   modelSwapper,
                                   sendPipe)))
          dispatchStartTime = time.time()
          dispatchThread.start()

          # Synchronize with processPool
          poolResults.wait() # Wait for collection tasks to complete

          metricPollDuration = time.time() - collectionStartTime

          resultsQueue.join() # Block until all tasks completed...

          # Synchronize with dispatchThread
          resultsQueue.put(self._SENTINEL) # Signal to dispatchThread that
                                           # there are no more results to
                                           # process.
          resultsQueue.join()
          numEmpty, numErrors = recvPipe.recv() # Get dispatchThread stats

          dispatchDuration = time.time() - dispatchStartTime

          self._log.info(
            "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; "
            "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)",
            len(metricsToUpdate), numEmpty, numErrors,
            time.time() - startTime, filterDuration,
            metricPollDuration, dispatchDuration)
    finally:
      # Always tear down the worker processes on the way out
      self._log.info("Exiting Metric Collector run-loop")
      processPool.terminate()
      processPool.join()
예제 #46
0
    def run(self):
        """ Collect metric data and status for active metrics

        Runs an endless loop. Each iteration picks the metrics that are due
        (``self._getCandidateMetrics()``), submits their collection to the
        process pool, and starts a thread that processes and dispatches the
        results arriving on the results queue. The loop is left only through
        an exception, after which the process pool is terminated and joined.
        """
        # NOTE: the process pool must be created BEFORE this main (parent) process
        # creates any global or class-level shared resources (e.g., boto
        # connection) that would have undesirable consequences when
        # replicated into and used by forked child processes (e.g., the same MySQL
        # connection socket file descriptor used by multiple processes). And we
        # can't take advantage of the process Pool's maxtasksperchild feature
        # either (for the same reason)
        self._log.info("Starting grok Metric Collector")
        resultsQueue = multiprocessing.Manager().JoinableQueue()

        # One-way pipe (duplex=False): stats are written to sendPipe by the
        # dispatch thread and read back here from recvPipe
        recvPipe, sendPipe = multiprocessing.Pipe(False)

        processPool = multiprocessing.Pool(processes=self._WORKER_PROCESS_POOL_SIZE, maxtasksperchild=None)

        try:
            with ModelSwapperInterface() as modelSwapper:
                engine = repository.engineFactory()
                while True:
                    startTime = time.time()

                    if startTime > self._nextCacheGarbageCollectionTime:
                        # TODO: unit-test
                        self._garbageCollectInfoCache()

                    # Determine which metrics are due for an update
                    metricsToUpdate = self._getCandidateMetrics(engine)

                    filterDuration = time.time() - startTime

                    if not metricsToUpdate:
                        # Nothing is due; sleep before checking again
                        time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC)
                        continue

                    # Collect metric data
                    collectionStartTime = time.time()

                    poolResults = self._collectDataForMetrics(metricsToUpdate, processPool, resultsQueue)

                    # Process/dispatch results in parallel in another thread as results
                    # become available in resultsQueue
                    dispatchThread = threading.Thread(
                        target=self._processAndDispatchThreadTarget,
                        args=(engine, metricsToUpdate, resultsQueue, modelSwapper, sendPipe),
                    )
                    dispatchStartTime = time.time()
                    dispatchThread.start()

                    # Synchronize with processPool
                    poolResults.wait()  # Wait for collection tasks to complete

                    metricPollDuration = time.time() - collectionStartTime

                    resultsQueue.join()  # Block until all tasks completed...

                    # Synchronize with dispatchThread: the sentinel signals that
                    # there are no more results to process.
                    resultsQueue.put(self._SENTINEL)
                    resultsQueue.join()
                    numEmpty, numErrors = recvPipe.recv()  # Get dispatchThread stats

                    dispatchDuration = time.time() - dispatchStartTime

                    self._log.info(
                        "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; "
                        "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)",
                        len(metricsToUpdate),
                        numEmpty,
                        numErrors,
                        time.time() - startTime,
                        filterDuration,
                        metricPollDuration,
                        dispatchDuration,
                    )
        finally:
            # Always tear down the worker processes on the way out
            self._log.info("Exiting Metric Collector run-loop")
            processPool.terminate()
            processPool.join()
예제 #47
0
 def _connect():
   """ Expose the sqlalchemy engine's ``connect`` method to web handlers as
   ``web.ctx.connFactory``. No connection is opened here; the stored callable
   is what opens one.
   """
   engine = repository.engineFactory()
   web.ctx.connFactory = engine.connect
예제 #48
0
def getStatistics(metric):
    """Get aggregate statistics for an Autostack metric.

    The min and max reported for each instance by ``EC2InstanceMetricGetter``
    are averaged over the instances that returned real, non-NaN numbers. If
    none did, ``repository.getMetricStats()`` supplies the values instead.
    Finally the range is extended by 20% of its width in both directions.

    :param metric: the Autostack metric to get statistics for; its
      ``datasource`` and ``uid`` attributes are read

    :returns: metric statistics
    :rtype: dict {"min": minVal, "max": maxVal}

    :raises: ValueError if the metric does not belong to an Autostack

    :raises: grok.app.exceptions.ObjectNotFoundError if the metric or the
        corresponding autostack doesn't exist; this may happen if it got
        deleted by another process in the meantime.

    :raises: grok.app.exceptions.MetricStatisticsNotReadyError if there are no
        or insufficent samples at this time; this may also happen if the
        metric and its data were deleted by another process in the meantime
    """
    engine = repository.engineFactory()

    if metric.datasource != "autostack":
        raise ValueError(
            "Metric must belong to an Autostack but has datasource=%r" %
            metric.datasource)

    statsCollector = EC2InstanceMetricGetter()
    try:
        with engine.connect() as conn:
            autostack = repository.getAutostackFromMetric(conn, metric.uid)
        collected = statsCollector.collectMetricStatistics(autostack, metric)
    finally:
        statsCollector.close()

    validCount = 0
    sumOfMins = 0.0
    sumOfMaxs = 0.0
    for instanceMetric in collected:
        assert len(instanceMetric.records) == 1
        stats = instanceMetric.records[0].value

        # Cloudwatch may give us bogus data for a metric; such entries are
        # excluded from the averages
        lowest = stats["min"]
        if not isinstance(lowest, numbers.Number) or math.isnan(lowest):
            continue
        highest = stats["max"]
        if not isinstance(highest, numbers.Number) or math.isnan(highest):
            continue

        sumOfMins += lowest
        sumOfMaxs += highest
        validCount += 1

    if validCount != 0:
        minVal = sumOfMins / validCount
        maxVal = sumOfMaxs / validCount
    else:
        # Fall back to metric_data when we don't get anything from AWS. This
        # may raise an MetricStatisticsNotReadyError if there is no or not
        # enough data.
        with engine.connect() as conn:
            dbStats = repository.getMetricStats(conn, metric.uid)
        minVal = dbStats["min"]
        maxVal = dbStats["max"]

    # Now add the 20% buffer on the range
    margin = (maxVal - minVal) * 0.2
    minVal = minVal - margin
    maxVal = maxVal + margin

    return {"min": minVal, "max": maxVal}
예제 #49
0
 def _connect():
     """ Make the sqlalchemy engine's ``connect`` method available to web
     handlers as ``web.ctx.connFactory``; the connection itself is opened
     only when that callable is invoked.
     """
     sqlEngine = repository.engineFactory()
     web.ctx.connFactory = sqlEngine.connect
예제 #50
0
    def GET(self, period):
        """
        Get metrics, sorted by anomalies over specified period (hours)

        Metrics are ordered by the largest display value found on their
        server (descending), then by server, then by metric name.

        :param period: Period (hours) over which to consider anomalies for
          sort order
        :type period: int
        :returns: List of metrics, as encoded by ``utils.jsonEncode()``
        :raises: web.HTTPError is logged and re-raised unchanged; any other
          exception is logged and converted to ``web.internalerror``

        Example request::

          GET /_anomalies/period/{period}

        Example response::

          [
            {
              "status": 1,
              "last_rowid": 4033,
              "display_name": "jenkins-master (us-west-2/AWS/EC2/i-12345678)",
              "description": "NetworkIn on EC2 instance i-12345678 in us-west-2",
              "name": "AWS/EC2/NetworkIn",
              "last_timestamp": "2014-04-14 20:29:00",
              "poll_interval": 300,
              "server": "us-west-2/AWS/EC2/i-12345678",
              "tag_name": "jenkins-master",
              "datasource": "cloudwatch",
              "location": "us-west-2",
              "message": null,
              "parameters": {
                "InstanceId": "i-12345678",
                "region": "us-west-2"
              },
              "uid": "0b6b97022fdb4134936aae92aa67393b"
            },
            ...
          ]

        """

        try:
            self.addStandardHeaders()

            engine = repository.engineFactory()

            # Keep track of the largest model display value for each server;
            # servers without any display value default to 0.0
            serverValues = defaultdict(float)

            modelsList = []

            with engine.connect() as conn:
                modelIterator = repository.getAllMetrics(
                    conn, fields=getMetricDisplayFields(conn))
                displayValuesMap = repository.getMetricIdsSortedByDisplayValue(
                    conn, period)

                # Consume the rows while the connection is still open rather
                # than relying on the result staying readable after the
                # connection has been released.
                for model in modelIterator:
                    val = displayValuesMap.get(model.uid)
                    if val is not None:
                        serverValues[model.server] = max(
                            float(val), serverValues[model.server])
                    modelsList.append(convertMetricRowToMetricDict(model))

            # Rank by descending server display value, then server, then
            # metric name
            def getModelRankByServer(model):
                return (-serverValues[model["server"]], model["server"],
                        model["name"])

            modelsList.sort(key=getModelRankByServer)

            return utils.jsonEncode(modelsList)

        except (web.HTTPError) as ex:
            log.info(str(ex) or repr(ex))
            # Bare raise keeps the original traceback
            raise

        except Exception as ex:
            log.exception("GET Failed")
            raise web.internalerror(str(ex) or repr(ex))
예제 #51
0
  def setUpClass(cls):
    """
    Setup steps shared by all test cases.

    The focus of these tests is to cover the API checks for ModelDataHandler,
    so a single metric is created here and every metricData query in the test
    cases runs against it.

    Setup process:
    1) Update conf with AWS credentials (ManagedTempRepository will not work
       in this test)
    2) Select a test instance that has been running for a long time; we use
       instances older than 15 days
    3) Create the metric and wait for the minimum number of metricData rows
       (``cls.minDataRows``, set to 100) to become available
    4) Pick ``cls.testRowId``; a low value (5) is used so that the row has a
       non-NULL anomaly_score when invoking GET with conditions
    5) Decide query params for anomaly score and to/from timestamp

    Class attributes set: ``headers``, ``minDataRows``, ``app``, ``uid``,
    ``metricData``, ``testMetric``, ``testRowId``, ``testMetricRow``,
    ``testAnomalyScore`` and ``testTimeStamp``.
    """
    cls.headers = getDefaultHTTPHeaders(grok.app.config)

    # All other services need AWS credentials to work
    # Set AWS credentials
    grok.app.config.loadConfig()

    # Select test instance such that it has been running for a long time
    g_logger.info("Getting long-running EC2 Instances")
    instances = aws_utils.getLongRunningEC2Instances("us-west-2",
      grok.app.config.get("aws", "aws_access_key_id"),
      grok.app.config.get("aws", "aws_secret_access_key"), 15)
    # randrange(len(...)) covers every index, including 0, and still works
    # when only a single instance was returned (randrange(1, 1) would raise
    # ValueError because its range is empty).
    testInstance = instances[randrange(len(instances))]

    createModelData = {
      "region": "us-west-2",
      "namespace": "AWS/EC2",
      "datasource": "cloudwatch",
      "metric": "CPUUtilization",
      "dimensions": {
        "InstanceId": testInstance.id
      }
    }

    # Number of minimum rows
    cls.minDataRows = 100

    cls.app = TestApp(models_api.app.wsgifunc())

    # create test metric
    g_logger.info("Creating test metric; modelSpec=%s", createModelData)
    response = cls.app.put("/", utils.jsonEncode(createModelData),
     headers=cls.headers)
    postResult = utils.jsonDecode(response.body)

    # Upper bound (seconds) for each of the two polling loops below, and the
    # time already spent in each of them
    maxWaitTime = 600
    waitTimeMetricData = 0
    waitAnomalyScore = 0

    # Wait for enough metric data to be available
    cls.uid = postResult[0]["uid"]
    engine = repository.engineFactory()
    with engine.connect() as conn:
      cls.metricData = list(repository.getMetricData(conn, cls.uid))
    with engine.connect() as conn:
      cls.testMetric = repository.getMetric(conn, cls.uid)

    # Confirm that we have enough metricData; poll every 5 seconds
    g_logger.info("Waiting for metric data")
    while (len(cls.metricData) < cls.minDataRows and
           waitTimeMetricData < maxWaitTime):
      g_logger.info("not ready, waiting for metric data: got %d of %d ...",
                    len(cls.metricData), cls.minDataRows)
      time.sleep(5)
      waitTimeMetricData += 5
      with engine.connect() as conn:
        cls.metricData = list(repository.getMetricData(conn, cls.uid))

    # taking lower value for testRowId, this will make sure to have
    # Non NULL value for anomaly_score field for given row
    cls.testRowId = 5

    with engine.connect() as conn:
      cls.testMetricRow = (repository.getMetricData(conn,
                                                     cls.uid,
                                                     rowid=cls.testRowId)
                          .fetchone())

    # Make sure we did not receive None etc for anomaly score; poll every
    # 5 seconds until the score is populated or maxWaitTime elapses
    g_logger.info("cls.testMetricRow.anomaly_score=%r",
                  cls.testMetricRow.anomaly_score)
    g_logger.info("waitAnomalyScore=%r", waitAnomalyScore)
    while (cls.testMetricRow.anomaly_score is None and
           waitAnomalyScore < maxWaitTime):
      g_logger.info("anomaly_score not ready, sleeping...")
      time.sleep(5)
      waitAnomalyScore += 5
      with engine.connect() as conn:
        cls.testMetricRow = (repository.getMetricData(conn,
                                                      cls.uid,
                                                      rowid=cls.testRowId)
                            .fetchone())

    # Decide queryParams for anomalyScore, to and from timestamp
    cls.testAnomalyScore = cls.testMetricRow.anomaly_score
    cls.testTimeStamp = cls.testMetricRow.timestamp
예제 #52
0
  def testCollectMetricData(self):
    """Collect data for four autostack/metric requests and validate the result.

    Three autostacks are created: two with wildcard ``tag:Name`` filters and
    one whose filter is expected to match nothing. For every returned
    collection the test checks that ``nextMetricTime`` equals the end of the
    collection's time range; for the first three it also checks that each
    slice's records are sorted by timestamp with unique ``datetime``
    timestamps. The fourth collection must contain no slices.
    """
    self.engine = repository.engineFactory(reset=True)

    with self.engine.connect() as conn:
      autostack1 = self._addAutostack(name="testCollectMetricData1",
                                      region="us-east-1",
                                      filters='{"tag:Name": ["*"]}')

      m1a = self._addAutostackMetric(conn, autostack1)
      m1b = self._addAutostackMetric(conn,
                                     autostack1,
                                     name="Autostacks/InstanceCount")

      autostack2 = self._addAutostack(name="testCollectMetricData2",
                                      region="us-west-2",
                                      filters='{"tag:Name": ["*?*"]}')

      m2 = self._addAutostackMetric(conn, autostack2)

      autostack3 = self._addAutostack(
        name="testCollectMetricData3",
        region="us-west-2",
        filters='{"tag:Name": ["NothingShouldMatchThis"]}')

      m3 = self._addAutostackMetric(conn, autostack3)

    # Collect data for all autostack/metric combinations
    collector = EC2InstanceMetricGetter()
    self.addCleanup(collector.close)

    requests = [
      AutostackMetricRequest(refID=1, autostack=autostack1, metric=m1a),
      AutostackMetricRequest(refID=2, autostack=autostack1, metric=m1b),
      AutostackMetricRequest(refID=3, autostack=autostack2, metric=m2),
      AutostackMetricRequest(refID=4, autostack=autostack3, metric=m3)
    ]

    # Index the returned collections by the refID of the originating request
    metricCollections = dict(
      (collection.refID, collection)
      for collection in collector.collectMetricData(requests=requests))

    self.assertEqual(len(metricCollections), len(requests))


    def checkSliceSorted(records):
      """Assert that records are ordered by ascending timestamp."""
      sortedRecords = sorted(records, key=lambda record: record.timestamp)
      self.assertSequenceEqual(records, sortedRecords)

    def checkSliceUniqueTimestamps(records):
      """Assert that all timestamps are datetime instances and distinct."""
      timestamps = tuple(record.timestamp for record in records)
      for timestamp in timestamps:
        self.assertIsInstance(timestamp, datetime)
      self.assertItemsEqual(set(timestamps), timestamps)

    def groupRecordsByTimestamp(collection):
      """Validate every slice of the collection and return a dict mapping
      timestamp to a list of (instanceID, value) pairs.
      """
      groups = defaultdict(list)
      for metricSlice in collection.slices:
        checkSliceSorted(metricSlice.records)
        checkSliceUniqueTimestamps(metricSlice.records)
        for record in metricSlice.records:
          groups[record.timestamp].append(
            (metricSlice.instanceID, record.value))
      return groups


    collection1 = metricCollections[1]
    collection2 = metricCollections[2]
    collection3 = metricCollections[3]
    collection4 = metricCollections[4]


    # COLLECTION-1 and COLLECTION-2: at least one timestamp must carry data
    for collection in (collection1, collection2):
      self.assertEqual(collection.nextMetricTime, collection.timeRange.end)
      metricGroups = groupRecordsByTimestamp(collection)
      self.assertTrue(
        any(len(values) > 0 for values in metricGroups.itervalues()))


    # COLLECTION-3: at least one timestamp must be shared by several instances
    self.assertEqual(collection3.nextMetricTime, collection3.timeRange.end)
    metricGroups = groupRecordsByTimestamp(collection3)
    self.assertTrue(
      any(len(values) > 1 for values in metricGroups.itervalues()))

    # Make sure no instance shows up more than once under the same timestamp
    for _timestamp, values in metricGroups.iteritems():
      instances = [instanceID for instanceID, _value in values]
      self.assertItemsEqual(instances, set(instances))


    # COLLECTION-4 (there should be no matching instances for it):
    self.assertEqual(len(collection4.slices), 0)
    self.assertEqual(collection4.nextMetricTime, collection4.timeRange.end)
예제 #53
0
    def messageHandler(self, message):
        """ Inspect all inbound model results in a batch for anomaly thresholds and
        trigger notifications where applicable.

        Messages carrying a ``dataType`` header are acknowledged and ignored.
        For any other message, once batch processing has started the message
        is acknowledged in a ``finally`` clause, i.e. also when processing
        raises.

        :param amqp.messages.ConsumerMessage message: ``message.body`` is a
          serialized batch of model inference results generated in
          ``AnomalyService`` and must be deserialized using
          ``AnomalyService.deserializeModelResult()``. The message conforms to
          htmengine/runtime/json_schema/model_inference_results_msg_schema.json
        """
        if message.properties.headers and "dataType" in message.properties.headers:
            # Not a model inference result
            message.ack()
            return

        grok.app.config.loadConfig()  # reload config on every batch
        engine = repository.engineFactory()

        try:
            try:
                batch = AnomalyService.deserializeModelResult(message.body)
            except Exception:
                self._log.exception("Error deserializing model result")
                raise

            # Load all settings for all users (once per incoming batch)
            with engine.connect() as conn:
                settings = repository.retryOnTransientErrors(
                    repository.getAllNotificationSettings)(conn)

            # Passed as a logging argument so that formatting is deferred and
            # cannot fail on a tuple-valued ``settings``
            self._log.debug("settings: %r", settings)

            # Cache minimum threshold to trigger any notification to avoid
            # permuting settings x metricDataRows
            if settings:
                minThreshold = min(setting.sensitivity for setting in settings)
            else:
                minThreshold = 0.99999

            metricInfo = batch["metric"]
            metricId = metricInfo["uid"]
            resource = metricInfo["resource"]

            for row in batch["results"]:

                if row["anomaly"] >= minThreshold:
                    rowDatetime = datetime.utcfromtimestamp(row["ts"])

                    if not settings:
                        # There are no device notification settings stored on
                        # this server, no notifications will be generated.
                        # However, log that an anomaly was detected and a
                        # notification would be sent if there were any
                        # configured devices
                        self._log.info(
                            "<%r>{TAG:APP.NOTIFICATION} Anomaly "
                            "detected at %s, but no devices are "
                            "configured.", metricInfo, rowDatetime)
                        continue

                    # The two filters below depend only on the row, so they
                    # are evaluated once per row rather than once per setting.
                    if row["rowid"] <= 1000:
                        continue  # Not enough data

                    if rowDatetime < datetime.utcnow() - timedelta(
                            seconds=3600):
                        continue  # Skip old

                    for settingObj in settings:
                        if row["anomaly"] >= settingObj.sensitivity:
                            # First let's clear any old users out of the database.
                            with engine.connect() as conn:
                                repository.retryOnTransientErrors(
                                    repository.deleteStaleNotificationDevices)(
                                        conn, _NOTIFICATION_DEVICE_STALE_DAYS)

                            # If anomaly_score meets or exceeds any of the device
                            # notification sensitivity settings, trigger notification.
                            # repository.addNotification() will handle throttling.
                            notificationId = str(uuid.uuid4())

                            with engine.connect() as conn:
                                result = repository.retryOnTransientErrors(
                                    repository.addNotification)(
                                        conn,
                                        uid=notificationId,
                                        server=resource,
                                        metric=metricId,
                                        rowid=row["rowid"],
                                        device=settingObj.uid,
                                        windowsize=(settingObj.windowsize),
                                        timestamp=rowDatetime,
                                        acknowledged=0,
                                        seen=0)

                            self._log.info(
                                "NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                                "Notification generated. ",
                                notificationId, resource, metricId,
                                settingObj.uid)

                            if (result is not None and result.rowcount > 0
                                    and settingObj.email_addr):
                                # Notification was generated.  Attempt to send email
                                with engine.connect() as conn:
                                    notificationObj = repository.getNotification(
                                        conn, notificationId)

                                self.sendNotificationEmail(
                                    engine, settingObj, notificationObj)
        finally:
            message.ack()

        # Do cleanup: clear old notifications. This is reached only when the
        # batch above was processed without raising.
        with engine.connect() as conn:
            repository.clearOldNotifications(conn)
  def testCollectAndPublishMetrics(self):
    """Start the model scheduler and metric collector subprocesses, monitor a
    handful of EC2 CPUUtilization metrics, and wait until the model swapper
    results queue has yielded a result batch for every monitored metric.

    A 120-second timer deletes the results queue to abort the consumer if the
    expected results do not arrive; the test then fails.
    """
    # TODO Add more metric types
    # TODO Deeper validation of the published metrics

    # Run our own scheduler and collector instances for the duration of the test
    with self._startModelSchedulerSubprocess() as schedulerProc:
      with self._startMetricCollectorSubprocess() as collectorProc:
        # Parameters of the models the metric collector will harvest
        awsRegion = "us-west-2"
        awsNamespace = "AWS/EC2"
        instanceResourceType = ResourceTypeNames.EC2_INSTANCE

        engine = repository.engineFactory()
        adapter = createCloudwatchDatasourceAdapter()

        ec2Instances = adapter.describeResources(
          region=awsRegion, resourceType=instanceResourceType)

        self.assertGreater(len(ec2Instances), 0)

        # Use at most 10 instances, fewer if the instance quota is lower
        modelLimit = min(10, Quota.getInstanceQuota())
        ec2Instances = ec2Instances[:modelLimit]

        monitoredMetricIDs = []

        _LOGGER.info("Starting %d models", len(ec2Instances))
        self.assertGreater(len(ec2Instances), 0)
        for instanceInfo in ec2Instances:
          modelSpec = {
            "datasource": "cloudwatch",
            "metricSpec": {
              "region": awsRegion,
              "namespace": awsNamespace,
              "metric": "CPUUtilization",
              "dimensions": {"InstanceId": instanceInfo["resID"]},
            },
          }

          metricId = adapter.monitorMetric(modelSpec)

          with engine.connect() as conn:
            repository.setMetricStatus(conn, metricId, MetricStatus.ACTIVE)

          monitoredMetricIDs.append(metricId)

        _LOGGER.info("Waiting for results from models...")

        receivedIDs = set()
        expectedIDs = set(monitoredMetricIDs)

        def onTimeout(resultsQueueName):
          """Timer callback that keeps the test from blocking forever."""
          _LOGGER.error(
            "Timed out waiting to get results from models; numResults=%d; "
            "expected=%d", len(receivedIDs), len(expectedIDs))

          # HACK delete model swapper results queue to abort the consumer
          try:
            with MessageBusConnector() as bus:
              bus.deleteMessageQueue(resultsQueueName)
          except Exception:
            _LOGGER.exception("Failed to delete results mq=%s",
                              resultsQueueName)
            raise

        with ModelSwapperInterface() as modelSwapper, \
            modelSwapper.consumeResults() as consumer:
          timer = threading.Timer(120, onTimeout,
                                  args=[modelSwapper._resultsQueueName])
          timer.start()
          try:
            gotAllResults = False
            for batch in consumer:
              receivedIDs.add(batch.modelID)
              batch.ack()
              if receivedIDs == expectedIDs:
                gotAllResults = True
                break

            # The consumer ran dry before every metric reported a result
            if not gotAllResults:
              self.fail(
                "Expected %d results, but got only %d: %s"
                % (len(expectedIDs), len(receivedIDs), receivedIDs,))
            _LOGGER.info("Got %d results from models", len(receivedIDs))
          finally:
            timer.cancel()

        # Interrupt the metric collector subprocess gracefully to avoid too
        # much error logging junk on the terminal
        collectorProc.send_signal(signal.SIGINT)

        # Likewise interrupt the model scheduler subprocess gracefully
        schedulerProc.send_signal(signal.SIGINT)
예제 #55
0
    def GET(self):
        """
        Get metrics, sorted by AWS name tag / instance ID

        Metrics that have a ``tag_name`` come first, ordered by that name;
        ties (and metrics without a name) are ordered by the instance ID
        found in ``parameters``, with metrics lacking an instance ID last.

        :returns: JSON-encoded list of metrics
        :rtype: list

        :raises web.HTTPError: re-raised unchanged after being logged
        :raises: ``web.internalerror`` for any other failure

        Example request::

          GET /_anomalies/name

        Example response::

          [
            {
              "status": 1,
              "last_rowid": 4033,
              "display_name": "jenkins-master (us-west-2/AWS/EC2/i-12345678)",
              "description": "NetworkIn on EC2 instance i-12345678 in us-west-2",
              "name": "AWS/EC2/NetworkIn",
              "last_timestamp": "2014-04-14 20:29:00",
              "poll_interval": 300,
              "server": "us-west-2/AWS/EC2/i-12345678",
              "tag_name": "jenkins-master",
              "datasource": "cloudwatch",
              "location": "us-west-2",
              "message": null,
              "parameters": {
                "InstanceId": "i-12345678",
                "region": "us-west-2"
              },
              "uid": "0b6b97022fdb4134936aae92aa67393b"
            },
            ...
          ]

        """

        try:
            self.addStandardHeaders()

            engine = repository.engineFactory()

            with engine.connect() as conn:
                modelIterator = repository.getAllMetrics(
                    conn, fields=getMetricDisplayFields(conn))
                modelsList = [
                    convertMetricRowToMetricDict(model)
                    for model in modelIterator
                ]

            # Sort by tag_name, and then parameters=>InstanceId
            def sortKey(model):
                name = model["tag_name"] or ""
                params = model["parameters"] or {}
                # The documented parameter key is "InstanceId"; the previously
                # used "InstanceID" spelling is still accepted as a fallback.
                instanceId = (params.get("InstanceId") or
                              params.get("InstanceID") or "")
                # False sorts before True, so models that have a name (or an
                # instance ID) are placed ahead of those that do not.
                return (not name, name, not instanceId, instanceId)

            modelsList.sort(key=sortKey)

            return utils.jsonEncode(modelsList)

        except (web.HTTPError) as ex:
            log.info(str(ex) or repr(ex))
            # Bare raise keeps the original traceback
            raise

        except Exception as ex:
            log.exception("GET Failed")
            raise web.internalerror(str(ex) or repr(ex))