def createDynamoDBSchema(self):
  """ Apply full dynamodb table schema definitions. """
  self._metric = self._gracefulCreateTable(MetricDynamoDBDefinition())
  self._metric_data = (self._gracefulCreateTable(
    MetricDataDynamoDBDefinition()))
  self._metric_tweets = (self._gracefulCreateTable(
    MetricTweetsDynamoDBDefinition()))
  self._instance_data_hourly = self._gracefulCreateTable(
    InstanceDataHourlyDynamoDBDefinition())
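# A minimal sketch of what the _gracefulCreateTable helper invoked above
# might look like -- hypothetical, since the real implementation lives
# elsewhere in this class. It assumes boto.dynamodb2 and definition objects
# exposing tableName, schema, and throughput attributes; an already-existing
# table (ResourceInUseException) is treated as success.
def _gracefulCreateTableSketch(self, definition):
  from boto.dynamodb2.table import Table
  from boto.exception import JSONResponseError

  try:
    return Table.create(definition.tableName,
                        schema=definition.schema,          # assumed attribute
                        throughput=definition.throughput,  # assumed attribute
                        connection=self.dynamodb)
  except JSONResponseError as err:
    if err.error_code == "ResourceInUseException":
      # The table already exists; return a handle to it instead
      return Table(definition.tableName, connection=self.dynamodb)
    raise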
def _publishInstanceDataHourly(self, instanceName, metricType, rows):
  """ Specific handler for instance data rows. Publishes to the
  `taurus.data.instance_data_hourly` dynamodb table.

  :param instanceName: name of the instance
  :type instanceName: str
  :param metricType: the metric type identifier
  :type metricType: str
  :param rows: model inference result rows per "results" property of
    htmengine/runtime/json_schema/model_inference_results_msg_schema.json
  :type rows: Sequence of dicts
  """
  hourToMaxScore = {}
  for row in rows:
    # row["ts"] is a unix (epoch) timestamp; truncate it to the top of the
    # hour to form the aggregation bucket
    ts = datetime.utcfromtimestamp(row["ts"]).replace(minute=0,
                                                      second=0,
                                                      microsecond=0)

    # Store the max anomaly likelihood for the period
    hourToMaxScore[ts] = max(hourToMaxScore.get(ts, 0.0), row["anomaly"])

  for ts, score in sorted(hourToMaxScore.iteritems()):
    score = FIXED_DYNAMODB_CONTEXT.create_decimal_from_float(score)
    dateHour = ts.strftime("%Y-%m-%dT%H")
    data = {
      "instance_id": {"S": instanceName},
      "date_hour": {"S": dateHour},
      "date": {"S": ts.strftime("%Y-%m-%d")},
      "hour": {"S": ts.strftime("%H")},
      "anomaly_score": {"M": {metricType: {"N": str(score)}}},
    }

    # Validate the data fields against the schema
    InstanceDataHourlyDynamoDBDefinition().Item(**data)

    # First, try a conditional update of the anomaly score for this metric
    updateKey = {
      "instance_id": data["instance_id"],
      "date_hour": data["date_hour"]
    }
    anomalyScoreMetric = "anomaly_score.%s" % metricType
    updateCondition = ("attribute_not_exists(%(asm)s) or %(asm)s < :value"
                       % {"asm": anomalyScoreMetric})
    updateValues = {":value": {"N": str(score)}}
    updateExpression = "SET %s = :value" % anomalyScoreMetric

    @retryOnTransientDynamoDBError(g_log)
    def updateItemWithRetries():
      self.dynamodb.update_item(
        self._instance_data_hourly.table_name,
        key=updateKey,
        update_expression=updateExpression,
        condition_expression=updateCondition,
        expression_attribute_values=updateValues)

    try:
      updateItemWithRetries()
    except ResourceNotFoundException:
      # There is no row yet, so continue on to PutItem
      pass
    except ValidationException:
      # It's OK, let's continue and try the PutItem
      pass
    except ConditionalCheckFailedException:
      # The existing value is larger, so we are done
      continue
    except Exception:
      g_log.exception(
        "update_item failed: table=%s; updateKey=%s; update=%s; "
        "condition=%s; values=%s",
        self._instance_data_hourly.table_name, updateKey, updateExpression,
        updateCondition, updateValues)
      raise
    else:
      # There was no exception; the update succeeded, so we are done
      continue

    # The UpdateItem did not apply, so put the row instead; the condition
    # guards against clobbering a row created by a parallel process
    putCondition = "attribute_not_exists(instance_id)"

    @retryOnTransientDynamoDBError(g_log)
    def putItemWithRetries(item, condition):
      self.dynamodb.put_item(self._instance_data_hourly.table_name,
                             item=item,
                             condition_expression=condition)

    try:
      putItemWithRetries(data, putCondition)
    except ConditionalCheckFailedException:
      # No problem, the row already exists!
      pass
    except Exception:
      g_log.exception(
        "put_item failed: table=%s; condition=%s; item=%s",
        self._instance_data_hourly.table_name, putCondition, data)
      raise
    else:
      # There was no exception; the put succeeded, so we are done
      continue

    # A parallel process beat us to the put; fall back to the conditional
    # update one more time
    try:
      updateItemWithRetries()
    except ConditionalCheckFailedException:
      # The existing value is larger, so we are done
      continue
    except Exception:
      g_log.exception(
        "update_item failed: table=%s; updateKey=%s; update=%s; "
        "condition=%s; values=%s",
        self._instance_data_hourly.table_name, updateKey, updateExpression,
        updateCondition, updateValues)
      raise
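# Worked example of the hour-bucketing logic above; illustrative only and
# not part of the service. Two rows falling within the same UTC hour
# collapse into one bucket keyed by the truncated timestamp, keeping the
# larger anomaly likelihood.
def _exampleHourBucketing():
  from datetime import datetime

  rows = [{"ts": 1424373804.0, "anomaly": 0.2},   # 2015-02-19 19:23:24 UTC
          {"ts": 1424373904.0, "anomaly": 0.7}]   # 2015-02-19 19:25:04 UTC
  hourToMaxScore = {}
  for row in rows:
    ts = datetime.utcfromtimestamp(row["ts"]).replace(minute=0, second=0,
                                                      microsecond=0)
    hourToMaxScore[ts] = max(hourToMaxScore.get(ts, 0.0), row["anomaly"])
  # Both rows map to the 2015-02-19T19 bucket; the max score 0.7 wins
  return hourToMaxScore  # {datetime(2015, 2, 19, 19, 0): 0.7}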
def testPathwayToDynamoDB(self):
  """ Test metric data pathway to dynamodb """
  metricName = "TEST." + "".join(random.sample(string.ascii_letters, 16))

  nativeMetric = {
    "modelParams": {
      "minResolution": 0.2,
      "min": 0.0,
      "max": 10000.0,
    },
    "datasource": "custom",
    "metricSpec": {
      "metric": metricName,
      "resource": "Test",
      "userInfo": {
        "symbol": "TEST",
        "metricType": "TwitterVolume",
        "metricTypeName": "Twitter Volume",
      }
    }
  }
  metricName = nativeMetric["metricSpec"]["metric"]
  instanceName = nativeMetric["metricSpec"]["resource"]
  userInfo = nativeMetric["metricSpec"]["userInfo"]

  now = datetime.datetime.utcnow().replace(minute=0, second=0,
                                           microsecond=0)

  data = [
    (5000.0, now - datetime.timedelta(minutes=10)),
    (6000.0, now - datetime.timedelta(minutes=5)),
    (7000.0, now),
  ]

  # We'll be explicitly deleting the metric below, but we need to add a
  # cleanup step that runs in case there is some other failure that prevents
  # that part of the test from being reached.
  def gracefulDelete():
    try:
      self._deleteMetric(metricName)
    except ObjectNotFoundError:
      pass

  self.addCleanup(gracefulDelete)

  # Add custom metric data via the graphite-compatible plaintext protocol:
  # "<metric name> <value> <epoch timestamp>\n"
  sock = socket.socket()
  sock.connect(("localhost", self.plaintextPort))
  for metricValue, ts in data:
    sock.sendall("%s %r %s\n" % (metricName,
                                 metricValue,
                                 epochFromNaiveUTCDatetime(ts)))

  self.gracefullyCloseSocket(sock)

  uid = self.checkMetricCreated(metricName)

  # Save the uid for later
  LOGGER.info("Metric %s has uid: %s", metricName, uid)

  # Send model creation request
  model = self._createModel(nativeMetric)
  parameters = json.loads(model.parameters)
  self.assertEqual(parameters["metricSpec"]["userInfo"], userInfo)

  for _ in xrange(60):
    with self.engine.begin() as conn:
      metric = repository.getMetric(conn, uid)

    if metric.status == MetricStatus.ACTIVE:
      break
    LOGGER.info("Model=%s not ready. Sleeping 1 second...", uid)
    time.sleep(1)
  else:
    self.fail("Model did not become ACTIVE within 60 seconds")

  # Check that the data all got processed
  self.checkModelResultsSize(uid, 3)

  # Now check that the data was published to dynamodb...
  dynamodb = DynamoDBService.connectDynamoDB()

  metricTable = Table(MetricDynamoDBDefinition().tableName,
                      connection=dynamodb)
  metricItem = metricTable.lookup(uid)
  self.assertEqual(metricItem["uid"], uid)
  self.assertEqual(metricItem["name"], metricName)
  self.assertEqual(metricItem["metricType"], "TwitterVolume")
  self.assertEqual(metricItem["metricTypeName"], "Twitter Volume")
  self.assertEqual(metricItem["symbol"], "TEST")

  metricDataTable = Table(MetricDataDynamoDBDefinition().tableName,
                          connection=dynamodb)
  instanceDataAnomalyScores = {}
  for metricValue, ts in data:
    metricDataItem = _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR(
      metricDataTable.lookup)(uid, ts.isoformat())
    # There is no server-side cleanup for metric data, so remove it here for
    # now to avoid accumulating test data
    self.addCleanup(metricDataItem.delete)
    self.assertEqual(metricValue, metricDataItem["metric_value"])
    dt = datetime.datetime.strptime(metricDataItem["timestamp"],
                                    "%Y-%m-%dT%H:%M:%S")
    self.assertEqual(ts, dt)
    # Track the expected per-hour max anomaly score for comparison against
    # the aggregated instance data below
    ts = ts.replace(minute=0, second=0, microsecond=0)
    date = ts.strftime("%Y-%m-%d")
    hour = ts.strftime("%H")
    key = (date, hour)
    maxVal = instanceDataAnomalyScores.get(key, 0.0)
    instanceDataAnomalyScores[key] = max(
      maxVal, metricDataItem["anomaly_score"])

  # And check that the aggregated instance data is updated
  instanceDataHourlyTable = Table(
    InstanceDataHourlyDynamoDBDefinition().tableName, connection=dynamodb)
  for key, anomalyScore in instanceDataAnomalyScores.iteritems():
    date, hour = key
    instanceDataHourlyItem = _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR(
      instanceDataHourlyTable.lookup)(instanceName, "%sT%s" % (date, hour))
    self.addCleanup(instanceDataHourlyItem.delete)
    self.assertAlmostEqual(
      anomalyScore,
      float(instanceDataHourlyItem["anomaly_score"]["TwitterVolume"]))
    self.assertEqual(date, instanceDataHourlyItem["date"])
    self.assertEqual(hour, instanceDataHourlyItem["hour"])

  # Now send some twitter data and validate that it made it to dynamodb
  twitterData = [
    {
      "metric_name": metricName,
      "tweet_uid": uid,
      "created_at": "2015-02-19T19:43:24.870109",
      "agg_ts": "2015-02-19T19:43:24.870118",
      "text": "Tweet text",
      "userid": "10",
      "username": "******",
      "retweet_count": "0"
    }
  ]

  with MessageBusConnector() as messageBus:
    messageBus.publishExg(
      exchange=self.config.get("non_metric_data", "exchange_name"),
      routingKey=(self.config.get("non_metric_data", "exchange_name") +
                  ".twitter"),
      body=json.dumps(twitterData))

  metricTweetsTable = Table(MetricTweetsDynamoDBDefinition().tableName,
                            connection=dynamodb)
  metricTweetItem = metricTweetsTable.lookup(
    "-".join((metricName, uid)), "2015-02-19T19:43:24.870118")
  # There is no server-side cleanup for tweet data, so remove it here for
  # now to avoid accumulating test data
  self.addCleanup(metricTweetItem.delete)
  self.assertEqual(metricTweetItem["username"], twitterData[0]["username"])
  self.assertEqual(metricTweetItem["tweet_uid"], twitterData[0]["tweet_uid"])
  self.assertEqual(metricTweetItem["created_at"],
                   twitterData[0]["created_at"])
  self.assertEqual(metricTweetItem["agg_ts"], twitterData[0]["agg_ts"])
  self.assertEqual(metricTweetItem["text"], twitterData[0]["text"])
  self.assertEqual(metricTweetItem["userid"], twitterData[0]["userid"])
  self.assertEqual(metricTweetItem["retweet_count"],
                   twitterData[0]["retweet_count"])

  # The same item should also be reachable through the secondary index
  queryResult = metricTweetsTable.query_2(
    metric_name__eq=metricName,
    agg_ts__eq=twitterData[0]["agg_ts"],
    index="taurus.metric_data-metric_name_index")
  queriedMetricTweetItem = next(queryResult)
  self.assertEqual(queriedMetricTweetItem["username"],
                   twitterData[0]["username"])
  self.assertEqual(queriedMetricTweetItem["tweet_uid"],
                   twitterData[0]["tweet_uid"])
  self.assertEqual(queriedMetricTweetItem["created_at"],
                   twitterData[0]["created_at"])
  self.assertEqual(queriedMetricTweetItem["agg_ts"],
                   twitterData[0]["agg_ts"])
  self.assertEqual(queriedMetricTweetItem["text"], twitterData[0]["text"])
  self.assertEqual(queriedMetricTweetItem["userid"],
                   twitterData[0]["userid"])
  self.assertEqual(queriedMetricTweetItem["retweet_count"],
                   twitterData[0]["retweet_count"])

  # Delete the metric and ensure that it is deleted from dynamodb, too
  self._deleteMetric(metricName)
  for _ in xrange(60):
    time.sleep(1)
    try:
      metricTable.lookup(uid)
    except ItemNotFound:
      break
  else:
    self.fail("Metric not deleted from dynamodb within 60 seconds")
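# A minimal sketch of the _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR decorator
# used in the lookups above -- hypothetical, since the real definition lives
# elsewhere in this test module. It assumes nupic's generic retry decorator
# and simply re-attempts a lookup while dynamodb has not yet caught up with a
# recently written item; the timeout values here are illustrative.
from boto.dynamodb2.exceptions import ItemNotFound
from nupic.support.decorators import retry

_RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR = retry(
  timeoutSec=25, initialRetryDelaySec=0.5, maxRetryDelaySec=2,
  retryExceptions=(ItemNotFound,))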