Example #1
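The Aggregator class below consumes batches of SQS messages and groups their event rows into per-partition, per-schema sets that can later be written out as S3 key lists.
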
import logging
from collections import OrderedDict

from six import string_types  # six provides string_types across Python 2/3

# Project-local dependencies assumed by this example (import paths depend on
# the surrounding project): util, c (constants), metric_schema, Partitioner,
# Order, CompressionClassFactory, SensitivityClassFactory, PayloadClassFactory

class Aggregator(object):
    """
        Aggregate events of multiple different SQS messages into S3 key lists.
    """
    def __init__(self, context, partition_set):
        self.__aggregation_sets = partition_set
        self.__aggregation_sets[c.KEY_TABLES] = {}
        self.__partitioner = Partitioner(context[c.KEY_PARTITIONS],
                                         context[c.KEY_SEPERATOR_PARTITION])
        self.__context = context
        self.__info = {}
        self.__info[c.INFO_TOTAL_BYTES] = 0
        self.__info[c.INFO_TOTAL_ROWS] = 0
        self.__info[c.INFO_TOTAL_MESSAGES] = 0
        self.__info[c.INFO_EVENTS] = {}
        self.__logger = logging.getLogger()
        self.__logger.setLevel(logging.ERROR)

    @property
    def bytes_uncompressed(self):
        return self.__info[c.INFO_TOTAL_BYTES]

    @property
    def rows(self):
        return self.__info[c.INFO_TOTAL_ROWS]

    @property
    def messages(self):
        return self.__info[c.INFO_TOTAL_MESSAGES]

    @property
    def events(self):
        return self.__info[c.INFO_EVENTS]

    @property
    def info(self):
        return self.__info

    def append_default_metrics_and_partition(self, messages):
        util.debug_print("Processing {} messages.".format(len(messages)))
        self.increment(self.__info, c.INFO_TOTAL_MESSAGES, len(messages))
        for message in messages:
            self.process_message(message)

    def process_message(self, message):
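        # Resolve the compression, sensitivity, and payload handlers declared
        # in the message attributes, then decode the body and hand each row to
        # self.partition via the payload type.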
        compression_mode = CompressionClassFactory.instance(
            message[c.SQS_PARAM_MESSAGE_ATTRIBUTES][
                c.SQS_PARAM_COMPRESSION_TYPE]['StringValue'])
        body = compression_mode.extract_message_body(message)
        attempts = int(message['Attributes']['ApproximateReceiveCount'])
        sensitivity_type = SensitivityClassFactory.instance(
            message[c.SQS_PARAM_MESSAGE_ATTRIBUTES][
                c.SQS_PARAM_SENSITIVITY_TYPE]['StringValue'])
        payload_type = PayloadClassFactory.instance(
            self.__context, message[c.SQS_PARAM_MESSAGE_ATTRIBUTES][
                c.SQS_PARAM_PAYLOAD_TYPE]['StringValue'], compression_mode,
            sensitivity_type)

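        # Combine the message id and receipt handle into a single token used
        # to track which messages contributed to each schema set.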
        msg_token = "{}{}{}".format(message['MessageId'],
                                    self.__context[c.KEY_SEPERATOR_CSV],
                                    message['ReceiptHandle'])
        if attempts > self.__context[c.KEY_MAX_MESSAGE_RETRY]:
            self.__logger.error(
                "The message with token {} has been processed {} times.".format(
                    msg_token, attempts))
        self.increment(self.__info, c.INFO_TOTAL_BYTES, len(body))

        payload_type.to_partitions(msg_token, body, self.partition,
                                   sensitivity_type,
                                   self.__partitioner.partitions)

    def partition(self, token, row, sensitivity_type):
        # Schema hash: a sorted, stringified list of column names uniquely
        # identifies the row's schema, so rows with identical columns are
        # aggregated together.
        columns = sorted(
            i if isinstance(i, string_types) else str(i) for i in row.keys())
        schema_hash = hash(str(columns))
        event_name = row[metric_schema.EVENT.id]
        uuid_key = "{}{}{}".format(row[metric_schema.UUID.id],
                                   row[metric_schema.EVENT.id],
                                   row[metric_schema.SERVER_TIMESTAMP.id])
        # Create the key here, because the partitioner may remove an attribute
        # from the row if that attribute is used as a partition.
        tablename, partition = self.__partitioner.extract(
            schema_hash, row, sensitivity_type)
        columns, row = self.order_and_map_to_long_name(row)

        self.increment_detailed_cloudwatch_event_information(event_name)

        if partition is None:
            self.__logger.error("Dropping metric\n{}".format(row))
            return

        if partition not in self.__aggregation_sets:
            # Need to use an immutable object as required by fastparquet for
            # hashing.
            self.__aggregation_sets[partition] = {}

        if tablename not in self.__aggregation_sets[c.KEY_TABLES]:
            self.__aggregation_sets[c.KEY_TABLES][tablename] = tablename

        partition_dict = self.__aggregation_sets[partition]
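        # Rows are keyed by uuid_key, so duplicate deliveries of the same event
        # overwrite an existing entry instead of being double-counted.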
        if schema_hash not in partition_dict:
            partition_dict[schema_hash] = {}
            partition_dict[schema_hash][c.KEY_SET] = {}
            partition_dict[schema_hash][c.KEY_SET_COLUMNS] = columns
        partition_dict[schema_hash][c.KEY_SET][uuid_key] = row

        self.register_processed_message(partition_dict[schema_hash], token)

    def increment_detailed_cloudwatch_event_information(self, event_name):
        if self.__context.get(c.KEY_WRITE_DETAILED_CLOUDWATCH_EVENTS, False):
            self.increment(self.events, event_name, 1)

    def register_processed_message(self, schema_dict, msg_token):
        # Track which messages have been processed.
        if c.KEY_MSG_IDS not in schema_dict:
            schema_dict[c.KEY_MSG_IDS], schema_dict[
                c.KEY_APPENDER] = self.get_new_list_append_handler()

        if msg_token not in schema_dict[c.KEY_MSG_IDS]:
            schema_dict[c.KEY_APPENDER](msg_token)

    def get_new_list_append_handler(self):
        # Return a new list together with its bound append method so callers
        # avoid repeated attribute lookups in tight loops.
        new_list = []
        return new_list, new_list.append

    def increment(self, counters, key, value):
        # Initialize missing keys to zero so the first increment is not
        # double-counted.
        if key not in counters:
            counters[key] = 0
        counters[key] += value

    def order_and_map_to_long_name(self, row):
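        # Map each short field name to its long name (when defined in
        # metric_schema.DICTIONARY) while preserving the column ordering
        # produced by Order.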
        orderer = Order()
        ordered_columns = orderer.order_columns(row)
        ordered_dict = OrderedDict()
        ordered_columns_long_name = []
        for field in ordered_columns:
            if field not in row:
                continue
            if field in metric_schema.DICTIONARY:
                name = metric_schema.DICTIONARY[field].long_name
            else:
                name = field
            ordered_dict[name] = row[field]
            ordered_columns_long_name.append(name)

        return ordered_columns_long_name, ordered_dict
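
A minimal usage sketch (not part of the original class), assuming a hypothetical
context dictionary and a batch of SQS messages already received via boto3:

def aggregate_batch(context, sqs_messages):
    # partition_set accumulates the per-partition, per-schema row sets.
    partition_set = {}
    aggregator = Aggregator(context, partition_set)
    aggregator.append_default_metrics_and_partition(sqs_messages)
    print("Processed {} messages totalling {} uncompressed bytes".format(
        aggregator.messages, aggregator.bytes_uncompressed))
    return partition_set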