Example #1
def lambda_handler(*args, **kwargs):
    # Inner wrapper of what appears to be a decorator: `func`, `event_types`,
    # `batch_size` and `chunks` are expected to come from the enclosing scope
    # (not shown in this snippet).
    event = args[0]
    context = args[1]
    aws_lambda_logging.setup(level=os.environ.get('LOGLEVEL', 'INFO'),
                             aws_request_id=context.aws_request_id,
                             boto_level='CRITICAL')
    received_raw_kinesis_records = event['Records']
    for raw_kinesis_records in chunks(
            iter_deaggregate_records(received_raw_kinesis_records),
            batch_size):
        kinesis_records: list = []
        for raw_kinesis_record in raw_kinesis_records:
            kinesis_record = KinesisRecord(raw_kinesis_record)
            if kinesis_record.is_any_of(event_types):
                kinesis_records.append(kinesis_record)
        if kinesis_records:
            log.info({
                "Action": "Processing",
                "Events": [kinesis_record.get_type()
                           for kinesis_record in kinesis_records]
            })
            results = func(kinesis_records, context)
            if results:
                log.info({"Results": results})
Example #2
def handler(event, context):
    """
    Logs every record in the stream
    """
    aws_lambda_logging.setup(level=os.environ.get('LOGLEVEL', 'INFO'),
                             aws_request_id=context.aws_request_id,
                             boto_level='CRITICAL')
    raw_kinesis_records = event['Records']
    for kinesis_record in iter_deaggregate_records(raw_kinesis_records):
        try:
            # parse the payload as JSON; leave the raw base64 string untouched
            # if it is not valid JSON
            kinesis_record['kinesis']['data'] = json.loads(
                base64.b64decode(kinesis_record['kinesis']['data']))
        except json.JSONDecodeError:
            pass
        log.info(kinesis_record)
Example #3
def lambda_generator_handler(event, context):
    """A Python AWS Lambda function to process Kinesis aggregated
    records in a generator-based fashion."""

    raw_kinesis_records = event['Records']
    record_count = 0

    # Deaggregate all records using a generator function
    for record in iter_deaggregate_records(raw_kinesis_records):

        # Kinesis data in Python Lambdas is base64 encoded
        payload = base64.b64decode(record['kinesis']['data'])
        print('%s' % payload)
        record_count += 1

    return 'Successfully processed {} records.'.format(record_count)
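A quick way to exercise a handler like this locally is to hand it a synthetic event. The record below is a plain, non-aggregated Kinesis record, which iter_deaggregate_records should pass through unchanged; the event shape and payload here are assumptions for illustration only:

import base64
import json

# Hypothetical test event: one non-aggregated Kinesis record carrying a JSON payload.
test_event = {
    "Records": [{
        "kinesis": {
            "data": base64.b64encode(json.dumps({"hello": "world"}).encode()).decode()
        }
    }]
}

print(lambda_generator_handler(test_event, context=None))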
Example #4
def lambda_generator_handler(event, context):
    '''A Python AWS Lambda function to process Kinesis aggregated
    records in a generator-based fashion.'''

    raw_kinesis_records = event['Records']
    record_count = 0

    # Deaggregate all records using a generator function
    for record in iter_deaggregate_records(raw_kinesis_records):

        # Kinesis data in Python Lambdas is base64 encoded
        payload = base64.b64decode(record['kinesis']['data'])
        print('%s' % payload)
        record_count += 1

    return 'Successfully processed {} records.'.format(record_count)
Example #5
def handler(event, context):
    """
    Invokes an AWS queue depending on the company's name.
    """
    raw_kinesis_records = event['Records']

    payload = None
    for record in iter_deaggregate_records(raw_kinesis_records):
        # Kinesis data in Python Lambdas is base64 encoded
        b_rec = base64.b64decode(record['kinesis']['data'])

        payload = loads(b_rec, single_value=True)['payload']

    try:
        company = payload['revision']['data']['company']
        queue_map[company]()
        print('Message sent!')
    except KeyError:
        print('Such company does not exist!')
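queue_map is not defined in this example; judging by the call site it maps a company name to a callable that notifies that company's queue. A hypothetical sketch, with queue URLs and company names as invented placeholders:

import boto3

sqs = boto3.client('sqs')

# Hypothetical mapping: company name -> callable that sends a message to its queue.
queue_map = {
    'acme': lambda: sqs.send_message(
        QueueUrl='https://sqs.eu-west-1.amazonaws.com/123456789012/acme-queue',
        MessageBody='new revision received'),
    'globex': lambda: sqs.send_message(
        QueueUrl='https://sqs.eu-west-1.amazonaws.com/123456789012/globex-queue',
        MessageBody='new revision received'),
}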
Example #6
def parse_records(raw_records: list) -> Generator[str, None, None]:
    """
    Generator that de-aggregates, decodes, gzip decompresses Kinesis Records

    :param raw_records: Raw Kinesis records (usually event['Records'] in Lambda handler function)
    :return:
    """
    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"Raw Kinesis record: {record}")

        # Kinesis data is base64 encoded
        raw_data = base64.b64decode(record["kinesis"]["data"])

        # decompress data if raw data is gzip (log data from CloudWatch Logs subscription filters comes gzipped)
        # gzip magic number: 0x1f 0x8b
        if raw_data[0] == 0x1F and raw_data[1] == 0x8B:
            raw_data = gzip.decompress(raw_data)

        data = raw_data.decode()
        payloads = normalize_cloudwatch_messages(data)
        logger.debug(f"Normalized payloads: {payloads}")

        for payload in payloads:
            yield payload
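A handler built on top of this generator only has to iterate over it; the consumer below is an assumed usage, not part of the original example:

def handler(event, context):
    # Hypothetical consumer of parse_records(): logs every decoded payload and
    # reports how many were seen.
    count = 0
    for payload in parse_records(event['Records']):
        logger.info(payload)
        count += 1
    return f"Processed {count} payloads"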
Example #7
def decode_validate(raw_records: list):
    xray_recorder.begin_subsegment('decode and validate')

    log_dict = dict()

    processed_records = 0

    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"raw Kinesis record: {record}")
        # Kinesis data is base64 encoded
        decoded_data = base64.b64decode(record['kinesis']['data'])

        # check if base64 contents is gzip
        # gzip magic number 0x1f 0x8b
        if decoded_data[0] == 0x1f and decoded_data[1] == 0x8b:
            decoded_data = gzip.decompress(decoded_data)

        decoded_data = decoded_data.decode()
        normalized_payloads = normalize_kinesis_payload(decoded_data)
        logger.debug(f"Normalized payloads: {normalized_payloads}")

        for normalized_payload in normalized_payloads:
            logger.debug(f"Parsing normalized payload: {normalized_payload}")

            processed_records += 1

            # check if log type field is available
            try:
                log_type = normalized_payload[LOG_TYPE_FIELD]

            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TYPE_FIELD}\" "
                    f"from payload: {normalized_payload}")
                log_type = f"{LOG_TYPE_UNKNOWN_PREFIX}/unknown_type"
                logger.error(f"Marking as {log_type}")

            # check if timestamp is present
            try:
                timestamp = normalized_payload[LOG_TIMESTAMP_FIELD]

            except KeyError:
                logger.error(
                    f"Cannot retrieve recommended field \"{LOG_TIMESTAMP_FIELD}\" "
                    f"from payload: {normalized_payload}")
                timestamp = None

            try:
                log_id = normalized_payload[LOG_ID_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve recommended field \"{LOG_ID_FIELD}\" "
                    f"from payload: {normalized_payload}")
                log_id = None

            # valid data
            append_to_dict(log_dict,
                           log_type,
                           normalized_payload,
                           log_timestamp=timestamp,
                           log_id=log_id)

    logger.info(f"Processed {processed_records} records from Kinesis")
    xray_recorder.end_subsegment()
    return log_dict
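append_to_dict is referenced but not shown in these examples; a minimal sketch of what such a helper might do, inferred purely from how it is called, is:

def append_to_dict(log_dict: dict, log_type: str, payload: dict,
                   log_timestamp=None, log_id=None):
    # Hypothetical helper: groups payloads by log type; the optional timestamp
    # and id are attached to the payload when present.
    if log_timestamp is not None:
        payload = {**payload, 'log_timestamp': log_timestamp}
    if log_id is not None:
        payload = {**payload, 'log_id': log_id}
    log_dict.setdefault(log_type, []).append(payload)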
Example #8
def decode_validate(raw_records: list):
    xray_recorder.begin_subsegment('decode and validate')

    log_dict = dict()

    processed_records = 0

    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"raw Kinesis record: {record}")
        # Kinesis data is base64 encoded
        decoded_data = base64.b64decode(record['kinesis']['data'])

        # check if base64 contents is gzip
        # gzip magic number 0x1f 0x8b
        if decoded_data[0] == 0x1f and decoded_data[1] == 0x8b:
            decoded_data = gzip.decompress(decoded_data)

        decoded_data = decoded_data.decode()
        normalized_payloads = normalize_kinesis_payload(decoded_data)
        logger.debug(f"Normalized payloads: {normalized_payloads}")

        for normalized_payload in normalized_payloads:
            logger.debug(f"Parsing normalized payload: {normalized_payload}")

            processed_records += 1

            # get log id when available
            # log_id = normalized_payload.setdefault(LOG_ID_FIELD, None)
            # Notice: log_id is not used in this Lambda

            # check if log type field is available
            try:
                log_type = normalized_payload[LOG_TYPE_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TYPE_FIELD}\" from payload, "
                    f"skipping: {normalized_payload}")
                continue

            # check if timestamp is present
            try:
                timestamp = normalized_payload[LOG_TIMESTAMP_FIELD]

            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TIMESTAMP_FIELD}\" from payload, "
                    f"skipping: {normalized_payload}")
                log_type += "_no_timestamp"
                logger.error(f"Re-marking as {log_type} and giving up")
                append_to_dict(log_dict, log_type, normalized_payload)
                continue

            # valid data
            append_to_dict(log_dict,
                           log_type,
                           normalized_payload,
                           log_timestamp=timestamp)

    logger.info(f"Processed {processed_records} records from Kinesis")
    xray_recorder.end_subsegment()
    return log_dict
Example #9
def handler(event, context):
    raw_records = event['Records']
    actions = []
    es = Elasticsearch(hosts=[{
        'host': ELASTICSEARCH_HOST,
        'port': 443
    }],
                       http_auth=aws_auth,
                       use_ssl=True,
                       verify_certs=True,
                       connection_class=RequestsHttpConnection,
                       timeout=ES_TIMEOUT,
                       ca_certs=certifi.where())

    logger.info(f"Connected to Elasticsearch at https://{ELASTICSEARCH_HOST}")

    failed_data_type = []
    failed_data_timestamp = []
    failed_data_es = []

    processed_records = 0

    subsegment = xray_recorder.begin_subsegment('parse_records')
    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"raw Kinesis record: {record}")
        # Kinesis data is base64 encoded
        decoded_data = base64.b64decode(record['kinesis']['data'])

        # check if base64 contents is gzip
        # gzip magic number 0x1f 0x8b
        if decoded_data[0] == 0x1f and decoded_data[1] == 0x8b:
            decoded_data = gzip.decompress(decoded_data)

        decoded_data = decoded_data.decode()
        normalized_payloads = normalize_kinesis_payload(decoded_data)
        logger.debug(f"Normalized payloads: {normalized_payloads}")

        for normalized_payload in normalized_payloads:
            logger.debug(f"Parsing normalized payload: {normalized_payload}")

            processed_records += 1

            # check if log type field is available
            try:
                log_type = normalized_payload[LOG_TYPE_FIELD]

            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TYPE_FIELD}\" "
                    f"from payload: {normalized_payload}")
                logger.error(f"Will save failed record to S3")
                failed_data_type.append(normalized_payload)
                continue

            # apply the log type whitelist
            if LOG_TYPE_FIELD_WHITELIST and log_type not in LOG_TYPE_FIELD_WHITELIST:
                logger.debug(f"Skipping ignored log_type: {log_type}")
                continue

            # check if timestamp is present
            try:
                timestamp = normalized_payload[LOG_TIMESTAMP_FIELD]
                timestamp = dateutil.parser.parse(timestamp)

            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TIMESTAMP_FIELD}\" "
                    f"from payload: {normalized_payload}")
                logger.error(f"Will save failed record to S3")
                failed_data_timestamp.append(normalized_payload)
                continue

            # valid data
            date = datetime.datetime.strftime(timestamp, "%Y%m%d")
            index = f"{INDEX_NAME_PREFIX}-{log_type}-{date}"

            # redundant guard: the whitelist was already applied above
            if LOG_TYPE_FIELD_WHITELIST and log_type not in LOG_TYPE_FIELD_WHITELIST:
                logger.info(
                    f"Log type {log_type} not in whitelist {LOG_TYPE_FIELD_WHITELIST}")
                continue

            actions.append({
                "_index": index,
                "_type": "_doc",
                "_source": normalized_payload
            })

    logger.info(f"Processed {processed_records} records from Kinesis")
    subsegment.put_annotation("processed_records", processed_records)
    xray_recorder.end_subsegment()

    subsegment = xray_recorder.begin_subsegment('Elasticsearch push')
    subsegment.put_annotation("total_actions", len(actions))
    # good logs save
    if len(actions) > 0:
        logger.info(
            f"Pushing {len(actions)} actions generated from Kinesis records to Elasticsearch Bulk API"
        )

        for i in range(0, len(actions), BULK_CHUNK_SIZE):
            chunk_subsegment = xray_recorder.begin_subsegment(
                'Elasticsearch push chunk')
            actions_chunk = actions[i:i + BULK_CHUNK_SIZE]

            chunk_subsegment.put_annotation("chunk_number",
                                            int(i / BULK_CHUNK_SIZE + 1))
            chunk_subsegment.put_annotation("chunk_size", len(actions_chunk))
            logger.info(
                f"Sending chunk no. {int(i / BULK_CHUNK_SIZE + 1)} of {len(actions_chunk)} actions"
            )

            try:
                # make sure there will be only one internal chunk/batch
                helpers.bulk(es, actions_chunk, chunk_size=len(actions_chunk))

            except BulkIndexError as e:
                logger.info(
                    f"Got {len(e.errors)} failed actions from Elasticsearch Bulk API"
                )
                failed_data_es += e.errors

            xray_recorder.end_subsegment()

    else:
        logger.info("Nothing to flush")
    xray_recorder.end_subsegment()

    bad_data_save(failed_data_type,
                  reason=f"missing log type field {LOG_TYPE_FIELD}")
    bad_data_save(failed_data_timestamp,
                  reason=f"missing timestamp in field {LOG_TIMESTAMP_FIELD}")
    bad_data_save(failed_data_es, reason="rejected by Elasticsearch")

    logger.info(f"Finished")