def lambda_handler(*args, **kwargs):
    event = args[0]
    context = args[1]
    aws_lambda_logging.setup(level=os.environ.get('LOGLEVEL', 'INFO'),
                             aws_request_id=context.aws_request_id,
                             boto_level='CRITICAL')

    received_raw_kinesis_records = event['Records']

    for raw_kinesis_records in chunks(
            iter_deaggregate_records(received_raw_kinesis_records),
            batch_size):
        kinesis_records: list = []

        for raw_kinesis_record in raw_kinesis_records:
            kinesis_record = KinesisRecord(raw_kinesis_record)

            if kinesis_record.is_any_of(event_types):
                kinesis_records.append(kinesis_record)

        if kinesis_records:
            log.info({
                "Action": "Processing",
                "Events": [
                    kinesis_record.get_type()
                    for kinesis_record in kinesis_records
                ]
            })
            results = func(kinesis_records, context)

            if results:
                log.info({"Results": results})
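This wrapper relies on names that are not defined in the snippet: `func`, `event_types` and `batch_size` presumably come from an enclosing decorator, and `chunks` groups the deaggregated record stream into bounded batches. A minimal sketch of what such a `chunks` helper could look like, assuming it only has to split any iterable into lists of at most `batch_size` items (the helper itself is hypothetical, not part of the original example):

from itertools import islice

def chunks(iterable, batch_size):
    # Hypothetical helper: lazily group an iterable (e.g. the deaggregated
    # record generator) into lists of at most `batch_size` items so the
    # wrapped handler always sees bounded batches.
    iterator = iter(iterable)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            return
        yield batch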
def handler(event, context):
    """
    Logs every record in the stream
    """
    aws_lambda_logging.setup(level=os.environ.get('LOGLEVEL', 'INFO'),
                             aws_request_id=context.aws_request_id,
                             boto_level='CRITICAL')

    raw_kinesis_records = event['Records']

    for kinesis_record in iter_deaggregate_records(raw_kinesis_records):
        try:
            kinesis_record['kinesis']['data'] = json.loads(
                base64.b64decode(kinesis_record['kinesis']['data']))
        except json.JSONDecodeError:
            pass

        log.info(kinesis_record)
def lambda_generator_handler(event, context):
    """A Python AWS Lambda function to process Kinesis aggregated
    records in a generator-based fashion."""
    raw_kinesis_records = event['Records']
    record_count = 0

    # Deaggregate all records using a generator function
    for record in iter_deaggregate_records(raw_kinesis_records):
        # Kinesis data in Python Lambdas is base64 encoded
        payload = base64.b64decode(record['kinesis']['data'])
        print('%s' % payload)
        record_count += 1

    return 'Successfully processed {} records.'.format(record_count)
def handler(event, context):
    """
    Invokes the AWS queue corresponding to the company's name.
    """
    raw_kinesis_records = event['Records']
    payload = None

    for record in iter_deaggregate_records(raw_kinesis_records):
        # Kinesis data in Python Lambdas is base64 encoded
        b_rec = base64.b64decode(record['kinesis']['data'])
        payload = loads(b_rec, single_value=True)['payload']

        try:
            company = payload['revision']['data']['company']
            queue_map[company]()
            print('Message sent!')
        except KeyError:
            print('Such company does not exist!')
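`queue_map` is not defined in this example; the calling code suggests it maps a company name to a zero-argument callable that sends a message to that company's queue. A minimal sketch under that assumption, using boto3 SQS (the company names and queue URLs below are hypothetical placeholders, not values from the original):

import functools

import boto3

sqs = boto3.client('sqs')

def _send(queue_url):
    # Hypothetical helper: send a fixed notification to the given queue.
    sqs.send_message(QueueUrl=queue_url, MessageBody='new revision received')

# Hypothetical mapping of company name -> callable that notifies its queue.
queue_map = {
    'acme': functools.partial(
        _send, 'https://sqs.us-east-1.amazonaws.com/123456789012/acme-queue'),
    'globex': functools.partial(
        _send, 'https://sqs.us-east-1.amazonaws.com/123456789012/globex-queue'),
}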
def parse_records(raw_records: list) -> Generator[str, None, None]:
    """
    Generator that de-aggregates, decodes, and gzip-decompresses Kinesis records

    :param raw_records: Raw Kinesis records (usually event['Records'] in a Lambda handler function)
    :return: Yields one normalized payload string per decoded record
    """
    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"Raw Kinesis record: {record}")

        # Kinesis data is base64 encoded
        raw_data = base64.b64decode(record["kinesis"]["data"])

        # decompress data if raw data is gzip
        # (log data from CloudWatch Logs subscription filters comes gzipped)
        # gzip magic number: 0x1f 0x8b
        if raw_data[0] == 0x1F and raw_data[1] == 0x8B:
            raw_data = gzip.decompress(raw_data)

        data = raw_data.decode()
        payloads = normalize_cloudwatch_messages(data)
        logger.debug(f"Normalized payloads: {payloads}")

        for payload in payloads:
            yield payload
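One way such a generator could be consumed from a Lambda handler, assuming each yielded payload is a JSON string (the handler below and its JSON-parsing step are illustrative, not part of the original example):

import json

def handler(event, context):
    # Illustrative consumer: iterate the generator lazily so only one
    # decoded payload is held in memory at a time.
    count = 0
    for payload in parse_records(event['Records']):
        document = json.loads(payload)  # assumes payloads are JSON strings
        logger.info(document)
        count += 1
    return f"Processed {count} payloads"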
def decode_validate(raw_records: list):
    xray_recorder.begin_subsegment('decode and validate')
    log_dict = dict()
    processed_records = 0

    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"raw Kinesis record: {record}")

        # Kinesis data is base64 encoded
        decoded_data = base64.b64decode(record['kinesis']['data'])

        # check if base64 contents is gzip
        # gzip magic number 0x1f 0x8b
        if decoded_data[0] == 0x1f and decoded_data[1] == 0x8b:
            decoded_data = gzip.decompress(decoded_data)

        decoded_data = decoded_data.decode()
        normalized_payloads = normalize_kinesis_payload(decoded_data)
        logger.debug(f"Normalized payloads: {normalized_payloads}")

        for normalized_payload in normalized_payloads:
            logger.debug(f"Parsing normalized payload: {normalized_payload}")
            processed_records += 1

            # check if log type field is available
            try:
                log_type = normalized_payload[LOG_TYPE_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TYPE_FIELD}\" "
                    f"from payload: {normalized_payload}")
                log_type = f"{LOG_TYPE_UNKNOWN_PREFIX}/unknown_type"
                logger.error(f"Marking as {log_type}")

            # check if timestamp is present
            try:
                timestamp = normalized_payload[LOG_TIMESTAMP_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve recommended field \"{LOG_TIMESTAMP_FIELD}\" "
                    f"from payload: {normalized_payload}")
                timestamp = None

            try:
                log_id = normalized_payload[LOG_ID_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve recommended field \"{LOG_ID_FIELD}\" "
                    f"from payload: {normalized_payload}")
                log_id = None

            # valid data
            append_to_dict(log_dict,
                           log_type,
                           normalized_payload,
                           log_timestamp=timestamp,
                           log_id=log_id)

    logger.info(f"Processed {processed_records} records from Kinesis")
    xray_recorder.end_subsegment()

    return log_dict
def decode_validate(raw_records: list):
    xray_recorder.begin_subsegment('decode and validate')
    log_dict = dict()
    processed_records = 0

    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"raw Kinesis record: {record}")

        # Kinesis data is base64 encoded
        decoded_data = base64.b64decode(record['kinesis']['data'])

        # check if base64 contents is gzip
        # gzip magic number 0x1f 0x8b
        if decoded_data[0] == 0x1f and decoded_data[1] == 0x8b:
            decoded_data = gzip.decompress(decoded_data)

        decoded_data = decoded_data.decode()
        normalized_payloads = normalize_kinesis_payload(decoded_data)
        logger.debug(f"Normalized payloads: {normalized_payloads}")

        for normalized_payload in normalized_payloads:
            logger.debug(f"Parsing normalized payload: {normalized_payload}")
            processed_records += 1

            # get log id when available
            # log_id = normalized_payload.setdefault(LOG_ID_FIELD, None)
            # Notice: log_id is not used in this Lambda

            # check if log type field is available
            try:
                log_type = normalized_payload[LOG_TYPE_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TYPE_FIELD}\" from payload, "
                    f"skipping: {normalized_payload}")
                continue

            # check if timestamp is present
            try:
                timestamp = normalized_payload[LOG_TIMESTAMP_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TIMESTAMP_FIELD}\" from payload, "
                    f"skipping: {normalized_payload}")
                log_type += "_no_timestamp"
                logger.error(f"Re-marking as {log_type} and giving up")
                append_to_dict(log_dict, log_type, normalized_payload)
                continue

            # valid data
            append_to_dict(log_dict,
                           log_type,
                           normalized_payload,
                           log_timestamp=timestamp)

    logger.info(f"Processed {processed_records} records from Kinesis")
    xray_recorder.end_subsegment()

    return log_dict
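Both decode_validate variants group payloads with append_to_dict, which is not shown in these examples. A plausible minimal implementation, assuming it simply buckets each payload under its log type and attaches the optional timestamp and id when they are provided (the field names written into the payload are assumptions, not taken from the original):

def append_to_dict(log_dict, log_type, payload, log_timestamp=None, log_id=None):
    # Hypothetical helper, assumed behaviour: bucket each payload under its
    # log type so the caller can later flush one batch per type.
    if log_timestamp is not None:
        payload = {**payload, "log_timestamp": log_timestamp}
    if log_id is not None:
        payload = {**payload, "log_id": log_id}
    log_dict.setdefault(log_type, []).append(payload)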
def handler(event, context):
    raw_records = event['Records']
    actions = []

    es = Elasticsearch(hosts=[{
        'host': ELASTICSEARCH_HOST,
        'port': 443
    }],
                       http_auth=aws_auth,
                       use_ssl=True,
                       verify_certs=True,
                       connection_class=RequestsHttpConnection,
                       timeout=ES_TIMEOUT,
                       ca_certs=certifi.where())

    logger.info(f"Connected to Elasticsearch at https://{ELASTICSEARCH_HOST}")

    failed_data_type = []
    failed_data_timestamp = []
    failed_data_es = []

    processed_records = 0

    subsegment = xray_recorder.begin_subsegment('parse_records')

    for record in iter_deaggregate_records(raw_records):
        logger.debug(f"raw Kinesis record: {record}")

        # Kinesis data is base64 encoded
        decoded_data = base64.b64decode(record['kinesis']['data'])

        # check if base64 contents is gzip
        # gzip magic number 0x1f 0x8b
        if decoded_data[0] == 0x1f and decoded_data[1] == 0x8b:
            decoded_data = gzip.decompress(decoded_data)

        decoded_data = decoded_data.decode()
        normalized_payloads = normalize_kinesis_payload(decoded_data)
        logger.debug(f"Normalized payloads: {normalized_payloads}")

        for normalized_payload in normalized_payloads:
            logger.debug(f"Parsing normalized payload: {normalized_payload}")
            processed_records += 1

            # check if log type field is available
            try:
                log_type = normalized_payload[LOG_TYPE_FIELD]
            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TYPE_FIELD}\" "
                    f"from payload: {normalized_payload}")
                logger.error("Will save failed record to S3")
                failed_data_type.append(normalized_payload)
                continue

            # apply whitelist
            if len(LOG_TYPE_FIELD_WHITELIST) != 0 and log_type not in LOG_TYPE_FIELD_WHITELIST:
                logger.debug(f"Skipping ignored log_type: {log_type}")
                continue

            # check if timestamp is present
            try:
                timestamp = normalized_payload[LOG_TIMESTAMP_FIELD]
                timestamp = dateutil.parser.parse(timestamp)
            except KeyError:
                logger.error(
                    f"Cannot retrieve necessary field \"{LOG_TIMESTAMP_FIELD}\" "
                    f"from payload: {normalized_payload}")
                logger.error("Will save failed record to S3")
                failed_data_timestamp.append(normalized_payload)
                continue

            # valid data
            date = datetime.datetime.strftime(timestamp, "%Y%m%d")
            index = f"{INDEX_NAME_PREFIX}-{log_type}-{date}"

            actions.append({
                "_index": index,
                "_type": "_doc",
                "_source": normalized_payload
            })

    logger.info(f"Processed {processed_records} records from Kinesis")
    subsegment.put_annotation("processed_records", processed_records)
    xray_recorder.end_subsegment()

    subsegment = xray_recorder.begin_subsegment('Elasticsearch push')
    subsegment.put_annotation("total_actions", len(actions))

    # good logs save
    if len(actions) > 0:
        logger.info(
            f"Pushing {len(actions)} actions generated from Kinesis records to Elasticsearch Bulk API"
        )

        for i in range(0, len(actions), BULK_CHUNK_SIZE):
            chunk_subsegment = xray_recorder.begin_subsegment('Elasticsearch push chunk')
            actions_chunk = actions[i:i + BULK_CHUNK_SIZE]
            chunk_subsegment.put_annotation("chunk_number", int(i / BULK_CHUNK_SIZE + 1))
            chunk_subsegment.put_annotation("chunk_size", len(actions_chunk))

            logger.info(
                f"Sending chunk no. {int(i / BULK_CHUNK_SIZE + 1)} of {len(actions_chunk)} actions"
            )

            try:
                # make sure there will be only one internal chunk/batch
                helpers.bulk(es, actions_chunk, chunk_size=len(actions_chunk))
            except BulkIndexError as e:
                logger.info(
                    f"Got {len(e.errors)} failed actions from Elasticsearch Bulk API"
                )
                failed_data_es += e.errors

            xray_recorder.end_subsegment()
    else:
        logger.info("Nothing to flush")

    xray_recorder.end_subsegment()

    bad_data_save(failed_data_type,
                  reason=f"missing log type field {LOG_TYPE_FIELD}")
    bad_data_save(failed_data_timestamp,
                  reason=f"missing timestamp in field {LOG_TIMESTAMP_FIELD}")
    bad_data_save(failed_data_es, reason="rejected by Elasticsearch")

    logger.info("Finished")
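`bad_data_save` is referenced but not defined in this example; the log messages suggest it persists rejected payloads to S3. A sketch under that assumption (the bucket name and key layout below are hypothetical, not taken from the original code):

import datetime
import json
import uuid

import boto3

s3 = boto3.client('s3')
FAILED_DATA_BUCKET = 'my-failed-log-records'  # hypothetical bucket name

def bad_data_save(failed_payloads, reason):
    # Hypothetical helper: write the batch of rejected payloads to S3 as
    # newline-delimited JSON, keyed by reason and date, so they can be
    # inspected or replayed later.
    if not failed_payloads:
        return
    key = (f"failed/{reason.replace(' ', '_')}/"
           f"{datetime.datetime.utcnow():%Y/%m/%d}/{uuid.uuid4()}.json")
    body = "\n".join(json.dumps(p, default=str) for p in failed_payloads)
    s3.put_object(Bucket=FAILED_DATA_BUCKET, Key=key, Body=body.encode())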