def __init__(self, context, enable_alert_processor=True):
    """
    Args:
        context: An AWS context object which provides metadata on the currently
            executing lambda function.
        enable_alert_processor (bool): If the user wants to send the alerts using
            their own methods, 'enable_alert_processor' can be set to False to
            suppress sending with the StreamAlert alert processor.
    """
    # Load the config. Validation occurs during load, which will
    # raise exceptions on any ConfigErrors
    config = load_config()

    # Load the environment from the context arn
    self.env = load_env(context)

    # Instantiate the sink here to handle sending the triggered alerts to the
    # alert processor
    self.sinker = StreamSink(self.env)

    # Instantiate a classifier that is used for this run
    self.classifier = StreamClassifier(config=config)

    self.enable_alert_processor = enable_alert_processor
    self._failed_record_count = 0
    self._alerts = []
def __init__(self, context): """Initializer Args: context (dict): An AWS context object which provides metadata on the currently executing lambda function. """ # Load the config. Validation occurs during load, which will # raise exceptions on any ConfigError StreamAlert.config = StreamAlert.config or config.load_config(validate=True) # Load the environment from the context arn self.env = config.parse_lambda_arn(context.invoked_function_arn) # Instantiate the send_alerts here to handle sending the triggered alerts to the # alert processor self.alert_forwarder = AlertForwarder() # Instantiate a classifier that is used for this run self.classifier = StreamClassifier(config=self.config) self._failed_record_count = 0 self._processed_record_count = 0 self._processed_size = 0 self._alerts = [] rule_import_paths = [item for location in {'rule_locations', 'matcher_locations'} for item in self.config['global']['general'][location]] # Create an instance of the RulesEngine class that gets cached in the # StreamAlert class as an instance property self._rules_engine = RulesEngine(self.config, *rule_import_paths) # Firehose client attribute self._firehose_client = None
def run(self, event, context):
    """StreamAlert Lambda function handler.

    Loads the configuration for the StreamAlert function which contains:
    available data sources, log formats, parser modes, and sinks.  Classifies
    logs sent into the stream into a parsed type.  Matches records against rules.

    Args:
        event: An AWS event mapped to a specific source/entity (kinesis stream or
            an s3 bucket event) containing data emitted to the stream.
        context: An AWS context object which provides metadata on the currently
            executing lambda function.

    Returns:
        None
    """
    logger.debug('Number of Records: %d', len(event.get('Records', [])))

    config = load_config()
    env = load_env(context)

    for record in event.get('Records', []):
        payload = StreamPayload(raw_record=record)
        classifier = StreamClassifier(config=config)
        classifier.map_source(payload)

        # If the kinesis stream or s3 bucket is not in our config,
        # go on to the next record
        if not payload.valid_source:
            continue

        if payload.service == 's3':
            self.s3_process(payload, classifier)
        elif payload.service == 'kinesis':
            self.kinesis_process(payload, classifier)
        else:
            logger.info('Unsupported service: %s', payload.service)

    # Return the list of generated alerts
    if self.return_alerts:
        return self.alerts

    # Send alerts to SNS
    self.send_alerts(env, payload)
def test_map_source_2(self):
    """Payload Source Mapping 2"""
    data_encoded = base64.b64encode('test_map_source_data_2')
    payload = self.payload_generator(kinesis_stream='test_stream_2',
                                     kinesis_data=data_encoded)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    test_stream_2_logs = {
        'test_log_type_json_2',
        'test_log_type_json_nested_osquery',
        'test_log_type_syslog'
    }
    metadata = classifier.log_metadata(payload)

    # service, entity, metadata test
    assert_equal(payload.service, 'kinesis')
    assert_equal(payload.entity, 'test_stream_2')
    assert_equal(set(metadata.keys()), test_stream_2_logs)
def run(self, event):
    """StreamAlert Lambda function handler.

    Loads the configuration for the StreamAlert function which contains:
    available data sources, log formats, parser modes, and sinks.  Classifies
    logs sent into the stream into a parsed type.  Matches records against rules.

    Args:
        event: An AWS event mapped to a specific source/entity (kinesis stream or
            an s3 bucket event) containing data emitted to the stream.

    Returns:
        None
    """
    LOGGER.debug('Number of Records: %d', len(event.get('Records', [])))

    config = load_config()

    for record in event.get('Records', []):
        payload = StreamPayload(raw_record=record)
        classifier = StreamClassifier(config=config)

        # If the kinesis stream, s3 bucket, or sns topic is not in our config,
        # go on to the next record
        if not classifier.map_source(payload):
            continue

        if payload.service == 's3':
            self._s3_process(payload, classifier)
        elif payload.service == 'kinesis':
            self._kinesis_process(payload, classifier)
        elif payload.service == 'sns':
            self._sns_process(payload, classifier)
        else:
            LOGGER.info('Unsupported service: %s', payload.service)

    LOGGER.debug('%s alerts triggered', len(self.alerts))
    LOGGER.debug('\n%s\n', json.dumps(self.alerts, indent=4))

    if self.return_alerts:
        return self.alerts
def test_rule(self, rule_name, test_record, formatted_record):
    """Feed formatted records into StreamAlert and check for alerts

    Args:
        rule_name [str]: The rule name being tested
        test_record [dict]: A single record to test
        formatted_record [dict]: A dictionary that includes the 'data' from the
            test record, formatted into a structure that resembles how an
            incoming record from a service would be formatted.
            See test/integration/templates for examples of how each service
            formats records.

    Returns:
        [list] alerts that hit for this rule
        [integer] count of expected alerts for this rule
        [bool] boolean where False indicates errors occurred during processing
    """
    event = {'Records': [formatted_record]}

    expected_alert_count = test_record.get('trigger_count')
    if not expected_alert_count:
        expected_alert_count = 1 if test_record['trigger'] else 0

    # Run the rule processor. Passing mocked context object with fake
    # values and False for suppressing sending of alerts
    processor = StreamAlert(self.context, False)
    all_records_matched_schema = processor.run(event)

    if not all_records_matched_schema:
        payload = StreamPayload(raw_record=formatted_record)
        classifier = StreamClassifier(config=load_config())
        classifier.map_source(payload)
        logs = classifier._log_metadata()
        self.analyze_record_delta(logs, rule_name, test_record)

    alerts = processor.get_alerts()

    # we only want alerts for the specific rule being tested
    alerts = [alert for alert in alerts if alert['rule_name'] == rule_name]

    return alerts, expected_alert_count, all_records_matched_schema
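# NOTE: Minimal sketch of a test fixture that test_rule() above could consume.
# The only keys test_rule() actually reads are 'trigger' and the optional
# 'trigger_count'; the remaining fields ('data', 'description', 'service',
# 'source') are assumptions about the fixture layout and may differ from the
# real templates under test/integration/templates.
example_test_record = {
    'data': {'key1': 'sample data!!!!', 'key2': 'more sample data', 'key3': '1'},
    'description': 'Hypothetical event that should trigger the rule once',
    'trigger': True,      # an alert is expected
    'trigger_count': 1,   # optional; overrides the implicit count of 1 or 0
    'service': 'kinesis',
    'source': 'test_kinesis_stream'
}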
def test_map_source_1(self):
    """Payload Source Mapping 1"""
    data_encoded = base64.b64encode('test_map_source data')
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=data_encoded)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    test_kinesis_stream_logs = {
        'test_log_type_json',
        'test_log_type_json_2',
        'test_log_type_json_nested',
        'test_log_type_json_nested_with_data',
        'test_log_type_csv',
        'test_log_type_csv_nested',
        'test_log_type_kv_auditd'
    }
    metadata = classifier._log_metadata()

    # service, entity, metadata test
    assert_equal(payload.service, 'kinesis')
    assert_equal(payload.entity, 'test_kinesis_stream')
    assert_equal(set(metadata.keys()), test_kinesis_stream_logs)
def test_classify_record_kinesis_json(self):
    """Payload Classify JSON - boolean, float, integer types"""
    kinesis_data = json.dumps({
        'key4': 'true',
        'key5': '10.001',
        'key6': '10',
        'key7': False
    })
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    # pre parse and classify
    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # log type test
    assert_equal(payload.log_source, 'test_log_type_json_2')

    # payload type test
    assert_equal(payload.type, 'json')
    assert_not_equal(payload.type, 'csv')

    # record type test
    assert_equal(payload.records[0]['key4'], True)
    assert_equal(payload.records[0]['key5'], 10.001)
    assert_equal(payload.records[0]['key6'], 10)
    assert_equal(payload.records[0]['key7'], False)
def test_classify_record_kinesis_nested_json_missing_subkey_fields(self):
    """Payload Classify Nested JSON Missing Subkeys"""
    kinesis_data = json.dumps({
        'name': 'testquery',
        'hostIdentifier': 'host1.test.prod',
        'calendarTime': 'Jan 01 2017',
        'unixTime': '12321412321',
        'columns': {
            'key1': 'test',
            'key2': 'one'
        },
        'action': 'added',
        'decorations': {
            'role': 'web-server',
            'env': 'production',
            # 'cluster': 'eu-east',
            'number': '100'
        }
    })
    payload = self.payload_generator(kinesis_stream='test_stream_2',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # invalid record test
    assert_equal(payload.valid, False)
    assert_equal(payload.records, None)
def __init__(self, context, enable_alert_processor=True):
    """Initializer

    Args:
        context (dict): An AWS context object which provides metadata on the
            currently executing lambda function.
        enable_alert_processor (bool): If the user wants to send the alerts using
            their own methods, 'enable_alert_processor' can be set to False to
            suppress sending with the StreamAlert alert processor.
    """
    # Load the config. Validation occurs during load, which will
    # raise exceptions on any ConfigErrors
    StreamAlert.config = StreamAlert.config or load_config()

    # Load the environment from the context arn
    self.env = load_env(context)

    # Instantiate the sink here to handle sending the triggered alerts to the
    # alert processor
    self.sinker = StreamSink(self.env)

    # Instantiate a classifier that is used for this run
    self.classifier = StreamClassifier(config=self.config)

    self.enable_alert_processor = enable_alert_processor
    self._failed_record_count = 0
    self._processed_size = 0
    self._alerts = []

    # Create a dictionary to hold parsed payloads by log type.
    # Firehose needs this information to send to its corresponding
    # delivery stream.
    self.categorized_payloads = defaultdict(list)

    # Firehose client initialization
    self.firehose_client = None

    # Create an instance of the StreamRules class that gets cached in the
    # StreamAlert class as an instance property
    self._rule_engine = StreamRules(self.config)
def test_classify_record_kinesis_csv(self):
    """Payload Classify CSV"""
    csv_data = 'jan102017,0100,host1,thisis some data with keyword1 in it'
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=csv_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # record value tests
    assert_equal(payload.records[0]['message'], 'thisis some data with keyword1 in it')
    assert_equal(payload.records[0]['host'], 'host1')

    # type test
    assert_equal(payload.type, 'csv')
    assert_not_equal(payload.type, 'json')

    # log source test
    assert_equal(payload.log_source, 'test_log_type_csv')
def test_classify_record_kinesis_csv_nested(self):
    """Payload Classify Nested CSV"""
    csv_nested_data = (
        '"Jan 10 2017","1485635414","host1.prod.test","Corp",'
        '"chef,web-server,1,10,success"')
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=csv_nested_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # record value tests
    assert_equal(payload.records[0]['date'], 'Jan 10 2017')
    assert_equal(payload.records[0]['host'], 'host1.prod.test')
    assert_equal(payload.records[0]['time'], 1485635414)
    assert_equal(payload.records[0]['message']['role'], 'web-server')
    assert_equal(payload.records[0]['message']['cluster_size'], 10)

    # type test
    assert_equal(payload.type, 'csv')
    assert_not_equal(payload.type, 'json')

    # log source test
    assert_equal(payload.log_source, 'test_log_type_csv_nested')
def test_classify_record_kinesis_json(self):
    """Payload Classify JSON"""
    kinesis_data = json.dumps({
        'key1': 'sample data!!!!',
        'key2': 'more sample data',
        'key3': '1'
    })
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    # pre parse and classify
    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # log type test
    assert_equal(payload.log_source, 'test_log_type_json')

    # payload type test
    assert_equal(payload.type, 'json')
    assert_not_equal(payload.type, 'csv')

    # record type test
    assert_equal(type(payload.records[0]['key1']), str)
    assert_equal(type(payload.records[0]['key2']), str)
    assert_equal(type(payload.records[0]['key3']), int)
def test_multiple_schema_matching(self):
    """Test Matching Multiple Schemas with Log Patterns"""
    kinesis_data = json.dumps({
        'name': 'file added test',
        'identifier': 'host4.this.test',
        'time': 'Jan 01 2017',
        'type': 'lol_file_added_event_test',
        'message': 'bad_001.txt was added'
    })
    # Make sure support for multiple schema matching is ON
    sa_classifier.SUPPORT_MULTIPLE_SCHEMA_MATCHING = True

    payload = self.payload_generator(kinesis_stream='test_stream_2',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    valid_parses = classifier._process_log_schemas(payload, data)

    assert_equal(len(valid_parses), 2)
    assert_equal(valid_parses[0].log_name, 'test_multiple_schemas:01')
    assert_equal(valid_parses[1].log_name, 'test_multiple_schemas:02')

    valid_parse = classifier._check_valid_parse(valid_parses)
    assert_equal(valid_parse.log_name, 'test_multiple_schemas:01')
def load_and_classify_payload(config, service, entity, raw_record):
    """Return a loaded and classified payload."""
    # prepare the payloads
    payload = load_stream_payload(service, entity, raw_record)

    payload = list(payload.pre_parse())[0]
    classifier = StreamClassifier(config=config)
    classifier.load_sources(service, entity)
    classifier.classify_record(payload)

    return payload
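# NOTE: Minimal usage sketch for load_and_classify_payload() above. The
# make_kinesis_raw_record() helper and the 'test/unit/conf' config path are
# assumptions for illustration; substitute whatever record builders and test
# configuration the surrounding test suite actually provides.
import json

config = load_config('test/unit/conf')  # assumed test config location
raw_record = make_kinesis_raw_record(   # hypothetical test helper
    kinesis_stream='test_kinesis_stream',
    kinesis_data=json.dumps({'key1': 'sample data!!!!',
                             'key2': 'more sample data',
                             'key3': '1'}))

payload = load_and_classify_payload(config, 'kinesis', 'test_kinesis_stream', raw_record)
assert_equal(payload.valid, True)
assert_equal(payload.log_source, 'test_log_type_json')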
def test_classify_record_kinesis_json_optional(self):
    """Payload Classify JSON - optional fields"""
    kinesis_data = json.dumps({
        'key1': [
            {
                'test': 1,
                'test2': 2
            },
            {
                'test3': 3,
                'test4': 4
            }
        ],
        'key2': 'more sample data',
        'key3': '1',
        'key10': {
            'test-field': 1,
            'test-field2': 2
        }
    })
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    # pre parse and classify
    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # log type test
    assert_equal(payload.log_source, 'test_log_type_json')

    # payload type test
    assert_equal(payload.type, 'json')
    assert_not_equal(payload.type, 'csv')

    # record value tests
    assert_equal(len(payload.records[0]['key1']), 2)
    assert_equal(payload.records[0]['key3'], 1)
    assert_equal(payload.records[0]['key1'][1]['test4'], 4)

    # optional field tests
    assert_equal(payload.records[0]['key11'], 0.0)
    assert_equal(payload.records[0]['key9'], False)
    assert_equal(len(payload.records[0]['key10']), 2)

    # record type tests
    assert_equal(type(payload.records[0]['key1']), list)
    assert_equal(type(payload.records[0]['key2']), str)
    assert_equal(type(payload.records[0]['key3']), int)
def test_classify_record_kinesis_nested_json_osquery(self):
    """Payload Classify JSON osquery"""
    kinesis_data = json.dumps({
        'name': 'testquery',
        'hostIdentifier': 'host1.test.prod',
        'calendarTime': 'Jan 01 2017',
        'unixTime': '1485556524',
        'columns': {
            'key1': 'test',
            'key2': 'one'
        },
        'action': 'added',
        'decorations': {
            'role': 'web-server',
            'env': 'production',
            'cluster': 'eu-east',
            'number': '100'
        }
    })
    payload = self.payload_generator(kinesis_stream='test_stream_2',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # log type test
    assert_equal(payload.log_source, 'test_log_type_json_nested_osquery')

    # payload type test
    assert_equal(payload.type, 'json')
    assert_not_equal(payload.type, 'csv')

    # record type test
    assert_equal(type(payload.records[0]['hostIdentifier']), str)
    assert_equal(type(payload.records[0]['unixTime']), int)
    assert_equal(type(payload.records[0]['columns']), dict)
    assert_equal(type(payload.records[0]['decorations']), dict)

    # record value test
    assert_equal(payload.records[0]['unixTime'], 1485556524)
    assert_equal(payload.records[0]['columns']['key1'], 'test')
    assert_equal(payload.records[0]['decorations']['cluster'], 'eu-east')
    assert_equal(payload.records[0]['decorations']['number'], 100)
    assert_equal(payload.records[0]['log_type'], '')
def test_classify_record_syslog(self):
    """Payload Classify Syslog"""
    test_data_1 = ('Jan 26 19:35:33 vagrant-ubuntu-trusty-64 '
                   'sudo: pam_unix(sudo:session): '
                   'session opened for user root by (uid=0)')
    test_data_2 = (
        "Jan 26 12:28:06 macbook004154test authd[122]: "
        "Succeeded authorizing right 'com.apple.trust-settings.admin' "
        "by client '/usr/sbin/ocspd' [11835] for authorization created by"
        " '/usr/bin/security' [21322] (3,0)")

    fixtures = {'test_1': test_data_1, 'test_2': test_data_2}
    for name, syslog_message in fixtures.iteritems():
        payload = self.payload_generator(kinesis_stream='test_stream_2',
                                         kinesis_data=syslog_message)

        classifier = StreamClassifier(config=self.config)
        classifier.map_source(payload)

        data = self.pre_parse_kinesis(payload)
        classifier.classify_record(payload, data)

        # valid record test
        assert_equal(payload.valid, True)
        assert_equal(type(payload.records[0]), dict)

        # type test
        assert_equal(payload.type, 'syslog')
        assert_not_equal(payload.type, 'csv')
        assert_not_equal(payload.type, 'json')
        assert_not_equal(payload.type, 'kv')

        # record value tests
        if name == 'test_1':
            assert_equal(payload.records[0]['host'], 'vagrant-ubuntu-trusty-64')
            assert_equal(payload.records[0]['application'], 'sudo')
            assert_equal(payload.records[0]['message'], 'pam_unix(sudo:session):'
                                                        ' session opened for user'
                                                        ' root by (uid=0)')
        elif name == 'test_2':
            assert_equal(payload.records[0]['host'], 'macbook004154test')
            assert_equal(payload.records[0]['application'], 'authd')
def test_classify_record_kinesis_nested_json_with_data(self):
    """Payload Classify Nested JSON Generic"""
    kinesis_data = json.dumps({
        'date': 'Jan 01 2017',
        'unixtime': '1485556524',
        'host': 'host1',
        'application': 'myapp',
        'environment': 'development',
        'data': {
            'category': 'test',
            'type': '1',
            'source': 'dev-app-1'
        }
    })
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # log type test
    assert_equal(payload.log_source, 'test_log_type_json_nested_with_data')

    # payload type test
    assert_equal(payload.type, 'json')
    assert_not_equal(payload.type, 'csv')

    # record type test
    assert_equal(type(payload.records[0]['date']), str)
    assert_equal(type(payload.records[0]['unixtime']), int)
    assert_equal(type(payload.records[0]['data']), dict)
    assert_equal(type(payload.records[0]['data']['type']), int)
    assert_equal(type(payload.records[0]['data']['category']), str)

    # record value test
    assert_equal(payload.records[0]['date'], 'Jan 01 2017')
    assert_equal(payload.records[0]['data']['source'], 'dev-app-1')
def make_kinesis_payload(self, kinesis_stream, kinesis_data):
    """Helper for creating the kinesis payload"""
    raw_record = {
        'eventSource': 'aws:kinesis',
        'eventSourceARN': 'arn:aws:kinesis:us-east-1:123456789012:stream/{}'.format(
            kinesis_stream),
        'kinesis': {
            'data': base64.b64encode(kinesis_data)
        }
    }
    payload = StreamPayload(raw_record=raw_record)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)
    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    if payload.valid:
        return payload
def test_classify_record_kinesis_kv(self):
    """Payload Classify KV"""
    auditd_test_data = (
        'type=SYSCALL msg=audit(1364481363.243:24287): '
        'arch=c000003e syscall=2 success=no exit=-13 a0=7fffd19c5592 a1=0 '
        'a2=7fffd19c4b50 a3=a items=1 ppid=2686 pid=3538 auid=500 uid=500 '
        'gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 '
        'ses=1 comm="cat" exe="/bin/cat" '
        'subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 '
        'key="sshd_config" type=CWD msg=audit(1364481363.243:24287): '
        'cwd="/home/shadowman" type=PATH '
        'msg=audit(1364481363.243:24287): item=0 name="/etc/ssh/sshd_config" '
        'inode=409248 dev=fd:00 mode=0100600 ouid=0 ogid=0 '
        'rdev=00:00 obj=system_u:object_r:etc_t:s0')

    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=auditd_test_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # record value tests
    assert_equal(payload.records[0]['type'], 'SYSCALL')
    assert_equal(payload.records[0]['suid'], 500)
    assert_equal(payload.records[0]['pid'], 3538)
    assert_equal(payload.records[0]['type_3'], 'PATH')

    # type test
    assert_equal(payload.type, 'kv')
    assert_not_equal(payload.type, 'csv')
    assert_not_equal(payload.type, 'json')
def test_classify_record_kinesis_nested_json(self):
    """Payload Classify Nested JSON"""
    kinesis_data = json.dumps({
        'date': 'Jan 01 2017',
        'unixtime': '1485556524',
        'host': 'my-host-name',
        'data': {
            'key1': 'test',
            'key2': 'one'
        }
    })
    payload = self.payload_generator(kinesis_stream='test_kinesis_stream',
                                     kinesis_data=kinesis_data)

    classifier = StreamClassifier(config=self.config)
    classifier.map_source(payload)

    data = self.pre_parse_kinesis(payload)
    classifier.classify_record(payload, data)

    # valid record test
    assert_equal(payload.valid, True)
    assert_equal(type(payload.records[0]), dict)

    # log type test
    assert_equal(payload.log_source, 'test_log_type_json_nested')

    # payload type test
    assert_equal(payload.type, 'json')
    assert_not_equal(payload.type, 'csv')

    # record type test
    assert_equal(type(payload.records[0]['date']), str)
    assert_equal(type(payload.records[0]['unixtime']), int)
    assert_equal(type(payload.records[0]['data']), dict)

    # record value test
    assert_equal(payload.records[0]['date'], 'Jan 01 2017')
    assert_equal(payload.records[0]['data']['key1'], 'test')
class StreamAlert(object):
    """Wrapper class for handling all StreamAlert classification and processing"""

    def __init__(self, context, enable_alert_processor=True):
        """
        Args:
            context: An AWS context object which provides metadata on the currently
                executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using
                their own methods, 'enable_alert_processor' can be set to False to
                suppress sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        config = load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._alerts = []

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains:
        available data sources, log formats, parser modes, and sinks.  Classifies
        logs sent into the stream into a parsed type.  Matches records against rules.

        Args:
            event: An AWS event mapped to a specific source/entity (kinesis stream or
                an s3 bucket event) containing data emitted to the stream.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debug logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        return self._failed_record_count == 0

    def get_alerts(self):
        """Public method to return alerts from class. Useful for testing.

        Returns:
            list: list of alerts as dictionaries
        """
        return self._alerts

    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records), len(record_alerts))

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
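# NOTE: Minimal sketch of how the StreamAlert class above could be driven from a
# Lambda entry point. The handler name, the sample event values, and the
# get_mock_context() helper are assumptions for illustration; the event shape
# mirrors the Kinesis records used throughout the tests in this file.
import base64
import json

def handler(event, context):
    """Assumed rule processor Lambda entry point delegating to StreamAlert.run()"""
    return StreamAlert(context).run(event)

sample_event = {
    'Records': [{
        'eventSource': 'aws:kinesis',
        'eventSourceARN': 'arn:aws:kinesis:us-east-1:123456789012:stream/test_kinesis_stream',
        'kinesis': {
            'data': base64.b64encode(json.dumps({'key1': 'sample data!!!!',
                                                 'key2': 'more sample data',
                                                 'key3': '1'}))
        }
    }]
}

# Local invocation with a mocked context; alerts can then be inspected via get_alerts()
processor = StreamAlert(get_mock_context(), enable_alert_processor=False)  # hypothetical mock
all_matched = processor.run(sample_event)
print('All records matched a schema: {}, alerts: {}'.format(all_matched,
                                                            len(processor.get_alerts())))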
class StreamAlert(object):
    """Wrapper class for handling StreamAlert classification and processing"""
    __config = {}

    def __init__(self, context, enable_alert_processor=True):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the
                currently executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using
                their own methods, 'enable_alert_processor' can be set to False to
                suppress sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.__config = StreamAlert.__config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.__config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        # Create a dictionary to hold parsed payloads by log type.
        # Firehose needs this information to send to its corresponding
        # delivery stream.
        self.categorized_payloads = defaultdict(list)

        # Firehose client initialization
        self.firehose_client = None

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.  Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

        firehose_config = self.__config['global'].get('infrastructure', {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self.firehose_client = boto3.client('firehose',
                                                region_name=self.env['lambda_region'])

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debug logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        if self.firehose_client:
            self._send_to_firehose()

        return self._failed_record_count == 0

    def get_alerts(self):
        """Public method to return alerts from class. Useful for testing.

        Returns:
            list: list of alerts as dictionaries
        """
        return self._alerts

    def _send_to_firehose(self):
        """Send all classified records to a respective Firehose Delivery Stream"""
        def _chunk(record_list, chunk_size):
            """Helper function to chunk payloads"""
            for item in range(0, len(record_list), chunk_size):
                yield record_list[item:item + chunk_size]

        def _check_record_batch(batch):
            """Helper function to verify record size"""
            for index, record in enumerate(batch):
                if len(str(record)) > MAX_RECORD_SIZE:
                    # Show the first 1k bytes in order to not overload
                    # CloudWatch logs
                    LOGGER.error('The following record is too large to '
                                 'be sent to Firehose: %s', str(record)[:1000])
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_FAILED_RECORDS,
                                            1)
                    batch.pop(index)

        delivery_stream_name_pattern = 'streamalert_data_{}'

        # Iterate through each payload type
        for log_type, records in self.categorized_payloads.items():
            # This same method is used when naming the Delivery Streams
            formatted_log_type = log_type.replace(':', '_')

            for record_batch in _chunk(records, MAX_BATCH_SIZE):
                stream_name = delivery_stream_name_pattern.format(formatted_log_type)
                _check_record_batch(record_batch)

                resp = self.firehose_client.put_record_batch(
                    DeliveryStreamName=stream_name,
                    # The newline at the end is required by Firehose,
                    # otherwise all records will be on a single line and
                    # unsearchable in Athena.
                    Records=[{'Data': json.dumps(record, separators=(",", ":")) + '\n'}
                             for record in record_batch])

                # Error handle if failures occurred
                # TODO(jack) implement backoff here once the rule processor is split
                if resp.get('FailedPutCount') > 0:
                    failed_records = [failed
                                      for failed
                                      in resp['RequestResponses']
                                      if failed.get('ErrorCode')]
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_FAILED_RECORDS,
                                            resp['FailedPutCount'])
                    # Only print the first 100 failed records
                    LOGGER.error('The following records failed to Put to the '
                                 'Delivery stream %s: %s',
                                 stream_name,
                                 json.dumps(failed_records[:100], indent=2))
                else:
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_RECORDS_SENT,
                                            len(record_batch))
                    LOGGER.info('Successfully sent %d messages to Firehose:%s',
                                len(record_batch), stream_name)

    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records), len(record_alerts))

            # Add all parsed records to the categorized payload dict
            # only if Firehose is enabled
            if self.firehose_client:
                self.categorized_payloads[payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
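# NOTE: Small illustration of the batching and Delivery Stream naming convention
# used by _send_to_firehose() above. The record contents and MAX_BATCH_SIZE=500
# are assumptions chosen for the example, not values taken from the real
# configuration or constants module.
MAX_BATCH_SIZE = 500  # assumed per-call record limit used for chunking

def chunk(record_list, chunk_size):
    """Yield successive chunk_size-sized groups from record_list"""
    for index in range(0, len(record_list), chunk_size):
        yield record_list[index:index + chunk_size]

categorized_payloads = {'osquery:info': [{'name': 'pack_x'}] * 1200}

for log_type, records in categorized_payloads.items():
    # 'osquery:info' -> 'streamalert_data_osquery_info'
    stream_name = 'streamalert_data_{}'.format(log_type.replace(':', '_'))
    batches = list(chunk(records, MAX_BATCH_SIZE))
    # 1200 records split into batches of 500, 500, and 200
    print('{}: batches of {}'.format(stream_name, [len(batch) for batch in batches]))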
class StreamAlert(object):
    """Wrapper class for handling StreamAlert classification and processing"""
    config = {}

    def __init__(self, context):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the
                currently executing lambda function.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.config = StreamAlert.config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the alert forwarder here to handle sending the triggered
        # alerts to the alert processor
        self.alert_forwarder = AlertForwarder()

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.config)

        self._failed_record_count = 0
        self._processed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        rule_import_paths = [
            item for location in {'rule_locations', 'matcher_locations'}
            for item in self.config['global']['general'][location]
        ]

        # Create an instance of the RulesEngine class that gets cached in the
        # StreamAlert class as an instance property
        self._rules_engine = RulesEngine(self.config, *rule_import_paths)

        # Firehose client attribute
        self._firehose_client = None

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.  Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of incoming records: %d', len(records))
        if not records:
            return False

        firehose_config = self.config['global'].get('infrastructure', {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self._firehose_client = StreamAlertFirehose(self.env['lambda_region'],
                                                        firehose_config,
                                                        self.config['logs'])

        payload_with_normalized_records = []
        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            payload_with_normalized_records.extend(self._process_alerts(payload))

        # Log normalized records metric
        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.NORMALIZED_RECORDS,
                                len(payload_with_normalized_records))

        # Apply Threat Intel to normalized records at the end of the Rule Processor invocation
        record_alerts = self._rules_engine.threat_intel_match(payload_with_normalized_records)
        self._alerts.extend(record_alerts)
        if record_alerts:
            self.alert_forwarder.send_alerts(record_alerts)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS,
                                self._processed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debug logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s',
                         json.dumps([alert.output_dict() for alert in self._alerts],
                                    indent=2,
                                    sort_keys=True))

        if self._firehose_client:
            self._firehose_client.send()

        # Only log rule info here if this is not running tests
        # During testing, this gets logged at the end and printing here could be confusing
        # since stress testing calls this method multiple times
        if self.env['lambda_alias'] != 'development':
            stats.print_rule_stats(True)

        return self._failed_record_count == 0

    @property
    def alerts(self):
        """Returns list of Alert instances (useful for testing)."""
        return self._alerts

    def _process_alerts(self, payload):
        """Run the record through the rules, saving any alerts and forwarding them to Dynamo.

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        payload_with_normalized_records = []
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            # Increment the total processed records to get an accurate assessment of throughput
            self._processed_record_count += len(record.records)

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts, normalized_records = self._rules_engine.run(record)

            payload_with_normalized_records.extend(normalized_records)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records), len(record_alerts))

            # Add all parsed records to the categorized payload dict only if Firehose is enabled
            if self._firehose_client:
                # Only send payloads with enabled log sources
                if self._firehose_client.enabled_log_source(payload.log_source):
                    self._firehose_client.categorized_payloads[payload.log_source].extend(
                        payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            self.alert_forwarder.send_alerts(record_alerts)

        return payload_with_normalized_records
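# NOTE: Hedged sketch of the configuration shape that run() above reads when
# deciding whether to build the Firehose client. Only keys actually referenced
# in these snippets are shown ('global' -> 'infrastructure' -> 'firehose' ->
# 'enabled'/'disabled_logs', plus 'general' rule/matcher locations); all values
# are placeholders and the real config contains much more.
example_config = {
    'global': {
        'infrastructure': {
            'firehose': {
                'enabled': True,
                'disabled_logs': ['test_log_type_syslog']  # placeholder log name
            }
        },
        'general': {
            'rule_locations': ['rules'],        # placeholder import paths
            'matcher_locations': ['matchers']
        }
    }
}

firehose_config = example_config['global'].get('infrastructure', {}).get('firehose', {})
assert firehose_config.get('enabled') is True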
class StreamAlert(object):
    """Wrapper class for handling StreamAlert classification and processing"""
    config = {}
    # Used to detect special characters in payload keys.
    # This is necessary for sanitization of data prior to searching in Athena.
    special_char_regex = re.compile(r'\W')
    special_char_sub = '_'

    def __init__(self, context, enable_alert_processor=True):
        """Initializer

        Args:
            context (dict): An AWS context object which provides metadata on the
                currently executing lambda function.
            enable_alert_processor (bool): If the user wants to send the alerts using
                their own methods, 'enable_alert_processor' can be set to False to
                suppress sending with the StreamAlert alert processor.
        """
        # Load the config. Validation occurs during load, which will
        # raise exceptions on any ConfigErrors
        StreamAlert.config = StreamAlert.config or load_config()

        # Load the environment from the context arn
        self.env = load_env(context)

        # Instantiate the sink here to handle sending the triggered alerts to the
        # alert processor
        self.sinker = StreamSink(self.env)

        # Instantiate a classifier that is used for this run
        self.classifier = StreamClassifier(config=self.config)

        self.enable_alert_processor = enable_alert_processor
        self._failed_record_count = 0
        self._processed_size = 0
        self._alerts = []

        # Create a dictionary to hold parsed payloads by log type.
        # Firehose needs this information to send to its corresponding
        # delivery stream.
        self.categorized_payloads = defaultdict(list)

        # Firehose client initialization
        self.firehose_client = None

        StreamThreatIntel.load_intelligence(self.config)

    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.  Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

        firehose_config = self.config['global'].get('infrastructure', {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self.firehose_client = boto3.client('firehose',
                                                region_name=self.env['lambda_region'])

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debug logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        if self.firehose_client:
            self._send_to_firehose()

        return self._failed_record_count == 0

    def get_alerts(self):
        """Public method to return alerts from class. Useful for testing.

        Returns:
            list: list of alerts as dictionaries
        """
        return self._alerts

    @staticmethod
    def _segment_records_by_count(record_list, max_count):
        """Segment records by length

        Args:
            record_list (list): The original records list to be segmented
            max_count (int): The max amount of records to yield per group
        """
        for index in range(0, len(record_list), max_count):
            yield record_list[index:index + max_count]

    def _segment_records_by_size(self, record_batch):
        """Segment record groups by size

        Args:
            record_batch (list): The original record batch to measure and segment

        Returns:
            generator: Used to iterate on each newly segmented group
        """
        split_factor = 1
        len_batch = len(record_batch)

        # Sample the first batch of records to determine the split factor.
        # Generally, it's very rare for a group of records to have
        # drastically different sizes in a single Lambda invocation.
        while len(json.dumps(record_batch[:len_batch / split_factor],
                             separators=(",", ":"))) > MAX_BATCH_SIZE:
            split_factor += 1

        return self._segment_records_by_count(record_batch, len_batch / split_factor)

    @staticmethod
    def _limit_record_size(batch):
        """Limit the record size to be sent to Firehose

        Args:
            batch (list): Record batch to iterate on
        """
        for index, record in enumerate(batch):
            if len(json.dumps(record, separators=(",", ":"))) > MAX_RECORD_SIZE:
                # Show the first 1k bytes in order to not overload
                # CloudWatch logs
                LOGGER.error('The following record is too large to '
                             'be sent to Firehose: %s', str(record)[:1000])
                MetricLogger.log_metric(FUNCTION_NAME,
                                        MetricLogger.FIREHOSE_FAILED_RECORDS,
                                        1)
                batch.pop(index)

    @classmethod
    def sanitize_keys(cls, record):
        """Remove special characters from parsed record keys

        This is required when searching in Athena.  Keys can only have
        a period or underscore

        Args:
            record (dict): Original parsed record

        Returns:
            dict: A sanitized record
        """
        new_record = {}
        for key, value in record.iteritems():
            sanitized_key = re.sub(cls.special_char_regex, cls.special_char_sub, key)

            # Handle nested objects
            if isinstance(value, dict):
                new_record[sanitized_key] = cls.sanitize_keys(record[key])
            else:
                new_record[sanitized_key] = record[key]

        return new_record

    def _firehose_request_helper(self, stream_name, record_batch):
        """Send record batches to Firehose

        Args:
            stream_name (str): The name of the Delivery Stream to send to
            record_batch (list): The records to send
        """
        record_batch_size = len(record_batch)
        resp = {}

        try:
            LOGGER.debug('Sending %d records to Firehose:%s',
                         record_batch_size, stream_name)
            resp = self.firehose_client.put_record_batch(
                DeliveryStreamName=stream_name,
                # The newline at the end is required by Firehose,
                # otherwise all records will be on a single line and
                # unsearchable in Athena.
                Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                             separators=(",", ":")) + '\n'}
                         for record in record_batch])
        except ClientError as firehose_err:
            LOGGER.error(firehose_err)
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    record_batch_size)
            return

        # Error handle if failures occurred in PutRecordBatch
        # TODO(jack) implement backoff here for additional message reliability
        if resp.get('FailedPutCount') > 0:
            failed_records = [failed
                              for failed
                              in resp['RequestResponses']
                              if failed.get('ErrorCode')]
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    resp['FailedPutCount'])
            # Only print the first 100 failed records to CloudWatch logs
            LOGGER.error('The following records failed to Put to the '
                         'Delivery stream %s: %s',
                         stream_name,
                         json.dumps(failed_records[:100], indent=2))
        else:
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_RECORDS_SENT,
                                    record_batch_size)
            LOGGER.info('Successfully sent %d messages to Firehose:%s',
                        record_batch_size, stream_name)

    def _send_to_firehose(self):
        """Send all classified records to a respective Firehose Delivery Stream"""
        delivery_stream_name_pattern = 'streamalert_data_{}'

        # Iterate through each payload type
        for log_type, records in self.categorized_payloads.items():
            # This same method is used when naming the Delivery Streams
            formatted_log_type = log_type.replace(':', '_')

            for record_batch in self._segment_records_by_count(records, MAX_BATCH_COUNT):
                stream_name = delivery_stream_name_pattern.format(formatted_log_type)
                self._limit_record_size(record_batch)
                for sized_batch in self._segment_records_by_size(record_batch):
                    self._firehose_request_helper(stream_name, sized_batch)

    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records), len(record_alerts))

            # Add all parsed records to the categorized payload dict
            # only if Firehose is enabled
            if self.firehose_client:
                # Only send payloads with enabled types
                if payload.log_source.split(':')[0] not in \
                        self.config['global']['infrastructure'].get('firehose', {}).get(
                            'disabled_logs', []):
                    self.categorized_payloads[payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
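# NOTE: Quick illustration of StreamAlert.sanitize_keys() defined above. The input
# record is made up for the example; the expected output follows directly from the
# \W -> '_' substitution and the recursive handling of nested dicts.
parsed_record = {
    'host-name': 'host1.prod.test',
    'columns': {'key-1': 'test', 'key 2': 'one'},
    'unixtime': 1485556524
}
sanitized = StreamAlert.sanitize_keys(parsed_record)
# {'host_name': 'host1.prod.test',
#  'columns': {'key_1': 'test', 'key_2': 'one'},
#  'unixtime': 1485556524}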