def handler(event, context):
    """Main Lambda handler function"""
    try:
        StreamAlert(context).run(event)
    except Exception:
        LOGGER.error('Invocation event: %s', json.dumps(event))
        raise

def _load_rule_table(cls, config):
    """Load and return a RuleTable class for communicating with the DynamoDB rule table

    Args:
        config (dict): Loaded configuration from 'conf/' directory

    Returns:
        rule_table.RuleTable: Loaded frontend for DynamoDB rules table
    """
    # Ensure the rules table is enabled
    rt_config = config['global']['infrastructure']['rules_table']
    if not rt_config.get('enabled', False):
        return

    now = datetime.utcnow()
    refresh_delta = timedelta(minutes=rt_config.get('cache_refresh_minutes', 10))

    # The rule table needs to be refreshed if the refresh interval has been surpassed
    needs_refresh = cls._RULE_TABLE_LAST_REFRESH + refresh_delta < now
    if not needs_refresh:
        LOGGER.debug(
            'Rule table does not need to be refreshed (last refresh time: %s; '
            'current time: %s)', cls._RULE_TABLE_LAST_REFRESH, now)
        return

    LOGGER.info(
        'Refreshing rule table (last refresh time: %s; current time: %s)',
        cls._RULE_TABLE_LAST_REFRESH, now)

    table_name = '{}_streamalert_rules'.format(config['global']['account']['prefix'])

    cls._RULE_TABLE = RuleTable(table_name)
    cls._RULE_TABLE_LAST_REFRESH = now

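# A minimal, self-contained sketch of the interval-gated refresh pattern used
# above. The initial epoch timestamp is an assumption so that the very first
# invocation always triggers a refresh:
from datetime import datetime, timedelta

class RefreshGateSketch(object):
    _LAST_REFRESH = datetime(1970, 1, 1)  # assumed initial value

    @classmethod
    def needs_refresh(cls, minutes=10):
        """Return True (and reset the clock) if the interval has elapsed"""
        now = datetime.utcnow()
        if cls._LAST_REFRESH + timedelta(minutes=minutes) < now:
            cls._LAST_REFRESH = now
            return True
        return False
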
def _add_optional_keys(self, json_records, schema, optional_keys):
    """Add optional keys to a parsed JSON record.

    Args:
        json_records (list): JSONPath extracted JSON records
        schema (dict): The log type schema
        optional_keys (dict): The optional keys in the schema
    """
    if not optional_keys:
        return

    for key_name in optional_keys:
        # Instead of doing a schema.update() here with a default value type,
        # we should enforce having any optional keys declared within the schema
        # and log an error if that is not the case
        if key_name not in schema:
            LOGGER.error(
                'Optional top level key \'%s\' not found in declared log schema',
                key_name)
            continue

        # If the optional key isn't in our parsed json payload, set a default value
        for record in json_records:
            if key_name not in record:
                record[key_name] = self.default_optional_values(schema[key_name])

def process_subkeys(cls, record, payload_type, rule):
    """Check that a payload record contains all subkeys needed for rules

    Because each log is processed by every rule for a given log type,
    it's possible that a rule references a subkey that doesn't exist in
    that specific log. This method verifies that the declared subkeys
    in a rule are contained in the JSON payload prior to rule processing.

    Args:
        record: Payload record to process
        payload_type (str): Type of the record
        rule: Rule attributes

    Returns:
        bool: Result of the subkey check.
    """
    if not rule.req_subkeys or payload_type != 'json':
        return True

    for key, nested_keys in rule.req_subkeys.iteritems():
        # This is an extra layer of protection when verifying that a subkey
        # exists in a record with a null value. In the case of CloudTrail, a
        # top level key has been observed as either a map with subkeys, or null.
        if not record.get(key):
            LOGGER.debug(
                'The required subkey %s is not found when trying to process %s: \n%s',
                key, rule.rule_name, json.dumps(record, indent=2))
            return False
        if not all(x in record[key] for x in nested_keys):
            return False

    return True

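# A minimal sketch (record and subkeys are made up) of the req_subkeys check
# above: 'detail' must exist with a non-null value and contain every declared
# nested key for the rule to be evaluated.
record = {'detail': {'eventName': 'PutObject', 'eventSource': 's3.amazonaws.com'}}
req_subkeys = {'detail': ['eventName', 'eventSource']}

for key, nested_keys in req_subkeys.items():
    assert record.get(key) and all(x in record[key] for x in nested_keys)
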
def load_sources(self, service, entity):
    """Load the sources for this payload.

    Args:
        service (str): Source service
        entity (str): Entity within the service

    Returns:
        bool: True if the entity's log sources loaded properly
    """
    # Clear the list from any previous runs
    del self._entity_log_sources[:]

    # Get all logs for the configured service/entity (s3, kinesis, or sns)
    service_entities = self._config['sources'].get(service)
    if not service_entities:
        LOGGER.error('Service [%s] not declared in sources configuration', service)
        return False

    config_entity = service_entities.get(entity)
    if not config_entity:
        LOGGER.error(
            'Entity [%s] not declared in sources configuration for service [%s]',
            entity, service)
        return False

    # Get a copy of the logs list by slicing here, not a pointer to the list reference
    self._entity_log_sources = config_entity['logs'][:]

    return bool(self._entity_log_sources)

def process(cls, input_payload):
    """Process rules on a record.

    Gather a list of rules based on the record's datasource type.
    For each rule, evaluate the record through all listed matchers
    and the rule itself to determine if a match occurs.

    Returns:
        list: alerts

        An alert is represented as a dictionary with the following keys:
            rule_name: the name of the triggered rule
            payload: the StreamPayload object
            outputs: list of outputs to send to
    """
    alerts = []
    payload = copy(input_payload)

    rules = [rule_attrs for rule_attrs in cls.__rules.values()
             if payload.log_source in rule_attrs.logs]

    if not rules:
        LOGGER.debug('No rules to process for %s', payload)
        return alerts

    for record in payload.records:
        for rule in rules:
            # subkey check
            has_sub_keys = cls.process_subkeys(record, payload.type, rule)
            if not has_sub_keys:
                continue

            # matcher check
            matcher_result = cls.match_event(record, rule)
            if not matcher_result:
                continue

            # rule analysis
            rule_result = cls.process_rule(record, rule)
            if rule_result:
                LOGGER.info(
                    'Rule [%s] triggered an alert on log type [%s] from entity \'%s\' '
                    'in service \'%s\'', rule.rule_name, payload.log_source,
                    payload.entity, payload.service())

                alert = {
                    'record': record,
                    'rule_name': rule.rule_name,
                    'rule_description': rule.rule_function.__doc__ or DEFAULT_RULE_DESCRIPTION,
                    'log_source': str(payload.log_source),
                    'log_type': payload.type,
                    'outputs': rule.outputs,
                    'source_service': payload.service(),
                    'source_entity': payload.entity
                }

                alerts.append(alert)

    return alerts

def decorator(rule):
    """Rule decorator logic."""
    rule_name = rule.__name__
    logs = opts.get('logs')
    outputs = opts.get('outputs')
    matchers = opts.get('matchers')
    datatypes = opts.get('datatypes')
    req_subkeys = opts.get('req_subkeys')

    if not (logs or datatypes):
        LOGGER.error(
            'Invalid rule [%s] - rule must have either \'logs\' or \'datatypes\' declared',
            rule_name)
        return

    if not outputs:
        LOGGER.error('Invalid rule [%s] - rule must have \'outputs\' declared', rule_name)
        return

    if rule_name in cls.__rules:
        raise ValueError('rule [{}] already defined'.format(rule_name))

    cls.__rules[rule_name] = RuleAttributes(rule_name, rule, matchers, datatypes,
                                            logs, outputs, req_subkeys)
    return rule

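# A hedged usage sketch of the decorator above, following the StreamAlert-style
# @rule registration (log, output, and subkey names are illustrative):
@rule(logs=['cloudtrail:events'],
      outputs=['aws-s3:sample-bucket'],
      req_subkeys={'detail': ['eventName']})
def cloudtrail_put_object(rec):
    """Alert on S3 PutObject calls"""
    return rec['detail']['eventName'] == 'PutObject'
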
def _read_downloaded_s3_object(s3_object):
    """Read lines from a downloaded file from S3

    Supports reading both gzipped files and plaintext files.

    Args:
        s3_object (str): A full path to the downloaded file.

    Yields:
        (int, str): Tuples of line number and line from the downloaded s3 object.
    """
    _, extension = os.path.splitext(s3_object)

    if extension == '.gz':
        with gzip.open(s3_object, 'r') as s3_file:
            for num, line in enumerate(s3_file, start=1):
                yield num, line.rstrip()
    else:
        with open(s3_object, 'r') as s3_file:
            for num, line in enumerate(s3_file, start=1):
                yield num, line.rstrip()

    # AWS Lambda apparently does not reallocate disk space when files are
    # removed using os.remove(), so we must truncate them before removal
    with open(s3_object, 'w'):
        pass

    os.remove(s3_object)
    if not os.path.exists(s3_object):
        LOGGER.debug('Removed temp S3 file: %s', s3_object)
    else:
        LOGGER.error('Failed to remove temp S3 file: %s', s3_object)

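# Hedged usage sketch (the path is illustrative): each iteration yields a
# (line_number, line) tuple, and the temp file is truncated and removed once
# the generator is exhausted.
for line_num, line in _read_downloaded_s3_object('/tmp/unit_bucket-unit_key.gz'):
    LOGGER.debug('Line %d: %s', line_num, line)
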
def parse(self, schema, data):
    """Parse a string into a list of JSON payloads.

    Args:
        schema (dict): Parsing schema.
        data (str|dict): Data to be parsed.

    Returns:
        list: A list of dictionaries representing parsed records, OR
        False if the data is not JSON or does not follow the schema.
    """
    if isinstance(data, (unicode, str)):
        try:
            loaded_data = json.loads(data)
        except ValueError as err:
            LOGGER.debug('JSON parse failed: %s', str(err))
            LOGGER.debug('JSON parse could not load data: %s', str(data))
            return False
        else:
            json_records = self._parse_records(schema, loaded_data)
    else:
        json_records = self._parse_records(schema, data)

    if not json_records:
        return False

    self._add_optional_keys(json_records, schema,
                            self.options.get('optional_top_level_keys'))

    # Make sure all keys match the schema, including nested maps
    if not self._key_check(schema, json_records):
        return False

    return json_records

def _load_enabled_log_sources(self, firehose_config, log_sources):
    """Load and expand all declared and enabled Firehose log sources

    Args:
        firehose_config (dict): Loaded Firehose config from global.json
        log_sources (dict): Loaded logs.json file

    Returns:
        set: Enabled logs
    """
    enabled_logs = set()
    for enabled_log in firehose_config.get('enabled_logs', []):
        enabled_log_parts = enabled_log.split(':')

        # Expand to all subtypes
        if len(enabled_log_parts) == 1:
            expanded_logs = [self.firehose_log_name(log_name)
                             for log_name in log_sources
                             if log_name.split(':')[0] == enabled_log_parts[0]]

            # An empty list from the comprehension means no matching logs
            # were found during the expansion.
            if not expanded_logs:
                LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)

            enabled_logs.update(expanded_logs)

        elif len(enabled_log_parts) == 2:
            if enabled_log not in log_sources:
                LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)
                continue

            enabled_logs.add(self.firehose_log_name('_'.join(enabled_log_parts)))

    return enabled_logs

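# Illustrative expansion (log names are made up): a bare entry expands to every
# matching subtype declared in logs.json, while a 'type:subtype' entry selects
# a single log. Results use the Firehose-safe name with ':' replaced by '_':
#   enabled_logs: ['osquery']           -> {'osquery_info', 'osquery_diff'}
#   enabled_logs: ['cloudwatch:events'] -> {'cloudwatch_events'}
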
def _get_object(self):
    """Given an S3 record, download and parse the data.

    Returns:
        str: Path to the downloaded s3 object.
    """
    # Use the urllib unquote method to decode any url encoded characters
    # (i.e. '%26' --> '&') from the bucket and key names
    unquoted = lambda data: unquote(data).decode('utf-8')

    region = self.raw_record['awsRegion']
    bucket = unquoted(self.raw_record['s3']['bucket']['name'])
    key = unquoted(self.raw_record['s3']['object']['key'])
    self.s3_object_size = int(self.raw_record['s3']['object']['size'])

    LOGGER.debug('Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
                 bucket, key, self.s3_object_size)

    try:
        return self._download_object(region, bucket, key)
    except IOError:
        LOGGER.exception('[S3Payload] The following error occurred while downloading')
        return

def _validate_type_mapping(mapping_str):
    """Static method to extract the normalized type and IOC type from a qualified string

    Args:
        mapping_str (str): A qualified string with the pattern 'normalized_type:ioc_type'

    Returns:
        tuple (bool, str, str):
            bool: Whether the string is a qualified string containing both a
                normalized CEF type and an IOC type.
            str: The normalized type.
            str: The IOC type.
    """
    normalized_type = None
    ioc_type = None

    split_str = mapping_str.split(':')
    if len(split_str) == 1:
        normalized_type = split_str[0]
    elif len(split_str) == 2:
        normalized_type = split_str[0]
        ioc_type = split_str[1].split('_')[-1]
    else:
        LOGGER.info('Key %s in conf/types.json is incorrect', mapping_str)
        return False, None, None

    if normalized_type and ioc_type:
        return True, normalized_type, ioc_type

    return False, normalized_type, None

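# Hedged examples of the mapping-string format handled above (the key names
# are illustrative of entries in conf/types.json):
assert _validate_type_mapping('sourceAddress:ioc_ip') == (True, 'sourceAddress', 'ip')
assert _validate_type_mapping('userName') == (False, 'userName', None)
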
def rule_analysis(record, rule, payload, alerts):
    """Class method to analyze a rule against a record

    Args:
        record (dict): A parsed log with data.
        rule: Rule attributes.
        payload: The StreamPayload object.
        alerts (list): A list of alerts which will be sent to the alert
            processor. A triggered alert is appended to this list in place.
    """
    rule_result = StreamRules.process_rule(record, rule)
    if rule_result:
        if StreamRules.check_alerts_duplication(record, rule, alerts):
            return

        LOGGER.info(
            'Rule [%s] triggered an alert on log type [%s] from entity \'%s\' '
            'in service \'%s\'', rule.rule_name, payload.log_source,
            payload.entity, payload.service())

        alert = {
            'record': record,
            'rule_name': rule.rule_name,
            'rule_description': rule.rule_function.__doc__ or DEFAULT_RULE_DESCRIPTION,
            'log_source': str(payload.log_source),
            'log_type': payload.type,
            'outputs': rule.outputs,
            'source_service': payload.service(),
            'source_entity': payload.entity,
            'context': rule.context
        }

        alerts.append(alert)

def test_pre_parse_s3_debug(s3_mock, log_mock, _):
    """S3Payload - Pre Parse, Debug On"""
    # Cache the current logger level
    log_level = LOGGER.getEffectiveLevel()

    # Increase the logger level to debug
    LOGGER.setLevel(logging.DEBUG)

    records = ['_first_line_test_' * 10, '_second_line_test_' * 10]
    s3_mock.side_effect = [((100, records[0]), (200, records[1]))]

    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
    S3Payload.s3_object_size = 350

    _ = [_ for _ in s3_payload.pre_parse()]

    calls = [
        call('Processed %s S3 records out of an approximate total of %s '
             '(average record size: %s bytes, total size: %s bytes)',
             100, 350, 1, 350),
        call('Processed %s S3 records out of an approximate total of %s '
             '(average record size: %s bytes, total size: %s bytes)',
             200, 350, 1, 350)
    ]

    log_mock.assert_has_calls(calls)

    # Reset the logger level
    LOGGER.setLevel(log_level)

def _extract_json_path(self, json_payload):
    """Extract records from the original JSON payload using a provided JSONPath

    Args:
        json_payload (dict): The parsed JSON data

    Returns:
        list: A list of JSON records extracted via JSONPath, or False if
            the expression produced no matches
    """
    records = []
    json_path_expression = self.options.get('json_path')
    if not json_path_expression:
        return records

    # Handle jsonpath extraction of records
    LOGGER.debug('Parsing records with JSONPath')
    records_jsonpath = jsonpath_rw.parse(json_path_expression)

    # If the csv parser is extracting csv from json, the payload is likely
    # a string and needs to be loaded to a dict
    if not isinstance(json_payload, dict):
        json_payload = json.loads(json_payload)

    matches = records_jsonpath.find(json_payload)
    if not matches:
        return False

    return [match.value for match in matches]

def match_event(cls, record, rule):
    """Evaluate matchers on a record.

    Given a list of matchers, evaluate a record through each to find a
    match. If any matcher evaluates as false, the loop breaks and no
    further matchers are evaluated. Otherwise, returns True.

    Args:
        record: Record to be matched
        rule: Rule containing the list of matchers

    Returns:
        bool: Result of matcher processing
    """
    # Matchers are optional for rules
    if not rule.matchers:
        return True

    for matcher in rule.matchers:
        matcher_function = cls.__matchers.get(matcher)
        if matcher_function:
            try:
                matcher_result = matcher_function(record)
            except Exception as err:  # pylint: disable=broad-except
                matcher_result = False
                LOGGER.error('%s: %s', matcher_function.__name__, err.message)
            if not matcher_result:
                return False
        else:
            LOGGER.error('The matcher [%s] does not exist!', matcher)

    return True

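# A hedged sketch of a matcher definition, following the StreamAlert-style
# @matcher registration convention (names are illustrative). A rule declaring
# matchers=['prod'] is only evaluated for records where this returns True:
@matcher
def prod(rec):
    return rec['environment'] == 'prod'
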
def _parse_records(self, schema, json_payload):
    """Identify and extract nested payloads from parsed JSON records.

    Nested payloads can be detected with log_patterns (`records` should be
    a JSONPath selector that yields the desired nested records). If desired,
    fields present on the root record can be merged into child events using
    the `envelope_keys` option.

    Args:
        schema (dict): The log type schema
        json_payload (dict): The parsed json data

    Returns:
        list: A list of parsed JSON records
    """
    # Check options and return the payload if there is nothing special to do
    if not self.options:
        return [json_payload]

    envelope_schema = self.options.get('envelope_keys')
    optional_envelope_keys = self.options.get('optional_envelope_keys')

    # The schema has a defined envelope schema with optional keys in the
    # envelope. This occurs in some cases when using json_regex_key.
    if envelope_schema and optional_envelope_keys:
        missing_keys_schema = {}
        for key in optional_envelope_keys:
            if key not in json_payload:
                missing_keys_schema[key] = envelope_schema[key]
        if missing_keys_schema:
            self._add_optional_keys([json_payload], envelope_schema, missing_keys_schema)

    # The envelope schema is defined and all envelope keys are required
    # to be present in the record.
    elif envelope_schema and not all(x in json_payload for x in envelope_schema):
        return [json_payload]

    envelope = {}
    if envelope_schema:
        LOGGER.debug('Parsing envelope keys')
        schema.update({ENVELOPE_KEY: envelope_schema})
        envelope_keys = envelope_schema.keys()
        envelope_jsonpath = jsonpath_rw.parse("$." + ",".join(envelope_keys))
        envelope_matches = [match.value for match in envelope_jsonpath.find(json_payload)]
        envelope = dict(zip(envelope_keys, envelope_matches))

    json_records = self._extract_records(json_payload, envelope)
    if json_records is False:
        return False

    # If the final parsed record is singular
    if not json_records:
        json_records.append(json_payload)

    return json_records

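# A hedged illustration of the envelope_keys option handled above (field names
# are made up): with envelope_keys of {'server': 'string'} and a json_path of
# '$.logEvents[*]', each entry of 'logEvents' becomes its own record, with the
# envelope data attached under the parser's ENVELOPE_KEY.
sample_payload = {
    'server': 'web-01',
    'logEvents': [
        {'message': 'event one'},
        {'message': 'event two'}
    ]
}
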
def firehose_request_wrapper(data):
    """Firehose request wrapper to use with backoff"""
    LOGGER.info('[Firehose] Sending %d records to %s',
                record_batch_size, stream_name)
    return self._firehose_client.put_record_batch(
        DeliveryStreamName=stream_name, Records=data)

def _send_to_dynamo(self, alerts):
    """Write alerts in batches to Dynamo."""
    # The batch_writer() automatically handles buffering, batching, and retrying failed items
    with self.table.batch_writer() as batch:
        for alert in alerts:
            batch.put_item(Item=self.dynamo_record(alert))
    LOGGER.info('Successfully sent %d alert(s) to dynamo:%s',
                len(alerts), self.table.table_name)

def _shred_temp_directory():
    """Delete all objects in the container's temp directory"""
    LOGGER.debug('Shredding temp directory')

    for root, dirs, files in os.walk(tempfile.gettempdir(), topdown=False):
        for name in files:
            subprocess.check_call([  # nosec
                'shred', '--force', '--iterations=1',
                '--remove', os.path.join(root, name)])
        for name in dirs:
            os.rmdir(os.path.join(root, name))  # nosec

def _check_record_batch(batch):
    """Helper function to verify record size"""
    # Iterate in reverse so that popping an oversized record does not shift
    # the indices of records that have not been checked yet
    for index in reversed(range(len(batch))):
        record = batch[index]
        if len(str(record)) > MAX_RECORD_SIZE:
            # Show the first 1k bytes in order to not overload CloudWatch logs
            LOGGER.error('The following record is too large to '
                         'be sent to Firehose: %s', str(record)[:1000])
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS, 1)
            batch.pop(index)

def read_compressed_files(cls, intel_dir, delimiter=','):
    """Read intelligence into memory

    Read all intelligence from csv.gz files located in the threat_intel
    directory into a dictionary. The CSV filename should follow the
    convention <ioc_type_as_basename>.csv.gz. The basename (without
    extension) of the csv file will be the key in the returned dictionary.

    Args:
        intel_dir (str): Path to the threat intelligence directory
        delimiter (str): CSV field delimiter, defaults to ','

    Returns:
        dict: Threat intelligence in the following format:
            {
                "domain": {
                    "evil1.com": ["apt_domain", "source1 reported evil1.com"],
                    "evil2.com": ["c2_domain", "source2 reported evil2.com"]
                },
                "ip": {
                    "1.1.1.2": ["scan_ip", "source reported ip1"],
                    "2.2.2.2": ["scan_ip", "source reported ip2"]
                },
                "url": {
                    "www.hacker.com/evil_page": ["mal_url", "source_foo"]
                },
                "md5": {
                    "0123456789abcdef0123456789abcdef": ["mal_md5", "source_bar"]
                }
            }
        None: if the intelligence directory does not exist
    """
    if not os.path.exists(intel_dir):
        return

    gz_files = [os.path.join(intel_dir, gz_file)
                for gz_file in os.listdir(intel_dir)
                if gz_file.endswith('.gz')]

    for gz_file in gz_files:
        with gzip.open(gz_file, 'r') as ioc_file:
            csv_reader = csv.reader(ioc_file, delimiter=delimiter)

            ioc_type = os.path.basename(gz_file).split('.')[0]
            if ioc_type not in cls.__intelligence:
                cls.__intelligence[ioc_type] = dict()

            for row in csv_reader:
                if len(row) < 2:
                    LOGGER.debug('Warning, each row in the CSV file should '
                                 'contain at least two fields. Bad row: [%s]', row)
                    continue
                cls.__intelligence[ioc_type][row[0]] = row[1:]

    return cls.__intelligence

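# Hedged usage sketch, assuming this classmethod lives on a threat intel class
# (the class and directory names here are assumptions): files like
# domain.csv.gz and ip.csv.gz produce 'domain' and 'ip' keys in the result.
intel = StreamThreatIntel.read_compressed_files('threat_intel/')
if intel:
    LOGGER.debug('Loaded IOC types: %s', intel.keys())
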
def _process_ioc(self, ioc_collections):
    """Check if any info is malicious by querying the DynamoDB IOC table

    Args:
        ioc_collections (list): A list of StreamIoc instances.
    """
    LOGGER.debug('[Threat Intel] Rule Processor queries %d IOCs', len(ioc_collections))

    # Segment data before calling the DynamoDB table with batch_get_item.
    for subset in self._segment(ioc_collections):
        query_values = []
        for ioc in subset:
            if ioc.value not in query_values:
                query_values.append(ioc.value)

        query_result = []
        query_error_msg = 'An error occurred while querying the dynamodb table. Error is: %s'
        try:
            result, unprocessed_keys = self._query(query_values)
            query_result.extend(result)
        except ClientError as err:
            LOGGER.error(query_error_msg, err.response)
            return
        except ParamValidationError as err:
            LOGGER.error(query_error_msg, err)
            return

        # If there are unprocessed keys, re-query once with the unprocessed keys only
        if unprocessed_keys:
            deserializer = self._deserialize(unprocessed_keys[self._table]['Keys'])
            query_values = [elem[PRIMARY_KEY] for elem in deserializer]
            query_error_msg = 'An error occurred while processing unprocessed_keys. Error is: %s'
            try:
                result, _ = self._query(query_values)
                query_result.extend(result)
            except ClientError as err:
                LOGGER.error(query_error_msg, err.response)
                return
            except ParamValidationError as err:
                LOGGER.error(query_error_msg, err)
                return

        for value in ioc_collections:
            for ioc in query_result:
                if value.value == ioc[PRIMARY_KEY]:
                    value.sub_type = ioc[SUB_TYPE_KEY]
                    value.is_ioc = True
                    # No need to keep scanning once this IOC has matched
                    break

def firehose_request_wrapper():
    """Firehose request wrapper to use with backoff"""
    LOGGER.info('[Firehose] Sending %d records to %s',
                record_batch_size, stream_name)
    # The newline at the end is required by Firehose, otherwise all records
    # will be on a single line and unsearchable in Athena.
    return self._firehose_client.put_record_batch(
        DeliveryStreamName=stream_name,
        Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                     separators=(",", ":")) + '\n'}
                 for record in record_batch])

def send_alerts(self, alerts):
    """Send alerts to the Alert Processor and to the alerts Dynamo table.

    Args:
        alerts (list): A list of dictionaries representing json alerts.
    """
    try:
        self._send_to_dynamo(alerts)
    except ClientError:
        # The batch_writer() automatically retries transient errors - any raised
        # ClientError is likely unrecoverable. Log an exception and a metric.
        LOGGER.exception('Error saving alerts to Dynamo')
        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_DYNAMO_WRITES, 1)

def _process_log_schemas(self, payload):
    """Get any log schemas that matched this log format

    Args:
        payload: A StreamAlert payload object

    Returns:
        list: Contains any schemas that matched this log format
            Each list entry contains the namedtuple of 'SchemaMatch' with
            values of log_name, root_schema, parser, and parsed_data
    """
    schema_match = namedtuple('SchemaMatch', 'log_name, root_schema, parser, parsed_data')
    schema_matches = []
    log_info = self.get_log_info_for_source()

    # Loop over all logs declared in logs.json
    for log_name, attributes in log_info.iteritems():
        # Get the parser type to use for this log
        parser_name = payload.type or attributes['parser']

        schema = attributes['schema']
        options = attributes.get('configuration', {})

        # Setup the parser class
        parser_class = get_parser(parser_name)
        parser = parser_class(options)

        # Get a list of parsed records
        LOGGER.debug('Trying schema: %s', log_name)
        parsed_data = parser.parse(schema, payload.pre_parsed_record)
        if not parsed_data:
            continue

        LOGGER.debug('Parsed %d records with schema %s', len(parsed_data), log_name)

        if SUPPORT_MULTIPLE_SCHEMA_MATCHING:
            schema_matches.append(schema_match(log_name, schema, parser, parsed_data))
            continue

        log_patterns = parser.options.get('log_patterns')
        if all(parser.matched_log_pattern(rec, log_patterns) for rec in parsed_data):
            return [schema_match(log_name, schema, parser, parsed_data)]

    return schema_matches

def enabled_log_source(cls, log_source_name):
    """Check that the incoming record is an enabled log source for Firehose

    Args:
        log_source_name (str): The log source of the record

    Returns:
        bool: Whether or not the log source is enabled to send to Firehose
    """
    if not cls._ENABLED_LOGS:
        LOGGER.error('Enabled logs not loaded')
        return False

    return cls.firehose_log_name(log_source_name) in cls._ENABLED_LOGS

def pre_parse(self):
    """Pre-parsing method for SNS records.

    Extracts the SNS payload from the record itself and sets it as the
    `pre_parsed_record` property.

    Yields:
        This object with the pre_parsed_record now set
    """
    LOGGER.debug('Pre-parsing record from SNS. MessageId: %s, EventSubscriptionArn: %s',
                 self.raw_record['Sns']['MessageId'],
                 self.raw_record['EventSubscriptionArn'])

    self.pre_parsed_record = self.raw_record['Sns']['Message']

    yield self

def _limit_record_size(cls, batch):
    """Limits the batch size sent to Firehose by popping large records

    Args:
        batch (list): Record batch to iterate on
    """
    # Iterate in reverse so that popping an oversized record does not shift
    # the indices of records that have not been checked yet
    for index in reversed(range(len(batch))):
        record = batch[index]
        if len(json.dumps(record, separators=(",", ":"))) > cls.MAX_RECORD_SIZE:
            # Show the first 1k bytes in order to not overload CloudWatch logs
            LOGGER.error('The following record is too large to '
                         'be sent to Firehose: %s', str(record)[:1000])
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS, 1)
            batch.pop(index)

def firehose_request_wrapper(data):
    """Firehose request wrapper to use with backoff"""
    # Use the current length of data here so we can track failed records that are retried
    LOGGER.info('[Firehose] Sending %d records to %s', len(data), stream_name)

    response = self._client.put_record_batch(DeliveryStreamName=stream_name, Records=data)

    # Log this as an error for now so it can be picked up in logs
    if response['FailedPutCount'] > 0:
        LOGGER.error('Received non-zero FailedPutCount: %d', response['FailedPutCount'])
        # Strip out the successful records so only the failed ones are retried. This
        # happens to the list of dictionary objects, so the called function sees the
        # updated list
        self._strip_successful_records(data, response)

    return response

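# A hedged sketch of how a wrapper like this is typically wired up with the
# third-party 'backoff' library (the max_tries value is illustrative): retry
# while the response still reports failed records, since the wrapper strips
# successful records from 'data' between attempts.
import backoff

request_with_retries = backoff.on_predicate(
    backoff.fibo,
    lambda resp: resp['FailedPutCount'] > 0,
    max_tries=5)(firehose_request_wrapper)

response = request_with_retries(records_data)  # 'records_data' is illustrative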