Example #1
    def _read_downloaded_s3_object(s3_object):
        """Read lines from a downloaded file from S3

        Supports reading both gzipped files and plaintext files.

        Args:
            s3_object (str): A full path to the downloaded file.

        Yields:
            (int, str): Line number and line pairs from the downloaded S3 object.
        """
        _, extension = os.path.splitext(s3_object)

        if extension == '.gz':
            with gzip.open(s3_object, 'r') as s3_file:
                for num, line in enumerate(s3_file, start=1):
                    yield num, line.rstrip()
        else:
            with open(s3_object, 'r') as s3_file:
                for num, line in enumerate(s3_file, start=1):
                    yield num, line.rstrip()

        # AWS Lambda apparently does not reallocate disk space when files are
        # removed using os.remove(), so we must truncate them before removal
        with open(s3_object, 'w'):
            pass

        os.remove(s3_object)
        if not os.path.exists(s3_object):
            LOGGER.debug('Removed temp S3 file: %s', s3_object)
        else:
            LOGGER.error('Failed to remove temp S3 file: %s', s3_object)
Example #2
    def load_sources(self, service, entity):
        """Load the sources for this payload.

        Args:
            service (str): Source service
            entity (str): Entity within the service

        Returns:
            bool: True if the entity's log sources loaded properly
        """
        # Clear the list from any previous runs
        del self._entity_log_sources[:]

        # Get all logs for the configured service/entity (s3, kinesis, or sns)
        service_entities = self._config['sources'].get(service)
        if not service_entities:
            LOGGER.error('Service [%s] not declared in sources configuration',
                         service)
            return False

        config_entity = service_entities.get(entity)
        if not config_entity:
            LOGGER.error(
                'Entity [%s] not declared in sources configuration for service [%s]',
                entity, service)
            return False

        # Get a copy of the logs list by slicing here, not a pointer to the list reference
        self._entity_log_sources = config_entity['logs'][:]

        return bool(self._entity_log_sources)
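The nested lookups above (`self._config['sources'][service][entity]['logs']`) imply a sources mapping keyed by service, then entity. Below is a hedged sketch of such a structure; the service, entity, and log names are made up for illustration.

    # Illustrative sources structure only; entity and log names are made up.
    sources_config = {
        'kinesis': {
            'example_prefix_stream_alert_kinesis': {
                'logs': ['cloudwatch', 'osquery']
            }
        },
        's3': {
            'example-data-bucket': {
                'logs': ['carbonblack']
            }
        }
    }

    # Mirrors the lookups performed by load_sources() above
    entity_config = sources_config.get('kinesis', {}).get('example_prefix_stream_alert_kinesis', {})
    entity_logs = entity_config.get('logs', [])[:]  # slice copy, as in the method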
Example #3
    def _add_optional_keys(self, json_records, schema, optional_keys):
        """Add optional keys to a parsed JSON record.

        Args:
            json_records (list): JSONPath extracted JSON records
            schema (dict): The log type schema
            optional_keys (dict): The optional keys in the schema
        """
        if not optional_keys:
            return

        for key_name in optional_keys:
            # Instead of doing a schema.update() here with a default value type,
            # we should enforce having any optional keys declared within the schema
            # and log an error if that is not the case
            if key_name not in schema:
                LOGGER.error(
                    'Optional top level key \'%s\' '
                    'not found in declared log schema', key_name)
                continue
            # If the optional key isn't in our parsed json payload
            for record in json_records:
                if key_name not in record:
                    # Set default value
                    record[key_name] = self.default_optional_values(
                        schema[key_name])
Example #4
    def match_event(cls, record, rule):
        """Evaluate matchers on a record.

        Given a list of matchers, evaluate a record through each
        to find a match.  If any matcher is evaluated as false,
        the loop breaks and no further matchers are evaluated.
        Otherwise, returns True.

        Args:
            record: Record to be matched
            rule: Rule containing the list of matchers

        Returns:
            bool: result of matcher processing
        """
        # matchers are optional for rules
        if not rule.matchers:
            return True

        for matcher in rule.matchers:
            matcher_function = cls.__matchers.get(matcher)
            if matcher_function:
                try:
                    matcher_result = matcher_function(record)
                except Exception as err:  # pylint: disable=broad-except
                    matcher_result = False
                    LOGGER.error('%s: %s', matcher_function.__name__,
                                 err.message)
                if not matcher_result:
                    return False
            else:
                LOGGER.error('The matcher [%s] does not exist!', matcher)

        return True
Example #5
        def decorator(rule):
            """Rule decorator logic."""
            rule_name = rule.__name__
            logs = opts.get('logs')
            outputs = opts.get('outputs')
            matchers = opts.get('matchers')
            datatypes = opts.get('datatypes')
            req_subkeys = opts.get('req_subkeys')

            if not (logs or datatypes):
                LOGGER.error(
                    'Invalid rule [%s] - rule must have either \'logs\' or \''
                    'datatypes\' declared', rule_name)
                return

            if not outputs:
                LOGGER.error(
                    'Invalid rule [%s] - rule must have \'outputs\' declared',
                    rule_name)
                return

            if rule_name in cls.__rules:
                raise ValueError('rule [{}] already defined'.format(rule_name))
            cls.__rules[rule_name] = RuleAttributes(rule_name, rule, matchers,
                                                    datatypes, logs, outputs,
                                                    req_subkeys)
            return rule
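The decorator above registers rule functions, keyed by name, together with their options. Below is a self-contained sketch of the same registration pattern; the registry, the `RuleAttributes` fields, and the sample rule are illustrative stand-ins, not the project's actual API.

    from collections import namedtuple

    RuleAttributes = namedtuple('RuleAttributes', ['rule_name', 'rule_function', 'matchers',
                                                   'datatypes', 'logs', 'outputs', 'req_subkeys'])
    _RULES = {}

    def rule(**opts):
        """Minimal stand-in for the decorator factory shown above."""
        def decorator(func):
            name = func.__name__
            if name in _RULES:
                raise ValueError('rule [{}] already defined'.format(name))
            _RULES[name] = RuleAttributes(name, func, opts.get('matchers'), opts.get('datatypes'),
                                          opts.get('logs'), opts.get('outputs'), opts.get('req_subkeys'))
            return func
        return decorator

    @rule(logs=['example_log'], outputs=['example_output'])
    def suspicious_login(record):
        """Fires when a made-up field matches."""
        return record.get('event_type') == 'suspicious_login'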
Example #6
    def _load_enabled_log_sources(self, firehose_config, log_sources):
        """Load and expand all declared and enabled Firehose log sources

        Args:
            firehose_config (dict): Loaded Firehose config from global.json
            log_sources (dict): Loaded logs.json file

        Returns:
            set: Disabled logs
        """
        enabled_logs = set()
        for enabled_log in firehose_config.get('enabled_logs', []):
            enabled_log_parts = enabled_log.split(':')

            # Expand to all subtypes
            if len(enabled_log_parts) == 1:
                expanded_logs = [self.firehose_log_name(log_name) for log_name
                                 in log_sources
                                 if log_name.split(':')[0] == enabled_log_parts[0]]
                # If the list comprehension is falsy, it means no matching logs
                # were found while doing the expansion.
                if not expanded_logs:
                    LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)

                enabled_logs.update(expanded_logs)

            elif len(enabled_log_parts) == 2:
                if enabled_log not in log_sources:
                    LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)

                enabled_logs.add(self.firehose_log_name('_'.join(enabled_log_parts)))

        return enabled_logs
Example #7
    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error(
                        'Record does not match any defined schemas: %s\n%s',
                        record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug(
                'Processed %d valid record(s) that resulted in %d alert(s).',
                len(payload.records), len(record_alerts))

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
Example #8
def handler(event, context):
    """Main Lambda handler function"""
    try:
        StreamAlert(context).run(event)
    except Exception:
        LOGGER.error('Invocation event: %s', json.dumps(event))
        raise
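The handler simply forwards the Lambda `event` to `StreamAlert(context).run(event)`, which iterates `event['Records']`. For a Kinesis source, those records follow the standard Kinesis-to-Lambda event shape with base64-encoded data; the sketch below is illustrative only, with a made-up stream ARN and payload.

    import base64
    import json

    # Illustrative Kinesis-style event; the ARN and record data are made up.
    sample_event = {
        'Records': [
            {
                'eventSource': 'aws:kinesis',
                'eventSourceARN': 'arn:aws:kinesis:us-east-1:123456789012:stream/example_stream',
                'kinesis': {
                    'data': base64.b64encode(json.dumps({'field': 'value'}).encode()).decode()
                }
            }
        ]
    }

    # handler(sample_event, None) would then classify the record and run the rules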
Example #9
    def _parse(self, payload):
        """Parse a record into a declared type.

        Args:
            payload: A StreamAlert payload object

        Sets:
            payload.log_source: The detected log name from the data_sources config.
            payload.type: The record's type.
            payload.records: The parsed records as a list.

        Returns:
            bool: the success of the parse.
        """
        schema_matches = self._process_log_schemas(payload)

        if not schema_matches:
            return False

        if LOGGER_DEBUG_ENABLED:
            LOGGER.debug(
                'Schema Matched Records:\n%s',
                json.dumps([
                    schema_match.parsed_data for schema_match in schema_matches
                ],
                           indent=2))

        schema_match = self._check_schema_match(schema_matches)

        if LOGGER_DEBUG_ENABLED:
            LOGGER.debug('Log name: %s', schema_match.log_name)
            LOGGER.debug('Parsed data:\n%s',
                         json.dumps(schema_match.parsed_data, indent=2))

        for parsed_data_value in schema_match.parsed_data:
            # Convert data types per the schema
            # Use the root schema for the parser due to updates caused by
            # configuration settings such as envelope_keys and optional_keys
            try:
                if not self._convert_type(parsed_data_value,
                                          schema_match.root_schema):
                    return False
            except KeyError:
                LOGGER.error('The payload is mis-classified. Payload [%s]',
                             parsed_data_value)
                return False

        normalized_types = StreamThreatIntel.normalized_type_mapping()

        payload.log_source = schema_match.log_name
        payload.type = schema_match.parser.type()
        payload.records = schema_match.parsed_data
        payload.normalized_types = normalized_types.get(
            payload.log_source.split(':')[0])

        return True
Example #10
    def sink(self, alerts):
        """Sink triggered alerts from the StreamRules engine.

        Args:
            alerts (list): a list of dictionaries representing json alerts

        Sends a message to the alert processor with the following JSON format:
            {
                "record": record,
                "metadata": {
                    "rule_name": rule.rule_name,
                    "rule_description": rule.rule_function.__doc__,
                    "log": str(payload.log_source),
                    "outputs": rule.outputs,
                    "type": payload.type,
                    "source": {
                        "service": payload.service,
                        "entity": payload.entity
                    }
                }
            }
        """
        for alert in alerts:
            try:
                data = json.dumps(alert, default=lambda o: o.__dict__)
            except AttributeError as err:
                LOGGER.error(
                    'An error occurred while dumping alert to JSON: %s '
                    'Alert: %s', err.message, alert)
                continue

            try:
                response = self.client_lambda.invoke(
                    FunctionName=self.function,
                    InvocationType='Event',
                    Payload=data,
                    Qualifier='production')

            except ClientError as err:
                LOGGER.exception(
                    'An error occurred while sending alert to '
                    '\'%s:production\'. Error is: %s. Alert: %s',
                    self.function, err.response, data)
                continue

            if response['ResponseMetadata']['HTTPStatusCode'] != 202:
                LOGGER.error('Failed to send alert to \'%s\': %s',
                             self.function, data)
                continue

            if self.env['lambda_alias'] != 'development':
                LOGGER.info(
                    'Sent alert to \'%s\' with Lambda request ID \'%s\'',
                    self.function, response['ResponseMetadata']['RequestId'])
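The docstring above spells out the message format sent to the alert processor; a hedged example of one such alert is shown below, with every value made up.

    # Illustrative alert matching the documented format; all values are made up.
    example_alert = {
        'record': {'username': 'example-user', 'action': 'console_login'},
        'metadata': {
            'rule_name': 'example_rule',
            'rule_description': 'Example rule docstring.',
            'log': 'example_log:subtype',
            'outputs': ['aws-s3:example-bucket'],
            'type': 'json',
            'source': {
                'service': 'kinesis',
                'entity': 'example_stream'
            }
        }
    }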
Example #11
    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        payload_with_normalized_records = []
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error(
                        'Record does not match any defined schemas: %s\n%s',
                        record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            # Increment the total processed records to get an accurate assessment of throughput
            self._processed_record_count += len(record.records)

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid, record.log_source, record.entity)

            record_alerts, normalized_records = self._rule_engine.process(
                record)

            payload_with_normalized_records.extend(normalized_records)

            LOGGER.debug(
                'Processed %d valid record(s) that resulted in %d alert(s).',
                len(payload.records), len(record_alerts))

            # Add all parsed records to the categorized payload dict only if Firehose is enabled
            if self._firehose_client:
                # Only send payloads with enabled log sources
                if self._firehose_client.enabled_log_source(
                        payload.log_source):
                    self._firehose_client.categorized_payloads[
                        payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)

        return payload_with_normalized_records
Example #12
 def _check_record_batch(batch):
     """Helper function to verify record size"""
     for index, record in enumerate(batch):
         if len(str(record)) > MAX_RECORD_SIZE:
             # Show the first 1k bytes in order to not overload
             # CloudWatch logs
             LOGGER.error('The following record is too large to '
                          'be sent to Firehose: %s', str(record)[:1000])
             MetricLogger.log_metric(FUNCTION_NAME,
                                     MetricLogger.FIREHOSE_FAILED_RECORDS,
                                     1)
             batch.pop(index)
Example #13
    def _firehose_request_helper(self, stream_name, record_batch):
        """Send record batches to Firehose

        Args:
            stream_name (str): The name of the Delivery Stream to send to
            record_batch (list): The records to send
        """
        record_batch_size = len(record_batch)
        resp = {}

        try:
            LOGGER.debug('Sending %d records to Firehose:%s',
                         record_batch_size,
                         stream_name)
            resp = self.firehose_client.put_record_batch(
                DeliveryStreamName=stream_name,
                # The newline at the end is required by Firehose,
                # otherwise all records will be on a single line and
                # unsearchable in Athena.
                Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                             separators=(",", ":")) + '\n'}
                         for record
                         in record_batch])
        except ClientError as firehose_err:
            LOGGER.error(firehose_err)
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    record_batch_size)
            return

        # Error handle if failures occurred in PutRecordBatch
        # TODO(jack) implement backoff here for additional message reliability
        if resp.get('FailedPutCount') > 0:
            failed_records = [failed
                              for failed
                              in resp['RequestResponses']
                              if failed.get('ErrorCode')]
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    resp['FailedPutCount'])
            # Only print the first 100 failed records to Cloudwatch logs
            LOGGER.error('The following records failed to Put to the '
                         'Delivery Stream %s: %s',
                         stream_name,
                         json.dumps(failed_records[:100], indent=2))
        else:
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_RECORDS_SENT,
                                    record_batch_size)
            LOGGER.info('Successfully sent %d messages to Firehose:%s',
                        record_batch_size,
                        stream_name)
Example #14
    def enabled_log_source(cls, log_source_name):
        """Check that the incoming record is an enabled log source for Firehose

        Args:
            log_source_name (str): The log source of the record

        Returns:
            bool: Whether or not the log source is enabled to send to Firehose
        """
        if not cls._ENABLED_LOGS:
            LOGGER.error('Enabled logs not loaded')
            return False

        return cls.firehose_log_name(log_source_name) in cls._ENABLED_LOGS
Example #15
        def firehose_request_wrapper(data):
            """Firehose request wrapper to use with backoff"""
            # Use the current length of data here so we can track failed records that are retried
            LOGGER.info('[Firehose] Sending %d records to %s', len(data), stream_name)

            response = self._client.put_record_batch(DeliveryStreamName=stream_name, Records=data)

            # Log this as an error for now so it can be picked up in logs
            if response['FailedPutCount'] > 0:
                LOGGER.error('Received non-zero FailedPutCount: %d', response['FailedPutCount'])
                # Strip out the successful records so only the failed ones are retried. This happens
                # to the list of dictionary objects, so the called function sees the updated list
                self._strip_successful_records(data, response)

            return response
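`_strip_successful_records` is not shown in this example. Since PutRecordBatch returns a `RequestResponses` list parallel to the submitted `Records`, one plausible implementation keeps only the entries whose response carries an `ErrorCode`; the sketch below is an assumption about that helper, not the project's code.

    def _strip_successful_records(self, data, response):
        """Assumed helper: drop records whose parallel response entry has no ErrorCode."""
        failed_indices = [
            index for index, result in enumerate(response['RequestResponses'])
            if result.get('ErrorCode')
        ]
        # Mutate the list in place so the caller's reference sees the reduced batch
        data[:] = [data[index] for index in failed_indices]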
Example #16
    def _limit_record_size(cls, batch):
        """Limits the batch size sent to Firehose by popping large records

        Args:
            batch (list): Record batch to iterate on
        """
        for index, record in enumerate(batch):
            if len(json.dumps(record, separators=(",", ":"))) > cls.MAX_RECORD_SIZE:
                # Show the first 1k bytes in order to not overload CloudWatch logs
                LOGGER.error('The following record is too large to '
                             'be sent to Firehose: %s', str(record)[:1000])
                MetricLogger.log_metric(FUNCTION_NAME,
                                        MetricLogger.FIREHOSE_FAILED_RECORDS,
                                        1)
                batch.pop(index)
Example #17
def load_stream_payload(service, entity, raw_record):
    """Returns the right StreamPayload subclass for this service

    Args:
        service (str): service name to load class for
        entity (str): entity for this service
        raw_record (str): record raw payload data
    """
    payload_map = {'s3': S3Payload,
                   'sns': SnsPayload,
                   'kinesis': KinesisPayload}

    if service not in payload_map:
        LOGGER.error('Service payload not supported: %s', service)
        return

    return payload_map[service](raw_record=raw_record, entity=entity)
Example #18
    def _process_alerts(self, payload):
        """Process records for alerts and send them to the correct places

        Args:
            payload (StreamPayload): StreamAlert payload object being processed
        """
        for record in payload.pre_parse():
            # Increment the processed size using the length of this record
            self._processed_size += len(record.pre_parsed_record)
            self.classifier.classify_record(record)
            if not record.valid:
                if self.env['lambda_alias'] != 'development':
                    LOGGER.error('Record does not match any defined schemas: %s\n%s',
                                 record, record.pre_parsed_record)

                self._failed_record_count += 1
                continue

            LOGGER.debug(
                'Classified and Parsed Payload: <Valid: %s, Log Source: %s, Entity: %s>',
                record.valid,
                record.log_source,
                record.entity)

            record_alerts = StreamRules.process(record)

            LOGGER.debug('Processed %d valid record(s) that resulted in %d alert(s).',
                         len(payload.records),
                         len(record_alerts))

            # Add all parsed records to the categorized payload dict
            # only if Firehose is enabled
            if self.firehose_client:
                # Only send payloads with enabled types
                if payload.log_source.split(':')[0] not in self.config['global'] \
                    ['infrastructure'].get('firehose', {}).get('disabled_logs', []):
                    self.categorized_payloads[payload.log_source].extend(payload.records)

            if not record_alerts:
                continue

            # Extend the list of alerts with any new ones so they can be returned
            self._alerts.extend(record_alerts)

            if self.enable_alert_processor:
                self.sinker.sink(record_alerts)
Example #19
    def load_enabled_log_sources(cls, firehose_config, log_sources, force_load=False):
        """Load and expand all declared and enabled Firehose log sources

        Args:
            firehose_config (dict): Loaded Firehose config from global.json
            log_sources (dict): Loaded logs.json file
            force_load (bool=False): Set to True if the log sources should be reloaded
                even if there are cached values

        Returns:
            dict: Enabled logs, key: sanitized table name, value: log type value
        """
        # Do not reload the logs if they are already cached
        if cls._ENABLED_LOGS and not force_load:
            return cls._ENABLED_LOGS

        # Nothing to load if no configs passed
        if not (firehose_config and log_sources):
            return cls._ENABLED_LOGS

        # Expand enabled logs into specific subtypes
        for enabled_log in firehose_config.get('enabled_logs', {}):
            enabled_log_parts = enabled_log.split(':')

            # Expand to all subtypes
            if len(enabled_log_parts) == 1:
                expanded_logs = {cls.firehose_log_name(log_name): enabled_log
                                 for log_name in log_sources
                                 if log_name.split(':')[0] == enabled_log_parts[0]}

                if not expanded_logs:
                    LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)

                cls._ENABLED_LOGS.update(expanded_logs)

            elif len(enabled_log_parts) == 2:
                if enabled_log not in log_sources:
                    LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)
                    continue

                cls._ENABLED_LOGS[cls.firehose_log_name('_'.join(enabled_log_parts))] = enabled_log

        return cls._ENABLED_LOGS
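A small, self-contained illustration of the expansion step above: a one-part name such as 'cloudwatch' expands to every 'cloudwatch:*' subtype present in the log sources, while a two-part name is taken as-is. The log names are made up, and `_sanitized` is only a stand-in for `firehose_log_name`, whose real implementation is not shown here.

    def _sanitized(log_name):
        """Stand-in for firehose_log_name(); the real sanitization may differ."""
        return log_name.replace(':', '_').replace('-', '_')

    log_sources = {'cloudwatch:events': {}, 'cloudwatch:flow_logs': {}, 'osquery:differential': {}}
    enabled = {}

    for enabled_log in ['cloudwatch', 'osquery:differential']:
        parts = enabled_log.split(':')
        if len(parts) == 1:
            enabled.update({_sanitized(name): enabled_log
                            for name in log_sources if name.split(':')[0] == parts[0]})
        elif len(parts) == 2 and enabled_log in log_sources:
            enabled[_sanitized('_'.join(parts))] = enabled_log

    # enabled == {'cloudwatch_events': 'cloudwatch', 'cloudwatch_flow_logs': 'cloudwatch',
    #             'osquery_differential': 'osquery:differential'}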
Example #20
    def parse(self, schema, data):
        """Parse a key value string into a dictionary.

        Args:
            schema (dict): Parsing schema.
            data (str): Data to be parsed.

        Returns:
            list: A list of dictionaries representing parsed records OR
            False if the columns do not match.
        """
        # get the delimiter (character between key/value pairs) and the
        # separator (the character between keys and values)
        delimiter = self.options.get('delimiter', self.__default_delimiter)
        separator = self.options.get('separator', self.__default_separator)

        kv_payload = {}
        try:
            # remove any blank strings that may exist in our list
            fields = [field for field in data.split(delimiter) if field]
            # first check the field length matches our # of keys
            if len(fields) != len(schema):
                return False

            regex = re.compile('.+{}.+'.format(separator))
            for index, field in enumerate(fields):
                # verify our fields match the kv regex
                if regex.match(field):
                    key, value = field.split(separator)
                    # handle duplicate keys
                    if key in kv_payload:
                        # load key from our configuration
                        kv_payload[schema.keys()[index]] = value
                    else:
                        # load key from data
                        kv_payload[key] = value
                else:
                    LOGGER.error('key/value regex failure for %s', field)

            return [kv_payload]
        except UnicodeDecodeError:
            return False
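A self-contained sketch of the same delimiter/separator idea on a made-up record, using a space between pairs and '=' between key and value (whether these match the parser's real defaults is an assumption):

    import re

    data = 'user=alice action=login result=success'
    delimiter, separator = ' ', '='

    # Remove any blank strings, then keep only fields that look like key<sep>value
    fields = [field for field in data.split(delimiter) if field]
    pair_regex = re.compile('.+{}.+'.format(re.escape(separator)))

    parsed = {}
    for field in fields:
        if pair_regex.match(field):
            key, value = field.split(separator)
            parsed[key] = value

    # parsed == {'user': 'alice', 'action': 'login', 'result': 'success'}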
Example #21
    def _process_ioc(self, ioc_collections):
        """Check if any info is malicious by querying DynamoDB IOC table

        Args:
            ioc_collections (list): A list of StreamIoc instances.
        """
        LOGGER.debug('[Threat Intel] Rule Processor queries %d IOCs',
                     len(ioc_collections))
        # Segment data before calling DynamoDB table with batch_get_item.
        for subset in self._segment(ioc_collections):
            query_values = []
            for ioc in subset:
                if ioc.value not in query_values:
                    query_values.append(ioc.value)

            query_result = []

            query_error_msg = 'An error occurred while querying the DynamoDB table. Error is: %s'
            try:
                result, unprocessed_keys = self._query(query_values)
                query_result.extend(result)
            except ClientError as err:
                LOGGER.error(query_error_msg, err.response)
                return
            except ParamValidationError as err:
                LOGGER.error(query_error_msg, err)
                return

            # If there are unprocessed keys, we will re-query once with unprocessed
            # keys only
            if unprocessed_keys:
                deserializer = self._deserialize(
                    unprocessed_keys[self._table]['Keys'])
                query_values = [elem[PRIMARY_KEY] for elem in deserializer]
                query_error_msg = 'An error occurred while processing unprocessed keys. Error is: %s'
                try:
                    result, _ = self._query(query_values)
                    query_result.extend(result)
                except ClientError as err:
                    LOGGER.error(query_error_msg, err.response)
                    return
                except ParamValidationError as err:
                    LOGGER.error(query_error_msg, err)
                    return

            for value in ioc_collections:
                for ioc in query_result:
                    if value.value == ioc[PRIMARY_KEY]:
                        value.sub_type = ioc[SUB_TYPE_KEY]
                        value.is_ioc = True
                        continue
Example #22
    def _add_optional_keys(json_records, schema, optional_keys):
        """Add optional keys to a parsed JSON record.

        Args:
            json_records (list): JSONPath extracted JSON records
            schema (dict): The log type schema
            optional_keys (dict): The optional keys in the schema
        """
        if not optional_keys:
            return

        def _default_optional_values(key):
            """Return a default value for a given schema type"""
            if key == 'string':
                return str()
            elif key == 'integer':
                return int()
            elif key == 'float':
                return float()
            elif key == 'boolean':
                return bool()
            elif key == []:
                return list()
            elif key == OrderedDict():
                return dict()

        for key_name in optional_keys:
            # Instead of doing a schema.update() here with a default value type,
            # we should enforce having any optional keys declared within the schema
            # and log an error if that is not the case
            if key_name not in schema:
                LOGGER.error(
                    'Optional top level key \'%s\' '
                    'not found in declared log schema', key_name)
                continue
            # If the optional key isn't in our parsed json payload
            for record in json_records:
                if key_name not in record:
                    # Set default value
                    record[key_name] = _default_optional_values(
                        schema[key_name])
Example #23
    def _check_schema_match(schema_matches):
        """Check to see if the log matches multiple schemas. If so, fall back
        on using log_patterns to look for the proper log. If no log_patterns
        exist, or they do not resolve the problem, fall back on using the
        first matched schema.

        Args:
            schema_matches (list): A list of tuples containing the info for schemas that have
                validly parsed this record. Each tuple is: (log_name, parser, parsed_data)

        Returns:
            tuple: The proper tuple to use for parsing from the list of tuples
        """
        # If there is only one parse or we do not have support for multiple schemas
        # enabled, then just return the first parse that was valid
        if len(schema_matches) == 1 or not SUPPORT_MULTIPLE_SCHEMA_MATCHING:
            return schema_matches[0]

        matches = []
        for i, schema_match in enumerate(schema_matches):
            log_patterns = schema_match.parser.options.get('log_patterns', {})
            LOGGER.debug('Log patterns: %s', log_patterns)
            if (all(
                    schema_match.parser.matched_log_pattern(
                        data, log_patterns)
                    for data in schema_match.parsed_data)):
                matches.append(schema_matches[i])
            else:
                if LOGGER_DEBUG_ENABLED:
                    LOGGER.debug(
                        'Log pattern matching failed for:\n%s',
                        json.dumps(schema_match.parsed_data, indent=2))

        if matches:
            if len(matches) > 1:
                LOGGER.error('Log patterns matched for multiple schemas: %s',
                             ', '.join(match.log_name for match in matches))
                LOGGER.error('Proceeding with schema for: %s',
                             matches[0].log_name)

            return matches[0]

        LOGGER.error('Log classification matched for multiple schemas: %s',
                     ', '.join(match.log_name for match in schema_matches))
        LOGGER.error('Proceeding with schema for: %s',
                     schema_matches[0].log_name)

        return schema_matches[0]
Example #24
    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains:
        available data sources, log formats, parser modes, and sinks.  Classifies
        logs sent into the stream into a parsed type.  Matches records against
        rules.

        Args:
            event: An AWS event mapped to a specific source/entity (kinesis stream or
                an s3 bucket event) containing data emitted to the stream.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS,
                                len(records))

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(
                raw_record)
            if not service:
                LOGGER.error(
                    'No valid service found in payload\'s raw record. Skipping '
                    'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debugging logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        return self._failed_record_count == 0
Example #25
    def _firehose_request_helper(self, stream_name, record_batch):
        """Send record batches to Firehose

        Args:
            stream_name (str): The name of the Delivery Stream to send to
            record_batch (list): The records to send
        """
        resp = {}
        record_batch_size = len(record_batch)
        exceptions_to_backoff = (ClientError, ConnectionError)

        @backoff.on_predicate(backoff.fibo,
                              lambda resp: resp['FailedPutCount'] > 0,
                              max_tries=self.MAX_BACKOFF_ATTEMPTS,
                              max_value=self.MAX_BACKOFF_FIBO_VALUE,
                              jitter=backoff.full_jitter,
                              on_backoff=backoff_handler,
                              on_success=success_handler,
                              on_giveup=giveup_handler)
        @backoff.on_exception(backoff.fibo,
                              exceptions_to_backoff,
                              max_tries=self.MAX_BACKOFF_ATTEMPTS,
                              jitter=backoff.full_jitter,
                              on_backoff=backoff_handler,
                              on_success=success_handler,
                              on_giveup=giveup_handler)
        def firehose_request_wrapper(data):
            """Firehose request wrapper to use with backoff"""
            LOGGER.info('[Firehose] Sending %d records to %s',
                        record_batch_size,
                        stream_name)
            return self._firehose_client.put_record_batch(
                DeliveryStreamName=stream_name,
                Records=data)

        # The newline at the end is required by Firehose,
        # otherwise all records will be on a single line and
        # unsearchable in Athena.
        records_data = [
            {'Data': json.dumps(self.sanitize_keys(record), separators=(",", ":")) + '\n'}
            for record in record_batch
        ]

        # The try/except here is to catch the raised error at the
        # end of the backoff.
        try:
            resp = firehose_request_wrapper(records_data)
        except exceptions_to_backoff as firehose_err:
            LOGGER.error(firehose_err)
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    record_batch_size)
            return

        # Error handle if failures occurred in PutRecordBatch after
        # several backoff attempts
        if resp.get('FailedPutCount') > 0:
            failed_records = [failed
                              for failed
                              in resp['RequestResponses']
                              if failed.get('ErrorCode')]
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    resp['FailedPutCount'])
            # Only print the first 100 failed records to Cloudwatch logs
            LOGGER.error('[Firehose] The following records failed to put to '
                         'the Delivery Stream %s: %s',
                         stream_name,
                         json.dumps(failed_records[:100], indent=2))
        else:
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_RECORDS_SENT,
                                    record_batch_size)
            LOGGER.info('[Firehose] Successfully sent %d messages to %s with RequestId [%s]',
                        record_batch_size,
                        stream_name,
                        resp.get('ResponseMetadata', {}).get('RequestId', ''))
Example #26
    def _send_to_firehose(self):
        """Send all classified records to a respective Firehose Delivery Stream"""
        def _chunk(record_list, chunk_size):
            """Helper function to chunk payloads"""
            for item in range(0, len(record_list), chunk_size):
                yield record_list[item:item + chunk_size]

        def _check_record_batch(batch):
            """Helper function to verify record size"""
            for index, record in enumerate(batch):
                if len(str(record)) > MAX_RECORD_SIZE:
                    # Show the first 1k bytes in order to not overload
                    # CloudWatch logs
                    LOGGER.error('The following record is too large to '
                                 'be sent to Firehose: %s', str(record)[:1000])
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_FAILED_RECORDS,
                                            1)
                    batch.pop(index)

        delivery_stream_name_pattern = 'streamalert_data_{}'

        # Iterate through each payload type
        for log_type, records in self.categorized_payloads.items():
            # This same method is used when naming the Delivery Streams
            formatted_log_type = log_type.replace(':', '_')

            for record_batch in _chunk(records, MAX_BATCH_SIZE):
                stream_name = delivery_stream_name_pattern.format(formatted_log_type)
                _check_record_batch(record_batch)

                resp = self.firehose_client.put_record_batch(
                    DeliveryStreamName=stream_name,
                    # The newline at the end is required by Firehose,
                    # otherwise all records will be on a single line and
                    # unsearchable in Athena.
                    Records=[{'Data': json.dumps(record, separators=(",", ":")) + '\n'}
                             for record
                             in record_batch])

                # Error handle if failures occurred
                # TODO(jack) implement backoff here once the rule processor is split
                if resp.get('FailedPutCount') > 0:
                    failed_records = [failed
                                      for failed
                                      in resp['RequestResponses']
                                      if failed.get('ErrorCode')]
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_FAILED_RECORDS,
                                            resp['FailedPutCount'])
                    # Only print the first 100 failed records
                    LOGGER.error('The following records failed to Put to the '
                                 'Delivery Stream %s: %s',
                                 stream_name,
                                 json.dumps(failed_records[:100], indent=2))
                else:
                    MetricLogger.log_metric(FUNCTION_NAME,
                                            MetricLogger.FIREHOSE_RECORDS_SENT,
                                            len(record_batch))
                    LOGGER.info('Successfully sent %d messages to Firehose:%s',
                                len(record_batch),
                                stream_name)
Example #27
    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.
        Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of Records: %d', len(records))
        if not records:
            return False

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS, len(records))

        firehose_config = self.config['global'].get(
            'infrastructure', {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self.firehose_client = boto3.client('firehose',
                                                region_name=self.env['lambda_region'])

        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(raw_record)
            if not service:
                LOGGER.error('No valid service found in payload\'s raw record. Skipping '
                             'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            self._process_alerts(payload)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debugging logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug('Alerts:\n%s', json.dumps(self._alerts, indent=2))

        if self.firehose_client:
            self._send_to_firehose()

        return self._failed_record_count == 0
Example #28
    def run(self, event):
        """StreamAlert Lambda function handler.

        Loads the configuration for the StreamAlert function which contains
        available data sources, log schemas, normalized types, and outputs.
        Classifies logs sent into a parsed type.
        Matches records against rules.

        Args:
            event (dict): An AWS event mapped to a specific source/entity
                containing data read by Lambda.

        Returns:
            bool: True if all logs being parsed match a schema
        """
        records = event.get('Records', [])
        LOGGER.debug('Number of incoming records: %d', len(records))
        if not records:
            return False

        firehose_config = self.config['global'].get('infrastructure',
                                                    {}).get('firehose', {})
        if firehose_config.get('enabled'):
            self._firehose_client = StreamAlertFirehose(
                self.env['lambda_region'], firehose_config,
                self.config['logs'])

        payload_with_normalized_records = []
        for raw_record in records:
            # Get the service and entity from the payload. If the service/entity
            # is not in our config, log an error and go on to the next record
            service, entity = self.classifier.extract_service_and_entity(
                raw_record)
            if not service:
                LOGGER.error(
                    'No valid service found in payload\'s raw record. Skipping '
                    'record: %s', raw_record)
                continue

            if not entity:
                LOGGER.error(
                    'Unable to extract entity from payload\'s raw record for service %s. '
                    'Skipping record: %s', service, raw_record)
                continue

            # Cache the log sources for this service and entity on the classifier
            if not self.classifier.load_sources(service, entity):
                continue

            # Create the StreamPayload to use for encapsulating parsed info
            payload = load_stream_payload(service, entity, raw_record)
            if not payload:
                continue

            payload_with_normalized_records.extend(
                self._process_alerts(payload))

        # Log normalized records metric
        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.NORMALIZED_RECORDS,
                                len(payload_with_normalized_records))

        # Apply Threat Intel to normalized records in the end of Rule Processor invocation
        record_alerts = self._rules_engine.threat_intel_match(
            payload_with_normalized_records)
        self._alerts.extend(record_alerts)
        if record_alerts:
            self.alert_forwarder.send_alerts(record_alerts)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TOTAL_RECORDS,
                                self._processed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME,
                                MetricLogger.TOTAL_PROCESSED_SIZE,
                                self._processed_size)

        LOGGER.debug('Invalid record count: %d', self._failed_record_count)

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_PARSES,
                                self._failed_record_count)

        LOGGER.debug('%s alerts triggered', len(self._alerts))

        MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.TRIGGERED_ALERTS,
                                len(self._alerts))

        # Check if debugging logging is on before json dumping alerts since
        # this can be time consuming if there are a lot of alerts
        if self._alerts and LOGGER.isEnabledFor(LOG_LEVEL_DEBUG):
            LOGGER.debug(
                'Alerts:\n%s',
                json.dumps([alert.output_dict() for alert in self._alerts],
                           indent=2,
                           sort_keys=True))

        if self._firehose_client:
            self._firehose_client.send()

        # Only log rule info here if this is not running tests
        # During testing, this gets logged at the end and printing here could be confusing
        # since stress testing calls this method multiple times
        if self.env['lambda_alias'] != 'development':
            stats.print_rule_stats(True)

        return self._failed_record_count == 0
Example #29
    def _firehose_request_helper(self, stream_name, record_batch):
        """Send record batches to Firehose

        Args:
            stream_name (str): The name of the Delivery Stream to send to
            record_batch (list): The records to send
        """
        exceptions_to_backoff = (ClientError, ConnectionError, Timeout)

        @backoff.on_predicate(backoff.fibo,
                              lambda resp: resp['FailedPutCount'] > 0,
                              max_tries=self.MAX_BACKOFF_ATTEMPTS,
                              max_value=self.MAX_BACKOFF_FIBO_VALUE,
                              jitter=backoff.full_jitter,
                              on_backoff=backoff_handler(debug_only=False),
                              on_success=success_handler(),
                              on_giveup=giveup_handler())
        @backoff.on_exception(backoff.fibo,
                              exceptions_to_backoff,
                              max_tries=self.MAX_BACKOFF_ATTEMPTS,
                              jitter=backoff.full_jitter,
                              on_backoff=backoff_handler(debug_only=False),
                              on_success=success_handler(),
                              on_giveup=giveup_handler())
        def firehose_request_wrapper(data):
            """Firehose request wrapper to use with backoff"""
            # Use the current length of data here so we can track failed records that are retried
            LOGGER.info('[Firehose] Sending %d records to %s', len(data), stream_name)

            response = self._client.put_record_batch(DeliveryStreamName=stream_name, Records=data)

            # Log this as an error for now so it can be picked up in logs
            if response['FailedPutCount'] > 0:
                LOGGER.error('Received non-zero FailedPutCount: %d', response['FailedPutCount'])
                # Strip out the successful records so only the failed ones are retried. This happens
                # to the list of dictionary objects, so the called function sees the updated list
                self._strip_successful_records(data, response)

            return response

        original_batch_size = len(record_batch)

        # The newline at the end is required by Firehose,
        # otherwise all records will be on a single line and
        # unsearchable in Athena.
        records_data = [
            {'Data': json.dumps(self.sanitize_keys(record), separators=(",", ":")) + '\n'}
            for record in record_batch
        ]

        # The try/except here is to catch the raised error at the end of the backoff
        try:
            resp = firehose_request_wrapper(records_data)
        except exceptions_to_backoff as firehose_err:
            LOGGER.error(firehose_err)
            # Use the current length of the records_data in case some records were
            # successful but others were not
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    len(records_data))
            return

        # Error handle if failures occurred in PutRecordBatch after
        # several backoff attempts
        if resp.get('FailedPutCount') > 0:
            failed_records = [failed
                              for failed
                              in resp['RequestResponses']
                              if failed.get('ErrorCode')]
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_FAILED_RECORDS,
                                    resp['FailedPutCount'])
            # Only print the first 100 failed records to Cloudwatch logs
            LOGGER.error('[Firehose] The following records failed to put to '
                         'the Delivery Stream %s: %s',
                         stream_name,
                         json.dumps(failed_records[:100], indent=2))
        else:
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FIREHOSE_RECORDS_SENT,
                                    original_batch_size)
            LOGGER.info('[Firehose] Successfully sent %d messages to %s with RequestId [%s]',
                        original_batch_size,
                        stream_name,
                        resp.get('ResponseMetadata', {}).get('RequestId', ''))
Example #30
    def _convert_type(cls, payload, schema):
        """Convert a parsed payload's values into their declared types.

        If the schema is incorrectly defined for a particular field,
        this function will return False which will make the payload
        invalid.

        Args:
            payload (dict): Parsed payload dict
            schema (dict): data schema for a specific log source

        Returns:
            dict: parsed dict payload with typed values
        """
        for key, value in schema.iteritems():
            key = str(key)
            # if the schema value is declared as string
            if value == 'string':
                try:
                    payload[key] = str(payload[key])
                except UnicodeEncodeError:
                    payload[key] = unicode(payload[key])

            # if the schema value is declared as integer
            elif value == 'integer':
                try:
                    payload[key] = int(payload[key])
                except ValueError:
                    LOGGER.error(
                        'Invalid schema. Value for key [%s] is not an int: %s',
                        key, payload[key])
                    return False

            elif value == 'float':
                try:
                    payload[key] = float(payload[key])
                except ValueError:
                    LOGGER.error(
                        'Invalid schema. Value for key [%s] is not a float: %s',
                        key, payload[key])
                    return False

            elif value == 'boolean':
                payload[key] = str(payload[key]).lower() == 'true'

            elif isinstance(value, dict):
                if not value:
                    continue  # allow empty maps (dict)

                # Skip the values for the 'streamalert:envelope_keys' key that we've
                # added during parsing if they do not conform to being a dict
                if key == 'streamalert:envelope_keys' and not isinstance(
                        payload[key], dict):
                    continue

                cls._convert_type(payload[key], schema[key])

            elif isinstance(value, list):
                pass

            else:
                LOGGER.error('Unsupported schema type: %s', value)

        return True
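A small, self-contained illustration of the kind of conversion performed above, using a made-up schema and payload:

    schema = {'username': 'string', 'attempts': 'integer', 'score': 'float', 'success': 'boolean'}
    payload = {'username': 'alice', 'attempts': '3', 'score': '0.75', 'success': 'True'}

    for key, declared_type in schema.items():
        if declared_type == 'string':
            payload[key] = str(payload[key])
        elif declared_type == 'integer':
            payload[key] = int(payload[key])
        elif declared_type == 'float':
            payload[key] = float(payload[key])
        elif declared_type == 'boolean':
            payload[key] = str(payload[key]).lower() == 'true'

    # payload == {'username': 'alice', 'attempts': 3, 'score': 0.75, 'success': True}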