Example #1
def handler(event, context):
    """Main Lambda handler function"""
    try:
        StreamAlert(context).run(event)
    except Exception:
        LOGGER.error('Invocation event: %s', json.dumps(event))
        raise
Example #2
    def _load_rule_table(cls, config):
        """Load and return a RuleTable class for communicating with the DynamoDB rule table

        Args:
            config (dict): Loaded configuration from 'conf/' directory

        Returns:
            rule_table.RuleTable: Loaded frontend for DynamoDB rules table
        """
        # Ensure the rules table is enabled
        rt_config = config['global']['infrastructure']['rules_table']
        if not rt_config.get('enabled', False):
            return

        now = datetime.utcnow()
        refresh_delta = timedelta(
            minutes=rt_config.get('cache_refresh_minutes', 10))

        # The rule table will need to be refreshed if the refresh interval has been surpassed
        needs_refresh = cls._RULE_TABLE_LAST_REFRESH + refresh_delta < now

        if not needs_refresh:
            LOGGER.debug(
                'Rule table does not need to be refreshed (last refresh time: %s; '
                'current time: %s)', cls._RULE_TABLE_LAST_REFRESH, now)
            return

        LOGGER.info(
            'Refreshing rule table (last refresh time: %s; current time: %s)',
            cls._RULE_TABLE_LAST_REFRESH, now)

        table_name = '{}_streamalert_rules'.format(
            config['global']['account']['prefix'])
        cls._RULE_TABLE = RuleTable(table_name)
        cls._RULE_TABLE_LAST_REFRESH = now
Example #3
    def _add_optional_keys(self, json_records, schema, optional_keys):
        """Add optional keys to a parsed JSON record.

        Args:
            json_records (list): JSONPath extracted JSON records
            schema (dict): The log type schema
            optional_keys (dict): The optional keys in the schema
        """
        if not optional_keys:
            return

        for key_name in optional_keys:
            # Instead of doing a schema.update() here with a default value type,
            # we should enforce having any optional keys declared within the schema
            # and log an error if that is not the case
            if key_name not in schema:
                LOGGER.error(
                    'Optional top level key \'%s\' '
                    'not found in declared log schema', key_name)
                continue
            # If the optional key isn't in our parsed json payload
            for record in json_records:
                if key_name not in record:
                    # Set default value
                    record[key_name] = self.default_optional_values(
                        schema[key_name])
Example #4
    def process_subkeys(cls, record, payload_type, rule):
        """Check payload record contains all subkeys needed for rules

        Because each log is processed by every rule for a given log type,
        it's possible that a rule references a subkey that doesn't exist in
        that specific log. This method verifies that the declared subkeys
        in a rule are contained in the JSON payload prior to rule processing.

        Args:
            record: Payload record to process
            payload_type (str): type of the record
            rule: Rule attributes

        Returns:
            bool: result of subkey check.
        """
        if not rule.req_subkeys or payload_type != 'json':
            return True

        for key, nested_keys in rule.req_subkeys.iteritems():
            # This is an extra layer of protection when
            # verifying a subkey exists in a record with a null value.
            # In the case of CloudTrail, a top level key has been
            # observed as either a map with subkeys, or null.
            if not record.get(key):
                LOGGER.debug(
                    'The required subkey %s is not found when trying to process %s: \n%s',
                    key, rule.rule_name, json.dumps(record, indent=2))
                return False
            if not all(x in record[key] for x in nested_keys):
                return False

        return True
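To make the subkey check concrete, the same idea can be exercised with plain data; the record shape and required subkeys below are invented for this sketch and are not part of the project.

# Standalone sketch of the req_subkeys check above; the record and the
# required subkeys are placeholders for illustration only.
record = {'detail': {'eventName': 'PutObject', 'userIdentity': {'type': 'Root'}}}
req_subkeys = {'detail': ['eventName', 'userIdentity']}

passes = all(
    record.get(key) and all(nested in record[key] for nested in nested_keys)
    for key, nested_keys in req_subkeys.items()
)
print(passes)  # True: both nested keys exist under a non-null 'detail'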
Example #5
    def load_sources(self, service, entity):
        """Load the sources for this payload.

        Args:
            service (str): Source service
            entity (str): Entity within the service

        Returns:
            bool: True if the entity's log sources loaded properly
        """
        # Clear the list from any previous runs
        del self._entity_log_sources[:]

        # Get all logs for the configured service/entity (s3, kinesis, or sns)
        service_entities = self._config['sources'].get(service)
        if not service_entities:
            LOGGER.error('Service [%s] not declared in sources configuration',
                         service)
            return False

        config_entity = service_entities.get(entity)
        if not config_entity:
            LOGGER.error(
                'Entity [%s] not declared in sources configuration for service [%s]',
                entity, service)
            return False

        # Get a copy of the logs list by slicing here, not a pointer to the list reference
        self._entity_log_sources = config_entity['logs'][:]

        return bool(self._entity_log_sources)
Example #6
    def process(cls, input_payload):
        """Process rules on a record.

        Gather a list of rules based on the record's datasource type.
        For each rule, evaluate the record through all listed matchers
        and the rule itself to determine if a match occurs.

        Returns:
            list: alerts

            An alert is represented as a dictionary with the following keys:
                record: the parsed record that triggered the alert
                rule_name: the name of the triggered rule
                rule_description: the docstring of the rule function
                log_source/log_type: metadata for the originating log
                outputs: list of outputs to send to
                source_service/source_entity: where the record came from
        """
        alerts = []
        payload = copy(input_payload)

        rules = [
            rule_attrs for rule_attrs in cls.__rules.values()
            if payload.log_source in rule_attrs.logs
        ]

        if not rules:
            LOGGER.debug('No rules to process for %s', payload)
            return alerts

        for record in payload.records:
            for rule in rules:
                # subkey check
                has_sub_keys = cls.process_subkeys(record, payload.type, rule)
                if not has_sub_keys:
                    continue

                # matcher check
                matcher_result = cls.match_event(record, rule)
                if not matcher_result:
                    continue

                # rule analysis
                rule_result = cls.process_rule(record, rule)
                if rule_result:
                    LOGGER.info(
                        'Rule [%s] triggered an alert on log type [%s] from entity \'%s\' '
                        'in service \'%s\'', rule.rule_name,
                        payload.log_source, payload.entity, payload.service())
                    alert = {
                        'record': record,
                        'rule_name': rule.rule_name,
                        'rule_description': rule.rule_function.__doc__
                        or DEFAULT_RULE_DESCRIPTION,
                        'log_source': str(payload.log_source),
                        'log_type': payload.type,
                        'outputs': rule.outputs,
                        'source_service': payload.service(),
                        'source_entity': payload.entity
                    }
                    alerts.append(alert)

        return alerts
Example #7
        def decorator(rule):
            """Rule decorator logic."""
            rule_name = rule.__name__
            logs = opts.get('logs')
            outputs = opts.get('outputs')
            matchers = opts.get('matchers')
            datatypes = opts.get('datatypes')
            req_subkeys = opts.get('req_subkeys')

            if not (logs or datatypes):
                LOGGER.error(
                    'Invalid rule [%s] - rule must have either \'logs\' or \''
                    'datatypes\' declared', rule_name)
                return

            if not outputs:
                LOGGER.error(
                    'Invalid rule [%s] - rule must have \'outputs\' declared',
                    rule_name)
                return

            if rule_name in cls.__rules:
                raise ValueError('rule [{}] already defined'.format(rule_name))
            cls.__rules[rule_name] = RuleAttributes(rule_name, rule, matchers,
                                                    datatypes, logs, outputs,
                                                    req_subkeys)
            return rule
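For context, rules registered through a decorator factory like this are typically declared as shown below. This is a hedged sketch: it assumes the factory is exposed as a `rule` decorator, and the log name, output, and record fields are placeholders rather than values from any real configuration.

# Hypothetical rule declaration; 'cloudtrail:events', the output name, and the
# inspected fields are illustrative placeholders.
@rule(logs=['cloudtrail:events'],
      outputs=['aws-s3:sample-bucket'],
      req_subkeys={'detail': ['eventName', 'userIdentity']})
def sample_root_account_usage(rec):
    """Fires when an action is performed by the root account"""
    return rec['detail']['userIdentity'].get('type') == 'Root'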
Example #8
    def _read_downloaded_s3_object(s3_object):
        """Read lines from a downloaded file from S3

        Supports reading both gzipped files and plaintext files.

        Args:
            s3_object (str): A full path to the downloaded file.

        Yields:
            tuple: Line number (int) and line (str) from the downloaded S3 object.
        """
        _, extension = os.path.splitext(s3_object)

        if extension == '.gz':
            with gzip.open(s3_object, 'r') as s3_file:
                for num, line in enumerate(s3_file, start=1):
                    yield num, line.rstrip()
        else:
            with open(s3_object, 'r') as s3_file:
                for num, line in enumerate(s3_file, start=1):
                    yield num, line.rstrip()

        # AWS Lambda apparently does not reallocate disk space when files are
        # removed using os.remove(), so we must truncate them before removal
        with open(s3_object, 'w'):
            pass

        os.remove(s3_object)
        if not os.path.exists(s3_object):
            LOGGER.debug('Removed temp S3 file: %s', s3_object)
        else:
            LOGGER.error('Failed to remove temp S3 file: %s', s3_object)
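A reader like this can be exercised locally by writing a small gzipped fixture and iterating over it the same way. The sketch below is self-contained (Python 3 text-mode gzip) and the file contents are made up.

import gzip
import os
import tempfile

# Write a tiny gzipped fixture, then read it back as (line_number, line)
# pairs, mirroring what the generator above yields for '.gz' objects.
tmp_path = os.path.join(tempfile.gettempdir(), 'sample_log.gz')
with gzip.open(tmp_path, 'wt') as handle:
    handle.write('first line\nsecond line\n')

with gzip.open(tmp_path, 'rt') as handle:
    for num, line in enumerate(handle, start=1):
        print(num, line.rstrip())

os.remove(tmp_path)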
Example #9
    def parse(self, schema, data):
        """Parse a string into a list of JSON payloads.

        Args:
            schema (dict): Parsing schema.
            data (str|dict): Data to be parsed.

        Returns:
            list: A list of dictionaries representing parsed records OR
            False if the data is not JSON or the data does not follow the schema.
        """
        if isinstance(data, (unicode, str)):
            try:
                loaded_data = json.loads(data)
            except ValueError as err:
                LOGGER.debug('JSON parse failed: %s', str(err))
                LOGGER.debug('JSON parse could not load data: %s', str(data))
                return False
            else:
                json_records = self._parse_records(schema, loaded_data)
        else:
            json_records = self._parse_records(schema, data)

        if not json_records:
            return False

        self._add_optional_keys(json_records, schema,
                                self.options.get('optional_top_level_keys'))
        # Make sure all keys match the schema, including nests maps
        if not self._key_check(schema, json_records):
            return False

        return json_records
Example #10
    def _load_enabled_log_sources(self, firehose_config, log_sources):
        """Load and expand all declared and enabled Firehose log sources

        Args:
            firehose_config (dict): Loaded Firehose config from global.json
            log_sources (dict): Loaded logs.json file

        Returns:
            set: Enabled logs
        """
        enabled_logs = set()
        for enabled_log in firehose_config.get('enabled_logs', []):
            enabled_log_parts = enabled_log.split(':')

            # Expand to all subtypes
            if len(enabled_log_parts) == 1:
                expanded_logs = [self.firehose_log_name(log_name) for log_name
                                 in log_sources
                                 if log_name.split(':')[0] == enabled_log_parts[0]]
                # If the list comprehension is empty, it means no matching logs
                # were found while doing the expansion.
                if not expanded_logs:
                    LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)

                enabled_logs.update(expanded_logs)

            elif len(enabled_log_parts) == 2:
                if enabled_log not in log_sources:
                    LOGGER.error('Enabled Firehose log %s not declared in logs.json', enabled_log)

                enabled_logs.add(self.firehose_log_name('_'.join(enabled_log_parts)))

        return enabled_logs
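The subtype expansion step can be illustrated with plain data. In this sketch, firehose_log_name is approximated as a ':' to '_' replacement, which is an assumption, and the log names are placeholders.

# Standalone illustration of expanding 'cloudwatch' to all of its subtypes.
log_sources = {'cloudwatch:events': {}, 'cloudwatch:flow_logs': {}, 'osquery:info': {}}
enabled_log = 'cloudwatch'

expanded = [name.replace(':', '_') for name in log_sources
            if name.split(':')[0] == enabled_log]
print(sorted(expanded))  # ['cloudwatch_events', 'cloudwatch_flow_logs']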
Example #11
    def _get_object(self):
        """Given an S3 record, download and parse the data.

        Returns:
            str: Path to the downloaded s3 object.
        """
        # Use the urllib unquote method to decode any url encoded characters
        # (ie - %26 --> &) from the bucket and key names
        unquoted = lambda (data): unquote(data).decode('utf-8')
        region = self.raw_record['awsRegion']

        bucket = unquoted(self.raw_record['s3']['bucket']['name'])
        key = unquoted(self.raw_record['s3']['object']['key'])
        self.s3_object_size = int(self.raw_record['s3']['object']['size'])

        LOGGER.debug(
            'Pre-parsing record from S3. Bucket: %s, Key: %s, Size: %d',
            bucket, key, self.s3_object_size)

        try:
            return self._download_object(region, bucket, key)
        except IOError:
            LOGGER.exception(
                '[S3Payload] The following error occurred while downloading')
            return
Example #12
    def _validate_type_mapping(mapping_str):
        """Static method to extract normalized type and IOC type from qualified str

        Args:
            mapping_str (str): A qualified string has pattern 'normalized_type:ioc_type'

        Returns:
            tuple (bool, str, str):
                bool: Whether the string is a qualified string containing both
                    a normalized CEF type and an IOC type.
                str: The normalized type.
                str: The IOC type.
        """
        normalized_type = None
        ioc_type = None

        splitted_str = mapping_str.split(':')
        if len(splitted_str) == 1:
            normalized_type = splitted_str[0]
        elif len(splitted_str) == 2:
            normalized_type = splitted_str[0]
            ioc_type = splitted_str[1].split('_')[-1]
        else:
            LOGGER.info('Key %s in conf/types.json is incorrect', mapping_str)
            return False, None, None

        if normalized_type and ioc_type:
            return True, normalized_type, ioc_type

        return False, normalized_type, None
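Given the splitting logic above, the three possible outcomes look roughly as follows; the normalized type and IOC type names are placeholders, and the calls assume the static method is reachable directly.

# Illustrative inputs only; type names are made up.
print(_validate_type_mapping('sourceAddress:ioc_ip'))  # (True, 'sourceAddress', 'ip')
print(_validate_type_mapping('sourceAddress'))         # (False, 'sourceAddress', None)
print(_validate_type_mapping('a:b:c'))                 # (False, None, None), after logging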
Example #13
    def rule_analysis(record, rule, payload, alerts):
        """Class method to analyze rule against a record

        Args:
            record (dict): A parsed log with data.
            rule: Rule attributes.
            payload: The StreamPayload object.
            alerts (list): A list of alerts which will be sent to alert processor.

        Returns:
            None: Matching alerts are appended to the passed alerts list.
        """
        rule_result = StreamRules.process_rule(record, rule)
        if rule_result:
            if StreamRules.check_alerts_duplication(record, rule, alerts):
                return

            LOGGER.info(
                'Rule [%s] triggered an alert on log type [%s] from entity \'%s\' '
                'in service \'%s\'', rule.rule_name, payload.log_source,
                payload.entity, payload.service())
            alert = {
                'record': record,
                'rule_name': rule.rule_name,
                'rule_description': rule.rule_function.__doc__
                or DEFAULT_RULE_DESCRIPTION,
                'log_source': str(payload.log_source),
                'log_type': payload.type,
                'outputs': rule.outputs,
                'source_service': payload.service(),
                'source_entity': payload.entity,
                'context': rule.context
            }

            alerts.append(alert)
Example #14
def test_pre_parse_s3_debug(s3_mock, log_mock, _):
    """S3Payload - Pre Parse, Debug On"""
    # Cache the logger level
    log_level = LOGGER.getEffectiveLevel()

    # Increase the logger level to debug
    LOGGER.setLevel(logging.DEBUG)

    records = ['_first_line_test_' * 10, '_second_line_test_' * 10]

    s3_mock.side_effect = [((100, records[0]), (200, records[1]))]

    raw_record = make_s3_raw_record('unit_bucket_name', 'unit_key_name')
    s3_payload = load_stream_payload('s3', 'unit_key_name', raw_record)
    S3Payload.s3_object_size = 350

    _ = [_ for _ in s3_payload.pre_parse()]

    calls = [
        call(
            'Processed %s S3 records out of an approximate total of %s '
            '(average record size: %s bytes, total size: %s bytes)', 100, 350,
            1, 350),
        call(
            'Processed %s S3 records out of an approximate total of %s '
            '(average record size: %s bytes, total size: %s bytes)', 200, 350,
            1, 350)
    ]

    log_mock.assert_has_calls(calls)

    # Reset the logger level and stop the patchers
    LOGGER.setLevel(log_level)
Example #15
    def _extract_json_path(self, json_payload):
        """Extract records from the original json payload using a provided JSON path

        Args:
            json_payload (dict): The parsed json data

        Returns:
            list: A list of JSON records extracted via JSON path or regex
        """
        records = []
        json_path_expression = self.options.get('json_path')
        if not json_path_expression:
            return records

        # Handle jsonpath extraction of records
        LOGGER.debug('Parsing records with JSONPath')
        records_jsonpath = jsonpath_rw.parse(json_path_expression)

        # If the csv parser is extracting csv from json, the payload is likely
        # a string and needs to be loaded to a dict
        if not isinstance(json_payload, dict):
            json_payload = json.loads(json_payload)

        matches = records_jsonpath.find(json_payload)
        if not matches:
            return False

        return [match.value for match in matches]
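The jsonpath_rw calls used above can be tried in isolation; the payload and path expression below are invented for this sketch.

import jsonpath_rw

# Minimal standalone JSONPath extraction mirroring the method above.
payload = {'Records': [{'id': 1}, {'id': 2}]}
expression = jsonpath_rw.parse('Records[*]')
records = [match.value for match in expression.find(payload)]
print(records)  # [{'id': 1}, {'id': 2}]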
Example #16
    def match_event(cls, record, rule):
        """Evaluate matchers on a record.

        Given a list of matchers, evaluate a record through each
        to find a match.  If any matcher is evaluated as false,
        the loop breaks and no further matchers are evaluated.
        Otherwise, returns True.

        Args:
            record: Record to be matched
            rule: Rule containing the list of matchers

        Returns:
            bool: result of matcher processing
        """
        # matchers are optional for rules
        if not rule.matchers:
            return True

        for matcher in rule.matchers:
            matcher_function = cls.__matchers.get(matcher)
            if matcher_function:
                try:
                    matcher_result = matcher_function(record)
                except Exception as err:  # pylint: disable=broad-except
                    matcher_result = False
                    LOGGER.error('%s: %s', matcher_function.__name__,
                                 err.message)
                if not matcher_result:
                    return False
            else:
                LOGGER.error('The matcher [%s] does not exist!', matcher)

        return True
Example #17
    def _parse_records(self, schema, json_payload):
        """Identify and extract nested payloads from parsed JSON records.

        Nested payloads can be detected with log_patterns (`records` should be a
        JSONPath selector that yields the desired nested records). If desired,
        fields present on the root record can be merged into child events
        using the `envelope_keys` option.

        Args:
            json_payload (dict): The parsed json data

        Returns:
            list: A list of parsed JSON records
        """
        # Check options and return the payload if there is nothing special to do
        if not self.options:
            return [json_payload]

        envelope_schema = self.options.get('envelope_keys')
        optional_envelope_keys = self.options.get('optional_envelope_keys')

        # If the schema has a defined envelope schema with optional keys in
        # the envelope, fill in any missing optional keys with default values.
        # This occurs in some cases when using json_regex_key.
        if envelope_schema and optional_envelope_keys:
            missing_keys_schema = {}
            for key in optional_envelope_keys:
                if key not in json_payload:
                    missing_keys_schema[key] = envelope_schema[key]
            if missing_keys_schema:
                self._add_optional_keys([json_payload], envelope_schema,
                                        missing_keys_schema)

        # If the envelope schema is defined and all envelope keys are required
        # to be present in the record.
        elif envelope_schema and not all(x in json_payload
                                         for x in envelope_schema):
            return [json_payload]

        envelope = {}
        if envelope_schema:
            LOGGER.debug('Parsing envelope keys')
            schema.update({ENVELOPE_KEY: envelope_schema})
            envelope_keys = envelope_schema.keys()
            envelope_jsonpath = jsonpath_rw.parse("$." +
                                                  ",".join(envelope_keys))
            envelope_matches = [
                match.value for match in envelope_jsonpath.find(json_payload)
            ]
            envelope = dict(zip(envelope_keys, envelope_matches))

        json_records = self._extract_records(json_payload, envelope)
        if json_records is False:
            return False

        # If the final parsed record is singular
        if not json_records:
            json_records.append(json_payload)

        return json_records
Example #18
 def firehose_request_wrapper(data):
     """Firehose request wrapper to use with backoff"""
     LOGGER.info('[Firehose] Sending %d records to %s',
                 record_batch_size,
                 stream_name)
     return self._firehose_client.put_record_batch(
         DeliveryStreamName=stream_name,
         Records=data)
Example #19
 def _send_to_dynamo(self, alerts):
     """Write alerts in batches to Dynamo."""
     # The batch_writer() automatically handles buffering, batching, and retrying failed items
     with self.table.batch_writer() as batch:
         for alert in alerts:
             batch.put_item(Item=self.dynamo_record(alert))
     LOGGER.info('Successfully sent %d alert(s) to dynamo:%s', len(alerts),
                 self.table.table_name)
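batch_writer() is the standard boto3 DynamoDB helper used here; a minimal standalone version of the same write pattern might look like this, where the table name and item fields are placeholders rather than StreamAlert's schema.

import boto3

# Sketch of batch-writing items with boto3's batch_writer context manager.
table = boto3.resource('dynamodb').Table('sample-alerts-table')
alerts = [{'AlertID': 'abc-123', 'RuleName': 'sample_rule'}]

with table.batch_writer() as batch:
    for alert in alerts:
        batch.put_item(Item=alert)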
Example #20
    def _shred_temp_directory():
        """Delete all objects in the container's temp directory"""
        LOGGER.debug('Shredding temp directory')

        for root, dirs, files in os.walk(tempfile.gettempdir(), topdown=False):
            for name in files:
                subprocess.check_call([ #nosec
                    'shred', '--force', '--iterations=1',
                    '--remove', os.path.join(root, name)])
            for name in dirs:
                os.rmdir(os.path.join(root, name)) #nosec
Example #21
 def _check_record_batch(batch):
     """Helper function to verify record size"""
     for index, record in enumerate(batch):
         if len(str(record)) > MAX_RECORD_SIZE:
             # Show the first 1k bytes in order to not overload
             # CloudWatch logs
             LOGGER.error('The following record is too large to '
                          'be sent to Firehose: %s', str(record)[:1000])
             MetricLogger.log_metric(FUNCTION_NAME,
                                     MetricLogger.FIREHOSE_FAILED_RECORDS,
                                     1)
             batch.pop(index)
Example #22
    def read_compressed_files(cls, intel_dir, delimiter=','):
        """Read intelligence into memory

        Read all intelligence from csv.gz files located in the threat_intel
        directory into a dictionary. CSV filenames should follow the convention
        <ioc_type_as_basename>.csv.gz. The basename (without extension) of each
        csv file will be the key in the returned dictionary.

        Returns:
            (dict): Threat intelligence in the following format:
                {
                    "domain": {
                        "evil1.com": ["apt_domain", "source1 reported evil1.com"],
                        "evil2.com": ["c2_domain", "source2 reported evil2.com"]
                    },
                    "ip": {
                        "1.1.1.2": ["scan_ip", "source reported ip1"],
                        "2.2.2.2": ["scan_ip", "source reported ip2"]
                    },
                    "url": {
                        "www.hacker.com/evil_page": ["mal_url", "source_foo"]
                    },
                    "md5": {
                        "0123456789abcdef0123456789abcdef": ["mal_md5", "source_bar"]
                    }
                }
            None: if the intelligence directory does not exist
        """
        if not os.path.exists(intel_dir):
            return

        gz_files = [
            os.path.join(intel_dir, gz_file)
            for gz_file in os.listdir(intel_dir) if gz_file.endswith('.gz')
        ]

        for gz_file in gz_files:
            with gzip.open(gz_file, 'r') as ioc_file:
                csv_reader = csv.reader(ioc_file, delimiter=delimiter)
                ioc_type = os.path.basename(gz_file).split('.')[0]
                if ioc_type not in cls.__intelligence:
                    cls.__intelligence[ioc_type] = dict()
                for row in csv_reader:
                    if len(row) < 2:
                        LOGGER.debug(
                            'Warning, each row in CSV file should '
                            'contain at least two fields. Bad row [%s]', row)
                        continue
                    cls.__intelligence[ioc_type][row[0]] = row[1:]

        return cls.__intelligence
Example #23
    def _process_ioc(self, ioc_collections):
        """Check if any info is malicious by querying DynamoDB IOC table

        Args:
            ioc_collections (list): A list of StreamIoc instances.
        """
        LOGGER.debug('[Threat Intel] Rule Processor queries %d IOCs',
                     len(ioc_collections))
        # Segment data before calling DynamoDB table with batch_get_item.
        for subset in self._segment(ioc_collections):
            query_values = []
            for ioc in subset:
                if ioc.value not in query_values:
                    query_values.append(ioc.value)

            query_result = []

            query_error_msg = 'An error occurred while querying the DynamoDB table. Error is: %s'
            try:
                result, unprocessed_keys = self._query(query_values)
                query_result.extend(result)
            except ClientError as err:
                LOGGER.error(query_error_msg, err.response)
                return
            except ParamValidationError as err:
                LOGGER.error(query_error_msg, err)
                return

            # If there are unprocessed keys, we will re-query once with unprocessed
            # keys only
            if unprocessed_keys:
                deserializer = self._deserialize(
                    unprocessed_keys[self._table]['Keys'])
                query_values = [elem[PRIMARY_KEY] for elem in deserializer]
                query_error_msg = 'An error occurred while processing unprocessed keys. Error is: %s'
                try:
                    result, _ = self._query(query_values)
                    query_result.extend(result)
                except ClientError as err:
                    LOGGER.error(query_error_msg, err.response)
                    return
                except ParamValidationError as err:
                    LOGGER.error(query_error_msg, err)
                    return

            for value in ioc_collections:
                for ioc in query_result:
                    if value.value == ioc[PRIMARY_KEY]:
                        value.sub_type = ioc[SUB_TYPE_KEY]
                        value.is_ioc = True
                        continue
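The retry-on-unprocessed-keys flow above follows the generic DynamoDB batch_get_item pattern; a stripped-down sketch of that pattern, with a made-up table and key name, is shown below.

import boto3

# Generic batch_get_item call that retries once on UnprocessedKeys; the table
# and attribute names are placeholders for illustration.
client = boto3.client('dynamodb')
request = {'sample_ioc_table': {'Keys': [{'ioc_value': {'S': '1.1.1.2'}}]}}

response = client.batch_get_item(RequestItems=request)
items = response.get('Responses', {}).get('sample_ioc_table', [])
if response.get('UnprocessedKeys'):
    retry = client.batch_get_item(RequestItems=response['UnprocessedKeys'])
    items.extend(retry.get('Responses', {}).get('sample_ioc_table', []))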
Example #24
 def firehose_request_wrapper():
     """Firehose request wrapper to use with backoff"""
     LOGGER.info('[Firehose] Sending %d records to %s',
                 record_batch_size,
                 stream_name)
     return self._firehose_client.put_record_batch(
         DeliveryStreamName=stream_name,
         # The newline at the end is required by Firehose,
         # otherwise all records will be on a single line and
         # unsearchable in Athena.
         Records=[{'Data': json.dumps(self.sanitize_keys(record),
                                      separators=(",", ":")) + '\n'}
                  for record
                  in record_batch])
Example #25
    def send_alerts(self, alerts):
        """Send alerts to the Alert Processor and to the alerts Dynamo table.

        Args:
            alerts (list): A list of dictionaries representing json alerts.
        """
        try:
            self._send_to_dynamo(alerts)
        except ClientError:
            # The batch_writer() automatically retries transient errors - any raised ClientError
            # is likely unrecoverable. Log an exception and metric
            LOGGER.exception('Error saving alerts to Dynamo')
            MetricLogger.log_metric(FUNCTION_NAME,
                                    MetricLogger.FAILED_DYNAMO_WRITES, 1)
Example #26
    def _process_log_schemas(self, payload):
        """Get any log schemas that matched this log format

        Args:
            payload: A StreamAlert payload object

        Returns:
            list: Contains any schemas that matched this log format
                Each list entry contains the namedtuple of 'SchemaMatch' with
                values of log_name, root_schema, parser, and parsed_data
        """
        schema_match = namedtuple(
            'SchemaMatch', 'log_name, root_schema, parser, parsed_data')
        schema_matches = []
        log_info = self.get_log_info_for_source()

        # Loop over all logs declared in logs.json
        for log_name, attributes in log_info.iteritems():
            # Get the parser type to use for this log
            parser_name = payload.type or attributes['parser']

            schema = attributes['schema']
            options = attributes.get('configuration', {})

            # Setup the parser class
            parser_class = get_parser(parser_name)
            parser = parser_class(options)

            # Get a list of parsed records
            LOGGER.debug('Trying schema: %s', log_name)
            parsed_data = parser.parse(schema, payload.pre_parsed_record)

            if not parsed_data:
                continue

            LOGGER.debug('Parsed %d records with schema %s', len(parsed_data),
                         log_name)

            if SUPPORT_MULTIPLE_SCHEMA_MATCHING:
                schema_matches.append(
                    schema_match(log_name, schema, parser, parsed_data))
                continue

            log_patterns = parser.options.get('log_patterns')
            if all(
                    parser.matched_log_pattern(rec, log_patterns)
                    for rec in parsed_data):
                return [schema_match(log_name, schema, parser, parsed_data)]

        return schema_matches
Example #27
    def enabled_log_source(cls, log_source_name):
        """Check that the incoming record is an enabled log source for Firehose

        Args:
            log_source_name (str): The log source of the record

        Returns:
            bool: Whether or not the log source is enabled to send to Firehose
        """
        if not cls._ENABLED_LOGS:
            LOGGER.error('Enabled logs not loaded')
            return False

        return cls.firehose_log_name(log_source_name) in cls._ENABLED_LOGS
Example #28
    def pre_parse(self):
        """Pre-parsing method for SNS records. Extracts the SNS payload from the
        record itself and sets it as the `pre_parsed_record` property.

        Yields:
            This object with the pre_parsed_record now set
        """
        LOGGER.debug(
            'Pre-parsing record from SNS. MessageId: %s, EventSubscriptionArn: %s',
            self.raw_record['Sns']['MessageId'],
            self.raw_record['EventSubscriptionArn'])

        self.pre_parsed_record = self.raw_record['Sns']['Message']

        yield self
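The fields read above correspond to the standard shape of an SNS record inside a Lambda event; the skeleton below shows only the keys that are accessed, with placeholder values.

# Skeleton of a Lambda SNS event record; values are placeholders.
raw_record = {
    'EventSubscriptionArn': 'arn:aws:sns:us-east-1:123456789012:sample-topic',
    'Sns': {
        'MessageId': '00000000-0000-0000-0000-000000000000',
        'Message': '{"key": "value"}'
    }
}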
Example #29
    def _limit_record_size(cls, batch):
        """Limits the batch size sent to Firehose by popping large records

        Args:
            batch (list): Record batch to iterate on
        """
        for index, record in enumerate(batch):
            if len(json.dumps(record, separators=(",", ":"))) > cls.MAX_RECORD_SIZE:
                # Show the first 1k bytes in order to not overload CloudWatch logs
                LOGGER.error('The following record is too large to '
                             'be sent to Firehose: %s', str(record)[:1000])
                MetricLogger.log_metric(FUNCTION_NAME,
                                        MetricLogger.FIREHOSE_FAILED_RECORDS,
                                        1)
                batch.pop(index)
Example #30
        def firehose_request_wrapper(data):
            """Firehose request wrapper to use with backoff"""
            # Use the current length of data here so we can track failed records that are retried
            LOGGER.info('[Firehose] Sending %d records to %s', len(data), stream_name)

            response = self._client.put_record_batch(DeliveryStreamName=stream_name, Records=data)

            # Log this as an error for now so it can be picked up in logs
            if response['FailedPutCount'] > 0:
                LOGGER.error('Received non-zero FailedPutCount: %d', response['FailedPutCount'])
                # Strip out the successful records so only the failed ones are retried. This happens
                # to the list of dictionary objects, so the called function sees the updated list
                self._strip_successful_records(data, response)

            return response
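Wrappers like firehose_request_wrapper are meant to be retried with the backoff library; a minimal sketch of that pairing is shown below, where the client, stream name, and retry policy are assumptions for illustration.

import backoff
from botocore.exceptions import ClientError

# Hypothetical retry wrapper around the Firehose PutRecordBatch API using
# exponential backoff; max_tries and the exception type are assumptions.
@backoff.on_exception(backoff.expo, ClientError, max_tries=3)
def send_batch(client, stream_name, records):
    return client.put_record_batch(DeliveryStreamName=stream_name, Records=records)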