示例#1
0
    def test_sanitize_keys(self):
        """FirehoseClient - Sanitize Keys"""
        # Modeled on the test_log_type_json_nested log type.
        # These top-level fields are expected to come back unchanged, so they
        # are declared once and shared by the input and the expectation.
        common_fields = {
            'date': 'January 01, 3005',
            'unixtime': '32661446400',
            'host': 'my-host.name.website.com',
        }

        # Nested keys containing '-', '*' and '.' characters
        raw_event = dict(common_fields, data={
            'super-duper': 'secret',
            'sanitize_me': 1,
            'example-key': 1,
            'moar**data': 2,
            'even.more': 3,
        })

        # Same nested values, with each special character replaced by '_'
        expected_event = dict(common_fields, data={
            'super_duper': 'secret',
            'sanitize_me': 1,
            'example_key': 1,
            'moar__data': 2,
            'even_more': 3,
        })

        assert_equal(FirehoseClient.sanitize_keys(raw_event), expected_event)
示例#2
0
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Nothing is created when the table is neither 'alerts' nor an enabled Firehose
    log source, when the table already exists, or when the CREATE TABLE query
    fails. Each of those cases is logged and the function returns early.

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type. Entries that are not
            in column_name=type format, or that name a column missing from the
            Athena schema, are logged and skipped.
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'],
        config['logs']
    )

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.firehose_log_name(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER_CLI.error('Table name %s missing from configuration or '
                         'is not enabled.', sanitized_table_name)
        return

    athena_client = get_athena_client(config)

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER_CLI.info('The \'%s\' table already exists.', sanitized_table_name)
        return

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=table, bucket=bucket)

    else: # all other tables are log types

        # Log types are keyed as 'source:type' in the config, while table names
        # use an underscore, so only the first underscore is converted back
        log_info = config['logs'][table.replace('_', ':', 1)]

        # Copy the schema so the loaded config is not the object being sanitized
        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = helpers.logs_schema_to_athena_schema(
                    sanitized_envelope_key_schema)

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                # Exactly one '=' with a non-empty value on each side is required.
                # Anything else is reported and skipped instead of raising on
                # unpacking or being applied with an empty name/type.
                parts = override.split('=')
                if len(parts) != 2 or not all(parts):
                    LOGGER_CLI.error('Invalid schema override [%s], use column_name=type format',
                                     override)
                    continue

                column_name, column_type = parts

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s', column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=sanitized_table_name, bucket=bucket)

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER_CLI.error('The %s table could not be created', sanitized_table_name)
        return

    # Update the CLI config
    #   Buckets for log tables are registered under the partition refresh config,
    #   and the config is only written when the bucket is not already listed
    if (table != 'alerts' and
            bucket not in config['lambda']['athena_partition_refresh_config']['buckets']):
        config['lambda']['athena_partition_refresh_config']['buckets'][bucket] = 'data'
        config.write()

    LOGGER_CLI.info('The %s table was successfully created!', sanitized_table_name)