def test_sanitize_keys(self):
    """FirehoseClient - Sanitize Keys

    Feeds an event whose nested 'data' keys contain '-', '*' and '.' through
    FirehoseClient.sanitize_keys and checks that each of those characters
    comes back as an underscore, while keys without them and all values are
    left untouched.
    """
    # Top-level fields that are expected to pass through unchanged
    # (the event layout mirrors the test_log_type_json_nested log type)
    envelope = {
        'date': 'January 01, 3005',
        'unixtime': '32661446400',
        'host': 'my-host.name.website.com',
    }

    # Each event gets its own top-level dict and its own nested 'data' dict,
    # so the input and the expectation never share a mutable object
    raw_event = dict(
        envelope,
        data={
            'super-duper': 'secret',
            'sanitize_me': 1,
            'example-key': 1,
            'moar**data': 2,
            'even.more': 3,
        }
    )
    expected_event = dict(
        envelope,
        data={
            'super_duper': 'secret',
            'sanitize_me': 1,
            'example_key': 1,
            'moar__data': 2,
            'even_more': 3,
        }
    )

    assert_equal(FirehoseClient.sanitize_keys(raw_event), expected_event)
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Nothing is returned; every outcome (created, skipped, or failed) is
    reported through LOGGER_CLI.

    Args:
        table (str): The name of the table being created
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type. Overrides are only
            applied to log tables, never to the 'alerts' table. Entries that are
            not in column_name=type format are logged and skipped.
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'],
        config['logs']
    )

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.firehose_log_name(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER_CLI.error('Table name %s missing from configuration or '
                         'is not enabled.', sanitized_table_name)
        return

    athena_client = get_athena_client(config)

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER_CLI.info('The \'%s\' table already exists.', sanitized_table_name)
        return

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=table, bucket=bucket)

    else:  # all other tables are log types
        # The config is keyed by the log name with the first underscore of the
        # table name swapped for a colon
        log_info = config['logs'][table.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = (
                    helpers.logs_schema_to_athena_schema(sanitized_envelope_key_schema))

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the normal log schema
        for override in schema_override or ():
            # Exactly one '=' with a non-empty value on each side is required.
            # Anything else is reported and skipped, rather than raising on the
            # tuple unpacking or being applied with an empty name/type.
            override_parts = override.split('=')
            if len(override_parts) != 2 or not all(override_parts):
                LOGGER_CLI.error('Invalid schema override [%s], use column_name=type format',
                                 override)
                continue

            column_name, column_type = override_parts

            # Columns are escaped to avoid Hive issues with special characters
            column_name = '`{}`'.format(column_name)
            if column_name in athena_schema:
                athena_schema[column_name] = column_type
                LOGGER_CLI.info('Applied schema override: %s:%s', column_name, column_type)
            else:
                LOGGER_CLI.error(
                    'Schema override column %s not found in Athena Schema, skipping',
                    column_name)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=sanitized_table_name, bucket=bucket)

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER_CLI.error('The %s table could not be created', sanitized_table_name)
        return

    # Update the CLI config: a bucket used for a log table is recorded under the
    # Athena partition refresh config (with the value 'data') if not already there
    if table != 'alerts':
        refresh_buckets = config['lambda']['athena_partition_refresh_config']['buckets']
        if bucket not in refresh_buckets:
            refresh_buckets[bucket] = 'data'
            config.write()

    LOGGER_CLI.info('The %s table was successfully created!', sanitized_table_name)