def test_sanitize_keys(self):
    """StreamAlertFirehose - Sanitize Keys"""
    # test_log_type_json_nested
    # The top-level keys need no sanitizing, so the same values are used for
    # both the input event and the expected result.
    top_level = {
        'date': 'January 01, 3005',
        'unixtime': '32661446400',
        'host': 'my-host.name.website.com',
    }

    # Nested keys containing '-', '*' and '.' characters...
    raw_event = dict(top_level, data={
        'super-duper': 'secret',
        'sanitize_me': 1,
        'example-key': 1,
        'moar**data': 2,
        'even.more': 3,
    })

    # ...are expected to come back with each such character as an underscore.
    expected_event = dict(top_level, data={
        'super_duper': 'secret',
        'sanitize_me': 1,
        'example_key': 1,
        'moar__data': 2,
        'even_more': 3,
    })

    result = StreamAlertFirehose.sanitize_keys(raw_event)

    assert_equal(result, expected_event)
def create_table(athena_client, options, config):
    """Create a 'streamalert' Athena table

    Validates the CLI arguments, builds the CREATE TABLE statement for either
    a 'data' (log type) table or the 'alerts' table, runs it through the
    Athena client and, on success, records the bucket in the CLI config.

    Args:
        athena_client (boto3.client): Instantiated CLI AthenaClient
        options (namedtuple): The parsed args passed from the CLI. Reads
            bucket, refresh_type, type, table_name and schema_override.
        config (CLIConfig): Loaded StreamAlert CLI

    Returns:
        None: Problems are reported through LOGGER_CLI and the function
            returns early without creating a table.
    """
    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'],
        config['logs'])

    if not options.bucket:
        LOGGER_CLI.error('Missing command line argument --bucket')
        return

    if not options.refresh_type:
        LOGGER_CLI.error('Missing command line argument --refresh_type')
        return

    # Bound up front so that an unrecognized options.type falls through to
    # the `if query` guard below instead of raising a NameError.
    query = None
    sanitized_table_name = None

    if options.type == 'data':
        if not options.table_name:
            LOGGER_CLI.error('Missing command line argument --table_name')
            return

        # Convert special characters in schema name to underscores
        sanitized_table_name = sa_firehose.firehose_log_name(
            options.table_name)

        # Check that the log type is enabled via Firehose
        if sanitized_table_name not in sa_firehose.enabled_logs:
            LOGGER_CLI.error(
                'Table name %s missing from configuration or '
                'is not enabled.', sanitized_table_name)
            return

        # Check if the table exists
        if athena_client.check_table_exists(sanitized_table_name):
            LOGGER_CLI.info('The \'%s\' table already exists.',
                            sanitized_table_name)
            return

        # Config keys use ':' where the CLI table name uses the first '_'
        log_info = config['logs'][options.table_name.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = handler_helpers.to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = (
                    handler_helpers.to_athena_schema(
                        sanitized_envelope_key_schema))

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the
        #   normal log schema
        if options.schema_override:
            for override in options.schema_override:
                # Split on the first '=' only, so extra '=' characters end up
                # in the type rather than breaking tuple unpacking.
                column_name, separator, column_type = override.partition('=')
                if not (separator and column_name and column_type):
                    LOGGER_CLI.error(
                        'Invalid schema override [%s], use column_name=type format',
                        override)
                    # Abort rather than apply a malformed override
                    return

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s',
                                    column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=options.bucket)

    elif options.type == 'alerts':
        if athena_client.check_table_exists(options.type):
            LOGGER_CLI.info('The \'alerts\' table already exists.')
            return

        query = ALERTS_TABLE_STATEMENT.format(bucket=options.bucket)

    if query:
        create_table_success, _ = athena_client.run_athena_query(
            query=query,
            database='streamalert')

        if create_table_success:
            # Update the CLI config
            refresh_config = config['lambda']['athena_partition_refresh_config']
            refresh_config['refresh_type'][options.refresh_type][
                options.bucket] = options.type
            config.write()

            table_name = (options.type if options.type == 'alerts'
                          else sanitized_table_name)
            LOGGER_CLI.info('The %s table was successfully created!',
                            table_name)
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Builds the CREATE TABLE statement for either the 'alerts' table or a
    log type table, runs it through a StreamAlertAthenaClient and, for log
    type tables, records the bucket in the CLI config on success.

    Args:
        table (str): The name of the table being created
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type.

    Returns:
        None: Problems are reported through LOGGER_CLI and the function
            returns early without creating a table.
    """
    athena_client = StreamAlertAthenaClient(
        config, results_key_prefix='stream_alert_cli')

    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'],
        config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = sa_firehose.firehose_log_name(table)

    # Check that the log type is enabled via Firehose
    if (sanitized_table_name != 'alerts'
            and sanitized_table_name not in sa_firehose.enabled_logs):
        LOGGER_CLI.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name, True):
        LOGGER_CLI.info('The \'%s\' table already exists.',
                        sanitized_table_name)
        return

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = handler_helpers.to_athena_schema(schema)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=table, bucket=bucket)

    else:  # all other tables are log types
        # Config keys use ':' where the table name uses the first '_'
        log_info = config['logs'][table.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = handler_helpers.to_athena_schema(sanitized_schema)

        # Add envelope keys to Athena Schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = (
                    handler_helpers.to_athena_schema(
                        sanitized_envelope_key_schema))

        # Handle Schema overrides
        #   This is useful when an Athena schema needs to differ from the
        #   normal log schema
        if schema_override:
            for override in schema_override:
                # partition() never raises: an override with no '=' yields an
                # empty separator, and extra '=' characters stay in the type.
                column_name, separator, column_type = override.partition('=')
                if not (separator and column_name and column_type):
                    LOGGER_CLI.error(
                        'Invalid schema override [%s], use column_name=type format',
                        override)
                    # Abort rather than apply a malformed override
                    return

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s',
                                    column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket)

    create_table_success, _ = athena_client.run_athena_query(
        query=query, database=athena_client.sa_database)

    if not create_table_success:
        LOGGER_CLI.error('The %s table could not be created',
                         sanitized_table_name)
        return

    # Update the CLI config
    refresh_buckets = config['lambda']['athena_partition_refresh_config'][
        'buckets']
    if table != 'alerts' and bucket not in refresh_buckets:
        refresh_buckets[bucket] = 'data'
        config.write()

    LOGGER_CLI.info('The %s table was successfully created!',
                    sanitized_table_name)
def create_table(athena_client, options, config):
    """Create a 'streamalert' Athena table

    Validates the CLI arguments, builds the CREATE EXTERNAL TABLE statement
    for either a 'data' (log type) table or the 'alerts' table, runs it
    through the Athena client and, on success, records the bucket in the CLI
    config.

    Args:
        athena_client (boto3.client): Instantiated CLI AthenaClient
        options (namedtuple): The parsed args passed from the CLI. Reads
            bucket, refresh_type, type and table_name.
        config (CLIConfig): Loaded StreamAlert CLI

    Returns:
        None: Problems are reported through LOGGER_CLI and the function
            returns early without creating a table.
    """
    sa_firehose = StreamAlertFirehose(
        config['global']['account']['region'],
        config['global']['infrastructure']['firehose'],
        config['logs'])

    if not options.bucket:
        LOGGER_CLI.error('Missing command line argument --bucket')
        return

    if not options.refresh_type:
        LOGGER_CLI.error('Missing command line argument --refresh_type')
        return

    # Bound up front so that an unrecognized options.type falls through to
    # the `if query` guard below instead of raising a NameError.
    query = None
    sanitized_table_name = None

    if options.type == 'data':
        if not options.table_name:
            LOGGER_CLI.error('Missing command line argument --table_name')
            return

        sanitized_table_name = sa_firehose.firehose_log_name(
            options.table_name)

        if sanitized_table_name not in sa_firehose.enabled_logs:
            LOGGER_CLI.error(
                'Table name %s missing from configuration or '
                'is not enabled.', sanitized_table_name)
            return

        if athena_client.check_table_exists(sanitized_table_name):
            LOGGER_CLI.info('The \'%s\' table already exists.',
                            sanitized_table_name)
            return

        # Config keys use ':' where the CLI table name uses the first '_'
        log_info = config['logs'][options.table_name.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = {}
        _add_to_athena_schema(sanitized_schema, athena_schema)

        # Support envelope keys
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(
                    envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                _add_to_athena_schema(sanitized_envelope_key_schema,
                                      athena_schema,
                                      '`streamalert:envelope_keys`')

        # Collect one definition per column and join them afterwards, so the
        # statement never ends in a dangling separator whichever column the
        # dict happens to yield last.
        column_definitions = []
        for key_name, key_type in athena_schema.items():
            # Account for nested structs
            if isinstance(key_type, dict):
                struct_fields = ','.join(
                    '{0}:{1}'.format(sub_key, sub_type)
                    for sub_key, sub_type in key_type.items())
                column_definitions.append(
                    '{0} struct<{1}>'.format(key_name, struct_fields))
            else:
                column_definitions.append(
                    '{0} {1}'.format(key_name, key_type))
        schema_statement = ', '.join(column_definitions)

        query = (
            'CREATE EXTERNAL TABLE {table_name} ({schema}) '
            'PARTITIONED BY (dt string) '
            'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
            'WITH SERDEPROPERTIES ( \'ignore.malformed.json\' = \'true\') '
            'LOCATION \'s3://{bucket}/{table_name}/\''.format(
                table_name=sanitized_table_name,
                schema=schema_statement,
                bucket=options.bucket))

    elif options.type == 'alerts':
        if athena_client.check_table_exists(options.type):
            LOGGER_CLI.info('The \'alerts\' table already exists.')
            return

        # Clauses are separated by a space so the statement does not rely on
        # the parser splitting tokens such as ")PARTITIONED".
        query = ('CREATE EXTERNAL TABLE alerts ('
                 'log_source string,'
                 'log_type string,'
                 'outputs array<string>,'
                 'record string,'
                 'rule_description string,'
                 'rule_name string,'
                 'source_entity string,'
                 'source_service string) '
                 'PARTITIONED BY (dt string) '
                 'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                 'LOCATION \'s3://{bucket}/alerts/\''.format(
                     bucket=options.bucket))

    if query:
        create_table_success, _ = athena_client.run_athena_query(
            query=query,
            database='streamalert')

        if create_table_success:
            # Update the CLI config
            refresh_config = config['lambda']['athena_partition_refresh_config']
            refresh_config['refresh_type'][options.refresh_type][
                options.bucket] = options.type
            config.write()

            table_name = (options.type if options.type == 'alerts'
                          else sanitized_table_name)
            LOGGER_CLI.info('The %s table was successfully created!',
                            table_name)