def generate_data_table_schema(config, table, schema_override=None):
    """Generate the schema for a data table in terraform

    Args:
        config (CLIConfig): Loaded StreamAlert config
        table (string): The name of the data table
        schema_override (set): Optional set of 'column_name=column_type' pairs
            used to override the configured Athena column types

    Returns:
        athena_schema (dict): Equivalent Athena schema used for generating the
            create table statement, or None if the log type is not enabled
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return None

    log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

    schema = dict(log_info['schema'])
    sanitized_schema = FirehoseClient.sanitize_keys(schema)

    athena_schema = logs_schema_to_athena_schema(sanitized_schema, False)

    # Add envelope keys to the Athena schema.
    # NOTE(review): unlike the DDL path, this key is intentionally NOT wrapped
    # in backticks here (the second argument False above also skips escaping)
    configuration_options = log_info.get('configuration')
    if configuration_options:
        envelope_keys = configuration_options.get('envelope_keys')
        if envelope_keys:
            sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(envelope_keys)
            athena_schema['streamalert:envelope_keys'] = logs_schema_to_athena_schema(
                sanitized_envelope_key_schema, False)

    # Handle schema overrides; useful when an Athena schema needs to differ
    # from the normal log schema. Overrides are 'column_name=column_type' pairs.
    if schema_override:
        for override in schema_override:
            column_name, column_type = override.split('=')
            if column_name in athena_schema:
                athena_schema[column_name] = column_type
                LOGGER.info('Applied schema override: %s:%s', column_name, column_type)
            else:
                LOGGER.error(
                    'Schema override column %s not found in Athena Schema, skipping',
                    column_name)

    return format_schema_tf(athena_schema)
def test_sanitize_keys(self):
    """FirehoseClient - Sanitize Keys"""
    # Raw event whose nested keys contain characters that must be sanitized
    raw_event = {
        'date': 'January 01, 3005',
        'data': {
            'super-duper': 'secret',
            'do_not_sanitize_me': 1,
            'example-key': 2,
            'moar**data': 3,
            'even.more': 4
        }
    }
    # The same event with each special character replaced by an underscore
    expected = {
        'date': 'January 01, 3005',
        'data': {
            'super_duper': 'secret',
            'do_not_sanitize_me': 1,
            'example_key': 2,
            'moar__data': 3,
            'even_more': 4
        }
    }
    assert_equal(FirehoseClient.sanitize_keys(raw_event), expected)
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert config
        schema_override (set): An optional set of key=value pairs to be used for
            overriding the configured column_name=value_type.

    Returns:
        bool: False if errors occurred, True otherwise
    """
    enabled_logs = FirehoseClient.load_enabled_log_sources(
        config['global']['infrastructure']['firehose'], config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = FirehoseClient.sanitized_value(table)

    # Check that the log type is enabled via Firehose ('alerts' is always valid)
    if sanitized_table_name != 'alerts' and sanitized_table_name not in enabled_logs:
        LOGGER.error(
            'Table name %s missing from configuration or '
            'is not enabled.', sanitized_table_name)
        return False

    athena_client = get_athena_client(config)

    # Creating an existing table is a no-op success
    if athena_client.check_table_exists(sanitized_table_name):
        LOGGER.info('The \'%s\' table already exists.', sanitized_table_name)
        return True

    if table == 'alerts':
        # get a fake alert so we can get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = helpers.logs_schema_to_athena_schema(schema)

        # Use the bucket if supplied, otherwise use the default alerts bucket
        bucket = bucket or firehose_alerts_bucket(config)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=table,
            bucket=bucket,
            file_format=get_data_file_format(config))

    else:  # all other tables are log types
        config_data_bucket = firehose_data_bucket(config)
        if not config_data_bucket:
            LOGGER.warning('The \'firehose\' module is not enabled in global.json')
            return False

        # Use the bucket if supplied, otherwise use the default data bucket
        bucket = bucket or config_data_bucket

        log_info = config['logs'][enabled_logs.get(sanitized_table_name)]

        schema = dict(log_info['schema'])
        sanitized_schema = FirehoseClient.sanitize_keys(schema)

        athena_schema = helpers.logs_schema_to_athena_schema(sanitized_schema)

        # Add envelope keys to the Athena schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = FirehoseClient.sanitize_keys(envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = (
                    helpers.logs_schema_to_athena_schema(sanitized_envelope_key_schema))

        # Handle schema overrides; useful when an Athena schema needs to
        # differ from the normal log schema
        if schema_override:
            for override in schema_override:
                column_name, column_type = override.split('=')
                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER.info('Applied schema override: %s:%s', column_name, column_type)
                else:
                    LOGGER.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema,
            table_name=sanitized_table_name,
            bucket=bucket,
            file_format=get_data_file_format(config))

    success = athena_client.run_query(query=query)
    if not success:
        LOGGER.error('The %s table could not be created', sanitized_table_name)
        return False

    # Update the CLI config
    if table != 'alerts' and bucket != config_data_bucket:
        # Only add buckets to the config if they are not one of the default/configured
        # buckets; 'buckets' is not a required config key so create it on demand
        buckets = config['lambda']['athena_partitioner_config'].setdefault('buckets', {})
        if bucket not in buckets:
            buckets[bucket] = 'data'
            config.write()

    LOGGER.info('The %s table was successfully created!', sanitized_table_name)
    return True