def drop_all_tables(config):
    """Drop all 'streamalert' Athena tables

    Used when cleaning up an existing deployment

    Args:
        config (CLIConfig): Loaded StreamAlert CLI
    """
    if not continue_prompt(message='Are you sure you want to drop all Athena tables?'):
        return

    athena_client = StreamAlertAthenaClient(config, results_key_prefix='stream_alert_cli')

    success, all_tables = athena_client.run_athena_query(
        query='SHOW TABLES', database=athena_client.sa_database)
    if not success:
        LOGGER_CLI.error('There was an issue getting all tables')
        return

    unique_tables = athena_helpers.unique_values_from_query(all_tables)

    for table in unique_tables:
        success, _ = athena_client.run_athena_query(
            query='DROP TABLE {}'.format(table), database=athena_client.sa_database)
        if not success:
            LOGGER_CLI.error('Unable to drop the %s table', table)
        else:
            LOGGER_CLI.info('Dropped %s', table)
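# For context: `athena_helpers.unique_values_from_query` is assumed here to
# flatten Athena's row/column result structure into a set of unique string
# values. A minimal sketch of such a helper (the project's real implementation
# may differ):
def unique_values_from_query(query_result):
    """Return the set of unique values found in an Athena query result"""
    return {
        value
        for row in query_result['ResultSet']['Rows']
        for data in row['Data']
        for value in data.values()
    }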
def athena_handler(options):
    """Handle Athena operations"""
    athena_client = StreamAlertAthenaClient(CONFIG, results_key_prefix='stream_alert_cli')

    if options.subcommand == 'init':
        CONFIG.generate_athena()

    elif options.subcommand == 'enable':
        CONFIG.set_athena_lambda_enable()

    elif options.subcommand == 'create-db':
        if athena_client.check_database_exists():
            LOGGER_CLI.info('The \'streamalert\' database already exists, nothing to do')
            return

        create_db_success, create_db_result = athena_client.run_athena_query(
            query='CREATE DATABASE streamalert')

        if create_db_success and create_db_result['ResultSet'].get('Rows'):
            LOGGER_CLI.info('streamalert database successfully created!')
            LOGGER_CLI.info('results: %s', create_db_result['ResultSet']['Rows'])

    elif options.subcommand == 'create-table':
        if options.type == 'alerts':
            if not options.bucket:
                LOGGER_CLI.error('Missing command line argument --bucket')
                return

            if athena_client.check_table_exists(options.type):
                LOGGER_CLI.info('The \'alerts\' table already exists.')
                return

            query = ('CREATE EXTERNAL TABLE alerts ('
                     'log_source string,'
                     'log_type string,'
                     'outputs array<string>,'
                     'record string,'
                     'rule_description string,'
                     'rule_name string,'
                     'source_entity string,'
                     'source_service string)'
                     'PARTITIONED BY (dt string)'
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\''
                     'LOCATION \'s3://{bucket}/alerts/\''.format(bucket=options.bucket))

            create_table_success, _ = athena_client.run_athena_query(
                query=query, database='streamalert')

            if create_table_success:
                CONFIG['lambda']['athena_partition_refresh_config'] \
                    ['refresh_type'][options.refresh_type][options.bucket] = 'alerts'
                CONFIG.write()
                LOGGER_CLI.info('The alerts table was successfully created!')
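# For a hypothetical --bucket value of 'example.streamalerts', the statement
# built above renders as the following HiveQL (reflowed here for readability):
#
#   CREATE EXTERNAL TABLE alerts (
#       log_source string, log_type string, outputs array<string>,
#       record string, rule_description string, rule_name string,
#       source_entity string, source_service string)
#   PARTITIONED BY (dt string)
#   ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
#   LOCATION 's3://example.streamalerts/alerts/'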
class TestStreamAlertAthenaClient(object):
    """Test class for StreamAlertAthenaClient"""

    def setup(self):
        self.client = StreamAlertAthenaClient(
            CONFIG_DATA, results_key_prefix='unit-testing')

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_add_hive_partition(self, mock_logging):
        """Athena - Add Hive Partition"""
        query_result = [{'Repair: added data to metastore:foobar'},
                        {'Repair: added data to metastore:foobaz'}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.add_hive_partition({
            'unit-testing.streamalerts': set([
                'alerts/dt=2017-08-26-14/rule_name_alerts-1304134918401.json',
                'alerts/dt=2017-08-27-14/rule_name_alerts-1304134918401.json'
            ]),
            'unit-testing.streamalert.data': set([
                'log_type_1/2017/08/26/14/test-data-11111-22222-33333.snappy',
                'log_type_2/2017/08/26/14/test-data-11111-22222-33333.snappy',
                'log_type_2/2017/08/26/15/test-data-11111-22222-33333.snappy',
                'log_type_2/2017/08/26/16/test-data-11111-22222-33333.snappy',
                'log_type_3/2017/08/26/14/test-data-11111-22222-33333.snappy',
                'log_type_1/2017/08/26/11/test-data-11111-22222-33333.snappy'
            ]),
            'test-bucket-with-data': set([
                '2017/08/26/14/rule_name_alerts-1304134918401.json',
                '2017/08/28/14/rule_name_alerts-1304134918401.json',
                '2017/07/30/14/rule_name_alerts-1304134918401.json'
            ])
        })
        assert_true(mock_logging.info.called)
        assert_true(result)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_add_hive_partition_unknown_bucket(self, mock_logging):
        """Athena - Add Hive Partition - Unknown Bucket"""
        self.client.athena_client = MockAthenaClient(results=[])

        result = self.client.add_hive_partition({
            'bucket-not-in-config.streamalerts': set([
                'alerts/dt=2017-08-26-14/rule_name_alerts-1304134918401.json',
                'alerts/dt=2017-08-27-14/rule_name_alerts-1304134918401.json',
            ])
        })
        assert_true(mock_logging.error.called)
        assert_false(result)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_add_hive_partition_unexpected_s3_key(self, mock_logging):
        """Athena - Add Hive Partition - Unexpected S3 Key"""
        self.client.athena_client = MockAthenaClient(results=[])

        result = self.client.add_hive_partition({
            'unit-testing.streamalerts': set(['a/pattern/that/does/not-match']),
            'test-bucket-with-data': set(['another/pattern/that/does/not-match'])
        })
        assert_true(mock_logging.error.called)
        assert_false(result)

    def test_check_table_exists(self):
        """Athena - Check Table Exists"""
        query_result = [{'alerts': True}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.check_table_exists('unit-test')
        assert_true(result)

        generated_results_key = 'unit-testing/{}'.format(
            datetime.now().strftime('%Y/%m/%d'))
        assert_equal(self.client.athena_results_key, generated_results_key)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_check_table_exists_invalid(self, mock_logging):
        """Athena - Check Table Exists - Does Not Exist"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.check_table_exists('unit-test')
        assert_false(result)
        assert_true(mock_logging.info.called)

    def test_check_database_exists_invalid(self):
        """Athena - Check Database Exists - Does Not Exist"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)
        assert_false(self.client.check_database_exists())

    def test_check_database_exists(self):
        """Athena - Check Database Exists"""
        query_result = [{'streamalert': True}]
        self.client.athena_client = MockAthenaClient(results=query_result)
        assert_true(self.client.check_database_exists())

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_empty(self, mock_logging):
        """Athena - Run Athena Query - Empty Result"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')
        assert_true(query_success)
        assert_equal(query_results['ResultSet']['Rows'], [])
        assert_true(mock_logging.debug.called)

    def test_run_athena_query_async(self):
        """Athena - Run Athena Query - Async Call"""
        query_result = []
        self.client.athena_client = MockAthenaClient(results=query_result)

        query_success, _ = self.client.run_athena_query(
            query='SHOW DATABASES;', async=True)
        assert_true(query_success)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_error(self, mock_logging):
        """Athena - Run Athena Query - Error Result"""
        self.client.athena_client = MockAthenaClient(results=None, result_state='FAILED')

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')
        assert_true(mock_logging.error.called)
        assert_false(query_success)
        assert_equal(query_results, {})

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table_unknown_bucket(self, mock_logging):
        """Athena - Repair Hive Table - Unknown Bucket"""
        self.client.athena_client = MockAthenaClient(result_state='SUCCEEDED')

        # This bucket is not in our `repair_hive_table` config map
        self.client.repair_hive_table({'my-test.result.bucket'})
        assert_true(mock_logging.warning.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table_failed_refresh(self, mock_logging):
        """Athena - Repair Hive Table - Failed Refresh"""
        self.client.athena_client = MockAthenaClient(result_state='FAILED')

        # The mocked query returns a FAILED state, so the refresh fails
        self.client.repair_hive_table({'unit-testing.streamalerts'})
        assert_true(mock_logging.error.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table(self, mock_logging):
        """Athena - Repair Hive Table"""
        query_result = [{'Status': 'SUCCEEDED'}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        self.client.repair_hive_table({'unit-testing.streamalerts'})
        assert_true(mock_logging.info.called)

    def test_run_athena_query(self):
        """Athena - Run Athena Query - Normal Result"""
        self.client.athena_client = MockAthenaClient()

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')
        assert_true(query_success)
        assert_equal(query_results['ResultSet']['Rows'],
                     [{'Data': [{'test': 'test'}]}])
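# The tests above depend on a MockAthenaClient test double that is not shown
# in this section. A minimal sketch, assuming it only needs to satisfy the
# calls run_athena_query makes against boto3's Athena client (method names
# mirror boto3; the canned defaults are illustrative, not the project's
# actual mock):
class MockAthenaClient(object):
    """Canned-response stand-in for boto3's Athena client"""

    def __init__(self, **kwargs):
        # kwargs.get keeps an explicit `results=None` (used by the empty-result
        # tests) distinct from omitting the keyword entirely
        self.results = kwargs.get('results', [{'Data': [{'test': 'test'}]}])
        self.result_state = kwargs.get('result_state', 'SUCCEEDED')

    def start_query_execution(self, **kwargs):
        return {'QueryExecutionId': 'unit-test-query-id'}

    def get_query_execution(self, **kwargs):
        return {'QueryExecution': {'Status': {'State': self.result_state}}}

    def get_query_results(self, **kwargs):
        return {'ResultSet': {'Rows': self.results or []}}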
def athena_handler(options):
    """Handle Athena operations"""
    athena_client = StreamAlertAthenaClient(CONFIG, results_key_prefix='stream_alert_cli')

    if options.subcommand == 'init':
        CONFIG.generate_athena()

    elif options.subcommand == 'enable':
        CONFIG.set_athena_lambda_enable()

    elif options.subcommand == 'create-db':
        if athena_client.check_database_exists():
            LOGGER_CLI.info('The \'streamalert\' database already exists, nothing to do')
            return

        create_db_success, create_db_result = athena_client.run_athena_query(
            query='CREATE DATABASE streamalert')

        if create_db_success and create_db_result['ResultSet'].get('Rows'):
            LOGGER_CLI.info('streamalert database successfully created!')
            LOGGER_CLI.info('results: %s', create_db_result['ResultSet']['Rows'])

    elif options.subcommand == 'create-table':
        if not options.bucket:
            LOGGER_CLI.error('Missing command line argument --bucket')
            return

        if not options.refresh_type:
            LOGGER_CLI.error('Missing command line argument --refresh_type')
            return

        # Guard against unsupported table types leaving `query` undefined below
        query = None

        if options.type == 'data':
            if not options.table_name:
                LOGGER_CLI.error('Missing command line argument --table_name')
                return

            if options.table_name not in enabled_firehose_logs(CONFIG):
                LOGGER_CLI.error('Table name %s missing from configuration or '
                                 'is not enabled.', options.table_name)
                return

            if athena_client.check_table_exists(options.table_name):
                LOGGER_CLI.info('The \'%s\' table already exists.', options.table_name)
                return

            log_info = CONFIG['logs'][options.table_name.replace('_', ':', 1)]
            schema = dict(log_info['schema'])
            schema_statement = ''

            sanitized_schema = StreamAlert.sanitize_keys(schema)

            athena_schema = {}
            schema_type_mapping = {
                'string': 'string',
                'integer': 'int',
                'boolean': 'boolean',
                'float': 'decimal',
                dict: 'map<string, string>',
                list: 'array<string>'
            }

            def add_to_athena_schema(schema, root_key=''):
                """Helper function to add sanitized schemas to the Athena table schema"""
                # Setup the root_key dict
                if root_key and not athena_schema.get(root_key):
                    athena_schema[root_key] = {}

                for key_name, key_type in schema.iteritems():
                    # When using special characters in the beginning or end
                    # of a column name, they have to be wrapped in backticks
                    key_name = '`{}`'.format(key_name)

                    special_key = None
                    # Transform the {} or [] into hashable types
                    if key_type == {}:
                        special_key = dict
                    elif key_type == []:
                        special_key = list
                    # Cast nested dict as a string for now
                    # TODO(jacknagz): support recursive schemas
                    elif isinstance(key_type, dict):
                        special_key = 'string'

                    # Account for envelope keys
                    if root_key:
                        if special_key is not None:
                            athena_schema[root_key][key_name] = schema_type_mapping[special_key]
                        else:
                            athena_schema[root_key][key_name] = schema_type_mapping[key_type]
                    else:
                        if special_key is not None:
                            athena_schema[key_name] = schema_type_mapping[special_key]
                        else:
                            athena_schema[key_name] = schema_type_mapping[key_type]

            add_to_athena_schema(sanitized_schema)

            # Support envelope keys
            configuration_options = log_info.get('configuration')
            if configuration_options:
                envelope_keys = configuration_options.get('envelope_keys')
                if envelope_keys:
                    sanitized_envelope_keys = StreamAlert.sanitize_keys(envelope_keys)
                    # Note: this key is wrapped in backticks to be Hive compliant
                    add_to_athena_schema(sanitized_envelope_keys,
                                         '`streamalert:envelope_keys`')

            for key_name, key_type in athena_schema.iteritems():
                # Account for nested structs
                if isinstance(key_type, dict):
                    struct_schema = ''.join(
                        '{0}:{1},'.format(sub_key, sub_type)
                        for sub_key, sub_type in key_type.iteritems())
                    nested_schema_statement = '{0} struct<{1}>, '.format(
                        key_name,
                        # Use the minus index to remove the last comma
                        struct_schema[:-1])
                    schema_statement += nested_schema_statement
                else:
                    schema_statement += '{0} {1},'.format(key_name, key_type)

            query = ('CREATE EXTERNAL TABLE {table_name} ({schema}) '
                     'PARTITIONED BY (dt string) '
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' '
                     'LOCATION \'s3://{bucket}/{table_name}/\''.format(
                         table_name=options.table_name,
                         # Use the minus index to remove the last comma
                         schema=schema_statement[:-1],
                         bucket=options.bucket))

        elif options.type == 'alerts':
            if athena_client.check_table_exists(options.type):
                LOGGER_CLI.info('The \'alerts\' table already exists.')
                return

            query = ('CREATE EXTERNAL TABLE alerts ('
                     'log_source string,'
                     'log_type string,'
                     'outputs array<string>,'
                     'record string,'
                     'rule_description string,'
                     'rule_name string,'
                     'source_entity string,'
                     'source_service string)'
                     'PARTITIONED BY (dt string)'
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\''
                     'LOCATION \'s3://{bucket}/alerts/\''.format(bucket=options.bucket))

        if query:
            create_table_success, _ = athena_client.run_athena_query(
                query=query, database='streamalert')

            if create_table_success:
                CONFIG['lambda']['athena_partition_refresh_config'] \
                    ['refresh_type'][options.refresh_type][options.bucket] = options.type
                CONFIG.write()
                table_name = options.type if options.type == 'alerts' else options.table_name
                LOGGER_CLI.info('The %s table was successfully created!', table_name)
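# To illustrate add_to_athena_schema: a hypothetical log schema such as
#
#   {'name': 'string', 'count': 'integer', 'tags': [], 'detail': {'ip': 'string'}}
#
# maps to backtick-wrapped Athena columns (iteration order may vary):
#
#   `name` string,`count` int,`tags` array<string>,`detail` string
#
# Note the non-empty nested 'detail' dict collapses to a plain string per the
# TODO above, while the empty list becomes array<string>.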
def athena_handler(options):
    """Handle Athena operations"""
    athena_client = StreamAlertAthenaClient(CONFIG, results_key_prefix='stream_alert_cli')

    if options.subcommand == 'init':
        CONFIG.generate_athena()

    elif options.subcommand == 'enable':
        CONFIG.set_athena_lambda_enable()

    elif options.subcommand == 'create-db':
        if athena_client.check_database_exists():
            LOGGER_CLI.info('The \'streamalert\' database already exists, nothing to do')
            return

        create_db_success, create_db_result = athena_client.run_athena_query(
            query='CREATE DATABASE streamalert')

        if create_db_success and create_db_result['ResultSet'].get('Rows'):
            LOGGER_CLI.info('streamalert database successfully created!')
            LOGGER_CLI.info('results: %s', create_db_result['ResultSet']['Rows'])

    elif options.subcommand == 'create-table':
        if not options.bucket:
            LOGGER_CLI.error('Missing command line argument --bucket')
            return

        if not options.refresh_type:
            LOGGER_CLI.error('Missing command line argument --refresh_type')
            return

        # Guard against unsupported table types leaving `query` undefined below
        query = None

        if options.type == 'data':
            if not options.table_name:
                LOGGER_CLI.error('Missing command line argument --table_name')
                return

            if options.table_name not in enabled_firehose_logs(CONFIG):
                LOGGER_CLI.error('Table name %s missing from configuration or '
                                 'is not enabled.', options.table_name)
                return

            if athena_client.check_table_exists(options.table_name):
                LOGGER_CLI.info('The \'%s\' table already exists.', options.table_name)
                return

            # Only the first underscore separates the log type from its subtype,
            # so limit the replacement to a single occurrence
            schema = CONFIG['logs'][options.table_name.replace('_', ':', 1)]['schema']
            sanitized_schema = StreamAlert.sanitize_keys(schema)

            athena_schema = {}
            schema_type_mapping = {
                'string': 'string',
                'integer': 'int',
                'boolean': 'boolean',
                'float': 'decimal',
                dict: 'map<string, string>',
                list: 'array<string>'
            }

            for key_name, key_type in sanitized_schema.iteritems():
                # Transform the {} or [] into hashable types
                if key_type == {}:
                    key_type = dict
                elif key_type == []:
                    key_type = list

                athena_schema[key_name] = schema_type_mapping[key_type]

            # The trailing [:-1] drops the final comma
            schema_statement = ''.join(
                '{0} {1},'.format(key_name, key_type)
                for key_name, key_type in athena_schema.iteritems())[:-1]

            query = ('CREATE EXTERNAL TABLE {table_name} ({schema})'
                     'PARTITIONED BY (dt string)'
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\''
                     'LOCATION \'s3://{bucket}/{table_name}/\''.format(
                         table_name=options.table_name,
                         schema=schema_statement,
                         bucket=options.bucket))

        elif options.type == 'alerts':
            if athena_client.check_table_exists(options.type):
                LOGGER_CLI.info('The \'alerts\' table already exists.')
                return

            query = ('CREATE EXTERNAL TABLE alerts ('
                     'log_source string,'
                     'log_type string,'
                     'outputs array<string>,'
                     'record string,'
                     'rule_description string,'
                     'rule_name string,'
                     'source_entity string,'
                     'source_service string)'
                     'PARTITIONED BY (dt string)'
                     'ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\''
                     'LOCATION \'s3://{bucket}/alerts/\''.format(bucket=options.bucket))

        if query:
            create_table_success, _ = athena_client.run_athena_query(
                query=query, database='streamalert')

            if create_table_success:
                CONFIG['lambda']['athena_partition_refresh_config'] \
                    ['refresh_type'][options.refresh_type][options.bucket] = options.type
                CONFIG.write()
                LOGGER_CLI.info('The %s table was successfully created!', options.type)
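# Worked example for the flat mapping above, assuming a hypothetical enabled
# log 'cloudwatch_events' whose schema is
# {'account': 'string', 'detail': {}, 'resources': []}: athena_schema becomes
# {'account': 'string', 'detail': 'map<string, string>',
# 'resources': 'array<string>'}, and the rendered query resembles (key order
# may vary):
#
#   CREATE EXTERNAL TABLE cloudwatch_events (
#       account string,detail map<string, string>,resources array<string>)
#   PARTITIONED BY (dt string) ...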
class TestStreamAlertAthenaClient(object):
    """Test class for StreamAlertAthenaClient"""

    def __init__(self):
        self.config_data = {
            'global': {
                'account': {
                    'aws_account_id': '111111111111',
                    'kms_key_alias': 'stream_alert_secrets',
                    'prefix': 'unit-testing',
                    'region': 'us-east-2'
                },
                'terraform': {
                    'tfstate_bucket': 'unit-testing.streamalert.terraform.state',
                    'tfstate_s3_key': 'stream_alert_state/terraform.tfstate',
                    'tfvars': 'terraform.tfvars'
                },
                'infrastructure': {
                    'monitoring': {
                        'create_sns_topic': True
                    }
                }
            },
            'lambda': {
                'alert_processor_config': {
                    'handler': 'stream_alert.alert_processor.main.handler',
                    'source_bucket': 'unit-testing.streamalert.source',
                    'source_current_hash': '<auto_generated>',
                    'source_object_key': '<auto_generated>',
                    'third_party_libraries': []
                },
                'rule_processor_config': {
                    'handler': 'stream_alert.rule_processor.main.handler',
                    'source_bucket': 'unit-testing.streamalert.source',
                    'source_current_hash': '<auto_generated>',
                    'source_object_key': '<auto_generated>',
                    'third_party_libraries': ['jsonpath_rw', 'netaddr']
                },
                'athena_partition_refresh_config': {
                    'enabled': True,
                    'refresh_type': {
                        'repair_hive_table': {
                            'unit-testing.streamalerts': 'alerts'
                        },
                        'add_hive_partition': {}
                    },
                    'handler': 'main.handler',
                    'timeout': '60',
                    'memory': '128',
                    'source_bucket': 'unit-testing.streamalert.source',
                    'source_current_hash': '<auto_generated>',
                    'source_object_key': '<auto_generated>',
                    'third_party_libraries': ['backoff']
                }
            }
        }

    def setup(self):
        self.client = StreamAlertAthenaClient(
            config=self.config_data, results_key_prefix='unit-testing')

    @raises(ConfigError)
    def test_invalid_json_config(self):
        """Athena - Load Invalid Config"""
        invalid_config_data = 'This is not JSON!!!'
        with mock_open(LAMBDA_FILE, invalid_config_data):
            with mock_open(GLOBAL_FILE, invalid_config_data):
                StreamAlertAthenaClient()

    @raises(ConfigError)
    def test_invalid_missing_config(self):
        """Athena - Load Missing Config File"""
        invalid_config_data = 'test'
        with mock_open(LAMBDA_FILE, invalid_config_data):
            with mock_open(GLOBAL_FILE, invalid_config_data):
                with patch('os.path.exists') as mock_exists:
                    mock_exists.return_value = False
                    StreamAlertAthenaClient()

    def test_load_valid_config(self):
        """Athena - Load Config"""
        global_contents = json.dumps(self.config_data['global'], indent=4)
        lambda_contents = json.dumps(self.config_data['lambda'], indent=4)

        with mock_open(GLOBAL_FILE, global_contents):
            with mock_open(LAMBDA_FILE, lambda_contents):
                client = StreamAlertAthenaClient()

                assert_equal(type(client.config), dict)
                assert_equal(set(client.config.keys()), {'global', 'lambda'})

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    @raises(NotImplementedError)
    def test_firehose_partition_refresh(self, mock_logging):
        """Athena - Test Firehose Partition Refresh"""
        self.client.firehose_partition_refresh(None)
        assert_true(mock_logging.error.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_backoff_and_success_handlers(self, mock_logging):
        """Athena - Test Backoff Handlers"""
        self.client._backoff_handler({'wait': 1.0, 'tries': 3, 'target': 'backoff'})
        assert_true(mock_logging.debug.called)

        self.client._success_handler({'tries': 3, 'target': 'backoff'})
        assert_true(mock_logging.debug.called)

    def test_check_table_exists(self):
        """Athena - Check Table Exists"""
        query_result = [{'alerts': True}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.check_table_exists('unit-test')
        assert_true(result)

        generated_results_key = 'unit-testing/{}'.format(
            datetime.now().strftime('%Y/%m/%d'))
        assert_equal(self.client.athena_results_key, generated_results_key)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_check_table_exists_invalid(self, mock_logging):
        """Athena - Check Table Exists - Does Not Exist"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        result = self.client.check_table_exists('unit-test')
        assert_false(result)
        assert_true(mock_logging.info.called)

    def test_check_database_exists_invalid(self):
        """Athena - Check Database Exists - Does Not Exist"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)
        assert_false(self.client.check_database_exists())

    def test_check_database_exists(self):
        """Athena - Check Database Exists"""
        query_result = [{'streamalert': True}]
        self.client.athena_client = MockAthenaClient(results=query_result)
        assert_true(self.client.check_database_exists())

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_empty(self, mock_logging):
        """Athena - Run Athena Query - Empty Result"""
        query_result = None
        self.client.athena_client = MockAthenaClient(results=query_result)

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')
        assert_true(query_success)
        assert_equal(query_results['ResultSet']['Rows'], [])
        assert_true(mock_logging.debug.called)

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_run_athena_query_error(self, mock_logging):
        """Athena - Run Athena Query - Error Result"""
        self.client.athena_client = MockAthenaClient(results=None, result_state='FAILED')

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')
        assert_true(mock_logging.error.called)
        assert_false(query_success)
        assert_equal(query_results, {})

    @patch('stream_alert.athena_partition_refresh.main.LOGGER')
    def test_repair_hive_table(self, mock_logging):
        """Athena - Repair Hive Table"""
        query_result = [{'Status': 'Success'}]
        self.client.athena_client = MockAthenaClient(results=query_result)

        self.client.repair_hive_table()
        assert_true(mock_logging.info.called)

    def test_run_athena_query(self):
        """Athena - Run Athena Query - Normal Result"""
        self.client.athena_client = MockAthenaClient()

        query_success, query_results = self.client.run_athena_query(
            query='SHOW DATABASES;')
        assert_true(query_success)
        assert_equal(query_results['ResultSet']['Rows'],
                     [{'Data': [{'test': 'test'}]}])

    @patch('stream_alert.athena_partition_refresh.main.LOGGER.error')
    @patch('stream_alert.athena_partition_refresh.main.'
           'StreamAlertAthenaClient.run_athena_query')
    def test_repair_hive_table_fail(self, mock_run_athena, mock_logging):
        """Athena - Repair Hive Table, Failure"""
        mock_run_athena.return_value = (False, None)
        self.client.athena_client = MockAthenaClient()

        self.client.repair_hive_table()
        assert_true(mock_logging.called)
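# `mock_open` in the config-loading tests above is assumed to be a small
# project helper rather than mock.mock_open itself, since it takes a file path
# plus contents and works as a context manager. A sketch of the idea (Python 2,
# matching the iteritems usage elsewhere; the real helper may also dispatch on
# the filename):
from contextlib import contextmanager

from mock import mock_open as _mock_open, patch

@contextmanager
def mock_open(filename, contents):  # pylint: disable=unused-argument
    """Patch the builtin open() so reads return `contents`"""
    handle = _mock_open(read_data=contents)
    with patch('__builtin__.open', handle, create=True):
        yield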
def rebuild_partitions(table, bucket, config):
    """Rebuild an Athena table's partitions

    Steps:
      - Get the list of current partitions
      - Destroy the existing table
      - Re-create the table
      - Re-create the partitions

    Args:
        table (str): The name of the table being rebuilt
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
    """
    athena_client = StreamAlertAthenaClient(config, results_key_prefix='stream_alert_cli')

    sa_firehose = StreamAlertFirehose(config['global']['account']['region'],
                                      config['global']['infrastructure']['firehose'],
                                      config['logs'])

    sanitized_table_name = sa_firehose.firehose_log_name(table)

    # Get the current set of partitions
    partition_success, partitions = athena_client.run_athena_query(
        query='SHOW PARTITIONS {}'.format(sanitized_table_name),
        database=athena_client.sa_database)
    if not partition_success:
        LOGGER_CLI.error('An error occurred when loading partitions for %s',
                         sanitized_table_name)
        return

    unique_partitions = athena_helpers.unique_values_from_query(partitions)
    if not unique_partitions:
        LOGGER_CLI.info('No partitions to rebuild for %s, nothing to do',
                        sanitized_table_name)
        return

    # Drop the table
    LOGGER_CLI.info('Dropping table %s', sanitized_table_name)
    drop_success, _ = athena_client.run_athena_query(
        query='DROP TABLE {}'.format(sanitized_table_name),
        database=athena_client.sa_database)
    if not drop_success:
        LOGGER_CLI.error('An error occurred when dropping the %s table',
                         sanitized_table_name)
        return

    LOGGER_CLI.info('Dropped table %s', sanitized_table_name)

    # Re-create the table with previous partitions
    LOGGER_CLI.info('Creating table %s', sanitized_table_name)
    create_table(table, bucket, config)

    new_partitions_statement = athena_helpers.partition_statement(
        unique_partitions, bucket, sanitized_table_name)

    # Make sure our new alter table statement is within the query API limits
    if len(new_partitions_statement) > MAX_QUERY_LENGTH:
        LOGGER_CLI.error('Partition statement too large, writing to local file')
        with open('partitions_{}.txt'.format(sanitized_table_name), 'w') as partition_file:
            partition_file.write(new_partitions_statement)
        return

    LOGGER_CLI.info('Creating %d new partitions for %s',
                    len(unique_partitions), sanitized_table_name)
    new_part_success, _ = athena_client.run_athena_query(
        query=new_partitions_statement, database=athena_client.sa_database)
    if not new_part_success:
        LOGGER_CLI.error('Error re-creating new partitions for %s', sanitized_table_name)
        return

    LOGGER_CLI.info('Successfully rebuilt partitions for %s', sanitized_table_name)
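# `athena_helpers.partition_statement` is assumed to emit one bulk ALTER TABLE
# statement covering every recovered partition; for two hypothetical
# partitions of an example table, the output would resemble standard Hive DDL:
#
#   ALTER TABLE cloudwatch_events ADD IF NOT EXISTS
#     PARTITION (dt = '2017-08-26-14')
#       LOCATION 's3://example-bucket/cloudwatch_events/2017/08/26/14/'
#     PARTITION (dt = '2017-08-27-03')
#       LOCATION 's3://example-bucket/cloudwatch_events/2017/08/27/03/'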
def create_table(table, bucket, config, schema_override=None):
    """Create a 'streamalert' Athena table

    Args:
        table (str): The name of the table being created
        bucket (str): The s3 bucket to be used as the location for Athena data
        config (CLIConfig): Loaded StreamAlert CLI
        schema_override (set): An optional set of 'column_name=value_type' pairs
            used to override the configured schema
    """
    athena_client = StreamAlertAthenaClient(config, results_key_prefix='stream_alert_cli')

    sa_firehose = StreamAlertFirehose(config['global']['account']['region'],
                                      config['global']['infrastructure']['firehose'],
                                      config['logs'])

    # Convert special characters in schema name to underscores
    sanitized_table_name = sa_firehose.firehose_log_name(table)

    # Check that the log type is enabled via Firehose
    if sanitized_table_name != 'alerts' and sanitized_table_name not in sa_firehose.enabled_logs:
        LOGGER_CLI.error('Table name %s missing from configuration or '
                         'is not enabled.', sanitized_table_name)
        return

    # Check if the table exists
    if athena_client.check_table_exists(sanitized_table_name, True):
        LOGGER_CLI.info('The \'%s\' table already exists.', sanitized_table_name)
        return

    if table == 'alerts':
        # Use a fake alert to get the keys needed and their types
        alert = Alert('temp_rule_name', {}, {})
        output = alert.output_dict()
        schema = record_to_schema(output)
        athena_schema = handler_helpers.to_athena_schema(schema)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=table, bucket=bucket)

    else:  # all other tables are log types
        log_info = config['logs'][table.replace('_', ':', 1)]

        schema = dict(log_info['schema'])
        sanitized_schema = StreamAlertFirehose.sanitize_keys(schema)

        athena_schema = handler_helpers.to_athena_schema(sanitized_schema)

        # Add envelope keys to the Athena schema
        configuration_options = log_info.get('configuration')
        if configuration_options:
            envelope_keys = configuration_options.get('envelope_keys')
            if envelope_keys:
                sanitized_envelope_key_schema = StreamAlertFirehose.sanitize_keys(envelope_keys)
                # Note: this key is wrapped in backticks to be Hive compliant
                athena_schema['`streamalert:envelope_keys`'] = \
                    handler_helpers.to_athena_schema(sanitized_envelope_key_schema)

        # Handle schema overrides; this is useful when an Athena schema
        # needs to differ from the normal log schema
        if schema_override:
            for override in schema_override:
                # partition() tolerates overrides missing the '=' separator
                column_name, _, column_type = override.partition('=')
                if not all([column_name, column_type]):
                    LOGGER_CLI.error(
                        'Invalid schema override [%s], use column_name=type format',
                        override)
                    continue

                # Columns are escaped to avoid Hive issues with special characters
                column_name = '`{}`'.format(column_name)
                if column_name in athena_schema:
                    athena_schema[column_name] = column_type
                    LOGGER_CLI.info('Applied schema override: %s:%s',
                                    column_name, column_type)
                else:
                    LOGGER_CLI.error(
                        'Schema override column %s not found in Athena Schema, skipping',
                        column_name)

        query = _construct_create_table_statement(
            schema=athena_schema, table_name=sanitized_table_name, bucket=bucket)

    create_table_success, _ = athena_client.run_athena_query(
        query=query, database=athena_client.sa_database)

    if not create_table_success:
        LOGGER_CLI.error('The %s table could not be created', sanitized_table_name)
        return

    # Update the CLI config
    if (table != 'alerts' and bucket not in
            config['lambda']['athena_partition_refresh_config']['buckets']):
        config['lambda']['athena_partition_refresh_config']['buckets'][bucket] = 'data'
        config.write()

    LOGGER_CLI.info('The %s table was successfully created!', sanitized_table_name)
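# Example invocation with hypothetical values, e.g. from the CLI layer. The
# table name uses the underscored (Firehose-safe) form, which the function
# converts back to the 'type:subtype' log name when looking up the schema:
#
#   create_table(
#       table='cloudwatch_events',
#       bucket='example.streamalert.data',
#       config=config,
#       schema_override={'detail=map<string,string>'})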