def test_it_removes_queries_with_no_applicable_matches_for_partitioned_data(
    self, get_partitions_mock, get_table_mock
):
    # Partitioned table handled by data mapper "A". The only match in the
    # queue is scoped to data mapper "C", so the test expects no queries at
    # all, even though two partitions are stubbed.
    column_defs = [{"Name": "customer_id"}]
    partition_values = [["Books"], ["Beauty"]]

    get_table_mock.return_value = table_stub(column_defs, ["product_category"])
    partition_stubs = []
    for values in partition_values:
        partition_stubs.append(partition_stub(values, column_defs))
    get_partitions_mock.return_value = partition_stubs

    data_mapper = {
        "DataMapperId": "A",
        "QueryExecutor": "athena",
        "Columns": [definition["Name"] for definition in column_defs],
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
    }
    deletion_queue = [{"MatchId": "123", "DataMappers": ["C"]}]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == []
def test_it_handles_multiple_partition_keys(self, get_partitions_mock, get_table_mock):
    # One partition described by two keys (year, month): the test expects a
    # single query whose "PartitionKeys" pairs each key with its value, in
    # key order.
    column_defs = [{"Name": "customer_id"}]
    partition_values = [["2019", "01"]]

    get_table_mock.return_value = table_stub(column_defs, ["year", "month"])
    get_partitions_mock.return_value = [
        partition_stub(values, column_defs) for values in partition_values
    ]

    data_mapper = {
        "DataMapperId": "a",
        "QueryExecutor": "athena",
        "Columns": [definition["Name"] for definition in column_defs],
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
    }
    deletion_queue = [{"MatchId": "hi"}]

    expected = [
        {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Database": "test_db",
            "Table": "test_table",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["hi"], "Type": "Simple"}
            ],
            "PartitionKeys": [
                {"Key": "year", "Value": "2019"},
                {"Key": "month", "Value": "01"},
            ],
            "DeleteOldVersions": True,
        }
    ]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == expected
def test_it_filters_users_from_non_applicable_tables(
    self, get_partitions_mock, get_table_mock
):
    # Data mapper "B" receives two matches: "123" is scoped to data mapper
    # "A" only, while "456" carries an empty "DataMappers" list. The test
    # expects only "456" to appear in the generated query.
    column_defs = [{"Name": "customer_id"}]
    partition_values = [["Books"]]

    get_table_mock.return_value = table_stub(column_defs, ["product_category"])
    partition_stubs = []
    for values in partition_values:
        partition_stubs.append(partition_stub(values, column_defs))
    get_partitions_mock.return_value = partition_stubs

    data_mapper = {
        "DataMapperId": "B",
        "QueryExecutor": "athena",
        "Columns": [definition["Name"] for definition in column_defs],
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "B",
        },
    }
    scoped_to_other_mapper = {"MatchId": "123", "DataMappers": ["A"]}
    unscoped = {"MatchId": "456", "DataMappers": []}

    expected = [
        {
            "DataMapperId": "B",
            "Database": "test_db",
            "Table": "B",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["456"], "Type": "Simple"}
            ],
            "PartitionKeys": [{"Key": "product_category", "Value": "Books"}],
            "DeleteOldVersions": True,
        }
    ]

    queries = generate_athena_queries(
        data_mapper, [scoped_to_other_mapper, unscoped]
    )

    assert queries == expected
def test_it_handles_multiple_columns(self, get_partitions_mock, get_table_mock):
    # Two match columns on a single-partition table: the test expects the
    # same match id to be listed once per column, in column order.
    column_names = ["customer_id", "alt_customer_id"]
    partition_values = [["Books"]]

    get_table_mock.return_value = table_stub(column_names, ["product_category"])
    get_partitions_mock.return_value = [
        partition_stub(values, column_names) for values in partition_values
    ]

    data_mapper = {
        "DataMapperId": "a",
        "QueryExecutor": "athena",
        "Columns": column_names,
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
    }
    deletion_queue = [{"MatchId": "hi"}]

    expected = [
        {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Database": "test_db",
            "Table": "test_table",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["hi"]},
                {"Column": "alt_customer_id", "MatchIds": ["hi"]},
            ],
            "PartitionKeys": [{"Key": "product_category", "Value": "Books"}],
            "DeleteOldVersions": True,
        }
    ]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == expected
def test_it_propagates_role_arn_for_unpartitioned_data(
    self, get_partitions_mock, get_table_mock
):
    # Unpartitioned table (no partition keys, no partitions) whose data
    # mapper carries a "RoleArn": the test expects one query with an empty
    # "PartitionKeys" list and the same "RoleArn" copied through.
    role_arn = "arn:aws:iam::accountid:role/rolename"
    column_defs = [{"Name": "customer_id"}]

    get_table_mock.return_value = table_stub(column_defs, [])
    get_partitions_mock.return_value = []

    data_mapper = {
        "DataMapperId": "a",
        "QueryExecutor": "athena",
        "Columns": [definition["Name"] for definition in column_defs],
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
        "RoleArn": role_arn,
    }
    deletion_queue = [{"MatchId": "hi"}]

    expected = [
        {
            "DataMapperId": "a",
            "Database": "test_db",
            "Table": "test_table",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["hi"], "Type": "Simple"}
            ],
            "PartitionKeys": [],
            "RoleArn": role_arn,
            "DeleteOldVersions": True,
        }
    ]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == expected
def test_it_removes_queries_with_no_applicable_matches(
    self, get_partitions_mock, get_table_mock
):
    # Unpartitioned table handled by data mapper "A"; the only match is
    # scoped to data mapper "B", so the test expects an empty result.
    column_names = ["customer_id"]

    get_table_mock.return_value = table_stub(column_names, [])
    get_partitions_mock.return_value = []

    data_mapper = {
        "DataMapperId": "A",
        "QueryExecutor": "athena",
        "Columns": column_names,
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
    }
    deletion_queue = [{"MatchId": "123", "DataMappers": ["B"]}]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == []
def test_it_handles_unpartitioned_data(self, get_partitions_mock, get_table_mock):
    # Table stubbed without partition keys or partitions: the test expects
    # exactly one query, with an empty "PartitionKeys" list.
    column_names = ["customer_id"]

    get_table_mock.return_value = table_stub(column_names, [])
    get_partitions_mock.return_value = []

    data_mapper = {
        "DataMapperId": "a",
        "QueryExecutor": "athena",
        "Columns": column_names,
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
    }
    deletion_queue = [{"MatchId": "hi"}]

    expected = [
        {
            "DataMapperId": "a",
            "Database": "test_db",
            "Table": "test_table",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Columns": [{"Column": "customer_id", "MatchIds": ["hi"]}],
            "PartitionKeys": [],
            "DeleteOldVersions": True,
        }
    ]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == expected
def test_it_handles_mixed_columns(self, get_partitions_mock, get_table_mock):
    # Mixes simple and composite matches against a four-column table in which
    # "age" is declared with type "int". The assertion below expects:
    #   * each distinct simple match id listed once under every column, as
    #     ints for the "age" column and as strings elsewhere;
    #   * composite matches grouped per set of columns, repeated entries
    #     listed once;
    #   * the last_name/age composite reported with columns ordered
    #     ["age", "last_name"] and the age value as an int.
    column_defs = [
        {"Name": "customer_id"},
        {"Name": "first_name"},
        {"Name": "last_name"},
        {"Name": "age", "Type": "int"},
    ]
    partition_values = [["Books"]]

    get_table_mock.return_value = table_stub(column_defs, ["product_category"])
    partition_stubs = []
    for values in partition_values:
        partition_stubs.append(partition_stub(values, column_defs))
    get_partitions_mock.return_value = partition_stubs

    data_mapper = {
        "DataMapperId": "a",
        "QueryExecutor": "athena",
        "Columns": [definition["Name"] for definition in column_defs],
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
    }

    simple_matches = [
        {"MatchId": "12345", "Type": "Simple"},
        {"MatchId": "23456", "Type": "Simple"},
        # Same id as the previous entry, repeated on purpose.
        {"MatchId": "23456", "Type": "Simple"},
    ]
    composite_matches = [
        {
            "MatchId": [
                {"Column": "first_name", "Value": "John"},
                {"Column": "last_name", "Value": "Doe"},
            ],
            "Type": "Composite",
            "DataMappers": ["a"],
        },
        {
            "MatchId": [
                {"Column": "first_name", "Value": "Jane"},
                {"Column": "last_name", "Value": "Doe"},
            ],
            "Type": "Composite",
            "DataMappers": ["a"],
        },
        # Same composite as the previous entry, repeated on purpose.
        {
            "MatchId": [
                {"Column": "first_name", "Value": "Jane"},
                {"Column": "last_name", "Value": "Doe"},
            ],
            "Type": "Composite",
            "DataMappers": ["a"],
        },
        {
            "MatchId": [
                {"Column": "last_name", "Value": "Smith"},
                {"Column": "age", "Value": "28"},
            ],
            "Type": "Composite",
            "DataMappers": ["a"],
        },
    ]

    expected_columns = [
        {
            "Column": "customer_id",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        },
        {
            "Column": "first_name",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        },
        {
            "Column": "last_name",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        },
        {"Column": "age", "MatchIds": [12345, 23456], "Type": "Simple"},
        {
            "Columns": ["first_name", "last_name"],
            "MatchIds": [["John", "Doe"], ["Jane", "Doe"]],
            "Type": "Composite",
        },
        {
            "Columns": ["age", "last_name"],
            "MatchIds": [[28, "Smith"]],
            "Type": "Composite",
        },
    ]
    expected = [
        {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Database": "test_db",
            "Table": "test_table",
            "Columns": expected_columns,
            "PartitionKeys": [{"Key": "product_category", "Value": "Books"}],
            "DeleteOldVersions": True,
        }
    ]

    queries = generate_athena_queries(
        data_mapper, simple_matches + composite_matches
    )

    assert queries == expected
def test_it_propagates_optional_properties(self, get_partitions_mock, get_table_mock):
    # Data mapper carrying the optional "RoleArn" and "DeleteOldVersions"
    # keys on a table with two (year, month) partitions: the test expects one
    # query per partition, each repeating both optional values.
    # NOTE(review): other tests in this file expect "DeleteOldVersions": True
    # when the key is omitted from the input, so passing True here may not
    # distinguish propagation from defaulting - confirm against the
    # implementation.
    role_arn = "arn:aws:iam::accountid:role/rolename"
    column_names = ["customer_id"]
    partition_values = [["2018", "12"], ["2019", "01"]]

    get_table_mock.return_value = table_stub(column_names, ["year", "month"])
    get_partitions_mock.return_value = [
        partition_stub(values, column_names) for values in partition_values
    ]

    data_mapper = {
        "DataMapperId": "a",
        "QueryExecutor": "athena",
        "Columns": column_names,
        "Format": "parquet",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test_db",
            "Table": "test_table",
        },
        "RoleArn": role_arn,
        "DeleteOldVersions": True,
    }
    deletion_queue = [{"MatchId": "hi"}]

    def expected_query(year, month):
        # Expected query for a single (year, month) partition.
        return {
            "DataMapperId": "a",
            "Database": "test_db",
            "Table": "test_table",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Columns": [{"Column": "customer_id", "MatchIds": ["hi"]}],
            "PartitionKeys": [
                {"Key": "year", "Value": year},
                {"Key": "month", "Value": month},
            ],
            "RoleArn": role_arn,
            "DeleteOldVersions": True,
        }

    expected = [expected_query("2018", "12"), expected_query("2019", "01")]

    queries = generate_athena_queries(data_mapper, deletion_queue)

    assert queries == expected