示例#1
0
 def test_it_removes_queries_with_no_applicable_matches_for_partitioned_data(
         self, get_partitions_mock, get_table_mock):
     """Expect no queries for a partitioned table when no match applies.

     The only match is restricted to data mapper "C" while the data mapper
     under test is "A", so the result must be an empty list.
     """
     table_columns = [{"Name": "customer_id"}]
     get_table_mock.return_value = table_stub(table_columns,
                                              ["product_category"])
     get_partitions_mock.return_value = [
         partition_stub(values, table_columns)
         for values in (["Books"], ["Beauty"])
     ]
     data_mapper = {
         "DataMapperId": "A",
         "QueryExecutor": "athena",
         "Columns": [column["Name"] for column in table_columns],
         "Format": "parquet",
         "QueryExecutorParameters": {
             "DataCatalogProvider": "glue",
             "Database": "test_db",
             "Table": "test_table",
         },
     }
     matches = [{"MatchId": "123", "DataMappers": ["C"]}]

     queries = generate_athena_queries(data_mapper, matches)

     assert queries == []
示例#2
0
    def test_it_handles_multiple_partition_keys(self, get_partitions_mock,
                                                get_table_mock):
        """Expect one query carrying both partition keys of the partition.

        The table stub declares partition keys "year" and "month" and a
        single partition ("2019", "01"); the expected query lists one
        key/value pair per partition key.
        """
        table_columns = [{"Name": "customer_id"}]
        get_table_mock.return_value = table_stub(table_columns,
                                                 ["year", "month"])
        get_partitions_mock.return_value = [
            partition_stub(values, table_columns)
            for values in [["2019", "01"]]
        ]
        data_mapper = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Columns": [column["Name"] for column in table_columns],
            "Format": "parquet",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test_db",
                "Table": "test_table",
            },
        }
        matches = [{"MatchId": "hi"}]

        queries = generate_athena_queries(data_mapper, matches)

        expected_query = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Database": "test_db",
            "Table": "test_table",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["hi"], "Type": "Simple"},
            ],
            "PartitionKeys": [
                {"Key": "year", "Value": "2019"},
                {"Key": "month", "Value": "01"},
            ],
            "DeleteOldVersions": True,
        }
        assert queries == [expected_query]
示例#3
0
    def test_it_filters_users_from_non_applicable_tables(
            self, get_partitions_mock, get_table_mock):
        """Expect matches scoped to other data mappers to be left out.

        Match "123" is restricted to data mapper "A" and match "456" has an
        empty DataMappers list; for data mapper "B" only "456" is expected
        in the generated query.
        """
        table_columns = [{"Name": "customer_id"}]
        get_table_mock.return_value = table_stub(table_columns,
                                                 ["product_category"])
        get_partitions_mock.return_value = [
            partition_stub(values, table_columns) for values in [["Books"]]
        ]
        data_mapper = {
            "DataMapperId": "B",
            "QueryExecutor": "athena",
            "Columns": [column["Name"] for column in table_columns],
            "Format": "parquet",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test_db",
                "Table": "B",
            },
        }
        matches = [
            {"MatchId": "123", "DataMappers": ["A"]},
            {"MatchId": "456", "DataMappers": []},
        ]

        queries = generate_athena_queries(data_mapper, matches)

        expected_query = {
            "DataMapperId": "B",
            "Database": "test_db",
            "Table": "B",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["456"], "Type": "Simple"},
            ],
            "PartitionKeys": [{"Key": "product_category", "Value": "Books"}],
            "DeleteOldVersions": True,
        }
        assert queries == [expected_query]
示例#4
0
    def test_it_handles_multiple_columns(self, get_partitions_mock,
                                         get_table_mock):
        """Expect the single match to be listed under each mapped column.

        The data mapper declares two columns; the expected query contains
        one column entry per name, each carrying the match id "hi".
        """
        column_names = ["customer_id", "alt_customer_id"]
        get_table_mock.return_value = table_stub(column_names,
                                                 ["product_category"])
        get_partitions_mock.return_value = [
            partition_stub(values, column_names) for values in [["Books"]]
        ]
        data_mapper = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Columns": column_names,
            "Format": "parquet",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test_db",
                "Table": "test_table",
            },
        }
        matches = [{"MatchId": "hi"}]

        queries = generate_athena_queries(data_mapper, matches)

        expected_query = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Database": "test_db",
            "Table": "test_table",
            "Columns": [
                {"Column": "customer_id", "MatchIds": ["hi"]},
                {"Column": "alt_customer_id", "MatchIds": ["hi"]},
            ],
            "PartitionKeys": [{"Key": "product_category", "Value": "Books"}],
            "DeleteOldVersions": True,
        }
        assert queries == [expected_query]
示例#5
0
 def test_it_propagates_role_arn_for_unpartitioned_data(
         self, get_partitions_mock, get_table_mock):
     """Expect the data mapper's RoleArn to appear in the generated query.

     The table stub has no partition keys and no partitions are returned,
     so a single query with an empty PartitionKeys list is expected.
     """
     role_arn = "arn:aws:iam::accountid:role/rolename"
     table_columns = [{"Name": "customer_id"}]
     get_table_mock.return_value = table_stub(table_columns, [])
     get_partitions_mock.return_value = []
     data_mapper = {
         "DataMapperId": "a",
         "QueryExecutor": "athena",
         "Columns": [column["Name"] for column in table_columns],
         "Format": "parquet",
         "QueryExecutorParameters": {
             "DataCatalogProvider": "glue",
             "Database": "test_db",
             "Table": "test_table",
         },
         "RoleArn": role_arn,
     }
     matches = [{"MatchId": "hi"}]

     queries = generate_athena_queries(data_mapper, matches)

     expected_query = {
         "DataMapperId": "a",
         "Database": "test_db",
         "Table": "test_table",
         "QueryExecutor": "athena",
         "Format": "parquet",
         "Columns": [
             {"Column": "customer_id", "MatchIds": ["hi"], "Type": "Simple"},
         ],
         "PartitionKeys": [],
         "RoleArn": role_arn,
         "DeleteOldVersions": True,
     }
     assert queries == [expected_query]
示例#6
0
 def test_it_removes_queries_with_no_applicable_matches(
         self, get_partitions_mock, get_table_mock):
     """Expect no queries for an unpartitioned table when no match applies.

     The only match is restricted to data mapper "B" while the data mapper
     under test is "A", so the result must be an empty list.
     """
     column_names = ["customer_id"]
     get_table_mock.return_value = table_stub(column_names, [])
     get_partitions_mock.return_value = []
     data_mapper = {
         "DataMapperId": "A",
         "QueryExecutor": "athena",
         "Columns": column_names,
         "Format": "parquet",
         "QueryExecutorParameters": {
             "DataCatalogProvider": "glue",
             "Database": "test_db",
             "Table": "test_table",
         },
     }
     matches = [{"MatchId": "123", "DataMappers": ["B"]}]

     queries = generate_athena_queries(data_mapper, matches)

     assert queries == []
示例#7
0
 def test_it_handles_unpartitioned_data(self, get_partitions_mock,
                                        get_table_mock):
     """Expect a single query with no partition keys for an unpartitioned table.

     The table stub has no partition keys and the partitions mock returns
     an empty list.
     """
     column_names = ["customer_id"]
     get_table_mock.return_value = table_stub(column_names, [])
     get_partitions_mock.return_value = []
     data_mapper = {
         "DataMapperId": "a",
         "QueryExecutor": "athena",
         "Columns": column_names,
         "Format": "parquet",
         "QueryExecutorParameters": {
             "DataCatalogProvider": "glue",
             "Database": "test_db",
             "Table": "test_table",
         },
     }
     matches = [{"MatchId": "hi"}]

     queries = generate_athena_queries(data_mapper, matches)

     expected_query = {
         "DataMapperId": "a",
         "Database": "test_db",
         "Table": "test_table",
         "QueryExecutor": "athena",
         "Format": "parquet",
         "Columns": [{"Column": "customer_id", "MatchIds": ["hi"]}],
         "PartitionKeys": [],
         "DeleteOldVersions": True,
     }
     assert queries == [expected_query]
示例#8
0
    def test_it_handles_mixed_columns(self, get_partitions_mock,
                                      get_table_mock):
        """Expect simple and composite matches to be combined in one query.

        Per the expected output below: duplicate matches appear only once,
        simple match ids are listed under every column (as integers for the
        column typed "int"), and composite matches are grouped by their
        set of columns.
        """

        def composite_match(*column_values):
            # Builds a composite match restricted to data mapper "a" from
            # (column, value) pairs, in the order given.
            return {
                "MatchId": [
                    {"Column": column, "Value": value}
                    for column, value in column_values
                ],
                "Type": "Composite",
                "DataMappers": ["a"],
            }

        table_columns = [
            {"Name": "customer_id"},
            {"Name": "first_name"},
            {"Name": "last_name"},
            {"Name": "age", "Type": "int"},
        ]
        get_table_mock.return_value = table_stub(table_columns,
                                                 ["product_category"])
        get_partitions_mock.return_value = [
            partition_stub(values, table_columns) for values in [["Books"]]
        ]
        data_mapper = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Columns": [column["Name"] for column in table_columns],
            "Format": "parquet",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test_db",
                "Table": "test_table",
            },
        }
        matches = [
            {"MatchId": "12345", "Type": "Simple"},
            {"MatchId": "23456", "Type": "Simple"},
            {"MatchId": "23456", "Type": "Simple"},  # duplicate
            composite_match(("first_name", "John"), ("last_name", "Doe")),
            composite_match(("first_name", "Jane"), ("last_name", "Doe")),
            # duplicate
            composite_match(("first_name", "Jane"), ("last_name", "Doe")),
            composite_match(("last_name", "Smith"), ("age", "28")),
        ]

        queries = generate_athena_queries(data_mapper, matches)

        expected_query = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Format": "parquet",
            "Database": "test_db",
            "Table": "test_table",
            "Columns": [
                {
                    "Column": "customer_id",
                    "MatchIds": ["12345", "23456"],
                    "Type": "Simple",
                },
                {
                    "Column": "first_name",
                    "MatchIds": ["12345", "23456"],
                    "Type": "Simple",
                },
                {
                    "Column": "last_name",
                    "MatchIds": ["12345", "23456"],
                    "Type": "Simple",
                },
                {
                    "Column": "age",
                    "MatchIds": [12345, 23456],
                    "Type": "Simple",
                },
                {
                    "Columns": ["first_name", "last_name"],
                    "MatchIds": [["John", "Doe"], ["Jane", "Doe"]],
                    "Type": "Composite",
                },
                {
                    "Columns": ["age", "last_name"],
                    "MatchIds": [[28, "Smith"]],
                    "Type": "Composite",
                },
            ],
            "PartitionKeys": [{"Key": "product_category", "Value": "Books"}],
            "DeleteOldVersions": True,
        }
        assert queries == [expected_query]
示例#9
0
    def test_it_propagates_optional_properties(self, get_partitions_mock,
                                               get_table_mock):
        """Expect RoleArn and DeleteOldVersions to appear in every query.

        Two partitions are stubbed, so two queries are expected, each
        carrying the optional properties set on the data mapper.
        """
        # NOTE(review): other tests in view expect "DeleteOldVersions": True
        # even when the data mapper omits the key, so passing True here may
        # not distinguish propagation from a default - confirm.
        role_arn = "arn:aws:iam::accountid:role/rolename"
        column_names = ["customer_id"]
        get_table_mock.return_value = table_stub(column_names,
                                                 ["year", "month"])
        get_partitions_mock.return_value = [
            partition_stub(values, column_names)
            for values in [["2018", "12"], ["2019", "01"]]
        ]
        data_mapper = {
            "DataMapperId": "a",
            "QueryExecutor": "athena",
            "Columns": column_names,
            "Format": "parquet",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test_db",
                "Table": "test_table",
            },
            "RoleArn": role_arn,
            "DeleteOldVersions": True,
        }
        matches = [{"MatchId": "hi"}]

        queries = generate_athena_queries(data_mapper, matches)

        # One expected query per stubbed partition, in the same order.
        expected_queries = [
            {
                "DataMapperId": "a",
                "Database": "test_db",
                "Table": "test_table",
                "QueryExecutor": "athena",
                "Format": "parquet",
                "Columns": [{"Column": "customer_id", "MatchIds": ["hi"]}],
                "PartitionKeys": [
                    {"Key": "year", "Value": year},
                    {"Key": "month", "Value": month},
                ],
                "RoleArn": role_arn,
                "DeleteOldVersions": True,
            }
            for year, month in (("2018", "12"), ("2019", "01"))
        ]
        assert queries == expected_queries