Пример #1
0
    def test_create_partition_batches_by_one_hundred(self):
        """create_partitions() sends 150 partitions to Glue as a batch of 100, then a batch of 50."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = sorted(self.helper.create_many_partitions(count=150))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        # Replace the Glue client call so the arguments of each batch can be
        # inspected afterwards.
        batch_create = MagicMock(return_value=[])
        partitioner.glue.batch_create_partition = batch_create

        partitioner.create_partitions(partitions)

        # One expected call per batch: the first 100 partitions, then the rest.
        expected_calls = [
            call(
                DatabaseName=self.database,
                TableName=self.table,
                PartitionInputList=[
                    partitioner._partition_input(p) for p in batch
                ],
            )
            for batch in (partitions[:100], partitions[100:])
        ]

        batch_create.call_count.should.equal(2)
        batch_create.assert_has_calls(expected_calls)
Пример #2
0
    def test_create_partitions_error_output(self):
        """create_found_partitions() reports AlreadyExistsException and exits with 1.

        One partition is created up front, then partitions_to_create is
        mocked to return all ten, so that one creation is expected to fail.

        Technically this should _never_ happen, but on the off chance that
        batch_get_partition ever returns bad values the check is kept.
        """
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = sorted(self.helper.create_many_partitions(count=10))

        expected_output = (
            f"Running Partitioner for {self.database}.{self.table}\n"
            f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
            "\tFound 10 new partitions to create\n"
            "\t"
            + ", ".join(str(p) for p in partitions)
            + "\nOne or more errors occurred when attempting to create partitions\n"
            f"Error on {partitions[0].values}: AlreadyExistsException"
        )

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        # Pre-create the first partition so creating it again produces an error.
        partitioner.create_partitions([partitions[0]])
        partitioner.partitions_to_create = MagicMock(return_value=partitions)

        with captured_output() as (stdout, _stderr):
            create_found_partitions(partitioner)
        printed = stdout.getvalue().strip()
        printed.should.equal(expected_output)
        self.exit_mock.assert_called_with(1)

        # A fresh Partitioner is used to read the catalog back; all ten
        # partitions are expected to exist.
        verifier = Partitioner(
            self.database, self.table, aws_region=self.region)
        in_catalog = verifier.existing_partitions()

        set(in_catalog).should.equal(set(partitions))
Пример #3
0
    def test_delete_bad_partitions_dry_run(self):
        """delete-bad-partitions --dry-run lists the partitions but leaves all 10 in the catalog."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        # Partitions are created under a prefix other than this table's.
        partitions = sorted(
            self.helper.create_many_partitions(
                count=10, prefix="not-this-table"))

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        header = "Found 10 partitions to delete\nDeleting the following partitions:"
        expected_out = header + "".join(f"\n\t{p!s}" for p in partitions)

        argv = ["delete-bad-partitions", self.database, self.table, "--dry-run"]
        out, _err = self.get_cmd_output(cli, argv)
        out.should.equal(expected_out)

        remaining = partitioner.existing_partitions()
        remaining.should.have.length_of(10)
Пример #4
0
    def test_update_partitions_error_output(self):
        """update-partitions prints the reported update error and exits with 1."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        # Register a partition whose location points somewhere else.
        partition = self.helper.create_partition_data()
        partition.location = "s3://old-bucket/old-table/"
        partitioner.create_partitions([partition])

        # Force update_partition_locations to report a failure.
        failure = {
            "PartitionValues": partition.values,
            "ErrorDetail": {
                "ErrorCode": "PartitionNotFound",
                "ErrorMessage": "Partition not found"
            }
        }
        partitioner.update_partition_locations = MagicMock(
            return_value=[failure])

        # Make the CLI use the partitioner carrying the mock.
        cli.get_partitioner = MagicMock(return_value=partitioner)

        expected_output = (
            f"Found 1 moved partitions\n\t{partition}\n"
            "One or more errors occurred when attempting to update partitions\n"
            f"Error on {partition.values}: PartitionNotFound"
        )
        out, _err = self.get_cmd_output(
            cli, ["update-partitions", self.database, self.table])
        out.should.equal(expected_output)

        self.exit_mock.assert_called_with(1)
Пример #5
0
    def test_delete_missing_partitions(self):
        """delete-missing-partitions removes catalog partitions once the bucket has been emptied."""
        self.helper.make_database_and_table()
        cli = Cli()

        self.s3.create_bucket(Bucket=self.bucket)
        partitions = sorted(self.helper.create_many_partitions(count=10))

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        # Delete every object in the bucket.
        bucket = boto3.resource("s3").Bucket(self.bucket)
        for s3_object in bucket.objects.all():
            s3_object.delete()

        expected_out = "Found 10 partitions to delete:" + "".join(
            f"\n\t{p}" for p in partitions)

        argv = ["delete-missing-partitions", self.database, self.table]
        out, _err = self.get_cmd_output(cli, argv)
        out.should.equal(expected_out)

        remaining = partitioner.existing_partitions()
        remaining.should.have.length_of(0)
Пример #6
0
    def test_update_partitions(self):
        """update-partitions reports 5 moved partitions and points them back under the table prefix."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = sorted(self.helper.create_many_partitions(10))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        moved = partitions[:5]
        expected_lines = ["Found 5 moved partitions"]
        for part in moved:
            # Relocate first: the expected line is formatted from the
            # partition after its location has been changed.
            part.location = "s3://old-bucket/old-table/{}/".format(
                "/".join(part.values))
            expected_lines.append(f"\t{part}")
        expected_output = "\n".join(expected_lines)

        # Store the changed locations in the catalog.
        partitioner.update_partition_locations(moved)

        out, _err = self.get_cmd_output(
            cli, ["update-partitions", self.database, self.table])
        out.should.equal(expected_output)

        table_prefix = f"s3://{self.bucket}/{self.table}/"
        catalog = PartitionMap(partitioner.existing_partitions())
        for part in moved:
            current = catalog.get(part)
            current.should_not.be.false
            current.location.startswith(table_prefix).should.be.true
Пример #7
0
    def test_delete_partitions_in_groups_of_twenty_five(self):
        """delete_partitions() sends 30 partitions to Glue as a batch of 25, then a batch of 5."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = sorted(self.helper.create_many_partitions(count=30))

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        # Replace the Glue client call so the arguments of each batch can be
        # inspected afterwards.
        batch_delete = MagicMock(return_value=[])
        partitioner.glue.batch_delete_partition = batch_delete

        partitioner.delete_partitions(partitioner.existing_partitions())

        # One expected call per batch: the first 25 partitions, then the rest.
        expected_calls = [
            call(
                DatabaseName=self.database,
                TableName=self.table,
                PartitionsToDelete=[{"Values": p.values} for p in batch],
            )
            for batch in (partitions[:25], partitions[25:])
        ]

        batch_delete.call_count.should.equal(2)
        batch_delete.assert_has_calls(expected_calls)
Пример #8
0
    def test_delete_bad_partitions_error_output(self):
        """delete-bad-partitions prints the reported deletion error and exits with 1."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        # The partition is created under a prefix other than this table's.
        partition = self.helper.create_partition_data(prefix="not-this-table")
        partitioner.create_partitions([partition])

        # Force delete_partitions to report a failure.
        failure = {
            "PartitionValues": partition.values,
            "ErrorDetail": {
                "ErrorCode": "PartitionNotFound",
                "ErrorMessage": "Partition not found"
            }
        }
        partitioner.delete_partitions = MagicMock(return_value=[failure])
        # Make the CLI use the partitioner carrying the mock.
        cli.get_partitioner = MagicMock(return_value=partitioner)

        expected_output = (
            "Found 1 partitions to delete\n"
            "Deleting the following partitions:\n"
            f"\t{partition}\n"
            "One or more errors occurred when attempting to delete partitions\n"
            f"Error on {partition.values}: PartitionNotFound"
        )
        out, _err = self.get_cmd_output(
            cli, ["delete-bad-partitions", self.database, self.table])
        out.should.equal(expected_output)

        self.exit_mock.assert_called_with(1)
Пример #9
0
    def test_find_partitions_in_glue_catalog(self):
        """existing_partitions() returns the one created partition with matching values and location."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partition = self.helper.create_partition_data()
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions([partition])

        found = partitioner.existing_partitions()
        found.should.have.length_of(1)

        only = found[0]
        only.values.should.equal(partition.values)
        only.location.should.equal(partition.location)
Пример #10
0
    def test_create_partition_when_partition_exists(self):
        """create_partitions() returns the errors reported in the Glue response."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partition = self.helper.create_partition_data()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        # Canned Glue response reporting that the partition already exists.
        glue_response = {
            "Errors": [{
                "PartitionValues": partition.values,
                "ErrorDetail": {
                    "ErrorCode": "AlreadyExistsException",
                    "ErrorMessage": "Partition already exists"
                }
            }]
        }
        batch_create = MagicMock(return_value=glue_response)
        partitioner.glue.batch_create_partition = batch_create

        errors = partitioner.create_partitions([partition])

        batch_create.assert_called_once()
        errors.should.have.length_of(1)

        reported = errors[0]
        reported["PartitionValues"].should.equal(partition.values)
        reported["ErrorDetail"]["ErrorCode"].should.equal(
            "AlreadyExistsException")
Пример #11
0
    def test_update_partitions_no_partitions(self):
        """update-partitions prints "No partitions to update" when no location was changed."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = sorted(self.helper.create_many_partitions(10))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        # No location is altered here, so every partition is correctly located.
        argv = ["update-partitions", self.database, self.table]
        out, _err = self.get_cmd_output(cli, argv)
        out.should.equal("No partitions to update")
Пример #12
0
    def test_partitions_to_create(self):
        """partitions_to_create() returns only the on-disk partitions not yet in the catalog."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        # Two sets are generated with write=True; only the first set is
        # registered in the catalog below.
        registered = self.helper.create_many_partitions(count=10, write=True)
        unregistered = self.helper.create_many_partitions(count=3, write=True)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(registered)

        on_disk = partitioner.partitions_on_disk()
        pending = partitioner.partitions_to_create(on_disk)

        set(pending).should.equal(set(unregistered))
Пример #13
0
    def test_create_partitions_nothing_new(self):
        """The create-partitions command reports 0 new partitions when all already exist."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = sorted(self.helper.create_many_partitions(count=10))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        expected_output = (
            f"Running Partitioner for {self.database}.{self.table}\n"
            f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
            "\tFound 0 new partitions to create"
        )

        argv = ["create-partitions", self.database, self.table]
        out, _err = self.get_cmd_output(cli, argv)
        out.should.equal(expected_output)
Пример #14
0
    def test_create_partitions_nothing_new(self):
        """create_found_partitions() reports 0 new partitions when all already exist."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = sorted(self.helper.create_many_partitions(count=10))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        expected_output = (
            f"Running Partitioner for {self.database}.{self.table}\n"
            f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
            "\tFound 0 new partitions to create"
        )

        with captured_output() as (stdout, _stderr):
            create_found_partitions(partitioner)
        printed = stdout.getvalue().strip()
        printed.should.equal(expected_output)
Пример #15
0
    def test_update_partition_storage_descriptors(self):
        """Partitioner.update_partition_storage_descriptors() applies the table's columns to every partition."""
        self.helper.make_database_and_table()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(
            self.helper.create_many_partitions(write=False))

        # Column set to store on the table; the last column exists only in
        # this test.
        columns = [
            {"Name": "foo", "Type": "string"},
            {"Name": "bar", "Type": "string"},
            {"Name": "only-in-this-test", "Type": "string"},
        ]

        # Fetch the table definition and drop these keys before passing it
        # back to update_table as TableInput.
        table = partitioner.glue.get_table(
            DatabaseName=self.database, Name=self.table)["Table"]
        unwanted_keys = (
            "DatabaseName",
            "CreateTime",
            "CreatedBy",
            "IsRegisteredWithLakeFormation",
            "CatalogId",
        )
        for key in unwanted_keys:
            table.pop(key, None)

        table["StorageDescriptor"]["Columns"] = columns
        partitioner.glue.update_table(
            DatabaseName=self.database, TableInput=table)

        errors = partitioner.update_partition_storage_descriptors()
        errors.should.have.length_of(0)

        for existing in partitioner.existing_partitions():
            existing.raw["StorageDescriptor"]["Columns"].should.equal(columns)
Пример #16
0
    def test_delete_missing_partitions_no_partitions(self):
        """delete-missing-partitions reports 0 partitions and leaves all 10 in the catalog."""
        self.helper.make_database_and_table()
        cli = Cli()

        self.s3.create_bucket(Bucket=self.bucket)
        partitions = sorted(self.helper.create_many_partitions(count=10))

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        argv = ["delete-missing-partitions", self.database, self.table]
        out, _err = self.get_cmd_output(cli, argv)
        out.should.equal("Found 0 partitions to delete:")

        remaining = partitioner.existing_partitions()
        remaining.should.have.length_of(10)
Пример #17
0
    def test_create_partitions(self):
        """create_partitions() passes the partition's input to Glue's batch_create_partition."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partition = self.helper.create_partition_data()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        # Replace the Glue client call so its arguments can be inspected.
        batch_create = MagicMock(return_value=[])
        partitioner.glue.batch_create_partition = batch_create

        partitioner.create_partitions([partition])

        expected_inputs = [partitioner._partition_input(partition)]
        batch_create.assert_called_with(
            DatabaseName=self.database,
            TableName=self.table,
            PartitionInputList=expected_inputs,
        )
Пример #18
0
    def test_delete_all_partitions(self):
        """delete-all-partitions lists every partition and empties the catalog."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = sorted(self.helper.create_many_partitions(count=10))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        expected_out = "Deleting the following partitions:" + "".join(
            f"\n\t{p!s}" for p in partitions)

        argv = ["delete-all-partitions", self.database, self.table]
        out, _err = self.get_cmd_output(cli, argv)
        out.should.equal(expected_out)

        remaining = partitioner.existing_partitions()
        remaining.should.have.length_of(0)
Пример #19
0
    def test_delete_partitions(self):
        """delete_partitions() passes the existing partition's values to Glue's batch_delete_partition."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        # NOTE(review): the result of this call is discarded; it is kept in
        # case the helper has side effects the test relies on - confirm.
        self.helper.create_partition_data()

        partition = self.helper.create_partition_data()
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions([partition])

        # Replace the Glue client call so its arguments can be inspected.
        batch_delete = MagicMock(return_value=[])
        partitioner.glue.batch_delete_partition = batch_delete

        in_catalog = partitioner.existing_partitions()
        partitioner.delete_partitions(in_catalog)

        expected_payload = [{"Values": in_catalog[0].values}]
        batch_delete.assert_called_with(
            DatabaseName=self.database,
            TableName=self.table,
            PartitionsToDelete=expected_payload,
        )
Пример #20
0
    def test_create_partition_already_exists_in_multiple_batches(self):
        """create_partitions() returns the AlreadyExistsException errors from both batches."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = sorted(self.helper.create_many_partitions(count=150))
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        # Seed the catalog with two partitions: index 5 is among the first
        # 100 and index 115 is among the remainder.
        preexisting = [partitions[5], partitions[115]]
        seed_errors = partitioner.create_partitions(preexisting)
        seed_errors.should.be.empty

        # Creating the full set is expected to report exactly those two.
        errors = partitioner.create_partitions(partitions)
        errors.should.have.length_of(2)

        # Errors are expected in the same order as the seeded partitions.
        for error, partition in zip(errors, preexisting):
            error["PartitionValues"].should.equal(partition.values)
            error["ErrorDetail"]["ErrorCode"].should.equal(
                "AlreadyExistsException")