예제 #1
0
    def test_parse_from_aws(self):
        """Partition.from_aws_response should expose values and location, and
        a location lacking the trailing slash should compare equal to one
        that has it."""
        expected_values = ["2019", "01", "02", "03"]
        expected_location = "s3://bucket/location/2019/01/02/03/"

        # Response whose location already ends with a slash.
        with_slash = Partition.from_aws_response({
            "Values": ["2019", "01", "02", "03"],
            "StorageDescriptor": {
                "Location": "s3://bucket/location/2019/01/02/03/",
            },
        })
        for index, expected in enumerate(expected_values):
            with_slash.values[index].should.equal(expected)
        with_slash.location.should.equal(expected_location)

        # Same partition, but the location is missing its trailing slash;
        # the assertions below expect it to come back with the slash added.
        without_slash = Partition.from_aws_response({
            "Values": ["2019", "01", "02", "03"],
            "StorageDescriptor": {
                "Location": "s3://bucket/location/2019/01/02/03",
            },
        })
        for index, expected in enumerate(expected_values):
            without_slash.values[index].should.equal(expected)
        without_slash.location.should.equal(expected_location)

        without_slash.should.equal(with_slash)
예제 #2
0
파일: helper.py 프로젝트: webysther/glutil
    def create_partition_data(self,
                              values=None,
                              prefix=None,
                              bucket=None,
                              save=True):
        """Build a Partition under ``s3://{bucket}/{prefix}`` and optionally
        write it to S3.

        Args:
            values: Partition values, one per partition key. When falsy,
                ``self.create_partition_values()`` supplies them.
            prefix: Key prefix the partition lives under. When falsy,
                ``self.default_table`` is used. One leading slash is
                stripped and a trailing slash is appended if missing.
            bucket: Bucket name. When falsy, ``self.default_bucket`` is used.
            save: When true, the partition is passed to
                ``self.write_partition_to_s3`` before being returned.

        Returns:
            The constructed Partition.
        """
        if not values:
            values = self.create_partition_values()

        if not prefix:
            prefix = self.default_table

        # startswith/endswith never raise on an empty string, unlike
        # indexing, so a prefix of "/" no longer blows up here.
        if prefix.startswith("/"):
            prefix = prefix[1:]
        if prefix and not prefix.endswith("/"):
            prefix = prefix + "/"

        if not bucket:
            bucket = self.default_bucket

        # One path segment per value, so tables with any number of
        # partition keys get a matching key instead of exactly four.
        s3_key = prefix + "".join(f"{value}/" for value in values)
        location = f"s3://{bucket}/{s3_key}"

        partition = Partition(values, location)
        if save:
            self.write_partition_to_s3(partition)

        return partition
예제 #3
0
    def test_find_partitions_with_limit_hive_format(self):
        """partitions_on_disk(limit_days=7) should return only the seven most
        recent days when partitions use hive-style key=value paths."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        now = pendulum.now()
        hour = "03"

        # One partition per day for the ten days before today, newest first.
        written = []
        for days_ago in range(1, 11):
            stamp = now.subtract(days=days_ago)
            year = stamp.strftime("%Y")
            month = stamp.strftime("%m")
            day = stamp.strftime("%d")

            location = f"s3://{self.bucket}/{self.table}/year={year}/month={month}/day={day}/hour={hour}/"
            partition = Partition([year, month, day, hour], location)
            self.helper.write_partition_to_s3(partition)
            written.append(partition)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        on_disk = partitioner.partitions_on_disk(limit_days=7)

        on_disk.should.have.length_of(7)
        set(on_disk).should.equal(set(written[:7]))
예제 #4
0
    def test_update_partition_locations_with_mix_of_good_and_bad(self):
        """update_partition_locations, given one partition registered in the
        catalog and one that is not, should call update_partition for the
        former and return a single error entry for the latter."""
        self.helper.make_database_and_table()

        old_location = "s3://old-bucket/table/data1/"
        new_location = f"s3://{self.bucket}/{self.table}/2019-01-01-01/"
        registered = Partition(["2019", "01", "01", "01"], old_location)
        unregistered = Partition(["2018", "02", "02", "02"],
                                 "s3://old-bucket/table/data2/")

        # Only `registered` is created in the catalog.
        partition_input = {
            "Values": registered.values,
            "StorageDescriptor": {
                "Location": registered.location
            }
        }
        self.glue.create_partition(DatabaseName=self.database,
                                   TableName=self.table,
                                   PartitionInput=partition_input)

        registered.location = new_location

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        update_spy = MagicMock()
        partitioner.glue.update_partition = update_spy

        errors = partitioner.update_partition_locations(
            [unregistered, registered])

        expected_input = {
            "Values": registered.values,
            "StorageDescriptor": {
                "Location": new_location
            }
        }
        update_spy.assert_called_with(DatabaseName=self.database,
                                      TableName=self.table,
                                      PartitionValueList=registered.values,
                                      PartitionInput=expected_input)

        errors.should.have.length_of(1)
        errors[0]["Partition"].should.equal(unregistered.values)
예제 #5
0
    def test_partition_comparisons(self):
        """Exercise Partition ordering and equality across differing values
        and differing locations."""
        earlier = Partition(["2019", "01", "01", "01"], "s3://bucket/table/")
        later = Partition(["2019", "02", "02", "02"], "s3://bucket/table2/")
        (earlier > later).should.be.false
        (earlier < later).should.be.true

        # Identical values and location: equal, and _cmp reports 0.
        twin = Partition(["2019", "01", "01", "01"], "s3://bucket/table/")
        (earlier == twin).should.be.true
        earlier._cmp(twin).should.equal(0)

        # Identical values, different location.
        other_location = Partition(["2019", "01", "01", "01"],
                                   "s3://bucket/z-table/")
        (earlier > other_location).should.be.true
        (other_location > earlier).should.be.false
예제 #6
0
    def test_update_partition_locations_with_non_existent_partition(self):
        """update_partition_locations, given a partition that was never
        created in the catalog, should return one error entry and never call
        update_partition."""
        self.helper.make_database_and_table()
        missing = Partition(["2019", "01", "01", "01"], "s3://who/cares/")

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        update_spy = MagicMock()
        partitioner.glue.update_partition = update_spy

        errors = partitioner.update_partition_locations([missing])

        errors.should.have.length_of(1)
        errors[0]["Partition"].should.equal(missing.values)
        update_spy.assert_not_called()
예제 #7
0
    def test_find_partitions_with_limit_hive_format_capital_keys(self):
        """Partitioner.partitions_on_disk() with limit days set should work on hive-formatted partitions where the keys are not lowercase"""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database()
        # Capitalized key names; the S3 paths below use the same casing.
        self.helper.make_table(partition_keys=[
            {
                "Name": "Year",
                "Type": "int"
            },
            {
                "Name": "Month",
                "Type": "int"
            },
            {
                "Name": "Day",
                "Type": "int"
            },
            {
                "Name": "Hour",
                "Type": "int"
            },
        ])

        today = pendulum.now()

        # One partition per day for the ten days before today, newest first.
        partitions = []
        for i in range(1, 11):
            partition_date = today.subtract(days=i)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")
            hour = "03"

            partition = Partition(
                [year, month, day, hour],
                f"s3://{self.bucket}/{self.table}/Year={year}/Month={month}/Day={day}/Hour={hour}/")
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk(limit_days=7)
        found_partitions.should.have.length_of(7)
        set(found_partitions).should.equal(set(partitions[0:7]))
예제 #8
0
    def test_delete_nonexistent_partition(self):
        """delete_partitions, given a partition that was never created,
        should return one error entry with code EntityNotFoundException."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        location = f"s3://{self.bucket}/{self.table}/2019/01/02/03/"
        never_created = Partition(["2019", "01", "02", "03"], location)

        errors = partitioner.delete_partitions([never_created])

        errors.should.have.length_of(1)
        first_error = errors[0]
        first_error["PartitionValues"].should.equal(
            ["2019", "01", "02", "03"])
        first_error["ErrorDetail"]["ErrorCode"].should.equal(
            "EntityNotFoundException")
예제 #9
0
    def test_find_partitions_with_limit_no_hour_partition(self):
        """partitions_on_disk(limit_days=4) on a table partitioned only by
        year/month/day should return the four most recent days."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.glue.create_database(**self.helper.create_database_input())

        table_input = self.helper.create_table_input(
            location=f"s3://{self.bucket}/{self.table}/")
        # Replace the partition keys with a day-granularity scheme.
        table_input["TableInput"]["PartitionKeys"] = [
            {"Name": key_name, "Type": "string"}
            for key_name in ("year", "month", "day")
        ]
        self.glue.create_table(**table_input)

        now = pendulum.now()

        # One partition per day for the ten days before today, newest first.
        written = []
        for days_ago in range(1, 11):
            stamp = now.subtract(days=days_ago)
            year = stamp.strftime("%Y")
            month = stamp.strftime("%m")
            day = stamp.strftime("%d")

            location = f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/"
            partition = Partition([year, month, day], location)
            self.helper.write_partition_to_s3(partition)
            written.append(partition)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        on_disk = partitioner.partitions_on_disk(limit_days=4)

        on_disk.should.have.length_of(4)
        set(on_disk).should.equal(set(written[:4]))
예제 #10
0
    def test_find_partitions_in_s3_with_hive_formatted_partitions(self):
        """Partitioner.partitions_on_disk() should find every partition
        written under hive-style key=value paths."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        # Ten partitions for 2019-01-01 .. 2019-01-10, all at hour 03.
        partitions = []
        for i in range(1, 11):
            partition = Partition(
                ["2019", "01", f"{i:02d}", "03"],
                f"s3://{self.bucket}/{self.table}/year=2019/month=01/day={i:02d}/hour=03/")
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk()

        set(found_partitions).should.equal(set(partitions))
예제 #11
0
    def test_find_partitions_single_key(self):
        """partitions_on_disk() should find a hive-style partition on a
        table that has a single partition key."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.glue.create_database(**self.helper.create_database_input())

        table_input = self.helper.create_table_input()
        table_input["TableInput"]["PartitionKeys"] = [
            {"Name": "dt", "Type": "string"},
        ]
        self.glue.create_table(**table_input)

        # Put one object under the table location so the partition exists.
        table_location = table_input["TableInput"]["StorageDescriptor"][
            "Location"]
        location = f"{table_location}/dt=2019-01-02/"
        object_url = f"{location}object.json"

        # Split "s3://bucket/key" into its bucket and key parts.
        bucket_and_key = object_url[len("s3://"):].split("/", 1)
        self.s3.put_object(
            Body='{"foo": "bar"}',
            Bucket=bucket_and_key[0],
            Key=bucket_and_key[1],
        )

        expected = [Partition(["2019-01-02"], location)]

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        on_disk = partitioner.partitions_on_disk()

        set(on_disk).should.equal(set(expected))
예제 #12
0
    def test_create_partitions_limit_days(self):
        """The create-partitions command with --limit-days=7 should report
        and create only the seven most recent of ten partitions on disk."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        now = pendulum.now()
        hour = "03"

        # One partition per day for the ten days before today.
        written = []
        for days_ago in range(1, 11):
            stamp = now.subtract(days=days_ago)
            year = stamp.strftime("%Y")
            month = stamp.strftime("%m")
            day = stamp.strftime("%d")

            location = f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/{hour}/"
            partition = Partition([year, month, day, hour], location)
            self.helper.write_partition_to_s3(partition)
            written.append(partition)

        # After sorting, the last seven entries are the expected ones.
        written.sort()
        expected_partitions = written[3:]

        header = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 7 new partitions to create\n\t"
        expected_output = header + ", ".join(
            str(partition) for partition in expected_partitions)

        argv = ["create-partitions", self.database, self.table,
                "--limit-days=7"]
        out, _ = self.get_cmd_output(cli, argv)
        out.should.equal(expected_output)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        in_catalog = partitioner.existing_partitions()
        in_catalog.should.have.length_of(7)
        set(in_catalog).should.equal(set(expected_partitions))
예제 #13
0
    def test_find_partitions_with_limit_days_and_prefix(self):
        """Partitioner.partitions_on_disk() with limit_days and prefix_partitions should find preceding partitions with hive-format names"""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database()
        self.helper.make_table(partition_keys=[
            {
                "Name": "region",
                "Type": "string"
            },
            {
                "Name": "year",
                "Type": "int"
            },
            {
                "Name": "month",
                "Type": "int"
            },
            {
                "Name": "day",
                "Type": "int"
            },
        ])

        today = pendulum.now()

        # For each of the ten days before today (newest first), write one
        # partition per region. Each region gets its own S3 prefix so that
        # the prefix_partitions filter below has something to exclude.
        east_partitions = []
        for i in range(1, 11):
            partition_date = today.subtract(days=i)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")

            partition_east = Partition(
                ["us-east-1", year, month, day],
                f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/")
            partition_west = Partition(
                ["us-west-2", year, month, day],
                f"s3://{self.bucket}/{self.table}/region=us-west-2/year={year}/month={month}/day={day}/")
            self.helper.write_partition_to_s3(partition_east)
            self.helper.write_partition_to_s3(partition_west)
            east_partitions.append(partition_east)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk(
            limit_days=4, prefix_partitions=["us-east-1"])
        found_partitions.should.have.length_of(4)

        # Only the four most recent us-east-1 partitions are expected.
        set(found_partitions).should.equal(set(east_partitions[:4]))