示例#1
0
    def test_find_partitions_with_limit_hive_format(self):
        """partitions_on_disk(limit_days=7) should find hive-formatted partitions.

        Writes ten daily partitions (one day ago back to ten days ago) under
        ``year=/month=/day=/hour=`` paths and expects exactly the first seven
        written, i.e. the seven most recent days, to come back.
        """
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        now = pendulum.now()
        hour = "03"

        written = []
        for days_back in range(1, 11):
            stamp = now.subtract(days=days_back)
            year, month, day = (
                stamp.strftime(fmt) for fmt in ("%Y", "%m", "%d"))
            location = f"s3://{self.bucket}/{self.table}/year={year}/month={month}/day={day}/hour={hour}/"

            hive_partition = Partition([year, month, day, hour], location)
            self.helper.write_partition_to_s3(hive_partition)
            written.append(hive_partition)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        found = partitioner.partitions_on_disk(limit_days=7)

        # ``written`` is ordered newest first, so its first seven entries
        # are the ones expected to be found.
        found.should.have.length_of(7)
        set(found).should.equal(set(written[:7]))
示例#2
0
    def test_find_partitions_in_s3(self):
        """partitions_on_disk() without arguments should return every
        partition produced by the helper's create_many_partitions()."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        expected = self.helper.create_many_partitions(count=10)

        on_disk = Partitioner(
            self.database,
            self.table,
            aws_region=self.region,
        ).partitions_on_disk()

        # Compare as sets: the order of the returned partitions is not
        # part of what this test checks.
        set(on_disk).should.equal(set(expected))
示例#3
0
    def test_find_partitions_with_limit_hive_format_capital_keys(self):
        """Partitioner.partitions_on_disk() with limit days set should work on hive-formatted partitions where the keys are not lowercase"""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database()
        # Capitalized partition key names are the whole point of this test;
        # the S3 paths written below use the same capitalization.
        self.helper.make_table(partition_keys=[
            {"Name": name, "Type": "int"}
            for name in ("Year", "Month", "Day", "Hour")
        ])

        today = pendulum.now()

        # One partition per day, from one day ago back to ten days ago,
        # so partitions[0:7] are the seven most recent days.
        partitions = []
        for i in range(1, 11):
            partition_date = today.subtract(days=i)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")
            hour = "03"

            partition = Partition(
                [year, month, day, hour],
                f"s3://{self.bucket}/{self.table}/Year={year}/Month={month}/Day={day}/Hour={hour}/")
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk(limit_days=7)

        found_partitions.should.have.length_of(7)
        set(found_partitions).should.equal(set(partitions[0:7]))
示例#4
0
    def test_find_partitions_with_limit_no_hour_partition(self):
        """partitions_on_disk(limit_days=4) should work on a day-partitioned table.

        The table has only year/month/day partition keys (no hour), and the
        partitions are written under plain ``{year}/{month}/{day}/`` paths.
        """
        self.s3.create_bucket(Bucket=self.bucket)
        self.glue.create_database(**self.helper.create_database_input())

        table_input = self.helper.create_table_input(
            location=f"s3://{self.bucket}/{self.table}/")
        # Replace whatever partition keys the helper supplied with a
        # three-level, day-granularity scheme.
        table_input["TableInput"]["PartitionKeys"] = [
            {"Name": key, "Type": "string"}
            for key in ("year", "month", "day")
        ]
        self.glue.create_table(**table_input)

        now = pendulum.now()

        written = []
        for days_back in range(1, 11):
            stamp = now.subtract(days=days_back)
            year, month, day = (
                stamp.strftime(fmt) for fmt in ("%Y", "%m", "%d"))

            daily = Partition(
                [year, month, day],
                f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/")
            self.helper.write_partition_to_s3(daily)
            written.append(daily)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        found = partitioner.partitions_on_disk(limit_days=4)

        # ``written`` is newest first, so the four most recent days are
        # its first four entries.
        found.should.have.length_of(4)
        set(found).should.equal(set(written[:4]))
示例#5
0
    def test_partitions_to_create(self):
        """partitions_to_create() should return only the partitions found on
        disk that were not already passed to create_partitions()."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        # Two batches from the helper: the first gets registered through the
        # partitioner below, the second is left unregistered.
        registered = self.helper.create_many_partitions(count=10, write=True)
        pending = self.helper.create_many_partitions(count=3, write=True)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(registered)

        on_disk = partitioner.partitions_on_disk()
        missing = partitioner.partitions_to_create(on_disk)

        set(missing).should.equal(set(pending))
示例#6
0
    def test_create_partitions(self):
        """create_found_partitions() should print the expected report for ten
        new partitions, after which partitions_on_disk() returns those ten."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = self.helper.create_many_partitions(count=10)
        # The expected report lists the partitions in sorted order.
        partitions.sort()

        report_header = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
        expected_output = report_header + ", ".join(
            str(partition) for partition in partitions)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        with captured_output() as (out, err):
            create_found_partitions(partitioner, dry_run=False)

        printed = out.getvalue().strip()
        printed.should.equal(expected_output)

        on_disk = partitioner.partitions_on_disk()
        set(on_disk).should.equal(set(partitions))
示例#7
0
    def test_find_partitions_in_s3_with_hive_formatted_partitions(self):
        """partitions_on_disk() should find hive-formatted partitions in S3.

        Writes ten partitions for 2019-01-01 through 2019-01-10 (hour 03)
        under ``year=/month=/day=/hour=`` paths and expects all of them to
        be returned when no limit is given.
        """
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = []
        for i in range(1, 11):
            day = f"{i:02d}"
            partition = Partition(
                ["2019", "01", day, "03"],
                f"s3://{self.bucket}/{self.table}/year=2019/month=01/day={day}/hour=03/")
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk()

        set(found_partitions).should.equal(set(partitions))
示例#8
0
    def test_create_partitions(self):
        """The ``create-partitions`` CLI command should print the expected
        report for ten new partitions, after which partitions_on_disk()
        returns those ten."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = self.helper.create_many_partitions(count=10)
        # The expected report lists the partitions in sorted order.
        partitions.sort()

        report_header = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
        expected_output = report_header + ", ".join(
            str(partition) for partition in partitions)

        command = ["create-partitions", self.database, self.table]
        out, _err = self.get_cmd_output(cli, command)
        out.should.equal(expected_output)

        on_disk = Partitioner(
            self.database,
            self.table,
            aws_region=self.region,
        ).partitions_on_disk()

        set(on_disk).should.equal(set(partitions))
示例#9
0
    def test_find_partitions_single_key(self):
        """partitions_on_disk() should handle a table with one partition key.

        The table is partitioned by a single ``dt`` key, and one object is
        stored under a hive-formatted ``dt=2019-01-02`` path.
        """
        self.s3.create_bucket(Bucket=self.bucket)
        self.glue.create_database(**self.helper.create_database_input())

        table_input = self.helper.create_table_input()
        table_input["TableInput"]["PartitionKeys"] = [
            {"Name": "dt", "Type": "string"},
        ]
        self.glue.create_table(**table_input)

        # Put a single object under the partition's location.
        table_location = table_input["TableInput"]["StorageDescriptor"]["Location"]
        location = f"{table_location}/dt=2019-01-02/"
        object_url = f"{location}object.json"
        # Drop the scheme, then split "bucket/key" at the first slash.
        pieces = object_url[len("s3://"):].split("/", 1)
        bucket, key = pieces[0], pieces[1]

        self.s3.put_object(Body='{"foo": "bar"}', Bucket=bucket, Key=key)

        expected = [Partition(["2019-01-02"], location)]

        on_disk = Partitioner(
            self.database,
            self.table,
            aws_region=self.region,
        ).partitions_on_disk()

        set(on_disk).should.equal(set(expected))
示例#10
0
    def test_create_partitions_on_disk_with_bad_table_location(self):
        """partitions_on_disk() should cope with a table location that has no
        trailing slash, reporting the partition at a properly slash-joined
        location."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.glue.create_database(**self.helper.create_database_input())

        # The missing trailing slash on the table location is deliberate:
        # it is exactly the condition this test exercises.
        bad_location = f"s3://{self.bucket}/{self.table}"
        self.glue.create_table(
            **self.helper.create_table_input(location=bad_location))

        # NOTE(review): presumably create_partition_data() also writes the
        # partition to S3, since exactly one partition is expected on disk
        # below; confirm against the helper.
        partition = self.helper.create_partition_data()
        values_path = "/".join(partition.values)
        expected_location = f"s3://{self.bucket}/{self.table}/{values_path}/"

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        on_disk = partitioner.partitions_on_disk()

        on_disk.should.have.length_of(1)
        on_disk[0].location.should.equal(expected_location)
示例#11
0
    def test_find_partitions_with_limit_days_and_prefix(self):
        """Partitioner.partitions_on_disk() with limit_days and prefix_partitions should find preceding partitions with hive-format names

        For each of the last ten days, one partition is written under
        ``region=us-east-1`` and one under ``region=us-west-2``.  Searching
        with ``limit_days=4`` and ``prefix_partitions=["us-east-1"]`` must
        return only the four most recent us-east-1 partitions.
        """
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database()
        self.helper.make_table(partition_keys=[
            {"Name": "region", "Type": "string"},
            {"Name": "year", "Type": "int"},
            {"Name": "month", "Type": "int"},
            {"Name": "day", "Type": "int"},
        ])

        today = pendulum.now()

        for i in range(1, 11):
            partition_date = today.subtract(days=i)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")

            partition_east = Partition(
                ["us-east-1", year, month, day],
                f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/")
            # The west partition must live under its own region prefix;
            # otherwise the prefix filter is never given anything to
            # exclude and the test cannot detect a broken filter.
            partition_west = Partition(
                ["us-west-2", year, month, day],
                f"s3://{self.bucket}/{self.table}/region=us-west-2/year={year}/month={month}/day={day}/")
            self.helper.write_partition_to_s3(partition_east)
            self.helper.write_partition_to_s3(partition_west)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk(
            limit_days=4, prefix_partitions=["us-east-1"])
        found_partitions.should.have.length_of(4)

        # Expected result: the us-east-1 partitions for the four most
        # recent days, and nothing from us-west-2.
        to_be_found = []
        for i in range(1, 5):
            partition_date = today.subtract(days=i)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")

            to_be_found.append(
                Partition(
                    ["us-east-1", year, month, day],
                    f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/"))

        set(found_partitions).should.equal(set(to_be_found))