def test_parse_from_aws(self):
    normal_aws_response = {
        "Values": ["2019", "01", "02", "03"],
        "StorageDescriptor": {
            "Location": "s3://bucket/location/2019/01/02/03/",
        },
    }

    partition = Partition.from_aws_response(normal_aws_response)
    partition.values[0].should.equal("2019")
    partition.values[1].should.equal("01")
    partition.values[2].should.equal("02")
    partition.values[3].should.equal("03")
    partition.location.should.equal("s3://bucket/location/2019/01/02/03/")

    # Confirm location gets normalized by Partition
    bad_location_aws_response = {
        "Values": ["2019", "01", "02", "03"],
        "StorageDescriptor": {
            "Location": "s3://bucket/location/2019/01/02/03",
        },
    }

    partition2 = Partition.from_aws_response(bad_location_aws_response)
    partition2.values[0].should.equal("2019")
    partition2.values[1].should.equal("01")
    partition2.values[2].should.equal("02")
    partition2.values[3].should.equal("03")
    partition2.location.should.equal("s3://bucket/location/2019/01/02/03/")

    partition2.should.equal(partition)
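# The assertions above pin down two behaviors of Partition.from_aws_response:
# values are taken straight from "Values", and the StorageDescriptor location
# is normalized to end with a trailing slash. A minimal sketch of a function
# satisfying those assertions (not the library's actual implementation, which
# may do additional validation) could look like this:
def _sketch_from_aws_response(response):
    """Hypothetical illustration only; assumes the Partition(values, location)
    constructor used throughout these tests."""
    values = response["Values"]
    location = response["StorageDescriptor"]["Location"]
    if not location.endswith("/"):
        location += "/"  # normalize so equivalent partitions compare equal
    return Partition(values, location)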
def create_partition_data(self, values=None, prefix=None, bucket=None, save=True):
    if not values:
        values = self.create_partition_values()

    if not prefix:
        prefix = self.default_table
    if prefix[0] == "/":
        prefix = prefix[1:]
    if prefix[-1] != "/":
        prefix = prefix + "/"

    if not bucket:
        bucket = self.default_bucket

    s3_key = f"{prefix}{values[0]}/{values[1]}/{values[2]}/{values[3]}/"
    location = f"s3://{bucket}/{s3_key}"

    partition = Partition(values, location)
    if save:
        self.write_partition_to_s3(partition)

    return partition
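# Hypothetical usage of the helper above (assumes the helper's default_table
# and default_bucket attributes are set elsewhere, e.g. "table" and "bucket";
# names here are illustrative):
#
#   partition = helper.create_partition_data(values=["2019", "01", "02", "03"])
#   partition.location  # -> "s3://bucket/table/2019/01/02/03/"
#
# Passing save=False builds the Partition without writing any objects to S3.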
def test_find_partitions_with_limit_hive_format(self):
    """Partitioner.partitions_on_disk() with limit days set should work
    on hive-formatted partitions"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    today = pendulum.now()

    partitions = []
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        hour = "03"

        partition = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/year={year}/month={month}/day={day}/hour={hour}/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(limit_days=7)

    found_partitions.should.have.length_of(7)
    set(found_partitions).should.equal(set(partitions[0:7]))
def test_update_partition_locations_with_mix_of_good_and_bad(self):
    self.helper.make_database_and_table()

    good_old_location = "s3://old-bucket/table/data1/"
    good_new_location = f"s3://{self.bucket}/{self.table}/2019-01-01-01/"
    good_partition = Partition(["2019", "01", "01", "01"], good_old_location)

    bad_partition = Partition(["2018", "02", "02", "02"], "s3://old-bucket/table/data2/")

    self.glue.create_partition(
        DatabaseName=self.database,
        TableName=self.table,
        PartitionInput={
            "Values": good_partition.values,
            "StorageDescriptor": {"Location": good_partition.location},
        })

    good_partition.location = good_new_location

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)

    mock = MagicMock()
    partitioner.glue.update_partition = mock

    errors = partitioner.update_partition_locations([bad_partition, good_partition])

    mock.assert_called_with(
        DatabaseName=self.database,
        TableName=self.table,
        PartitionValueList=good_partition.values,
        PartitionInput={
            "Values": good_partition.values,
            "StorageDescriptor": {"Location": good_new_location},
        })

    errors.should.have.length_of(1)
    errors[0]["Partition"].should.equal(bad_partition.values)
def test_partition_comparisons(self):
    p1 = Partition(["2019", "01", "01", "01"], "s3://bucket/table/")
    p2 = Partition(["2019", "02", "02", "02"], "s3://bucket/table2/")
    (p1 > p2).should.be.false
    (p1 < p2).should.be.true

    p3 = Partition(["2019", "01", "01", "01"], "s3://bucket/table/")
    (p1 == p3).should.be.true
    p1._cmp(p3).should.equal(0)

    p4 = Partition(["2019", "01", "01", "01"], "s3://bucket/z-table/")
    (p1 > p4).should.be.true
    (p4 > p1).should.be.false
def test_update_partition_locations_with_non_existent_partition(self):
    self.helper.make_database_and_table()
    bad_partition = Partition(["2019", "01", "01", "01"], "s3://who/cares/")
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)

    mock = MagicMock()
    partitioner.glue.update_partition = mock

    errors = partitioner.update_partition_locations([bad_partition])

    errors.should.have.length_of(1)
    errors[0]["Partition"].should.equal(bad_partition.values)
    mock.assert_not_called()
def test_find_partitions_with_limit_hive_format_capital_keys(self):
    """Partitioner.partitions_on_disk() with limit days set should work on
    hive-formatted partitions where the keys are not lowercase"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database()
    self.helper.make_table(partition_keys=[
        {"Name": "Year", "Type": "int"},
        {"Name": "Month", "Type": "int"},
        {"Name": "Day", "Type": "int"},
        {"Name": "Hour", "Type": "int"},
    ])

    today = pendulum.now()

    partitions = []
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        hour = "03"

        partition = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/Year={year}/Month={month}/Day={day}/Hour={hour}/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(limit_days=7)

    found_partitions.should.have.length_of(7)
    set(found_partitions).should.equal(set(partitions[0:7]))
def test_delete_nonexistent_partition(self):
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)

    partition = Partition(
        ["2019", "01", "02", "03"],
        f"s3://{self.bucket}/{self.table}/2019/01/02/03/")

    result = partitioner.delete_partitions([partition])
    result.should.have.length_of(1)
    result[0]["PartitionValues"].should.equal(["2019", "01", "02", "03"])
    result[0]["ErrorDetail"]["ErrorCode"].should.equal("EntityNotFoundException")
def test_find_partitions_with_limit_no_hour_partition(self):
    """Partitioner.partitions_on_disk, limit_days set, on a table partitioned by day, should work"""
    self.s3.create_bucket(Bucket=self.bucket)
    db_input = self.helper.create_database_input()
    self.glue.create_database(**db_input)

    table_input = self.helper.create_table_input(
        location=f"s3://{self.bucket}/{self.table}/")
    table_input["TableInput"]["PartitionKeys"] = [
        {"Name": "year", "Type": "string"},
        {"Name": "month", "Type": "string"},
        {"Name": "day", "Type": "string"},
    ]
    self.glue.create_table(**table_input)

    today = pendulum.now()

    partitions = []
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")

        partition = Partition(
            [year, month, day],
            f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(limit_days=4)

    found_partitions.should.have.length_of(4)
    set(found_partitions).should.equal(set(partitions[0:4]))
def test_find_partitions_in_s3_with_hive_formatted_partitions(self):
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    partitions = []
    for i in range(1, 11):
        partition = Partition(
            ["2019", "01", f"{i:02d}", "03"],
            f"s3://{self.bucket}/{self.table}/year=2019/month=01/day={i:02d}/hour=03/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk()

    set(found_partitions).should.equal(set(partitions))
def test_find_partitions_single_key(self):
    """Partitioner.partitions_on_disk should work with single-key tables, in hive-format"""
    self.s3.create_bucket(Bucket=self.bucket)
    db_input = self.helper.create_database_input()
    self.glue.create_database(**db_input)

    table_input = self.helper.create_table_input()
    table_input["TableInput"]["PartitionKeys"] = [
        {"Name": "dt", "Type": "string"},
    ]
    self.glue.create_table(**table_input)

    # create initial partition
    prefix = table_input["TableInput"]["StorageDescriptor"]["Location"]
    location = f"{prefix}/dt=2019-01-02/"
    s3_key = f"{location}object.json"
    splits = s3_key[len("s3://"):].split("/", 1)
    bucket = splits[0]
    path = splits[1]
    self.s3.put_object(
        Body='{"foo": "bar"}',
        Bucket=bucket,
        Key=path,
    )
    partitions = [Partition(["2019-01-02"], location)]

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk()

    set(found_partitions).should.equal(set(partitions))
def test_create_partitions_limit_days(self):
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    today = pendulum.now()
    partitions = []
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        hour = "03"

        partition = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/{hour}/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    partitions.sort()

    expected_output = (
        f"Running Partitioner for {self.database}.{self.table}\n"
        f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
        f"\tFound 7 new partitions to create\n\t"
    )
    expected_output += ", ".join(map(str, partitions[3:]))

    out, err = self.get_cmd_output(
        cli,
        ["create-partitions", self.database, self.table, "--limit-days=7"])
    out.should.equal(expected_output)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found = partitioner.existing_partitions()
    found.should.have.length_of(7)
    set(found).should.equal(set(partitions[3:]))
def test_find_partitions_with_limit_days_and_prefix(self):
    """Partitioner.partitions_on_disk() with limit_days and prefix_partitions
    should find preceding partitions with hive-format names"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database()
    self.helper.make_table(partition_keys=[
        {"Name": "region", "Type": "string"},
        {"Name": "year", "Type": "int"},
        {"Name": "month", "Type": "int"},
        {"Name": "day", "Type": "int"},
    ])

    today = pendulum.now()

    partitions = []
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")

        partition_east = Partition(
            ["us-east-1", year, month, day],
            f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/")
        partition_west = Partition(
            ["us-west-2", year, month, day],
            f"s3://{self.bucket}/{self.table}/region=us-west-2/year={year}/month={month}/day={day}/")
        self.helper.write_partition_to_s3(partition_east)
        self.helper.write_partition_to_s3(partition_west)
        partitions.append(partition_east)
        partitions.append(partition_west)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(
        limit_days=4, prefix_partitions=["us-east-1"])

    found_partitions.should.have.length_of(4)

    to_be_found = []
    for i in range(1, 5):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")

        to_be_found.append(Partition(
            ["us-east-1", year, month, day],
            f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/"))

    set(found_partitions).should.equal(set(to_be_found))