def test_fetches_diff_none():
    """fetch_diff against an empty comparison dataset returns the whole input;
    reversed, it returns an empty frame.

    Fix: removed the unused local ``part_types`` (never referenced).
    """
    with get_s3_client() as s3_client:
        input_key = "clay/beads"
        input_bucket = "kiln"
        comparison_key = "new-case"
        comparison_bucket = "storefront"
        partitions = ["price"]
        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28]
        })

        s3_client.create_bucket(Bucket=input_bucket)
        # Comparison bucket exists but holds no data, so the diff is everything.
        s3_client.create_bucket(Bucket=comparison_bucket)
        setup_partitioned_parquet(
            dataframe=input_df,
            bucket=input_bucket,
            key=input_key,
            partition_data_types={"price": "float"},
            s3_client=s3_client)

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        # Sort both frames so row order does not affect the comparison.
        fetched_diff.sort_values(by=['price'], inplace=True)
        input_df.sort_values(by=['price'], inplace=True)
        sorted_dfs_equal_by_pandas_testing(fetched_diff, input_df)

        # Reverse direction: rows of the (empty) comparison set absent from
        # the input — must be empty.
        fetched_diff_reverse = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)
        assert fetched_diff_reverse.empty

        # Swapping input/comparison with reverse=True yields the input again.
        fetched_diff_reverse_both = fetch_parq.fetch_diff(
            input_bucket=comparison_bucket,
            input_key=comparison_key,
            comparison_bucket=input_bucket,
            comparison_key=input_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)
        sorted_dfs_equal_by_pandas_testing(fetched_diff_reverse_both, input_df)
def test_fetches_diff():
    """fetch_diff returns exactly the input partitions missing from the
    comparison dataset (here: "a" and "l").

    Fix: removed the unused local ``part_types`` (never referenced).
    """
    with get_s3_client() as s3_client:
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        comparison_key = "burger-inventory/buns"
        comparison_bucket = "backroom"
        partitions = ["exp-date"]

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9, 9],
            "price": [2.43, 1.23, 5.76, 3.28, 4.44],
            "exp-date": ["x", "z", "a", "zz", "l"]
        })
        comparison_df = pd.DataFrame({
            "count": [2, 3, 4, 9],
            "price": [2.43, 4.35, 1.23, 3.28],
            "exp-date": ["x", "y", "z", "zz"]
        })

        setup_partitioned_parquet(
            dataframe=input_df,
            bucket=input_bucket,
            key=input_key,
            partition_data_types={"exp-date": "string"},
            s3_client=s3_client)
        setup_partitioned_parquet(
            dataframe=comparison_df,
            bucket=comparison_bucket,
            key=comparison_key,
            partition_data_types={"exp-date": "string"},
            s3_client=s3_client)

        # Rows whose exp-date partition exists only in the input dataset.
        test_df = pd.DataFrame({
            "count": [7, 9],
            "price": [5.76, 4.44],
            "exp-date": ["a", "l"]
        })

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        assert fetched_diff.shape == test_df.shape
        sorted_dfs_equal_by_pandas_testing(fetched_diff, test_df)
def test_s3_parquet_to_dataframe():
    """A single unpartitioned parquet file round-trips through
    _s3_parquet_to_dataframe with all columns and rows intact."""
    with get_s3_client() as s3_client:
        columns = {
            "string_col": "string",
            "int_col": "integer",
            "float_col": "float",
            "bool_col": "boolean",
            "datetime_col": "datetime"
        }
        bucket = "foobucket"
        key = "fookey"

        df = setup_grouped_dataframe(count=10, columns=columns)
        # No partition columns, so everything lands in one parquet file.
        bucket, parquet_paths = setup_partitioned_parquet(
            dataframe=df,
            bucket=bucket,
            key=key,
            partition_data_types={},
            s3_client=s3_client)

        response = fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket,
            key=parquet_paths[0],
            partition_metadata={})

        assert isinstance(response, pd.DataFrame)
        for column_name in columns:
            assert column_name in response.columns
        assert response.shape == df.shape
        sorted_dfs_equal_by_pandas_testing(response, df)
def test_s3_partitioned_parquet_to_dataframe():
    """Reassembling every partitioned parquet file reproduces the original
    dataframe, with partition columns restored from the key path.

    Fix: ``DataFrame.append`` was deprecated in pandas 1.4 and removed in
    pandas 2.0 — accumulate the per-file frames and combine with
    ``pd.concat`` instead.
    """
    partition_types = {"string_col": "string",
                       "int_col": "integer",
                       "float_col": "float",
                       "bool_col": "boolean",
                       "datetime_col": "datetime"}
    # Every partition column plus one plain data column.
    columns = dict(partition_types)
    columns["metrics"] = "int"

    bucket = "foobucket"
    key = "fookey"
    df = setup_grouped_dataframe(count=10, columns=columns)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types=partition_types
    )

    first_published_file = parquet_paths[0]
    response = fetch_parq._s3_parquet_to_dataframe(
        bucket=bucket,
        key=first_published_file,
        partition_metadata=partition_types)

    assert isinstance(response, pd.DataFrame)
    for col in columns.keys():
        assert col in response.columns

    # Stitch all partition files back together into one frame.
    partial_frames = [
        fetch_parq._s3_parquet_to_dataframe(
            bucket=bucket,
            key=path,
            partition_metadata=partition_types)
        for path in parquet_paths
    ]
    full_response = pd.concat(partial_frames, ignore_index=True)

    assert full_response.shape == df.shape
    sorted_dfs_equal_by_pandas_testing(full_response, df)
def test_fetch_when_none():
    """A filter matching no partitions yields an empty dataframe that still
    carries the dataset's columns and dtypes.

    Fix: removed the unused local ``part_types`` (never referenced, and
    internally inconsistent with the "string" partition type used below).
    """
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"
    # Expected dtypes of the empty result frame.
    fetched_dtypes = pd.Series(["int64", "float64", "object"],
                               index=["count", "price", "exp-date"])

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9],
        "price": [2.43, 1.23, 5.76, 3.28],
        "exp-date": ["x", "z", "a", "zz"]
    })

    s3_client = boto3.client('s3')
    setup_partitioned_parquet(
        dataframe=input_df,
        bucket=input_bucket,
        key=input_key,
        partition_data_types={"exp-date": "string"},
        s3_client=s3_client)

    # Filter on a partition value that does not exist in the dataset.
    filters = [{
        "partition": "exp-date",
        "comparison": "==",
        "values": ["not-there"]
    }]
    fetched = fetch_parq.fetch(
        bucket=input_bucket,
        key=input_key,
        filters=filters,
        parallel=False)

    # Testing that DF is empty and has the expected columns+dtypes
    assert fetched.empty
    assert fetched.dtypes.equals(fetched_dtypes)
def test_get_data_types_from_s3():
    """_get_partitions_and_types recovers the partition name→type mapping
    from the key path of a published parquet file."""
    bucket, parquet_paths = setup_partitioned_parquet()

    s3_client = boto3.client('s3')
    listing = s3_client.list_objects_v2(Bucket=bucket)
    first_key = listing["Contents"][0]["Key"]

    partition_metadata = fetch_parq._get_partitions_and_types(
        first_key, bucket)

    expected = {
        "string_col": "string",
        "int_col": "integer",
        "float_col": "float",
        "bool_col": "boolean",
        "datetime_col": "datetime"
    }
    assert partition_metadata == expected
def test_gets_max_denies_text():
    """get_max_partition_value rejects non-ordinal partition types
    (string and bool) with a ValueError.

    Fix: dropped the useless ``fetched_max = ...`` assignments — inside
    ``pytest.raises`` the call is expected to raise, so the name was never
    bound to anything meaningful (flake8 F841).
    """
    key = "safekeyprefixname/safedatasetname"
    bucket = "safebucketname"
    part_types = {"string_col": "string", "bool_col": "bool"}
    # Partition columns plus one plain data column.
    col_types = dict(part_types)
    col_types["metrics"] = "int"

    df = setup_grouped_dataframe(count=10, columns=col_types)
    bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=bucket,
        key=key,
        partition_data_types=part_types)

    with pytest.raises(ValueError):
        fetch_parq.get_max_partition_value(
            bucket=bucket, key=key, partition="string_col")

    with pytest.raises(ValueError):
        fetch_parq.get_max_partition_value(
            bucket=bucket, key=key, partition="bool_col")
def test_gets_max():
    """get_max_partition_value returns the maximum value present in a
    numeric partition column."""
    dataset_key = "safekeyprefixname/safedatasetname"
    dataset_bucket = "safebucketname"
    column_types = {"int_col": "int", "float_col": "float"}

    df = setup_grouped_dataframe(count=10, columns=column_types)
    # Only int_col is published as a partition; float_col stays plain data.
    dataset_bucket, parquet_paths = setup_partitioned_parquet(
        dataframe=df,
        bucket=dataset_bucket,
        key=dataset_key,
        partition_data_types={"int_col": "int"})

    fetched_max = fetch_parq.get_max_partition_value(
        bucket=dataset_bucket, key=dataset_key, partition="int_col")

    # Test max of column is max of the fetched partition
    assert df["int_col"].max() == fetched_max