Пример #1
0
def test_fetches_diff_none():
    """Check fetch_diff when nothing has been written to the comparison key.

    Only the input bucket/key receives data; the comparison bucket is created
    but left without a dataset. The test asserts three things:

    * the forward diff returns every input row,
    * the reverse diff is empty,
    * swapping input and comparison while also passing ``reverse=True``
      returns every input row again.
    """
    with get_s3_client() as s3_client:
        input_key = "clay/beads"
        input_bucket = "kiln"
        comparison_key = "new-case"
        comparison_bucket = "storefront"
        partitions = ["price"]

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28]
        })

        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)

        # Only the input side is populated; nothing is written under
        # comparison_key.
        setup_partitioned_parquet(dataframe=input_df,
                                  bucket=input_bucket,
                                  key=input_key,
                                  partition_data_types={"price": "float"},
                                  s3_client=s3_client)

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        # Put both frames in the same row order before comparing them.
        fetched_diff.sort_values(by=['price'], inplace=True)
        input_df.sort_values(by=['price'], inplace=True)

        sorted_dfs_equal_by_pandas_testing(fetched_diff, input_df)

        fetched_diff_reverse = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)

        assert fetched_diff_reverse.empty

        # Swap the roles of the two locations and reverse the direction; the
        # expected result is the full input data again.
        fetched_diff_reverse_both = fetch_parq.fetch_diff(
            input_bucket=comparison_bucket,
            input_key=comparison_key,
            comparison_bucket=input_bucket,
            comparison_key=input_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)

        sorted_dfs_equal_by_pandas_testing(fetched_diff_reverse_both, input_df)
Пример #2
0
    def test_fetches_diff_none(self):
        """Check fetch_diff when nothing has been published to the comparison key.

        Data is published only to the input bucket/key; the comparison bucket
        is created but receives no dataset. The test asserts that the forward
        diff is empty and that the ``reverse=True`` diff carries the same
        ``count`` and ``price`` columns as the published dataframe.
        """
        input_key = "clay/beads"
        input_bucket = "kiln"
        comparison_key = "new-case"
        comparison_bucket = "storefront"
        partitions = ["price"]

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28]
        })

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)

        # Called for its side effect only; the return value is not needed.
        publish(bucket=input_bucket,
                key=input_key,
                dataframe=input_df,
                partitions=partitions)

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        assert fetched_diff.empty

        fetched_diff_reverse = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            reverse=True,
            parallel=False)

        # Series.equals is sensitive to row order and index labels, so both
        # frames are ordered by index before the column-wise comparison.
        fetched_diff_reverse.sort_index(inplace=True)
        input_df.sort_index(inplace=True)

        assert fetched_diff_reverse['count'].equals(input_df['count'])
        assert fetched_diff_reverse['price'].equals(input_df['price'])
Пример #3
0
    def test_fetches_diff(self):
        """Check fetch_diff between two published datasets that partly overlap.

        Both datasets are partitioned on ``exp-date``. The input dataset has
        the partition value ``"a"`` which the comparison dataset lacks, and
        the test asserts that the first row of the fetched diff equals that
        single expected row.
        """
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        comparison_key = "burger-inventory/buns"
        comparison_bucket = "backroom"
        partitions = ["exp-date"]

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28],
            "exp-date": ["x", "z", "a", "zz"]
        })
        comparison_df = pd.DataFrame({
            "count": [2, 3, 4, 9],
            "price": [2.43, 4.35, 1.23, 3.28],
            "exp-date": ["x", "y", "z", "zz"]
        })

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=input_bucket)
        s3_client.create_bucket(Bucket=comparison_bucket)

        # Both publish calls are made for their side effect only; the return
        # values are not needed.
        publish(bucket=input_bucket,
                key=input_key,
                dataframe=input_df,
                partitions=partitions)

        publish(bucket=comparison_bucket,
                key=comparison_key,
                dataframe=comparison_df,
                partitions=partitions)

        # The only input row whose exp-date is absent from comparison_df.
        test_df = pd.DataFrame({
            "count": [7],
            "price": [5.76],
            "exp-date": ["a"]
        })

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        # The expected diff is a single row, so that row is compared as a
        # Series instead of calling DataFrame.equals on the whole frames
        # (the original author found the whole-frame comparison unreliable
        # for this data).
        assert fetched_diff.iloc[0].equals(test_df.iloc[0])
Пример #4
0
def test_fetches_diff():
    """Check fetch_diff between two partitioned datasets that partly overlap.

    Both datasets are partitioned on ``exp-date``. The input dataset has the
    partition values ``"a"`` and ``"l"`` which the comparison dataset lacks,
    and the test asserts that the fetched diff consists of exactly those two
    rows.
    """
    with get_s3_client() as s3_client:
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        comparison_key = "burger-inventory/buns"
        comparison_bucket = "backroom"
        partitions = ["exp-date"]

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9, 9],
            "price": [2.43, 1.23, 5.76, 3.28, 4.44],
            "exp-date": ["x", "z", "a", "zz", "l"]
        })
        comparison_df = pd.DataFrame({
            "count": [2, 3, 4, 9],
            "price": [2.43, 4.35, 1.23, 3.28],
            "exp-date": ["x", "y", "z", "zz"]
        })

        setup_partitioned_parquet(dataframe=input_df,
                                  bucket=input_bucket,
                                  key=input_key,
                                  partition_data_types={"exp-date": "string"},
                                  s3_client=s3_client)

        setup_partitioned_parquet(dataframe=comparison_df,
                                  bucket=comparison_bucket,
                                  key=comparison_key,
                                  partition_data_types={"exp-date": "string"},
                                  s3_client=s3_client)

        # The input rows whose exp-date is absent from comparison_df.
        test_df = pd.DataFrame({
            "count": [7, 9],
            "price": [5.76, 4.44],
            "exp-date": ["a", "l"]
        })

        fetched_diff = fetch_parq.fetch_diff(
            input_bucket=input_bucket,
            input_key=input_key,
            comparison_bucket=comparison_bucket,
            comparison_key=comparison_key,
            partition=partitions[0],
            parallel=False)

        # Check the shape first so a row/column count mismatch is reported
        # directly, before the content comparison.
        assert fetched_diff.shape == test_df.shape
        sorted_dfs_equal_by_pandas_testing(fetched_diff, test_df)
Пример #5
0
 def fetch_diff(
     self,
     input_bucket: str,
     input_key: str,
     comparison_bucket: str,
     comparison_key: str,
     partition: str,
     parallel: bool = True,
 ) -> pd.DataFrame:
     """Delegate to the module-level ``fetch_diff`` and return its result.

     Every argument is forwarded unchanged, by keyword, to the function of
     the same name that is in scope outside this class; ``self`` is not
     used.

     Args:
         input_bucket: Forwarded as ``input_bucket``.
         input_key: Forwarded as ``input_key``.
         comparison_bucket: Forwarded as ``comparison_bucket``.
         comparison_key: Forwarded as ``comparison_key``.
         partition: Forwarded as ``partition``.
         parallel: Forwarded as ``parallel``; defaults to True.

     Returns:
         Whatever the module-level ``fetch_diff`` returns (annotated as a
         pandas DataFrame).
     """
     # Inside a method body the bare name resolves to the enclosing module's
     # function, not to this method, so this is not a recursive call.
     return fetch_diff(
         input_bucket=input_bucket,
         input_key=input_key,
         comparison_bucket=comparison_bucket,
         comparison_key=comparison_key,
         partition=partition,
         parallel=parallel,
     )