Example #1
def test_fetches_nons3parq():
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"

    input_df = pd.DataFrame({
        "count": [2, 4, 7, 9, 9],
        "price": [2.43, 1.23, 5.76, 3.28, 4.44],
        "exp-date": ["x", "z", "a", "zz", "l"]
    })

    s3_client = boto3.client('s3')

    s3_key = "burger-shipment/buns"

    setup_nons3parq_parquet(dataframe=input_df,
                            bucket=input_bucket,
                            key=input_key,
                            s3_client=s3_client)

    fetched_diff = fetch_parq.fetch(bucket=input_bucket,
                                    key=s3_key,
                                    parallel=False)

    assert fetched_diff.shape == input_df.shape
    sorted_dfs_equal_by_pandas_testing(fetched_diff, input_df)
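For context, a minimal sketch of what a helper like setup_nons3parq_parquet might do, assuming pandas with the pyarrow engine and a moto-backed client: it writes the frame as a single plain parquet object carrying none of the s3parq partition metadata.

import io

def setup_nons3parq_parquet(dataframe, bucket, key, s3_client):
    # hypothetical helper: upload the frame as one plain parquet object
    # with no s3parq metadata attached
    s3_client.create_bucket(Bucket=bucket)
    buffer = io.BytesIO()
    dataframe.to_parquet(buffer, index=False)  # pyarrow engine assumed
    s3_client.put_object(Bucket=bucket,
                         Key=f"{key}/data.parquet",
                         Body=buffer.getvalue())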
Example #2
    def fetch(self,
              bucket: str,
              key: str,
              **kwargs
              ) -> pd.DataFrame:

        return fetch(key=key,
                     bucket=bucket,
                     filters=kwargs.get('partitions', dict())
                     )
Example #3
    def test_invalid_filters(self):
        inv_fil_params = {
            "bucket": "fake-bucket",
            "key": "fake-key",
            "filters": [{
                "comparison": "==",
                "values": ["fake-value"]
            }]
        }
        with pytest.raises(ValueError):
            fetch_parq.fetch(**inv_fil_params)

        inv_fil_params["filters"] = [{
            "partition": "fake-part",
            "comparison": "&&",
            "value": "some-string"
        }]

        with pytest.raises(ValueError):
            fetch_parq.fetch(**inv_fil_params)
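For contrast, a well-formed filter (the shape the fetch_when_none tests below use) names an existing partition, a supported comparison operator, and a list under "values":

# a well-formed filter, for contrast (the value is illustrative):
valid_filters = [{
    "partition": "exp-date",
    "comparison": "==",
    "values": ["some-date"]
}]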
Example #4
    def fetch(self,
              bucket: str,
              key: str,
              **kwargs
              ) -> pd.DataFrame:

        return fetch(key=key,
                     bucket=bucket,
                     filters=kwargs.get('partitions', dict()),
                     parallel=kwargs.get('parallel', True)
                     )
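A hypothetical call against this wrapper (the owning class is not shown in the example); note that the partitions kwarg is forwarded to fetch as filters, and parallel defaults to True:

# `client` is assumed to be an instance of the (unshown) owning class
frame = client.fetch(bucket="loadingdock",
                     key="burger-shipment/buns",
                     partitions=[{"partition": "exp-date",
                                  "comparison": "==",
                                  "values": ["x"]}],
                     parallel=False)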
Example #5
def test_via_public_interface():
    s3_client = boto3.client('s3')
    bucket_name = 'another-bucket'
    key = 'testing/is/fun/dataset-name'
    s3_client.create_bucket(Bucket=bucket_name)

    publish(bucket=bucket_name,
            key=key,
            dataframe=df.dataframe,
            partitions=['datetime_options'])

    # moto breaks when fetch runs in parallel, so force parallel=False here;
    # the parallel path needs a real boto call to test
    result = fetch(bucket=bucket_name, key=key, parallel=False)
    assert result.shape == df.dataframe.shape
    assert df_equal_by_set(result, df.dataframe, df.dataframe.columns.tolist())
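These tests run against a mocked S3. A minimal sketch of the moto setup they assume (mock_s3 is moto's pre-5.x decorator; moto 5 renamed it mock_aws):

from moto import mock_s3

@mock_s3
def test_with_mocked_s3():
    # every boto3 call inside hits moto's in-memory S3, not AWS
    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket='another-bucket')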
Example #6
def test_fetches_nons3parq_large_parquet():
    input_key = "burger-shipment/buns"
    input_bucket = "loadingdock"

    df = dfmock.DFMock(count=100000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }

    df.generate_dataframe()
    # unfortunately large, but a smaller frame doesn't force the data to
    # split across partition files
    df.grow_dataframe_to_size(500)

    input_df = pd.DataFrame(df.dataframe)

    s3_client = boto3.client('s3')

    s3_key = "burger-shipment/buns"

    setup_nons3parq_parquet(dataframe=input_df,
                            bucket=input_bucket,
                            key=input_key,
                            s3_client=s3_client)

    fetched_diff = fetch_parq.fetch(bucket=input_bucket,
                                    key=s3_key,
                                    parallel=False)

    assert fetched_diff.shape == input_df.shape
    sorted_dfs_equal_by_pandas_testing(fetched_diff, input_df)
Example #7
def test_end_to_end():
    df = dfmock.DFMock(count=100000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }

    df.generate_dataframe()
    # unfortunately large, but a smaller frame doesn't force the data to
    # split across partition files
    df.grow_dataframe_to_size(500)

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    old_df = pd.DataFrame(df.dataframe)
    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=old_df,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == old_df.shape
    assert df_equal_by_set(fetched_df, old_df, old_df.columns)
    sorted_dfs_equal_by_pandas_testing(fetched_df, old_df)
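df_equal_by_set is a test helper that isn't shown here; a plausible sketch, assuming it treats two frames as equal when their rows form the same multiset over the given columns, regardless of order:

def df_equal_by_set(df1, df2, cols):
    # hypothetical implementation: compare sorted row tuples so that
    # partition-induced reordering doesn't fail the comparison
    left = sorted(map(tuple, df1[list(cols)].values.tolist()))
    right = sorted(map(tuple, df2[list(cols)].values.tolist()))
    return left == right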
Example #8
def test_end_to_end():
    df = dfmock.DFMock(count=1000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }
    df.generate_dataframe()
    df.grow_dataframe_to_size(250)

    s3_client = boto3.client('s3')

    bucket_name = 'thistestbucket'
    key = 'thisdataset'

    s3_client.create_bucket(Bucket=bucket_name)

    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=df.dataframe,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == df.dataframe.shape
    # compare contents irrespective of row order (fetch may reorder rows
    # across partitions); a bare eq() call asserted nothing
    sort_cols = df.dataframe.columns.tolist()
    pd.testing.assert_frame_equal(
        fetched_df[sort_cols].sort_values(sort_cols).reset_index(drop=True),
        df.dataframe[sort_cols].sort_values(sort_cols).reset_index(drop=True))
Example #9
def test_end_to_end():
    # make a sample DF for all the tests
    df = dfmock.DFMock(count=10000)
    df.columns = {
        "string_options": {
            "option_count": 4,
            "option_type": "string"
        },
        "int_options": {
            "option_count": 4,
            "option_type": "int"
        },
        "datetime_options": {
            "option_count": 5,
            "option_type": "datetime"
        },
        "float_options": {
            "option_count": 2,
            "option_type": "float"
        },
        "metrics": "integer"
    }
    df.generate_dataframe()

    s3_client = boto3.client('s3')
    bucket_name = 'thistestbucket'
    key = 'thisdataset'
    s3_client.create_bucket(Bucket=bucket_name)

    old_df = pd.DataFrame(df.dataframe)

    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=old_df,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    fetched_df = fetch(bucket=bucket_name, key=key, parallel=False)

    assert fetched_df.shape == old_df.shape
    assert df_equal_by_set(fetched_df, old_df, old_df.columns)
    sorted_dfs_equal_by_pandas_testing(fetched_df, old_df)
Example #10
def test_end_to_end():
    s3_client = boto3.client('s3')

    bucket_name = 'thistestbucket'
    key = 'thisdataset'

    s3_client.create_bucket(Bucket=bucket_name)

    # pub it
    publish(bucket=bucket_name,
            key=key,
            dataframe=df.dataframe,
            partitions=['string_options', 'datetime_options', 'float_options'])

    # go get it
    dataframe = fetch(bucket=bucket_name, key=key, parallel=False)

    assert dataframe.shape == df.dataframe.shape
    # compare contents irrespective of row order; a bare eq() call
    # asserted nothing
    sort_cols = df.dataframe.columns.tolist()
    pd.testing.assert_frame_equal(
        dataframe[sort_cols].sort_values(sort_cols).reset_index(drop=True),
        df.dataframe[sort_cols].sort_values(sort_cols).reset_index(drop=True))
Example #11
    def test_fetch_when_none(self):
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        partitions = ["exp-date"]

        part_types = {"count": "int", "price": "float", "exp-date": "str"}

        fetched_dtypes = pd.Series(["int64", "float64", "object"],
                                   index=["count", "price", "exp-date"])

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28],
            "exp-date": ["x", "z", "a", "zz"]
        })

        s3_client = boto3.client('s3')
        s3_client.create_bucket(Bucket=input_bucket)

        published_files = publish(bucket=input_bucket,
                                  key=input_key,
                                  dataframe=input_df,
                                  partitions=partitions)

        filters = [{
            "partition": "exp-date",
            "comparison": "==",
            "values": ["not-there"]
        }]

        fetched = fetch_parq.fetch(bucket=input_bucket,
                                   key=input_key,
                                   filters=filters,
                                   parallel=False)

        # Testing that DF is empty and has the expected columns+dtypes
        assert fetched.empty
        assert fetched.dtypes.equals(fetched_dtypes)
Example #12
def test_not_fetches_nons3parq():
    with get_s3_client() as s3_client:
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9, 9],
            "price": [2.43, 1.23, 5.76, 3.28, 4.44],
            "exp-date": ["x", "z", "a", "zz", "l"]
        })

        s3_key = "burger-shipment/buns"

        setup_nons3parq_parquet(dataframe=input_df,
                                bucket=input_bucket,
                                key=input_key,
                                s3_client=s3_client)

        with pytest.raises(MissingS3ParqMetadata):
            fetch_parq.fetch(bucket=input_bucket,
                             key=s3_key,
                             parallel=False,
                             accept_not_s3parq=False)
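For contrast (and as Example #1 above relies on), leaving accept_not_s3parq at what appears to be a True default lets the same prefix fetch cleanly; a sketch reusing the setup above:

fetched = fetch_parq.fetch(bucket=input_bucket,
                           key=s3_key,
                           parallel=False,
                           accept_not_s3parq=True)
assert fetched.shape == input_df.shape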
Example #13
def test_fetch_when_none():
    with get_s3_client() as s3_client:
        input_key = "burger-shipment/buns"
        input_bucket = "loadingdock"
        partitions = ["exp-date"]

        part_types = {"count": "int", "price": "float", "exp-date": "str"}

        fetched_dtypes = pd.Series(["int64", "float64", "object"],
                                   index=["count", "price", "exp-date"])

        input_df = pd.DataFrame({
            "count": [2, 4, 7, 9],
            "price": [2.43, 1.23, 5.76, 3.28],
            "exp-date": ["x", "z", "a", "zz"]
        })

        setup_partitioned_parquet(dataframe=input_df,
                                  bucket=input_bucket,
                                  key=input_key,
                                  partition_data_types={"exp-date": "string"},
                                  s3_client=s3_client)

        filters = [{
            "partition": "exp-date",
            "comparison": "==",
            "values": ["not-there"]
        }]

        fetched = fetch_parq.fetch(bucket=input_bucket,
                                   key=input_key,
                                   filters=filters,
                                   parallel=False)

        # Testing that DF is empty and has the expected columns+dtypes
        assert fetched.empty
        assert fetched.dtypes.equals(fetched_dtypes)
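As a final contrast sketch, same setup assumed: a filter matching one of the published partition values should return only those rows.

matching = fetch_parq.fetch(bucket=input_bucket,
                            key=input_key,
                            filters=[{"partition": "exp-date",
                                      "comparison": "==",
                                      "values": ["x"]}],
                            parallel=False)
assert matching["exp-date"].eq("x").all()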