예제 #1
0
def test_fetches_nons3parq_large_parquet():
    """Check that ``fetch_parq.fetch`` hands back the frame given to the setup helper.

    A mock dataframe is generated and grown, passed to
    ``setup_nons3parq_parquet`` together with a bucket/key pair, and then
    fetched (non-parallel) from that same bucket/key. The fetched frame must
    match the input in shape and pass ``sorted_dfs_equal_by_pandas_testing``.
    """
    dataset_key = "burger-shipment/buns"
    bucket = "loadingdock"

    mock = dfmock.DFMock(count=100000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()

    # The frame is intentionally large; per the original author, a partition
    # is not forced with anything smaller.
    mock.grow_dataframe_to_size(500)

    expected = pd.DataFrame(mock.dataframe)
    client = boto3.client('s3')

    setup_nons3parq_parquet(
        dataframe=expected,
        bucket=bucket,
        key=dataset_key,
        s3_client=client,
    )

    # Fetch from the very same key the setup helper was given.
    actual = fetch_parq.fetch(bucket=bucket, key=dataset_key, parallel=False)

    assert actual.shape == expected.shape
    sorted_dfs_equal_by_pandas_testing(actual, expected)
예제 #2
0
def test_end_to_end():
    """Publish a large mock dataframe and verify that fetching returns it.

    The frame is published to a freshly created bucket with three partition
    columns, fetched back without parallelism, and compared against the
    original by shape, by ``df_equal_by_set`` and by
    ``sorted_dfs_equal_by_pandas_testing``.
    """
    mock = dfmock.DFMock(count=100000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()

    # The frame is intentionally large; per the original author, a partition
    # is not forced with anything smaller.
    mock.grow_dataframe_to_size(500)

    client = boto3.client('s3')
    bucket = 'thistestbucket'
    dataset_key = 'thisdataset'
    client.create_bucket(Bucket=bucket)

    expected = pd.DataFrame(mock.dataframe)

    # Write the dataset ...
    partition_columns = ['string_options', 'datetime_options', 'float_options']
    publish(
        bucket=bucket,
        key=dataset_key,
        dataframe=expected,
        partitions=partition_columns,
    )

    # ... and read it straight back.
    actual = fetch(bucket=bucket, key=dataset_key, parallel=False)

    assert actual.shape == expected.shape
    assert df_equal_by_set(actual, expected, expected.columns)
    sorted_dfs_equal_by_pandas_testing(actual, expected)
예제 #3
0
def test_end_to_end():
    """Publish a mock dataframe, fetch it back and check the shape matches.

    The frame is published to a freshly created bucket with three partition
    columns and fetched back without parallelism. Only the shape comparison
    is actually asserted.
    """
    mock = dfmock.DFMock(count=1000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()
    mock.grow_dataframe_to_size(250)

    client = boto3.client('s3')

    bucket = 'thistestbucket'
    dataset_key = 'thisdataset'

    client.create_bucket(Bucket=bucket)

    # Write the dataset ...
    partition_columns = ['string_options', 'datetime_options', 'float_options']
    publish(
        bucket=bucket,
        key=dataset_key,
        dataframe=mock.dataframe,
        partitions=partition_columns,
    )

    # ... and read it straight back.
    fetched = fetch(bucket=bucket, key=dataset_key, parallel=False)

    assert fetched.shape == mock.dataframe.shape
    # NOTE(review): the results of the two calls below are discarded, so the
    # cell values of the fetched frame are never asserted - confirm whether a
    # real content comparison was intended here.
    pd.DataFrame.eq(fetched, mock.dataframe)
    fetched.head()
예제 #4
0
def test_end_to_end():
    """Publish a mock dataframe and verify that fetching returns it.

    The frame is published to a freshly created bucket with three partition
    columns, fetched back without parallelism, and compared against the
    original by shape, by ``df_equal_by_set`` and by
    ``sorted_dfs_equal_by_pandas_testing``.
    """
    # Sample data used by every step of this test.
    mock = dfmock.DFMock(count=10000)
    mock.columns = {
        "string_options": {"option_count": 4, "option_type": "string"},
        "int_options": {"option_count": 4, "option_type": "int"},
        "datetime_options": {"option_count": 5, "option_type": "datetime"},
        "float_options": {"option_count": 2, "option_type": "float"},
        "metrics": "integer",
    }
    mock.generate_dataframe()

    client = boto3.client('s3')
    bucket = 'thistestbucket'
    dataset_key = 'thisdataset'
    client.create_bucket(Bucket=bucket)

    expected = pd.DataFrame(mock.dataframe)

    # Write the dataset ...
    partition_columns = ['string_options', 'datetime_options', 'float_options']
    publish(
        bucket=bucket,
        key=dataset_key,
        dataframe=expected,
        partitions=partition_columns,
    )

    # ... and read it straight back.
    actual = fetch(bucket=bucket, key=dataset_key, parallel=False)

    assert actual.shape == expected.shape
    assert df_equal_by_set(actual, expected, expected.columns)
    sorted_dfs_equal_by_pandas_testing(actual, expected)
예제 #5
0
import boto3
import moto
import s3parq
import pytest
import dfmock
from s3parq.testing_helper import df_equal_by_set
from s3parq.publish_parq import publish
from s3parq.fetch_parq import fetch
import pandas as pd

# Module-level sample frame definition shared by the tests: a 10000-row
# DFMock with four option-style columns and one plain "integer" column.
df = dfmock.DFMock(count=10000)
df.columns = {
    "string_options": {"option_count": 4, "option_type": "string"},
    "int_options": {"option_count": 4, "option_type": "int"},
    "datetime_options": {"option_count": 5, "option_type": "datetime"},
    "float_options": {"option_count": 2, "option_type": "float"},
    "metrics": "integer",
}