Example #1
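The examples below all assume the same imports. PandasCursor is the event-stream parser from the s3filter project; its exact module path here is an assumption.

import pandas as pd
import pytest
from boto3.session import Session
from botocore.config import Config
from s3filter.sql.cursor import PandasCursor  # assumed module path within the s3filter project
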
def test_s3_select_b_filtered_from_small_multicolumn_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/small.multicolumn.9999.parquet',
                                        Expression='select a from s3Object where a < 5000', ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'Parquet': {}
                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

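    # Hand the S3 Select event stream to the project's PandasCursor,
    # which parses it into a sequence of DataFrame chunks.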
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert len(df) == 5000

    print()
    print("{} | {}".format(test_s3_select_from_small_parquet.__name__, cursor.bytes_scanned))
    print("{} | {}".format(test_s3_select_from_small_parquet.__name__, cursor.bytes_processed))
    print("{} | {}".format(test_s3_select_from_small_parquet.__name__, cursor.bytes_returned))
Example #2
def test_s3_select_from_csv():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.csv',
                                        Expression='select sum(cast(s_acctbal as float)) from s3Object',
                                        ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'CSV': {'FileHeaderInfo': 'Use', 'RecordDelimiter': '\n',
                                                    'FieldDelimiter': '|'}
                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert len(df) == 1

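    # PandasCursor labels output columns ordinally ('_0', '_1', ...) and yields
    # string values, hence the cast via pd.to_numeric.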
    assert pd.to_numeric(df.iloc[0]['_0']) == pytest.approx(45103548.64999)
Example #3
def test_filtered_s3_select_from_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                        Expression='select * from s3Object where cast(s_acctbal as float) > 500.0 ',
                                        ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'Parquet': {}
                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

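    # assert_supplier_table is a project-specific helper (not shown here);
    # use_ordinal_columns=True matches the '_0', '_1', ... labels above.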
    assert_supplier_table(df, 8642, use_ordinal_columns=True)
Example #4
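    # run() is a nested helper inside test_s3_select_from_large_parquet, which is
    # why the prints at the bottom reference that enclosing test's name.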
    def run():
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10)
        session = Session()
        s3 = session.client('s3', config=cfg)
        response = s3.select_object_content(
            Bucket='s3filter',
            Key='parquet/supplier.large.parquet',
            Expression='select sum(cast(s_suppkey as int)) from s3Object',
            ExpressionType='SQL',
            InputSerialization={
                'CompressionType': 'NONE',
                'Parquet': {}
            },
            OutputSerialization={'CSV': {}})
        df = None
        cursor = PandasCursor(None)
        cursor.event_stream = response['Payload']
        dfs = cursor.parse_event_stream()
        for partial_df in dfs:
            if df is None:
                df = partial_df
            else:
                df = pd.concat([df, partial_df])
        assert len(df) == 1
        # assert pd.to_numeric(df.iloc[0]['_0']) == pytest.approx(22551774325.00404)

        print("{} | {}".format(test_s3_select_from_large_parquet.__name__,
                               cursor.bytes_scanned))
        print("{} | {}".format(test_s3_select_from_large_parquet.__name__,
                               cursor.bytes_processed))
        print("{} | {}".format(test_s3_select_from_large_parquet.__name__,
                               cursor.bytes_returned))
Example #5
def test_projected_s3_select_from_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                        Expression='select s_suppkey from s3Object',
                                        ExpressionType='SQL',
                                        InputSerialization={
                                            'CompressionType': 'NONE',
                                            'Parquet': {}
                                        },
                                        OutputSerialization={
                                            'CSV': {}
                                        })

    df = None

    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    dfs = cursor.parse_event_stream()

    for partial_df in dfs:
        if df is None:
            df = partial_df
        else:
            df = pd.concat([df, partial_df])

    assert len(df) == 10000

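    # CSV output serialization yields string values, hence the comparison with '1'.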
    rows = df[df['_0'] == '1']

    assert len(rows) == 1

    row = rows.iloc[0]

    assert row['_0'] == "1"
Example #6
def test_projected_vs_all_s3_select_from_parquet():
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    response1 = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                         Expression='select s_suppkey from s3Object',
                                         ExpressionType='SQL',
                                         InputSerialization={
                                             'CompressionType': 'NONE',
                                             'Parquet': {}
                                         },
                                         OutputSerialization={
                                             'CSV': {}
                                         })

    df1 = None

    cursor1 = PandasCursor(None)
    cursor1.event_stream = response1['Payload']
    dfs1 = cursor1.parse_event_stream()

    for partial_df1 in dfs1:
        if df1 is None:
            df1 = partial_df1
        else:
            df1 = pd.concat([df1, partial_df1])

    response2 = s3.select_object_content(Bucket='s3filter', Key='parquet/supplier.parquet',
                                         Expression='select * from s3Object',
                                         ExpressionType='SQL',
                                         InputSerialization={
                                             'CompressionType': 'NONE',
                                             'Parquet': {}
                                         },
                                         OutputSerialization={
                                             'CSV': {}
                                         })

    df2 = None

    cursor2 = PandasCursor(None)
    cursor2.event_stream = response2['Payload']
    dfs2 = cursor2.parse_event_stream()

    for partial_df2 in dfs2:
        if df2 is None:
            df2 = partial_df2
        else:
            df2 = pd.concat([df2, partial_df2])

    assert len(df1) == 10000
    assert len(df2) == 10000

    rows1 = df1[df1['_0'] == '1']
    rows2 = df2[df2['_0'] == '1']

    assert len(rows1) == 1
    assert len(rows2) == 1

    row1 = rows1.iloc[0]
    row2 = rows2.iloc[0]

    assert row1['_0'] == "1"
    assert row2['_0'] == "1"

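    # Projecting one column should scan fewer bytes than 'select *',
    # since Parquet stores data column by column.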
    assert cursor2.bytes_scanned > cursor1.bytes_scanned
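
Every example above repeats the same client setup and chunk-concatenation loop, so the pattern could be factored out. A sketch under the same assumptions as the shared imports above (the select_to_df helper name is hypothetical):

def select_to_df(s3, bucket, key, expression, input_serialization):
    """Run S3 Select and gather the streamed chunks into a single DataFrame."""
    response = s3.select_object_content(
        Bucket=bucket, Key=key,
        Expression=expression, ExpressionType='SQL',
        InputSerialization=input_serialization,
        OutputSerialization={'CSV': {}})
    cursor = PandasCursor(None)
    cursor.event_stream = response['Payload']
    frames = list(cursor.parse_event_stream())
    df = pd.concat(frames, ignore_index=True) if frames else None
    # The cursor keeps bytes_scanned / bytes_processed / bytes_returned.
    return df, cursor

Example #3, for instance, would then reduce to:

df, cursor = select_to_df(s3, 's3filter', 'parquet/supplier.parquet',
                          'select * from s3Object where cast(s_acctbal as float) > 500.0',
                          {'CompressionType': 'NONE', 'Parquet': {}})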