Example #1
def test_non_existent_key():
    """Executes a select against a non existent key.

    :return: None
    """

    cur = PandasCursor(boto3.client('s3'))\
        .select('does-not-exist.csv', 'select * from S3Object')

    try:
        with pytest.raises(Exception):
            cur.execute()
    finally:
        cur.close()
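The snippets in this section are shown without their imports. A header along these lines would be needed to run them; the standard-library and third-party imports are real modules, while the commented project-local paths are placeholders, since the examples do not show where the cursor and message classes live:

import cProfile
import io  # io.StringIO stands in for Python 2's StringIO.StringIO
import logging
import pstats
import sys
import timeit

import boto3
import pandas as pd
import pytest
from boto3.session import Session
from botocore.config import Config

# Placeholders -- adjust to the actual module paths in the codebase:
# from yourproject.cursor import Cursor, NativeCursor, PandasCursor
# from yourproject.message import DataFrameMessage, Tuple, TupleMessage
# from yourproject.util import Format, Timer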
Example #2
def test_non_empty_results():
    """Executes a select where results are returned.

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3'))\
        .select('region.csv', 'select * from S3Object')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 5
    finally:
        cur.close()
Example #3
def test_select_projected_filtered_topk_with_cursor():
    """Executes a projected, filtered, top-k select against a Parquet object.

    :return: None
    """

    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10,
                 s3={'payload_signing_enabled': False})
    session = Session()
    s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    num_rows = 0

    cur = PandasCursor(s3) \
        .parquet() \
        .select('parquet/small.multicolumn.9999.parquet', 'select b from s3Object where a > 5000 limit 100')

    try:
        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)
            print("{}:{}".format(num_rows, df))

        assert num_rows == 100
    finally:
        cur.close()
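The fluent .parquet() call above switches the cursor's input serialization to Parquet before select() issues the query. Judging from Example #10 below, which chooses between .csv() and .parquet() based on a format flag, CSV appears to be the default when no toggle is called.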
Example #4
def test_aggregate():
    """Executes a select with an aggregate.

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('region.csv', 'select count(*) from S3Object')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                assert r._0 == '5'
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 1
    finally:
        cur.close()
Example #5
def test_where_predicate():
    """Executes a select with a where clause on one of the attributes.

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3'))\
        .select('region.csv', 'select * from S3Object where r_name = \'AMERICA\';')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                assert r._1 == 'AMERICA'
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 1
    finally:
        cur.close()
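A note on the positional attribute names in the last two tests: the CSVs are evidently read without treating the first row as a header, so the cursor labels columns _0, _1, ... in file order. r._1 is thus the second column of region.csv (the region name), and the count(*) in Example #4 arrives as a single column _0 holding the string '5', not an integer.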
Example #6
def test_large_results():
    """Executes a select where a large number of records are expected

    :return: None
    """

    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:

        # pr = cProfile.Profile()
        # pr.enable()

        start = timeit.default_timer()

        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                # print("{}:{}".format(num_rows, r))
        end = timeit.default_timer()

        elapsed = end - start
        print('Elapsed {}'.format(elapsed))

        # pr.disable()
        # s = StringIO.StringIO()
        # sortby = 'cumulative'
        # ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        # ps.print_stats()
        # print (s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()
Example #7
    def execute_pandas_query(op):
        # Run the operator's S3 Select via a PandasCursor, emit the column
        # names once as a TupleMessage, then stream each DataFrame chunk to
        # the consumers while tracking byte and row metrics.
        cur = PandasCursor(op.s3).select(op.s3key, op.s3sql)
        dfs = cur.execute()
        op.op_metrics.query_bytes = cur.query_bytes
        op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()
        first_tuple = True
        for df in dfs:

            assert (len(df) > 0)

            if first_tuple:
                assert (len(df.columns.values) > 0)
                op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                first_tuple = False

                if op.log_enabled:
                    print("{}('{}') | Sending field names: {}".format(
                        op.__class__.__name__, op.name, df.columns.values))

            op.op_metrics.rows_returned += len(df)

            op.send(df, op.consumers)
        return cur
Example #8
def measure_R_scan():
    # Runs the same filtered scan three times and reports per-trial latency
    # plus the aggregate scan rate (bytes scanned per second).
    cur = PandasCursor(boto3.client('s3')) \
        .select('access_method_benchmark/10-shards-10GB/data_0.csv',
                'select * from S3Object where cast(F0 AS float) < 0.01; ')

    end_times = []
    start = timeit.default_timer()
    for i in range(3):
        dfs = cur.execute()
        for df in dfs:
            pass
        end_times.append(timeit.default_timer())

    end = timeit.default_timer()

    for n in range(3):
        print("trial {} takes {}".format(
            n, end_times[n] - start if n == 0 else end_times[n] -
            end_times[n - 1]))
    print("{} bytes scanned".format(cur.bytes_scanned))
    print("time = {}".format(end - start))
    print("R_scan = {}".format(1.0 * cur.bytes_scanned / (end - start)))

    cur.close()
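R_scan here is scan throughput: cur.bytes_scanned divided by the wall-clock time of the three trials, i.e. R_scan = bytes_scanned / (end - start) in bytes per second. Whether bytes_scanned accumulates across the three execute() calls is an implementation detail of the cursor that this snippet does not show.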
Example #9
def test_no_ssl():
    """Executes a select where results are returned.

    :return: None
    """

    num_rows = 0

    # Boto clients are not thread-safe, so each scan op needs its own client
    cfg = Config(region_name="us-east-1",
                 parameter_validation=False,
                 max_pool_connections=10,
                 s3={'payload_signing_enabled': False})
    session = Session()
    s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    cur = PandasCursor(s3)\
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()

        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)

        pr.disable()
        s = io.StringIO()  # was StringIO.StringIO() under Python 2
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()

    num_rows = 0

    # Boto clients are not thread-safe, so each scan op needs its own client
    cfg = Config(region_name="us-east-1",
                 parameter_validation=False,
                 max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    cur = PandasCursor(s3) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()

        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)

        pr.disable()
        s = io.StringIO()  # was StringIO.StringIO() under Python 2
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()
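The two runs above differ only in transport: the first disables SSL, certificate verification, and S3 payload signing (use_ssl=False, verify=False, s3={'payload_signing_enabled': False}), while the second uses the boto3 defaults, so the two cProfile dumps give a side-by-side view of the transport overhead.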
Example #10
    def execute_pandas_query(op):
        # Dispatches on op.use_native and op.format_: the native-cursor path
        # is stubbed out below; the boto3 path picks a CSV or Parquet cursor
        # and streams DataFrame chunks to the operator's consumers.

        if op.use_native:

            closure = {'first_tuple': True}

            def on_numpy_array(np_array):

                df = pd.DataFrame(np_array)

                if closure['first_tuple']:
                    assert (len(df.columns.values) > 0)
                    op.send(TupleMessage(Tuple(df.columns.values)),
                            op.consumers)
                    closure['first_tuple'] = False

                    if op.log_enabled:
                        print("{}('{}') | Sending field names: {}".format(
                            op.__class__.__name__, op.name, df.columns.values))

                op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()
                op.op_metrics.rows_returned += len(df)

                if op.log_enabled:
                    print("{}('{}') | Sending field values:".format(
                        op.__class__.__name__, op.name))
                    print(df)

                op.send(df, op.consumers)

            # cur = NativeCursor(op.fast_s3).select(op.s3key, op.s3sql)
            # cur.execute(on_numpy_array)
            #
            # op.op_metrics.query_bytes = cur.query_bytes
            #
            # return cur
        else:
            if op.format_ is Format.CSV:
                cur = PandasCursor(op.s3).csv().select(op.s3key, op.s3sql)
            elif op.format_ is Format.PARQUET:
                cur = PandasCursor(op.s3).parquet().select(op.s3key, op.s3sql)
            else:
                raise Exception("Unrecognized format {}".format(op.format_))

            dfs = cur.execute()
            op.op_metrics.query_bytes = cur.query_bytes
            op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()
            first_tuple = True

            counter = 0

            buffer_ = pd.DataFrame()
            #print("DataFrames: ")
            #print(dfs)
            for df in dfs:
                if op.fn:
                    df = op.fn(df)

                if first_tuple:
                    # assert (len(df.columns.values) > 0)
                    # op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                    first_tuple = False

                    if op.log_enabled:
                        print("{}('{}') | Sending field names: {}".format(
                            op.__class__.__name__, op.name, df.columns.values))

                op.op_metrics.rows_returned += len(df)

                # Apply filter if there is one
                # if op.filter_fn is not None:
                #    df = df[op.filter_fn(df)]

                # if op.log_enabled:
                #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                #         print("{}('{}') | Sending field values: \n{}".format(op.__class__.__name__, op.name, df))

                counter += 1
                if op.log_enabled:
                    sys.stdout.write('.')
                    if counter % 100 == 0:
                        print("Rows {}".format(op.op_metrics.rows_returned))

                op.send(DataFrameMessage(df), op.consumers)

                # buffer_ = pd.concat([buffer_, df], axis=0, sort=False, ignore_index=True, copy=False)
                # if len(buffer_) >= 8192:
                #    op.send(buffer_, op.consumers)
                #    buffer_ = pd.DataFrame()

            if len(buffer_) > 0:
                op.send(buffer_, op.consumers)
                del buffer_

            return cur
Example #11
def run(use_pandas, secure, use_native):
    print("Cursor | Settings {}".format({
        'use_pandas': use_pandas,
        'secure': secure,
        'use_native': use_native
    }))

    key = 'sf1000-lineitem/lineitem_27.csv'
    sql = 'select * from S3Object limit 10000'

    boto3.set_stream_logger('', logging.DEBUG)

    if secure:
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10)
        session = Session()
        s3 = session.client('s3', config=cfg)
    else:
        cfg = Config(region_name="us-east-1",
                     parameter_validation=False,
                     max_pool_connections=10,
                     s3={'payload_signing_enabled': False})
        session = Session()
        s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    if use_pandas and not use_native:

        cur = PandasCursor(s3).select(key, sql)
        dfs = cur.execute()

        rows_returned = 0
        for df in dfs:
            rows_returned += len(df)

        print("{}".format({
            'elapsed_time':
            round(cur.timer.elapsed(), 5),
            'rows_returned':
            rows_returned,
            'query_bytes':
            cur.query_bytes,
            'bytes_scanned':
            cur.bytes_scanned,
            'bytes_processed':
            cur.bytes_processed,
            'bytes_returned':
            "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec':
            "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() / 1000000,
                    5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() /
                    1000000000, 5)),
            'time_to_first_record_response':
            None if cur.time_to_first_record_response is None else round(
                cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
            None if cur.time_to_last_record_response is None else round(
                cur.time_to_last_record_response, 5)
        }))

    elif use_pandas and use_native:

        closure = {'df': None, 'rows_returned': 0}

        def on_data(data):

            # print("|||")
            # print(type(data))
            # print(data)
            # print("|||")

            # if closure['df'] is None:
            #     closure['df'] = pd.DataFrame(data)
            # else:
            #     closure['df'] = pd.concat([closure['df'], pd.DataFrame(data)], ignore_index=True, copy=False)

            closure['rows_returned'] += len(pd.DataFrame(data))

        # NOTE: 'scan' is not defined in this snippet; it is presumably a
        # native scan handle constructed elsewhere in the original module.
        cur = NativeCursor(scan).select(key, sql)
        cur.execute(on_data)

        rows_returned = closure['rows_returned']

        print("{}".format({
            'elapsed_time':
            round(cur.timer.elapsed(), 5),
            'rows_returned':
            rows_returned,
            'query_bytes':
            cur.query_bytes,
            'bytes_scanned':
            cur.bytes_scanned,
            'bytes_processed':
            cur.bytes_processed,
            'bytes_returned':
            "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec':
            "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() / 1000000,
                    5),
                round(
                    float(cur.bytes_returned) / cur.timer.elapsed() /
                    1000000000, 5)),
            'time_to_first_record_response':
            None if cur.time_to_first_record_response is None else round(
                cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
            None if cur.time_to_last_record_response is None else round(
                cur.time_to_last_record_response, 5)
        }))

    else:

        timer = Timer()
        timer.start()

        cur = Cursor(s3).select(key, sql)
        rows = cur.execute()
        for _ in rows:
            pass

        print(timer.elapsed())

    print("Cursor | Done")