def test_non_existent_key():
    """Executes a select against a non-existent key.

    :return: None
    """
    cur = PandasCursor(boto3.client('s3')) \
        .select('does-not-exist.csv', 'select * from S3Object')

    try:
        with pytest.raises(Exception):
            cur.execute()
    finally:
        cur.close()
def test_non_empty_results():
    """Executes a select where results are returned.

    :return: None
    """
    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('region.csv', 'select * from S3Object')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 5
    finally:
        cur.close()
def test_select_projected_filtered_topk_with_cursor():
    """Executes a projected, filtered top-k select against a Parquet object.

    :return: None
    """
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10,
                 s3={'payload_signing_enabled': False})
    session = Session()
    s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    num_rows = 0

    cur = PandasCursor(s3) \
        .parquet() \
        .select('parquet/small.multicolumn.9999.parquet',
                'select b from s3Object where a > 5000 limit 100')

    try:
        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)
            print("{}:{}".format(num_rows, df))

        assert num_rows == 100
    finally:
        cur.close()
def test_aggregate():
    """Executes a select with an aggregate.

    :return: None
    """
    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('region.csv', 'select count(*) from S3Object')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                assert r._0 == '5'
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 1
    finally:
        cur.close()
def test_where_predicate():
    """Executes a select with a where clause on one of the attributes.

    :return: None
    """
    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('region.csv', 'select * from S3Object where r_name = \'AMERICA\';')

    try:
        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                assert r._1 == 'AMERICA'
                # print("{}:{}".format(num_rows, r))

        assert num_rows == 1
    finally:
        cur.close()
def test_large_results():
    """Executes a select where a large number of records are expected.

    :return: None
    """
    num_rows = 0

    cur = PandasCursor(boto3.client('s3')) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        # pr = cProfile.Profile()
        # pr.enable()

        start = timeit.default_timer()

        dfs = cur.execute()
        for df in dfs:
            for i, r in df.iterrows():
                num_rows += 1
                # print("{}:{}".format(num_rows, r))

        end = timeit.default_timer()
        elapsed = end - start
        print('Elapsed {}'.format(elapsed))

        # pr.disable()
        # s = StringIO.StringIO()
        # sortby = 'cumulative'
        # ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        # ps.print_stats()
        # print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()
def execute_pandas_query(op):
    cur = PandasCursor(op.s3).select(op.s3key, op.s3sql)

    dfs = cur.execute()

    op.op_metrics.query_bytes = cur.query_bytes
    op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

    first_tuple = True
    for df in dfs:
        assert (len(df) > 0)

        if first_tuple:
            assert (len(df.columns.values) > 0)
            op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
            first_tuple = False

            if op.log_enabled:
                print("{}('{}') | Sending field names: {}".format(
                    op.__class__.__name__, op.name, df.columns.values))

        op.op_metrics.rows_returned += len(df)

        op.send(df, op.consumers)

    return cur
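# A minimal sketch of the operator interface that execute_pandas_query() relies on
# (s3, s3key, s3sql, op_metrics, send, consumers, log_enabled, name), useful for
# exercising the function outside a full query plan. StubMetrics and StubOp are
# illustrative names introduced here, not part of the repo.
import timeit


class StubMetrics(object):
    def __init__(self):
        self.query_bytes = 0
        self.rows_returned = 0
        self.time_to_first_response = None
        self._start = timeit.default_timer()

    def elapsed_time(self):
        return timeit.default_timer() - self._start


class StubOp(object):
    def __init__(self, s3, s3key, s3sql):
        self.s3 = s3
        self.s3key = s3key
        self.s3sql = s3sql
        self.name = 'stub-scan'
        self.log_enabled = False
        self.consumers = []  # no downstream operators in this sketch
        self.op_metrics = StubMetrics()

    def send(self, message, consumers):
        pass  # a real operator forwards the message to its consumers


# Example (assumes 'region.csv' exists in the configured bucket):
# op = StubOp(boto3.client('s3'), 'region.csv', 'select * from S3Object')
# execute_pandas_query(op)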
def measure_R_scan():
    cur = PandasCursor(boto3.client('s3')) \
        .select('access_method_benchmark/10-shards-10GB/data_0.csv',
                'select * from S3Object where cast(F0 AS float) < 0.01; ')

    end_times = []
    start = timeit.default_timer()

    for i in range(3):
        dfs = cur.execute()
        for df in dfs:
            pass
        end_times.append(timeit.default_timer())

    end = timeit.default_timer()

    for n in range(3):
        print("trial {} takes {}".format(
            n, end_times[n] - start if n == 0 else end_times[n] - end_times[n - 1]))

    print("{} bytes scanned".format(cur.bytes_scanned))
    print("time = {}".format(end - start))
    print("R_scan = {}".format(1.0 * cur.bytes_scanned / (end - start)))

    cur.close()
def test_no_ssl():
    """Executes the same select twice, first without SSL or payload signing and then
    with the default (secure) client, profiling both runs.

    :return: None
    """
    num_rows = 0

    # Boto is not thread safe so need one of these per scan op
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10,
                 s3={'payload_signing_enabled': False})
    session = Session()
    s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    cur = PandasCursor(s3) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()

        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)

        pr.disable()
        s = StringIO.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()

    num_rows = 0

    # Boto is not thread safe so need one of these per scan op
    cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
    session = Session()
    s3 = session.client('s3', config=cfg)

    cur = PandasCursor(s3) \
        .select('lineitem.csv', 'select * from S3Object limit 150000')

    try:
        pr = cProfile.Profile()
        pr.enable()

        dfs = cur.execute()
        for df in dfs:
            num_rows += len(df)

        pr.disable()
        s = StringIO.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

        assert num_rows == 150000
    finally:
        cur.close()
def execute_pandas_query(op):
    if op.use_native:
        closure = {'first_tuple': True}

        def on_numpy_array(np_array):
            df = pd.DataFrame(np_array)

            if closure['first_tuple']:
                assert (len(df.columns.values) > 0)
                op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                closure['first_tuple'] = False

                if op.log_enabled:
                    print("{}('{}') | Sending field names: {}".format(
                        op.__class__.__name__, op.name, df.columns.values))

                op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

            op.op_metrics.rows_returned += len(df)

            if op.log_enabled:
                print("{}('{}') | Sending field values:".format(
                    op.__class__.__name__, op.name))
                print(df)

            op.send(df, op.consumers)

        # cur = NativeCursor(op.fast_s3).select(op.s3key, op.s3sql)
        # cur.execute(on_numpy_array)
        #
        # op.op_metrics.query_bytes = cur.query_bytes
        #
        # return cur
    else:
        if op.format_ is Format.CSV:
            cur = PandasCursor(op.s3).csv().select(op.s3key, op.s3sql)
        elif op.format_ is Format.PARQUET:
            cur = PandasCursor(op.s3).parquet().select(op.s3key, op.s3sql)
        else:
            raise Exception("Unrecognized format {}".format(op.format_))

        dfs = cur.execute()

        op.op_metrics.query_bytes = cur.query_bytes
        op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

        first_tuple = True
        counter = 0
        buffer_ = pd.DataFrame()

        # print("DataFrames: ")
        # print(dfs)

        for df in dfs:
            if op.fn:
                df = op.fn(df)

            if first_tuple:
                # assert (len(df.columns.values) > 0)
                # op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                first_tuple = False

                if op.log_enabled:
                    print("{}('{}') | Sending field names: {}".format(
                        op.__class__.__name__, op.name, df.columns.values))

            op.op_metrics.rows_returned += len(df)

            # Apply filter if there is one
            # if op.filter_fn is not None:
            #     df = df[op.filter_fn(df)]

            # if op.log_enabled:
            #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            #         print("{}('{}') | Sending field values: \n{}".format(op.__class__.__name__, op.name, df))

            counter += 1
            if op.log_enabled:
                sys.stdout.write('.')
                if counter % 100 == 0:
                    print("Rows {}".format(op.op_metrics.rows_returned))

            op.send(DataFrameMessage(df), op.consumers)

            # buffer_ = pd.concat([buffer_, df], axis=0, sort=False, ignore_index=True, copy=False)
            # if len(buffer_) >= 8192:
            #     op.send(buffer_, op.consumers)
            #     buffer_ = pd.DataFrame()

        if len(buffer_) > 0:
            op.send(buffer_, op.consumers)

        del buffer_

        return cur
def run(use_pandas, secure, use_native):
    print("Cursor | Settings {}".format({
        'use_pandas': use_pandas,
        'secure': secure,
        'use_native': use_native
    }))

    key = 'sf1000-lineitem/lineitem_27.csv'
    sql = 'select * from S3Object limit 10000'

    # Enables very verbose boto3 logging
    boto3.set_stream_logger('', logging.DEBUG)

    if secure:
        cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
        session = Session()
        s3 = session.client('s3', config=cfg)
    else:
        cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10,
                     s3={'payload_signing_enabled': False})
        session = Session()
        s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)

    if use_pandas and not use_native:
        cur = PandasCursor(s3).select(key, sql)
        dfs = cur.execute()

        rows_returned = 0
        for df in dfs:
            rows_returned += len(df)

        print("{}".format({
            'elapsed_time': round(cur.timer.elapsed(), 5),
            'rows_returned': rows_returned,
            'query_bytes': cur.query_bytes,
            'bytes_scanned': cur.bytes_scanned,
            'bytes_processed': cur.bytes_processed,
            'bytes_returned': "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec': "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(float(cur.bytes_returned) / cur.timer.elapsed() / 1000000, 5),
                round(float(cur.bytes_returned) / cur.timer.elapsed() / 1000000000, 5)),
            'time_to_first_record_response':
                None if cur.time_to_first_record_response is None
                else round(cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
                None if cur.time_to_last_record_response is None
                else round(cur.time_to_last_record_response, 5)
        }))

    elif use_pandas and use_native:
        closure = {'df': None, 'rows_returned': 0}

        def on_data(data):
            # print("|||")
            # print(type(data))
            # print(data)
            # print("|||")

            # if closure['df'] is None:
            #     closure['df'] = pd.DataFrame(data)
            # else:
            #     closure['df'] = pd.concat([closure['df'], pd.DataFrame(data)], ignore_index=True, copy=False)

            closure['rows_returned'] += len(pd.DataFrame(data))

        # 'scan' is the native scan handle; it is assumed to be defined elsewhere and
        # is not created in this function.
        cur = NativeCursor(scan).select(key, sql)
        cur.execute(on_data)

        rows_returned = closure['rows_returned']

        print("{}".format({
            'elapsed_time': round(cur.timer.elapsed(), 5),
            'rows_returned': rows_returned,
            'query_bytes': cur.query_bytes,
            'bytes_scanned': cur.bytes_scanned,
            'bytes_processed': cur.bytes_processed,
            'bytes_returned': "{} ({} MB / {} GB)".format(
                cur.bytes_returned,
                round(float(cur.bytes_returned) / 1000000.0, 5),
                round(float(cur.bytes_returned) / 1000000000.0, 5)),
            'bytes_returned_per_sec': "{} ({} MB / {} GB)".format(
                round(float(cur.bytes_returned) / cur.timer.elapsed(), 5),
                round(float(cur.bytes_returned) / cur.timer.elapsed() / 1000000, 5),
                round(float(cur.bytes_returned) / cur.timer.elapsed() / 1000000000, 5)),
            'time_to_first_record_response':
                None if cur.time_to_first_record_response is None
                else round(cur.time_to_first_record_response, 5),
            'time_to_last_record_response':
                None if cur.time_to_last_record_response is None
                else round(cur.time_to_last_record_response, 5)
        }))

    else:
        timer = Timer()
        timer.start()

        cur = Cursor(s3).select(key, sql)
        rows = cur.execute()
        for _ in rows:
            pass

        print(timer.elapsed())

    print("Cursor | Done")
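# A minimal command-line driver for run(), sketched here for convenience; the flag
# names (--no-pandas, --secure, --native) are illustrative assumptions and not part
# of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='PandasCursor benchmark driver (illustrative)')
    parser.add_argument('--no-pandas', action='store_true', help='use the plain Cursor instead of PandasCursor')
    parser.add_argument('--secure', action='store_true', help='use SSL and signed payloads')
    parser.add_argument('--native', action='store_true', help='use the native cursor path')
    args = parser.parse_args()

    run(use_pandas=not args.no_pandas, secure=args.secure, use_native=args.native)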